From fd26e92c7822f850bc0c7592614a8f3e4203ad5b Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 9 Aug 2024 12:53:37 +0200
Subject: [PATCH 01/34] changes to consistent and assemble for performance

---
 src/jagged_array.jl    |  10 +
 src/p_sparse_matrix.jl | 858 +++++++++++++++++++++++++++++------------
 2 files changed, 615 insertions(+), 253 deletions(-)

diff --git a/src/jagged_array.jl b/src/jagged_array.jl
index 60a74dec..a3d61789 100644
--- a/src/jagged_array.jl
+++ b/src/jagged_array.jl
@@ -154,6 +154,16 @@ function JaggedArray{T,Ti}(a::AbstractArray{<:AbstractArray}) where {T,Ti}
   JaggedArray(data,ptrs)
 end
 
+# Index range into `a.data` for the `i`-th sub-array: `a.ptrs[i]` up to `a.ptrs[i+1]-1`.
+function jagged_range(a::Union{JaggedArray,GenericJaggedArray},i::Integer)
+  offsets = a.ptrs
+  start = offsets[i]
+  stop = offsets[i+1]-one(eltype(offsets))
+  return start:stop
+end
+
+
+###########
 
 Base.size(a::Union{JaggedArray,GenericJaggedArray}) = (length(a.ptrs)-1,)
 function Base.getindex(a::Union{JaggedArray,GenericJaggedArray},i::Int)
diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index f2b65fc8..d9561d6e 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1357,146 +1357,322 @@ function psparse_assemble_impl(A,::Type,rows)
     error("Case not implemented yet")
 end
 
-function psparse_assemble_impl(
-        A,
-        ::Type{<:AbstractSplitMatrix},
-        rows;
-        reuse=Val(false),
-        assembly_neighbors_options_cols=(;))
-
-    function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
-        A_ghost_own   = A.blocks.ghost_own
-        A_ghost_ghost = A.blocks.ghost_ghost
-        gen = ( owner=>i for (i,owner) in enumerate(parts_snd) )
-        owner_to_p = Dict(gen)
-        ptrs = zeros(Int32,length(parts_snd)+1)
-        ghost_to_owner_row = ghost_to_owner(rows_sa)
-        ghost_to_global_row = ghost_to_global(rows_sa)
-        own_to_global_col = own_to_global(cols_sa)
-        ghost_to_global_col = ghost_to_global(cols_sa)
-        for (i,_,_) in nziterator(A_ghost_own)
-            owner = ghost_to_owner_row[i]
-            ptrs[owner_to_p[owner]+1] += 1
-        end
-        for (i,_,_) in nziterator(A_ghost_ghost)
-            owner = ghost_to_owner_row[i]
-            ptrs[owner_to_p[owner]+1] += 1
-        end
-        length_to_ptrs!(ptrs)
-        Tv = eltype(A_ghost_own)
-        ndata = ptrs[end]-1
-        I_snd_data = zeros(Int,ndata)
-        J_snd_data = zeros(Int,ndata)
-        V_snd_data = zeros(Tv,ndata)
-        k_snd_data = zeros(Int32,ndata)
-        nnz_ghost_own = 0
-        for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own))
-            owner = ghost_to_owner_row[i]
-            p = ptrs[owner_to_p[owner]]
-            I_snd_data[p] = ghost_to_global_row[i]
-            J_snd_data[p] = own_to_global_col[j]
-            V_snd_data[p] = v
-            k_snd_data[p] = k
-            ptrs[owner_to_p[owner]] += 1
-            nnz_ghost_own += 1
-        end
-        for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost))
-            owner = ghost_to_owner_row[i]
-            p = ptrs[owner_to_p[owner]]
-            I_snd_data[p] = ghost_to_global_row[i]
-            J_snd_data[p] = ghost_to_global_col[j]
-            V_snd_data[p] = v
-            k_snd_data[p] = k+nnz_ghost_own
-            ptrs[owner_to_p[owner]] += 1
-        end
-        rewind_ptrs!(ptrs)
-        I_snd = JaggedArray(I_snd_data,ptrs)
-        J_snd = JaggedArray(J_snd_data,ptrs)
-        V_snd = JaggedArray(V_snd_data,ptrs)
-        k_snd = JaggedArray(k_snd_data,ptrs)
-        (;I_snd,J_snd,V_snd,k_snd,parts_snd)
-    end
-    function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv)
-        k_rcv_data = zeros(Int32,length(I_rcv.data))
-        k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs)
-        (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv)
-    end
-    function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa)
-        nz_own_own = findnz(A.blocks.own_own)
-        nz_own_ghost = findnz(A.blocks.own_ghost)
-        I_rcv_data = cache_rcv.I_rcv.data
-        J_rcv_data = cache_rcv.J_rcv.data
-        V_rcv_data = cache_rcv.V_rcv.data
-        k_rcv_data = cache_rcv.k_rcv.data
-        global_to_own_col = global_to_own(cols_sa)
-        is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data)
-        is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
-        I_rcv_own = view(I_rcv_data,is_own)
-        J_rcv_own = view(J_rcv_data,is_own)
-        V_rcv_own = view(V_rcv_data,is_own)
-        k_rcv_own = view(k_rcv_data,is_own)
-        I_rcv_ghost = view(I_rcv_data,is_ghost)
-        J_rcv_ghost = view(J_rcv_data,is_ghost)
-        V_rcv_ghost = view(V_rcv_data,is_ghost)
-        k_rcv_ghost = view(k_rcv_data,is_ghost)
-        # After this col ids in own_ghost triplet remain global
-        map_global_to_own!(I_rcv_own,rows_sa)
-        map_global_to_own!(J_rcv_own,cols_sa)
-        map_global_to_own!(I_rcv_ghost,rows_sa)
-        map_ghost_to_global!(nz_own_ghost[2],cols_sa)
-        own_own_I = vcat(nz_own_own[1],I_rcv_own)
-        own_own_J = vcat(nz_own_own[2],J_rcv_own)
-        own_own_V = vcat(nz_own_own[3],V_rcv_own)
-        own_own_triplet = (own_own_I,own_own_J,own_own_V)
-        own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost)
-        own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost)
-        own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost)
-        map_global_to_ghost!(nz_own_ghost[2],cols_sa)
-        own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V)
-        triplets = (own_own_triplet,own_ghost_triplet)
-        aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost)
-        triplets, own_ghost_J, aux
-    end
-    function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux)
-        (own_own_triplet,own_ghost_triplet) = triplets
-        (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux
-        map_global_to_ghost!(own_ghost_triplet[2],cols_fa)
-        map_global_to_ghost!(J_rcv_ghost,cols_fa)
-        TA = typeof(A.blocks.own_own)
-        n_own_rows = own_length(rows_fa)
-        n_own_cols = own_length(cols_fa)
-        n_ghost_rows = ghost_length(rows_fa)
-        n_ghost_cols = ghost_length(cols_fa)
-        Ti = indextype(A.blocks.own_own)
-        Tv = eltype(A.blocks.own_own)
-        own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
-        own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
-        ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols)
-        ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols)
-        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-        values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa))
-        nnz_own_own = nnz(own_own)
-        k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
-        k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
-        for p in 1:length(I_rcv_own)
-            i = I_rcv_own[p]
-            j = J_rcv_own[p]
-            k_rcv_own[p] = nzindex(own_own,i,j)
-        end
-        for p in 1:length(I_rcv_ghost)
-            i = I_rcv_ghost[p]
-            j = J_rcv_ghost[p]
-            k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own
-        end
-        cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...)
-        values, cache
+# function psparse_assemble_impl(
+#         A,
+#         ::Type{<:AbstractSplitMatrix},
+#         rows;
+#         reuse=Val(false),
+#         assembly_neighbors_options_cols=(;))
+
+#     function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
+#         A_ghost_own   = A.blocks.ghost_own
+#         A_ghost_ghost = A.blocks.ghost_ghost
+#         gen = ( owner=>i for (i,owner) in enumerate(parts_snd) )
+#         owner_to_p = Dict(gen)
+#         ptrs = zeros(Int32,length(parts_snd)+1)
+#         ghost_to_owner_row = ghost_to_owner(rows_sa)
+#         ghost_to_global_row = ghost_to_global(rows_sa)
+#         own_to_global_col = own_to_global(cols_sa)
+#         ghost_to_global_col = ghost_to_global(cols_sa)
+#         for (i,_,_) in nziterator(A_ghost_own)
+#             owner = ghost_to_owner_row[i]
+#             ptrs[owner_to_p[owner]+1] += 1
+#         end
+#         for (i,_,_) in nziterator(A_ghost_ghost)
+#             owner = ghost_to_owner_row[i]
+#             ptrs[owner_to_p[owner]+1] += 1
+#         end
+#         length_to_ptrs!(ptrs)
+#         Tv = eltype(A_ghost_own)
+#         ndata = ptrs[end]-1
+#         I_snd_data = zeros(Int,ndata)
+#         J_snd_data = zeros(Int,ndata)
+#         V_snd_data = zeros(Tv,ndata)
+#         k_snd_data = zeros(Int32,ndata)
+#         nnz_ghost_own = 0
+#         for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own))
+#             owner = ghost_to_owner_row[i]
+#             p = ptrs[owner_to_p[owner]]
+#             I_snd_data[p] = ghost_to_global_row[i]
+#             J_snd_data[p] = own_to_global_col[j]
+#             V_snd_data[p] = v
+#             k_snd_data[p] = k
+#             ptrs[owner_to_p[owner]] += 1
+#             nnz_ghost_own += 1
+#         end
+#         for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost))
+#             owner = ghost_to_owner_row[i]
+#             p = ptrs[owner_to_p[owner]]
+#             I_snd_data[p] = ghost_to_global_row[i]
+#             J_snd_data[p] = ghost_to_global_col[j]
+#             V_snd_data[p] = v
+#             k_snd_data[p] = k+nnz_ghost_own
+#             ptrs[owner_to_p[owner]] += 1
+#         end
+#         rewind_ptrs!(ptrs)
+#         I_snd = JaggedArray(I_snd_data,ptrs)
+#         J_snd = JaggedArray(J_snd_data,ptrs)
+#         V_snd = JaggedArray(V_snd_data,ptrs)
+#         k_snd = JaggedArray(k_snd_data,ptrs)
+#         (;I_snd,J_snd,V_snd,k_snd,parts_snd)
+#     end
+#     function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv)
+#         k_rcv_data = zeros(Int32,length(I_rcv.data))
+#         k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs)
+#         (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv)
+#     end
+#     function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa)
+#         nz_own_own = findnz(A.blocks.own_own)
+#         nz_own_ghost = findnz(A.blocks.own_ghost)
+#         I_rcv_data = cache_rcv.I_rcv.data
+#         J_rcv_data = cache_rcv.J_rcv.data
+#         V_rcv_data = cache_rcv.V_rcv.data
+#         k_rcv_data = cache_rcv.k_rcv.data
+#         global_to_own_col = global_to_own(cols_sa)
+#         is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data)
+#         is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
+#         I_rcv_own = view(I_rcv_data,is_own)
+#         J_rcv_own = view(J_rcv_data,is_own)
+#         V_rcv_own = view(V_rcv_data,is_own)
+#         k_rcv_own = view(k_rcv_data,is_own)
+#         I_rcv_ghost = view(I_rcv_data,is_ghost)
+#         J_rcv_ghost = view(J_rcv_data,is_ghost)
+#         V_rcv_ghost = view(V_rcv_data,is_ghost)
+#         k_rcv_ghost = view(k_rcv_data,is_ghost)
+#         # After this col ids in own_ghost triplet remain global
+#         map_global_to_own!(I_rcv_own,rows_sa)
+#         map_global_to_own!(J_rcv_own,cols_sa)
+#         map_global_to_own!(I_rcv_ghost,rows_sa)
+#         map_ghost_to_global!(nz_own_ghost[2],cols_sa)
+#         own_own_I = vcat(nz_own_own[1],I_rcv_own)
+#         own_own_J = vcat(nz_own_own[2],J_rcv_own)
+#         own_own_V = vcat(nz_own_own[3],V_rcv_own)
+#         own_own_triplet = (own_own_I,own_own_J,own_own_V)
+#         own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost)
+#         own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost)
+#         own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost)
+#         map_global_to_ghost!(nz_own_ghost[2],cols_sa)
+#         own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V)
+#         triplets = (own_own_triplet,own_ghost_triplet)
+#         aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost)
+#         triplets, own_ghost_J, aux
+#     end
+#     function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux)
+#         (own_own_triplet,own_ghost_triplet) = triplets
+#         (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux
+#         map_global_to_ghost!(own_ghost_triplet[2],cols_fa)
+#         map_global_to_ghost!(J_rcv_ghost,cols_fa)
+#         TA = typeof(A.blocks.own_own)
+#         n_own_rows = own_length(rows_fa)
+#         n_own_cols = own_length(cols_fa)
+#         n_ghost_rows = ghost_length(rows_fa)
+#         n_ghost_cols = ghost_length(cols_fa)
+#         Ti = indextype(A.blocks.own_own)
+#         Tv = eltype(A.blocks.own_own)
+#         own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
+#         own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
+#         ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols)
+#         ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols)
+#         blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+#         values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa))
+#         nnz_own_own = nnz(own_own)
+#         k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
+#         k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
+#         for p in 1:length(I_rcv_own)
+#             i = I_rcv_own[p]
+#             j = J_rcv_own[p]
+#             k_rcv_own[p] = nzindex(own_own,i,j)
+#         end
+#         for p in 1:length(I_rcv_ghost)
+#             i = I_rcv_ghost[p]
+#             j = J_rcv_ghost[p]
+#             k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own
+#         end
+#         cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...)
+#         values, cache
+#     end
+#     rows_sa = partition(axes(A,1))
+#     cols_sa = partition(axes(A,2))
+#     #rows = map(remove_ghost,rows_sa)
+#     cols = map(remove_ghost,cols_sa)
+#     parts_snd, parts_rcv = assembly_neighbors(rows_sa)
+#     cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa)
+#     I_snd = map(i->i.I_snd,cache_snd)
+#     J_snd = map(i->i.J_snd,cache_snd)
+#     V_snd = map(i->i.V_snd,cache_snd)
+#     graph = ExchangeGraph(parts_snd,parts_rcv)
+#     t_I = exchange(I_snd,graph)
+#     t_J = exchange(J_snd,graph)
+#     t_V = exchange(V_snd,graph)
+#     @fake_async begin
+#         I_rcv = fetch(t_I)
+#         J_rcv = fetch(t_J)
+#         V_rcv = fetch(t_V)
+#         cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv)
+#         triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays
+#         J_owner = find_owner(cols_sa,J)
+#         rows_fa = rows
+#         cols_fa = map(union_ghost,cols,J,J_owner)
+#         assembly_neighbors(cols_fa;assembly_neighbors_options_cols...)
+#         vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays
+#         assembled = true
+#         B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled)
+#         if val_parameter(reuse) == false
+#             B
+#         else
+#             B, cache
+#         end
+#     end
+# end
+
+# New assemble
+####################
+
+function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) # Bucket the nonzeros of A's ghost rows by owner.
+    A_ghost_own   = A.blocks.ghost_own
+    A_ghost_ghost = A.blocks.ghost_ghost
+    gen = ( owner=>i for (i,owner) in enumerate(parts_snd) )
+    owner_to_p = Dict(gen) # owner id => position of that owner in parts_snd
+    ptrs = zeros(Int32,length(parts_snd)+1) # slot q+1 first accumulates the entry count for position q
+    ghost_to_owner_row = ghost_to_owner(rows_sa)
+    ghost_to_global_row = ghost_to_global(rows_sa)
+    own_to_global_col = own_to_global(cols_sa)
+    ghost_to_global_col = ghost_to_global(cols_sa)
+    for (i,_,_) in nziterator(A_ghost_own) # counting pass over the ghost_own block
+        owner = ghost_to_owner_row[i]
+        ptrs[owner_to_p[owner]+1] += 1 # Dict lookup: KeyError if this owner is not in parts_snd
+    end
+    for (i,_,_) in nziterator(A_ghost_ghost) # counting pass over the ghost_ghost block
+        owner = ghost_to_owner_row[i]
+        ptrs[owner_to_p[owner]+1] += 1
+    end
+    length_to_ptrs!(ptrs) # presumably turns the counts into 1-based start offsets — TODO confirm
+    Tv = eltype(A_ghost_own)
+    ndata = ptrs[end]-1 # length of each flat send buffer allocated below
+    I_snd_data = zeros(Int,ndata)
+    J_snd_data = zeros(Int,ndata)
+    V_snd_data = zeros(Tv,ndata)
+    k_snd_data = zeros(Int32,ndata)
+    nnz_ghost_own = 0 # counts the ghost_own entries visited by the next loop
+    for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own)) # fill pass; ptrs[q] acts as a write cursor
+        owner = ghost_to_owner_row[i]
+        p = ptrs[owner_to_p[owner]]
+        I_snd_data[p] = ghost_to_global_row[i]
+        J_snd_data[p] = own_to_global_col[j] # columns of this block go through own_to_global
+        V_snd_data[p] = v
+        k_snd_data[p] = k # position of this entry in the iteration order of A_ghost_own
+        ptrs[owner_to_p[owner]] += 1 # advance the cursor of this owner
+        nnz_ghost_own += 1
+    end
+    for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost)) # same fill for the ghost_ghost block
+        owner = ghost_to_owner_row[i]
+        p = ptrs[owner_to_p[owner]]
+        I_snd_data[p] = ghost_to_global_row[i]
+        J_snd_data[p] = ghost_to_global_col[j] # columns of this block go through ghost_to_global
+        V_snd_data[p] = v
+        k_snd_data[p] = k+nnz_ghost_own # numbered after all ghost_own entries
+        ptrs[owner_to_p[owner]] += 1
+    end
+    rewind_ptrs!(ptrs) # presumably undoes the cursor advances made by the fill loops — TODO confirm
+    I_snd = JaggedArray(I_snd_data,ptrs) # all four jagged arrays are built from the same ptrs
+    J_snd = JaggedArray(J_snd_data,ptrs)
+    V_snd = JaggedArray(V_snd_data,ptrs)
+    k_snd = JaggedArray(k_snd_data,ptrs)
+    (;I_snd,J_snd,V_snd,k_snd,parts_snd) # NamedTuple whose keys are these variable names
+end
+
+function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv)
+    # Zeroed Int32 slot per entry of `I_rcv.data`, built on `I_rcv.ptrs`; bundled with the inputs.
+    k_rcv = JaggedArray(zeros(Int32,length(I_rcv.data)),I_rcv.ptrs)
+    return (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv)
+end
+
+function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) # Merge A's own-row nonzeros with the received triplets.
+    nz_own_own = findnz(A.blocks.own_own) # indexed below as [1]=rows, [2]=cols, [3]=values
+    nz_own_ghost = findnz(A.blocks.own_ghost)
+    I_rcv_data = cache_rcv.I_rcv.data
+    J_rcv_data = cache_rcv.J_rcv.data
+    V_rcv_data = cache_rcv.V_rcv.data
+    k_rcv_data = cache_rcv.k_rcv.data
+    global_to_own_col = global_to_own(cols_sa)
+    is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data) # received entries whose column maps to 0
+    is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) # received entries whose column maps to nonzero
+    I_rcv_own = view(I_rcv_data,is_own) # views: in-place maps below write into cache_rcv's data
+    J_rcv_own = view(J_rcv_data,is_own)
+    V_rcv_own = view(V_rcv_data,is_own)
+    k_rcv_own = view(k_rcv_data,is_own) # not written here; only forwarded through aux
+    I_rcv_ghost = view(I_rcv_data,is_ghost)
+    J_rcv_ghost = view(J_rcv_data,is_ghost)
+    V_rcv_ghost = view(V_rcv_data,is_ghost)
+    k_rcv_ghost = view(k_rcv_data,is_ghost) # not written here; only forwarded through aux
+    # In-place renumbering; the column ids of the own_ghost triplet are still global afterwards.
+    map_global_to_own!(I_rcv_own,rows_sa)
+    map_global_to_own!(J_rcv_own,cols_sa)
+    map_global_to_own!(I_rcv_ghost,rows_sa) # J_rcv_ghost is not renumbered in this function
+    map_ghost_to_global!(nz_own_ghost[2],cols_sa) # switch to global ids before the vcat below
+    own_own_I = vcat(nz_own_own[1],I_rcv_own) # vcat allocates new vectors: local entries first, then received
+    own_own_J = vcat(nz_own_own[2],J_rcv_own)
+    own_own_V = vcat(nz_own_own[3],V_rcv_own)
+    own_own_triplet = (own_own_I,own_own_J,own_own_V)
+    own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost)
+    own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost)
+    own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost)
+    map_global_to_ghost!(nz_own_ghost[2],cols_sa) # map nz_own_ghost[2] back; own_ghost_J is a separate copy
+    own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V)
+    triplets = (own_own_triplet,own_ghost_triplet)
+    aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost)
+    triplets, own_ghost_J, aux # own_ghost_J is the same vector stored in own_ghost_triplet[2]
+end
+
+function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) # Build the four blocks and a cache NamedTuple.
+    (own_own_triplet,own_ghost_triplet) = triplets
+    (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux # nz_* are unused here
+    map_global_to_ghost!(own_ghost_triplet[2],cols_fa) # in place: own_ghost columns renumbered w.r.t. cols_fa
+    map_global_to_ghost!(J_rcv_ghost,cols_fa) # same renumbering, in place on the aux entry
+    TA = typeof(A.blocks.own_own) # every new block is built with the type of A's own_own block
+    n_own_rows = own_length(rows_fa)
+    n_own_cols = own_length(cols_fa)
+    n_ghost_rows = ghost_length(rows_fa)
+    n_ghost_cols = ghost_length(cols_fa)
+    Ti = indextype(A.blocks.own_own)
+    Tv = eltype(A.blocks.own_own)
+    own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
+    own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
+    ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) # built from empty triplets
+    ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols) # built from empty triplets
+    blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+    values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa))
+    nnz_own_own = nnz(own_own) # offset added to the own_ghost indices below
+    k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
+    k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
+    for p in 1:length(I_rcv_own) # store nzindex(own_own,i,j) for each (i,j) in I_rcv_own/J_rcv_own
+        i = I_rcv_own[p]
+        j = J_rcv_own[p]
+        k_rcv_own[p] = nzindex(own_own,i,j)
+    end
+    for p in 1:length(I_rcv_ghost) # same lookup against the own_ghost block
+        i = I_rcv_ghost[p]
+        j = J_rcv_ghost[p]
+        k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own # shifted past the own_own entries
     end
+    cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) # merges the fields of both caches
+    values, cache
+end
+
+function PartitionedArrays.psparse_assemble_impl(
+                            A,
+                            ::Type{<:AbstractSplitMatrix},
+                            rows;
+                            reuse=Val(false),
+                            assembly_neighbors_options_cols=(;))
+
+
     rows_sa = partition(axes(A,1))
     cols_sa = partition(axes(A,2))
-    #rows = map(remove_ghost,rows_sa)
     cols = map(remove_ghost,cols_sa)
     parts_snd, parts_rcv = assembly_neighbors(rows_sa)
     cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa)
+
     I_snd = map(i->i.I_snd,cache_snd)
     J_snd = map(i->i.J_snd,cache_snd)
     V_snd = map(i->i.V_snd,cache_snd)
@@ -1525,6 +1701,9 @@ function psparse_assemble_impl(
     end
 end
 
+# End new assemble
+####################
+
 function psparse_assemble_impl!(B,A,::Type,cache)
     error("case not implemented")
 end
@@ -1603,136 +1782,306 @@ function consistent!(B::PSparseMatrix,A::PSparseMatrix,cache)
     psparse_consistent_impl!(B,A,T,cache)
 end
 
-function psparse_consistent_impl(
-    A,
-    ::Type{<:AbstractSplitMatrix},
-    rows_co;
-    reuse=Val(false))
-
-    function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
-        own_to_local_row = own_to_local(rows_co)
-        own_to_global_row = own_to_global(rows_co)
-        own_to_global_col = own_to_global(cols_fa)
-        ghost_to_global_col = ghost_to_global(cols_fa)
-        nl = size(A,1)
-        li_to_ps_ptrs = zeros(Int32,nl+1)
-        for p in 1:length(lids_snd)
-            for li in lids_snd[p]
-                li_to_ps_ptrs[li+1] += 1
-            end
+# function psparse_consistent_impl(
+#     A,
+#     ::Type{<:AbstractSplitMatrix},
+#     rows_co;
+#     reuse=Val(false))
+
+#     function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
+#         own_to_local_row = own_to_local(rows_co)
+#         own_to_global_row = own_to_global(rows_co)
+#         own_to_global_col = own_to_global(cols_fa)
+#         ghost_to_global_col = ghost_to_global(cols_fa)
+#         nl = size(A,1)
+#         li_to_ps_ptrs = zeros(Int32,nl+1)
+#         for p in 1:length(lids_snd)
+#             for li in lids_snd[p]
+#                 li_to_ps_ptrs[li+1] += 1
+#             end
+#         end
+#         length_to_ptrs!(li_to_ps_ptrs)
+#         ndata = li_to_ps_ptrs[end]-1
+#         li_to_ps_data = zeros(Int32,ndata)
+#         for p in 1:length(lids_snd)
+#             for li in lids_snd[p]
+#                 q = li_to_ps_ptrs[li]
+#                 li_to_ps_data[q] = p
+#                 li_to_ps_ptrs[li] = q + 1
+#             end
+#         end
+#         rewind_ptrs!(li_to_ps_ptrs)
+#         li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
+#         ptrs = zeros(Int32,length(parts_snd)+1)
+#         for (i,j,v) in nziterator(A.blocks.own_own)
+#             li = own_to_local_row[i]
+#             for p in li_to_ps[li]
+#                 ptrs[p+1] += 1
+#             end
+#         end
+#         for (i,j,v) in nziterator(A.blocks.own_ghost)
+#             li = own_to_local_row[i]
+#             for p in li_to_ps[li]
+#                 ptrs[p+1] += 1
+#             end
+#         end
+#         length_to_ptrs!(ptrs)
+#         ndata = ptrs[end]-1
+#         T = eltype(A)
+#         I_snd = JaggedArray(zeros(Int,ndata),ptrs)
+#         J_snd = JaggedArray(zeros(Int,ndata),ptrs)
+#         V_snd = JaggedArray(zeros(T,ndata),ptrs)
+#         k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
+#         for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
+#             li = own_to_local_row[i]
+#             for p in li_to_ps[li]
+#                 q = ptrs[p]
+#                 I_snd.data[q] = own_to_global_row[i]
+#                 J_snd.data[q] = own_to_global_col[j]
+#                 V_snd.data[q] = v
+#                 k_snd.data[q] = k
+#                 ptrs[p] += 1
+#             end
+#         end
+#         nnz_own_own = nnz(A.blocks.own_own)
+#         for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost))
+#             li = own_to_local_row[i]
+#             for p in li_to_ps[li]
+#                 q = ptrs[p]
+#                 I_snd.data[q] = own_to_global_row[i]
+#                 J_snd.data[q] = ghost_to_global_col[j]
+#                 V_snd.data[q] = v
+#                 k_snd.data[q] = k+nnz_own_own
+#                 ptrs[p] += 1
+#             end
+#         end
+#         rewind_ptrs!(ptrs)
+#         cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd)
+#         cache_snd
+#     end
+#     function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+#         cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+#         cache_rcv
+#     end
+#     function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
+#         I_rcv_data = cache_rcv.I_rcv.data
+#         J_rcv_data = cache_rcv.J_rcv.data
+#         V_rcv_data = cache_rcv.V_rcv.data
+#         global_to_own_col = global_to_own(cols_co)
+#         global_to_ghost_col = global_to_ghost(cols_co)
+#         is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
+#         is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data)
+#         I_rcv_own = I_rcv_data[is_own]
+#         J_rcv_own = J_rcv_data[is_own]
+#         V_rcv_own = V_rcv_data[is_own]
+#         I_rcv_ghost = I_rcv_data[is_ghost]
+#         J_rcv_ghost = J_rcv_data[is_ghost]
+#         V_rcv_ghost = V_rcv_data[is_ghost]
+#         map_global_to_ghost!(I_rcv_own,rows_co)
+#         map_global_to_ghost!(I_rcv_ghost,rows_co)
+#         map_global_to_own!(J_rcv_own,cols_co)
+#         map_global_to_ghost!(J_rcv_ghost,cols_co)
+#         I2,J2,V2 = findnz(A.blocks.own_ghost)
+#         map_ghost_to_global!(J2,cols_fa)
+#         map_global_to_ghost!(J2,cols_co)
+#         n_own_rows = own_length(rows_co)
+#         n_ghost_rows = ghost_length(rows_co)
+#         n_own_cols = own_length(cols_co)
+#         n_ghost_cols = ghost_length(cols_co)
+#         TA = typeof(A.blocks.ghost_own)
+#         own_own = A.blocks.own_own
+#         own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
+#         ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
+#         ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
+#         K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
+#         K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost)
+#         blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+#         values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co))
+#         k_snd = cache_snd.k_snd
+#         V_snd = cache_snd.V_snd
+#         V_rcv = cache_rcv.V_rcv
+#         parts_snd = cache_snd.parts_snd
+#         parts_rcv = cache_rcv.parts_rcv
+#         cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
+#         values,cache
+#     end
+#     @assert matching_own_indices(axes(A,1),PRange(rows_co))
+#     rows_fa = partition(axes(A,1))
+#     cols_fa = partition(axes(A,2))
+#     # snd and rcv are swapped on purpose
+#     parts_rcv,parts_snd = assembly_neighbors(rows_co)
+#     lids_rcv,lids_snd = assembly_local_indices(rows_co)
+#     cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
+#     I_snd = map(i->i.I_snd,cache_snd)
+#     J_snd = map(i->i.J_snd,cache_snd)
+#     V_snd = map(i->i.V_snd,cache_snd)
+#     graph = ExchangeGraph(parts_snd,parts_rcv)
+#     t_I = exchange(I_snd,graph)
+#     t_J = exchange(J_snd,graph)
+#     t_V = exchange(V_snd,graph)
+#     @fake_async begin
+#         I_rcv = fetch(t_I)
+#         J_rcv = fetch(t_J)
+#         V_rcv = fetch(t_V)
+#         J_rcv_data = map(x->x.data,J_rcv)
+#         J_rcv_owner = find_owner(cols_fa,J_rcv_data)
+#         cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner)
+#         cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+#         values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays
+#         B = PSparseMatrix(values,rows_co,cols_co,A.assembled)
+#         if val_parameter(reuse) == false
+#             B
+#         else
+#             B,cache
+#         end
+#     end
+# end
+
+# New consistent
+####################
+
+function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
+    own_to_local_row::UnitRange{Int32} = own_to_local(rows_co)
+    own_to_global_row = own_to_global(rows_co)
+    own_to_global_col = own_to_global(cols_fa)
+    ghost_to_global_col = ghost_to_global(cols_fa)
+    nl = size(A,1)
+    li_to_ps_ptrs = zeros(Int32,nl+1)
+    for p in 1:length(lids_snd)
+        for li_ptr in jagged_range(lids_snd,p)
+            li = lids_snd.data[li_ptr]
+            li_to_ps_ptrs[li+1] += 1
         end
-        length_to_ptrs!(li_to_ps_ptrs)
-        ndata = li_to_ps_ptrs[end]-1
-        li_to_ps_data = zeros(Int32,ndata)
-        for p in 1:length(lids_snd)
-            for li in lids_snd[p]
-                q = li_to_ps_ptrs[li]
-                li_to_ps_data[q] = p
-                li_to_ps_ptrs[li] = q + 1
-            end
+    end
+    length_to_ptrs!(li_to_ps_ptrs)
+    ndata = li_to_ps_ptrs[end]-1
+    li_to_ps_data = zeros(Int32,ndata)
+    for p in 1:length(lids_snd)
+        for li_ptr in jagged_range(lids_snd,p)
+            li = lids_snd.data[li_ptr]
+            q = li_to_ps_ptrs[li]
+            li_to_ps_data[q] = p
+            li_to_ps_ptrs[li] = q + 1
         end
-        rewind_ptrs!(li_to_ps_ptrs)
-        li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
-        ptrs = zeros(Int32,length(parts_snd)+1)
-        for (i,j,v) in nziterator(A.blocks.own_own)
-            li = own_to_local_row[i]
-            for p in li_to_ps[li]
-                ptrs[p+1] += 1
-            end
+    end
+
+    rewind_ptrs!(li_to_ps_ptrs)
+    li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
+    ptrs = zeros(Int32,length(parts_snd)+1)
+    for (i,j,v) in nziterator(A.blocks.own_own)
+        # @show(typeof(own_to_local_row))
+        li = own_to_local_row[i]
+        for li_ptr in jagged_range(li_to_ps,li)
+            p = li_to_ps.data[li_ptr]
+            ptrs[p+1] += 1
         end
-        for (i,j,v) in nziterator(A.blocks.own_ghost)
-            li = own_to_local_row[i]
-            for p in li_to_ps[li]
-                ptrs[p+1] += 1
-            end
+    end
+
+    for (i,j,v) in nziterator(A.blocks.own_ghost)
+        li = own_to_local_row[i]
+        for ptr in jagged_range(li_to_ps,li)
+            p=li_to_ps.data[ptr]
+            ptrs[p+1] += 1
         end
-        length_to_ptrs!(ptrs)
-        ndata = ptrs[end]-1
-        T = eltype(A)
-        I_snd = JaggedArray(zeros(Int,ndata),ptrs)
-        J_snd = JaggedArray(zeros(Int,ndata),ptrs)
-        V_snd = JaggedArray(zeros(T,ndata),ptrs)
-        k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
-        for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
-            li = own_to_local_row[i]
-            for p in li_to_ps[li]
-                q = ptrs[p]
-                I_snd.data[q] = own_to_global_row[i]
-                J_snd.data[q] = own_to_global_col[j]
-                V_snd.data[q] = v
-                k_snd.data[q] = k
-                ptrs[p] += 1
-            end
+    end
+    length_to_ptrs!(ptrs)
+    ndata = ptrs[end]-1
+    T = eltype(A)
+    I_snd = JaggedArray(zeros(Int,ndata),ptrs)
+    J_snd = JaggedArray(zeros(Int,ndata),ptrs)
+    V_snd = JaggedArray(zeros(T,ndata),ptrs)
+    k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
+    for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
+        li = own_to_local_row[i]
+        for p_ptr in jagged_range(li_to_ps,li)
+            p = li_to_ps.data[p_ptr]
+            q = ptrs[p]
+            I_snd.data[q] = own_to_global_row[i]
+            J_snd.data[q] = own_to_global_col[j]
+            V_snd.data[q] = v
+            k_snd.data[q] = k
+            ptrs[p] += 1
         end
-        nnz_own_own = nnz(A.blocks.own_own)
-        for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost))
-            li = own_to_local_row[i]
-            for p in li_to_ps[li]
-                q = ptrs[p]
-                I_snd.data[q] = own_to_global_row[i]
-                J_snd.data[q] = ghost_to_global_col[j]
-                V_snd.data[q] = v
-                k_snd.data[q] = k+nnz_own_own
-                ptrs[p] += 1
-            end
+    end
+
+    nnz_own_own = nnz(A.blocks.own_own)
+    for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost))
+        li = own_to_local_row[i]
+        for p_ptr in jagged_range(li_to_ps,li)
+            p=li_to_ps.data[p_ptr]
+            q = ptrs[p]
+            I_snd.data[q] = own_to_global_row[i]
+            J_snd.data[q] = ghost_to_global_col[j]
+            V_snd.data[q] = v
+            k_snd.data[q] = k+nnz_own_own
+            ptrs[p] += 1
         end
-        rewind_ptrs!(ptrs)
-        cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd)
-        cache_snd
-    end
-    function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-        cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-        cache_rcv
-    end
-    function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
-        I_rcv_data = cache_rcv.I_rcv.data
-        J_rcv_data = cache_rcv.J_rcv.data
-        V_rcv_data = cache_rcv.V_rcv.data
-        global_to_own_col = global_to_own(cols_co)
-        global_to_ghost_col = global_to_ghost(cols_co)
-        is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
-        is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data)
-        I_rcv_own = I_rcv_data[is_own]
-        J_rcv_own = J_rcv_data[is_own]
-        V_rcv_own = V_rcv_data[is_own]
-        I_rcv_ghost = I_rcv_data[is_ghost]
-        J_rcv_ghost = J_rcv_data[is_ghost]
-        V_rcv_ghost = V_rcv_data[is_ghost]
-        map_global_to_ghost!(I_rcv_own,rows_co)
-        map_global_to_ghost!(I_rcv_ghost,rows_co)
-        map_global_to_own!(J_rcv_own,cols_co)
-        map_global_to_ghost!(J_rcv_ghost,cols_co)
-        I2,J2,V2 = findnz(A.blocks.own_ghost)
-        map_ghost_to_global!(J2,cols_fa)
-        map_global_to_ghost!(J2,cols_co)
-        n_own_rows = own_length(rows_co)
-        n_ghost_rows = ghost_length(rows_co)
-        n_own_cols = own_length(cols_co)
-        n_ghost_cols = ghost_length(cols_co)
-        TA = typeof(A.blocks.ghost_own)
-        own_own = A.blocks.own_own
-        own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
-        ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
-        ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
-        K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
-        K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost)
-        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-        values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co))
-        k_snd = cache_snd.k_snd
-        V_snd = cache_snd.V_snd
-        V_rcv = cache_rcv.V_rcv
-        parts_snd = cache_snd.parts_snd
-        parts_rcv = cache_rcv.parts_rcv
-        cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
-        values,cache
     end
+    rewind_ptrs!(ptrs)
+    cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd)
+    cache_snd
+end
+
+function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+    cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+    cache_rcv
+end
+
+function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
+    I_rcv_data = cache_rcv.I_rcv.data
+    J_rcv_data = cache_rcv.J_rcv.data
+    V_rcv_data = cache_rcv.V_rcv.data
+    global_to_own_col = global_to_own(cols_co)
+    global_to_ghost_col = global_to_ghost(cols_co)
+    is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
+    is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data)
+    I_rcv_own = I_rcv_data[is_own]
+    J_rcv_own = J_rcv_data[is_own]
+    V_rcv_own = V_rcv_data[is_own]
+    I_rcv_ghost = I_rcv_data[is_ghost]
+    J_rcv_ghost = J_rcv_data[is_ghost]
+    V_rcv_ghost = V_rcv_data[is_ghost]
+    map_global_to_ghost!(I_rcv_own,rows_co)
+    map_global_to_ghost!(I_rcv_ghost,rows_co)
+    map_global_to_own!(J_rcv_own,cols_co)
+    map_global_to_ghost!(J_rcv_ghost,cols_co)
+    I2,J2,V2 = findnz(A.blocks.own_ghost)
+    map_ghost_to_global!(J2,cols_fa)
+    map_global_to_ghost!(J2,cols_co)
+    n_own_rows = own_length(rows_co)
+    n_ghost_rows = ghost_length(rows_co)
+    n_own_cols = own_length(cols_co)
+    n_ghost_cols = ghost_length(cols_co)
+    TA = typeof(A.blocks.ghost_own)
+    own_own = A.blocks.own_own
+    own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
+    ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
+    ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
+    K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
+    K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost)
+    blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+    values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co))
+    k_snd = cache_snd.k_snd
+    V_snd = cache_snd.V_snd
+    V_rcv = cache_rcv.V_rcv
+    parts_snd = cache_snd.parts_snd
+    parts_rcv = cache_rcv.parts_rcv
+    cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
+    values,cache
+end
+
+function psparse_consistent_impl(
+                            A,
+                            ::Type{<:AbstractSplitMatrix},
+                            rows_co;
+                            reuse=Val(false))
     @assert matching_own_indices(axes(A,1),PRange(rows_co))
     rows_fa = partition(axes(A,1))
     cols_fa = partition(axes(A,2))
     # snd and rcv are swapped on purpose
     parts_rcv,parts_snd = assembly_neighbors(rows_co)
     lids_rcv,lids_snd = assembly_local_indices(rows_co)
-    cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
+    cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
     I_snd = map(i->i.I_snd,cache_snd)
     J_snd = map(i->i.J_snd,cache_snd)
     V_snd = map(i->i.V_snd,cache_snd)
@@ -1747,8 +2096,8 @@ function psparse_consistent_impl(
         J_rcv_data = map(x->x.data,J_rcv)
         J_rcv_owner = find_owner(cols_fa,J_rcv_data)
         cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner)
-        cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-        values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays
+        cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+        values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays
         B = PSparseMatrix(values,rows_co,cols_co,A.assembled)
         if val_parameter(reuse) == false
             B
@@ -1758,6 +2107,9 @@ function psparse_consistent_impl(
     end
 end
 
+# End new consistent
+####################
+
 function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache)
     function setup_snd(A,cache)
         k_snd_data = cache.k_snd.data

From b36653ef812da029cb6b00224d12dc38938edfa2 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 9 Aug 2024 17:16:10 +0200
Subject: [PATCH 02/34] added relevant functions to export

---
 src/PartitionedArrays.jl | 1 +
 src/jagged_array.jl      | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl
index bec2f615..45ec6594 100644
--- a/src/PartitionedArrays.jl
+++ b/src/PartitionedArrays.jl
@@ -11,6 +11,7 @@ import Distances
 
 export length_to_ptrs!
 export rewind_ptrs!
+export jagged_range
 export jagged_array
 export GenericJaggedArray
 export JaggedArray
diff --git a/src/jagged_array.jl b/src/jagged_array.jl
index a3d61789..4a58f48a 100644
--- a/src/jagged_array.jl
+++ b/src/jagged_array.jl
@@ -162,7 +162,6 @@ function jagged_range(a::Union{JaggedArray,GenericJaggedArray},i::Integer)
   pini:pend
 end
 
-
 ###########
 
 Base.size(a::Union{JaggedArray,GenericJaggedArray}) = (length(a.ptrs)-1,)

From 29fcd0afa3ee4f50f9338924f33d4aaf7b9eddee Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Wed, 14 Aug 2024 16:05:25 +0200
Subject: [PATCH 03/34] minor optimization to consistent, including reduced
 cache size

---
 src/p_sparse_matrix.jl | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index d9561d6e..a2d12e44 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1659,7 +1659,7 @@ function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux)
     values, cache
 end
 
-function PartitionedArrays.psparse_assemble_impl(
+function psparse_assemble_impl(
                             A,
                             ::Type{<:AbstractSplitMatrix},
                             rows;
@@ -2033,11 +2033,12 @@ function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
     V_rcv_data = cache_rcv.V_rcv.data
     global_to_own_col = global_to_own(cols_co)
     global_to_ghost_col = global_to_ghost(cols_co)
-    is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
-    is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data)
+    is_own_condition = k -> global_to_own_col[k]!=0
+    is_own = is_own_condition.(J_rcv_data)
     I_rcv_own = I_rcv_data[is_own]
     J_rcv_own = J_rcv_data[is_own]
     V_rcv_own = V_rcv_data[is_own]
+    is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask
     I_rcv_ghost = I_rcv_data[is_ghost]
     J_rcv_ghost = J_rcv_data[is_ghost]
     V_rcv_ghost = V_rcv_data[is_ghost]
@@ -2066,7 +2067,7 @@ function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
     V_rcv = cache_rcv.V_rcv
     parts_snd = cache_snd.parts_snd
     parts_rcv = cache_rcv.parts_rcv
-    cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
+    cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
     values,cache
 end
 
@@ -2127,13 +2128,10 @@ function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache)
         end
     end
     function setup_rcv(B,cache)
-        is_ghost = cache.is_ghost
-        is_own = cache.is_own
-        V_rcv_data = cache.V_rcv.data
         K_own = cache.K_own
         K_ghost = cache.K_ghost
-        V_rcv_own = V_rcv_data[is_own]
-        V_rcv_ghost = V_rcv_data[is_ghost]
+        V_rcv_own = cache.V_rcv_own
+        V_rcv_ghost = cache.V_rcv_ghost
         setcoofast!(B.blocks.ghost_own,V_rcv_own,K_own)
         setcoofast!(B.blocks.ghost_ghost,V_rcv_ghost,K_ghost)
         B

From a4e1960931aab50d8a22660fa4012f6ff5e25c5a Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 15 Aug 2024 18:28:45 +0200
Subject: [PATCH 04/34] moving some functions back to inner scope

---
 src/p_sparse_matrix.jl | 827 +++++++++++++++++------------------------
 1 file changed, 345 insertions(+), 482 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index a2d12e44..b315e2af 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1528,177 +1528,187 @@ end
 # New assemble
 ####################
 
-function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
-    A_ghost_own   = A.blocks.ghost_own
-    A_ghost_ghost = A.blocks.ghost_ghost
-    gen = ( owner=>i for (i,owner) in enumerate(parts_snd) )
-    owner_to_p = Dict(gen)
-    ptrs = zeros(Int32,length(parts_snd)+1)
-    ghost_to_owner_row = ghost_to_owner(rows_sa)
-    ghost_to_global_row = ghost_to_global(rows_sa)
-    own_to_global_col = own_to_global(cols_sa)
-    ghost_to_global_col = ghost_to_global(cols_sa)
-    for (i,_,_) in nziterator(A_ghost_own)
-        owner = ghost_to_owner_row[i]
-        ptrs[owner_to_p[owner]+1] += 1
-    end
-    for (i,_,_) in nziterator(A_ghost_ghost)
-        owner = ghost_to_owner_row[i]
-        ptrs[owner_to_p[owner]+1] += 1
-    end
-    length_to_ptrs!(ptrs)
-    Tv = eltype(A_ghost_own)
-    ndata = ptrs[end]-1
-    I_snd_data = zeros(Int,ndata)
-    J_snd_data = zeros(Int,ndata)
-    V_snd_data = zeros(Tv,ndata)
-    k_snd_data = zeros(Int32,ndata)
-    nnz_ghost_own = 0
-    for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own))
-        owner = ghost_to_owner_row[i]
-        p = ptrs[owner_to_p[owner]]
-        I_snd_data[p] = ghost_to_global_row[i]
-        J_snd_data[p] = own_to_global_col[j]
-        V_snd_data[p] = v
-        k_snd_data[p] = k
-        ptrs[owner_to_p[owner]] += 1
-        nnz_ghost_own += 1
-    end
-    for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost))
-        owner = ghost_to_owner_row[i]
-        p = ptrs[owner_to_p[owner]]
-        I_snd_data[p] = ghost_to_global_row[i]
-        J_snd_data[p] = ghost_to_global_col[j]
-        V_snd_data[p] = v
-        k_snd_data[p] = k+nnz_ghost_own
-        ptrs[owner_to_p[owner]] += 1
-    end
-    rewind_ptrs!(ptrs)
-    I_snd = JaggedArray(I_snd_data,ptrs)
-    J_snd = JaggedArray(J_snd_data,ptrs)
-    V_snd = JaggedArray(V_snd_data,ptrs)
-    k_snd = JaggedArray(k_snd_data,ptrs)
-    (;I_snd,J_snd,V_snd,k_snd,parts_snd)
-end
-
-function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv)
-    k_rcv_data = zeros(Int32,length(I_rcv.data))
-    k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs)
-    (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv)
-end
-
-function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa)
-    nz_own_own = findnz(A.blocks.own_own)
-    nz_own_ghost = findnz(A.blocks.own_ghost)
-    I_rcv_data = cache_rcv.I_rcv.data
-    J_rcv_data = cache_rcv.J_rcv.data
-    V_rcv_data = cache_rcv.V_rcv.data
-    k_rcv_data = cache_rcv.k_rcv.data
-    global_to_own_col = global_to_own(cols_sa)
-    is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data)
-    is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
-    I_rcv_own = view(I_rcv_data,is_own)
-    J_rcv_own = view(J_rcv_data,is_own)
-    V_rcv_own = view(V_rcv_data,is_own)
-    k_rcv_own = view(k_rcv_data,is_own)
-    I_rcv_ghost = view(I_rcv_data,is_ghost)
-    J_rcv_ghost = view(J_rcv_data,is_ghost)
-    V_rcv_ghost = view(V_rcv_data,is_ghost)
-    k_rcv_ghost = view(k_rcv_data,is_ghost)
-    # After this col ids in own_ghost triplet remain global
-    map_global_to_own!(I_rcv_own,rows_sa)
-    map_global_to_own!(J_rcv_own,cols_sa)
-    map_global_to_own!(I_rcv_ghost,rows_sa)
-    map_ghost_to_global!(nz_own_ghost[2],cols_sa)
-    own_own_I = vcat(nz_own_own[1],I_rcv_own)
-    own_own_J = vcat(nz_own_own[2],J_rcv_own)
-    own_own_V = vcat(nz_own_own[3],V_rcv_own)
-    own_own_triplet = (own_own_I,own_own_J,own_own_V)
-    own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost)
-    own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost)
-    own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost)
-    map_global_to_ghost!(nz_own_ghost[2],cols_sa)
-    own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V)
-    triplets = (own_own_triplet,own_ghost_triplet)
-    aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost)
-    triplets, own_ghost_J, aux
-end
-
-function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux)
-    (own_own_triplet,own_ghost_triplet) = triplets
-    (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux
-    map_global_to_ghost!(own_ghost_triplet[2],cols_fa)
-    map_global_to_ghost!(J_rcv_ghost,cols_fa)
-    TA = typeof(A.blocks.own_own)
-    n_own_rows = own_length(rows_fa)
-    n_own_cols = own_length(cols_fa)
-    n_ghost_rows = ghost_length(rows_fa)
-    n_ghost_cols = ghost_length(cols_fa)
-    Ti = indextype(A.blocks.own_own)
-    Tv = eltype(A.blocks.own_own)
-    own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
-    own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
-    ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols)
-    ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols)
-    blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-    values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa))
-    nnz_own_own = nnz(own_own)
-    k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
-    k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
-    for p in 1:length(I_rcv_own)
-        i = I_rcv_own[p]
-        j = J_rcv_own[p]
-        k_rcv_own[p] = nzindex(own_own,i,j)
-    end
-    for p in 1:length(I_rcv_ghost)
-        i = I_rcv_ghost[p]
-        j = J_rcv_ghost[p]
-        k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own
-    end
-    cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...)
-    values, cache
-end
-
 function psparse_assemble_impl(
-                            A,
-                            ::Type{<:AbstractSplitMatrix},
-                            rows;
-                            reuse=Val(false),
-                            assembly_neighbors_options_cols=(;))
-
-
-    rows_sa = partition(axes(A,1))
-    cols_sa = partition(axes(A,2))
-    cols = map(remove_ghost,cols_sa)
-    parts_snd, parts_rcv = assembly_neighbors(rows_sa)
-    cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa)
-
-    I_snd = map(i->i.I_snd,cache_snd)
-    J_snd = map(i->i.J_snd,cache_snd)
-    V_snd = map(i->i.V_snd,cache_snd)
-    graph = ExchangeGraph(parts_snd,parts_rcv)
-    t_I = exchange(I_snd,graph)
-    t_J = exchange(J_snd,graph)
-    t_V = exchange(V_snd,graph)
-    @fake_async begin
-        I_rcv = fetch(t_I)
-        J_rcv = fetch(t_J)
-        V_rcv = fetch(t_V)
-        cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv)
-        triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays
-        J_owner = find_owner(cols_sa,J)
-        rows_fa = rows
-        cols_fa = map(union_ghost,cols,J,J_owner)
-        assembly_neighbors(cols_fa;assembly_neighbors_options_cols...)
-        vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays
-        assembled = true
-        B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled)
-        if val_parameter(reuse) == false
-            B
-        else
-            B, cache
+                                A,
+                                ::Type{T},
+                                rows;
+                                reuse=Val(false),
+                                assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix
+                                
+    function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
+        A_ghost_own   = A.blocks.ghost_own
+        A_ghost_ghost = A.blocks.ghost_ghost
+        gen = ( owner=>i for (i,owner) in enumerate(parts_snd) )
+        owner_to_p = Dict(gen)
+        ptrs = zeros(Int32,length(parts_snd)+1)
+        ghost_to_owner_row = ghost_to_owner(rows_sa)
+        ghost_to_global_row = ghost_to_global(rows_sa)
+        own_to_global_col = own_to_global(cols_sa)
+        ghost_to_global_col = ghost_to_global(cols_sa)
+        for (i,_,_) in nziterator(A_ghost_own)
+            owner = ghost_to_owner_row[i]
+            ptrs[owner_to_p[owner]+1] += 1
+        end
+        for (i,_,_) in nziterator(A_ghost_ghost)
+            owner = ghost_to_owner_row[i]
+            ptrs[owner_to_p[owner]+1] += 1
+        end
+        length_to_ptrs!(ptrs)
+        Tv = eltype(A_ghost_own)
+        ndata = ptrs[end]-1
+        I_snd_data = zeros(Int,ndata)
+        J_snd_data = zeros(Int,ndata)
+        V_snd_data = zeros(Tv,ndata)
+        k_snd_data = zeros(Int32,ndata)
+        nnz_ghost_own = 0
+        for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own))
+            owner = ghost_to_owner_row[i]
+            p = ptrs[owner_to_p[owner]]
+            I_snd_data[p] = ghost_to_global_row[i]
+            J_snd_data[p] = own_to_global_col[j]
+            V_snd_data[p] = v
+            k_snd_data[p] = k
+            ptrs[owner_to_p[owner]] += 1
+            nnz_ghost_own += 1
+        end
+        for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost))
+            owner = ghost_to_owner_row[i]
+            p = ptrs[owner_to_p[owner]]
+            I_snd_data[p] = ghost_to_global_row[i]
+            J_snd_data[p] = ghost_to_global_col[j]
+            V_snd_data[p] = v
+            k_snd_data[p] = k+nnz_ghost_own
+            ptrs[owner_to_p[owner]] += 1
+        end
+        rewind_ptrs!(ptrs)
+        I_snd = JaggedArray(I_snd_data,ptrs)
+        J_snd = JaggedArray(J_snd_data,ptrs)
+        V_snd = JaggedArray(V_snd_data,ptrs)
+        k_snd = JaggedArray(k_snd_data,ptrs)
+        (;I_snd,J_snd,V_snd,k_snd,parts_snd)
+    end
+    
+    function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv)
+        k_rcv_data = zeros(Int32,length(I_rcv.data))
+        k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs)
+        (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv)
+    end
+    
+    function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa)
+        nz_own_own = findnz(A.blocks.own_own)
+        nz_own_ghost = findnz(A.blocks.own_ghost)
+        I_rcv_data = cache_rcv.I_rcv.data
+        J_rcv_data = cache_rcv.J_rcv.data
+        V_rcv_data = cache_rcv.V_rcv.data
+        k_rcv_data = cache_rcv.k_rcv.data
+        global_to_own_col = global_to_own(cols_sa)
+        is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data)
+        is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
+        I_rcv_own = view(I_rcv_data,is_own)
+        J_rcv_own = view(J_rcv_data,is_own)
+        V_rcv_own = view(V_rcv_data,is_own)
+        k_rcv_own = view(k_rcv_data,is_own)
+        I_rcv_ghost = view(I_rcv_data,is_ghost)
+        J_rcv_ghost = view(J_rcv_data,is_ghost)
+        V_rcv_ghost = view(V_rcv_data,is_ghost)
+        k_rcv_ghost = view(k_rcv_data,is_ghost)
+        # After this col ids in own_ghost triplet remain global
+        map_global_to_own!(I_rcv_own,rows_sa)
+        map_global_to_own!(J_rcv_own,cols_sa)
+        map_global_to_own!(I_rcv_ghost,rows_sa)
+        map_ghost_to_global!(nz_own_ghost[2],cols_sa)
+        own_own_I = vcat(nz_own_own[1],I_rcv_own)
+        own_own_J = vcat(nz_own_own[2],J_rcv_own)
+        own_own_V = vcat(nz_own_own[3],V_rcv_own)
+        own_own_triplet = (own_own_I,own_own_J,own_own_V)
+        own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost)
+        own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost)
+        own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost)
+        map_global_to_ghost!(nz_own_ghost[2],cols_sa)
+        own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V)
+        triplets = (own_own_triplet,own_ghost_triplet)
+        aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost)
+        triplets, own_ghost_J, aux
+    end
+    
+    function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux)
+        (own_own_triplet,own_ghost_triplet) = triplets
+        (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux
+        map_global_to_ghost!(own_ghost_triplet[2],cols_fa)
+        map_global_to_ghost!(J_rcv_ghost,cols_fa)
+        TA = typeof(A.blocks.own_own)
+        n_own_rows = own_length(rows_fa)
+        n_own_cols = own_length(cols_fa)
+        n_ghost_rows = ghost_length(rows_fa)
+        n_ghost_cols = ghost_length(cols_fa)
+        Ti = indextype(A.blocks.own_own)
+        Tv = eltype(A.blocks.own_own)
+        own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
+        own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
+        ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols)
+        ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols)
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa))
+        nnz_own_own = nnz(own_own)
+        k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
+        k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
+        for p in 1:length(I_rcv_own)
+            i = I_rcv_own[p]
+            j = J_rcv_own[p]
+            k_rcv_own[p] = nzindex(own_own,i,j)
+        end
+        for p in 1:length(I_rcv_ghost)
+            i = I_rcv_ghost[p]
+            j = J_rcv_ghost[p]
+            k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own
+        end
+        cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...)
+        values, cache
+    end
+    
+    function _psparse_assemble_impl(
+                                A,
+                                ::Type{<:AbstractSplitMatrix},
+                                rows;
+                                reuse=Val(false),
+                                assembly_neighbors_options_cols=(;))
+    
+    
+        rows_sa = partition(axes(A,1))
+        cols_sa = partition(axes(A,2))
+        cols = map(remove_ghost,cols_sa)
+        parts_snd, parts_rcv = assembly_neighbors(rows_sa)
+        cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa)
+    
+        I_snd = map(i->i.I_snd,cache_snd)
+        J_snd = map(i->i.J_snd,cache_snd)
+        V_snd = map(i->i.V_snd,cache_snd)
+        graph = ExchangeGraph(parts_snd,parts_rcv)
+        t_I = exchange(I_snd,graph)
+        t_J = exchange(J_snd,graph)
+        t_V = exchange(V_snd,graph)
+        @fake_async begin
+            I_rcv = fetch(t_I)
+            J_rcv = fetch(t_J)
+            V_rcv = fetch(t_V)
+            cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv)
+            triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays
+            J_owner = find_owner(cols_sa,J)
+            rows_fa = rows
+            cols_fa = map(union_ghost,cols,J,J_owner)
+            assembly_neighbors(cols_fa;assembly_neighbors_options_cols...)
+            vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays
+            assembled = true
+            B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled)
+            if val_parameter(reuse) == false
+                B
+            else
+                B, cache
+            end
         end
     end
+
+    _psparse_assemble_impl(A,T,rows;reuse,assembly_neighbors_options_cols)
 end
 
 # End new assemble
@@ -1782,332 +1792,185 @@ function consistent!(B::PSparseMatrix,A::PSparseMatrix,cache)
     psparse_consistent_impl!(B,A,T,cache)
 end
 
-# function psparse_consistent_impl(
-#     A,
-#     ::Type{<:AbstractSplitMatrix},
-#     rows_co;
-#     reuse=Val(false))
-
-#     function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
-#         own_to_local_row = own_to_local(rows_co)
-#         own_to_global_row = own_to_global(rows_co)
-#         own_to_global_col = own_to_global(cols_fa)
-#         ghost_to_global_col = ghost_to_global(cols_fa)
-#         nl = size(A,1)
-#         li_to_ps_ptrs = zeros(Int32,nl+1)
-#         for p in 1:length(lids_snd)
-#             for li in lids_snd[p]
-#                 li_to_ps_ptrs[li+1] += 1
-#             end
-#         end
-#         length_to_ptrs!(li_to_ps_ptrs)
-#         ndata = li_to_ps_ptrs[end]-1
-#         li_to_ps_data = zeros(Int32,ndata)
-#         for p in 1:length(lids_snd)
-#             for li in lids_snd[p]
-#                 q = li_to_ps_ptrs[li]
-#                 li_to_ps_data[q] = p
-#                 li_to_ps_ptrs[li] = q + 1
-#             end
-#         end
-#         rewind_ptrs!(li_to_ps_ptrs)
-#         li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
-#         ptrs = zeros(Int32,length(parts_snd)+1)
-#         for (i,j,v) in nziterator(A.blocks.own_own)
-#             li = own_to_local_row[i]
-#             for p in li_to_ps[li]
-#                 ptrs[p+1] += 1
-#             end
-#         end
-#         for (i,j,v) in nziterator(A.blocks.own_ghost)
-#             li = own_to_local_row[i]
-#             for p in li_to_ps[li]
-#                 ptrs[p+1] += 1
-#             end
-#         end
-#         length_to_ptrs!(ptrs)
-#         ndata = ptrs[end]-1
-#         T = eltype(A)
-#         I_snd = JaggedArray(zeros(Int,ndata),ptrs)
-#         J_snd = JaggedArray(zeros(Int,ndata),ptrs)
-#         V_snd = JaggedArray(zeros(T,ndata),ptrs)
-#         k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
-#         for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
-#             li = own_to_local_row[i]
-#             for p in li_to_ps[li]
-#                 q = ptrs[p]
-#                 I_snd.data[q] = own_to_global_row[i]
-#                 J_snd.data[q] = own_to_global_col[j]
-#                 V_snd.data[q] = v
-#                 k_snd.data[q] = k
-#                 ptrs[p] += 1
-#             end
-#         end
-#         nnz_own_own = nnz(A.blocks.own_own)
-#         for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost))
-#             li = own_to_local_row[i]
-#             for p in li_to_ps[li]
-#                 q = ptrs[p]
-#                 I_snd.data[q] = own_to_global_row[i]
-#                 J_snd.data[q] = ghost_to_global_col[j]
-#                 V_snd.data[q] = v
-#                 k_snd.data[q] = k+nnz_own_own
-#                 ptrs[p] += 1
-#             end
-#         end
-#         rewind_ptrs!(ptrs)
-#         cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd)
-#         cache_snd
-#     end
-#     function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-#         cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-#         cache_rcv
-#     end
-#     function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
-#         I_rcv_data = cache_rcv.I_rcv.data
-#         J_rcv_data = cache_rcv.J_rcv.data
-#         V_rcv_data = cache_rcv.V_rcv.data
-#         global_to_own_col = global_to_own(cols_co)
-#         global_to_ghost_col = global_to_ghost(cols_co)
-#         is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
-#         is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data)
-#         I_rcv_own = I_rcv_data[is_own]
-#         J_rcv_own = J_rcv_data[is_own]
-#         V_rcv_own = V_rcv_data[is_own]
-#         I_rcv_ghost = I_rcv_data[is_ghost]
-#         J_rcv_ghost = J_rcv_data[is_ghost]
-#         V_rcv_ghost = V_rcv_data[is_ghost]
-#         map_global_to_ghost!(I_rcv_own,rows_co)
-#         map_global_to_ghost!(I_rcv_ghost,rows_co)
-#         map_global_to_own!(J_rcv_own,cols_co)
-#         map_global_to_ghost!(J_rcv_ghost,cols_co)
-#         I2,J2,V2 = findnz(A.blocks.own_ghost)
-#         map_ghost_to_global!(J2,cols_fa)
-#         map_global_to_ghost!(J2,cols_co)
-#         n_own_rows = own_length(rows_co)
-#         n_ghost_rows = ghost_length(rows_co)
-#         n_own_cols = own_length(cols_co)
-#         n_ghost_cols = ghost_length(cols_co)
-#         TA = typeof(A.blocks.ghost_own)
-#         own_own = A.blocks.own_own
-#         own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
-#         ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
-#         ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
-#         K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
-#         K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost)
-#         blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-#         values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co))
-#         k_snd = cache_snd.k_snd
-#         V_snd = cache_snd.V_snd
-#         V_rcv = cache_rcv.V_rcv
-#         parts_snd = cache_snd.parts_snd
-#         parts_rcv = cache_rcv.parts_rcv
-#         cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
-#         values,cache
-#     end
-#     @assert matching_own_indices(axes(A,1),PRange(rows_co))
-#     rows_fa = partition(axes(A,1))
-#     cols_fa = partition(axes(A,2))
-#     # snd and rcv are swapped on purpose
-#     parts_rcv,parts_snd = assembly_neighbors(rows_co)
-#     lids_rcv,lids_snd = assembly_local_indices(rows_co)
-#     cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
-#     I_snd = map(i->i.I_snd,cache_snd)
-#     J_snd = map(i->i.J_snd,cache_snd)
-#     V_snd = map(i->i.V_snd,cache_snd)
-#     graph = ExchangeGraph(parts_snd,parts_rcv)
-#     t_I = exchange(I_snd,graph)
-#     t_J = exchange(J_snd,graph)
-#     t_V = exchange(V_snd,graph)
-#     @fake_async begin
-#         I_rcv = fetch(t_I)
-#         J_rcv = fetch(t_J)
-#         V_rcv = fetch(t_V)
-#         J_rcv_data = map(x->x.data,J_rcv)
-#         J_rcv_owner = find_owner(cols_fa,J_rcv_data)
-#         cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner)
-#         cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-#         values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays
-#         B = PSparseMatrix(values,rows_co,cols_co,A.assembled)
-#         if val_parameter(reuse) == false
-#             B
-#         else
-#             B,cache
-#         end
-#     end
-# end
-
 # New consistent
 ####################
-
-function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
-    own_to_local_row::UnitRange{Int32} = own_to_local(rows_co)
-    own_to_global_row = own_to_global(rows_co)
-    own_to_global_col = own_to_global(cols_fa)
-    ghost_to_global_col = ghost_to_global(cols_fa)
-    nl = size(A,1)
-    li_to_ps_ptrs = zeros(Int32,nl+1)
-    for p in 1:length(lids_snd)
-        for li_ptr in jagged_range(lids_snd,p)
-            li = lids_snd.data[li_ptr]
-            li_to_ps_ptrs[li+1] += 1
+function psparse_consistent_impl(
+        A,
+        ::Type{T},
+        rows_co;
+        reuse=Val(false)) where T<:AbstractSplitMatrix
+    function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
+        own_to_local_row::UnitRange{Int32} = own_to_local(rows_co)
+        own_to_global_row = own_to_global(rows_co)
+        own_to_global_col = own_to_global(cols_fa)
+        ghost_to_global_col = ghost_to_global(cols_fa)
+        nl = size(A,1)
+        li_to_ps_ptrs = zeros(Int32,nl+1)
+        for p in 1:length(lids_snd)
+            for li_ptr in jagged_range(lids_snd,p)
+                li = lids_snd.data[li_ptr]
+                li_to_ps_ptrs[li+1] += 1
+            end
         end
-    end
-    length_to_ptrs!(li_to_ps_ptrs)
-    ndata = li_to_ps_ptrs[end]-1
-    li_to_ps_data = zeros(Int32,ndata)
-    for p in 1:length(lids_snd)
-        for li_ptr in jagged_range(lids_snd,p)
-            li = lids_snd.data[li_ptr]
-            q = li_to_ps_ptrs[li]
-            li_to_ps_data[q] = p
-            li_to_ps_ptrs[li] = q + 1
+        length_to_ptrs!(li_to_ps_ptrs)
+        ndata = li_to_ps_ptrs[end]-1
+        li_to_ps_data = zeros(Int32,ndata)
+        for p in 1:length(lids_snd)
+            for li_ptr in jagged_range(lids_snd,p)
+                li = lids_snd.data[li_ptr]
+                q = li_to_ps_ptrs[li]
+                li_to_ps_data[q] = p
+                li_to_ps_ptrs[li] = q + 1
+            end
         end
-    end
-
-    rewind_ptrs!(li_to_ps_ptrs)
-    li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
-    ptrs = zeros(Int32,length(parts_snd)+1)
-    for (i,j,v) in nziterator(A.blocks.own_own)
-        # @show(typeof(own_to_local_row))
-        li = own_to_local_row[i]
-        for li_ptr in jagged_range(li_to_ps,li)
-            p = li_to_ps.data[li_ptr]
-            ptrs[p+1] += 1
+    
+        rewind_ptrs!(li_to_ps_ptrs)
+        li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
+        ptrs = zeros(Int32,length(parts_snd)+1)
+        for (i,j,v) in nziterator(A.blocks.own_own)
+            # @show(typeof(own_to_local_row))
+            li = own_to_local_row[i]
+            for li_ptr in jagged_range(li_to_ps,li)
+                p = li_to_ps.data[li_ptr]
+                ptrs[p+1] += 1
+            end
         end
-    end
-
-    for (i,j,v) in nziterator(A.blocks.own_ghost)
-        li = own_to_local_row[i]
-        for ptr in jagged_range(li_to_ps,li)
-            p=li_to_ps.data[ptr]
-            ptrs[p+1] += 1
+    
+        for (i,j,v) in nziterator(A.blocks.own_ghost)
+            li = own_to_local_row[i]
+            for ptr in jagged_range(li_to_ps,li)
+                p=li_to_ps.data[ptr]
+                ptrs[p+1] += 1
+            end
         end
-    end
-    length_to_ptrs!(ptrs)
-    ndata = ptrs[end]-1
-    T = eltype(A)
-    I_snd = JaggedArray(zeros(Int,ndata),ptrs)
-    J_snd = JaggedArray(zeros(Int,ndata),ptrs)
-    V_snd = JaggedArray(zeros(T,ndata),ptrs)
-    k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
-    for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
-        li = own_to_local_row[i]
-        for p_ptr in jagged_range(li_to_ps,li)
-            p = li_to_ps.data[p_ptr]
-            q = ptrs[p]
-            I_snd.data[q] = own_to_global_row[i]
-            J_snd.data[q] = own_to_global_col[j]
-            V_snd.data[q] = v
-            k_snd.data[q] = k
-            ptrs[p] += 1
+        length_to_ptrs!(ptrs)
+        ndata = ptrs[end]-1
+        T = eltype(A)
+        I_snd = JaggedArray(zeros(Int,ndata),ptrs)
+        J_snd = JaggedArray(zeros(Int,ndata),ptrs)
+        V_snd = JaggedArray(zeros(T,ndata),ptrs)
+        k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
+        for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
+            li = own_to_local_row[i]
+            for p_ptr in jagged_range(li_to_ps,li)
+                p = li_to_ps.data[p_ptr]
+                q = ptrs[p]
+                I_snd.data[q] = own_to_global_row[i]
+                J_snd.data[q] = own_to_global_col[j]
+                V_snd.data[q] = v
+                k_snd.data[q] = k
+                ptrs[p] += 1
+            end
         end
-    end
-
-    nnz_own_own = nnz(A.blocks.own_own)
-    for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost))
-        li = own_to_local_row[i]
-        for p_ptr in jagged_range(li_to_ps,li)
-            p=li_to_ps.data[p_ptr]
-            q = ptrs[p]
-            I_snd.data[q] = own_to_global_row[i]
-            J_snd.data[q] = ghost_to_global_col[j]
-            V_snd.data[q] = v
-            k_snd.data[q] = k+nnz_own_own
-            ptrs[p] += 1
+    
+        nnz_own_own = nnz(A.blocks.own_own)
+        for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost))
+            li = own_to_local_row[i]
+            for p_ptr in jagged_range(li_to_ps,li)
+                p=li_to_ps.data[p_ptr]
+                q = ptrs[p]
+                I_snd.data[q] = own_to_global_row[i]
+                J_snd.data[q] = ghost_to_global_col[j]
+                V_snd.data[q] = v
+                k_snd.data[q] = k+nnz_own_own
+                ptrs[p] += 1
+            end
         end
-    end
-    rewind_ptrs!(ptrs)
-    cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd)
-    cache_snd
-end
-
-function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-    cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-    cache_rcv
-end
-
-function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
-    I_rcv_data = cache_rcv.I_rcv.data
-    J_rcv_data = cache_rcv.J_rcv.data
-    V_rcv_data = cache_rcv.V_rcv.data
-    global_to_own_col = global_to_own(cols_co)
-    global_to_ghost_col = global_to_ghost(cols_co)
-    is_own_condition = k -> global_to_own_col[k]!=0
-    is_own = is_own_condition.(J_rcv_data)
-    I_rcv_own = I_rcv_data[is_own]
-    J_rcv_own = J_rcv_data[is_own]
-    V_rcv_own = V_rcv_data[is_own]
-    is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask
-    I_rcv_ghost = I_rcv_data[is_ghost]
-    J_rcv_ghost = J_rcv_data[is_ghost]
-    V_rcv_ghost = V_rcv_data[is_ghost]
-    map_global_to_ghost!(I_rcv_own,rows_co)
-    map_global_to_ghost!(I_rcv_ghost,rows_co)
-    map_global_to_own!(J_rcv_own,cols_co)
-    map_global_to_ghost!(J_rcv_ghost,cols_co)
-    I2,J2,V2 = findnz(A.blocks.own_ghost)
-    map_ghost_to_global!(J2,cols_fa)
-    map_global_to_ghost!(J2,cols_co)
-    n_own_rows = own_length(rows_co)
-    n_ghost_rows = ghost_length(rows_co)
-    n_own_cols = own_length(cols_co)
-    n_ghost_cols = ghost_length(cols_co)
-    TA = typeof(A.blocks.ghost_own)
-    own_own = A.blocks.own_own
-    own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
-    ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
-    ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
-    K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
-    K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost)
-    blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-    values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co))
-    k_snd = cache_snd.k_snd
-    V_snd = cache_snd.V_snd
-    V_rcv = cache_rcv.V_rcv
-    parts_snd = cache_snd.parts_snd
-    parts_rcv = cache_rcv.parts_rcv
-    cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
-    values,cache
-end
-
-function psparse_consistent_impl(
-                            A,
-                            ::Type{<:AbstractSplitMatrix},
-                            rows_co;
-                            reuse=Val(false))
-    @assert matching_own_indices(axes(A,1),PRange(rows_co))
-    rows_fa = partition(axes(A,1))
-    cols_fa = partition(axes(A,2))
-    # snd and rcv are swapped on purpose
-    parts_rcv,parts_snd = assembly_neighbors(rows_co)
-    lids_rcv,lids_snd = assembly_local_indices(rows_co)
-    cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
-    I_snd = map(i->i.I_snd,cache_snd)
-    J_snd = map(i->i.J_snd,cache_snd)
-    V_snd = map(i->i.V_snd,cache_snd)
-    graph = ExchangeGraph(parts_snd,parts_rcv)
-    t_I = exchange(I_snd,graph)
-    t_J = exchange(J_snd,graph)
-    t_V = exchange(V_snd,graph)
-    @fake_async begin
-        I_rcv = fetch(t_I)
-        J_rcv = fetch(t_J)
-        V_rcv = fetch(t_V)
-        J_rcv_data = map(x->x.data,J_rcv)
-        J_rcv_owner = find_owner(cols_fa,J_rcv_data)
-        cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner)
-        cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
-        values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays
-        B = PSparseMatrix(values,rows_co,cols_co,A.assembled)
-        if val_parameter(reuse) == false
-            B
-        else
-            B,cache
+        rewind_ptrs!(ptrs)
+        cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd)
+        cache_snd
+    end
+        
+    function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+        cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+        cache_rcv
+    end
+    
+    function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co)
+        I_rcv_data = cache_rcv.I_rcv.data
+        J_rcv_data = cache_rcv.J_rcv.data
+        V_rcv_data = cache_rcv.V_rcv.data
+        global_to_own_col = global_to_own(cols_co)
+        global_to_ghost_col = global_to_ghost(cols_co)
+        is_own_condition = k -> global_to_own_col[k]!=0
+        is_own = is_own_condition.(J_rcv_data)
+        I_rcv_own = I_rcv_data[is_own]
+        J_rcv_own = J_rcv_data[is_own]
+        V_rcv_own = V_rcv_data[is_own]
+        is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask
+        I_rcv_ghost = I_rcv_data[is_ghost]
+        J_rcv_ghost = J_rcv_data[is_ghost]
+        V_rcv_ghost = V_rcv_data[is_ghost]
+        map_global_to_ghost!(I_rcv_own,rows_co)
+        map_global_to_ghost!(I_rcv_ghost,rows_co)
+        map_global_to_own!(J_rcv_own,cols_co)
+        map_global_to_ghost!(J_rcv_ghost,cols_co)
+        I2,J2,V2 = findnz(A.blocks.own_ghost)
+        map_ghost_to_global!(J2,cols_fa)
+        map_global_to_ghost!(J2,cols_co)
+        n_own_rows = own_length(rows_co)
+        n_ghost_rows = ghost_length(rows_co)
+        n_own_cols = own_length(cols_co)
+        n_ghost_cols = ghost_length(cols_co)
+        TA = typeof(A.blocks.ghost_own)
+        own_own = A.blocks.own_own
+        own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
+        ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
+        ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
+        K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
+        K_ghost = precompute_nzindex(ghost_ghost,I_rcv_ghost,J_rcv_ghost)
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        values = split_matrix(blocks,local_permutation(rows_co),local_permutation(cols_co))
+        k_snd = cache_snd.k_snd
+        V_snd = cache_snd.V_snd
+        V_rcv = cache_rcv.V_rcv
+        parts_snd = cache_snd.parts_snd
+        parts_rcv = cache_rcv.parts_rcv
+        cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
+        values,cache
+    end
+
+    function _psparse_consistent_impl(
+                                A,
+                                ::Type{<:AbstractSplitMatrix},
+                                rows_co;
+                                reuse=Val(false))
+        @assert matching_own_indices(axes(A,1),PRange(rows_co))
+        rows_fa = partition(axes(A,1))
+        cols_fa = partition(axes(A,2))
+        # snd and rcv are swapped on purpose
+        parts_rcv,parts_snd = assembly_neighbors(rows_co)
+        lids_rcv,lids_snd = assembly_local_indices(rows_co)
+        cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
+        I_snd = map(i->i.I_snd,cache_snd)
+        J_snd = map(i->i.J_snd,cache_snd)
+        V_snd = map(i->i.V_snd,cache_snd)
+        graph = ExchangeGraph(parts_snd,parts_rcv)
+        t_I = exchange(I_snd,graph)
+        t_J = exchange(J_snd,graph)
+        t_V = exchange(V_snd,graph)
+        @fake_async begin
+            I_rcv = fetch(t_I)
+            J_rcv = fetch(t_J)
+            V_rcv = fetch(t_V)
+            J_rcv_data = map(x->x.data,J_rcv)
+            J_rcv_owner = find_owner(cols_fa,J_rcv_data)
+            cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner)
+            cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv)
+            values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays
+            B = PSparseMatrix(values,rows_co,cols_co,A.assembled)
+            if val_parameter(reuse) == false
+                B
+            else
+                B,cache
+            end
         end
     end
+
+    _psparse_consistent_impl(A,T,rows_co;reuse)
 end
 
+
 # End new consistent
 ####################
 

From 7bdb36561704d756b6f65770da198ff414ad4468 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 15 Aug 2024 18:35:48 +0200
Subject: [PATCH 05/34] fix in consistent_impl

---
 src/p_sparse_matrix.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index b315e2af..4c299a94 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1534,7 +1534,7 @@ function psparse_assemble_impl(
                                 rows;
                                 reuse=Val(false),
                                 assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix
-                                
+
     function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
         A_ghost_own   = A.blocks.ghost_own
         A_ghost_ghost = A.blocks.ghost_ghost
@@ -1845,10 +1845,10 @@ function psparse_consistent_impl(
         end
         length_to_ptrs!(ptrs)
         ndata = ptrs[end]-1
-        T = eltype(A)
+        Tv = eltype(A)
         I_snd = JaggedArray(zeros(Int,ndata),ptrs)
         J_snd = JaggedArray(zeros(Int,ndata),ptrs)
-        V_snd = JaggedArray(zeros(T,ndata),ptrs)
+        V_snd = JaggedArray(zeros(Tv,ndata),ptrs)
         k_snd = JaggedArray(zeros(Int32,ndata),ptrs)
         for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own))
             li = own_to_local_row[i]
@@ -1932,7 +1932,7 @@ function psparse_consistent_impl(
 
     function _psparse_consistent_impl(
                                 A,
-                                ::Type{<:AbstractSplitMatrix},
+                                ::T,
                                 rows_co;
                                 reuse=Val(false))
         @assert matching_own_indices(axes(A,1),PRange(rows_co))

From cc8a11de29a9aaa6b1ddcabac8c7ba25fb5d3321 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 15 Aug 2024 18:37:12 +0200
Subject: [PATCH 06/34] fix in consistent_impl

---
 src/p_sparse_matrix.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 4c299a94..367ae1aa 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1668,7 +1668,7 @@ function psparse_assemble_impl(
     
     function _psparse_assemble_impl(
                                 A,
-                                ::Type{<:AbstractSplitMatrix},
+                                ::T,
                                 rows;
                                 reuse=Val(false),
                                 assembly_neighbors_options_cols=(;))

From 851b3b125d7a455eb131cedfb379fc530e63e555 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 15 Aug 2024 18:49:34 +0200
Subject: [PATCH 07/34] fix in assemble_impl

---
 src/p_sparse_matrix.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 367ae1aa..a44d1f0f 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1668,10 +1668,10 @@ function psparse_assemble_impl(
     
     function _psparse_assemble_impl(
                                 A,
-                                ::T,
+                                ::Type{T},
                                 rows;
                                 reuse=Val(false),
-                                assembly_neighbors_options_cols=(;))
+                                assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix
     
     
         rows_sa = partition(axes(A,1))
@@ -1799,6 +1799,7 @@ function psparse_consistent_impl(
         ::Type{T},
         rows_co;
         reuse=Val(false)) where T<:AbstractSplitMatrix
+
     function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
         own_to_local_row::UnitRange{Int32} = own_to_local(rows_co)
         own_to_global_row = own_to_global(rows_co)

From cb2d5ef56852ba1dd4539b47d3c5be159d1c6c52 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 15 Aug 2024 18:52:41 +0200
Subject: [PATCH 08/34] fix in consistent_impl

---
 src/p_sparse_matrix.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index a44d1f0f..3a372245 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1933,9 +1933,9 @@ function psparse_consistent_impl(
 
     function _psparse_consistent_impl(
                                 A,
-                                ::T,
+                                ::Type{T},
                                 rows_co;
-                                reuse=Val(false))
+                                reuse=Val(false)) where T<:AbstractSplitMatrix
         @assert matching_own_indices(axes(A,1),PRange(rows_co))
         rows_fa = partition(axes(A,1))
         cols_fa = partition(axes(A,2))

From b40dcdee042d622a9454f402de71b02e7a59aaf3 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 16 Aug 2024 15:06:18 +0200
Subject: [PATCH 09/34] minor changes+some cleanup

---
 src/p_sparse_matrix.jl | 40 +++++++++++++++-------------------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 3a372245..e1c2ed8d 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1528,12 +1528,11 @@ end
 # New assemble
 ####################
 
-function psparse_assemble_impl(
-                                A,
-                                ::Type{T},
-                                rows;
-                                reuse=Val(false),
-                                assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix
+function psparse_assemble_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
+                               ::Type{T},
+                               rows;
+                               reuse=Val(false),
+                               assembly_neighbors_options_cols=(;)) where {T<:AbstractSplitMatrix, Tv}
 
     function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
         A_ghost_own   = A.blocks.ghost_own
@@ -1554,7 +1553,6 @@ function psparse_assemble_impl(
             ptrs[owner_to_p[owner]+1] += 1
         end
         length_to_ptrs!(ptrs)
-        Tv = eltype(A_ghost_own)
         ndata = ptrs[end]-1
         I_snd_data = zeros(Int,ndata)
         J_snd_data = zeros(Int,ndata)
@@ -1642,7 +1640,6 @@ function psparse_assemble_impl(
         n_ghost_rows = ghost_length(rows_fa)
         n_ghost_cols = ghost_length(cols_fa)
         Ti = indextype(A.blocks.own_own)
-        Tv = eltype(A.blocks.own_own)
         own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
         own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
         ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols)
@@ -1652,12 +1649,12 @@ function psparse_assemble_impl(
         nnz_own_own = nnz(own_own)
         k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
         k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
-        for p in 1:length(I_rcv_own)
+        for p in eachindex(I_rcv_own)
             i = I_rcv_own[p]
             j = J_rcv_own[p]
             k_rcv_own[p] = nzindex(own_own,i,j)
         end
-        for p in 1:length(I_rcv_ghost)
+        for p in eachindex(I_rcv_ghost)
             i = I_rcv_ghost[p]
             j = J_rcv_ghost[p]
             k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own
@@ -1687,7 +1684,7 @@ function psparse_assemble_impl(
         t_I = exchange(I_snd,graph)
         t_J = exchange(J_snd,graph)
         t_V = exchange(V_snd,graph)
-        @fake_async begin
+        @sync begin
             I_rcv = fetch(t_I)
             J_rcv = fetch(t_J)
             V_rcv = fetch(t_V)
@@ -1794,14 +1791,13 @@ end
 
 # New consistent
 ####################
-function psparse_consistent_impl(
-        A,
-        ::Type{T},
-        rows_co;
-        reuse=Val(false)) where T<:AbstractSplitMatrix
+function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
+                                 ::Type{T},
+                                 rows_co;
+                                 reuse=Val(false)) where {T<:AbstractSplitMatrix, Tv}
 
     function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa)
-        own_to_local_row::UnitRange{Int32} = own_to_local(rows_co)
+        own_to_local_row = own_to_local(rows_co)
         own_to_global_row = own_to_global(rows_co)
         own_to_global_col = own_to_global(cols_fa)
         ghost_to_global_col = ghost_to_global(cols_fa)
@@ -1829,7 +1825,6 @@ function psparse_consistent_impl(
         li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs)
         ptrs = zeros(Int32,length(parts_snd)+1)
         for (i,j,v) in nziterator(A.blocks.own_own)
-            # @show(typeof(own_to_local_row))
             li = own_to_local_row[i]
             for li_ptr in jagged_range(li_to_ps,li)
                 p = li_to_ps.data[li_ptr]
@@ -1846,7 +1841,6 @@ function psparse_consistent_impl(
         end
         length_to_ptrs!(ptrs)
         ndata = ptrs[end]-1
-        Tv = eltype(A)
         I_snd = JaggedArray(zeros(Int,ndata),ptrs)
         J_snd = JaggedArray(zeros(Int,ndata),ptrs)
         V_snd = JaggedArray(zeros(Tv,ndata),ptrs)
@@ -1892,7 +1886,7 @@ function psparse_consistent_impl(
         J_rcv_data = cache_rcv.J_rcv.data
         V_rcv_data = cache_rcv.V_rcv.data
         global_to_own_col = global_to_own(cols_co)
-        global_to_ghost_col = global_to_ghost(cols_co)
+        # global_to_ghost_col = global_to_ghost(cols_co)
         is_own_condition = k -> global_to_own_col[k]!=0
         is_own = is_own_condition.(J_rcv_data)
         I_rcv_own = I_rcv_data[is_own]
@@ -1937,7 +1931,6 @@ function psparse_consistent_impl(
                                 rows_co;
                                 reuse=Val(false)) where T<:AbstractSplitMatrix
         @assert matching_own_indices(axes(A,1),PRange(rows_co))
-        rows_fa = partition(axes(A,1))
         cols_fa = partition(axes(A,2))
         # snd and rcv are swapped on purpose
         parts_rcv,parts_snd = assembly_neighbors(rows_co)
@@ -1950,7 +1943,7 @@ function psparse_consistent_impl(
         t_I = exchange(I_snd,graph)
         t_J = exchange(J_snd,graph)
         t_V = exchange(V_snd,graph)
-        @fake_async begin
+        @sync begin
             I_rcv = fetch(t_I)
             J_rcv = fetch(t_J)
             V_rcv = fetch(t_V)
@@ -1967,11 +1960,8 @@ function psparse_consistent_impl(
             end
         end
     end
-
     _psparse_consistent_impl(A,T,rows_co;reuse)
 end
-
-
 # End new consistent
 ####################
 

From 315999cfb8c60d54d08af034b3e727da775828b0 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 16 Aug 2024 15:23:45 +0200
Subject: [PATCH 10/34] fixed leftover debug setting

---
 src/p_sparse_matrix.jl | 4 ++--
 times.txt              | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 times.txt

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index e1c2ed8d..07e43889 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1684,7 +1684,7 @@ function psparse_assemble_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         t_I = exchange(I_snd,graph)
         t_J = exchange(J_snd,graph)
         t_V = exchange(V_snd,graph)
-        @sync begin
+        @fake_async begin
             I_rcv = fetch(t_I)
             J_rcv = fetch(t_J)
             V_rcv = fetch(t_V)
@@ -1943,7 +1943,7 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         t_I = exchange(I_snd,graph)
         t_J = exchange(J_snd,graph)
         t_V = exchange(V_snd,graph)
-        @sync begin
+        @fake_async begin
             I_rcv = fetch(t_I)
             J_rcv = fetch(t_J)
             V_rcv = fetch(t_V)
diff --git a/times.txt b/times.txt
new file mode 100644
index 00000000..cc438bc4
--- /dev/null
+++ b/times.txt
@@ -0,0 +1 @@
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2010888, max = 0.2010888, avg = 0.2010888), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4020026, max = 0.4020026, avg = 0.4020026), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7))

From 395fa618007a3ed3177ef57ae3b2b5719b7873f0 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 16 Aug 2024 19:46:26 +0200
Subject: [PATCH 11/34] fixed bug in updated psparse_consistent_impl!

---
 src/p_sparse_matrix.jl        | 13 +++++++++----
 test/p_sparse_matrix_tests.jl |  1 -
 test/runtests.jl              |  2 +-
 times.txt                     |  2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 07e43889..b8af2da2 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1889,10 +1889,10 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         # global_to_ghost_col = global_to_ghost(cols_co)
         is_own_condition = k -> global_to_own_col[k]!=0
         is_own = is_own_condition.(J_rcv_data)
+        is_ghost = map(!,is_own) # inverse is_own bitvector to effectively represent is_ghost mask
         I_rcv_own = I_rcv_data[is_own]
         J_rcv_own = J_rcv_data[is_own]
         V_rcv_own = V_rcv_data[is_own]
-        is_ghost = map!(!,is_own, is_own) # inverse is_own bitvector to effectively represent is_ghost mask
         I_rcv_ghost = I_rcv_data[is_ghost]
         J_rcv_ghost = J_rcv_data[is_ghost]
         V_rcv_ghost = V_rcv_data[is_ghost]
@@ -1921,7 +1921,7 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         V_rcv = cache_rcv.V_rcv
         parts_snd = cache_snd.parts_snd
         parts_rcv = cache_rcv.parts_rcv
-        cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
+        cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_own,is_ghost,V_rcv_own,V_rcv_ghost,K_own,K_ghost)
         values,cache
     end
 
@@ -1962,6 +1962,7 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
     end
     _psparse_consistent_impl(A,T,rows_co;reuse)
 end
+
 # End new consistent
 ####################
 
@@ -1982,10 +1983,14 @@ function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache)
         end
     end
     function setup_rcv(B,cache)
+        is_own = cache.is_own
+        is_ghost = cache.is_ghost
+        V_rcv_data = cache.V_rcv.data
         K_own = cache.K_own
         K_ghost = cache.K_ghost
-        V_rcv_own = cache.V_rcv_own
-        V_rcv_ghost = cache.V_rcv_ghost
+        # Allocates memory, while cache.V_rcv_own/ghost could be reused.
+        V_rcv_own = V_rcv_data[is_own]
+        V_rcv_ghost = V_rcv_data[is_ghost]  
         setcoofast!(B.blocks.ghost_own,V_rcv_own,K_own)
         setcoofast!(B.blocks.ghost_ghost,V_rcv_ghost,K_ghost)
         B
diff --git a/test/p_sparse_matrix_tests.jl b/test/p_sparse_matrix_tests.jl
index 5ed4c903..08b24574 100644
--- a/test/p_sparse_matrix_tests.jl
+++ b/test/p_sparse_matrix_tests.jl
@@ -378,7 +378,6 @@ function p_sparse_matrix_tests(distribute)
     A_seq = centralize(A)
     spmm!(B,Z,A,cacheB)
     @test centralize(B) ≈ Z_seq*(A_seq)
-
     B = transpose(Z)*A
     @test centralize(B) ≈ transpose(Z_seq)*A_seq
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 92768453..ed7aff49 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,6 +5,6 @@ using Test
 @testset "jagged_array" begin include("jagged_array_tests.jl") end
 @testset "sparse_utils" begin include("sparse_utils_tests.jl") end
 @testset "debug_array" begin include("debug_array/runtests.jl") end
-@testset "mpi_array" begin include("mpi_array/runtests.jl") end
+# @testset "mpi_array" begin include("mpi_array/runtests.jl") end
 
 end # module
diff --git a/times.txt b/times.txt
index cc438bc4..7db5927d 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2010888, max = 0.2010888, avg = 0.2010888), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4020026, max = 0.4020026, avg = 0.4020026), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2188811, max = 0.2188811, avg = 0.2188811), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4098738, max = 0.4098738, avg = 0.4098738), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7))

From c04ce81a22dbfe36c92d78d32be0374b74826ddd Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 5 Sep 2024 18:07:34 +0200
Subject: [PATCH 12/34] reset to earlier state

---
 src/p_sparse_matrix.jl | 167 -----------------------------------------
 times.txt              |   2 +-
 2 files changed, 1 insertion(+), 168 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index b8af2da2..5018965e 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1357,173 +1357,6 @@ function psparse_assemble_impl(A,::Type,rows)
     error("Case not implemented yet")
 end
 
-# function psparse_assemble_impl(
-#         A,
-#         ::Type{<:AbstractSplitMatrix},
-#         rows;
-#         reuse=Val(false),
-#         assembly_neighbors_options_cols=(;))
-
-#     function setup_cache_snd(A,parts_snd,rows_sa,cols_sa)
-#         A_ghost_own   = A.blocks.ghost_own
-#         A_ghost_ghost = A.blocks.ghost_ghost
-#         gen = ( owner=>i for (i,owner) in enumerate(parts_snd) )
-#         owner_to_p = Dict(gen)
-#         ptrs = zeros(Int32,length(parts_snd)+1)
-#         ghost_to_owner_row = ghost_to_owner(rows_sa)
-#         ghost_to_global_row = ghost_to_global(rows_sa)
-#         own_to_global_col = own_to_global(cols_sa)
-#         ghost_to_global_col = ghost_to_global(cols_sa)
-#         for (i,_,_) in nziterator(A_ghost_own)
-#             owner = ghost_to_owner_row[i]
-#             ptrs[owner_to_p[owner]+1] += 1
-#         end
-#         for (i,_,_) in nziterator(A_ghost_ghost)
-#             owner = ghost_to_owner_row[i]
-#             ptrs[owner_to_p[owner]+1] += 1
-#         end
-#         length_to_ptrs!(ptrs)
-#         Tv = eltype(A_ghost_own)
-#         ndata = ptrs[end]-1
-#         I_snd_data = zeros(Int,ndata)
-#         J_snd_data = zeros(Int,ndata)
-#         V_snd_data = zeros(Tv,ndata)
-#         k_snd_data = zeros(Int32,ndata)
-#         nnz_ghost_own = 0
-#         for (k,(i,j,v)) in enumerate(nziterator(A_ghost_own))
-#             owner = ghost_to_owner_row[i]
-#             p = ptrs[owner_to_p[owner]]
-#             I_snd_data[p] = ghost_to_global_row[i]
-#             J_snd_data[p] = own_to_global_col[j]
-#             V_snd_data[p] = v
-#             k_snd_data[p] = k
-#             ptrs[owner_to_p[owner]] += 1
-#             nnz_ghost_own += 1
-#         end
-#         for (k,(i,j,v)) in enumerate(nziterator(A_ghost_ghost))
-#             owner = ghost_to_owner_row[i]
-#             p = ptrs[owner_to_p[owner]]
-#             I_snd_data[p] = ghost_to_global_row[i]
-#             J_snd_data[p] = ghost_to_global_col[j]
-#             V_snd_data[p] = v
-#             k_snd_data[p] = k+nnz_ghost_own
-#             ptrs[owner_to_p[owner]] += 1
-#         end
-#         rewind_ptrs!(ptrs)
-#         I_snd = JaggedArray(I_snd_data,ptrs)
-#         J_snd = JaggedArray(J_snd_data,ptrs)
-#         V_snd = JaggedArray(V_snd_data,ptrs)
-#         k_snd = JaggedArray(k_snd_data,ptrs)
-#         (;I_snd,J_snd,V_snd,k_snd,parts_snd)
-#     end
-#     function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv)
-#         k_rcv_data = zeros(Int32,length(I_rcv.data))
-#         k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs)
-#         (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv)
-#     end
-#     function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa)
-#         nz_own_own = findnz(A.blocks.own_own)
-#         nz_own_ghost = findnz(A.blocks.own_ghost)
-#         I_rcv_data = cache_rcv.I_rcv.data
-#         J_rcv_data = cache_rcv.J_rcv.data
-#         V_rcv_data = cache_rcv.V_rcv.data
-#         k_rcv_data = cache_rcv.k_rcv.data
-#         global_to_own_col = global_to_own(cols_sa)
-#         is_ghost = findall(j->global_to_own_col[j]==0,J_rcv_data)
-#         is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data)
-#         I_rcv_own = view(I_rcv_data,is_own)
-#         J_rcv_own = view(J_rcv_data,is_own)
-#         V_rcv_own = view(V_rcv_data,is_own)
-#         k_rcv_own = view(k_rcv_data,is_own)
-#         I_rcv_ghost = view(I_rcv_data,is_ghost)
-#         J_rcv_ghost = view(J_rcv_data,is_ghost)
-#         V_rcv_ghost = view(V_rcv_data,is_ghost)
-#         k_rcv_ghost = view(k_rcv_data,is_ghost)
-#         # After this col ids in own_ghost triplet remain global
-#         map_global_to_own!(I_rcv_own,rows_sa)
-#         map_global_to_own!(J_rcv_own,cols_sa)
-#         map_global_to_own!(I_rcv_ghost,rows_sa)
-#         map_ghost_to_global!(nz_own_ghost[2],cols_sa)
-#         own_own_I = vcat(nz_own_own[1],I_rcv_own)
-#         own_own_J = vcat(nz_own_own[2],J_rcv_own)
-#         own_own_V = vcat(nz_own_own[3],V_rcv_own)
-#         own_own_triplet = (own_own_I,own_own_J,own_own_V)
-#         own_ghost_I = vcat(nz_own_ghost[1],I_rcv_ghost)
-#         own_ghost_J = vcat(nz_own_ghost[2],J_rcv_ghost)
-#         own_ghost_V = vcat(nz_own_ghost[3],V_rcv_ghost)
-#         map_global_to_ghost!(nz_own_ghost[2],cols_sa)
-#         own_ghost_triplet = (own_ghost_I,own_ghost_J,own_ghost_V)
-#         triplets = (own_own_triplet,own_ghost_triplet)
-#         aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost)
-#         triplets, own_ghost_J, aux
-#     end
-#     function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux)
-#         (own_own_triplet,own_ghost_triplet) = triplets
-#         (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux
-#         map_global_to_ghost!(own_ghost_triplet[2],cols_fa)
-#         map_global_to_ghost!(J_rcv_ghost,cols_fa)
-#         TA = typeof(A.blocks.own_own)
-#         n_own_rows = own_length(rows_fa)
-#         n_own_cols = own_length(cols_fa)
-#         n_ghost_rows = ghost_length(rows_fa)
-#         n_ghost_cols = ghost_length(cols_fa)
-#         Ti = indextype(A.blocks.own_own)
-#         Tv = eltype(A.blocks.own_own)
-#         own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols)
-#         own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols)
-#         ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols)
-#         ghost_ghost = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_ghost_cols)
-#         blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-#         values = split_matrix(blocks,local_permutation(rows_fa),local_permutation(cols_fa))
-#         nnz_own_own = nnz(own_own)
-#         k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...)
-#         k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...)
-#         for p in 1:length(I_rcv_own)
-#             i = I_rcv_own[p]
-#             j = J_rcv_own[p]
-#             k_rcv_own[p] = nzindex(own_own,i,j)
-#         end
-#         for p in 1:length(I_rcv_ghost)
-#             i = I_rcv_ghost[p]
-#             j = J_rcv_ghost[p]
-#             k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own
-#         end
-#         cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...)
-#         values, cache
-#     end
-#     rows_sa = partition(axes(A,1))
-#     cols_sa = partition(axes(A,2))
-#     #rows = map(remove_ghost,rows_sa)
-#     cols = map(remove_ghost,cols_sa)
-#     parts_snd, parts_rcv = assembly_neighbors(rows_sa)
-#     cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa)
-#     I_snd = map(i->i.I_snd,cache_snd)
-#     J_snd = map(i->i.J_snd,cache_snd)
-#     V_snd = map(i->i.V_snd,cache_snd)
-#     graph = ExchangeGraph(parts_snd,parts_rcv)
-#     t_I = exchange(I_snd,graph)
-#     t_J = exchange(J_snd,graph)
-#     t_V = exchange(V_snd,graph)
-#     @fake_async begin
-#         I_rcv = fetch(t_I)
-#         J_rcv = fetch(t_J)
-#         V_rcv = fetch(t_V)
-#         cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv)
-#         triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays
-#         J_owner = find_owner(cols_sa,J)
-#         rows_fa = rows
-#         cols_fa = map(union_ghost,cols,J,J_owner)
-#         assembly_neighbors(cols_fa;assembly_neighbors_options_cols...)
-#         vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays
-#         assembled = true
-#         B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled)
-#         if val_parameter(reuse) == false
-#             B
-#         else
-#             B, cache
-#         end
-#     end
-# end
 
 # New assemble
 ####################
diff --git a/times.txt b/times.txt
index 7db5927d..24e91690 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2188811, max = 0.2188811, avg = 0.2188811), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4098738, max = 0.4098738, avg = 0.4098738), "Phase 1" => (min = 5.0e-7, max = 5.0e-7, avg = 5.0e-7))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2146177, max = 0.2146177, avg = 0.2146177), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4019553, max = 0.4019553, avg = 0.4019553), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6))

From a458fcc43a84d1ca62f84a4cce0f3ba76faa5cc6 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 9 Sep 2024 16:32:22 +0200
Subject: [PATCH 13/34] fixed mistake in PSparseMatrix documentation
 (fieldnames/types), added versions of repartition(A,rows,cols) and centralize
 that support non-default sparse method.

---
 src/p_sparse_matrix.jl | 50 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 5018965e..62b6da5d 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -948,9 +948,9 @@ parallel implementations.
 
 # Properties
 
-- `matrix_partition::A`
-- `row_partition::B`
-- `col_partition::C`
+- `matrix_partition::B`
+- `row_partition::C`
+- `col_partition::D`
 - `assembled::Bool`
 
 `matrix_partition[i]` contains a (sparse) matrix with the local rows and the
@@ -964,7 +964,7 @@ is fully contained in the own rows.
 
 # Supertype hierarchy
 
-    PSparseMatrix{V,A,B,C,T} <: AbstractMatrix{T}
+    PSparseMatrix{V,B,C,D,T} <: AbstractMatrix{T}
 
 with `T=eltype(V)`.
 """
@@ -2212,6 +2212,39 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
     end
 end
 
+function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
+    @assert A.assembled "repartition on a sub-assembled matrix not implemented yet"
+    function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols)
+        I1,J1,V1 = findnz(A_own_own)
+        I2,J2,V2 = findnz(A_own_ghost)
+        map_own_to_global!(I1,A_rows)
+        map_own_to_global!(I2,A_rows)
+        map_own_to_global!(J1,A_cols)
+        map_ghost_to_global!(J2,A_cols)
+        I = vcat(I1,I2)
+        J = vcat(J1,J2)
+        V = vcat(V1,V2)
+        (I,J,V)
+    end
+    A_own_own = own_own_values(A)
+    A_own_ghost = own_ghost_values(A)
+    A_rows = partition(axes(A,1))
+    A_cols = partition(axes(A,2))
+    I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays
+    # TODO this one does not preserve the local storage layout of A
+    t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true)
+    @fake_async begin
+        B,cacheB = fetch(t)
+        if val_parameter(reuse) == false
+            B
+        else
+            cache = (V,cacheB)
+            B, cache
+        end
+    end
+end
+
+
 """
     repartition!(B::PSparseMatrix,A::PSparseMatrix,cache)
 """
@@ -2281,6 +2314,15 @@ function centralize(A::PSparseMatrix)
     own_own_values(a_in_main) |> multicast |> getany
 end
 
+function centralize(sparse,A::PSparseMatrix)
+    m,n = size(A)
+    ranks = linear_indices(partition(A))
+    rows_trivial = trivial_partition(ranks,m)
+    cols_trivial = trivial_partition(ranks,n)
+    a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch
+    own_own_values(a_in_main) |> multicast |> getany
+end
+
 """
     psystem(I,J,V,I2,V2,rows,cols;kwargs...)
 """

From 0d4c06f36bcd3b399181b39374f4824f5b03938a Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 21 Oct 2024 10:49:50 +0200
Subject: [PATCH 14/34] added versions of sparse_diag_matrix with custom matrix
 type option, improved consistent further by reusing own_ghost_block as much as
 possible through new sparse utils expand_sparse_matrix_columns

---
 src/gallery.jl         | 162 +++++++++++++++++++++++++++++++++++++++++
 src/p_range.jl         |   4 +-
 src/p_sparse_matrix.jl |  40 +++++++---
 src/sparse_utils.jl    |  17 +++++
 times.txt              |   2 +-
 5 files changed, 212 insertions(+), 13 deletions(-)

diff --git a/src/gallery.jl b/src/gallery.jl
index f68fec9d..8023c1f5 100644
--- a/src/gallery.jl
+++ b/src/gallery.jl
@@ -553,4 +553,166 @@ function nullspace_linear_elasticity!(B,x)
 end
 
 
+function prolongator(T,
+                     nodes_per_dir,
+                     parts_per_dir,
+                     parts;
+                     index_type::Type{Ti} = Int64,
+                     value_type::Type{Tv} = Float64) where {Ti,Tv}
+    # Improved version of aggregate function not using inefficient direct JaggedRange indexing causing many view allocations
+    # Also uses a generic function to obtain pointer arrays and index arrays to work with both CSC and CSR.
+    # If the problem is symmetric neighbours dont change by this.
+    function aggregate(A,diagA=dense_diag(A);epsilon=0)
+        # This one is algorithm 5.1 from
+        # "Algebraic multigrid by smoothed aggregation for second and fourth order elliptic problems"
+        epsi = epsilon
+        typeof_strength = eltype(A.nzval)
+
+        nnodes = size(A,1)
+        pending = Ti(0)
+        isolated = Ti(-1)
+        
+        node_to_aggregate = fill(pending,nnodes)
+        node_to_old_aggregate = similar(node_to_aggregate)
+
+        node_to_neigs = jagged_array(index_array(A),pointer_array(A))
+        neigs = node_to_neigs.data
+        node_to_vals = jagged_array(A.nzval,pointer_array(A))
+        vals = node_to_vals.data
+        strongly_connected = (node,ineig) -> begin
+            neig = neigs[ineig]
+            aii = diagA[node]
+            ajj = diagA[neig]
+            aij = vals[ineig]
+            abs(aij) > epsi*sqrt(aii*ajj)
+        end
+        coupling_strength = (node,ineig) -> begin
+            abs(vals[ineig])
+        end
+
+        # Initialization
+        for node in 1:nnodes
+            neig_range = jagged_range(node_to_neigs,node)
+            isolated_node = count(i->neigs[i]!=node,neig_range) == 0
+            if isolated_node
+                node_to_aggregate[node] = isolated
+            end
+        end
+        # Step 1
+        aggregate = Ti(0)
+        for node in 1:nnodes
+            if node_to_aggregate[node] != pending
+                continue
+            end
+            neig_range = jagged_range(node_to_neigs,node)
+            all_pending = true
+            for ineig in neig_range
+                neig = neigs[ineig]
+                if neig == node || !strongly_connected(node,ineig)
+                    continue
+                end
+                all_pending &= (node_to_aggregate[neig] == pending)
+            end
+            if !all_pending
+                continue
+            end
+            aggregate += Ti(1)
+            node_to_aggregate[node] = aggregate
+            for ineig in neig_range
+                neig = neigs[ineig]
+                if neig == node || !strongly_connected(node,ineig)
+                    continue
+                end
+                node_to_aggregate[neig] = aggregate
+            end
+        end
+        # Step 2
+        copy!(node_to_old_aggregate,node_to_aggregate)
+        for node in 1:nnodes
+            if node_to_aggregate[node] != pending
+                continue
+            end
+            strength = zero(typeof_strength)
+            neig_range = jagged_range(node_to_neigs, node)
+            for ineig in neig_range
+                neig = neigs[ineig]
+                if neig == node || !strongly_connected(node,ineig)
+                    continue
+                end
+                neig_aggregate = node_to_old_aggregate[neig]
+                if neig_aggregate != pending && neig_aggregate != isolated
+                    neig_strength = coupling_strength(node,ineig)
+                    if neig_strength > strength
+                        strength = neig_strength
+                        node_to_aggregate[node] = neig_aggregate
+                    end
+                end
+            end
+        end
+
+        # Step 3
+        for node in 1:nnodes
+            if node_to_aggregate[node] != pending
+                continue
+            end
+            aggregate += Ti(1)
+            node_to_aggregate[node] = aggregate
+            # neigs = node_to_neigs[node]
+            neig_range = jagged_range(node_to_neigs, node)
+            for ineig in neig_range
+                neig = neigs[ineig]
+                if neig == node || !strongly_connected(node,ineig)
+                    continue
+                end
+                neig_aggregate = node_to_old_aggregate[neig]
+                if neig_aggregate == pending || neig_aggregate == isolated
+                    node_to_aggregate[neig] = aggregate
+                end
+            end
+        end
+        naggregates = aggregate
+
+        if nnodes == 1
+            node_to_aggregate .= 1
+            naggregates = 1
+        end
+        node_to_aggregate, 1:naggregates
+    end
+
+    function aggregate(A::PSparseMatrix,diagA=dense_diag(A);kwargs...)
+        # This is the vanilla "uncoupled" strategy from "Parallel Smoothed Aggregation Multigrid : Aggregation Strategies on Massively Parallel Machines"
+        # TODO: implement other more advanced strategies
+        @assert A.assembled
+        node_to_aggregate_data, local_ranges = map((A,diagA)->aggregate(A,diagA;kwargs...),own_own_values(A),own_values(diagA)) |> tuple_of_arrays
+        nown = map(length,local_ranges)
+        n_aggregates = sum(nown)
+        nparts = length(nown)
+        aggregate_partition = variable_partition(nown,n_aggregates)
+        node_partition = partition(axes(A,1))
+        map(map_own_to_global!,node_to_aggregate_data,aggregate_partition)
+        node_to_aggregate = PVector(node_to_aggregate_data,node_partition)
+        node_to_aggregate, PRange(aggregate_partition)
+    end
+
+    function constant_prolongator(T,node_to_aggregate::PVector,aggregates::PRange,n_nullspace_vecs)
+        if n_nullspace_vecs != 1
+            error("case not implemented yet")
+        end
+        function setup_triplets(node_to_aggregate,nodes)
+            myI = UnitRange{Ti}(1:local_length(nodes))
+            myJ = node_to_aggregate
+            myV = ones(length(node_to_aggregate))
+            (myI,myJ,myV)
+        end
+        node_partition = partition(axes(node_to_aggregate,1))
+        I,J,V = map(setup_triplets,partition(node_to_aggregate),node_partition) |> tuple_of_arrays
+        aggregate_partition = partition(aggregates)
+        J_owner = find_owner(aggregate_partition,J)
+        aggregate_partition = map(union_ghost,aggregate_partition,J,J_owner)
+        map(map_global_to_local!,J,aggregate_partition)
+        P0 = psparse(T,I,J,V,node_partition,aggregate_partition;assembled=true,indices=:local) |> fetch
+        P0
+    end
+end
+
 
diff --git a/src/p_range.jl b/src/p_range.jl
index cfa330b0..c72f5c9f 100644
--- a/src/p_range.jl
+++ b/src/p_range.jl
@@ -408,7 +408,7 @@ end
 """
     neigs_snd, neigs_rcv = assembly_neighbors(index_partition;kwargs...)
 
-Return the ids of the neighbor parts from we send and receive data respectively
+Return the ids of the neighbor parts from which we send and receive data respectively
 in the assembly of distributed vectors defined on the index
 partition `index_partition`.
 partition `index_partition`. `kwargs` are delegated to [`ExchangeGraph`](@ref)
@@ -470,7 +470,7 @@ end
 
 function assembly_local_indices(indices,neighbors_snd,neighbors_rcv)
     cache = map(assembly_cache,indices)
-    mask =  map(cache) do mycache
+    mask = map(cache) do mycache
         isassigned(mycache.local_indices_snd) && isassigned(mycache.local_indices_rcv)
     end
     if ! getany(mask)
diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 62b6da5d..f579b4fd 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1733,16 +1733,13 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         map_global_to_ghost!(I_rcv_ghost,rows_co)
         map_global_to_own!(J_rcv_own,cols_co)
         map_global_to_ghost!(J_rcv_ghost,cols_co)
-        I2,J2,V2 = findnz(A.blocks.own_ghost)
-        map_ghost_to_global!(J2,cols_fa)
-        map_global_to_ghost!(J2,cols_co)
-        n_own_rows = own_length(rows_co)
         n_ghost_rows = ghost_length(rows_co)
         n_own_cols = own_length(cols_co)
         n_ghost_cols = ghost_length(cols_co)
         TA = typeof(A.blocks.ghost_own)
         own_own = A.blocks.own_own
-        own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved
+        # New own_ghost shares as much memory with existing own_ghost block as possible. Extent depends on sparse format in use.
+        own_ghost = expand_sparse_matrix_columns(A.blocks.own_ghost,n_ghost_cols) 
         ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
         ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
         K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
@@ -1767,7 +1764,8 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         cols_fa = partition(axes(A,2))
         # snd and rcv are swapped on purpose
         parts_rcv,parts_snd = assembly_neighbors(rows_co)
-        lids_rcv,lids_snd = assembly_local_indices(rows_co)
+        # assembly_neighbors is called again in assembly_local_indices?
+        lids_rcv,lids_snd = assembly_local_indices(rows_co,parts_rcv,parts_snd)
         cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa)
         I_snd = map(i->i.I_snd,cache_snd)
         J_snd = map(i->i.J_snd,cache_snd)
@@ -2021,6 +2019,19 @@ function sparse_diag_matrix(d::PVector,shape)
     psparse(I,J,V,row_partition,col_partition;assembled=true) |> fetch
 end
 
+# Version of sparse_diag_matrix for preserving local matrix type T (when default CSC is not wanted)
+function sparse_diag_matrix(::Type{T},d::PVector,shape) where T
+    row_partition,col_partition = map(partition,shape)
+    function setup(own_d,rows,cols)
+        I = own_to_global(rows) |> collect
+        J = own_to_global(cols) |> collect
+        V = own_d
+        I,J,V
+    end
+    I,J,V = map(setup,own_values(d),row_partition,col_partition) |> tuple_of_arrays
+    psparse(T,I,J,V,row_partition,col_partition;assembled=true) |> fetch
+end
+
 function rap(R,A,P;reuse=Val(false))
     Ac = R*A*P
     if val_parameter(reuse)
@@ -2126,6 +2137,15 @@ function Base.:-(I::LinearAlgebra.UniformScaling,A::PSparseMatrix)
     D-A
 end
 
+# Version of I-A for preserving local matrix type T (when default CSC is not wanted)
+function Base.:-(T,I::LinearAlgebra.UniformScaling,A::PSparseMatrix)
+    Tv = eltype(A)
+    row_partition = partition(axes(A,1))
+    d = pones(Tv,row_partition)
+    D = PartitionedArrays.sparse_diag_matrix(T,d,axes(A))
+    D-A
+end
+
 Base.similar(a::PSparseMatrix) = similar(a,eltype(a))
 function Base.similar(a::PSparseMatrix,::Type{T}) where T
     matrix_partition = map(partition(a)) do values
@@ -2212,7 +2232,7 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
     end
 end
 
-function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
+function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T
     @assert A.assembled "repartition on a sub-assembled matrix not implemented yet"
     function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols)
         I1,J1,V1 = findnz(A_own_own)
@@ -2232,7 +2252,7 @@ function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
     A_cols = partition(axes(A,2))
     I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays
     # TODO this one does not preserve the local storage layout of A
-    t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true)
+    t = psparse(T,I,J,V,new_rows,new_cols;reuse=true)
     @fake_async begin
         B,cacheB = fetch(t)
         if val_parameter(reuse) == false
@@ -2314,12 +2334,12 @@ function centralize(A::PSparseMatrix)
     own_own_values(a_in_main) |> multicast |> getany
 end
 
-function centralize(sparse,A::PSparseMatrix)
+function centralize(::Type{T},A::PSparseMatrix) where T
     m,n = size(A)
     ranks = linear_indices(partition(A))
     rows_trivial = trivial_partition(ranks,m)
     cols_trivial = trivial_partition(ranks,n)
-    a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch
+    a_in_main = repartition(T,A,rows_trivial,cols_trivial) |> fetch
     own_own_values(a_in_main) |> multicast |> getany
 end
 
diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 85bbf01a..57b3b57f 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -680,3 +680,20 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A)
     b
 end
 
+function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Ti,Tv}, n) where {Bi,Tv,Ti}
+    p,q = size(A)
+    @assert n >= q
+    SparseMatrixCSR{Bi,Ti,Tv}(p,n,A.rowptr,A.colval,A.nzval)
+end
+
+function expand_sparse_matrix_columns(A::SparseMatrixCSC{Ti,Tv}, n) where {Tv,Ti}
+    p,q = size(A)
+    @assert n >= q
+    new_colptr = similar(A.colptr,n+1)
+    map!(identity,new_colptr,A.colptr)
+    last_index = A.colptr[end]
+    foreach(q+1:n+1) do i
+        new_colptr[i] = last_index
+    end
+    SparseMatrixCSC{Ti,Tv}(p,n,new_colptr,A.rowval,A.nzval)
+end
\ No newline at end of file
diff --git a/times.txt b/times.txt
index 24e91690..95ffa1dd 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2146177, max = 0.2146177, avg = 0.2146177), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4019553, max = 0.4019553, avg = 0.4019553), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2018849, max = 0.2018849, avg = 0.2018849), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4173665, max = 0.4173665, avg = 0.4173665), "Phase 1" => (min = 1.2e-6, max = 1.2e-6, avg = 1.2e-6))

From 47dea153d449ef3f859fe224236efe9ffcf96c21 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 21 Oct 2024 10:50:52 +0200
Subject: [PATCH 15/34] removed prolongator from gallery.jl

---
 src/gallery.jl | 167 +------------------------------------------------
 1 file changed, 1 insertion(+), 166 deletions(-)

diff --git a/src/gallery.jl b/src/gallery.jl
index 8023c1f5..f9667f0f 100644
--- a/src/gallery.jl
+++ b/src/gallery.jl
@@ -550,169 +550,4 @@ function nullspace_linear_elasticity!(B,x)
         error("case not implemented")
     end
     B
-end
-
-
-function prolongator(T,
-                     nodes_per_dir,
-                     parts_per_dir,
-                     parts;
-                     index_type::Type{Ti} = Int64,
-                     value_type::Type{Tv} = Float64) where {Ti,Tv}
-    # Improved version of aggregate function not using inefficient direct JaggedRange indexing causing many view allocations
-    # Also uses a generic function to obtain pointer arrays and index arrays to work with both CSC and CSR.
-    # If the problem is symmetric neighbours dont change by this.
-    function aggregate(A,diagA=dense_diag(A);epsilon=0)
-        # This one is algorithm 5.1 from
-        # "Algebraic multigrid by smoothed aggregation for second and fourth order elliptic problems"
-        epsi = epsilon
-        typeof_strength = eltype(A.nzval)
-
-        nnodes = size(A,1)
-        pending = Ti(0)
-        isolated = Ti(-1)
-        
-        node_to_aggregate = fill(pending,nnodes)
-        node_to_old_aggregate = similar(node_to_aggregate)
-
-        node_to_neigs = jagged_array(index_array(A),pointer_array(A))
-        neigs = node_to_neigs.data
-        node_to_vals = jagged_array(A.nzval,pointer_array(A))
-        vals = node_to_vals.data
-        strongly_connected = (node,ineig) -> begin
-            neig = neigs[ineig]
-            aii = diagA[node]
-            ajj = diagA[neig]
-            aij = vals[ineig]
-            abs(aij) > epsi*sqrt(aii*ajj)
-        end
-        coupling_strength = (node,ineig) -> begin
-            abs(vals[ineig])
-        end
-
-        # Initialization
-        for node in 1:nnodes
-            neig_range = jagged_range(node_to_neigs,node)
-            isolated_node = count(i->neigs[i]!=node,neig_range) == 0
-            if isolated_node
-                node_to_aggregate[node] = isolated
-            end
-        end
-        # Step 1
-        aggregate = Ti(0)
-        for node in 1:nnodes
-            if node_to_aggregate[node] != pending
-                continue
-            end
-            neig_range = jagged_range(node_to_neigs,node)
-            all_pending = true
-            for ineig in neig_range
-                neig = neigs[ineig]
-                if neig == node || !strongly_connected(node,ineig)
-                    continue
-                end
-                all_pending &= (node_to_aggregate[neig] == pending)
-            end
-            if !all_pending
-                continue
-            end
-            aggregate += Ti(1)
-            node_to_aggregate[node] = aggregate
-            for ineig in neig_range
-                neig = neigs[ineig]
-                if neig == node || !strongly_connected(node,ineig)
-                    continue
-                end
-                node_to_aggregate[neig] = aggregate
-            end
-        end
-        # Step 2
-        copy!(node_to_old_aggregate,node_to_aggregate)
-        for node in 1:nnodes
-            if node_to_aggregate[node] != pending
-                continue
-            end
-            strength = zero(typeof_strength)
-            neig_range = jagged_range(node_to_neigs, node)
-            for ineig in neig_range
-                neig = neigs[ineig]
-                if neig == node || !strongly_connected(node,ineig)
-                    continue
-                end
-                neig_aggregate = node_to_old_aggregate[neig]
-                if neig_aggregate != pending && neig_aggregate != isolated
-                    neig_strength = coupling_strength(node,ineig)
-                    if neig_strength > strength
-                        strength = neig_strength
-                        node_to_aggregate[node] = neig_aggregate
-                    end
-                end
-            end
-        end
-
-        # Step 3
-        for node in 1:nnodes
-            if node_to_aggregate[node] != pending
-                continue
-            end
-            aggregate += Ti(1)
-            node_to_aggregate[node] = aggregate
-            # neigs = node_to_neigs[node]
-            neig_range = jagged_range(node_to_neigs, node)
-            for ineig in neig_range
-                neig = neigs[ineig]
-                if neig == node || !strongly_connected(node,ineig)
-                    continue
-                end
-                neig_aggregate = node_to_old_aggregate[neig]
-                if neig_aggregate == pending || neig_aggregate == isolated
-                    node_to_aggregate[neig] = aggregate
-                end
-            end
-        end
-        naggregates = aggregate
-
-        if nnodes == 1
-            node_to_aggregate .= 1
-            naggregates = 1
-        end
-        node_to_aggregate, 1:naggregates
-    end
-
-    function aggregate(A::PSparseMatrix,diagA=dense_diag(A);kwargs...)
-        # This is the vanilla "uncoupled" strategy from "Parallel Smoothed Aggregation Multigrid : Aggregation Strategies on Massively Parallel Machines"
-        # TODO: implement other more advanced strategies
-        @assert A.assembled
-        node_to_aggregate_data, local_ranges = map((A,diagA)->aggregate(A,diagA;kwargs...),own_own_values(A),own_values(diagA)) |> tuple_of_arrays
-        nown = map(length,local_ranges)
-        n_aggregates = sum(nown)
-        nparts = length(nown)
-        aggregate_partition = variable_partition(nown,n_aggregates)
-        node_partition = partition(axes(A,1))
-        map(map_own_to_global!,node_to_aggregate_data,aggregate_partition)
-        node_to_aggregate = PVector(node_to_aggregate_data,node_partition)
-        node_to_aggregate, PRange(aggregate_partition)
-    end
-
-    function constant_prolongator(T,node_to_aggregate::PVector,aggregates::PRange,n_nullspace_vecs)
-        if n_nullspace_vecs != 1
-            error("case not implemented yet")
-        end
-        function setup_triplets(node_to_aggregate,nodes)
-            myI = UnitRange{Ti}(1:local_length(nodes))
-            myJ = node_to_aggregate
-            myV = ones(length(node_to_aggregate))
-            (myI,myJ,myV)
-        end
-        node_partition = partition(axes(node_to_aggregate,1))
-        I,J,V = map(setup_triplets,partition(node_to_aggregate),node_partition) |> tuple_of_arrays
-        aggregate_partition = partition(aggregates)
-        J_owner = find_owner(aggregate_partition,J)
-        aggregate_partition = map(union_ghost,aggregate_partition,J,J_owner)
-        map(map_global_to_local!,J,aggregate_partition)
-        P0 = psparse(T,I,J,V,node_partition,aggregate_partition;assembled=true,indices=:local) |> fetch
-        P0
-    end
-end
-
-
+end
\ No newline at end of file

From 6079cf7fc9eb1de9f44b394362c0816dd33c3d94 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 21 Oct 2024 11:28:29 +0200
Subject: [PATCH 16/34] added some functions with sparse matrix constructor
 function passed

---
 src/p_sparse_matrix.jl | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index f579b4fd..fcb2cbe6 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2264,6 +2264,37 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals
     end
 end
 
+function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T
+    @assert A.assembled "repartition on a sub-assembled matrix not implemented yet"
+    function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols)
+        I1,J1,V1 = findnz(A_own_own)
+        I2,J2,V2 = findnz(A_own_ghost)
+        map_own_to_global!(I1,A_rows)
+        map_own_to_global!(I2,A_rows)
+        map_own_to_global!(J1,A_cols)
+        map_ghost_to_global!(J2,A_cols)
+        I = vcat(I1,I2)
+        J = vcat(J1,J2)
+        V = vcat(V1,V2)
+        (I,J,V)
+    end
+    A_own_own = own_own_values(A)
+    A_own_ghost = own_ghost_values(A)
+    A_rows = partition(axes(A,1))
+    A_cols = partition(axes(A,2))
+    I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays
+    t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true)
+    @fake_async begin
+        B,cacheB = fetch(t)
+        if val_parameter(reuse) == false
+            B
+        else
+            cache = (V,cacheB)
+            B, cache
+        end
+    end
+end
+
 
 """
     repartition!(B::PSparseMatrix,A::PSparseMatrix,cache)
@@ -2343,6 +2374,15 @@ function centralize(::Type{T},A::PSparseMatrix) where T
     own_own_values(a_in_main) |> multicast |> getany
 end
 
+function centralize(sparse,A::PSparseMatrix) where T
+    m,n = size(A)
+    ranks = linear_indices(partition(A))
+    rows_trivial = trivial_partition(ranks,m)
+    cols_trivial = trivial_partition(ranks,n)
+    a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch
+    own_own_values(a_in_main) |> multicast |> getany
+end
+
 """
     psystem(I,J,V,I2,V2,rows,cols;kwargs...)
 """

From 82c17a9a4ebb8fe8138a3a636cae12e597e296fb Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 21 Oct 2024 11:30:21 +0200
Subject: [PATCH 17/34] fixed problem related to previous commit

---
 src/p_sparse_matrix.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index fcb2cbe6..b69a3456 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2264,7 +2264,7 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals
     end
 end
 
-function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T
+function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
     @assert A.assembled "repartition on a sub-assembled matrix not implemented yet"
     function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols)
         I1,J1,V1 = findnz(A_own_own)
@@ -2374,7 +2374,7 @@ function centralize(::Type{T},A::PSparseMatrix) where T
     own_own_values(a_in_main) |> multicast |> getany
 end
 
-function centralize(sparse,A::PSparseMatrix) where T
+function centralize(sparse,A::PSparseMatrix)
     m,n = size(A)
     ranks = linear_indices(partition(A))
     rows_trivial = trivial_partition(ranks,m)

From 5afff2ea30f3dc7333e9c53c1624f3ce65b98156 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 21 Oct 2024 11:35:51 +0200
Subject: [PATCH 18/34] fixed bug in expand_sparse_matrix_columns functions

---
 src/sparse_utils.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 57b3b57f..4f95f6eb 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -680,13 +680,13 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A)
     b
 end
 
-function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Ti,Tv}, n) where {Bi,Tv,Ti}
+function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti}, n) where {Bi,Tv,Ti}
     p,q = size(A)
     @assert n >= q
-    SparseMatrixCSR{Bi,Ti,Tv}(p,n,A.rowptr,A.colval,A.nzval)
+    SparseMatrixCSR{Bi,Tv,Ti}(p,n,A.rowptr,A.colval,A.nzval)
 end
 
-function expand_sparse_matrix_columns(A::SparseMatrixCSC{Ti,Tv}, n) where {Tv,Ti}
+function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti}
     p,q = size(A)
     @assert n >= q
     new_colptr = similar(A.colptr,n+1)
@@ -695,5 +695,5 @@ function expand_sparse_matrix_columns(A::SparseMatrixCSC{Ti,Tv}, n) where {Tv,Ti
     foreach(q+1:n+1) do i
         new_colptr[i] = last_index
     end
-    SparseMatrixCSC{Ti,Tv}(p,n,new_colptr,A.rowval,A.nzval)
+    SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval)
 end
\ No newline at end of file

From cb3d40ad64635701f89a0ed66167733f9bd6035b Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 21 Oct 2024 11:40:41 +0200
Subject: [PATCH 19/34] fixed another issue with expand_sparse_matrix_columns

---
 src/sparse_utils.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 4f95f6eb..7b0d2b40 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -680,10 +680,10 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A)
     b
 end
 
-function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti}, n) where {Bi,Tv,Ti}
+function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti}, n) where Bi
     p,q = size(A)
     @assert n >= q
-    SparseMatrixCSR{Bi,Tv,Ti}(p,n,A.rowptr,A.colval,A.nzval)
+    SparseMatrixCSR{Bi}(p,n,A.rowptr,A.colval,A.nzval)
 end
 
 function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti}

From da64ff00dcda7374c1bd02322522a2b38007e34d Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Tue, 22 Oct 2024 18:39:57 +0200
Subject: [PATCH 20/34] helper functions for thesis contributions added

---
 src/p_sparse_matrix.jl |   2 +-
 src/sparse_utils.jl    | 227 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 227 insertions(+), 2 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index b69a3456..22d56f3b 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2142,7 +2142,7 @@ function Base.:-(T,I::LinearAlgebra.UniformScaling,A::PSparseMatrix)
     Tv = eltype(A)
     row_partition = partition(axes(A,1))
     d = pones(Tv,row_partition)
-    D = PartitionedArrays.sparse_diag_matrix(T,d,axes(A))
+    D = sparse_diag_matrix(T,d,axes(A))
     D-A
 end
 
diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 7b0d2b40..83f7f2a8 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -431,6 +431,40 @@ end
 #    A
 #end
 
+# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array.
+function findnz_minimal(A::SparseMatrixCSC)
+    J = ptr_to_coo(A.colptr)
+    rowvals(A),J,nonzeros(A)
+end
+function findnz_minimal(A::SparseMatrixCSR)
+    I = ptr_to_coo(A.rowptr)
+    I,colvals(A),nonzeros(A)
+end
+
+# Behaves like findnz, but without copying the values.
+function find_indices(A::SparseMatrixCSC)
+    I,J,_ = findnz_minimal(A)
+    copy(I),J
+end
+function find_indices(A::SparseMatrixCSR)
+    I,J,_ = findnz_minimal(A)
+    I,copy(J)
+end
+
+# Could be optimized by a two-way merge-like method when A is a guaranteed submatrix of C.
+function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray)
+    I,J,_ = findnz_minimal(A)
+    K = similar(I)
+    K .= 0
+    for (p,(i,j)) in enumerate(zip(I,J))
+        if i < 1 || j < 1
+            continue
+        end
+        K[p] = nzindex(C,i,j)
+    end
+    K
+end
+
 function precompute_nzindex(A,I,J)
     K = zeros(Int32,length(I))
     for (p,(i,j)) in enumerate(zip(I,J))
@@ -442,6 +476,17 @@ function precompute_nzindex(A,I,J)
     K
 end
 
+# Reuse I vector as K vector. 
+function precompute_nzindex!(I,A,J)
+    for (p,(i,j)) in enumerate(zip(I,J))
+        if i < 1 || j < 1
+            continue
+        end
+        I[p] = nzindex(A,i,j)
+    end
+    I
+end
+
 function sparse_matrix!(A,V,K;reset=true)
     if reset
         LinearAlgebra.fillstored!(A,0)
@@ -459,7 +504,7 @@ end
 
 # Notation
 # csrr: csr with repeated and unsorted columns
-# csru: csr witu unsorted columns
+# csru: csr with unsorted columns
 # csc: csc with sorted columns
 
 struct SparseMatrixCSRR{Tv,Ti,A}
@@ -696,4 +741,184 @@ function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti
         new_colptr[i] = last_index
     end
     SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval)
+end
+
+function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi
+    SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[])
+end
+
+function Base.similar(A::SparseMatrixCSR{Bi}) where Bi
+    SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A)))
+end
+
+function Base.copy(A::SparseMatrixCSR{Bi}) where Bi
+    SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A)))
+end
+
+function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+    A = At.parent
+    p,q = size(A)
+    Acsc = ascsc(A)
+    Acsc_T = copy(transpose(Acsc)) # materialize SparseMAtrixCSC transpose
+    SparseMatrixCSR{Bi}(q, p, Acsc_T.colptr, rowvals(Acsc_T), nonzeros(Acsc_T))
+end
+
+function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSC)
+    sparsecsr(findnz(A)..., size(A)...)
+end
+
+function SparseMatricesCSR.sparsecsr(At::Transpose)
+    transpose(sparsecsr(At.parent))
+end
+
+function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSR)
+    A
+end
+
+function SparseMatricesCSR.sparsecsr(T::Type, A::SparseMatrixCSC)
+    compresscoo(T,findnz(A)..., size(A)...)
+end
+
+
+function pointer_array(A::SparseMatrixCSR)
+    A.rowptr
+end
+
+function pointer_array(A::SparseMatrixCSC)
+    A.colptr
+end
+
+function index_array(A::SparseMatrixCSR)
+    colvals(A)
+end
+
+function index_array(A::SparseMatrixCSC)
+    rowvals(A)
+end
+
+function ptr_to_coo(ptr_array)
+    K = zeros(Int32, (ptr_array[end]-1))
+    for i in 1:(length(ptr_array)-1)
+        for p in ptr_array[i]:ptr_array[i+1]-1
+            K[p] = i
+        end
+    end
+    K
+end
+
+function find_max_row_length(A::SparseMatrixCSR)
+    max_rA = 0
+    for i in 1:size(A,1)
+        l = length(nzrange(A,i))
+        max_rA = max_rA > l ? max_rA : l
+    end
+    max_rA
+end
+
+function find_max_col_length(A::SparseMatrixCSC)
+    max_cA = 0
+    for j in 1:size(A,2)
+        l = length(nzrange(A,j))
+        max_cA = max_cA > l ? max_cA : l
+    end
+    max_cA
+end
+
+# Lazily convert CSC matrix to CSR matrix, by interpreting columnpointers as row pointers, and colvals as rowvals,
+# effectively transposing it in the process.
+function ascsr(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    p,q = size(A)
+    SparseMatrixCSR{1}(q,p,A.colptr,rowvals(A),nonzeros(A))
+end
+
+# Lazily convert CSR matrix to CSC matrix, by interpreting rowpointers as column pointers, and rowvals as colvals,
+# effectively transposing it in the process.
+function ascsc(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    p,q = size(A)
+    SparseMatrixCSC{Tv,Ti}(q,p,A.rowptr,colvals(A),nonzeros(A))
+end
+
+
+function halfperm(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    q = size(A,2)
+    JA,VA = colvals(A),nonzeros(A)
+    IAt,JAt,VAt = similar(A.rowptr,q+1),similar(JA),similar(VA)
+    halfperm!(IAt,JAt,VAt,A)
+end
+
+# transpose A into At using vectors IAt,JAt, and VAt
+function halfperm!(IAt,JAt,VAt,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    JA,VA = colvals(A),nonzeros(A)
+    p,q = size(A)
+    count_occurrences!(IAt,JA)
+    counts_to_ptrs!(IAt)
+    shift_by_one!(IAt)
+    for i in 1:p
+        for jp in nzrange(A,i)
+            j = JA[jp]
+            jpt = IAt[j+1]
+            JAt[jpt] = i
+            VAt[jpt] = VA[jp]
+            IAt[j+1] = jpt+1
+        end
+    end
+    IAt[1] = 1
+    SparseMatrixCSR{Bi}(q,p,IAt,JAt,VAt)
+end
+
+# retranspose At back into A
+function halfperm!(A::SparseMatrixCSR{Bi,Tv,Ti},At::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    IA,JA,VA = A.rowptr,colvals(A),nonzeros(A)
+    JAt,VAt = colvals(At),nonzeros(At)
+    p,q = size(At)
+    shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc.
+    IA[1] = 1
+    for i in 1:p
+        for jpt in nzrange(At,i)
+            j = JAt[jpt]
+            jp = IA[j+1]
+            JA[jp] = i
+            VA[jp] = VAt[jpt]
+            IA[j+1] = jp+1
+        end
+    end
+    At
+end
+
+function halfperm!(A::SparseMatrixCSC,At::SparseMatrixCSC)
+    halfperm!(ascsr(A),ascsr(At))
+    A
+end
+
+function halfperm(A::SparseMatrixCSC)
+    At = halfperm(ascsr(A))
+    ascsc(At)
+end
+
+function count_occurrences!(v1::AbstractVector{<:Integer},v2::AbstractVector{<:Integer};set_zero=true)
+    if set_zero
+        v1 .= 0
+    end
+    foreach(i->v1[i]+=1,v2)
+    v1
+end
+
+# shift all entries one element to the right in-place. Not circular.
+function shift_by_one!(v)
+    l = length(v)
+    prev = v[1]
+    tmp = prev
+    for i in 1:l-1
+        tmp = v[i+1]
+        v[i+1] = prev
+        prev = tmp
+    end
+end
+
+function counts_to_ptrs!(v)
+    l = length(v)
+    v[1] += 1
+    foreach(i->v[i]+=v[i-1],2:l)
+    shift_by_one!(v)
+    v[1] = 1
 end
\ No newline at end of file

From 2f2e927049868e0fcc6e11c854612548dd7cdc72 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Tue, 22 Oct 2024 19:01:30 +0200
Subject: [PATCH 21/34] removed copy function for SparseMatrixCSR, as it is
 implemented by SparseMatricesCSR now.

---
 src/sparse_utils.jl | 19 ++++++++++---------
 times.txt           |  2 +-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 83f7f2a8..48172218 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -743,24 +743,25 @@ function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti
     SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval)
 end
 
+# Currently not implemented by the SparseMatricesCSR module
 function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi
     SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[])
 end
 
+# Currently not implemented by the SparseMatricesCSR module
 function Base.similar(A::SparseMatrixCSR{Bi}) where Bi
     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A)))
 end
 
-function Base.copy(A::SparseMatrixCSR{Bi}) where Bi
-    SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A)))
-end
+# This method is implemented also by SparseMatricesCSR, but related methods aren't.
+# function Base.copy(A::SparseMatrixCSR{Bi}) where Bi
+#     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A)))
+# end
 
-function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
-    A = At.parent
-    p,q = size(A)
-    Acsc = ascsc(A)
-    Acsc_T = copy(transpose(Acsc)) # materialize SparseMAtrixCSC transpose
-    SparseMatrixCSR{Bi}(q, p, Acsc_T.colptr, rowvals(Acsc_T), nonzeros(Acsc_T))
+# Currently not implemented by the SparseMatricesCSR module
+function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti})
+    Acsc_T = copy(transpose(ascsc(At.parent))) # materialize SparseMatrixCSC transpose
+    ascsr(Acsc_T)
 end
 
 function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSC)
diff --git a/times.txt b/times.txt
index 95ffa1dd..53764f74 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2018849, max = 0.2018849, avg = 0.2018849), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4173665, max = 0.4173665, avg = 0.4173665), "Phase 1" => (min = 1.2e-6, max = 1.2e-6, avg = 1.2e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2068526, max = 0.2068526, avg = 0.2068526), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4054487, max = 0.4054487, avg = 0.4054487), "Phase 1" => (min = 1.1e-6, max = 1.1e-6, avg = 1.1e-6))

From cabad59933a45aed8d70c931d64c7624147f05fc Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 24 Jan 2025 23:30:57 +0100
Subject: [PATCH 22/34] Added new distributed SpMM, SpMtM, SpMMM and SpMtMM
 algorithms with latency hiding to PartitionedArrays. Additional tests for
 these functions have been added. Sparse utils has been extended with some new
 functions required by SpMM, etc. Serial kernels for the methods are provided
 in a new file 'sequential_implementations.jl'.

---
 src/PartitionedArrays.jl          |   13 +
 src/p_sparse_matrix.jl            |  551 +++++++++-
 src/sequential_implementations.jl | 1672 +++++++++++++++++++++++++++++
 src/sparse_utils.jl               |  104 +-
 test/debug_array/runtests.jl      |    2 +
 test/debug_array/spmtmm_tests.jl  |   14 +
 test/mpi_array/runtests.jl        |    1 +
 test/mpi_array/spmtmm_tests.jl    |    4 +
 test/spmtmm_tests.jl              |  216 ++++
 times.txt                         |    2 +-
 10 files changed, 2479 insertions(+), 100 deletions(-)
 create mode 100644 src/sequential_implementations.jl
 create mode 100644 test/debug_array/spmtmm_tests.jl
 create mode 100644 test/mpi_array/spmtmm_tests.jl
 create mode 100644 test/spmtmm_tests.jl

diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl
index 317c1c2a..8505d709 100644
--- a/src/PartitionedArrays.jl
+++ b/src/PartitionedArrays.jl
@@ -170,9 +170,17 @@ export spmv!
 export spmtv!
 export spmm
 export spmm!
+export spmmm
+export spmmm!
 export spmtm
 export spmtm!
+export spmtmm
+export spmtmm!
 export centralize
+export explicit_transpose
+export explicit_transpose!
+export add
+export add!
 include("p_sparse_matrix.jl")
 
 export BRange
@@ -196,4 +204,9 @@ export nullspace_linear_elasticity!
 export near_nullspace_linear_elasticity
 include("gallery.jl")
 
+export RAP
+export RAP!
+export -,+
+include("sequential_implementations.jl")
+
 end # module
diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index c95e9bf9..e663b841 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2262,21 +2262,56 @@ function sparse_diag_matrix(::Type{T},d::PVector,shape) where T
     psparse(T,I,J,V,row_partition,col_partition;assembled=true) |> fetch
 end
 
-function rap(R,A,P;reuse=Val(false))
-    Ac = R*A*P
+### OLD ###
+# function rap(R,A,P;reuse=Val(false))
+#     Ac = R*A*P
+#     if val_parameter(reuse)
+#         return Ac, nothing
+#     end
+#     Ac
+# end
+
+### NEW ###
+function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false))
+    Ac, cache = spmmm(R,A,P)
     if val_parameter(reuse)
-        return Ac, nothing
+        return Ac, cache
     end
     Ac
 end
 
-function rap!(Ac,R,A,P,cache)
-    # TODO improve performance
-    tmp = R*A*P
-    copyto!(Ac,tmp)
+### OLD ###
+# function rap!(Ac,R,A,P,cache)
+#     # TODO improve performance
+#     tmp = R*A*P
+#     copyto!(Ac,tmp)
+#     Ac
+# end
+
+### NEW ###
+function rap!(Ac::PSparseMatrix,R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache)
+    spmmm!(Ac,R,A,P,cache)
     Ac
 end
 
+### NEW ###
+function rap(Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false))
+    spmtmm(Pt.parent,A,P;reuse=reuse)
+end
+
+function rap!(Ac::PSparseMatrix,Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix,cache)
+    spmtmm!(Ac,Pt.parent,A,P,cache)
+end
+
+function rap(A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false))
+    spmtmm(P,A,P;reuse=reuse)
+end
+
+function rap!(Ac::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache)
+    spmtmm!(Ac,A,P,cache)
+end
+### End NEW ###
+
 function spmm(A,B;reuse=Val(false))
     C = A*B
     if val_parameter(reuse)
@@ -2290,28 +2325,82 @@ function spmm!(C,A,B,state)
     C
 end
 
+### OLD ###
+# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+#     # TODO latency hiding
+#     @assert A.assembled
+#     @assert B.assembled
+#     col_partition = partition(axes(A,2))
+#     C,cacheC = consistent(B,col_partition;reuse=true) |> fetch
+#     D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays
+#     assembled = true
+#     D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled)
+#     if val_parameter(reuse)
+#         cache = (C,cacheC,cacheD)
+#         return D,cache
+#     end
+#     D
+# end
+
+# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+#     (C,cacheC,cacheD)= cache
+#     consistent!(C,B,cacheC) |> wait
+#     map(spmm!,partition(D),partition(A),partition(C),cacheD)
+#     D
+# end
+
+### NEW ###
 function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
-    # TODO latency hiding
-    @assert A.assembled
-    @assert B.assembled
-    col_partition = partition(axes(A,2))
-    C,cacheC = consistent(B,col_partition;reuse=true) |> fetch
-    D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays
-    assembled = true
-    D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled)
+    t = consistent(B,partition(axes(A,2)),reuse=true)
+    A_own_own = own_own_values(A)
+    A_own_ghost = own_ghost_values(A)
+
+    C_own_own_1 = map(*,A_own_own,own_own_values(B))
+    
+    # Wait for consistent
+    B2, cacheB2 = fetch(t)
+    C_own_ghost_1 = map(*,A_own_own,own_ghost_values(B2))
+    C_own_own_2 = map(*,A_own_ghost,ghost_own_values(B2))
+    C_own_ghost_2 = map(*,A_own_ghost,ghost_ghost_values(B2))
+    
+    C_own_own = map(+, C_own_own_1, C_own_own_2)
+    C_own_ghost = map(+, C_own_ghost_1, C_own_ghost_2)
+    
+    Coo_cache = map(construct_spmm_cache, C_own_own)
+    Cog_cache = map(construct_spmm_cache, C_own_ghost)
+    
+    C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part
+        ghost_own = similar(own_own,0,size(own_own,2))
+        ghost_ghost = similar(own_own,0,size(own_ghost,2))
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        split_matrix(blocks,A_part.row_permutation,B_part.col_permutation)
+    end
+    
+    C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true)
     if val_parameter(reuse)
-        cache = (C,cacheC,cacheD)
-        return D,cache
+        cache = (B2,cacheB2,(Coo_cache,Cog_cache))
+        return C,cache
     end
-    D
+    C
 end
 
-function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
-    (C,cacheC,cacheD)= cache
-    consistent!(C,B,cacheC) |> wait
-    map(spmm!,partition(D),partition(A),partition(C),cacheD)
-    D
+function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+    (B2,cacheB2,(Coo_cache,Cog_cache)) = cache
+    t = consistent!(B2,B,cacheB2)
+    A_own_own = own_own_values(A)
+    A_own_ghost = own_ghost_values(A)
+    C_own_own = own_own_values(C)
+    C_own_ghost = own_ghost_values(C)
+
+    map(mul!, C_own_own, A_own_own, own_own_values(B),Coo_cache)
+    wait(t)
+    map(mul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache)
+
+    map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
+    map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
+    C
 end
+### End NEW ###
 
 function spmtm(A,B;reuse=Val(false))
     C = transpose(A)*B
@@ -2326,27 +2415,99 @@ function spmtm!(C,A,B,cache)
     C
 end
 
+### OLD ###
+# function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+#     # TODO latency hiding
+#     @assert A.assembled
+#     @assert B.assembled
+#     D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays
+#     assembled = false
+#     D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled)
+#     C,cacheC = assemble(D;reuse=true) |> fetch
+#     if val_parameter(reuse)
+#         cache = (D,cacheC,cacheD)
+#         return C,cache
+#     end
+#     C
+# end
+
+# function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+#     (D,cacheC,cacheD)= cache
+#     map(spmtm!,partition(D),partition(A),partition(B),cacheD)
+#     assemble!(C,D,cacheC) |> wait
+#     C
+# end
+
+### NEW ###
 function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
-    # TODO latency hiding
-    @assert A.assembled
-    @assert B.assembled
-    D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays
+    Aoo = own_own_values(A) # local blocks of A and B entering the transpose(A)*B products below
+    Aog = own_ghost_values(A)
+    Boo = own_own_values(B)
+    Bog = own_ghost_values(B)
+
+    C1go = map((A,B)->transpose(A)*B,Aog,Boo) # products with transpose(Aog) become the ghost-row blocks of C1
+    C1gg = map((A,B)->transpose(A)*B,Aog,Bog)
+
+    C1_values = map(C1go, C1gg, partition(A), partition(B)) do ghost_own, ghost_ghost, A_part, B_part
+        own_own = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_own, 2)) # own-row blocks of C1 are only allocated with matching sizes, no product is stored in them
+        own_ghost = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_ghost, 2))
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        split_matrix(blocks,A_part.col_permutation,B_part.col_permutation)
+    end
+    
     assembled = false
-    D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled)
-    C,cacheC = assemble(D;reuse=true) |> fetch
+    C1_unassembled = PSparseMatrix(C1_values,partition(axes(A,2)),partition(axes(B,2)),assembled)
+    t = assemble(C1_unassembled,reuse=true) # start assembling the ghost rows; the own-row products below are computed before fetch(t)
+
+    C2oo = map((A,B)->transpose(A)*B,Aoo,Boo) # products with transpose(Aoo) become the own-row blocks of C2
+    C2og = map((A,B)->transpose(A)*B,Aoo,Bog)
+
+    C2_values = map(C2oo, C2og, partition(A), partition(B)) do own_own, own_ghost, A_part, B_part
+        ghost_own = similar(own_own,0,size(own_own,2)) # C2 is built with zero ghost rows
+        ghost_ghost = similar(own_own,0,size(own_ghost,2))
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        split_matrix(blocks, A_part.col_permutation, B_part.col_permutation)
+    end
+
+    # The products above return no cache, so one is built per block from the result (workaround).
+    Coo_cache = map(construct_spmtm_cache, C2oo)
+    Cog_cache = map(construct_spmtm_cache, C2og)
+    Cgo_cache = map(construct_spmtm_cache, C1go)
+    Cgg_cache = map(construct_spmtm_cache, C1gg)
+
+    assembled = true
+    C2 = PSparseMatrix(C2_values,partition(axes(A,2)),partition(axes(B,2)),assembled)
+    C1, assemblyCache = fetch(t) # assembled C1 plus the cache later passed to assemble! in spmtm!
+    C, mergeCache = add(C1, C2) # result is the sum of the assembled ghost-row part and the own-row part
+
     if val_parameter(reuse)
-        cache = (D,cacheC,cacheD)
+        sequential_caches = (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache)
+        cache = (C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches) # layout unpacked by spmtm!
         return C,cache
     end
     C
 end
 
 function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
-    (D,cacheC,cacheD)= cache
-    map(spmtm!,partition(D),partition(A),partition(B),cacheD)
-    assemble!(C,D,cacheC) |> wait
+    C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches = cache # same layout as the cache tuple built in spmtm
+    (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) = sequential_caches
+
+    Aoo = own_own_values(A)
+    Aog = own_ghost_values(A)
+    Boo = own_own_values(B)
+    Bog = own_ghost_values(B)
+
+    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache) # ghost-row blocks are recomputed first ...
+    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache)
+        
+    t = assemble!(C1, C1_unassembled, assemblyCache) # ... so that their assembly is started before the own-row products
+    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache)
+    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache)
+    wait(t) # C1 must be assembled before it is merged into C
+    add!(C, C1, C2, mergeCache)
     C
 end
+### End NEW ###
 
 function Base.:*(A::PSparseMatrix,B::PSparseMatrix)
     C = spmm(A,B)
@@ -2462,6 +2623,8 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
     end
 end
 
+### NEW ###
+# Repartition that follows local data layout of type T (some sparse matrix format)
 function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T
     @assert A.assembled "repartition on a sub-assembled matrix not implemented yet"
     function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols)
@@ -2481,7 +2644,7 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals
     A_rows = partition(axes(A,1))
     A_cols = partition(axes(A,2))
     I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays
-    # TODO this one does not preserve the local storage layout of A
+
     t = psparse(T,I,J,V,new_rows,new_cols;reuse=true)
     @fake_async begin
         B,cacheB = fetch(t)
@@ -2494,6 +2657,8 @@ function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(fals
     end
 end
 
+### NEW ###
+# Repartition that follows local data layout by using sparse function "sparse"
 function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false))
     @assert A.assembled "repartition on a sub-assembled matrix not implemented yet"
     function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols)
@@ -2595,6 +2760,8 @@ function centralize(A::PSparseMatrix)
     own_own_values(a_in_main) |> multicast |> getany
 end
 
+### NEW ### 
+# Centralize function with local storage layout of type T (some sparse matrix format)
 function centralize(::Type{T},A::PSparseMatrix) where T
     m,n = size(A)
     ranks = linear_indices(partition(A))
@@ -2604,6 +2771,8 @@ function centralize(::Type{T},A::PSparseMatrix) where T
     own_own_values(a_in_main) |> multicast |> getany
 end
 
+### NEW ### 
+# Centralize function that follows local data layout resulting from "sparse"
 function centralize(sparse,A::PSparseMatrix)
     m,n = size(A)
     ranks = linear_indices(partition(A))
@@ -2849,3 +3018,317 @@ function laplace_matrix(nodes_per_dir,parts_per_dir,ranks)
     I,J,V = map(setup,node_partition) |> tuple_of_arrays
     A = psparse(sparse,I,J,V,node_partition,node_partition) |> fetch
 end
+
+
+################ NEW ################
+
+# Local transpose of a split matrix: every block goes through `halfperm`, the off-diagonal blocks trade places and the two permutations are swapped.
+function explicit_transpose(A::AbstractSplitMatrix)
+    src = A.blocks
+    t_oo, t_og = halfperm(src.own_own), halfperm(src.ghost_own)
+    t_go, t_gg = halfperm(src.own_ghost), halfperm(src.ghost_ghost)
+    transposed = split_matrix_blocks(t_oo,t_og,t_go,t_gg)
+    perm_rows, perm_cols = A.col_permutation, A.row_permutation
+    split_matrix(transposed,perm_rows,perm_cols)
+end
+
+# Transpose a PSparseMatrix: transpose the local parts, then start assembling. Returns (unassembled transpose, assembly task) when reuse is true, otherwise only the assembly task.
+function explicit_transpose(A::PSparseMatrix;reuse=false)
+    rows, cols = axes(A)
+    local_transposes = map(explicit_transpose,partition(A))
+    B = PSparseMatrix(local_transposes,partition(cols),partition(rows),false)
+    t = assemble(B,reuse=reuse)
+    if !val_parameter(reuse)
+        t
+    else
+        B,t
+    end
+end
+
+function explicit_transpose!(B::AbstractSplitMatrix,A::AbstractSplitMatrix) # in-place counterpart: halfperm! of each block of A into the mirrored block of B
+    dst, src = B.blocks, A.blocks
+    halfperm!(dst.own_own,src.own_own); halfperm!(dst.own_ghost,src.ghost_own)
+    halfperm!(dst.ghost_own,src.own_ghost)
+    halfperm!(dst.ghost_ghost,src.ghost_ghost)
+end
+
+function explicit_transpose!(B::PSparseMatrix,B_local::PSparseMatrix,A::PSparseMatrix,cache) # refresh B_local from A part by part, then assemble it into B
+    map(explicit_transpose!,partition(B_local),partition(A))
+    assemble!(B, B_local, cache) # the value of assemble! is returned as is (not B)
+end
+
+function add(A::PSparseMatrix,B::PSparseMatrix) # returns (C, K): C built from A+B with assembled=true, K = (Koo, Kog) index maps consumed by add!
+    function add_own_own(A,B)
+        C = A+B
+        # KA/KB come from precompute_nzindex(C, ·) and are returned as the cache for this block
+        KA = precompute_nzindex(C,A)
+        KB = precompute_nzindex(C,B)
+        C,(KA,KB)
+    end
+    function add_own_ghost(own_ghost_A, own_ghost_B, colsA, colsB, cols)
+        # Minimize allocated memory, but could be replaced with findnz(...)
+        iA,jA = find_indices(own_ghost_A) # local nonzero
+        vA = nonzeros(own_ghost_A)
+        iB,jB = find_indices(own_ghost_B) # local nonzero
+        vB = nonzeros(own_ghost_B)
+        jC = zeros(eltype(jA), (length(jA) + length(jB))) # column ids of A's entries followed by B's, in the numbering of `cols`
+        ghostA_to_global = ghost_to_global(colsA)
+        ghostB_to_global = ghost_to_global(colsB)
+        global_to_ghostC = global_to_ghost(cols)
+        l = zero(eltype(jA)) # write position in jC
+        for k in eachindex(jA)
+            l += 1
+            j = jA[k]
+            jC[l] = global_to_ghostC[ghostA_to_global[j]] # ghost id in colsA -> global id -> ghost id in cols
+            jA[k] = jC[l] # jA is overwritten with the renumbered id (passed to precompute_nzindex! below)
+        end
+        for k in eachindex(jB)
+            l += 1
+            j = jB[k]
+            jC[l] = global_to_ghostC[ghostB_to_global[j]] # same renumbering for the entries of B
+            jB[k] = jC[l]
+        end
+        own_ghost = compresscoo(typeof(own_ghost_A), vcat(iA, iB), jC, vcat(vA, vB), size(own_ghost_A, 1), ghost_length(cols))
+        # reuse auxiliary iA, iB arrays as caches
+        precompute_nzindex!(iA,own_ghost,iA,jA)
+        precompute_nzindex!(iB,own_ghost,iB,jB)
+        own_ghost, (iA, iB)
+    end
+    function _add(A,B)
+        colsA = partition(axes(A,2))
+        colsB = partition(axes(B,2))
+        J = map(ghost_to_global, colsB)
+        J_owner = map(ghost_to_owner, colsB)
+        cols = map(union_ghost, colsA, J, J_owner) # column partition of the result: colsA combined with the ghost columns of B via union_ghost
+        rows = partition(axes(A,1)) # the row partition of A is used for the result
+        Coo, Koo = map(add_own_own, own_own_values(A), own_own_values(B)) |> tuple_of_arrays
+        Cog, Kog = map(add_own_ghost, own_ghost_values(A), own_ghost_values(B), colsA, colsB, cols) |> tuple_of_arrays
+        C_vals = map(Coo,Cog,rows,cols) do Coo, Cog, rows, cols
+            Cgo = similar(Coo, 0, size(Coo,2)) # the result is built with zero ghost rows
+            Cgg = similar(Coo, 0, size(Cog,2))
+            blocks = split_matrix_blocks(Coo, Cog, Cgo, Cgg)
+            split_matrix(blocks, local_permutation(rows), local_permutation(cols))
+        end
+        assembled = true
+        K = (Koo, Kog)
+        PSparseMatrix(C_vals,rows,cols,assembled), K
+    end
+    _add(A,B)
+end
+
+function add!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) # in-place refresh of C from A and B; `cache` is the (Koo, Kog) pair returned by add
+    function add_blocks!(C, A, B, K)
+        nzA, nzB = nonzeros(A), nonzeros(B)
+        sparse_matrix!(C, nzA, K[1])
+        sparse_matrix!(C, nzB, K[2], reset=false)
+    end
+    own_own_maps, own_ghost_maps = cache
+    map(add_blocks!, own_own_values(C), own_own_values(A), own_own_values(B), own_own_maps)
+    map(add_blocks!, own_ghost_values(C), own_ghost_values(A), own_ghost_values(B), own_ghost_maps)
+end
+
+# D = transpose(A)*B*C, with A passed untransposed. Returns D, or (D, cache) when reuse is true; the cache layout is the one unpacked by spmtmm!.
+function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false))
+    consistency_task = consistent(C, partition(axes(B,2)),reuse=true)
+    
+    Aoo = own_own_values(A)
+    Boo = own_own_values(B)
+    Coo = own_own_values(C)
+    
+    Aog = own_ghost_values(A)
+    Bog = own_ghost_values(B)
+    
+    Doo1, Doo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aoo,Boo,Coo) |> tuple_of_arrays
+    Dgo1, Dgo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aog,Boo,Coo) |> tuple_of_arrays
+    
+    # The products above only read own/own blocks; everything below needs the ghost rows of C held by C2.
+    C2, consistencyCache = fetch(consistency_task)
+
+    Cog2 = own_ghost_values(C2)
+    Cgo = ghost_own_values(C2)
+    Cgg = ghost_ghost_values(C2)
+
+    Dgo2, Dgo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays
+    Dgg1, Dgg_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays
+    Dgg2, Dgg_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgg,Dgg_cache) |> tuple_of_arrays
+
+    Dgo = map(+,Dgo1,Dgo2) # different sparsity patterns so not in-place.
+    Dgg = map(+,Dgg1,Dgg2) # ghost/ghost block keeps its own name and cache so the own/ghost products below cannot clobber them
+
+    D1_values = map(Dgo, Dgg, partition(C), partition(C2)) do ghost_own, ghost_ghost, C_part, C2_part
+        own_own = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_own, 2))
+        own_ghost = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_ghost, 2))
+        blocks = split_matrix_blocks(own_own, own_ghost, ghost_own, ghost_ghost)
+        split_matrix(blocks, C_part.col_permutation, C2_part.col_permutation)
+    end
+    D1_unassembled = PSparseMatrix(D1_values, partition(axes(C,2)), partition(axes(C2,2)), false)
+    assembly_task = assemble(D1_unassembled, reuse=true) # start assembling the ghost rows; the own-row products below are computed before fetch
+
+    Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays
+    Doo2,Doo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
+    Dog2,Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
+
+    Doo = map(+,Doo1,Doo2)
+    Dog = map(+,Dog1,Dog2)
+
+    Doo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Doo_cache,Doo)
+    Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog)
+    Dgo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dgo_cache,Dgo)
+    Dgg_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dgg_cache,Dgg)
+
+    D2_values = map(Doo, Dog, partition(C2)) do own_own, own_ghost, C_part
+        ghost_own = similar(own_own,0,size(own_own, 2))
+        ghost_ghost = similar(own_ghost,0,size(own_ghost, 2))
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        split_matrix(blocks, C_part.col_permutation, C_part.col_permutation)
+    end
+
+    D1, assemblyCache = fetch(assembly_task)
+    D2 = PSparseMatrix(D2_values, partition(axes(D1,1)), partition(axes(C2,2)), true)
+    D, mergeCache = add(D1, D2)
+    sequential_caches = (Doo_cache_final, Dog_cache_final, Dgo_cache_final, Dgg_cache_final) # order matches the unpacking in spmtmm!
+    if val_parameter(reuse)
+        cache = (C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches)
+        return D,cache
+    end
+    D
+end
+
+function spmtmm(A::PSparseMatrix,P::PSparseMatrix;kwargs...) # transpose(P)*A*P
+    spmtmm(P,A,P;kwargs...) # the 3-matrix method transposes its first argument itself, so P is passed as is (as in spmtmm!(C,A,P,cache))
+end
+
+function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) # in-place D = transpose(A)*B*C; `cache` is the tuple returned by spmtmm(A,B,C;reuse=true)
+    C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches = cache
+    Doo_cache, Dog_cache, Dgo_cache, Dgg_cache = sequential_caches
+    # D2 provides the own-row blocks (Doo, Dog); D1_unassembled provides the ghost-row blocks (Dgo, Dgg) that are assembled into D1.
+    
+    consistency_task = consistent!(C2, C, consistencyCache)
+    Doo = own_own_values(D2)
+    Dog = own_ghost_values(D2)
+    Dgo = ghost_own_values(D1_unassembled)
+    Dgg = ghost_ghost_values(D1_unassembled)
+
+    Aoo = own_own_values(A)
+    Boo = own_own_values(B)
+    Coo = own_own_values(C)
+
+    Aog = own_ghost_values(A)
+    Bog = own_ghost_values(B)
+    
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache) # these two read only C itself, so they are issued before the wait
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache)
+    
+    # Everything below reads blocks of C2, so the consistency update must have finished
+    wait(consistency_task)
+    Cog2 = own_ghost_values(C2)
+    Cgo = ghost_own_values(C2)
+    Cgg = ghost_ghost_values(C2)
+
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache)
+
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) # the calls with 1,1 add onto blocks already written above
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache)
+
+    assembly_task = assemble!(D1, D1_unassembled, assemblyCache) # ghost rows are complete; start assembly and compute the remaining own-row terms meanwhile
+    
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache)
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache)
+    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache)
+    
+    wait(assembly_task)
+    add!(D, D1, D2, mergeCache)
+    D
+end
+
+function spmtmm!(C::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) # two-matrix form: P is used as both the first and the last factor
+    spmtmm!(C,P,A,P,cache)
+end
+
+function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) # triple product D = A*B*C; returns D, or (D, cache) when reuse is true
+    B2_task = consistent(B,partition(axes(A,2)),reuse=true) # B2: B made consistent with the column partition of A
+    Aoo = own_own_values(A)
+    Aog = own_ghost_values(A)
+    Boo = own_own_values(B)
+    Coo = own_own_values(C)
+
+    Doo1,Doo_cache = map(RAP,Aoo,Boo,Coo) |> tuple_of_arrays # needs own/own blocks only, so it is issued before fetching B2
+    B2, Bcache = fetch(B2_task)
+    C2_task = consistent(C,partition(axes(B2,2)),reuse=true) # C2: C made consistent with the column partition of B2
+
+    Bog = own_ghost_values(B2)
+    Bgo = ghost_own_values(B2)
+    Bgg = ghost_ghost_values(B2)
+
+    Doo2,Doo_cache = map(RAP,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays # needs B2 but not C2, so it is issued before fetching C2
+    Doo12 = map(+,Doo1,Doo2)
+
+    C2, Ccache = fetch(C2_task)
+  
+    Cog = own_ghost_values(C2)
+    Cgo = ghost_own_values(C2)
+    Cgg = ghost_ghost_values(C2)
+
+    Doo3,Doo_cache = map(RAP,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
+    Doo4,Doo_cache = map(RAP,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays
+  
+    Doo34 = map(+,Doo3,Doo4)
+    Doo = map(+,Doo12,Doo34) # own/own block: sum of the four partial products Doo1..Doo4
+  
+    Dog1,Dog_cache = map(RAP,Aoo,Boo,Cog) |> tuple_of_arrays
+    Dog2,Dog_cache = map(RAP,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays
+    Dog3,Dog_cache = map(RAP,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
+    Dog4,Dog_cache = map(RAP,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays
+
+    Dog12 = map(+,Dog1,Dog2)
+    Dog34 = map(+,Dog3,Dog4)
+    Dog = map(+,Dog12,Dog34) # own/ghost block: sum of the four partial products Dog1..Dog4
+
+    D_values = map(Doo, Dog, partition(A),partition(C2)) do own_own, own_ghost, A_part,C_part
+        ghost_own = similar(own_own,0,size(own_own, 2)) # D is built with zero ghost rows
+        ghost_ghost = similar(own_ghost,0,size(own_ghost, 2))
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        split_matrix(blocks, A_part.row_permutation, C_part.col_permutation)
+    end
+
+    D = PSparseMatrix(D_values, partition(axes(A,1)), partition(axes(C2,2)), true)
+    if val_parameter(reuse)
+        cache = B2,Bcache,C2,Ccache,(Doo_cache,Dog_cache) # layout unpacked by spmmm!
+        return D,cache
+    end
+    D
+end
+
+function spmmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) # in-place D = A*B*C; `cache` is the tuple returned by spmmm(A,B,C;reuse=true)
+    B2,Bcache,C2,Ccache,sequential_caches = cache
+    Doo_cache, Dog_cache = sequential_caches
+    B2_task = consistent!(B2,B,Bcache) # start updating B2 from B
+
+    Doo = own_own_values(D)
+    Dog = own_ghost_values(D)
+    Aoo = own_own_values(A)
+    Aog = own_ghost_values(A)
+    Boo = own_own_values(B)
+    Coo = own_own_values(C)
+    map(RAP!,Doo,Aoo,Boo,Coo,Doo_cache) # first own/own term: reads neither B2 nor C2
+    wait(B2_task)
+
+    C2_task = consistent!(C2,C,Ccache) # start updating C2 from C; the next product does not read C2
+    Bog = own_ghost_values(B2)
+    Bgo = ghost_own_values(B2)
+    Bgg = ghost_ghost_values(B2)
+    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache) # the calls with 1,1 add onto the block written by the plain RAP! call
+
+    wait(C2_task)
+    Cog = own_ghost_values(C2)
+    Cgo = ghost_own_values(C2)
+    Cgg = ghost_ghost_values(C2)
+
+    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache)
+    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache)
+    map(RAP!,Dog,Aoo,Boo,Cog,Dog_cache) # first own/ghost term; the three calls below add the remaining terms
+    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache)
+    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache)
+    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache)
+    D
+end
\ No newline at end of file
diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl
new file mode 100644
index 00000000..94ab1d08
--- /dev/null
+++ b/src/sequential_implementations.jl
@@ -0,0 +1,1672 @@
+function Base.:*(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TiA,TvB,TiB}
+    # A*B for CSR: both operands go through ascsc, are multiplied in swapped order, and the product is handed back through ascsr.
+    ascsr(ascsc(B)*ascsc(A))
+end
+
+function Base.:*(At::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    # transpose(A)*B: same swapped-operand scheme, with the parent of At lazily transposed on the CSC side.
+    ascsr(ascsc(B)*transpose(ascsc(parent(At))))
+end
+
+function Base.:*(A::SparseMatrixCSR{Bi,Tv,Ti},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+    # A*transpose(B): the parent of Bt is lazily transposed on the CSC side and multiplied with ascsc(A).
+    ascsr(transpose(ascsc(parent(Bt)))*ascsc(A))
+end
+
+function Base.:*(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+    # transpose(A)*transpose(B): both parents are lazily transposed on the CSC side, multiplied in swapped order.
+    ascsr(transpose(ascsc(parent(Bt)))*transpose(ascsc(parent(At))))
+end
+
+function Base.:*(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    SparseMatrixCSR{Bi}(size(A,1), size(A,2), copy(A.rowptr), copy(A.colval), map(v -> x*v, A.nzval)) # copied index arrays, scaled stored values
+end
+Base.:*(A::SparseMatrixCSR,x::Number) = x*A # matrix-times-scalar forwards to the scalar-times-matrix method
+
+function Base.:/(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti}
+    SparseMatrixCSR{Bi}(size(A,1), size(A,2), copy(A.rowptr), copy(A.colval), map(v -> v/x, A.nzval)) # copied index arrays, stored values divided by x
+end
+
+
+# C = A + B for CSR matrices, merging the stored entries row by row without converting to CSC. Entries that sum to zero stay stored. Throws DimensionMismatch on size mismatch.
+function Base.:+(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"))
+    p,q = size(A)
+    nnz_C_upperbound = nnz(A) + nnz(B)
+    IC = Vector{Ti}(undef, p+1)
+    JC = Vector{Ti}(undef, nnz_C_upperbound)
+    VC = Vector{Tv}(undef, nnz_C_upperbound)
+    
+    pC = 1 # next free 1-based slot in JC/VC
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    for i in 1:p
+        IC[i] = pC + Bi - 1 # row pointers are stored Bi-based (no-op for Bi == 1); NOTE(review): confirm against SparseMatricesCSR's rowptr layout
+        jpA_range = nzrange(A, i)
+        jpA, jpA_end = first(jpA_range), last(jpA_range)
+        jpB_range = nzrange(B, i)
+        jpB, jpB_end = first(jpB_range), last(jpB_range)
+        while jpA <= jpA_end && jpB <= jpB_end # two-pointer merge; relies on the column indices of each row being sorted
+            jA = JA[jpA]
+            jB = JB[jpB]
+            if jA < jB
+                JC[pC] = jA
+                VC[pC] = VA[jpA]
+                jpA += 1
+            elseif jB < jA
+                JC[pC] = jB
+                VC[pC] = VB[jpB]
+                jpB += 1
+            else
+                JC[pC] = jA
+                VC[pC] = VA[jpA] + VB[jpB]
+                jpA += 1
+                jpB += 1
+            end
+            pC += 1
+        end
+        while jpA <= jpA_end # leftover entries of A in this row
+            JC[pC] = JA[jpA]
+            VC[pC] = VA[jpA]
+            jpA += 1
+            pC += 1
+        end
+        while jpB <= jpB_end # leftover entries of B in this row
+            JC[pC] = JB[jpB]
+            VC[pC] = VB[jpB]
+            jpB += 1
+            pC += 1
+        end
+    end
+    IC[end] = pC + Bi - 1
+    resize!(JC, (pC-1))
+    resize!(VC, (pC-1))
+    SparseMatrixCSR{Bi}(p,q,IC,JC,VC)   # C = A + B (a new matrix; A and B are not modified)
+end
+
+# C = A - B for CSR matrices, merging the stored entries row by row without converting to CSC. Entries that cancel to zero stay stored. Throws DimensionMismatch on size mismatch.
+function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"))
+    nnz_C_upperbound = nnz(A) + nnz(B)
+    p,r = size(A)
+    IC = Vector{Ti}(undef, p+1)
+    JC = Vector{Ti}(undef, nnz_C_upperbound)
+    VC = Vector{Tv}(undef, nnz_C_upperbound)
+    
+    pC = 1 # next free 1-based slot in JC/VC
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    for i in 1:p
+        IC[i] = pC + Bi - 1 # row pointers are stored Bi-based (no-op for Bi == 1); NOTE(review): confirm against SparseMatricesCSR's rowptr layout
+        jpA_range = nzrange(A, i)
+        jpA, jpA_end = first(jpA_range), last(jpA_range)
+        jpB_range = nzrange(B, i)
+        jpB, jpB_end = first(jpB_range), last(jpB_range)
+        while jpA <= jpA_end && jpB <= jpB_end # two-pointer merge; relies on the column indices of each row being sorted
+            jA = JA[jpA]
+            jB = JB[jpB]
+            if jA < jB
+                JC[pC] = jA
+                VC[pC] = VA[jpA]
+                jpA += 1
+            elseif jB < jA
+                JC[pC] = jB
+                VC[pC] = -VB[jpB]
+                jpB += 1
+            else
+                JC[pC] = jA
+                VC[pC] = VA[jpA] - VB[jpB]
+                jpA += 1
+                jpB += 1
+            end
+            pC += 1
+        end
+        while jpA <= jpA_end # leftover entries of A in this row
+            JC[pC] = JA[jpA]
+            VC[pC] = VA[jpA]
+            jpA += 1
+            pC += 1
+        end
+        while jpB <= jpB_end # leftover entries of B in this row enter negated
+            JC[pC] = JB[jpB]
+            VC[pC] = -VB[jpB]
+            jpB += 1
+            pC += 1
+        end
+    end
+    IC[end] = pC + Bi - 1
+    resize!(JC, (pC-1))
+    resize!(VC, (pC-1))
+    SparseMatrixCSR{Bi}(p,r,IC,JC,VC)   # C = A - B (a new matrix; A and B are not modified)
+end
+
+function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+    SparseMatrixCSR{Bi}(size(A,1), size(A,2), copy(A.rowptr), copy(A.colval), map(-, A.nzval)) # same pattern (copied), negated stored values
+end
+
+# Sum of two CSC matrices through a column-by-column merge of their stored entries; stored zeros are not dropped.
+function Base.:+(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"))
+    nrows,ncols = size(A)
+    capacity = nnz(A) + nnz(B)
+    colptrC = Vector{Ti}(undef, ncols+1)
+    rowvalC = Vector{Ti}(undef, capacity)
+    nzvalC = Vector{Tv}(undef, capacity)
+    
+    slot = 1
+    rowsA = rowvals(A)
+    valsA = nonzeros(A)
+    rowsB = rowvals(B)
+    valsB = nonzeros(B)
+    for col in 1:ncols
+        colptrC[col] = slot
+        rangeA = nzrange(A, col)
+        a, a_last = first(rangeA), last(rangeA)
+        rangeB = nzrange(B, col)
+        b, b_last = first(rangeB), last(rangeB)
+        while a <= a_last && b <= b_last
+            ra = rowsA[a]
+            rb = rowsB[b]
+            if ra == rb
+                rowvalC[slot] = ra
+                nzvalC[slot] = valsA[a] + valsB[b]
+                a += 1
+                b += 1
+            elseif ra < rb
+                rowvalC[slot] = ra
+                nzvalC[slot] = valsA[a]
+                a += 1
+            else
+                rowvalC[slot] = rb
+                nzvalC[slot] = valsB[b]
+                b += 1
+            end
+            slot += 1
+        end
+        while a <= a_last
+            rowvalC[slot] = rowsA[a]
+            nzvalC[slot] = valsA[a]
+            a += 1
+            slot += 1
+        end
+        while b <= b_last
+            rowvalC[slot] = rowsB[b]
+            nzvalC[slot] = valsB[b]
+            b += 1
+            slot += 1
+        end
+    end
+    colptrC[end] = slot
+    resize!(rowvalC, (slot-1))
+    resize!(nzvalC, (slot-1))
+    SparseMatrixCSC{Tv,Ti}(nrows,ncols,colptrC,rowvalC,nzvalC)
+end
+
+# C = A - B for CSC matrices through a column-by-column merge of the stored entries; entries that cancel to zero stay stored. Throws DimensionMismatch on size mismatch.
+function Base.:-(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"))
+    p,q = size(A)
+    nnz_C_upperbound = nnz(A) + nnz(B)
+    JC = Vector{Ti}(undef, q+1)
+    IC = Vector{Ti}(undef, nnz_C_upperbound)
+    VC = Vector{Tv}(undef, nnz_C_upperbound)
+    
+    pC = 1 # next free slot in IC/VC
+    IA = rowvals(A)
+    VA = nonzeros(A)
+    IB = rowvals(B)
+    VB = nonzeros(B)
+    for j in 1:q
+        JC[j] = pC
+        ipA_range = nzrange(A, j)
+        ipA, ipA_end = first(ipA_range), last(ipA_range)
+        ipB_range = nzrange(B, j)
+        ipB, ipB_end = first(ipB_range), last(ipB_range)
+        while ipA <= ipA_end && ipB <= ipB_end # two-pointer merge; relies on the row indices of each column being sorted
+            iA = IA[ipA]
+            iB = IB[ipB]
+            if iA < iB
+                IC[pC] = iA
+                VC[pC] = VA[ipA]
+                ipA += 1
+            elseif iB < iA
+                IC[pC] = iB
+                VC[pC] = -VB[ipB] # entry present only in B enters negated (was stored with the wrong sign)
+                ipB += 1
+            else
+                IC[pC] = iA
+                VC[pC] = VA[ipA] - VB[ipB]
+                ipA += 1
+                ipB += 1
+            end
+            pC += 1
+        end
+        while ipA <= ipA_end # leftover entries of A in this column
+            IC[pC] = IA[ipA]
+            VC[pC] = VA[ipA]
+            ipA += 1
+            pC += 1
+        end
+        while ipB <= ipB_end # leftover entries of B in this column enter negated
+            IC[pC] = IB[ipB]
+            VC[pC] = -VB[ipB]
+            ipB += 1
+            pC += 1
+        end
+    end
+    JC[end] = pC
+    resize!(IC, (pC-1))
+    resize!(VC, (pC-1))
+    SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC)
+end
+
+function Base.:-(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    SparseMatrixCSC{Tv,Ti}(size(A,1), size(A,2), copy(A.colptr), copy(A.rowval), map(-, A.nzval)) # same pattern (copied), negated stored values
+end
+
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+    A::SparseMatrixCSC{Tv,Ti},
+    B::SparseMatrixCSC{Tv,Ti},
+    cache) where {Tv,Ti}
+    mul!(ascsr(C),ascsr(B),ascsr(A),cache) # forwards to the CSR method: every matrix goes through ascsr and A, B are passed in swapped order
+end
+
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+    A::SparseMatrixCSC{Tv,Ti},
+    B::SparseMatrixCSC{Tv,Ti},
+    α::Number,
+    β::Number,
+    cache) where {Tv,Ti}
+    mul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) # forwards to the CSR method with α and β unchanged; A and B are passed in swapped order
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, # C = transpose(A)*B written into the existing sparsity pattern of C
+                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+                            B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    a,b = size(C)
+    p,q = size(At)
+    r,s = size(B)
+    q == r || throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"))
+    (a,b) == (p,s) || throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"))
+    A = At.parent
+    VC = nonzeros(C)
+    VC .= 0
+    IC = rowvals(C)
+    JA = rowvals(A) # When virtually transposed rowvals represent colvals.
+    VA = nonzeros(A)
+    IB = rowvals(B)
+    VB = nonzeros(B)
+    for j in 1:s
+        # stored entries of column j of B
+        Bj = nzrange(B, j)
+        ptrB_start = Bj.start
+        ptrB_stop = Bj.stop
+        for ip in nzrange(C, j)
+            i = IC[ip]
+            # C[i,j] is the sparse dot product of column i of A with column j of B
+            Ai = nzrange(A, i)
+            ptrB = ptrB_start
+            ptrA = Ai.start
+            vC = zero(Tv)
+            while ptrA <= Ai.stop && ptrB <= ptrB_stop # two-pointer walk; relies on sorted row indices in both columns
+                jA = JA[ptrA]
+                iB = IB[ptrB]
+                if jA < iB
+                    ptrA += 1
+                elseif iB < jA
+                    ptrB += 1
+                else # jA == iB
+                    vC += VA[ptrA]*VB[ptrB]
+                    ptrA += 1
+                    ptrB += 1
+                end
+            end
+            VC[ip] = vC
+        end
+    end
+    C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, # C = α*transpose(A)*B + β*C on the existing sparsity pattern of C
+                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+                            B::SparseMatrixCSC{Tv,Ti},
+                            α::Number,
+                            β::Number) where {Tv,Ti}
+    a,b = size(C)
+    p,q = size(At)
+    r,s = size(B)
+    q == r || throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"))
+    (a,b) == (p,s) || throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"))
+    A = At.parent
+    VC = nonzeros(C)
+    IC = rowvals(C)
+    VC .*= β
+    JA = rowvals(A) # When virtually transposed rowvals represent colvals.
+    VA = nonzeros(A)
+    IB = rowvals(B)
+    VB = nonzeros(B)
+    for j in 1:s
+        # stored entries of column j of B
+        Bj = nzrange(B, j)
+        for jp in nzrange(C, j)
+            i = IC[jp]
+            # the (i,j) term is the sparse dot product of column i of A with column j of B
+            Ai = nzrange(A, i)
+            ptrB = Bj.start
+            ptrA = Ai.start
+            vC = zero(Tv)
+            while ptrA <= Ai.stop && ptrB <= Bj.stop # two-pointer walk; relies on sorted row indices in both columns
+                jA = JA[ptrA]
+                iB = IB[ptrB]
+                if jA == iB
+                    vC += VA[ptrA]*VB[ptrB]
+                    ptrA += 1
+                    ptrB += 1
+                elseif jA < iB
+                    ptrA += 1
+                else
+                    ptrB += 1
+                end
+            end
+            VC[jp] += α*vC
+        end
+    end
+    C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti}, # C = A*transpose(B), forwarded to the CSR methods
+    A::SparseMatrixCSC{Tv,Ti},
+    Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti}
+    mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A)) # the untransposed matrix is Bt.parent (the body previously referenced an undefined `B`)
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
+                            A::SparseMatrixCSR{Bi,Tv,Ti},
+                            B::SparseMatrixCSR{Bi,Tv,Ti},
+                            cache) where {Bi,Tv,Ti} # C = A*B, written only into the stored entries of C; returns C
+    a,b = size(C)
+    p,q = size(A)
+    r,s = size(B)
+    q == r || throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"))
+    (a,b) == (p,s) || throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"))
+    JC = colvals(C)
+    VC = nonzeros(C)
+    VC .= zero(Tv)
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    # cache = (xb,x): xb[j] == i flags that column j was reached while processing row i,
+    # and x[j] holds the value accumulated for it. Both are indexed by column indices of B.
+    xb,x = cache
+    xb .= 0
+    # Row i of A*B is accumulated in x and then copied into the stored entries of row i of C.
+    for i in 1:p
+        # stored columns ja in row i of A
+        for jpa in nzrange(A, i)
+            ja = JA[jpa]
+            va = VA[jpa]
+            # stored columns jb in row ja of B
+            for jpb in nzrange(B, ja)
+                jb = JB[jpb]
+                vb = VB[jpb]
+                # the first visit of column jb in this row overwrites the stale value; later visits accumulate
+                if xb[jb] != i
+                    xb[jb] = i
+                    x[jb] = va*vb
+                else
+                    x[jb] += va*vb
+                end
+            end
+        end
+        for jpc in nzrange(C,i)
+            jc = JC[jpc]
+            # C may store entries the product does not reach; those keep the zero written above.
+            if xb[jc] == i
+                VC[jpc] = x[jc]
+            end
+        end
+    end
+    C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+    A::SparseMatrixCSC{Tv,Ti},
+    Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+    cache) where {Tv,Ti} # C = A*Bt via CSR reinterpretations, same operand order as the α/β/cache method
+    mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache) # `B` was undefined here; the wrapped matrix is Bt.parent
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
+                            A::SparseMatrixCSR{Bi,Tv,Ti},
+                            B::SparseMatrixCSR{Bi,Tv,Ti},
+                            α::Number,
+                            β::Number,
+                            cache) where {Bi,Tv,Ti} # C = β*C + α*A*B, updating only the stored entries of C; returns C
+    a,b = size(C)
+    p,q = size(A)
+    r,s = size(B)
+    q == r || throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"))
+    (a,b) == (p,s) || throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"))
+    JC = colvals(C)
+    VC = nonzeros(C)
+    VC .*= β
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    # cache = (xb,x): xb[j] == i flags that column j was reached while processing row i,
+    # and x[j] holds the value accumulated for it. Both are indexed by column indices of B.
+    xb,x = cache
+    xb .= 0
+    # Row i of A*B is accumulated in x and then added, scaled by α, to the stored entries of row i of C.
+    for i in 1:p
+        # stored columns ja in row i of A
+        for jpa in nzrange(A, i)
+            ja = JA[jpa]
+            va = VA[jpa]
+            # stored columns jb in row ja of B
+            for jpb in nzrange(B, ja)
+                jb = JB[jpb]
+                vb = VB[jpb]
+                # the first visit of column jb in this row overwrites the stale value; later visits accumulate
+                if xb[jb] != i
+                    xb[jb] = i
+                    x[jb] = va*vb
+                else
+                    x[jb] += va*vb
+                end
+            end
+        end
+        for jpc in nzrange(C,i)
+            jc = JC[jpc]
+            # C may store entries the product does not reach; those keep only their β-scaled value.
+            if xb[jc] == i
+                VC[jpc] += α * x[jc]
+            end
+        end
+    end
+    C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+                            A::SparseMatrixCSC{Tv,Ti},
+                            Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+                            α::Number, β::Number, cache) where {Tv,Ti}
+    # Forward C = β*C + α*A*Bt to the CSR kernel with the operands reinterpreted through ascsr.
+    Ccsr, Bcsr, Acsr = ascsr(C), ascsr(Bt.parent), ascsr(A)
+    return mul!(Ccsr,transpose(Bcsr),Acsr,α,β,cache)
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+                            B::SparseMatrixCSC{Tv,Ti}, cache) where {Tv,Ti}
+    # C = At*B through CSR reinterpretations; `cache` is accepted but not forwarded.
+    Ccsr, Bcsr, Acsr = ascsr(C), ascsr(B), ascsr(At.parent)
+    return mul!(Ccsr,Bcsr,transpose(Acsr))
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+                            B::SparseMatrixCSC{Tv,Ti},
+                            α::Number,
+                            β::Number,
+                            cache) where {Tv,Ti} # C = β*C + α*At*B via CSR reinterpretations; `cache` is accepted but not forwarded
+    mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent)),α,β) # the middle operand is B (`A` was undefined here), as in the variant without α/β
+end
+
+# Workaround: allocate the auxiliary arrays needed by the cached in-place mul! methods, since the multiply function exported by SparseArrays does not return them.
+function construct_spmm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti}
+    # One slot per column of A: zeroed markers plus an uninitialized value accumulator.
+    markers = zeros(Ti,size(A,2))
+    accum = similar(markers,Tv)
+    return (markers,accum)
+end
+function construct_spmm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    return A |> ascsr |> construct_spmm_cache # delegate to the CSR method on the ascsr reinterpretation of A
+end
+
+function construct_spmtm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti}
+    # One slot per column of A: zeroed markers plus an uninitialized value accumulator.
+    markers = zeros(Ti,size(A,2))
+    accum = similar(markers,Tv)
+    return (markers,accum)
+end
+
+function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    return A |> ascsr |> construct_spmtm_cache # delegate to the CSR method on the ascsr reinterpretation of A
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
+                            At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+                            B::SparseMatrixCSR{Bi,Tv,Ti},
+                            cache) where {Bi,Tv,Ti} # C = transpose(A)*B, accumulated into the stored entries of C; returns C
+    a,b = size(C)
+    p,q = size(At)
+    r,s = size(B)
+    q == r || throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"))
+    (a,b) == (p,s) || throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"))
+    A = At.parent
+    VC = nonzeros(C)
+    VC .= zero(Tv)
+    JC = colvals(C)
+    JA = colvals(A) # When virtually transposed colvals represent rowvals.
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    xb,x = cache # xb[j] == k flags that row k of B stores column j; x[j] holds that value
+    xb .= 0
+    for k in 1:q
+        # scatter row k of B into the dense work arrays
+        for jpb in nzrange(B,k)
+            jb = JB[jpb]
+            vb = VB[jpb]
+            xb[jb] = k
+            x[jb] = vb
+        end
+        for ipa in nzrange(A,k)
+            ia = JA[ipa] # interpret column index of A as row index of A^T.
+            va = VA[ipa]
+            for jpc in nzrange(C, ia)
+                jc = JC[jpc]
+                # Row k of B need not store every column present in row ia of C, hence the check.
+                if xb[jc] == k
+                    VC[jpc] += va*x[jc]
+                end
+            end
+        end
+
+    end
+    C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
+                            At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+                            B::SparseMatrixCSR{Bi,Tv,Ti},
+                            α::Number,
+                            β::Number,
+                            cache) where {Bi,Tv,Ti} # C = β*C + α*transpose(A)*B on the stored entries of C; returns C
+    a,b = size(C)
+    p,q = size(At)
+    r,s = size(B)
+    q == r || throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"))
+    (a,b) == (p,s) || throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"))
+    A = At.parent
+    VC = nonzeros(C)
+    VC .*= β
+    JC = colvals(C)
+    JA = colvals(A) # When virtually transposed colvals represent rowvals.
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    xb,x = cache # xb[j] == k flags that row k of B stores column j; x[j] holds α times that value
+    xb .= 0
+    for k in 1:q
+        # scatter row k of B, pre-scaled by α, into the dense work arrays
+        for jpb in nzrange(B,k)
+            jb = JB[jpb]
+            vb = VB[jpb]
+            xb[jb] = k
+            x[jb] = α*vb
+        end
+        for ipa in nzrange(A,k)
+            ia = JA[ipa] # interpret column index of A as row index of A^T.
+            va = VA[ipa]
+            for jpc in nzrange(C, ia)
+                jc = JC[jpc]
+                # Row k of B need not store every column present in row ia of C, hence the check.
+                if xb[jc] == k
+                    VC[jpc] += va*x[jc]
+                end
+            end
+        end
+
+    end
+    C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti}, A::SparseMatrixCSR{Bi,Tv,Ti},
+                            Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+    Ccsc, Bcsc, Acsc = ascsc(C), ascsc(Bt.parent), ascsc(A)
+    mul!(Ccsc, transpose(Bcsc), Acsc) # C = A*Bt, delegated to the CSC method on the ascsc reinterpretations
+    return C
+end
+
+function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
+                            A::SparseMatrixCSR{Bi,Tv,Ti},
+                            Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+                            α::Number, β::Number) where {Bi,Tv,Ti}
+    Ccsc, Bcsc, Acsc = ascsc(C), ascsc(Bt.parent), ascsc(A)
+    mul!(Ccsc, transpose(Bcsc), Acsc, α, β) # C = β*C + α*A*Bt, delegated to the CSC method
+    return C
+end
+
+# PtAP variants
+function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} # builds C = Plt*A*Pr; returns (C, cache)
+    p,q = size(Plt)
+    m,r = size(A)
+    n,s = size(Pr)
+    if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
+    if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
+    function RAP_symbolic_count!(R,A,Pr) # pass 1: count the entries per row of C and build its row pointer IC
+        JR = R.data
+        JA = colvals(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        xbRA = zeros(Ti, r)
+        xbC = zeros(Ti, s) # marker per column of C; zeroed again before it is handed on in the cache
+        max_rR = find_max_row_length(R)
+        max_rA = find_max_row_length(A)
+        max_rPr = find_max_row_length(Pr)
+
+        max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR))
+        JRA = Vector{Ti}(undef,max_rC)
+        IC = Vector{Ti}(undef,p+1)
+        nnz_C = 1
+        IC[1] = nnz_C
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of R*A; reset every row, 0 allows empty rows
+            # entries j stored in row i of R
+            for jp in jagged_range(R, i)
+                j = JR[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means column k was already recorded for row i of R*A
+                    if xbRA[k] != i
+                        ccRA += 1
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                    end
+                end
+            end
+            ccC = 0 # number of distinct columns found in row i of C; reset every row, 0 allows empty rows
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        xbC[k] = i
+                        ccC += 1
+                    end
+                end
+            end
+            nnz_C += ccC
+            IC[i+1] = nnz_C
+        end
+        JC = Vector{Ti}(undef, nnz_C-1)
+        VC = zeros(Tv,nnz_C-1)
+        JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP
+        xbRA .= 0
+        xbC .= 0
+        cache = (xbRA,JRA,xbC,JAP)
+        SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # JC is still uninitialized here; VC is all zeros
+    end
+    function RAP_symbolic_fill!(C,R,A,Pr,cache) # pass 2: repeat the traversal and write the column indices of C
+        (xbRA,JRA,xbC,JAP) = cache
+        JC = colvals(C)
+        JR = R.data
+        JA = colvals(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        pC = 0
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of R*A; reset every row, 0 allows empty rows
+            # entries j stored in row i of R
+            for jp in jagged_range(R, i)
+                j = JR[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means column k was already recorded for row i of R*A
+                    if xbRA[k] != i
+                        ccRA += 1 
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                    end
+                end
+            end
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        pC += 1
+                        xbC[k] = i
+                        JC[pC] = k
+                    end
+                end
+            end
+        end
+        xbC .= 0
+        outer_cache = (xbC,similar(xbC, Tv),JAP)
+        C, outer_cache # the numeric values of C are computed afterwards by RAP!
+    end
+    function _RAP(Plt,A,Pr) # driver: two symbolic passes, then the numeric product
+        R = symbolic_halfperm(Plt.parent)
+        C,symbolic_cache = RAP_symbolic_count!(R,A,Pr) # R is used as the row-wise pattern of Plt in both passes
+        _,outer_cache = RAP_symbolic_fill!(C,R,A,Pr,symbolic_cache)
+        Ct = symbolic_halfperm(C) # presumably this double halfperm sorts the column indices of C — TODO confirm
+        symbolic_halfperm!(C,Ct)
+        RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
+    end
+    _RAP(Plt,A,Pr)
+end
+
+function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+             A::SparseMatrixCSR{Bi,Tv,Ti},
+             Pr::SparseMatrixCSR{Bi,Tv,Ti},
+             cache) where {Bi,Tv,Ti} # builds C = Plt*A*Pr reusing R and buffers from `cache`; returns (C, cache)
+    p,q = size(Plt)
+    m,r = size(A)
+    n,s = size(Pr)
+    if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
+    if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
+    
+    function RAP_symbolic_count!(R,A,Pr) # pass 1: count the entries per row of C and build its row pointer IC
+        JR = R.data
+        JA = colvals(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        xbRA = zeros(Ti, r)
+        xbC = zeros(Ti, s) # marker per column of C; zeroed again before it is returned
+        max_rR = find_max_row_length(R)
+        max_rA = find_max_row_length(A)
+        max_rPr = find_max_row_length(Pr)
+
+        max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR))
+        JRA = Vector{Ti}(undef,max_rC)
+        IC = Vector{Ti}(undef,p+1)
+        nnz_C = 1
+        IC[1] = nnz_C
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of R*A; reset every row, 0 allows empty rows
+            # entries j stored in row i of R
+            for jp in jagged_range(R, i)
+                j = JR[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means column k was already recorded for row i of R*A
+                    if xbRA[k] != i
+                        ccRA += 1
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                    end
+                end
+            end
+            ccC = 0 # number of distinct columns found in row i of C; reset every row, 0 allows empty rows
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        xbC[k] = i
+                        ccC += 1
+                    end
+                end
+            end
+            nnz_C += ccC
+            IC[i+1] = nnz_C
+        end
+        JC = Vector{Ti}(undef, nnz_C-1)
+        VC = zeros(Tv,nnz_C-1)
+        JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP
+        xbRA .= 0
+        xbC .= 0
+        SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # JC is still uninitialized here; VC is all zeros
+    end
+    function RAP_symbolic_fill!(C,R,A,Pr,cache) # pass 2: repeat the traversal and write the column indices of C
+        (xbRA,JRA,xbC,JAP) = cache
+        JC = colvals(C)
+        JR = R.data
+        JA = colvals(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        pC = 0
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of R*A; reset every row, 0 allows empty rows
+            # entries j stored in row i of R
+            for jp in jagged_range(R, i)
+                j = JR[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means column k was already recorded for row i of R*A
+                    if xbRA[k] != i
+                        ccRA += 1 
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                    end
+                end
+            end
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        pC += 1
+                        xbC[k] = i
+                        JC[pC] = k
+                    end
+                end
+            end
+        end
+        xbC .= 0
+        C, (xbC,similar(xbC, Tv),JAP) # the numeric values of C are computed afterwards by RAP!
+    end
+    function _RAP(Plt,A,Pr,old_cache) # driver: reuse R from the old cache, redo both symbolic passes, then the numeric product
+        xb,x,JAP,R = old_cache
+        old_outer_cache = (xb,x,JAP)
+        C,symbolic_cache = RAP_symbolic_count!(R, A, Pr)
+        _,new_outer_cache = RAP_symbolic_fill!(C,R, A, Pr, symbolic_cache)
+        Ct = symbolic_halfperm(C) # presumably this double halfperm sorts the column indices of C — TODO confirm
+        symbolic_halfperm!(C,Ct)
+        outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? c1 : c2, old_outer_cache,new_outer_cache) # keep the longer buffer of each pair
+        RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
+    end
+    _RAP(Plt,A,Pr,cache)
+end
+
+function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR})
+    markers, accum, cols, _ = cache # the fourth cache entry is dropped
+    return (markers, accum, cols)
+end
+
+function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, # in-place C = Plt*A*Pr on the stored entries of C; returns C
+              Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+              A::SparseMatrixCSR{Bi,Tv,Ti},
+              Pr::SparseMatrixCSR{Bi,Tv,Ti},
+              cache) where {Bi,Tv,Ti}
+    (a,b) = size(C)
+    p,q = size(Plt)
+    m,r = size(A)
+    n,s = size(Pr)
+    if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
+    if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
+    if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end
+    Pl = Plt.parent
+    JC = colvals(C)
+    VC = nonzeros(C)
+    VC .= zero(Tv)
+
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JPr = colvals(Pr)
+    VPr = nonzeros(Pr)
+    xb, x, JAP = cache
+    xb .= 0
+    # for each row i of A: accumulate row i of A*Pr in x, then distribute it over the rows of C listed in row i of Pl
+    for i in 1:m
+        lp = 0
+        # stored columns j in row i of A
+        for jp in nzrange(A, i)
+            j = JA[jp]
+            va = VA[jp]
+            # stored columns k in row j of Pr
+            for kp in nzrange(Pr, j)
+                k = JPr[kp]
+                # xb[k] == i means x[k] already holds a partial sum for row i of A*Pr; JAP records the touched columns but is not read again here
+                if xb[k] != i
+                    lp += 1
+                    JAP[lp] = k
+                    xb[k] = i
+                    x[k] = va * VPr[kp]
+                else
+                    x[k] += va * VPr[kp]
+                end
+            end
+        end
+        for kp in nzrange(Pl, i)
+            k = colvals(Pl)[kp] # rowvals when transposed conceptually
+            v = nonzeros(Pl)[kp]
+            for jp in nzrange(C,k)
+                j = JC[jp]
+                if xb[j] == i
+                    VC[jp] += v*x[j]
+                end 
+            end
+        end
+    end
+    C
+end
+
+function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti},
+              Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+              A::SparseMatrixCSR{Bi,Tv,Ti},
+              Pr::SparseMatrixCSR{Bi,Tv,Ti},
+              α::Number,
+              β::Number,
+              cache) where {Bi,Tv,Ti} # in-place C = β*C + α*Plt*A*Pr on the stored entries of C; returns C
+    (a,b) = size(C)
+    p,q = size(Plt)
+    m,r = size(A)
+    n,s = size(Pr)
+    if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
+    if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
+    if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end
+    Pl = Plt.parent
+    JC = colvals(C)
+    VC = nonzeros(C)
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JPr = colvals(Pr)
+    VPr = nonzeros(Pr)
+    xb, x, JAP = cache
+    xb .= 0
+    VC .*= β
+    # for each row i of A: accumulate row i of α*A*Pr in x, then distribute it over the rows of C listed in row i of Pl
+    for i in 1:m
+        lp = 0
+        # stored columns j in row i of A
+        for jp in nzrange(A, i)
+            j = JA[jp]
+            va = α*VA[jp]
+            # stored columns k in row j of Pr
+            for kp in nzrange(Pr, j)
+                k = JPr[kp]
+                # xb[k] == i means x[k] already holds a partial sum for row i; JAP records the touched columns but is not read again here
+                if xb[k] != i
+                    lp += 1
+                    JAP[lp] = k
+                    xb[k] = i
+                    x[k] = va*VPr[kp]
+                else
+                    x[k] += va*VPr[kp]
+                end
+            end
+        end
+        for kp in nzrange(Pl, i)
+            k = colvals(Pl)[kp] # rowvals when transposed conceptually
+            vpl = nonzeros(Pl)[kp]
+            for jp in nzrange(C,k)
+                j = JC[jp]
+                if xb[j] == i
+                    VC[jp] += vpl*x[j]
+                end 
+            end
+        end
+    end
+    C
+end
+
+# RAP variants
+function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+             A::SparseMatrixCSR{Bi,Tv,TiA},
+             Pr::SparseMatrixCSR{Bi,Tv,TiPr}) where {Bi,Tv,TiPl,TiA,TiPr} # builds C = Pl*A*Pr; returns (C, cache)
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Pr)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
+    function RAP_symbolic!(Pl,A,Pr) # counts the entries per row of C, allocates C and the work arrays
+        JPl = colvals(Pl)
+        JA = colvals(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        xbRA = zeros(TiA, r)
+        xbC = zeros(TiA, s+1) # this vector will also serve as colptr array in halfperm
+        xRA = similar(xbRA, Tv) # sparse accumulator
+        xC = similar(xbC, Tv) # sparse accumulator
+        max_rPl = find_max_row_length(Pl)
+        max_rA = find_max_row_length(A)
+        max_rPr = find_max_row_length(Pr)
+
+        max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl))
+        JRA = Vector{TiA}(undef,max_rC)
+        IC = Vector{TiA}(undef,p+1)
+        nnz_C = 1
+        IC[1] = nnz_C
+        for i in 1:p
+            ccRA = 0
+            # stored columns j in row i of Pl
+            for jp in nzrange(Pl, i)
+                j = JPl[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means column k was already recorded for row i of Pl*A
+                    if xbRA[k] != i
+                        ccRA += 1
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                    end
+                end
+            end
+            ccC = 0
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        xbC[k] = i
+                        ccC += 1
+                    end
+                end
+            end
+            nnz_C += ccC
+            IC[i+1] = nnz_C
+        end
+        JC = Vector{TiA}(undef, nnz_C-1)
+        VC = zeros(Tv,nnz_C-1)
+        cache = (xbRA,xRA,JRA,xbC,xC)
+        SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # JC is still uninitialized here; VC is all zeros
+    end
+    function RAP_numeric!(C,Pl,A,Pr,cache) # fills the column indices and the values of C row by row
+        JPl = colvals(Pl)
+        VPl = nonzeros(Pl)
+        JA = colvals(A)
+        VA = nonzeros(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        VPr = nonzeros(Pr)
+        JC = colvals(C)
+        VC = nonzeros(C)
+        (xbRA,xRA,JRA,xbC,xC) = cache
+        jpC = 1
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of Pl*A; reset every row, 0 allows empty rows
+            # stored columns j in row i of Pl
+            for jp in nzrange(Pl, i)
+                j = JPl[jp]
+                vpl = VPl[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means xRA[k] already holds a partial sum for row i of Pl*A
+                    if xbRA[k] != i
+                        ccRA += 1
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                        xRA[k] = vpl * VA[kp]
+                    else
+                        xRA[k] += vpl * VA[kp]
+                    end
+                end
+            end
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        xbC[k] = i
+                        JC[jpC] = k
+                        jpC += 1
+                        xC[k] = xRA[j]*VPr[kp]
+                    else
+                        xC[k] += xRA[j]*VPr[kp]
+                    end
+                end
+            end
+            for ind in nzrange(C,i)
+                j = JC[ind]
+                VC[ind] = xC[j]
+            end
+        end
+    end
+    function _RAP(Pl,A,Pr) # driver: symbolic pass, numeric pass, then two halfperm! calls
+        C,(xbRA,xRA,JRA,xbC,xC) = RAP_symbolic!(Pl,A,Pr)
+        xbRA .= 0
+        xbC .= 0
+        cache = (xbRA,xRA,JRA,xbC,xC)
+        RAP_numeric!(C,Pl,A,Pr,cache)
+        Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) # presumably this double halfperm sorts the column indices of C — TODO confirm
+        halfperm!(C,Ct)
+        C,cache
+    end
+    _RAP(Pl,A,Pr)
+end
+
+# Reuses internal arrays of A!!!
+function construct_spmmm_cache(C::SparseMatrixCSR,A::SparseMatrixCSR)
+    return JaggedArray(colvals(A), A.rowptr) # built directly from A's index arrays; C is not used
+end
+
+function construct_spmmm_cache(C::SparseMatrixCSC,A::SparseMatrixCSC)
+    return JaggedArray(rowvals(A), A.colptr) # built directly from A's index arrays; C is not used
+end
+
+function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSR)
+    markers, accum, cols, _ = cache # the fourth cache entry is dropped
+    return (markers, accum, cols)
+end
+
+function reduce_spmtmm_cache(cache,::Type{M}  where M <: SparseMatrixCSC)
+    return reduce_spmmmt_cache(cache,SparseMatrixCSR) # CSC input is routed to the CSR method of the spmmmt reducer
+end
+
+function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+             A::SparseMatrixCSR{Bi,Tv,TiA},
+             Pr::SparseMatrixCSR{Bi,Tv,TiPr},
+             cache) where {Bi,Tv,TiPl,TiA,TiPr} # builds C = Pl*A*Pr reusing the work arrays in `cache`; returns (C, cache)
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Pr)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
+    function RAP_symbolic!(Pl,A,Pr,cache) # counts the entries per row of C and allocates C
+        JPl = colvals(Pl)
+        JA = colvals(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        (xbRA,_,JRA,xbC,_) = cache
+        IC = Vector{TiA}(undef,p+1)
+        nnz_C = 1
+        IC[1] = nnz_C
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of Pl*A; reset every row, 0 allows empty rows
+            # stored columns j in row i of Pl
+            for jp in nzrange(Pl, i)
+                j = JPl[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means column k was already recorded for row i of Pl*A
+                    if xbRA[k] != i
+                        ccRA += 1
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                    end
+                end
+            end
+            ccC = 0 # number of distinct columns found in row i of C; reset every row, 0 allows empty rows
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        xbC[k] = i
+                        ccC += 1
+                    end
+                end
+            end
+            nnz_C += ccC
+            IC[i+1] = nnz_C
+        end
+        JC = Vector{TiA}(undef, nnz_C-1)
+        VC = zeros(Tv,nnz_C-1)
+        SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # JC is still uninitialized here; VC is all zeros
+    end
+    function RAP_numeric!(C,Pl,A,Pr,cache) # fills the column indices and the values of C row by row
+        JPl = colvals(Pl)
+        VPl = nonzeros(Pl)
+        JA = colvals(A)
+        VA = nonzeros(A)
+        JPr = colvals(Pr) # column indices of Pr, read row by row below
+        VPr = nonzeros(Pr)
+        JC = colvals(C)
+        VC = nonzeros(C)
+        (xbRA,xRA,JRA,xbC,xC) = cache
+        jpC = 1
+        for i in 1:p
+            ccRA = 0 # number of distinct columns found in row i of Pl*A; reset every row, 0 allows empty rows
+            # stored columns j in row i of Pl
+            for jp in nzrange(Pl, i)
+                j = JPl[jp]
+                vpl = VPl[jp]
+                # stored columns k in row j of A
+                for kp in nzrange(A, j)
+                    k = JA[kp]
+                    # xbRA[k] == i means xRA[k] already holds a partial sum for row i of Pl*A
+                    if xbRA[k] != i
+                        ccRA += 1
+                        JRA[ccRA] = k
+                        xbRA[k] = i
+                        xRA[k] = vpl * VA[kp]
+                    else
+                        xRA[k] += vpl * VA[kp]
+                    end
+                end
+            end
+            for jp in 1:ccRA
+                j = JRA[jp]
+                for kp in nzrange(Pr,j)
+                    k = JPr[kp]
+                    if xbC[k] != i
+                        xbC[k] = i
+                        JC[jpC] = k
+                        jpC += 1
+                        xC[k] = xRA[j]*VPr[kp]
+                    else
+                        xC[k] += xRA[j]*VPr[kp]
+                    end
+                end
+            end
+            for ind in nzrange(C,i)
+                j = JC[ind]
+                VC[ind] = xC[j]
+            end
+        end
+    end
+    function _RAP(Pl,A,Pr,old_cache) # driver: grow the cached buffers if needed, then symbolic and numeric passes
+        max_rPl = find_max_row_length(Pl)
+        max_rA = find_max_row_length(A)
+        max_rPr = find_max_row_length(Pr)
+        (xbRA,xRA,JRA,xbC,xC) = old_cache
+        max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl))
+        JRA2 = max_rC > length(JRA) ? similar(JRA,max_rC) : JRA
+        if r > length(xbRA)
+            xbRA2 = similar(xbRA,r)
+            xRA2 = similar(xRA,r)
+        else
+            xbRA2 = xbRA
+            xRA2 = xRA
+        end
+
+        new_cache = (xbRA2,xRA2,JRA2,xbC,xC) # NOTE(review): xbC/xC are reused without a size check — confirm they cover size(Pr,2)
+        xbRA2 .= 0
+        xbC .= 0
+        C = RAP_symbolic!(Pl,A,Pr,new_cache)
+        xbRA2 .= 0
+        xbC .= 0
+        RAP_numeric!(C,Pl,A,Pr,new_cache)
+        Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) # presumably this double halfperm sorts the column indices of C — TODO confirm
+        halfperm!(C,Ct)
+        C,new_cache
+    end
+    _RAP(Pl,A,Pr,cache)
+end
+
+function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR)
+    (xbRA,xRA,JRA,_,_) = cache
+    (xbRA,xRA,JRA)
+end
+
+function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC)
+    reduce_spmtmm_cache(cache,SparseMatrixCSR)
+end
+
+function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, # overwrites nonzeros(C) with the entries of Pl*A*Pr; colvals(C) is only read
+              Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+              A::SparseMatrixCSR{Bi,Tv,TiA},
+              Pr::SparseMatrixCSR{Bi,Tv,TiPr},
+              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Pr)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
+    JPl = colvals(Pl)
+    VPl = nonzeros(Pl)
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JPr = colvals(Pr) # column indices of Pr; Pr is used as stored in this method (no transpose involved).
+    VPr = nonzeros(Pr)
+    JC = colvals(C)
+    VC = nonzeros(C)
+    VC .= zero(Tv)
+    (xbRA,xRA,JRA,xbC,xC) = cache
+    xbRA .= 0
+    xbC .= 0
+    for i in 1:p
+        lp = 0 # number of distinct columns collected in JRA for row i of Pl*A; reset every row, 0 allows empty rows
+        # loop over columns "j" in row i of Pl
+        for jp in nzrange(Pl, i)
+            j = JPl[jp]
+            vpl = VPl[jp]
+
+            # loop over columns "k" in row j of A
+            for kp in nzrange(A, j)
+                k = JA[kp]
+                va = VA[kp]
+                # xbRA[k] == i marks column k as already seen in row i of Pl*A; xRA[k] accumulates its value.
+                if xbRA[k] != i
+                    lp += 1
+                    JRA[lp] = k
+                    xbRA[k] = i
+                    xRA[k] = vpl * va
+                else
+                    xRA[k] += vpl * va
+                end
+            end
+        end
+        for jp in 1:lp # second stage: multiply row i of Pl*A by Pr, accumulating into xC
+            j = JRA[jp]
+            vra = xRA[j]
+            for kp in nzrange(Pr,j)
+                k = JPr[kp]
+                if xbC[k] != i
+                    xbC[k] = i
+                    xC[k] = vra*VPr[kp]
+                else
+                    xC[k] += vra*VPr[kp]
+                end
+            end
+        end
+        for ind in nzrange(C,i) # copy into the stored entries of row i of C; entries not reached keep the zero set above
+            j = JC[ind]
+            if xbC[j] == i
+                VC[ind] = xC[j]
+            end
+        end
+    end
+    C
+end
+
+function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, # updates nonzeros(C) to β*C + α*(Pl*A*Pr) on C's stored entries; colvals(C) is only read
+              Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+              A::SparseMatrixCSR{Bi,Tv,TiA},
+              Pr::SparseMatrixCSR{Bi,Tv,TiPr},
+              α::Number,
+              β::Number,
+              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Pr)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
+    JPl = colvals(Pl)
+    VPl = nonzeros(Pl)
+    JA = colvals(A)
+    VA = nonzeros(A)
+    JPr = colvals(Pr) # column indices of Pr; Pr is used as stored in this method (no transpose involved).
+    VPr = nonzeros(Pr)
+    JC = colvals(C)
+    VC = nonzeros(C)
+    VC .*= β
+    (xbRA,xRA,JRA,xbC,xC) = cache
+    xbRA .= 0
+    xbC .= 0
+    xC .= zero(Tv)
+    for i in 1:p
+        lp = 0 # number of distinct columns collected in JRA for row i of Pl*A; reset every row, 0 allows empty rows
+        # loop over columns "j" in row i of Pl
+        for jp in nzrange(Pl, i)
+            j = JPl[jp]
+            vpl = VPl[jp]
+            # loop over columns "k" in row j of A
+            for kp in nzrange(A, j)
+                k = JA[kp]
+                # xbRA[k] == i marks column k as already seen in row i of Pl*A; xRA[k] accumulates its value.
+                if xbRA[k] != i
+                    lp += 1
+                    JRA[lp] = k
+                    xbRA[k] = i
+                    xRA[k] = vpl * VA[kp]
+                else
+                    xRA[k] += vpl * VA[kp]
+                end
+            end
+        end
+        for jp in 1:lp # second stage: multiply row i of Pl*A by Pr, accumulating into xC
+            j = JRA[jp]
+            for kp in nzrange(Pr,j)
+                k = JPr[kp]
+                if xbC[k] != i
+                    xbC[k] = i
+                    xC[k] = xRA[j]*VPr[kp]
+                else
+                    xC[k] += xRA[j]*VPr[kp]
+                end
+            end
+        end
+        for ind in nzrange(C,i) # add α times the product into the stored entries of row i; unreached entries keep β*C
+            j = JC[ind]
+            if xbC[j] == i
+                VC[ind] += α*xC[j]
+            end
+        end
+    end
+    C
+end
+
+# RARt variants
+function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA},
+             A::SparseMatrixCSR{Bi,Tv,TiB},
+             Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}}) where {Bi,Tv,TiA,TiB,TiC}
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Prt)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end
+    RAP(Pl,A,copy(Prt))
+end
+
+function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA},
+             A::SparseMatrixCSR{Bi,Tv,TiB},
+             Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}},cache) where {Bi,Tv,TiA,TiB,TiC}
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Prt)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end
+    RAP(Pl,A,copy(Prt),cache)
+end
+
+function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, # overwrites every stored entry of C with the matching entry of Pl*A*Prt
+              Pl::SparseMatrixCSR{Bi,Tv,TiPl}, 
+              A::SparseMatrixCSR{Bi,Tv,TiA}, 
+              Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}},
+              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Prt)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
+    Pr = Prt.parent
+    JPl = colvals(Pl)
+    VPl = nonzeros(Pl)
+    JA = colvals(A)
+    VA = nonzeros(A)
+    IPr = colvals(Pr) # column indices of the parent Pr, i.e. row indices of the lazily transposed Prt.
+    VPr = nonzeros(Pr)
+    JC = colvals(C)
+    VC = nonzeros(C)
+    # only the first two cache entries (marker xb, accumulator x) are used here; the rest serve the regular RAP product used by the allocating version
+    xb,x = cache
+    xb .= 0
+    for i in 1:p
+        # loop over columns "j" in row i of Pl
+        for jp in nzrange(Pl, i)
+            j = JPl[jp]
+            vpl = VPl[jp]
+            # loop over columns "k" in row j of A
+            for kp in nzrange(A, j)
+                k = JA[kp]
+                # xb[k] == i marks column k as already seen in row i of Pl*A; x[k] accumulates its value.
+                if xb[k] != i
+                    xb[k] = i
+                    x[k] = vpl * VA[kp]
+                else
+                    x[k] += vpl * VA[kp]
+                end
+            end
+        end
+        for jpPr in nzrange(C,i) # one sparse dot product per stored entry (i,jPr) of C
+            jPr = JC[jpPr]
+            v = Tv(0)
+            for ip in nzrange(Pr,jPr) # row jPr of Pr is column jPr of Prt
+                iPr = IPr[ip]
+                if xb[iPr] == i
+                    v += x[iPr]*VPr[ip]
+                end
+            end
+            VC[jpPr] = v
+        end
+    end
+    C
+end
+
+function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC}, # updates nonzeros(C) to β*C + α*(Pl*A*Prt) on C's stored entries
+              Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+              A::SparseMatrixCSR{Bi,Tv,TiA},
+              Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}},
+              α::Number,
+              β::Number,
+              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
+    p,q = size(Pl)
+    m,r = size(A)
+    n,s = size(Prt)
+    if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
+    if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
+    Pr = Prt.parent
+    JPl = colvals(Pl)
+    VPl = nonzeros(Pl)
+    JA = colvals(A)
+    VA = nonzeros(A)
+    IPr = colvals(Pr) # column indices of the parent Pr, i.e. row indices of the lazily transposed Prt.
+    VPr = nonzeros(Pr)
+    JC = colvals(C)
+    VC = nonzeros(C)
+    VC .*= β
+    # only the first two cache entries (marker xb, accumulator x) are used here; the rest serve the regular RAP product used by the allocating version
+    xb,x = cache
+    xb .= 0
+    for i in 1:p
+        # loop over columns "j" in row i of Pl
+        for jp in nzrange(Pl, i)
+            j = JPl[jp]
+            vpl = VPl[jp]
+            # loop over columns "k" in row j of A
+            for kp in nzrange(A, j)
+                k = JA[kp]
+                # xb[k] == i marks column k as already seen in row i of Pl*A; x[k] accumulates its value.
+                if xb[k] != i
+                    xb[k] = i
+                    x[k] = vpl * VA[kp]
+                else
+                    x[k] += vpl * VA[kp]
+                end
+            end
+        end
+        for jpPr in nzrange(C,i) # one sparse dot product per stored entry (i,jPr) of C
+            jPr = JC[jpPr]
+            v = Tv(0)
+            for ip in nzrange(Pr,jPr) # row jPr of Pr is column jPr of Prt
+                iPr = IPr[ip]
+                if xb[iPr] == i
+                    v += x[iPr]*VPr[ip]
+                end
+            end
+            VC[jpPr] += α*v
+        end
+    end
+    C
+end
+
+### CSC in terms of CSR
+function RAP(A::SparseMatrixCSC{Tv,TiA},
+             B::SparseMatrixCSC{Tv,TiB},
+             C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC}
+    D,cache = RAP(ascsr(C),ascsr(B),ascsr(A))
+    ascsc(D),cache
+end
+
+function RAP(A::SparseMatrixCSC{Tv,TiA},
+             B::SparseMatrixCSC{Tv,TiB},
+             C::SparseMatrixCSC{Tv,TiC},
+             cache) where {Tv,TiA,TiB,TiC}
+    D,new_cache = RAP(ascsr(C),ascsr(B),ascsr(A),cache)
+    ascsc(D),new_cache
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,TiD},
+              A::SparseMatrixCSC{Tv,TiA},
+              B::SparseMatrixCSC{Tv,TiB},
+              C::SparseMatrixCSC{Tv,TiC},
+              cache) where {Tv,TiD,TiA,TiB,TiC}
+    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache)
+    D
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,TiD},
+              A::SparseMatrixCSC{Tv,TiA},
+              B::SparseMatrixCSC{Tv,TiB},
+              C::SparseMatrixCSC{Tv,TiC},
+              cache::JaggedArray{X,Y} where {X<:Integer, Y<:Integer},
+              acc) where {Tv,TiD,TiA,TiB,TiC}
+    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc)
+    D
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,TiD},
+              A::SparseMatrixCSC{Tv,TiA},
+              B::SparseMatrixCSC{Tv,TiB},
+              C::SparseMatrixCSC{Tv,TiC},
+              α::Number,
+              β::Number,
+              cache) where {Tv,TiD,TiA,TiB,TiC}
+    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache)
+    D
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,TiD},
+              A::SparseMatrixCSC{Tv,TiA},
+              B::SparseMatrixCSC{Tv,TiB},
+              C::SparseMatrixCSC{Tv,TiC},
+              α::Number,
+              β::Number,
+              cache::JaggedArray{X,Y} where {X <: Integer, Y<:Integer},
+              acc) where {Tv,TiD,TiA,TiB,TiC}
+    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc)
+    D
+end
+
+# PtAP
+function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
+             B::SparseMatrixCSC{Tv,TiB},
+             C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC}
+    D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent)))
+    ascsc(D),cache
+end
+
+function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
+             B::SparseMatrixCSC{Tv,TiB},
+             C::SparseMatrixCSC{Tv,TiC},
+             cache) where {Tv,TiA,TiB,TiC}
+    D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
+    ascsc(D),cache
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,TiD},
+              A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
+              B::SparseMatrixCSC{Tv,TiB},
+              C::SparseMatrixCSC{Tv,TiC},
+              cache) where {Tv,TiD,TiA,TiB,TiC}
+    RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
+    D
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,TiD},
+              A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
+              B::SparseMatrixCSC{Tv,TiB},
+              C::SparseMatrixCSC{Tv,TiC},
+              α::Number,
+              β::Number,
+              cache) where {Tv,TiD,TiA,TiB,TiC}
+    RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache)
+    D
+end
+
+# RARt
+function RAP(A::SparseMatrixCSC{Tv,Ti},
+             B::SparseMatrixCSC{Tv,Ti},
+             C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti<:Integer}
+    D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A))
+    ascsc(D),new_cache
+end
+function RAP(A::SparseMatrixCSC{Tv,Ti},
+             B::SparseMatrixCSC{Tv,Ti},
+             C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+             cache) where {Tv,Ti<:Integer}
+    D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
+    ascsc(D),new_cache
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,Ti},
+              A::SparseMatrixCSC{Tv,Ti},
+              B::SparseMatrixCSC{Tv,Ti},
+              C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+              cache) where {Tv,Ti<:Integer}
+    RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
+    D
+end
+
+function RAP!(D::SparseMatrixCSC{Tv,Ti},
+              A::SparseMatrixCSC{Tv,Ti},
+              B::SparseMatrixCSC{Tv,Ti},
+              C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+              α::Number,
+              β::Number,
+              cache) where {Tv,Ti<:Integer}
+    RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache)
+    D
+end
\ No newline at end of file
diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 0b1d6fa2..08e0bd11 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -431,40 +431,6 @@ end
 #    A
 #end
 
-# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array.
-function findnz_minimal(A::SparseMatrixCSC)
-    J = ptr_to_coo(A.colptr)
-    rowvals(A),J,nonzeros(A)
-end
-function findnz_minimal(A::SparseMatrixCSR)
-    I = ptr_to_coo(A.rowptr)
-    I,colvals(A),nonzeros(A)
-end
-
-# Behaves like findnz, but without copying the values.
-function find_indices(A::SparseMatrixCSC)
-    I,J,_ = findnz_minimal(A)
-    copy(I),J
-end
-function find_indices(A::SparseMatrixCSR)
-    I,J,_ = findnz_minimal(A)
-    I,copy(J)
-end
-
-# Could be optimized by a two-way merge-like method when A is a guaranteed submatrix of C.
-function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray)
-    I,J,_ = findnz_minimal(A)
-    K = similar(I)
-    K .= 0
-    for (p,(i,j)) in enumerate(zip(I,J))
-        if i < 1 || j < 1
-            continue
-        end
-        K[p] = nzindex(C,i,j)
-    end
-    K
-end
-
 function precompute_nzindex(A,I,J)
     K = zeros(Int32,length(I))
     for (p,(i,j)) in enumerate(zip(I,J))
@@ -476,25 +442,14 @@ function precompute_nzindex(A,I,J)
     K
 end
 
-# Reuse I vector as K vector. 
-# function precompute_nzindex!(I,A,J)
-#     for (p,(i,j)) in enumerate(zip(I,J))
-#         if i < 1 || j < 1
-#             continue
-#         end
-#         I[p] = nzindex(A,i,j)
-#     end
-#     I
-# end
-
 function precompute_nzindex!(K, A, I, J)
     for (p, (i, j)) in enumerate(zip(I, J))
         if i < 1 || j < 1
             continue
         end
         K[p] = nzindex(A, i, j)
-  end
-
+    end
+end
 
 function sparse_matrix!(A,V,K;reset=true)
     if reset
@@ -510,7 +465,6 @@ function sparse_matrix!(A,V,K;reset=true)
     A
 end
 
-
 # Notation
 # csrr: csr with repeated and unsorted columns
 # csru: csr with unsorted columns
@@ -734,6 +688,43 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A)
     b
 end
 
+################ NEW ################
+
+# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array.
+# Only use for read-only operations.
+function findnz_minimal(A::SparseMatrixCSC)
+    J = ptr_to_coo(A.colptr)
+    rowvals(A),J,nonzeros(A)
+end
+function findnz_minimal(A::SparseMatrixCSR)
+    I = ptr_to_coo(A.rowptr)
+    I,colvals(A),nonzeros(A)
+end
+
+# Behaves like findnz, but without the values.
+function find_indices(A::SparseMatrixCSC)
+    I,J,_ = findnz_minimal(A)
+    copy(I),J
+end
+function find_indices(A::SparseMatrixCSR)
+    I,J,_ = findnz_minimal(A)
+    I,copy(J)
+end
+
+# TODO Could be done without binary searches from nzindex(...), when it is known that A and C are ordered, and A is a guaranteed submatrix of C.
+function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray)
+    I,J,_ = findnz_minimal(A)
+    K = similar(I)
+    K .= 0
+    for (p,(i,j)) in enumerate(zip(I,J))
+        if i < 1 || j < 1
+            continue
+        end
+        K[p] = nzindex(C,i,j)
+    end
+    K
+end
+
 function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti}, n) where Bi
     p,q = size(A)
     @assert n >= q
@@ -773,23 +764,6 @@ function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti})
     ascsr(Acsc_T)
 end
 
-function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSC)
-    sparsecsr(findnz(A)..., size(A)...)
-end
-
-function SparseMatricesCSR.sparsecsr(At::Transpose)
-    transpose(sparsecsr(At.parent))
-end
-
-function SparseMatricesCSR.sparsecsr(A::SparseMatrixCSR)
-    A
-end
-
-function SparseMatricesCSR.sparsecsr(T::Type, A::SparseMatrixCSC)
-    compresscoo(T,findnz(A)..., size(A)...)
-end
-
-
 function pointer_array(A::SparseMatrixCSR)
     A.rowptr
 end
diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl
index 2c1a61ab..a175b722 100644
--- a/test/debug_array/runtests.jl
+++ b/test/debug_array/runtests.jl
@@ -23,4 +23,6 @@ using PartitionedArrays
 
 @testset "fem_example" begin include("fem_example.jl")  end
 
+@testset "spmtmm_tests" begin include("spmtmm_tests.jl")  end
+
 end #module
diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl
new file mode 100644
index 00000000..384aeb21
--- /dev/null
+++ b/test/debug_array/spmtmm_tests.jl
@@ -0,0 +1,14 @@
+module DebugArraySpMtMMTests
+
+using PartitionedArrays
+using SparseArrays
+
+include(joinpath("..","primitives_tests.jl"))
+
+M = sparse(1:5,1:5,1:5)
+@test nnz(M-M) == nnz(M)
+display(M-M)
+
+with_debug(primitives_tests)
+
+end # module
diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl
index 26a3a5d3..ffdc1f1e 100644
--- a/test/mpi_array/runtests.jl
+++ b/test/mpi_array/runtests.jl
@@ -13,5 +13,6 @@ using PartitionedArrays
 @testset "p_timer_tests" begin include("p_timer_tests.jl")  end
 @testset "fdm_example" begin include("fdm_example.jl")  end
 @testset "fem_example" begin include("fem_example.jl")  end
+@testset "spmtmm_tests" begin include("spmtmm_tests.jl")  end
 
 end #module
diff --git a/test/mpi_array/spmtmm_tests.jl b/test/mpi_array/spmtmm_tests.jl
new file mode 100644
index 00000000..c9063604
--- /dev/null
+++ b/test/mpi_array/spmtmm_tests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__,"drivers","spmtmm_tests.jl")
+run_mpi_driver(file;procs=4)
diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl
new file mode 100644
index 00000000..8e162bd3
--- /dev/null
+++ b/test/spmtmm_tests.jl
@@ -0,0 +1,216 @@
+module SpMtMMTests
+
+using SparseArrays
+using SparseMatricesCSR
+using PartitionedArrays
+using LinearAlgebra
+using Test
+
+# Exact equality of a CSC and a CSR matrix: B is rebuilt as CSC, then structure (colptr, rowvals) and values are compared.
+function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    Bcsc = sparse(findnz(B)...,size(B)...)
+    if (A.colptr != Bcsc.colptr || rowvals(A) != rowvals(Bcsc)) && return false; end # colptr too: rowvals alone do not fix the column of an entry
+    if nonzeros(A) != nonzeros(Bcsc) && return false; end
+    true
+end
+
+function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    if A.colptr != B.colptr && return false; end
+    if rowvals(A) != rowvals(B) && return false; end
+    if nonzeros(A) != nonzeros(B) && return false; end
+    true
+end
+
+function strictly_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    if A.rowptr != B.rowptr && return false; end
+    if colvals(A) != colvals(B) && return false; end
+    if nonzeros(A) != nonzeros(B) && return false; end
+    true
+end
+
+function strictly_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end
+
+# Approximate equality of a CSC and a CSR matrix: B is rebuilt as CSC; structure must match exactly, values via isapprox.
+function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR,args...)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    Bcsc = sparse(findnz(B)...,size(B)...)
+    if A.colptr != Bcsc.colptr && return false; end
+    if rowvals(A) != rowvals(Bcsc) && return false; end
+    if !isapprox(nonzeros(A),nonzeros(Bcsc),args...) && return false; end # compare against Bcsc: nonzeros(B) is stored row by row, nonzeros(A) column by column
+    true
+end
+
+function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    if A.colptr != B.colptr && return false; end
+    if rowvals(A) != rowvals(B) && return false; end
+    if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end
+    true
+end
+
+# Structurally A and B must be equal, but numerically they can be approximately equal
+function approx_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR,args...)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    if A.rowptr != B.rowptr && return false; end
+    if colvals(A) != colvals(B) && return false; end
+    if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end
+    true
+end
+
+function approx_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC,args...) approx_equivalent(B,A,args...) end # mirror of the (CSC,CSR) method; stays approximate and forwards extra arguments
+
+
+function parallel_tests(pA,pB,sparse_func)
+    A = centralize(sparse_func,pA)
+    B = centralize(sparse_func,pB)
+    # explicit parallel transpose
+
+    pBt = explicit_transpose(pB) |> fetch
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    B_struct = symbolic_halfperm(B)
+    @test pointer_array(hp_B) == B_struct.ptrs
+    @test index_array(hp_B) == B_struct.data
+    @test Bt == hp_B
+
+    pBt_local,t = explicit_transpose(pB,reuse=true)
+    pBt, transpose_cache = fetch(t)
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    t = explicit_transpose!(pBt,pBt_local,pB,transpose_cache)
+    wait(t)
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    AB0 = A*B
+    C0 = transpose(B)*AB0
+    # test basic sequential csr implementations to default csc sequential implementations.
+    pAB,cacheAB = spmm(pA,pB,reuse=true)
+    AB = centralize(sparse_func,pAB)
+    @test approx_equivalent(AB,AB0)
+    
+    # pB will be transposed internally
+    pC,cacheC = spmtm(pB,pAB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmm!(pAB,pA,pB,cacheAB)
+    AB = centralize(sparse_func,pAB)
+
+    @test approx_equivalent(AB,AB0)
+    spmtm!(pC,pB,pAB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    pC,cacheC = spmtmm(pB,pA,pB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    spmtmm!(pC,pB,pA,pB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    
+    # test basic sequential csr implementations to default csc sequential implementations.
+    pC,cacheC = spmm(pBt,pAB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmm!(pC,pBt,pAB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # pB will be transposed internally
+    pC,cacheC = spmmm(pBt,pA,pB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmmm!(pC,pBt,pA,pB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # unequal sizes backward (small to large)
+    if size(pA) != size(pB)
+        CB0 = C0*Bt
+        D0 = transpose(Bt)*CB0
+        pCB,cacheCB = spmm(pC,pBt,reuse=true)
+        CB = centralize(sparse_func,pCB)
+        @test approx_equivalent(CB,CB0)
+
+        pD,cacheD = spmtm(pBt,pCB,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmm!(pCB,pC,pBt,cacheCB)
+        CB = centralize(sparse_func,pCB)
+        @test approx_equivalent(CB,CB0)
+        spmtm!(pD,pBt,pCB,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        
+        pD,cacheD = spmtmm(pBt,pC,pBt,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmtmm!(pD,pBt,pC,pBt,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmm(pB,pCB,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        
+        pD,cacheD = spmmm(pB,pC,pBt,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmmm!(pD,pB,pC,pBt,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+    end
+end
+
+function spmtmm_tests(distribute)
+    nodes_per_dir = (5,5,5)
+    parts_per_dir = (1,2,2)
+    np = prod(parts_per_dir)
+    ranks = distribute(LinearIndices((np,)))
+    Ti = Int32
+    pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch
+    pB = pA
+    parallel_tests(pA,pB,sparsecsr)
+
+    T = eltype(typeof(own_own_values(pA).items))
+
+    pB = prolongator(T,pA)
+    B = centralize(T,pB)
+    sequential_tests(pA,pB)
+
+    #### CSC
+    do_CSC = true
+    if do_CSC
+        pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch
+        T = eltype(typeof(own_own_values(pA).items))
+
+        pB = pA
+        parallel_tests(pA,pB,sparse)
+
+        parallel_time(pA,pB,sparse)
+        T = eltype(typeof(own_own_values(pA).items))
+        pB = prolongator(T,pA)
+        B = centralize(T,pB)
+        parallel_tests(pA,pB,sparse)
+    end
+end
+
+end # module
+;
+
diff --git a/times.txt b/times.txt
index 53764f74..f7794112 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2068526, max = 0.2068526, avg = 0.2068526), "Phase 3" => (min = 0.0, max = 0.0, avg = 0.0), "Matrix Assembly" => (min = 0.4054487, max = 0.4054487, avg = 0.4054487), "Phase 1" => (min = 1.1e-6, max = 1.1e-6, avg = 1.1e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2174075, max = 0.2174075, avg = 0.2174075), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4207836, max = 0.4207836, avg = 0.4207836), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6))

From 33a36ada74c23b6b4550a886bbc6b2519c4aadd9 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 27 Jan 2025 17:45:51 +0100
Subject: [PATCH 23/34] fixed tests, add missing symbolic_halfperm methods.

---
 src/PartitionedArrays.jl         |  7 +++
 src/gallery.jl                   |  2 +-
 src/sparse_utils.jl              | 73 ++++++++++++++++++++++++++-
 test/debug_array/spmtmm_tests.jl | 13 +++--
 test/mpi_array/runtests.jl       | 20 ++++----
 test/spmtmm_tests.jl             | 87 ++++++--------------------------
 times.txt                        |  2 +-
 7 files changed, 114 insertions(+), 90 deletions(-)

diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl
index 8505d709..2d250e0a 100644
--- a/src/PartitionedArrays.jl
+++ b/src/PartitionedArrays.jl
@@ -25,6 +25,12 @@ export compresscoo
 export indextype
 export sparse_matrix
 export sparse_matrix!
+export index_array
+export pointer_array
+export halfperm
+export halfperm!
+export symbolic_halfperm
+export symbolic_halfperm!
 include("sparse_utils.jl")
 
 export linear_indices
@@ -202,6 +208,7 @@ export node_coordinates_unit_cube
 export nullspace_linear_elasticity
 export nullspace_linear_elasticity!
 export near_nullspace_linear_elasticity
+export prolongator
 include("gallery.jl")
 
 export RAP
diff --git a/src/gallery.jl b/src/gallery.jl
index 0757ad9d..06933725 100644
--- a/src/gallery.jl
+++ b/src/gallery.jl
@@ -586,4 +586,4 @@ function nullspace_linear_elasticity!(B,x)
         error("case not implemented")
     end
     B
-end
\ No newline at end of file
+end
diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 08e0bd11..ea12f3f9 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -799,6 +799,15 @@ function find_max_row_length(A::SparseMatrixCSR)
     max_rA
 end
 
+function find_max_row_length(A::JaggedArray)
+    max_rA = 0
+    for i in 1:length(A.ptrs)-1
+        l = length(jagged_range(A,i))
+        max_rA = max_rA > l ? max_rA : l
+    end
+    max_rA
+end
+
 function find_max_col_length(A::SparseMatrixCSC)
     max_cA = 0
     for j in 1:size(A,2)
@@ -822,7 +831,6 @@ function ascsc(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
     SparseMatrixCSC{Tv,Ti}(q,p,A.rowptr,colvals(A),nonzeros(A))
 end
 
-
 function halfperm(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
     q = size(A,2)
     JA,VA = colvals(A),nonzeros(A)
@@ -905,4 +913,65 @@ function counts_to_ptrs!(v)
     foreach(i->v[i]+=v[i-1],2:l)
     shift_by_one!(v)
     v[1] = 1
-end
\ No newline at end of file
+end
+
+function symbolic_halfperm(A::SparseMatrixCSR)
+    q = size(A,2)
+    JA = colvals(A)
+    IAt,JAt = similar(A.rowptr,q+1),similar(JA)
+    symbolic_halfperm!(IAt,JAt,A)
+end
+
+# Symbolic transpose of A (structure only, no values): fills pointer array IAt and index array JAt, returned as a JaggedArray.
+function symbolic_halfperm!(IAt,JAt,A::SparseMatrixCSR)
+    JA= colvals(A)
+    p,q = size(A)
+    count_occurrences!(IAt,JA) # presumably counts the entries per column of A into IAt; helper not shown here, confirm
+    counts_to_ptrs!(IAt)
+    shift_by_one!(IAt) # after this shift IAt[j+1] is used below as the running insert position for column j
+    for i in 1:p
+        for jp in nzrange(A,i)
+            j = JA[jp]
+            jpt = IAt[j+1] # next free slot for column j of A
+            JAt[jpt] = i # row i of A becomes an index in row j of the transpose
+            IAt[j+1] = jpt+1
+        end
+    end
+    IAt[1] = 1
+    JaggedArray(JAt,IAt)
+end
+
+# Symbolic transpose of a CSC matrix into pointer array IAt and index array JAt (no values); forwards to the CSR method on ascsr(A).
+function symbolic_halfperm!(IAt,JAt,A::SparseMatrixCSC)
+    symbolic_halfperm!(IAt,JAt,ascsr(A))
+end
+
+function symbolic_halfperm(A::SparseMatrixCSC)
+    symbolic_halfperm(ascsr(A))
+end
+
+# retranspose At back into A
+function symbolic_halfperm!(A::SparseMatrixCSR,At::JaggedArray)
+    IA,JA = pointer_array(A),index_array(A)
+    JAt = At.data
+    # p = size(A,1)
+    shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc.
+    IA[1] = 1
+    for i in 1:size(A,2)
+        for jpt in jagged_range(At,i)
+            j = JAt[jpt]
+            jp = IA[j+1]
+            JA[jp] = i
+            IA[j+1] = jp+1
+        end
+    end
+    A
+end
+
+# retranspose At back into A
+function symbolic_halfperm!(A::SparseMatrixCSC,At::JaggedArray)
+    symbolic_halfperm!(ascsr(A),At)
+    A
+end
+
+
diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl
index 384aeb21..ba3cf431 100644
--- a/test/debug_array/spmtmm_tests.jl
+++ b/test/debug_array/spmtmm_tests.jl
@@ -1,14 +1,19 @@
 module DebugArraySpMtMMTests
 
 using PartitionedArrays
-using SparseArrays
+using Test
 
-include(joinpath("..","primitives_tests.jl"))
+include(joinpath("..","spmtmm_tests.jl"))
 
-M = sparse(1:5,1:5,1:5)
+v = 1:5
+M = sparse(v,v,v)
 @test nnz(M-M) == nnz(M)
 display(M-M)
 
-with_debug(primitives_tests)
+M = sparsecsr(v,v,v)
+@test nnz(M-M) == nnz(M)
+display(M-M)
+
+with_debug(spmtmm_tests)
 
 end # module
diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl
index ffdc1f1e..fc6f0aee 100644
--- a/test/mpi_array/runtests.jl
+++ b/test/mpi_array/runtests.jl
@@ -3,16 +3,16 @@ module MPIArrayRunTests
 using Test
 using PartitionedArrays
 
-@testset "mpi_array" begin include("mpi_array_tests.jl") end
-@testset "primitives" begin include("primitives_tests.jl")  end
-@testset "p_range_tests" begin include("p_range_tests.jl")  end
-@testset "p_vector_tests" begin include("p_vector_tests.jl")  end
-@testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl")  end
-@testset "gallery" begin include("gallery_tests.jl")  end
-@testset "block_arrays" begin include("block_arrays_tests.jl")  end
-@testset "p_timer_tests" begin include("p_timer_tests.jl")  end
-@testset "fdm_example" begin include("fdm_example.jl")  end
-@testset "fem_example" begin include("fem_example.jl")  end
+# @testset "mpi_array" begin include("mpi_array_tests.jl") end
+# @testset "primitives" begin include("primitives_tests.jl")  end
+# @testset "p_range_tests" begin include("p_range_tests.jl")  end
+# @testset "p_vector_tests" begin include("p_vector_tests.jl")  end
+# @testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl")  end
+# @testset "gallery" begin include("gallery_tests.jl")  end
+# @testset "block_arrays" begin include("block_arrays_tests.jl")  end
+# @testset "p_timer_tests" begin include("p_timer_tests.jl")  end
+# @testset "fdm_example" begin include("fdm_example.jl")  end
+# @testset "fem_example" begin include("fem_example.jl")  end
 @testset "spmtmm_tests" begin include("spmtmm_tests.jl")  end
 
 end #module
diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl
index 8e162bd3..d8d4e658 100644
--- a/test/spmtmm_tests.jl
+++ b/test/spmtmm_tests.jl
@@ -1,52 +1,9 @@
-module SpMtMMTests
-
 using SparseArrays
 using SparseMatricesCSR
 using PartitionedArrays
 using LinearAlgebra
 using Test
 
-# Equality definition for SparseCSC and SparseCSR. If the size and lengths match, the CSR matrix is converted
-function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR)
-    if size(A) != size(B) && return false; end
-    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
-    Bcsc = sparse(findnz(B)...,size(B)...)
-    if rowvals(A) != rowvals(Bcsc) && return false; end
-    if nonzeros(A) != nonzeros(Bcsc) && return false; end
-    true
-end
-
-function strictly_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC)
-    if size(A) != size(B) && return false; end
-    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
-    if A.colptr != B.colptr && return false; end
-    if rowvals(A) != rowvals(B) && return false; end
-    if nonzeros(A) != nonzeros(B) && return false; end
-    true
-end
-
-function strictly_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR)
-    if size(A) != size(B) && return false; end
-    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
-    if A.rowptr != B.rowptr && return false; end
-    if colvals(A) != colvals(B) && return false; end
-    if nonzeros(A) != nonzeros(B) && return false; end
-    true
-end
-
-function strictly_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end
-
-# Equality definition for SparseCSC and SparseCSR. If the size and lengths match, the CSR matrix is converted
-function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSR,args...)
-    if size(A) != size(B) && return false; end
-    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
-    Bcsc = sparse(findnz(B)...,size(B)...)
-    if A.colptr != Bcsc.colptr && return false; end
-    if rowvals(A) != rowvals(Bcsc) && return false; end
-    if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end
-    true
-end
-
 function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...)
     if size(A) != size(B) && return false; end
     if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
@@ -66,9 +23,6 @@ function approx_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR,args...)
     true
 end
 
-function approx_equivalent(A::SparseMatrixCSR,B::SparseMatrixCSC) strictly_equivalent(B,A) end
-
-
 function parallel_tests(pA,pB,sparse_func)
     A = centralize(sparse_func,pA)
     B = centralize(sparse_func,pB)
@@ -188,29 +142,18 @@ function spmtmm_tests(distribute)
     pB = pA
     parallel_tests(pA,pB,sparsecsr)
 
-    T = eltype(typeof(own_own_values(pA).items))
-
-    pB = prolongator(T,pA)
-    B = centralize(T,pB)
-    sequential_tests(pA,pB)
-
-    #### CSC
-    do_CSC = true
-    if do_CSC
-        pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch
-        T = eltype(typeof(own_own_values(pA).items))
-
-        pB = pA
-        parallel_tests(pA,pB,sparse)
-
-        parallel_time(pA,pB,sparse)
-        T = eltype(typeof(own_own_values(pA).items))
-        pB = prolongator(T,pA)
-        B = centralize(T,pB)
-        parallel_tests(pA,pB,sparse)
-    end
-end
-
-end # module
-;
-
+    # Testing with a real prolongator requires PartitionedSolvers
+    # T = eltype(typeof(own_own_values(pA).items))
+    # pB = prolongator(T,pA)
+    # parallel_tests(pA,pB,sparsecsr)
+    
+    #### CSC ####
+    pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch
+    pB = pA
+    parallel_tests(pA,pB,sparse)
+    
+    # Testing with a real prolongator requires PartitionedSolvers
+    # T = eltype(typeof(own_own_values(pA).items))
+    # pB = prolongator(T,pA)
+    # parallel_tests(pA,pB,sparse)
+end
\ No newline at end of file
diff --git a/times.txt b/times.txt
index f7794112..1f2c118d 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2174075, max = 0.2174075, avg = 0.2174075), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4207836, max = 0.4207836, avg = 0.4207836), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2025824, max = 0.2025824, avg = 0.2025824), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4105116, max = 0.4105116, avg = 0.4105116), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6))

From a1b0f8f1b6572ed0f8b148387137b96752bfd0fb Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Mon, 27 Jan 2025 17:59:16 +0100
Subject: [PATCH 24/34] uncommented a test line.

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index ed7aff49..92768453 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,6 +5,6 @@ using Test
 @testset "jagged_array" begin include("jagged_array_tests.jl") end
 @testset "sparse_utils" begin include("sparse_utils_tests.jl") end
 @testset "debug_array" begin include("debug_array/runtests.jl") end
-# @testset "mpi_array" begin include("mpi_array/runtests.jl") end
+@testset "mpi_array" begin include("mpi_array/runtests.jl") end
 
 end # module

From c29b7d3246afb577c0f3ede3bf6833452501181a Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Tue, 28 Jan 2025 13:10:18 +0100
Subject: [PATCH 25/34] Added spmtmm mpi driver to tests

---
 test/mpi_array/drivers/spmtmm_tests.jl | 10 ++++++++++
 times.txt                              |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 test/mpi_array/drivers/spmtmm_tests.jl

diff --git a/test/mpi_array/drivers/spmtmm_tests.jl b/test/mpi_array/drivers/spmtmm_tests.jl
new file mode 100644
index 00000000..50c3668a
--- /dev/null
+++ b/test/mpi_array/drivers/spmtmm_tests.jl
@@ -0,0 +1,10 @@
+module MPIArrayPrimitivesTests
+
+using PartitionedArrays
+
+include(joinpath("..","..","spmtmm_tests.jl"))
+
+with_mpi(spmtmm_tests)
+
+end # module
+
diff --git a/times.txt b/times.txt
index 1f2c118d..2e3f93ca 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2025824, max = 0.2025824, avg = 0.2025824), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4105116, max = 0.4105116, avg = 0.4105116), "Phase 1" => (min = 1.4e-6, max = 1.4e-6, avg = 1.4e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2149521, max = 0.2149521, avg = 0.2149521), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4041403, max = 0.4041403, avg = 0.4041403), "Phase 1" => (min = 4.0e-7, max = 4.0e-7, avg = 4.0e-7))

From 18b564e6b9fb372712b151443e6df5d1b1e5094b Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Tue, 28 Jan 2025 13:19:53 +0100
Subject: [PATCH 26/34] uncommented old code for PartitionedSolvers tests
 (rap(...) and rap!(...)).

---
 src/p_sparse_matrix.jl | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index e663b841..7b76a129 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2263,13 +2263,13 @@ function sparse_diag_matrix(::Type{T},d::PVector,shape) where T
 end
 
 ### OLD ###
-# function rap(R,A,P;reuse=Val(false))
-#     Ac = R*A*P
-#     if val_parameter(reuse)
-#         return Ac, nothing
-#     end
-#     Ac
-# end
+function rap(R,A,P;reuse=Val(false))
+    Ac = R*A*P
+    if val_parameter(reuse)
+        return Ac, nothing
+    end
+    Ac
+end
 
 ### NEW ###
 function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false))
@@ -2281,12 +2281,12 @@ function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)
 end
 
 ### OLD ###
-# function rap!(Ac,R,A,P,cache)
-#     # TODO improve performance
-#     tmp = R*A*P
-#     copyto!(Ac,tmp)
-#     Ac
-# end
+function rap!(Ac,R,A,P,cache)
+    # TODO improve performance
+    tmp = R*A*P
+    copyto!(Ac,tmp)
+    Ac
+end
 
 ### NEW ###
 function rap!(Ac::PSparseMatrix,R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache)

From cddd0ad52ffd68d84a8b641737264428c2d15355 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Tue, 28 Jan 2025 13:22:49 +0100
Subject: [PATCH 27/34] changed RAP function names to rap for consistency

---
 src/p_sparse_matrix.jl            |  64 +++++++-------
 src/sequential_implementations.jl | 142 +++++++++++++++---------------
 2 files changed, 103 insertions(+), 103 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 7b76a129..e8948c54 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -3138,8 +3138,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal
     Aog = own_ghost_values(A)
     Bog = own_ghost_values(B)
     
-    Doo1, Doo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays
-    Dgo1, Dgo_cache = map((A,B,C)->RAP(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays
+    Doo1, Doo_cache = map((A,B,C)->rap(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays
+    Dgo1, Dgo_cache = map((A,B,C)->rap(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays
     
     # Collect ghost rows from P before continuing
     C2, consistencyCache = fetch(consistency_task)
@@ -3148,9 +3148,9 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal
     Cgo = ghost_own_values(C2)
     Cgg = ghost_ghost_values(C2)
 
-    Dgo2, Dgo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays
-    Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays
-    Dog2, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays        
+    Dgo2, Dgo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays
+    Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays
+    Dog2, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays        
 
     Dgo = map(+,Dgo1,Dgo2) # different sparsity patterns so not in-place.
     Dog = map(+,Dog1,Dog2)
@@ -3164,9 +3164,9 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal
     D1_unassembled = PSparseMatrix(D1_values, partition(axes(C,2)), partition(axes(C2,2)), false)
     assembly_task = assemble(D1_unassembled, reuse=true)
 
-    Dog1, Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays
-    Doo2,Doo_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
-    Dog2,Dog_cache = map((A,B,C,cache)->RAP(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
+    Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays
+    Doo2,Doo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
+    Dog2,Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
 
     Doo = map(+,Doo1,Doo2)
     Dog = map(+,Dog1,Dog2)
@@ -3216,8 +3216,8 @@ function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMa
     Aog = own_ghost_values(A)
     Bog = own_ghost_values(B)
     
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache)
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache)
     
     # Collect ghost rows from P before continuing
     wait(consistency_task)
@@ -3225,16 +3225,16 @@ function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMa
     Cgo = ghost_own_values(C2)
     Cgg = ghost_ghost_values(C2)
 
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache)
 
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache)
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache)
 
     assembly_task = assemble!(D1, D1_unassembled, assemblyCache)
     
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache)
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache)
-    map((D,A,B,C,cache)->RAP!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache)
+    map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache)
     
     wait(assembly_task)
     add!(D, D1, D2, mergeCache)
@@ -3252,7 +3252,7 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals
     Boo = own_own_values(B)
     Coo = own_own_values(C)
 
-    Doo1,Doo_cache = map(RAP,Aoo,Boo,Coo) |> tuple_of_arrays
+    Doo1,Doo_cache = map(rap,Aoo,Boo,Coo) |> tuple_of_arrays
     B2, Bcache = fetch(B2_task)
     C2_task = consistent(C,partition(axes(B2,2)),reuse=true)
 
@@ -3260,7 +3260,7 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals
     Bgo = ghost_own_values(B2)
     Bgg = ghost_ghost_values(B2)
 
-    Doo2,Doo_cache = map(RAP,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays
+    Doo2,Doo_cache = map(rap,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays
     Doo12 = map(+,Doo1,Doo2)
 
     C2, Ccache = fetch(C2_task)
@@ -3269,16 +3269,16 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals
     Cgo = ghost_own_values(C2)
     Cgg = ghost_ghost_values(C2)
 
-    Doo3,Doo_cache = map(RAP,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
-    Doo4,Doo_cache = map(RAP,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays
+    Doo3,Doo_cache = map(rap,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
+    Doo4,Doo_cache = map(rap,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays
   
     Doo34 = map(+,Doo3,Doo4)
     Doo = map(+,Doo12,Doo34)
   
-    Dog1,Dog_cache = map(RAP,Aoo,Boo,Cog) |> tuple_of_arrays
-    Dog2,Dog_cache = map(RAP,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays
-    Dog3,Dog_cache = map(RAP,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
-    Dog4,Dog_cache = map(RAP,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays
+    Dog1,Dog_cache = map(rap,Aoo,Boo,Cog) |> tuple_of_arrays
+    Dog2,Dog_cache = map(rap,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays
+    Dog3,Dog_cache = map(rap,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
+    Dog4,Dog_cache = map(rap,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays
 
     Dog12 = map(+,Dog1,Dog2)
     Dog34 = map(+,Dog3,Dog4)
@@ -3310,25 +3310,25 @@ function spmmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMat
     Aog = own_ghost_values(A)
     Boo = own_own_values(B)
     Coo = own_own_values(C)
-    map(RAP!,Doo,Aoo,Boo,Coo,Doo_cache)
+    map(rap!,Doo,Aoo,Boo,Coo,Doo_cache)
     wait(B2_task)
 
     C2_task = consistent!(C2,C,Ccache)
     Bog = own_ghost_values(B2)
     Bgo = ghost_own_values(B2)
     Bgg = ghost_ghost_values(B2)
-    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache)
+    map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache)
 
     wait(C2_task)
     Cog = own_ghost_values(C2)
     Cgo = ghost_own_values(C2)
     Cgg = ghost_ghost_values(C2)
 
-    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache)
-    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache)
-    map(RAP!,Dog,Aoo,Boo,Cog,Dog_cache)
-    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache)
-    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache)
-    map((D,A,B,C,cache)->RAP!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache)
+    map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache)
+    map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache)
+    map(rap!,Dog,Aoo,Boo,Cog,Dog_cache)
+    map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache)
+    map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache)
+    map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache)
     D
 end
\ No newline at end of file
diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl
index 94ab1d08..b628a3d9 100644
--- a/src/sequential_implementations.jl
+++ b/src/sequential_implementations.jl
@@ -647,13 +647,13 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
 end
 
 # PtAP variants
-function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
     p,q = size(Plt)
     m,r = size(A)
     n,s = size(Pr)
     if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
     if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
-    function RAP_symbolic_count!(R,A,Pr)
+    function rap_symbolic_count!(R,A,Pr)
         JR = R.data
         JA = colvals(A)
         JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
@@ -706,7 +706,7 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi
         cache = (xbRA,JRA,xbC,JAP)
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized
     end
-    function RAP_symbolic_fill!(C,R,A,Pr,cache)
+    function rap_symbolic_fill!(C,R,A,Pr,cache)
         (xbRA,JRA,xbC,JAP) = cache
         JC = colvals(C)
         JR = R.data
@@ -745,18 +745,18 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi
         outer_cache = (xbC,similar(xbC, Tv),JAP)
         C, outer_cache # values not yet initialized
     end
-    function _RAP(Plt,A,Pr)
+    function _rap(Plt,A,Pr)
         R = symbolic_halfperm(Plt.parent)
-        C,symbolic_cache = RAP_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose
-        _,outer_cache = RAP_symbolic_fill!(C,R,A,Pr,symbolic_cache)
+        C,symbolic_cache = rap_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose
+        _,outer_cache = rap_symbolic_fill!(C,R,A,Pr,symbolic_cache)
         Ct = symbolic_halfperm(C)
         symbolic_halfperm!(C,Ct)
-        RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
+        rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
     end
-    _RAP(Plt,A,Pr)
+    _rap(Plt,A,Pr)
 end
 
-function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
              A::SparseMatrixCSR{Bi,Tv,Ti},
              Pr::SparseMatrixCSR{Bi,Tv,Ti},
              cache) where {Bi,Tv,Ti}
@@ -766,7 +766,7 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
     if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
     if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
     
-    function RAP_symbolic_count!(R,A,Pr)
+    function rap_symbolic_count!(R,A,Pr)
         JR = R.data
         JA = colvals(A)
         JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
@@ -818,7 +818,7 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
         xbC .= 0
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized
     end
-    function RAP_symbolic_fill!(C,R,A,Pr,cache)
+    function rap_symbolic_fill!(C,R,A,Pr,cache)
         (xbRA,JRA,xbC,JAP) = cache
         JC = colvals(C)
         JR = R.data
@@ -856,17 +856,17 @@ function RAP(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
         xbC .= 0
         C, (xbC,similar(xbC, Tv),JAP) # values not yet initialized
     end
-    function _RAP(Plt,A,Pr,old_cache)
+    function _rap(Plt,A,Pr,old_cache)
         xb,x,JAP,R = old_cache
         old_outer_cache = (xb,x,JAP)
-        C,symbolic_cache = RAP_symbolic_count!(R, A, Pr)
-        _,new_outer_cache = RAP_symbolic_fill!(C,R, A, Pr, symbolic_cache)
+        C,symbolic_cache = rap_symbolic_count!(R, A, Pr)
+        _,new_outer_cache = rap_symbolic_fill!(C,R, A, Pr, symbolic_cache)
         Ct = symbolic_halfperm(C)
         symbolic_halfperm!(C,Ct)
         outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? c1 : c2, old_outer_cache,new_outer_cache)
-        RAP!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
+        rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
     end
-    _RAP(Plt,A,Pr,cache)
+    _rap(Plt,A,Pr,cache)
 end
 
 function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR})
@@ -874,7 +874,7 @@ function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR})
     (xb,x,JAP)
 end
 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti}, 
+function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, 
               Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
               A::SparseMatrixCSR{Bi,Tv,Ti},
               Pr::SparseMatrixCSR{Bi,Tv,Ti},
@@ -932,7 +932,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti},
+function rap!(C::SparseMatrixCSR{Bi,Tv,Ti},
               Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
               A::SparseMatrixCSR{Bi,Tv,Ti},
               Pr::SparseMatrixCSR{Bi,Tv,Ti},
@@ -991,8 +991,8 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-# RAP variants
-function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+# rap variants
+function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
              A::SparseMatrixCSR{Bi,Tv,TiA},
              Pr::SparseMatrixCSR{Bi,Tv,TiPr}) where {Bi,Tv,TiPl,TiA,TiPr}
     p,q = size(Pl)
@@ -1000,7 +1000,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
     n,s = size(Pr)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    function RAP_symbolic!(Pl,A,Pr)
+    function rap_symbolic!(Pl,A,Pr)
         JPl = colvals(Pl)
         JA = colvals(A)
         JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
@@ -1052,7 +1052,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
         cache = (xbRA,xRA,JRA,xbC,xC)
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized
     end
-    function RAP_numeric!(C,Pl,A,Pr,cache)
+    function rap_numeric!(C,Pl,A,Pr,cache)
         JPl = colvals(Pl)
         VPl = nonzeros(Pl)
         JA = colvals(A)
@@ -1103,17 +1103,17 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             end
         end
     end
-    function _RAP(Pl,A,Pr)
-        C,(xbRA,xRA,JRA,xbC,xC) = RAP_symbolic!(Pl,A,Pr)
+    function _rap(Pl,A,Pr)
+        C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(Pl,A,Pr)
         xbRA .= 0
         xbC .= 0
         cache = (xbRA,xRA,JRA,xbC,xC)
-        RAP_numeric!(C,Pl,A,Pr,cache)
+        rap_numeric!(C,Pl,A,Pr,cache)
         Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C)
         halfperm!(C,Ct)
         C,cache
     end
-    _RAP(Pl,A,Pr)
+    _rap(Pl,A,Pr)
 end
 
 # Reuses internal arrays of A!!!
@@ -1134,7 +1134,7 @@ function reduce_spmtmm_cache(cache,::Type{M}  where M <: SparseMatrixCSC)
     reduce_spmmmt_cache(cache,SparseMatrixCSR)
 end
 
-function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
+function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
              A::SparseMatrixCSR{Bi,Tv,TiA},
              Pr::SparseMatrixCSR{Bi,Tv,TiPr},
              cache) where {Bi,Tv,TiPl,TiA,TiPr}
@@ -1143,7 +1143,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
     n,s = size(Pr)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    function RAP_symbolic!(Pl,A,Pr,cache)
+    function rap_symbolic!(Pl,A,Pr,cache)
         JPl = colvals(Pl)
         JA = colvals(A)
         JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
@@ -1185,7 +1185,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
         VC = zeros(Tv,nnz_C-1)
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized
     end
-    function RAP_numeric!(C,Pl,A,Pr,cache)
+    function rap_numeric!(C,Pl,A,Pr,cache)
         JPl = colvals(Pl)
         VPl = nonzeros(Pl)
         JA = colvals(A)
@@ -1236,7 +1236,7 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             end
         end
     end
-    function _RAP(Pl,A,Pr,old_cache)
+    function _rap(Pl,A,Pr,old_cache)
         max_rPl = find_max_row_length(Pl)
         max_rA = find_max_row_length(A)
         max_rPr = find_max_row_length(Pr)
@@ -1254,15 +1254,15 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
         new_cache = (xbRA2,xRA2,JRA2,xbC,xC)
         xbRA2 .= 0
         xbC .= 0
-        C = RAP_symbolic!(Pl,A,Pr,new_cache)
+        C = rap_symbolic!(Pl,A,Pr,new_cache)
         xbRA2 .= 0
         xbC .= 0
-        RAP_numeric!(C,Pl,A,Pr,new_cache)
+        rap_numeric!(C,Pl,A,Pr,new_cache)
         Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C)
         halfperm!(C,Ct)
         C,new_cache
     end
-    _RAP(Pl,A,Pr,cache)
+    _rap(Pl,A,Pr,cache)
 end
 
 function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR)
@@ -1274,7 +1274,7 @@ function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC)
     reduce_spmtmm_cache(cache,SparseMatrixCSR)
 end
 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
+function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
               Pl::SparseMatrixCSR{Bi,Tv,TiPl},
               A::SparseMatrixCSR{Bi,Tv,TiA},
               Pr::SparseMatrixCSR{Bi,Tv,TiPr},
@@ -1341,7 +1341,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
     C
 end
 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
+function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
               Pl::SparseMatrixCSR{Bi,Tv,TiPl},
               A::SparseMatrixCSR{Bi,Tv,TiA},
               Pr::SparseMatrixCSR{Bi,Tv,TiPr},
@@ -1409,7 +1409,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
 end
 
 # RARt variants
-function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA},
+function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA},
              A::SparseMatrixCSR{Bi,Tv,TiB},
              Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}}) where {Bi,Tv,TiA,TiB,TiC}
     p,q = size(Pl)
@@ -1417,10 +1417,10 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA},
     n,s = size(Prt)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end
-    RAP(Pl,A,copy(Prt))
+    rap(Pl,A,copy(Prt))
 end
 
-function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA},
+function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA},
              A::SparseMatrixCSR{Bi,Tv,TiB},
              Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}},cache) where {Bi,Tv,TiA,TiB,TiC}
     p,q = size(Pl)
@@ -1428,10 +1428,10 @@ function RAP(Pl::SparseMatrixCSR{Bi,Tv,TiA},
     n,s = size(Prt)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end
-    RAP(Pl,A,copy(Prt),cache)
+    rap(Pl,A,copy(Prt),cache)
 end
 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
+function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
               Pl::SparseMatrixCSR{Bi,Tv,TiPl}, 
               A::SparseMatrixCSR{Bi,Tv,TiA}, 
               Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}},
@@ -1450,7 +1450,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
     VPr = nonzeros(Pr)
     JC = colvals(C)
     VC = nonzeros(C)
-    # some cache items are present with the regular RAP product in mind, which is how the allocating verison is performed
+    # some cache items are present with the regular rap product in mind, which is how the allocating version is performed
     xb,x = cache
     xb .= 0
     for i in 1:p
@@ -1485,7 +1485,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
     C
 end
 
-function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
+function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
               Pl::SparseMatrixCSR{Bi,Tv,TiPl},
               A::SparseMatrixCSR{Bi,Tv,TiA},
               Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}},
@@ -1507,7 +1507,7 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
     JC = colvals(C)
     VC = nonzeros(C)
     VC .*= β
-    # some cache items are present with the regular RAP product in mind, which is how the allocating verison is performed
+    # some cache items are present with the regular rap product in mind, which is how the allocating version is performed
     xb,x = cache
     xb .= 0
     for i in 1:p
@@ -1543,52 +1543,52 @@ function RAP!(C::SparseMatrixCSR{Bi,Tv,TiC},
 end
 
 ### CSC in terms of CSR
-function RAP(A::SparseMatrixCSC{Tv,TiA},
+function rap(A::SparseMatrixCSC{Tv,TiA},
              B::SparseMatrixCSC{Tv,TiB},
              C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC}
-    D,cache = RAP(ascsr(C),ascsr(B),ascsr(A))
+    D,cache = rap(ascsr(C),ascsr(B),ascsr(A))
     ascsc(D),cache
 end
 
-function RAP(A::SparseMatrixCSC{Tv,TiA},
+function rap(A::SparseMatrixCSC{Tv,TiA},
              B::SparseMatrixCSC{Tv,TiB},
              C::SparseMatrixCSC{Tv,TiC},
              cache) where {Tv,TiA,TiB,TiC}
-    D,new_cache = RAP(ascsr(C),ascsr(B),ascsr(A),cache)
+    D,new_cache = rap(ascsr(C),ascsr(B),ascsr(A),cache)
     ascsc(D),new_cache
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,TiD},
+function rap!(D::SparseMatrixCSC{Tv,TiD},
               A::SparseMatrixCSC{Tv,TiA},
               B::SparseMatrixCSC{Tv,TiB},
               C::SparseMatrixCSC{Tv,TiC},
               cache) where {Tv,TiD,TiA,TiB,TiC}
-    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache)
+    rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache)
     D
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,TiD},
+function rap!(D::SparseMatrixCSC{Tv,TiD},
               A::SparseMatrixCSC{Tv,TiA},
               B::SparseMatrixCSC{Tv,TiB},
               C::SparseMatrixCSC{Tv,TiC},
               cache::JaggedArray{X,Y} where {X<:Integer, Y<:Integer},
               acc) where {Tv,TiD,TiA,TiB,TiC}
-    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc)
+    rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc)
     D
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,TiD},
+function rap!(D::SparseMatrixCSC{Tv,TiD},
               A::SparseMatrixCSC{Tv,TiA},
               B::SparseMatrixCSC{Tv,TiB},
               C::SparseMatrixCSC{Tv,TiC},
               α::Number,
               β::Number,
               cache) where {Tv,TiD,TiA,TiB,TiC}
-    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache)
+    rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache)
     D
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,TiD},
+function rap!(D::SparseMatrixCSC{Tv,TiD},
               A::SparseMatrixCSC{Tv,TiA},
               B::SparseMatrixCSC{Tv,TiB},
               C::SparseMatrixCSC{Tv,TiC},
@@ -1596,77 +1596,77 @@ function RAP!(D::SparseMatrixCSC{Tv,TiD},
               β::Number,
               cache::JaggedArray{X,Y} where {X <: Integer, Y<:Integer},
               acc) where {Tv,TiD,TiA,TiB,TiC}
-    RAP!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc)
+    rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc)
     D
 end
 
 # PtAP
-function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
+function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
              B::SparseMatrixCSC{Tv,TiB},
              C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC}
-    D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent)))
+    D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)))
     ascsc(D),cache
 end
 
-function RAP(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
+function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
              B::SparseMatrixCSC{Tv,TiB},
              C::SparseMatrixCSC{Tv,TiC},
              cache) where {Tv,TiA,TiB,TiC}
-    D,cache = RAP(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
+    D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
     ascsc(D),cache
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,TiD},
+function rap!(D::SparseMatrixCSC{Tv,TiD},
               A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
               B::SparseMatrixCSC{Tv,TiB},
               C::SparseMatrixCSC{Tv,TiC},
               cache) where {Tv,TiD,TiA,TiB,TiC}
-    RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
+    rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
     D
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,TiD},
+function rap!(D::SparseMatrixCSC{Tv,TiD},
               A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
               B::SparseMatrixCSC{Tv,TiB},
               C::SparseMatrixCSC{Tv,TiC},
               α::Number,
               β::Number,
               cache) where {Tv,TiD,TiA,TiB,TiC}
-    RAP!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache)
+    rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache)
     D
 end
 
 # RARt
-function RAP(A::SparseMatrixCSC{Tv,Ti},
+function rap(A::SparseMatrixCSC{Tv,Ti},
              B::SparseMatrixCSC{Tv,Ti},
              C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti<:Integer}
-    D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A))
+    D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A))
     ascsc(D),new_cache
 end
-function RAP(A::SparseMatrixCSC{Tv,Ti},
+function rap(A::SparseMatrixCSC{Tv,Ti},
              B::SparseMatrixCSC{Tv,Ti},
              C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
              cache) where {Tv,Ti<:Integer}
-    D,new_cache = RAP(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
+    D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
     ascsc(D),new_cache
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,Ti},
+function rap!(D::SparseMatrixCSC{Tv,Ti},
               A::SparseMatrixCSC{Tv,Ti},
               B::SparseMatrixCSC{Tv,Ti},
               C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
               cache) where {Tv,Ti<:Integer}
-    RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
+    rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
     D
 end
 
-function RAP!(D::SparseMatrixCSC{Tv,Ti},
+function rap!(D::SparseMatrixCSC{Tv,Ti},
               A::SparseMatrixCSC{Tv,Ti},
               B::SparseMatrixCSC{Tv,Ti},
               C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
               α::Number,
               β::Number,
               cache) where {Tv,Ti<:Integer}
-    RAP!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache)
+    rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache)
     D
 end
\ No newline at end of file

From 3a03304b56f8cff5e5e85cdd8f79aa608a2a08cd Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Tue, 28 Jan 2025 13:29:20 +0100
Subject: [PATCH 28/34] changed RAP function names to rap for consistency

---
 times.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/times.txt b/times.txt
index 2e3f93ca..4e9217e8 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2149521, max = 0.2149521, avg = 0.2149521), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4041403, max = 0.4041403, avg = 0.4041403), "Phase 1" => (min = 4.0e-7, max = 4.0e-7, avg = 4.0e-7))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2073056, max = 0.2073056, avg = 0.2073056), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4125464, max = 0.4125464, avg = 0.4125464), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6))

From 7d37364b73138a481b9d84cf089ee619a166c12c Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Wed, 29 Jan 2025 11:50:17 +0100
Subject: [PATCH 29/34] Simplified dispatch, included automatic type promotion
 when required.

---
 src/p_sparse_matrix.jl            |   2 +-
 src/sequential_implementations.jl | 778 +++++++++++++++---------------
 test/spmtmm_tests.jl              | 109 ++++-
 times.txt                         |   2 +-
 4 files changed, 484 insertions(+), 407 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index e8948c54..560ba66a 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2273,7 +2273,7 @@ end
 
 ### NEW ###
 function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false))
-    Ac, cache = spmmm(R,A,P)
+    Ac, cache = spmmm(R,A,P;reuse=true)
     if val_parameter(reuse)
         return Ac, cache
     end
diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl
index b628a3d9..3585e38b 100644
--- a/src/sequential_implementations.jl
+++ b/src/sequential_implementations.jl
@@ -1,19 +1,19 @@
-function Base.:*(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TiA,TvB,TiB}
+function Base.:*(A::SparseMatrixCSR,B::SparseMatrixCSR)
     C = ascsc(B)*ascsc(A)
     ascsr(C)
 end
 
-function Base.:*(At::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+function Base.:*(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR)
     C = ascsc(B)*transpose(ascsc(At.parent))
     ascsr(C)
 end
 
-function Base.:*(A::SparseMatrixCSR{Bi,Tv,Ti},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+function Base.:*(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
     C = transpose(ascsc(Bt.parent))*ascsc(A)
     ascsr(C)
 end
 
-function Base.:*(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},Bt::Transpose{Tv, SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+function Base.:*(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB)
     C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent))
     ascsr(C)
 end
@@ -27,10 +27,11 @@ function Base.:/(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti}
     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval))
 end
 
-
 # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros.
-function Base.:+(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+function Base.:+(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
+    Ti = promote_type(TiA,TiB)
+    Tv = promote_type(TvA,TvB)
     p,q = size(A)
     nnz_C_upperbound = nnz(A) + nnz(B)
     IC = Vector{Ti}(undef, p+1)
@@ -87,8 +88,10 @@ function Base.:+(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) wher
 end
 
 # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B.
-function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti},B::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+function Base.:-(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
+    Ti = promote_type(TiA,TiB)
+    Tv = promote_type(TvA,TvB)
     nnz_C_upperbound = nnz(A) + nnz(B)
     p,r = size(A)
     IC = Vector{Ti}(undef, p+1)
@@ -149,8 +152,10 @@ function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
 end
 
 # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros.
-function Base.:+(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+function Base.:+(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB}
     if size(A) != size(B) && throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
+    Ti = promote_type(TiA,TiB)
+    Tv = promote_type(TvA,TvB)
     p,q = size(A)
     nnz_C_upperbound = nnz(A) + nnz(B)
     JC = Vector{Ti}(undef, q+1)
@@ -207,8 +212,10 @@ function Base.:+(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,
 end
 
 # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B.
-function Base.:-(A::SparseMatrixCSC{Tv,Ti},B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+function Base.:-(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
+    Ti = promote_type(TiA,TiB)
+    Tv = promote_type(TvA,TvB)
     p,q = size(A)
     nnz_C_upperbound = nnz(A) + nnz(B)
     JC = Vector{Ti}(undef, q+1)
@@ -269,26 +276,28 @@ function Base.:-(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
 end
 
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-    A::SparseMatrixCSC{Tv,Ti},
-    B::SparseMatrixCSC{Tv,Ti},
-    cache) where {Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            A::SparseMatrixCSC,
+                            B::SparseMatrixCSC,
+                            cache)
     mul!(ascsr(C),ascsr(B),ascsr(A),cache)
+    C
 end
 
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-    A::SparseMatrixCSC{Tv,Ti},
-    B::SparseMatrixCSC{Tv,Ti},
-    α::Number,
-    β::Number,
-    cache) where {Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            A::SparseMatrixCSC,
+                            B::SparseMatrixCSC,
+                            α::Number,
+                            β::Number,
+                            cache)
     mul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache)
+    C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-                            B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                            B::SparseMatrixCSC)
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -380,16 +389,17 @@ function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-    A::SparseMatrixCSC{Tv,Ti},
-    Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            A::SparseMatrixCSC,
+                            Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv)
-    mul!(ascsr(C),transpose(ascsr(B)),ascsr(A))
+    mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A)) # unwrap the Transpose wrapper before ascsr
+    C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
-                            A::SparseMatrixCSR{Bi,Tv,Ti},
-                            B::SparseMatrixCSR{Bi,Tv,Ti},
-                            cache) where {Bi,Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSR,
+                            A::SparseMatrixCSR,
+                            B::SparseMatrixCSR,
+                            cache)
     a,b = size(C)
     p,q = size(A)
     r,s = size(B)
@@ -397,7 +407,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
     if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end
     JC = colvals(C)
     VC = nonzeros(C)
-    VC .= zero(Tv)
+    VC .= zero(eltype(C))
     JA = colvals(A)
     VA = nonzeros(A)
     JB = colvals(B)
@@ -436,19 +446,20 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-    A::SparseMatrixCSC{Tv,Ti},
-    Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-    cache) where {Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            A::SparseMatrixCSC,
+                            Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                            cache)
-    mul!(ascsr(C),transpose(ascsr(B)),ascsr(A),cache)
+    mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache) # unwrap the Transpose wrapper before ascsr
+    C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
-                            A::SparseMatrixCSR{Bi,Tv,Ti},
-                            B::SparseMatrixCSR{Bi,Tv,Ti},
+function LinearAlgebra.mul!(C::SparseMatrixCSR,
+                            A::SparseMatrixCSR,
+                            B::SparseMatrixCSR,
                             α::Number,
                             β::Number,
-                            cache) where {Bi,Tv,Ti}
+                            cache)
     a,b = size(C)
     p,q = size(A)
     r,s = size(B)
@@ -495,29 +506,32 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-                            A::SparseMatrixCSC{Tv,Ti},
-                            Bt::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            A::SparseMatrixCSC,
+                            Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
                             α::Number,
                             β::Number,
-                            cache) where {Tv,Ti}
+                            cache)
     mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache)
+    C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-                            B::SparseMatrixCSC{Tv,Ti},
-                            cache) where {Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                            B::SparseMatrixCSC,
+                            cache)
     mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent)))
+    C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
-                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-                            B::SparseMatrixCSC{Tv,Ti},
+function LinearAlgebra.mul!(C::SparseMatrixCSC,
+                            At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                            B::SparseMatrixCSC,
                             α::Number,
                             β::Number,
-                            cache) where {Tv,Ti}
+                            cache)
-    mul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β)
+    mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent)),α,β) # right operand is the parameter B; cache is not forwarded
+    C
 end
 
 # Workaround to supply in-place mul! with auxiliary array, as these are not returned by multiply function exported by SparseArrays
@@ -542,10 +556,10 @@ function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
     construct_spmtm_cache(ascsr(A))
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
-                            At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
-                            B::SparseMatrixCSR{Bi,Tv,Ti},
-                            cache) where {Bi,Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSR,
+                            At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                            B::SparseMatrixCSR,
+                            cache)
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -553,7 +567,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
     if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end
     A = At.parent
     VC = nonzeros(C)
-    VC .= zero(Tv)
+    VC .= zero((eltype(C)))
     JC = colvals(C)
     JA = colvals(A) # When virtually transposed colvals represent rowvals.
     VA = nonzeros(A)
@@ -585,12 +599,12 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
-                            At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
-                            B::SparseMatrixCSR{Bi,Tv,Ti},
+function LinearAlgebra.mul!(C::SparseMatrixCSR,
+                            At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                            B::SparseMatrixCSR,
                             α::Number,
                             β::Number,
-                            cache) where {Bi,Tv,Ti}
+                            cache)
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -630,40 +644,44 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
-                            A::SparseMatrixCSR{Bi,Tv,Ti},
-                            Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}) where {Bi,Tv,Ti}
+function LinearAlgebra.mul!(C::SparseMatrixCSR,
+                            A::SparseMatrixCSR,
+                            Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
     mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A))
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR{Bi,Tv,Ti},
-                            A::SparseMatrixCSR{Bi,Tv,Ti},
-                            Bt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
+function LinearAlgebra.mul!(C::SparseMatrixCSR,
+                            A::SparseMatrixCSR,
+                            Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
                             α::Number,
-                            β::Number) where {Bi,Tv,Ti}
+                            β::Number)
     mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β)
     C
 end
 
 # PtAP variants
-function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi,Tv,Ti}, Pr::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
-    p,q = size(Plt)
+function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}},
+             A::SparseMatrixCSR{Bi,TvA,TiA},
+             P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP}
+    p,q = size(Rt)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
     if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
-    function rap_symbolic_count!(R,A,Pr)
+    function rap_symbolic_count!(R,A,P)
+        Ti = promote_type(TiR,TiA,TiP)
+        Tv = promote_type(TvR,TvA,TvP)
         JR = R.data
         JA = colvals(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
         xbRA = zeros(Ti, r)
         xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm
         max_rR = find_max_row_length(R)
         max_rA = find_max_row_length(A)
-        max_rPr = find_max_row_length(Pr)
+        max_rP = find_max_row_length(P)
 
-        max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR))
+        max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR))
         JRA = Vector{Ti}(undef,max_rC)
         IC = Vector{Ti}(undef,p+1)
         nnz_C = 1
@@ -687,8 +705,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi
             ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         xbC[k] = i
                         ccC += 1
@@ -700,18 +718,18 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi
         end
         JC = Vector{Ti}(undef, nnz_C-1)
         VC = zeros(Tv,nnz_C-1)
-        JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP
+        JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP
         xbRA .= 0
         xbC .= 0
         cache = (xbRA,JRA,xbC,JAP)
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized
     end
-    function rap_symbolic_fill!(C,R,A,Pr,cache)
+    function rap_symbolic_fill!(C,R,A,P,cache)
         (xbRA,JRA,xbC,JAP) = cache
         JC = colvals(C)
         JR = R.data
         JA = colvals(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
         pC = 0
         for i in 1:p
             ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
@@ -731,8 +749,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi
             end
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         pC += 1
                         xbC[k] = i
@@ -742,41 +760,43 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}}, A::SparseMatrixCSR{Bi
             end
         end
         xbC .= 0
-        outer_cache = (xbC,similar(xbC, Tv),JAP)
+        outer_cache = (xbC,similar(xbC, eltype(C)),JAP)
         C, outer_cache # values not yet initialized
     end
-    function _rap(Plt,A,Pr)
-        R = symbolic_halfperm(Plt.parent)
-        C,symbolic_cache = rap_symbolic_count!(R,A,Pr) # precompute nz structure with a symbolic transpose
-        _,outer_cache = rap_symbolic_fill!(C,R,A,Pr,symbolic_cache)
+    function _rap(Rt,A,P)
+        R = symbolic_halfperm(Rt.parent)
+        C,symbolic_cache = rap_symbolic_count!(R,A,P) # precompute nz structure with a symbolic transpose
+        _,outer_cache = rap_symbolic_fill!(C,R,A,P,symbolic_cache)
         Ct = symbolic_halfperm(C)
         symbolic_halfperm!(C,Ct)
-        rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
+        rap!(C,Rt,A,P,outer_cache),(outer_cache...,R)
     end
-    _rap(Plt,A,Pr)
+    _rap(Rt,A,P)
 end
 
-function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
-             A::SparseMatrixCSR{Bi,Tv,Ti},
-             Pr::SparseMatrixCSR{Bi,Tv,Ti},
-             cache) where {Bi,Tv,Ti}
-    p,q = size(Plt)
+function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}},
+             A::SparseMatrixCSR{Bi,TvA,TiA},
+             P::SparseMatrixCSR{Bi,TvP,TiP},
+             cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP}
+    p,q = size(Rt)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
     if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
-    
-    function rap_symbolic_count!(R,A,Pr)
+
+    function rap_symbolic_count(R,A,P)
+        Ti = promote_type(TiR,TiA,TiP)
+        Tv = promote_type(TvR,TvA,TvP)
         JR = R.data
         JA = colvals(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
         xbRA = zeros(Ti, r)
         xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm
         max_rR = find_max_row_length(R)
         max_rA = find_max_row_length(A)
-        max_rPr = find_max_row_length(Pr)
+        max_rP = find_max_row_length(P)
 
-        max_rC = max((max_rR*max_rA*max_rPr),(max_rA*max_rR))
+        max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR))
         JRA = Vector{Ti}(undef,max_rC)
         IC = Vector{Ti}(undef,p+1)
         nnz_C = 1
@@ -800,8 +820,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
             ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         xbC[k] = i
                         ccC += 1
@@ -813,17 +833,17 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
         end
         JC = Vector{Ti}(undef, nnz_C-1)
         VC = zeros(Tv,nnz_C-1)
-        JAP = Vector{Ti}(undef,min(max_rA*max_rPr,s)) # upper bound estimate for length of virtual row of AP
+        JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP
         xbRA .= 0
         xbC .= 0
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized
     end
-    function rap_symbolic_fill!(C,R,A,Pr,cache)
+    function rap_symbolic_fill!(C,R,A,P,cache)
         (xbRA,JRA,xbC,JAP) = cache
         JC = colvals(C)
         JR = R.data
         JA = colvals(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
         pC = 0
         for i in 1:p
             ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
@@ -843,8 +863,8 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
             end
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         pC += 1
                         xbC[k] = i
@@ -854,19 +874,19 @@ function rap(Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
             end
         end
         xbC .= 0
-        C, (xbC,similar(xbC, Tv),JAP) # values not yet initialized
+        C, (xbC,similar(xbC, eltype(C)),JAP) # values not yet initialized
     end
-    function _rap(Plt,A,Pr,old_cache)
+    function _rap(Rt,A,P,old_cache)
         xb,x,JAP,R = old_cache
         old_outer_cache = (xb,x,JAP)
-        C,symbolic_cache = rap_symbolic_count!(R, A, Pr)
-        _,new_outer_cache = rap_symbolic_fill!(C,R, A, Pr, symbolic_cache)
+        C,symbolic_cache = rap_symbolic_count(R, A, P)
+        _,new_outer_cache = rap_symbolic_fill!(C,R, A, P, symbolic_cache)
         Ct = symbolic_halfperm(C)
         symbolic_halfperm!(C,Ct)
         outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? c1 : c2, old_outer_cache,new_outer_cache)
-        rap!(C,Plt,A,Pr,outer_cache),(outer_cache...,R)
+        rap!(C,Rt,A,P,outer_cache),(outer_cache...,R)
     end
-    _rap(Plt,A,Pr,cache)
+    _rap(Rt,A,P,cache)
 end
 
 function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR})
@@ -874,27 +894,27 @@ function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR})
     (xb,x,JAP)
 end
 
-function rap!(C::SparseMatrixCSR{Bi,Tv,Ti}, 
-              Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
-              A::SparseMatrixCSR{Bi,Tv,Ti},
-              Pr::SparseMatrixCSR{Bi,Tv,Ti},
-              cache) where {Bi,Tv,Ti}
+function rap!(C::SparseMatrixCSR, 
+              Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+              A::SparseMatrixCSR,
+              P::SparseMatrixCSR,
+              cache)
     (a,b) = size(C)
-    p,q = size(Plt)
+    p,q = size(Rt)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
     if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
     if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end
-    Pl = Plt.parent
+    R = Rt.parent
     JC = colvals(C)
     VC = nonzeros(C)
-    VC .= zero(Tv)
+    VC .= zero(eltype(C))
 
     JA = colvals(A)
     VA = nonzeros(A)
-    JPr = colvals(Pr)
-    VPr = nonzeros(Pr)
+    JP = colvals(P)
+    VP = nonzeros(P)
     xb, x, JAP = cache
     xb .= 0
     # loop over rows in A
@@ -905,22 +925,22 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti},
             j = JA[jp]
             va = VA[jp]
             # loop over columns "k" in row j of B
-            for kp in nzrange(Pr, j)
-                k = JPr[kp]
+            for kp in nzrange(P, j)
+                k = JP[kp]
                 # since C is constructed rowwise, xb tracks if a column index is present in a new row in C.
                 if xb[k] != i
                     lp += 1
                     JAP[lp] = k
                     xb[k] = i
-                    x[k] = va * VPr[kp]
+                    x[k] = va * VP[kp]
                 else
-                    x[k] += va * VPr[kp]
+                    x[k] += va * VP[kp]
                 end
             end
         end
-        for kp in nzrange(Pl, i)
-            k = colvals(Pl)[kp] # rowvals when transposed conceptually
-            v = nonzeros(Pl)[kp]
+        for kp in nzrange(R, i)
+            k = colvals(R)[kp] # rowvals when transposed conceptually
+            v = nonzeros(R)[kp]
             for jp in nzrange(C,k)
                 j = JC[jp]
                 if xb[j] == i
@@ -932,27 +952,27 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-function rap!(C::SparseMatrixCSR{Bi,Tv,Ti},
-              Plt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}},
-              A::SparseMatrixCSR{Bi,Tv,Ti},
-              Pr::SparseMatrixCSR{Bi,Tv,Ti},
+function rap!(C::SparseMatrixCSR,
+              Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+              A::SparseMatrixCSR,
+              P::SparseMatrixCSR,
               α::Number,
               β::Number,
-              cache) where {Bi,Tv,Ti}
+              cache)
     (a,b) = size(C)
-    p,q = size(Plt)
+    p,q = size(Rt)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end
     if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
     if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end
-    Pl = Plt.parent
+    R = Rt.parent
     JC = colvals(C)
     VC = nonzeros(C)
     JA = colvals(A)
     VA = nonzeros(A)
-    JPr = colvals(Pr)
-    VPr = nonzeros(Pr)
+    JP = colvals(P)
+    VP = nonzeros(P)
     xb, x, JAP = cache
     xb .= 0
     VC .*= β
@@ -964,22 +984,22 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti},
             j = JA[jp]
             va = α*VA[jp]
             # loop over columns "k" in row j of B
-            for kp in nzrange(Pr, j)
-                k = JPr[kp]
+            for kp in nzrange(P, j)
+                k = JP[kp]
                 # since C is constructed rowwise, xb tracks if a column index is present in a new row in C.
                 if xb[k] != i
                     lp += 1
                     JAP[lp] = k
                     xb[k] = i
-                    x[k] = va*VPr[kp]
+                    x[k] = va*VP[kp]
                 else
-                    x[k] += va*VPr[kp]
+                    x[k] += va*VP[kp]
                 end
             end
         end
-        for kp in nzrange(Pl, i)
-            k = colvals(Pl)[kp] # rowvals when transposed conceptually
-            vpl = nonzeros(Pl)[kp]
+        for kp in nzrange(R, i)
+            k = colvals(R)[kp] # rowvals when transposed conceptually
+            vpl = nonzeros(R)[kp]
             for jp in nzrange(C,k)
                 j = JC[jp]
                 if xb[j] == i
@@ -991,37 +1011,41 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,Ti},
     C
 end
 
-# rap variants
-function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
-             A::SparseMatrixCSR{Bi,Tv,TiA},
-             Pr::SparseMatrixCSR{Bi,Tv,TiPr}) where {Bi,Tv,TiPl,TiA,TiPr}
-    p,q = size(Pl)
+# RAP variants
+function rap(R::SparseMatrixCSR{Bi,TvR,TiR},
+             A::SparseMatrixCSR{Bi,TvA,TiA},
+             P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP}
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    function rap_symbolic!(Pl,A,Pr)
-        JPl = colvals(Pl)
+
+    function rap_symbolic!(R,A,P)
+        Ti = promote_type(TiR,TiA,TiP)
+        Tv = promote_type(TvR,TvA,TvP)
+
+        JR = colvals(R)
         JA = colvals(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-        xbRA = zeros(TiA, r)
-        xbC = zeros(TiA, s+1) # this vector will also serve as as colptr array in halfperm
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+        xbRA = zeros(Ti, r)
+        xbC = zeros(Ti, s+1) # this vector will also serve as the colptr array in halfperm
         xRA = similar(xbRA, Tv) # sparse accumulator
         xC = similar(xbC, Tv) # sparse accumulator
-        max_rPl = find_max_row_length(Pl)
+        max_rR = find_max_row_length(R)
         max_rA = find_max_row_length(A)
-        max_rPr = find_max_row_length(Pr)
+        max_rP = find_max_row_length(P)
+        max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR))
 
-        max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl))
-        JRA = Vector{TiA}(undef,max_rC)
-        IC = Vector{TiA}(undef,p+1)
+        JRA = Vector{Ti}(undef,max_rC)
+        IC = Vector{Ti}(undef,p+1)
         nnz_C = 1
         IC[1] = nnz_C
         for i in 1:p
             ccRA = 0
             # loop over columns "j" in row i of A
-            for jp in nzrange(Pl, i)
-                j = JPl[jp]
+            for jp in nzrange(R, i)
+                j = JR[jp]
                 # loop over columns "k" in row j of B
                 for kp in nzrange(A, j)
                     k = JA[kp]
@@ -1036,8 +1060,8 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             ccC = 0
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         xbC[k] = i
                         ccC += 1
@@ -1047,18 +1071,18 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             nnz_C += ccC
             IC[i+1] = nnz_C
         end
-        JC = Vector{TiA}(undef, nnz_C-1)
+        JC = Vector{Ti}(undef, nnz_C-1)
         VC = zeros(Tv,nnz_C-1)
         cache = (xbRA,xRA,JRA,xbC,xC)
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized
     end
-    function rap_numeric!(C,Pl,A,Pr,cache)
-        JPl = colvals(Pl)
-        VPl = nonzeros(Pl)
+    function rap_numeric!(C,R,A,P,cache)
+        JR = colvals(R)
+        VR = nonzeros(R)
         JA = colvals(A)
         VA = nonzeros(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-        VPr = nonzeros(Pr)
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+        VP = nonzeros(P)
         JC = colvals(C)
         VC = nonzeros(C)
         (xbRA,xRA,JRA,xbC,xC) = cache
@@ -1066,9 +1090,9 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
         for i in 1:p
             ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
             # loop over columns "j" in row i of A
-            for jp in nzrange(Pl, i)
-                j = JPl[jp]
-                vpl = VPl[jp]
+            for jp in nzrange(R, i)
+                j = JR[jp]
+                vpl = VR[jp]
                 # loop over columns "k" in row j of B
                 for kp in nzrange(A, j)
                     k = JA[kp]
@@ -1085,15 +1109,15 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             end
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         xbC[k] = i
                         JC[jpC] = k
                         jpC += 1
-                        xC[k] = xRA[j]*VPr[kp]
+                        xC[k] = xRA[j]*VP[kp]
                     else
-                        xC[k] += xRA[j]*VPr[kp]
+                        xC[k] += xRA[j]*VP[kp]
                     end
                 end
             end
@@ -1103,17 +1127,17 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             end
         end
     end
-    function _rap(Pl,A,Pr)
-        C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(Pl,A,Pr)
+    function _rap(R,A,P)
+        C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(R,A,P)
         xbRA .= 0
         xbC .= 0
         cache = (xbRA,xRA,JRA,xbC,xC)
-        rap_numeric!(C,Pl,A,Pr,cache)
+        rap_numeric!(C,R,A,P,cache)
         Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C)
         halfperm!(C,Ct)
         C,cache
     end
-    _rap(Pl,A,Pr)
+    _rap(R,A,P)
 end
 
 # Reuses internal arrays of A!!!
@@ -1134,28 +1158,31 @@ function reduce_spmtmm_cache(cache,::Type{M}  where M <: SparseMatrixCSC)
     reduce_spmmmt_cache(cache,SparseMatrixCSR)
 end
 
-function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
-             A::SparseMatrixCSR{Bi,Tv,TiA},
-             Pr::SparseMatrixCSR{Bi,Tv,TiPr},
-             cache) where {Bi,Tv,TiPl,TiA,TiPr}
-    p,q = size(Pl)
+function rap(R::SparseMatrixCSR{Bi,TvR,TiR},
+             A::SparseMatrixCSR{Bi,TvA,TiA},
+             P::SparseMatrixCSR{Bi,TvP,TiP},
+             cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP}
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    function rap_symbolic!(Pl,A,Pr,cache)
-        JPl = colvals(Pl)
+
+    function rap_symbolic!(R,A,P,cache)
+        Ti = promote_type(TiR,TiA,TiP)
+        Tv = promote_type(TvR,TvA,TvP)
+        JR = colvals(R)
         JA = colvals(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
         (xbRA,_,JRA,xbC,_) = cache
-        IC = Vector{TiA}(undef,p+1)
+        IC = Vector{Ti}(undef,p+1)
         nnz_C = 1
         IC[1] = nnz_C
         for i in 1:p
             ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
             # loop over columns "j" in row i of A
-            for jp in nzrange(Pl, i)
-                j = JPl[jp]
+            for jp in nzrange(R, i)
+                j = JR[jp]
                 # loop over columns "k" in row j of B
                 for kp in nzrange(A, j)
                     k = JA[kp]
@@ -1170,8 +1197,8 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         xbC[k] = i
                         ccC += 1
@@ -1181,17 +1208,17 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             nnz_C += ccC
             IC[i+1] = nnz_C
         end
-        JC = Vector{TiA}(undef, nnz_C-1)
+        JC = Vector{Ti}(undef, nnz_C-1)
         VC = zeros(Tv,nnz_C-1)
         SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized
     end
-    function rap_numeric!(C,Pl,A,Pr,cache)
-        JPl = colvals(Pl)
-        VPl = nonzeros(Pl)
+    function rap_numeric!(C,R,A,P,cache)
+        JR = colvals(R)
+        VR = nonzeros(R)
         JA = colvals(A)
         VA = nonzeros(A)
-        JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-        VPr = nonzeros(Pr)
+        JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+        VP = nonzeros(P)
         JC = colvals(C)
         VC = nonzeros(C)
         (xbRA,xRA,JRA,xbC,xC) = cache
@@ -1199,9 +1226,9 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
         for i in 1:p
             ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
             # loop over columns "j" in row i of A
-            for jp in nzrange(Pl, i)
-                j = JPl[jp]
-                vpl = VPl[jp]
+            for jp in nzrange(R, i)
+                j = JR[jp]
+                vpl = VR[jp]
                 # loop over columns "k" in row j of B
                 for kp in nzrange(A, j)
                     k = JA[kp]
@@ -1218,15 +1245,15 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             end
             for jp in 1:ccRA
                 j = JRA[jp]
-                for kp in nzrange(Pr,j)
-                    k = JPr[kp]
+                for kp in nzrange(P,j)
+                    k = JP[kp]
                     if xbC[k] != i
                         xbC[k] = i
                         JC[jpC] = k
                         jpC += 1
-                        xC[k] = xRA[j]*VPr[kp]
+                        xC[k] = xRA[j]*VP[kp]
                     else
-                        xC[k] += xRA[j]*VPr[kp]
+                        xC[k] += xRA[j]*VP[kp]
                     end
                 end
             end
@@ -1236,12 +1263,12 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
             end
         end
     end
-    function _rap(Pl,A,Pr,old_cache)
-        max_rPl = find_max_row_length(Pl)
+    function _rap(R,A,P,old_cache)
+        max_rR = find_max_row_length(R)
         max_rA = find_max_row_length(A)
-        max_rPr = find_max_row_length(Pr)
+        max_rP = find_max_row_length(P)
         (xbRA,xRA,JRA,xbC,xC) = old_cache
-        max_rC = max((max_rPl*max_rA*max_rPr),(max_rA*max_rPl))
+        max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR))
         JRA2 = max_rC > length(JRA) ? similar(JRA,max_rC) : JRA
         if r > length(xbRA)
             xbRA2 = similar(xbRA,r)
@@ -1254,15 +1281,15 @@ function rap(Pl::SparseMatrixCSR{Bi,Tv,TiPl},
         new_cache = (xbRA2,xRA2,JRA2,xbC,xC)
         xbRA2 .= 0
         xbC .= 0
-        C = rap_symbolic!(Pl,A,Pr,new_cache)
+        C = rap_symbolic!(R,A,P,new_cache)
         xbRA2 .= 0
         xbC .= 0
-        rap_numeric!(C,Pl,A,Pr,new_cache)
+        rap_numeric!(C,R,A,P,new_cache)
         Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C)
         halfperm!(C,Ct)
         C,new_cache
     end
-    _rap(Pl,A,Pr,cache)
+    _rap(R,A,P,cache)
 end
 
 function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR)
@@ -1274,34 +1301,34 @@ function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC)
     reduce_spmtmm_cache(cache,SparseMatrixCSR)
 end
 
-function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
-              Pl::SparseMatrixCSR{Bi,Tv,TiPl},
-              A::SparseMatrixCSR{Bi,Tv,TiA},
-              Pr::SparseMatrixCSR{Bi,Tv,TiPr},
-              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
-    p,q = size(Pl)
+function rap!(C::SparseMatrixCSR,
+              R::SparseMatrixCSR,
+              A::SparseMatrixCSR,
+              P::SparseMatrixCSR,
+              cache)
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    JPl = colvals(Pl)
-    VPl = nonzeros(Pl)
+    JR = colvals(R)
+    VR = nonzeros(R)
     JA = colvals(A)
     VA = nonzeros(A)
-    JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-    VPr = nonzeros(Pr)
+    JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+    VP = nonzeros(P)
     JC = colvals(C)
     VC = nonzeros(C)
-    VC .= zero(Tv)
+    VC .= zero(eltype(C))
     (xbRA,xRA,JRA,xbC,xC) = cache
     xbRA .= 0
     xbC .= 0
     for i in 1:p
         lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
         # loop over columns "j" in row i of A
-        for jp in nzrange(Pl, i)
-            j = JPl[jp]
-            vpl = VPl[jp]
+        for jp in nzrange(R, i)
+            j = JR[jp]
+            vpl = VR[jp]
 
             # loop over columns "k" in row j of B
             for kp in nzrange(A, j)
@@ -1321,13 +1348,13 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
         for jp in 1:lp
             j = JRA[jp]
             vra = xRA[j]
-            for kp in nzrange(Pr,j)
-                k = JPr[kp]
+            for kp in nzrange(P,j)
+                k = JP[kp]
                 if xbC[k] != i
                     xbC[k] = i
-                    xC[k] = vra*VPr[kp]
+                    xC[k] = vra*VP[kp]
                 else
-                    xC[k] += vra*VPr[kp]
+                    xC[k] += vra*VP[kp]
                 end
             end
         end
@@ -1341,37 +1368,37 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
     C
 end
 
-function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
-              Pl::SparseMatrixCSR{Bi,Tv,TiPl},
-              A::SparseMatrixCSR{Bi,Tv,TiA},
-              Pr::SparseMatrixCSR{Bi,Tv,TiPr},
+function rap!(C::SparseMatrixCSR,
+              R::SparseMatrixCSR,
+              A::SparseMatrixCSR,
+              P::SparseMatrixCSR,
               α::Number,
               β::Number,
-              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
-    p,q = size(Pl)
+              cache)
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Pr)
+    n,s = size(P)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    JPl = colvals(Pl)
-    VPl = nonzeros(Pl)
+    JR = colvals(R)
+    VR = nonzeros(R)
     JA = colvals(A)
     VA = nonzeros(A)
-    JPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-    VPr = nonzeros(Pr)
+    JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+    VP = nonzeros(P)
     JC = colvals(C)
     VC = nonzeros(C)
     VC .*= β
     (xbRA,xRA,JRA,xbC,xC) = cache
     xbRA .= 0
     xbC .= 0
-    xC .= zero(Tv)
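+    # xC is not pre-zeroed here (formerly `xC .= zero(Tv)`); below, xC[k] is assigned on its first touch in each row before it is accumulated into.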
     for i in 1:p
         lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows
         # loop over columns "j" in row i of A
-        for jp in nzrange(Pl, i)
-            j = JPl[jp]
-            vpl = VPl[jp]
+        for jp in nzrange(R, i)
+            j = JR[jp]
+            vpl = VR[jp]
             # loop over columns "k" in row j of B
             for kp in nzrange(A, j)
                 k = JA[kp]
@@ -1388,13 +1415,13 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
         end
         for jp in 1:lp
             j = JRA[jp]
-            for kp in nzrange(Pr,j)
-                k = JPr[kp]
+            for kp in nzrange(P,j)
+                k = JP[kp]
                 if xbC[k] != i
                     xbC[k] = i
-                    xC[k] = xRA[j]*VPr[kp]
+                    xC[k] = xRA[j]*VP[kp]
                 else
-                    xC[k] += xRA[j]*VPr[kp]
+                    xC[k] += xRA[j]*VP[kp]
                 end
             end
         end
@@ -1409,45 +1436,46 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
 end
 
 # RARt variants
-function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA},
-             A::SparseMatrixCSR{Bi,Tv,TiB},
-             Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}}) where {Bi,Tv,TiA,TiB,TiC}
-    p,q = size(Pl)
+function rap(R::SparseMatrixCSR,
+             A::SparseMatrixCSR,
+             Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Prt)
+    n,s = size(Pt)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end
-    rap(Pl,A,copy(Prt))
+    rap(R,A,copy(Pt))
 end
 
-function rap(Pl::SparseMatrixCSR{Bi,Tv,TiA},
-             A::SparseMatrixCSR{Bi,Tv,TiB},
-             Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiC}},cache) where {Bi,Tv,TiA,TiB,TiC}
-    p,q = size(Pl)
+function rap(R::SparseMatrixCSR,
+             A::SparseMatrixCSR,
+             Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+             cache) 
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Prt)
+    n,s = size(Pt)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end
-    rap(Pl,A,copy(Prt),cache)
+    rap(R,A,copy(Pt),cache)
 end
 
-function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
-              Pl::SparseMatrixCSR{Bi,Tv,TiPl}, 
-              A::SparseMatrixCSR{Bi,Tv,TiA}, 
-              Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}},
-              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
-    p,q = size(Pl)
+function rap!(C::SparseMatrixCSR,
+              R::SparseMatrixCSR, 
+              A::SparseMatrixCSR, 
+              Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+              cache)
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Prt)
+    n,s = size(Pt)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    Pr = Prt.parent
-    JPl = colvals(Pl)
-    VPl = nonzeros(Pl)
+    P = Pt.parent
+    JR = colvals(R)
+    VR = nonzeros(R)
     JA = colvals(A)
     VA = nonzeros(A)
-    IPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-    VPr = nonzeros(Pr)
+    IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+    VP = nonzeros(P)
     JC = colvals(C)
     VC = nonzeros(C)
     # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed
@@ -1455,9 +1483,9 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
     xb .= 0
     for i in 1:p
         # loop over columns "j" in row i of A
-        for jp in nzrange(Pl, i)
-            j = JPl[jp]
-            vpl = VPl[jp]
+        for jp in nzrange(R, i)
+            j = JR[jp]
+            vpl = VR[jp]
             # loop over columns "k" in row j of B
             for kp in nzrange(A, j)
                 k = JA[kp]
@@ -1470,40 +1498,40 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
                 end
             end
         end
-        for jpPr in nzrange(C,i)
-            jPr = JC[jpPr]
-            v = Tv(0)
-            for ip in nzrange(Pr,jPr)
-                iPr = IPr[ip]
-                if xb[iPr] == i
-                    v += x[iPr]*VPr[ip]
+        for jpP in nzrange(C,i)
+            jP = JC[jpP]
+            v = zero(eltype(C))
+            for ip in nzrange(P,jP)
+                iP = IP[ip]
+                if xb[iP] == i
+                    v += x[iP]*VP[ip]
                 end
             end
-            VC[jpPr] = v
+            VC[jpP] = v
         end
     end
     C
 end
 
-function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
-              Pl::SparseMatrixCSR{Bi,Tv,TiPl},
-              A::SparseMatrixCSR{Bi,Tv,TiA},
-              Prt::Transpose{Tv,SparseMatrixCSR{Bi,Tv,TiPr}},
+function rap!(C::SparseMatrixCSR,
+              R::SparseMatrixCSR,
+              A::SparseMatrixCSR,
+              Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
               α::Number,
               β::Number,
-              cache) where {Bi,Tv,TiC,TiPl,TiA,TiPr}
-    p,q = size(Pl)
+              cache)
+    p,q = size(R)
     m,r = size(A)
-    n,s = size(Prt)
+    n,s = size(Pt)
     if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end
     if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end
-    Pr = Prt.parent
-    JPl = colvals(Pl)
-    VPl = nonzeros(Pl)
+    P = Pt.parent
+    JR = colvals(R)
+    VR = nonzeros(R)
     JA = colvals(A)
     VA = nonzeros(A)
-    IPr = colvals(Pr) # colvals can be interpreted as rowvals when Pr is virtually transposed.
-    VPr = nonzeros(Pr)
+    IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed.
+    VP = nonzeros(P)
     JC = colvals(C)
     VC = nonzeros(C)
     VC .*= β
@@ -1512,9 +1540,9 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
     xb .= 0
     for i in 1:p
         # loop over columns "j" in row i of A
-        for jp in nzrange(Pl, i)
-            j = JPl[jp]
-            vpl = VPl[jp]
+        for jp in nzrange(R, i)
+            j = JR[jp]
+            vpl = VR[jp]
             # loop over columns "k" in row j of B
             for kp in nzrange(A, j)
                 k = JA[kp]
@@ -1527,146 +1555,124 @@ function rap!(C::SparseMatrixCSR{Bi,Tv,TiC},
                 end
             end
         end
-        for jpPr in nzrange(C,i)
-            jPr = JC[jpPr]
-            v = Tv(0)
-            for ip in nzrange(Pr,jPr)
-                iPr = IPr[ip]
-                if xb[iPr] == i
-                    v += x[iPr]*VPr[ip]
+        for jpP in nzrange(C,i)
+            jP = JC[jpP]
+            v = zero(eltype(C))
+            for ip in nzrange(P,jP)
+                iP = IP[ip]
+                if xb[iP] == i
+                    v += x[iP]*VP[ip]
                 end
             end
-            VC[jpPr] += α*v
+            VC[jpP] += α*v
         end
     end
     C
 end
 
 ### CSC in terms of CSR
-function rap(A::SparseMatrixCSC{Tv,TiA},
-             B::SparseMatrixCSC{Tv,TiB},
-             C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC}
+function rap(A::SparseMatrixCSC,
+             B::SparseMatrixCSC,
+             C::SparseMatrixCSC)
     D,cache = rap(ascsr(C),ascsr(B),ascsr(A))
     ascsc(D),cache
 end
 
-function rap(A::SparseMatrixCSC{Tv,TiA},
-             B::SparseMatrixCSC{Tv,TiB},
-             C::SparseMatrixCSC{Tv,TiC},
-             cache) where {Tv,TiA,TiB,TiC}
+function rap(A::SparseMatrixCSC,
+             B::SparseMatrixCSC,
+             C::SparseMatrixCSC,
+             cache)
     D,new_cache = rap(ascsr(C),ascsr(B),ascsr(A),cache)
     ascsc(D),new_cache
 end
 
-function rap!(D::SparseMatrixCSC{Tv,TiD},
-              A::SparseMatrixCSC{Tv,TiA},
-              B::SparseMatrixCSC{Tv,TiB},
-              C::SparseMatrixCSC{Tv,TiC},
-              cache) where {Tv,TiD,TiA,TiB,TiC}
+function rap!(D::SparseMatrixCSC,
+              A::SparseMatrixCSC,
+              B::SparseMatrixCSC,
+              C::SparseMatrixCSC,
+              cache)
     rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache)
     D
 end
 
-function rap!(D::SparseMatrixCSC{Tv,TiD},
-              A::SparseMatrixCSC{Tv,TiA},
-              B::SparseMatrixCSC{Tv,TiB},
-              C::SparseMatrixCSC{Tv,TiC},
-              cache::JaggedArray{X,Y} where {X<:Integer, Y<:Integer},
-              acc) where {Tv,TiD,TiA,TiB,TiC}
-    rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache,acc)
-    D
-end
-
-function rap!(D::SparseMatrixCSC{Tv,TiD},
-              A::SparseMatrixCSC{Tv,TiA},
-              B::SparseMatrixCSC{Tv,TiB},
-              C::SparseMatrixCSC{Tv,TiC},
+function rap!(D::SparseMatrixCSC,
+              A::SparseMatrixCSC,
+              B::SparseMatrixCSC,
+              C::SparseMatrixCSC,
               α::Number,
               β::Number,
-              cache) where {Tv,TiD,TiA,TiB,TiC}
+              cache)
     rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache)
     D
 end
 
-function rap!(D::SparseMatrixCSC{Tv,TiD},
-              A::SparseMatrixCSC{Tv,TiA},
-              B::SparseMatrixCSC{Tv,TiB},
-              C::SparseMatrixCSC{Tv,TiC},
-              α::Number,
-              β::Number,
-              cache::JaggedArray{X,Y} where {X <: Integer, Y<:Integer},
-              acc) where {Tv,TiD,TiA,TiB,TiC}
-    rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache,acc)
-    D
-end
-
 # PtAP
-function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
-             B::SparseMatrixCSC{Tv,TiB},
-             C::SparseMatrixCSC{Tv,TiC}) where {Tv,TiA,TiB,TiC}
+function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+             B::SparseMatrixCSC,
+             C::SparseMatrixCSC)
     D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)))
     ascsc(D),cache
 end
 
-function rap(A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
-             B::SparseMatrixCSC{Tv,TiB},
-             C::SparseMatrixCSC{Tv,TiC},
-             cache) where {Tv,TiA,TiB,TiC}
+function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+             B::SparseMatrixCSC,
+             C::SparseMatrixCSC,
+             cache)
     D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
     ascsc(D),cache
 end
 
-function rap!(D::SparseMatrixCSC{Tv,TiD},
-              A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
-              B::SparseMatrixCSC{Tv,TiB},
-              C::SparseMatrixCSC{Tv,TiC},
-              cache) where {Tv,TiD,TiA,TiB,TiC}
+function rap!(D::SparseMatrixCSC,
+              A::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+              B::SparseMatrixCSC,
+              C::SparseMatrixCSC,
+              cache)
     rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache)
     D
 end
 
-function rap!(D::SparseMatrixCSC{Tv,TiD},
-              A::Transpose{Tv,SparseMatrixCSC{Tv,TiA}},
-              B::SparseMatrixCSC{Tv,TiB},
-              C::SparseMatrixCSC{Tv,TiC},
+function rap!(D::SparseMatrixCSC,
+              A::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+              B::SparseMatrixCSC,
+              C::SparseMatrixCSC,
               α::Number,
               β::Number,
-              cache) where {Tv,TiD,TiA,TiB,TiC}
+              cache)
     rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache)
     D
 end
 
 # RARt
-function rap(A::SparseMatrixCSC{Tv,Ti},
-             B::SparseMatrixCSC{Tv,Ti},
-             C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti<:Integer}
+function rap(A::SparseMatrixCSC,
+             B::SparseMatrixCSC,
+             C::Transpose{Tv,<:SparseMatrixCSC} where Tv)
     D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A))
     ascsc(D),new_cache
 end
-function rap(A::SparseMatrixCSC{Tv,Ti},
-             B::SparseMatrixCSC{Tv,Ti},
-             C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-             cache) where {Tv,Ti<:Integer}
+function rap(A::SparseMatrixCSC,
+             B::SparseMatrixCSC,
+             C::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+             cache)
     D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
     ascsc(D),new_cache
 end
 
-function rap!(D::SparseMatrixCSC{Tv,Ti},
-              A::SparseMatrixCSC{Tv,Ti},
-              B::SparseMatrixCSC{Tv,Ti},
-              C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-              cache) where {Tv,Ti<:Integer}
+function rap!(D::SparseMatrixCSC,
+              A::SparseMatrixCSC,
+              B::SparseMatrixCSC,
+              C::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+              cache)
     rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache)
     D
 end
 
-function rap!(D::SparseMatrixCSC{Tv,Ti},
-              A::SparseMatrixCSC{Tv,Ti},
-              B::SparseMatrixCSC{Tv,Ti},
-              C::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+function rap!(D::SparseMatrixCSC,
+              A::SparseMatrixCSC,
+              B::SparseMatrixCSC,
+              C::Transpose{Tv,<:SparseMatrixCSC} where Tv,
               α::Number,
               β::Number,
-              cache) where {Tv,Ti<:Integer}
+              cache)
     rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache)
     D
 end
\ No newline at end of file
diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl
index d8d4e658..6d8ab9b0 100644
--- a/test/spmtmm_tests.jl
+++ b/test/spmtmm_tests.jl
@@ -3,6 +3,7 @@ using SparseMatricesCSR
 using PartitionedArrays
 using LinearAlgebra
 using Test
+using InteractiveUtils
 
 function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...)
     if size(A) != size(B) && return false; end
@@ -132,28 +133,98 @@ function parallel_tests(pA,pB,sparse_func)
     end
 end
 
+function parallel_time(pA,pB,sparse_func)
+    A = centralize(sparse_func,pA)
+    B = centralize(sparse_func,pB)
+    # explicit parallel transpose
+    pBt = explicit_transpose(pB) |> fetch
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    AB0 = A*B
+    C0 = transpose(B)*AB0
+    # test basic sequential csr implementations to default csc sequential implementations.
+    pAB,cacheAB = spmm(pA,pB,reuse=true)
+    print("spmm:\t")
+    @time spmm(pA,pB,reuse=true)
+    
+    # pB will be transposed internally
+    pC,cacheC = spmtm(pB,pAB,reuse=true)
+    print("spmtm:\t")
+    @time spmtm(pB,pAB,reuse=true)
+    spmm!(pAB,pA,pB,cacheAB)
+    print("spmm!:\t")
+    @time spmm!(pAB,pA,pB,cacheAB)
+    spmtm!(pC,pB,pAB,cacheC)
+    print("spmtm!:\t")
+    @time spmtm!(pC,pB,pAB,cacheC)
+    # pC,cacheC = spmtmm(pA,pB)
+    pC,cacheC = spmtmm(pB,pA,pB,reuse=true)
+    print("spmtmm:\t")
+    # @time spmtmm(pA,pB)
+    @time spmtmm(pB,pA,pB,reuse=true)
+    # spmtmm!(pC,pA,pB,cacheC)
+    spmtmm!(pC,pB,pA,pB,cacheC)
+    print("spmtmm!:")
+    # @time spmtmm!(pC,pA,pB,cacheC)
+    @time spmtmm!(pC,pB,pA,pB,cacheC)
+    pC,cacheC = spmm(pBt,pAB,reuse=true)
+    print("spmm:\t")
+    @time spmm(pBt,pAB,reuse=true)
+    spmm!(pC,pBt,pAB,cacheC)
+    print("spmm!:\t")
+    @time spmm!(pC,pBt,pAB,cacheC)
+
+    # pB will be transposed internally
+    pC,cacheC = spmmm(pBt,pA,pB,reuse=true)
+    print("spmmm: ")
+    @time spmmm(pBt,pA,pB,reuse=true)
+    spmmm!(pC,pBt,pA,pB,cacheC)
+    print("spmmm!:")
+    @time spmmm!(pC,pBt,pA,pB,cacheC)
+
+    # @code_warntype spmmm!(pC,pBt,pA,pB,cacheC)
+    print("Local SpMM:\t")
+    C = A*B
+    @time C = A*B
+    X,cache = rap(Bt,A,B)
+    print("RAP:\t")
+    @time rap(Bt,A,B)
+    rap!(X,Bt,A,B,cache)
+    print("RAP!:\t")
+    @time rap!(X,Bt,A,B,cache)
+end
+
+function Base.display(A::SparseMatrixCSR)
+    display(halfperm(A) |> PartitionedArrays.ascsc)
+end
+
 function spmtmm_tests(distribute)
     nodes_per_dir = (5,5,5)
     parts_per_dir = (1,2,2)
     np = prod(parts_per_dir)
     ranks = distribute(LinearIndices((np,)))
-    Ti = Int32
-    pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch
-    pB = pA
-    parallel_tests(pA,pB,sparsecsr)
-
-    # Testing with a real prolongator requires PartitionedSolvers
-    # T = eltype(typeof(own_own_values(pA).items))
-    # pB = prolongator(T,pA)
-    # parallel_tests(pA,pB,sparsecsr)
-    
-    #### CSC ####
-    pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = Ti)...) |> fetch
-    pB = pA
-    parallel_tests(pA,pB,sparse)
-    
-    # Testing with a real prolongator requires PartitionedSolvers
-    # T = eltype(typeof(own_own_values(pA).items))
-    # pB = prolongator(T,pA)
-    # parallel_tests(pA,pB,sparse)
+    for (TiA,TiB,TvA,TvB) in [(Int32,Int32,Float32,Float32),(Int32,Int64,Float32,Float32),(Int32,Int32,Float32,Float64),(Int32,Int64,Float32,Float64),(Int32,Int64,Int64,Int64),(Int32,Int64,Int64,Float32),(Int32,Int64,Float64,Int32)]
+        pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch
+        pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch
+
+        parallel_tests(pA,pB,sparsecsr)
+        # Testing with a real prolongator requires PartitionedSolvers
+        # T = eltype(typeof(own_own_values(pA).items))
+        # pB = prolongator(T,pA)
+        # parallel_tests(pA,pB,sparsecsr)
+        
+        #### CSC ####
+        pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch
+        pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) |> fetch
+
+        parallel_tests(pA,pB,sparse)
+        # Testing with a real prolongator requires PartitionedSolvers
+        # T = eltype(typeof(own_own_values(pA).items))
+        # pB = prolongator(T,pA)
+        # parallel_tests(pA,pB,sparse)
+        # break
+    end
 end
\ No newline at end of file
diff --git a/times.txt b/times.txt
index 4e9217e8..835d3320 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2073056, max = 0.2073056, avg = 0.2073056), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4125464, max = 0.4125464, avg = 0.4125464), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2139673, max = 0.2139673, avg = 0.2139673), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4185178, max = 0.4185178, avg = 0.4185178), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6))

From 1e48b64f1f54aed68482355e035cacab5e00d0a5 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Wed, 29 Jan 2025 12:36:01 +0100
Subject: [PATCH 30/34] fixed exported function name (rap vs RAP).

---
 src/PartitionedArrays.jl     |  4 ++--
 test/debug_array/runtests.jl | 20 ++++++++++----------
 test/spmtmm_tests.jl         |  5 ++---
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl
index 2d250e0a..eaa88a17 100644
--- a/src/PartitionedArrays.jl
+++ b/src/PartitionedArrays.jl
@@ -211,8 +211,8 @@ export near_nullspace_linear_elasticity
 export prolongator
 include("gallery.jl")
 
-export RAP
-export RAP!
+export rap
+export rap!
 export -,+
 include("sequential_implementations.jl")
 
diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl
index a175b722..88abfdeb 100644
--- a/test/debug_array/runtests.jl
+++ b/test/debug_array/runtests.jl
@@ -3,25 +3,25 @@ module DebugArrayRunTests
 using Test
 using PartitionedArrays
 
-@testset "debug_array" begin include("debug_array_tests.jl") end
+# @testset "debug_array" begin include("debug_array_tests.jl") end
 
-@testset "primitives" begin include("primitives_tests.jl")  end
+# @testset "primitives" begin include("primitives_tests.jl")  end
 
-@testset "p_range" begin include("p_range_tests.jl")  end
+# @testset "p_range" begin include("p_range_tests.jl")  end
 
-@testset "p_vector" begin include("p_vector_tests.jl")  end
+# @testset "p_vector" begin include("p_vector_tests.jl")  end
 
-@testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl")  end
+# @testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl")  end
 
-@testset "block_arrays" begin include("block_arrays_tests.jl")  end
+# @testset "block_arrays" begin include("block_arrays_tests.jl")  end
 
-@testset "gallery" begin include("gallery_tests.jl")  end
+# @testset "gallery" begin include("gallery_tests.jl")  end
 
-@testset "p_timer" begin include("p_timer_tests.jl")  end
+# @testset "p_timer" begin include("p_timer_tests.jl")  end
 
-@testset "fdm_example" begin include("fdm_example.jl")  end
+# @testset "fdm_example" begin include("fdm_example.jl")  end
 
-@testset "fem_example" begin include("fem_example.jl")  end
+# @testset "fem_example" begin include("fem_example.jl")  end
 
 @testset "spmtmm_tests" begin include("spmtmm_tests.jl")  end
 
diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl
index 6d8ab9b0..1ceceeea 100644
--- a/test/spmtmm_tests.jl
+++ b/test/spmtmm_tests.jl
@@ -3,7 +3,6 @@ using SparseMatricesCSR
 using PartitionedArrays
 using LinearAlgebra
 using Test
-using InteractiveUtils
 
 function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...)
     if size(A) != size(B) && return false; end
@@ -209,8 +208,8 @@ function spmtmm_tests(distribute)
     for (TiA,TiB,TvA,TvB) in [(Int32,Int32,Float32,Float32),(Int32,Int64,Float32,Float32),(Int32,Int32,Float32,Float64),(Int32,Int64,Float32,Float64),(Int32,Int64,Int64,Int64),(Int32,Int64,Int64,Float32),(Int32,Int64,Float64,Int32)]
         pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch
         pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch
-
         parallel_tests(pA,pB,sparsecsr)
+        
         # Testing with a real prolongator requires PartitionedSolvers
         # T = eltype(typeof(own_own_values(pA).items))
         # pB = prolongator(T,pA)
@@ -219,8 +218,8 @@ function spmtmm_tests(distribute)
         #### CSC ####
         pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch
         pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) |> fetch
-
         parallel_tests(pA,pB,sparse)
+        
         # Testing with a real prolongator requires PartitionedSolvers
         # T = eltype(typeof(own_own_values(pA).items))
         # pB = prolongator(T,pA)

From 33840d8b0b5f87b44451033b291d1795ed23d5db Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Wed, 29 Jan 2025 13:29:12 +0100
Subject: [PATCH 31/34] workaround for absence of 'reuse' kwargs in local
 sparse matrix multiplication algorithms

---
 src/sequential_implementations.jl |  11 +++
 test/mpi_array/runtests.jl        |  20 ++---
 test/spmtmm_tests.jl              | 136 +++++++++++++++---------------
 3 files changed, 89 insertions(+), 78 deletions(-)

diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl
index 3585e38b..49540046 100644
--- a/src/sequential_implementations.jl
+++ b/src/sequential_implementations.jl
@@ -660,6 +660,17 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR,
     C
 end
 
+function rap(A::Union{Transpose{TA,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TA,
+             B::M where M<:AbstractSparseMatrix,
+             C::Union{Transpose{TC,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TC
+             ;reuse=Val(true))
+    D,cache = rap(A,B,C)
+    if val_parameter(reuse)
+        return D,cache
+    end
+    D
+end
+
 # PtAP variants
 function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}},
              A::SparseMatrixCSR{Bi,TvA,TiA},
diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl
index fc6f0aee..ffdc1f1e 100644
--- a/test/mpi_array/runtests.jl
+++ b/test/mpi_array/runtests.jl
@@ -3,16 +3,16 @@ module MPIArrayRunTests
 using Test
 using PartitionedArrays
 
-# @testset "mpi_array" begin include("mpi_array_tests.jl") end
-# @testset "primitives" begin include("primitives_tests.jl")  end
-# @testset "p_range_tests" begin include("p_range_tests.jl")  end
-# @testset "p_vector_tests" begin include("p_vector_tests.jl")  end
-# @testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl")  end
-# @testset "gallery" begin include("gallery_tests.jl")  end
-# @testset "block_arrays" begin include("block_arrays_tests.jl")  end
-# @testset "p_timer_tests" begin include("p_timer_tests.jl")  end
-# @testset "fdm_example" begin include("fdm_example.jl")  end
-# @testset "fem_example" begin include("fem_example.jl")  end
+@testset "mpi_array" begin include("mpi_array_tests.jl") end
+@testset "primitives" begin include("primitives_tests.jl")  end
+@testset "p_range_tests" begin include("p_range_tests.jl")  end
+@testset "p_vector_tests" begin include("p_vector_tests.jl")  end
+@testset "p_sparse_matrix_tests" begin include("p_sparse_matrix_tests.jl")  end
+@testset "gallery" begin include("gallery_tests.jl")  end
+@testset "block_arrays" begin include("block_arrays_tests.jl")  end
+@testset "p_timer_tests" begin include("p_timer_tests.jl")  end
+@testset "fdm_example" begin include("fdm_example.jl")  end
+@testset "fem_example" begin include("fem_example.jl")  end
 @testset "spmtmm_tests" begin include("spmtmm_tests.jl")  end
 
 end #module
diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl
index 1ceceeea..aacfdcad 100644
--- a/test/spmtmm_tests.jl
+++ b/test/spmtmm_tests.jl
@@ -132,73 +132,73 @@ function parallel_tests(pA,pB,sparse_func)
     end
 end
 
-function parallel_time(pA,pB,sparse_func)
-    A = centralize(sparse_func,pA)
-    B = centralize(sparse_func,pB)
-    # explicit parallel transpose
-    pBt = explicit_transpose(pB) |> fetch
-    Bt = centralize(sparse_func,pBt)
-    @test Bt == copy(transpose(B))
-    hp_B = halfperm(B)
-    @test Bt == hp_B
-
-    AB0 = A*B
-    C0 = transpose(B)*AB0
-    # test basic sequential csr implementations to default csc sequential implementations.
-    pAB,cacheAB = spmm(pA,pB,reuse=true)
-    print("spmm:\t")
-    @time spmm(pA,pB,reuse=true)
+# function parallel_time(pA,pB,sparse_func)
+#     A = centralize(sparse_func,pA)
+#     B = centralize(sparse_func,pB)
+#     # explicit parallel transpose
+#     pBt = explicit_transpose(pB) |> fetch
+#     Bt = centralize(sparse_func,pBt)
+#     @test Bt == copy(transpose(B))
+#     hp_B = halfperm(B)
+#     @test Bt == hp_B
+
+#     AB0 = A*B
+#     C0 = transpose(B)*AB0
+#     # test basic sequential csr implementations to default csc sequential implementations.
+#     pAB,cacheAB = spmm(pA,pB,reuse=true)
+#     print("spmm:\t")
+#     @time spmm(pA,pB,reuse=true)
     
-    # pB will be transposed internally
-    pC,cacheC = spmtm(pB,pAB,reuse=true)
-    print("spmtm:\t")
-    @time spmtm(pB,pAB,reuse=true)
-    spmm!(pAB,pA,pB,cacheAB)
-    print("spmm!:\t")
-    @time spmm!(pAB,pA,pB,cacheAB)
-    spmtm!(pC,pB,pAB,cacheC)
-    print("spmtm!:\t")
-    @time spmtm!(pC,pB,pAB,cacheC)
-    # pC,cacheC = spmtmm(pA,pB)
-    pC,cacheC = spmtmm(pB,pA,pB,reuse=true)
-    print("spmtmm:\t")
-    # @time spmtmm(pA,pB)
-    @time spmtmm(pB,pA,pB,reuse=true)
-    # spmtmm!(pC,pA,pB,cacheC)
-    spmtmm!(pC,pB,pA,pB,cacheC)
-    print("spmtmm!:")
-    # @time spmtmm!(pC,pA,pB,cacheC)
-    @time spmtmm!(pC,pB,pA,pB,cacheC)
-    pC,cacheC = spmm(pBt,pAB,reuse=true)
-    print("spmm:\t")
-    @time spmm(pBt,pAB,reuse=true)
-    spmm!(pC,pBt,pAB,cacheC)
-    print("spmm!:\t")
-    @time spmm!(pC,pBt,pAB,cacheC)
-
-    # pB will be transposed internally
-    pC,cacheC = spmmm(pBt,pA,pB,reuse=true)
-    print("spmmm: ")
-    @time spmmm(pBt,pA,pB,reuse=true)
-    spmmm!(pC,pBt,pA,pB,cacheC)
-    print("spmmm!:")
-    @time spmmm!(pC,pBt,pA,pB,cacheC)
-
-    # @code_warntype spmmm!(pC,pBt,pA,pB,cacheC)
-    print("Local SpMM:\t")
-    C = A*B
-    @time C = A*B
-    X,cache = rap(Bt,A,B)
-    print("RAP:\t")
-    @time rap(Bt,A,B)
-    rap!(X,Bt,A,B,cache)
-    print("RAP!:\t")
-    @time rap!(X,Bt,A,B,cache)
-end
-
-function Base.display(A::SparseMatrixCSR)
-    display(halfperm(A) |> PartitionedArrays.ascsc)
-end
+#     # pB will be transposed internally
+#     pC,cacheC = spmtm(pB,pAB,reuse=true)
+#     print("spmtm:\t")
+#     @time spmtm(pB,pAB,reuse=true)
+#     spmm!(pAB,pA,pB,cacheAB)
+#     print("spmm!:\t")
+#     @time spmm!(pAB,pA,pB,cacheAB)
+#     spmtm!(pC,pB,pAB,cacheC)
+#     print("spmtm!:\t")
+#     @time spmtm!(pC,pB,pAB,cacheC)
+#     # pC,cacheC = spmtmm(pA,pB)
+#     pC,cacheC = spmtmm(pB,pA,pB,reuse=true)
+#     print("spmtmm:\t")
+#     # @time spmtmm(pA,pB)
+#     @time spmtmm(pB,pA,pB,reuse=true)
+#     # spmtmm!(pC,pA,pB,cacheC)
+#     spmtmm!(pC,pB,pA,pB,cacheC)
+#     print("spmtmm!:")
+#     # @time spmtmm!(pC,pA,pB,cacheC)
+#     @time spmtmm!(pC,pB,pA,pB,cacheC)
+#     pC,cacheC = spmm(pBt,pAB,reuse=true)
+#     print("spmm:\t")
+#     @time spmm(pBt,pAB,reuse=true)
+#     spmm!(pC,pBt,pAB,cacheC)
+#     print("spmm!:\t")
+#     @time spmm!(pC,pBt,pAB,cacheC)
+
+#     # pB will be transposed internally
+#     pC,cacheC = spmmm(pBt,pA,pB,reuse=true)
+#     print("spmmm: ")
+#     @time spmmm(pBt,pA,pB,reuse=true)
+#     spmmm!(pC,pBt,pA,pB,cacheC)
+#     print("spmmm!:")
+#     @time spmmm!(pC,pBt,pA,pB,cacheC)
+
+#     # @code_warntype spmmm!(pC,pBt,pA,pB,cacheC)
+#     print("Local SpMM:\t")
+#     C = A*B
+#     @time C = A*B
+#     X,cache = rap(Bt,A,B)
+#     print("RAP:\t")
+#     @time rap(Bt,A,B)
+#     rap!(X,Bt,A,B,cache)
+#     print("RAP!:\t")
+#     @time rap!(X,Bt,A,B,cache)
+# end
+
+# function Base.display(A::SparseMatrixCSR)
+#     display(halfperm(A) |> PartitionedArrays.ascsc)
+# end
 
 function spmtmm_tests(distribute)
     nodes_per_dir = (5,5,5)
@@ -209,7 +209,7 @@ function spmtmm_tests(distribute)
         pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch
         pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch
         parallel_tests(pA,pB,sparsecsr)
-        
+
         # Testing with a real prolongator requires PartitionedSolvers
         # T = eltype(typeof(own_own_values(pA).items))
         # pB = prolongator(T,pA)
@@ -219,7 +219,7 @@ function spmtmm_tests(distribute)
         pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch
         pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) |> fetch
         parallel_tests(pA,pB,sparse)
-        
+
         # Testing with a real prolongator requires PartitionedSolvers
         # T = eltype(typeof(own_own_values(pA).items))
         # pB = prolongator(T,pA)

From 1816fd739002fa378f8c5025b266c7a0c084d8b9 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Wed, 29 Jan 2025 15:41:52 +0100
Subject: [PATCH 32/34] fixed type piracy

---
 src/PartitionedArrays.jl          |   6 +-
 src/p_sparse_matrix.jl            |  71 +++++++++++---------
 src/sequential_implementations.jl | 106 +++++++++++++++++-------------
 test/debug_array/runtests.jl      |  20 +++---
 test/debug_array/spmtmm_tests.jl  |  10 +--
 test/spmtmm_tests.jl              |   8 +--
 times.txt                         |   2 +-
 7 files changed, 129 insertions(+), 94 deletions(-)

diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl
index eaa88a17..05746ce7 100644
--- a/src/PartitionedArrays.jl
+++ b/src/PartitionedArrays.jl
@@ -211,9 +211,13 @@ export near_nullspace_linear_elasticity
 export prolongator
 include("gallery.jl")
 
+export add
+export subtract
+export mul
+export matmul
+export matmul!
 export rap
 export rap!
-export -,+
 include("sequential_implementations.jl")
 
 end # module
diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 560ba66a..196bdb90 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2351,20 +2351,22 @@ end
 
 ### NEW ###
 function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+    @assert A.assembled
+    @assert B.assembled
     t = consistent(B,partition(axes(A,2)),reuse=true)
     A_own_own = own_own_values(A)
     A_own_ghost = own_ghost_values(A)
 
-    C_own_own_1 = map(*,A_own_own,own_own_values(B))
+    C_own_own_1 = map(matmul,A_own_own,own_own_values(B))
     
     # Wait for consistent
     B2, cacheB2 = fetch(t)
-    C_own_ghost_1 = map(*,A_own_own,own_ghost_values(B2))
-    C_own_own_2 = map(*,A_own_ghost,ghost_own_values(B2))
-    C_own_ghost_2 = map(*,A_own_ghost,ghost_ghost_values(B2))
+    C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2))
+    C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2))
+    C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2))
     
-    C_own_own = map(+, C_own_own_1, C_own_own_2)
-    C_own_ghost = map(+, C_own_ghost_1, C_own_ghost_2)
+    C_own_own = map(add, C_own_own_1, C_own_own_2)
+    C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2)
     
     Coo_cache = map(construct_spmm_cache, C_own_own)
     Cog_cache = map(construct_spmm_cache, C_own_ghost)
@@ -2392,12 +2394,12 @@ function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
     C_own_own = own_own_values(C)
     C_own_ghost = own_ghost_values(C)
 
-    map(mul!, C_own_own, A_own_own, own_own_values(B),Coo_cache)
+    map(matmul!, C_own_own, A_own_own, own_own_values(B),Coo_cache)
     wait(t)
-    map(mul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache)
+    map(matmul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache)
 
-    map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
-    map((C,A,B,cache) -> mul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
+    map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
+    map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
     C
 end
 ### End NEW ###
@@ -2440,13 +2442,15 @@ end
 
 ### NEW ###
 function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+    @assert A.assembled
+    @assert B.assembled
     Aoo = own_own_values(A)
     Aog = own_ghost_values(A)
     Boo = own_own_values(B)
     Bog = own_ghost_values(B)
 
-    C1go = map((A,B)->transpose(A)*B,Aog,Boo)
-    C1gg = map((A,B)->transpose(A)*B,Aog,Bog)
+    C1go = map((A,B)->matmul(transpose(A),B),Aog,Boo)
+    C1gg = map((A,B)->matmul(transpose(A),B),Aog,Bog)
 
     C1_values = map(C1go, C1gg, partition(A), partition(B)) do ghost_own, ghost_ghost, A_part, B_part
         own_own = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_own, 2))
@@ -2459,8 +2463,8 @@ function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
     C1_unassembled = PSparseMatrix(C1_values,partition(axes(A,2)),partition(axes(B,2)),assembled)
     t = assemble(C1_unassembled,reuse=true)
 
-    C2oo = map((A,B)->transpose(A)*B,Aoo,Boo)
-    C2og = map((A,B)->transpose(A)*B,Aoo,Bog)
+    C2oo = map((A,B)->matmul(transpose(A),B),Aoo,Boo)
+    C2og = map((A,B)->matmul(transpose(A),B),Aoo,Bog)
 
     C2_values = map(C2oo, C2og, partition(A), partition(B)) do own_own, own_ghost, A_part, B_part
         ghost_own = similar(own_own,0,size(own_own,2))
@@ -2497,12 +2501,12 @@ function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
     Boo = own_own_values(B)
     Bog = own_ghost_values(B)
 
-    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache)
-    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache)
+    map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache)
+    map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache)
         
     t = assemble!(C1, C1_unassembled, assemblyCache)
-    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache)
-    map((C,A,B,cache)->mul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache)
+    map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache)
+    map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache)
     wait(t)
     add!(C, C1, C2, mergeCache)
     C
@@ -3059,7 +3063,7 @@ end
 
 function add(A::PSparseMatrix,B::PSparseMatrix)
     function add_own_own(A,B)
-        C = A+B
+        C = add(A,B)
         # reuse IA/IB for cache
         KA = precompute_nzindex(C,A)
         KB = precompute_nzindex(C,B)
@@ -3129,6 +3133,9 @@ end
 
 # Interpret A as if its transpose is needed
 function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false))
+    @assert A.assembled
+    @assert B.assembled
+    @assert C.assembled
     consistency_task = consistent(C, partition(axes(B,2)),reuse=true)
     
     Aoo = own_own_values(A)
@@ -3152,8 +3159,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal
     Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays
     Dog2, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays        
 
-    Dgo = map(+,Dgo1,Dgo2) # different sparsity patterns so not in-place.
-    Dog = map(+,Dog1,Dog2)
+    Dgo = map(add,Dgo1,Dgo2) # different sparsity patterns so not in-place.
+    Dog = map(add,Dog1,Dog2)
 
     D1_values = map(Dgo, Dog, partition(C), partition(C2)) do ghost_own, ghost_ghost, C_part, C2_part
         own_own = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_own, 2))
@@ -3168,8 +3175,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal
     Doo2,Doo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
     Dog2,Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
 
-    Doo = map(+,Doo1,Doo2)
-    Dog = map(+,Dog1,Dog2)
+    Doo = map(add,Doo1,Doo2)
+    Dog = map(add,Dog1,Dog2)
 
     Doo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Doo_cache,Doo)
     Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog)
@@ -3195,6 +3202,8 @@ function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fal
 end
 
 function spmtmm(A::PSparseMatrix,P::PSparseMatrix;kwargs...)
+    @assert A.assembled
+    @assert P.assembled
     spmtmm(transpose(P),A,P;kwargs...)
 end
 
@@ -3226,7 +3235,6 @@ function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMa
     Cgg = ghost_ghost_values(C2)
 
     map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache)
-
     map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache)
     map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache)
 
@@ -3246,6 +3254,9 @@ function spmtmm!(C::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache)
 end
 
 function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false))
+    @assert A.assembled
+    @assert B.assembled
+    @assert C.assembled
     B2_task = consistent(B,partition(axes(A,2)),reuse=true)
     Aoo = own_own_values(A)
     Aog = own_ghost_values(A)
@@ -3261,7 +3272,7 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals
     Bgg = ghost_ghost_values(B2)
 
     Doo2,Doo_cache = map(rap,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays
-    Doo12 = map(+,Doo1,Doo2)
+    Doo12 = map(add,Doo1,Doo2)
 
     C2, Ccache = fetch(C2_task)
   
@@ -3272,17 +3283,17 @@ function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(fals
     Doo3,Doo_cache = map(rap,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays
     Doo4,Doo_cache = map(rap,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays
   
-    Doo34 = map(+,Doo3,Doo4)
-    Doo = map(+,Doo12,Doo34)
+    Doo34 = map(add,Doo3,Doo4)
+    Doo = map(add,Doo12,Doo34)
   
     Dog1,Dog_cache = map(rap,Aoo,Boo,Cog) |> tuple_of_arrays
     Dog2,Dog_cache = map(rap,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays
     Dog3,Dog_cache = map(rap,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays
     Dog4,Dog_cache = map(rap,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays
 
-    Dog12 = map(+,Dog1,Dog2)
-    Dog34 = map(+,Dog3,Dog4)
-    Dog = map(+,Dog12,Dog34)
+    Dog12 = map(add,Dog1,Dog2)
+    Dog34 = map(add,Dog3,Dog4)
+    Dog = map(add,Dog12,Dog34)
 
     D_values = map(Doo, Dog, partition(A),partition(C2)) do own_own, own_ghost, A_part,C_part
         ghost_own = similar(own_own,0,size(own_own, 2))
diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl
index 49540046..f69da7b1 100644
--- a/src/sequential_implementations.jl
+++ b/src/sequential_implementations.jl
@@ -1,34 +1,52 @@
-function Base.:*(A::SparseMatrixCSR,B::SparseMatrixCSR)
-    C = ascsc(B)*ascsc(A)
+function matmul(A::SparseMatrixCSC,B::SparseMatrixCSC)
+    A*B
+end
+
+function matmul(A::Transpose{Tv,<:SparseMatrixCSC} where Tv,B::SparseMatrixCSC)
+    A*B
+end
+
+function matmul(A::SparseMatrixCSC,B::Transpose{Tv,<:SparseMatrixCSC} where Tv)
+    A*B
+end
+
+function matmul(A::Transpose{TvA,<:SparseMatrixCSC} where TvA,B::Transpose{TvB,<:SparseMatrixCSC} where TvB)
+    A*B
+end
+
+function matmul(A::SparseMatrixCSR,B::SparseMatrixCSR)
+    C = matmul(ascsc(B),ascsc(A))
     ascsr(C)
 end
 
-function Base.:*(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR)
-    C = ascsc(B)*transpose(ascsc(At.parent))
+function matmul(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR)
+    C = matmul(ascsc(B),transpose(ascsc(At.parent)))
     ascsr(C)
 end
 
-function Base.:*(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
+function matmul(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
     C = transpose(ascsc(Bt.parent))*ascsc(A)
     ascsr(C)
 end
 
-function Base.:*(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB)
+function matmul(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB)
     C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent))
     ascsr(C)
 end
 
-function Base.:*(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+function mul(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> x*a, A.nzval))
 end
-function Base.:*(A::SparseMatrixCSR,x::Number) *(x,A) end
 
-function Base.:/(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti}
-    SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval))
-end
+function mul(A::SparseMatrixCSR,x::Number) mul(x,A) end
+
+
+# function quotient(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti}
+#     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval))
+# end
 
 # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros.
-function Base.:+(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
+function add(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
     Ti = promote_type(TiA,TiB)
     Tv = promote_type(TvA,TvB)
@@ -88,7 +106,7 @@ function Base.:+(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB})
 end
 
 # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B.
-function Base.:-(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
+function subtract(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
     Ti = promote_type(TiA,TiB)
     Tv = promote_type(TvA,TvB)
@@ -147,12 +165,12 @@ function Base.:-(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB})
     SparseMatrixCSR{Bi}(p,r,IC,JC,VC)   # A += B
 end
 
-function Base.:-(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
+function subtract(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti}
     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a->-a, A.nzval))
 end
 
 # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros.
-function Base.:+(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB}
+function add(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB}
     if size(A) != size(B) && throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
     Ti = promote_type(TiA,TiB)
     Tv = promote_type(TvA,TvB)
@@ -212,7 +230,7 @@ function Base.:+(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where
 end
 
 # Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B.
-function Base.:-(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB}
+function subtract(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
     Ti = promote_type(TiA,TiB)
     Tv = promote_type(TvA,TvB)
@@ -271,33 +289,33 @@ function Base.:-(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where
     SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC)
 end
 
-function Base.:-(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+function subtract(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
     SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval))
 end
 
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             A::SparseMatrixCSC,
                             B::SparseMatrixCSC,
                             cache)
-    mul!(ascsr(C),ascsr(B),ascsr(A),cache)
+    matmul!(ascsr(C),ascsr(B),ascsr(A),cache)
     C
 end
 
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             A::SparseMatrixCSC,
                             B::SparseMatrixCSC,
                             α::Number,
                             β::Number,
                             cache)
-    mul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache)
+    matmul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache)
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
-                            At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
-                            B::SparseMatrixCSC)
+function matmul!(C::SparseMatrixCSC,
+                 At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                 B::SparseMatrixCSC)
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -342,7 +360,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSC,
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
+function matmul!(C::SparseMatrixCSC{Tv,Ti},
                             At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
                             B::SparseMatrixCSC{Tv,Ti},
                             α::Number,
@@ -389,14 +407,14 @@ function LinearAlgebra.mul!(C::SparseMatrixCSC{Tv,Ti},
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             A::SparseMatrixCSC,
                             Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv)
-    mul!(ascsr(C),transpose(ascsr(B)),ascsr(A))
+    matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A))
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR,
+function matmul!(C::SparseMatrixCSR,
                             A::SparseMatrixCSR,
                             B::SparseMatrixCSR,
                             cache)
@@ -446,15 +464,15 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR,
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             A::SparseMatrixCSC,
                             Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
                             cache)
-    mul!(ascsr(C),transpose(ascsr(B)),ascsr(A),cache)
+    matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache)
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR,
+function matmul!(C::SparseMatrixCSR,
                             A::SparseMatrixCSR,
                             B::SparseMatrixCSR,
                             α::Number,
@@ -506,35 +524,35 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR,
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             A::SparseMatrixCSC,
                             Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
                             α::Number,
                             β::Number,
                             cache)
-    mul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache)
+    matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache)
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
                             B::SparseMatrixCSC,
                             cache)
-    mul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent)))
+    matmul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent)))
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSC,
+function matmul!(C::SparseMatrixCSC,
                             At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
                             B::SparseMatrixCSC,
                             α::Number,
                             β::Number,
                             cache)
-    mul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β)
+    matmul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β)
     C
 end
 
-# Workaround to supply in-place mul! with auxiliary array, as these are not returned by multiply function exported by SparseArrays
+# Workaround to supply in-place matmul with auxiliary array, as these are not returned by multiply function exported by SparseArrays
 function construct_spmm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti}
     q = size(A,2)
     xb = zeros(Ti,q)
@@ -556,7 +574,7 @@ function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
     construct_spmtm_cache(ascsr(A))
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR,
+function matmul!(C::SparseMatrixCSR,
                             At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
                             B::SparseMatrixCSR,
                             cache)
@@ -599,7 +617,7 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR,
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR,
+function matmul!(C::SparseMatrixCSR,
                             At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
                             B::SparseMatrixCSR,
                             α::Number,
@@ -644,19 +662,19 @@ function LinearAlgebra.mul!(C::SparseMatrixCSR,
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR,
+function matmul!(C::SparseMatrixCSR,
                             A::SparseMatrixCSR,
                             Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
-    mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A))
+    matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A))
     C
 end
 
-function LinearAlgebra.mul!(C::SparseMatrixCSR,
+function matmul!(C::SparseMatrixCSR,
                             A::SparseMatrixCSR,
                             Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
                             α::Number,
                             β::Number)
-    mul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β)
+    matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β)
     C
 end
 
diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl
index 88abfdeb..a175b722 100644
--- a/test/debug_array/runtests.jl
+++ b/test/debug_array/runtests.jl
@@ -3,25 +3,25 @@ module DebugArrayRunTests
 using Test
 using PartitionedArrays
 
-# @testset "debug_array" begin include("debug_array_tests.jl") end
+@testset "debug_array" begin include("debug_array_tests.jl") end
 
-# @testset "primitives" begin include("primitives_tests.jl")  end
+@testset "primitives" begin include("primitives_tests.jl")  end
 
-# @testset "p_range" begin include("p_range_tests.jl")  end
+@testset "p_range" begin include("p_range_tests.jl")  end
 
-# @testset "p_vector" begin include("p_vector_tests.jl")  end
+@testset "p_vector" begin include("p_vector_tests.jl")  end
 
-# @testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl")  end
+@testset "p_sparse_matrix" begin include("p_sparse_matrix_tests.jl")  end
 
-# @testset "block_arrays" begin include("block_arrays_tests.jl")  end
+@testset "block_arrays" begin include("block_arrays_tests.jl")  end
 
-# @testset "gallery" begin include("gallery_tests.jl")  end
+@testset "gallery" begin include("gallery_tests.jl")  end
 
-# @testset "p_timer" begin include("p_timer_tests.jl")  end
+@testset "p_timer" begin include("p_timer_tests.jl")  end
 
-# @testset "fdm_example" begin include("fdm_example.jl")  end
+@testset "fdm_example" begin include("fdm_example.jl")  end
 
-# @testset "fem_example" begin include("fem_example.jl")  end
+@testset "fem_example" begin include("fem_example.jl")  end
 
 @testset "spmtmm_tests" begin include("spmtmm_tests.jl")  end
 
diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl
index ba3cf431..5969b60f 100644
--- a/test/debug_array/spmtmm_tests.jl
+++ b/test/debug_array/spmtmm_tests.jl
@@ -7,12 +7,14 @@ include(joinpath("..","spmtmm_tests.jl"))
 
 v = 1:5
 M = sparse(v,v,v)
-@test nnz(M-M) == nnz(M)
-display(M-M)
+Z = subtract(M,M)
+@test nnz(Z) == nnz(M)
+display(Z)
 
 M = sparsecsr(v,v,v)
-@test nnz(M-M) == nnz(M)
-display(M-M)
+Z = subtract(M,M)
+@test nnz(Z) == nnz(M)
+display(Z)
 
 with_debug(spmtmm_tests)
 
diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl
index aacfdcad..bf80328a 100644
--- a/test/spmtmm_tests.jl
+++ b/test/spmtmm_tests.jl
@@ -51,8 +51,8 @@ function parallel_tests(pA,pB,sparse_func)
     hp_B = halfperm(B)
     @test Bt == hp_B
 
-    AB0 = A*B
-    C0 = transpose(B)*AB0
+    AB0 = matmul(A,B)
+    C0 = matmul(transpose(B),AB0)
     # test basic sequential csr implementations to default csc sequential implementations.
     pAB,cacheAB = spmm(pA,pB,reuse=true)
     AB = centralize(sparse_func,pAB)
@@ -96,8 +96,8 @@ function parallel_tests(pA,pB,sparse_func)
 
     # unequal sizes backward (small to large)
     if size(pA) != size(pB)
-        CB0 = C0*Bt
-        D0 = transpose(Bt)*CB0
+        CB0 = matmul(C0,Bt)
+        D0 = matmul(transpose(Bt),CB0)
         pCB,cacheCB = spmm(pC,pBt,reuse=true)
         CB = centralize(sparse_func,pCB)
         @test approx_equivalent(CB,CB0)
diff --git a/times.txt b/times.txt
index 835d3320..e4fd4e27 100644
--- a/times.txt
+++ b/times.txt
@@ -1 +1 @@
-Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2139673, max = 0.2139673, avg = 0.2139673), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4185178, max = 0.4185178, avg = 0.4185178), "Phase 1" => (min = 1.0e-6, max = 1.0e-6, avg = 1.0e-6))
+Dict{String, @NamedTuple{min::Float64, max::Float64, avg::Float64}}("Phase 143" => (min = 0.2017, max = 0.2017, avg = 0.2017), "Phase 3" => (min = 1.0e-7, max = 1.0e-7, avg = 1.0e-7), "Matrix Assembly" => (min = 0.4044642, max = 0.4044642, avg = 0.4044642), "Phase 1" => (min = 2.0e-7, max = 2.0e-7, avg = 2.0e-7))

From 1794482bc23e1875fb1859fbfb1b154b32bf2b03 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Thu, 30 Jan 2025 13:39:53 +0100
Subject: [PATCH 33/34] fixed oversight in sparse matrix expansion in consistent

---
 src/p_sparse_matrix.jl            | 134 ++++++++++++++---------------
 src/sequential_implementations.jl | 135 ++++++++++++------------------
 src/sparse_utils.jl               |  33 ++++----
 3 files changed, 137 insertions(+), 165 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 196bdb90..29bef80a 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -1964,12 +1964,13 @@ function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D},
         map_global_to_own!(J_rcv_own,cols_co)
         map_global_to_ghost!(J_rcv_ghost,cols_co)
         n_ghost_rows = ghost_length(rows_co)
+        n_own_rows = own_length(rows_co)
         n_own_cols = own_length(cols_co)
         n_ghost_cols = ghost_length(cols_co)
         TA = typeof(A.blocks.ghost_own)
         own_own = A.blocks.own_own
-        # New own_ghost shares as much memory with existing own_ghost block as possible. Extent depends on sparse format in use.
-        own_ghost = expand_sparse_matrix_columns(A.blocks.own_ghost,n_ghost_cols) 
+        # New own_ghost shares index and value arrays with existing own_ghost block. Pointer arrays are newly allocated (in case of CSC and CSR).
+        own_ghost = expand_sparse_matrix(A.blocks.own_ghost,n_own_rows,n_ghost_cols)
         ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols)
         ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols)
         K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own)
@@ -2326,82 +2327,81 @@ function spmm!(C,A,B,state)
 end
 
 ### OLD ###
-# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
-#     # TODO latency hiding
-#     @assert A.assembled
-#     @assert B.assembled
-#     col_partition = partition(axes(A,2))
-#     C,cacheC = consistent(B,col_partition;reuse=true) |> fetch
-#     D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays
-#     assembled = true
-#     D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled)
-#     if val_parameter(reuse)
-#         cache = (C,cacheC,cacheD)
-#         return D,cache
-#     end
-#     D
-# end
-
-# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
-#     (C,cacheC,cacheD)= cache
-#     consistent!(C,B,cacheC) |> wait
-#     map(spmm!,partition(D),partition(A),partition(C),cacheD)
-#     D
-# end
-
-### NEW ###
 function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+    # TODO latency hiding
     @assert A.assembled
     @assert B.assembled
-    t = consistent(B,partition(axes(A,2)),reuse=true)
-    A_own_own = own_own_values(A)
-    A_own_ghost = own_ghost_values(A)
+    col_partition = partition(axes(A,2))
+    C,cacheC = consistent(B,col_partition;reuse=true) |> fetch
+    D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays
+    assembled = true
+    D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled)
+    if val_parameter(reuse)
+        cache = (C,cacheC,cacheD)
+        return D,cache
+    end
+    D
+end
 
-    C_own_own_1 = map(matmul,A_own_own,own_own_values(B))
-    
-    # Wait for consistent
-    B2, cacheB2 = fetch(t)
-    C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2))
-    C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2))
-    C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2))
+function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+    (C,cacheC,cacheD)= cache
+    consistent!(C,B,cacheC) |> wait
+    map(spmm!,partition(D),partition(A),partition(C),cacheD)
+    D
+end
+
+### NEW ###
+# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+#     @assert A.assembled
+#     @assert B.assembled
+#     t = consistent(B,partition(axes(A,2)),reuse=true)
+#     A_own_own = own_own_values(A)
+#     A_own_ghost = own_ghost_values(A)
+#     C_own_own_1 = map(matmul,A_own_own,own_own_values(B))
+
+#     # Wait for consistent
+#     B2, cacheB2 = fetch(t)
+#     C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2))
+#     C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2))
+#     C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2))
     
-    C_own_own = map(add, C_own_own_1, C_own_own_2)
-    C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2)
+#     C_own_own = map(add, C_own_own_1, C_own_own_2)
+#     C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2)
     
-    Coo_cache = map(construct_spmm_cache, C_own_own)
-    Cog_cache = map(construct_spmm_cache, C_own_ghost)
+#     Coo_cache = map(construct_spmm_cache, C_own_own)
+#     Cog_cache = map(construct_spmm_cache, C_own_ghost)
     
-    C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part
-        ghost_own = similar(own_own,0,size(own_own,2))
-        ghost_ghost = similar(own_own,0,size(own_ghost,2))
-        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-        split_matrix(blocks,A_part.row_permutation,B_part.col_permutation)
-    end
+#     C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part
+#         ghost_own = similar(own_own,0,size(own_own,2))
+#         ghost_ghost = similar(own_own,0,size(own_ghost,2))
+#         blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+#         split_matrix(blocks,A_part.row_permutation,B_part.col_permutation)
+#     end
     
-    C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true)
-    if val_parameter(reuse)
-        cache = (B2,cacheB2,(Coo_cache,Cog_cache))
-        return C,cache
-    end
-    C
-end
+#     C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true)
+#     if val_parameter(reuse)
+#         cache = (B2,cacheB2,(Coo_cache,Cog_cache))
+#         return C,cache
+#     end
+#     C
+# end
 
-function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
-    (B2,cacheB2,(Coo_cache,Cog_cache)) = cache
-    t = consistent!(B2,B,cacheB2)
-    A_own_own = own_own_values(A)
-    A_own_ghost = own_ghost_values(A)
-    C_own_own = own_own_values(C)
-    C_own_ghost = own_ghost_values(C)
+# function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+#     (B2,cacheB2,(Coo_cache,Cog_cache)) = cache
+#     t = consistent!(B2,B,cacheB2)
+#     A_own_own = own_own_values(A)
+#     A_own_ghost = own_ghost_values(A)
+#     C_own_own = own_own_values(C)
+#     C_own_ghost = own_ghost_values(C)
 
-    map(matmul!, C_own_own, A_own_own, own_own_values(B),Coo_cache)
-    wait(t)
-    map(matmul!, C_own_ghost, A_own_own, own_ghost_values(B2),Cog_cache)
+#     map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache)
+#     wait(t)
+#     map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache)
 
-    map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
-    map((C,A,B,cache) -> matmul!(C,A,B,1,1,cache), C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
-    C
-end
+#     map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
+#     map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
+#     C
+# end
 ### End NEW ###
 
 function spmtm(A,B;reuse=Val(false))
diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl
index f69da7b1..ed952606 100644
--- a/src/sequential_implementations.jl
+++ b/src/sequential_implementations.jl
@@ -1,16 +1,5 @@
-function matmul(A::SparseMatrixCSC,B::SparseMatrixCSC)
-    A*B
-end
-
-function matmul(A::Transpose{Tv,<:SparseMatrixCSC} where Tv,B::SparseMatrixCSC)
-    A*B
-end
-
-function matmul(A::SparseMatrixCSC,B::Transpose{Tv,<:SparseMatrixCSC} where Tv)
-    A*B
-end
-
-function matmul(A::Transpose{TvA,<:SparseMatrixCSC} where TvA,B::Transpose{TvB,<:SparseMatrixCSC} where TvB)
+function matmul(A::Union{Transpose{TvA,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvA,
+                B::Union{Transpose{TvB,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvB)
     A*B
 end
 
@@ -40,11 +29,6 @@ end
 
 function mul(A::SparseMatrixCSR,x::Number) mul(x,A) end
 
-
-# function quotient(A::SparseMatrixCSR{Bi,Tv,Ti},x::Number) where {Bi,Tv,Ti}
-#     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> a/x, A.nzval))
-# end
-
 # Alternative to lazy csr to csc for matrix addition that does not drop structural zeros.
 function add(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB}
     if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end
@@ -293,22 +277,20 @@ function subtract(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
     SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval))
 end
 
-
 function matmul!(C::SparseMatrixCSC,
-                            A::SparseMatrixCSC,
-                            B::SparseMatrixCSC,
-                            cache)
+                 A::SparseMatrixCSC,
+                 B::SparseMatrixCSC,
+                 cache)
     matmul!(ascsr(C),ascsr(B),ascsr(A),cache)
     C
 end
 
-
 function matmul!(C::SparseMatrixCSC,
-                            A::SparseMatrixCSC,
-                            B::SparseMatrixCSC,
-                            α::Number,
-                            β::Number,
-                            cache)
+                 A::SparseMatrixCSC,
+                 B::SparseMatrixCSC,
+                 α::Number,
+                 β::Number,
+                 cache)
     matmul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache)
     C
 end
@@ -361,10 +343,10 @@ function matmul!(C::SparseMatrixCSC,
 end
 
 function matmul!(C::SparseMatrixCSC{Tv,Ti},
-                            At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
-                            B::SparseMatrixCSC{Tv,Ti},
-                            α::Number,
-                            β::Number) where {Tv,Ti}
+                 At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}},
+                 B::SparseMatrixCSC{Tv,Ti},
+                 α::Number,
+                 β::Number) where {Tv,Ti}
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -408,16 +390,16 @@ function matmul!(C::SparseMatrixCSC{Tv,Ti},
 end
 
 function matmul!(C::SparseMatrixCSC,
-                            A::SparseMatrixCSC,
-                            Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv)
+                 A::SparseMatrixCSC,
+                 Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv)
     matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A))
     C
 end
 
 function matmul!(C::SparseMatrixCSR,
-                            A::SparseMatrixCSR,
-                            B::SparseMatrixCSR,
-                            cache)
+                 A::SparseMatrixCSR,
+                 B::SparseMatrixCSR,
+                 cache)
     a,b = size(C)
     p,q = size(A)
     r,s = size(B)
@@ -465,19 +447,19 @@ function matmul!(C::SparseMatrixCSR,
 end
 
 function matmul!(C::SparseMatrixCSC,
-                            A::SparseMatrixCSC,
-                            Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
-                            cache)
+                 A::SparseMatrixCSC,
+                 Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                 cache)
     matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache)
     C
 end
 
 function matmul!(C::SparseMatrixCSR,
-                            A::SparseMatrixCSR,
-                            B::SparseMatrixCSR,
-                            α::Number,
-                            β::Number,
-                            cache)
+                 A::SparseMatrixCSR,
+                 B::SparseMatrixCSR,
+                 α::Number,
+                 β::Number,
+                 cache)
     a,b = size(C)
     p,q = size(A)
     r,s = size(B)
@@ -525,29 +507,29 @@ function matmul!(C::SparseMatrixCSR,
 end
 
 function matmul!(C::SparseMatrixCSC,
-                            A::SparseMatrixCSC,
-                            Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
-                            α::Number,
-                            β::Number,
-                            cache)
+                 A::SparseMatrixCSC,
+                 Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                 α::Number,
+                 β::Number,
+                 cache)
     matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache)
     C
 end
 
 function matmul!(C::SparseMatrixCSC,
-                            At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
-                            B::SparseMatrixCSC,
-                            cache)
+                 At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                 B::SparseMatrixCSC,
+                 cache)
     matmul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent)))
     C
 end
 
 function matmul!(C::SparseMatrixCSC,
-                            At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
-                            B::SparseMatrixCSC,
-                            α::Number,
-                            β::Number,
-                            cache)
+                 At::Transpose{Tv,<:SparseMatrixCSC} where Tv,
+                 B::SparseMatrixCSC,
+                 α::Number,
+                 β::Number,
+                 cache)
     matmul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β)
     C
 end
@@ -575,9 +557,9 @@ function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
 end
 
 function matmul!(C::SparseMatrixCSR,
-                            At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
-                            B::SparseMatrixCSR,
-                            cache)
+                 At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 B::SparseMatrixCSR,
+                 cache)
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -618,11 +600,11 @@ function matmul!(C::SparseMatrixCSR,
 end
 
 function matmul!(C::SparseMatrixCSR,
-                            At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
-                            B::SparseMatrixCSR,
-                            α::Number,
-                            β::Number,
-                            cache)
+                 At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 B::SparseMatrixCSR,
+                 α::Number,
+                 β::Number,
+                 cache)
     a,b = size(C)
     p,q = size(At)
     r,s = size(B)
@@ -663,17 +645,17 @@ function matmul!(C::SparseMatrixCSR,
 end
 
 function matmul!(C::SparseMatrixCSR,
-                            A::SparseMatrixCSR,
-                            Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
+                 A::SparseMatrixCSR,
+                 Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
     matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A))
     C
 end
 
 function matmul!(C::SparseMatrixCSR,
-                            A::SparseMatrixCSR,
-                            Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
-                            α::Number,
-                            β::Number)
+                 A::SparseMatrixCSR,
+                 Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 α::Number,
+                 β::Number)
     matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β)
     C
 end
@@ -1169,15 +1151,6 @@ function rap(R::SparseMatrixCSR{Bi,TvR,TiR},
     _rap(R,A,P)
 end
 
-# Reuses internal arrays of A!!!
-function construct_spmmm_cache(C::SparseMatrixCSR,A::SparseMatrixCSR)
-    cache = JaggedArray(colvals(A), A.rowptr)
-end
-
-function construct_spmmm_cache(C::SparseMatrixCSC,A::SparseMatrixCSC)
-    cache = JaggedArray(rowvals(A), A.colptr)
-end
-
 function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSR)
     (xb,x,JAP,_) = cache
     (xb,x,JAP)
diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index ea12f3f9..12f6d248 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -725,39 +725,38 @@ function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray)
     K
 end
 
-function expand_sparse_matrix_columns(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti}, n) where Bi
-    p,q = size(A)
-    @assert n >= q
-    SparseMatrixCSR{Bi}(p,n,A.rowptr,A.colval,A.nzval)
+function expand_sparse_matrix(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti},m,n) where Bi
+    p = size(A,1)
+    new_rowptr = similar(A.rowptr,m+1)
+    map!(identity,new_rowptr,A.rowptr)
+    last_index = A.colptr[end]
+    for i in p+1:m+1
+        new_colptr[i] = last_index
+    end
+    SparseMatrixCSR{Bi}(m,n,A.new_rowptr,A.colval,A.nzval)
 end
 
-function expand_sparse_matrix_columns(A::SparseMatrixCSC{Tv,Ti}, n) where {Tv,Ti}
-    p,q = size(A)
-    @assert n >= q
+function expand_sparse_matrix(A::SparseMatrixCSC{Tv,Ti},m,n) where {Tv,Ti}
+    q = size(A,2)
     new_colptr = similar(A.colptr,n+1)
     map!(identity,new_colptr,A.colptr)
     last_index = A.colptr[end]
-    foreach(q+1:n+1) do i
-        new_colptr[i] = last_index
+    for j in q+1:n+1
+        new_colptr[j] = last_index
     end
-    SparseMatrixCSC{Tv,Ti}(p,n,new_colptr,A.rowval,A.nzval)
+    SparseMatrixCSC{Tv,Ti}(m,n,new_colptr,A.rowval,A.nzval)
 end
 
-# Currently not implemented by the SparseMatricesCSR module
+# Currently not implemented by the SparseMatricesCSR package
 function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi
     SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[])
 end
 
-# Currently not implemented by the SparseMatricesCSR module
+# Currently not implemented by SparseMatricesCSR
 function Base.similar(A::SparseMatrixCSR{Bi}) where Bi
     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A)))
 end
 
-# This method is implemented also by SparseMatricesCSR, but related methods aren't.
-# function Base.copy(A::SparseMatrixCSR{Bi}) where Bi
-#     SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), copy(nonzeros(A)))
-# end
-
 # Currently not implemented by the SparseMatricesCSR module
 function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti})
     Acsc_T = copy(transpose(ascsc(At.parent))) # materialize SparseMatrixCSC transpose

From 2378df50f580a6bf1cec3d47cb7731187021da74 Mon Sep 17 00:00:00 2001
From: jop611 <meijer.jop@gmail.com>
Date: Fri, 31 Jan 2025 12:55:52 +0100
Subject: [PATCH 34/34] fixes to expand_sparse_matrix and inclusion of general
 case.

---
 src/p_sparse_matrix.jl           | 128 +++++++++++++++----------------
 src/sparse_utils.jl              |  53 +++++++++----
 test/debug_array/spmtmm_tests.jl |  22 ++++--
 3 files changed, 116 insertions(+), 87 deletions(-)

diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl
index 29bef80a..447d96b4 100644
--- a/src/p_sparse_matrix.jl
+++ b/src/p_sparse_matrix.jl
@@ -2327,81 +2327,81 @@ function spmm!(C,A,B,state)
 end
 
 ### OLD ###
+# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
+#     # TODO latency hiding
+#     @assert A.assembled
+#     @assert B.assembled
+#     col_partition = partition(axes(A,2))
+#     C,cacheC = consistent(B,col_partition;reuse=true) |> fetch
+#     D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays
+#     assembled = true
+#     D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled)
+#     if val_parameter(reuse)
+#         cache = (C,cacheC,cacheD)
+#         return D,cache
+#     end
+#     D
+# end
+
+# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+#     (C,cacheC,cacheD)= cache
+#     consistent!(C,B,cacheC) |> wait
+#     map(spmm!,partition(D),partition(A),partition(C),cacheD)
+#     D
+# end
+
+### NEW ###
 function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
-    # TODO latency hiding
     @assert A.assembled
     @assert B.assembled
-    col_partition = partition(axes(A,2))
-    C,cacheC = consistent(B,col_partition;reuse=true) |> fetch
-    D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays
-    assembled = true
-    D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled)
-    if val_parameter(reuse)
-        cache = (C,cacheC,cacheD)
-        return D,cache
-    end
-    D
-end
-
-function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
-    (C,cacheC,cacheD)= cache
-    consistent!(C,B,cacheC) |> wait
-    map(spmm!,partition(D),partition(A),partition(C),cacheD)
-    D
-end
+    t = consistent(B,partition(axes(A,2)),reuse=true)
+    A_own_own = own_own_values(A)
+    A_own_ghost = own_ghost_values(A)
+    C_own_own_1 = map(matmul,A_own_own,own_own_values(B))
 
-### NEW ###
-# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false))
-#     @assert A.assembled
-#     @assert B.assembled
-#     t = consistent(B,partition(axes(A,2)),reuse=true)
-#     A_own_own = own_own_values(A)
-#     A_own_ghost = own_ghost_values(A)
-#     C_own_own_1 = map(matmul,A_own_own,own_own_values(B))
-
-#     # Wait for consistent
-#     B2, cacheB2 = fetch(t)
-#     C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2))
-#     C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2))
-#     C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2))
+    # Block until the `consistent` task `t` started above has finished
+    B2, cacheB2 = fetch(t)
+    C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2))
+    C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2))
+    C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2))
     
-#     C_own_own = map(add, C_own_own_1, C_own_own_2)
-#     C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2)
+    C_own_own = map(add, C_own_own_1, C_own_own_2)
+    C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2)
     
-#     Coo_cache = map(construct_spmm_cache, C_own_own)
-#     Cog_cache = map(construct_spmm_cache, C_own_ghost)
+    Coo_cache = map(construct_spmm_cache, C_own_own)
+    Cog_cache = map(construct_spmm_cache, C_own_ghost)
     
-#     C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part
-#         ghost_own = similar(own_own,0,size(own_own,2))
-#         ghost_ghost = similar(own_own,0,size(own_ghost,2))
-#         blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
-#         split_matrix(blocks,A_part.row_permutation,B_part.col_permutation)
-#     end
+    C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part
+        ghost_own = similar(own_own,0,size(own_own,2))
+        ghost_ghost = similar(own_own,0,size(own_ghost,2))
+        blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost)
+        split_matrix(blocks,A_part.row_permutation,B_part.col_permutation)
+    end
     
-#     C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true)
-#     if val_parameter(reuse)
-#         cache = (B2,cacheB2,(Coo_cache,Cog_cache))
-#         return C,cache
-#     end
-#     C
-# end
+    C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true)
+    if val_parameter(reuse)
+        cache = (B2,cacheB2,(Coo_cache,Cog_cache))
+        return C,cache
+    end
+    C
+end
 
-# function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
-#     (B2,cacheB2,(Coo_cache,Cog_cache)) = cache
-#     t = consistent!(B2,B,cacheB2)
-#     A_own_own = own_own_values(A)
-#     A_own_ghost = own_ghost_values(A)
-#     C_own_own = own_own_values(C)
-#     C_own_ghost = own_ghost_values(C)
+function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache)
+    (B2,cacheB2,(Coo_cache,Cog_cache)) = cache
+    t = consistent!(B2,B,cacheB2)
+    A_own_own = own_own_values(A)
+    A_own_ghost = own_ghost_values(A)
+    C_own_own = own_own_values(C)
+    C_own_ghost = own_ghost_values(C)
 
-#     map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache)
-#     wait(t)
-#     map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache)
+    map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache)
+    wait(t)
+    map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache)
 
-#     map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
-#     map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
-#     C
-# end
+    map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache)
+    map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache)
+    C
+end
 ### End NEW ###
 
 function spmtm(A,B;reuse=Val(false))
diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl
index 12f6d248..def57040 100644
--- a/src/sparse_utils.jl
+++ b/src/sparse_utils.jl
@@ -688,8 +688,8 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A)
     b
 end
 
-################ NEW ################
 
+################ NEW ################
 # Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array.
 # Only use for read-only operations.
 function findnz_minimal(A::SparseMatrixCSC)
@@ -725,24 +725,45 @@ function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray)
     K
 end
 
+# Generic fallback: expand A to size (m,n) by rebuilding it from findnz(A); allocates a new matrix.
+function expand_sparse_matrix(A,m,n)
+    compresscoo(typeof(A),findnz(A)...,m,n)
+end
+
+# Expand a CSR matrix to size (m,n), with (m,n) >= size(A), without changing its non-zero entries.
+# Allocates a new row pointer array only if m > size(A,1); colval and nzval are shared with A.
 function expand_sparse_matrix(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti},m,n) where Bi
-    p = size(A,1)
-    new_rowptr = similar(A.rowptr,m+1)
-    map!(identity,new_rowptr,A.rowptr)
-    last_index = A.colptr[end]
-    for i in p+1:m+1
-        new_colptr[i] = last_index
+    p,q = size(A)
+    @assert m >= p
+    @assert n >= q
+    if m > p
+        new_rowptr = similar(A.rowptr,m+1)
+        map!(identity,new_rowptr,A.rowptr)
+        last_index = A.rowptr[end]
+        for i in p+1:m+1
+            new_rowptr[i] = last_index
+        end
+    else
+        new_rowptr = A.rowptr
     end
-    SparseMatrixCSR{Bi}(m,n,A.new_rowptr,A.colval,A.nzval)
+    SparseMatrixCSR{Bi}(m,n,new_rowptr,A.colval,A.nzval)
 end
 
+# Expand a CSC matrix to size (m,n), with (m,n) >= size(A), without changing its non-zero entries.
+# Allocates a new column pointer array only if n > size(A,2); rowval and nzval are shared with A.
 function expand_sparse_matrix(A::SparseMatrixCSC{Tv,Ti},m,n) where {Tv,Ti}
-    q = size(A,2)
-    new_colptr = similar(A.colptr,n+1)
-    map!(identity,new_colptr,A.colptr)
-    last_index = A.colptr[end]
-    for j in q+1:n+1
-        new_colptr[j] = last_index
+    p,q = size(A)
+    @assert m >= p
+    @assert n >= q
+    if n > q
+        new_colptr = similar(A.colptr,n+1)
+        map!(identity,new_colptr,A.colptr)
+        last_index = A.colptr[end]
+        for j in q+1:n+1
+            new_colptr[j] = last_index
+        end
+    else
+        new_colptr = A.colptr
     end
     SparseMatrixCSC{Tv,Ti}(m,n,new_colptr,A.rowval,A.nzval)
 end
@@ -971,6 +992,4 @@ end
 function symbolic_halfperm!(A::SparseMatrixCSC,At::JaggedArray)
     symbolic_halfperm!(ascsr(A),At)
     A
-end
-
-
+end
\ No newline at end of file
diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl
index 5969b60f..1b154b59 100644
--- a/test/debug_array/spmtmm_tests.jl
+++ b/test/debug_array/spmtmm_tests.jl
@@ -6,14 +6,24 @@ using Test
 include(joinpath("..","spmtmm_tests.jl"))
 
 v = 1:5
-M = sparse(v,v,v)
-Z = subtract(M,M)
-@test nnz(Z) == nnz(M)
+A = sparse(v,v,v)
+Z = subtract(A,A)
+@test nnz(Z) == nnz(A)
 display(Z)
 
-M = sparsecsr(v,v,v)
-Z = subtract(M,M)
-@test nnz(Z) == nnz(M)
+B = sparse(v,v,-v)
+Z = add(A,B)
+@test nnz(Z) == nnz(A)
+display(Z)
+
+A = sparsecsr(v,v,v)
+Z = subtract(A,A)
+@test nnz(Z) == nnz(A)
+display(Z)
+
+B = sparsecsr(v,v,-v)
+Z = add(A,B)
+@test nnz(Z) == nnz(A)
 display(Z)
 
 with_debug(spmtmm_tests)