diff --git a/Project.toml b/Project.toml index f31b3b4..263d9ee 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ ForwardDiff = "0.10" Gridap = "0.18.7" LinearAlgebra = "1" MPI = "0.16, 0.17, 0.18, 0.19, 0.20" -PartitionedArrays = "0.3.3" +PartitionedArrays = "0.5" SparseArrays = "1.3" SparseMatricesCSR = "0.6.6" WriteVTK = "1.12.0" diff --git a/src/Algebra.jl b/src/Algebra.jl index 57bcbac..64410c9 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -29,192 +29,6 @@ function Algebra.allocate_in_domain(matrix::BlockPMatrix) allocate_in_domain(BlockPVector{V},matrix) end -# PSparseMatrix copy - -function Base.copy(a::PSparseMatrix) - mats = map(copy,partition(a)) - cache = map(PartitionedArrays.copy_cache,a.cache) - return PSparseMatrix(mats,partition(axes(a,1)),partition(axes(a,2)),cache) -end - -# PartitionedArrays extras - -function LinearAlgebra.axpy!(α,x::PVector,y::PVector) - @check partition(axes(x,1)) === partition(axes(y,1)) - map(partition(x),partition(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - consistent!(y) |> wait - return y -end - -function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) - map(blocks(x),blocks(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - return y -end - -function Algebra.axpy_entries!( - α::Number, A::PSparseMatrix, B::PSparseMatrix; - check::Bool=true -) -# We should definitely check here that the index partitions are the same. -# However: Because the different matrices are assembled separately, the objects are not the -# same (i.e can't use ===). Checking the index partitions would then be costly... - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) - map(partition(A),partition(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -function Algebra.axpy_entries!( - α::Number, A::BlockPMatrix, B::BlockPMatrix; - check::Bool=true -) - map(blocks(A),blocks(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.ArrayCounter,axes) - @notimplemented -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} - b = Algebra.CounterCOO{T}(axes) - b.nnz = a.nnz - b -end - -# This might go to Gridap in the future. We keep it here for the moment. 
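(Note: most of the helpers deleted in this hunk were stop-gaps written against PartitionedArrays 0.3 and fall away with the bump to 0.5 in Project.toml above; for instance, find_local_to_local_map gives way to local_to_local_map later in this diff. A minimal sketch of the ghost-consistency idiom encoded by the removed axpy! overload, assuming the PartitionedArrays >= 0.5 names uniform_partition, pones and pzeros with a sequential ranks driver:

    using PartitionedArrays, LinearAlgebra
    ranks = LinearIndices((4,))              # sequential stand-in for 4 parts
    rows  = uniform_partition(ranks, 20)     # 20 global rows, block-partitioned
    x, y  = pones(rows), pzeros(rows)
    map((xi, yi) -> axpy!(2.0, xi, yi), partition(x), partition(y))  # part-local y .+= 2x
    consistent!(y) |> wait                   # re-sync ghost rows afterwards

The trailing consistent! is the step a purely part-local map cannot provide on its own.)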
-function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} - counter = change_axes(a.counter,axes) - Algebra.AllocationCOO(counter,a.I,a.J,a.V) -end - -# Array of PArrays -> PArray of Arrays -function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - PartitionedArrays.getany(aj) - end - end -end - -function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - aj.items[i] - end - end -end - -# This type is required because MPIArray from PArrays -# cannot be instantiated with a NULL communicator -struct MPIVoidVector{T} <: AbstractVector{T} - comm::MPI.Comm - function MPIVoidVector(::Type{T}) where {T} - new{T}(MPI.COMM_NULL) - end -end - -Base.size(a::MPIVoidVector) = (0,) -Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() -function Base.getindex(a::MPIVoidVector,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.setindex!(a::MPIVoidVector,v,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) - println(io,"MPIVoidVector") -end - -function num_parts(comm::MPI.Comm) - if comm != MPI.COMM_NULL - nparts = MPI.Comm_size(comm) - else - nparts = -1 - end - nparts -end -@inline num_parts(comm::MPIArray) = num_parts(comm.comm) -@inline num_parts(comm::DebugArray) = length(comm.items) -@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) - -function get_part_id(comm::MPI.Comm) - if comm != MPI.COMM_NULL - id = MPI.Comm_rank(comm)+1 - else - id = -1 - end - id -end -@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) -@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) - -""" - i_am_in(comm::MPIArray) - i_am_in(comm::DebugArray) - - Returns `true` if the processor is part of the subcommunicator `comm`. -""" -function i_am_in(comm::MPI.Comm) - get_part_id(comm) >=0 -end -@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) -@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) -@inline i_am_in(comm::DebugArray) = true - -function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) - x_new = map(new_parts) do p - if isa(x,MPIArray) - PartitionedArrays.getany(x) - elseif isa(x,DebugArray) && (p <= length(x.items)) - x.items[p] - else - default - end - end - return x_new -end - -function generate_subparts(parts::MPIArray,new_comm_size) - root_comm = parts.comm - root_size = MPI.Comm_size(root_comm) - rank = MPI.Comm_rank(root_comm) - - @static if isdefined(MPI,:MPI_UNDEFINED) - mpi_undefined = MPI.MPI_UNDEFINED[] - else - mpi_undefined = MPI.API.MPI_UNDEFINED[] - end - - if root_size == new_comm_size - return parts - else - if rank < new_comm_size - comm = MPI.Comm_split(root_comm,0,0) - return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) - else - comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) - return MPIVoidVector(eltype(parts)) - end - end -end - -function generate_subparts(parts::DebugArray,new_comm_size) - DebugArray(LinearIndices((new_comm_size,))) -end - # local_views function local_views(a) @@ -254,9 +68,11 @@ end # change_ghost function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T - same_partition = (a.index_partition === partition(ids)) - a_new = same_partition ? 
a : change_ghost(T,a,ids) - if make_consistent && (!same_partition || !is_consistent) + if (a.index_partition === partition(ids)) + return a + end + a_new = change_ghost(T,a,ids) + if make_consistent && !is_consistent consistent!(a_new) |> wait end return a_new @@ -285,44 +101,26 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_ return BlockPVector(vals,ids) end -# This function computes a mapping among the local identifiers of a and b -# for which the corresponding global identifiers are both in a and b. -# Note that the haskey check is necessary because in the general case -# there might be gids in b which are not present in a -function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) - a_local_to_b_local = fill(Int32(-1),local_length(a)) - b_local_to_global = local_to_global(b) - a_global_to_local = global_to_local(a) - for blid in 1:local_length(b) - gid = b_local_to_global[blid] - if a_global_to_local[gid] != zero(eltype(a_global_to_local)) - alid = a_global_to_local[gid] - a_local_to_b_local[alid] = blid - end - end - a_local_to_b_local -end - # This type is required in order to be able to access the local portion # of distributed sparse matrices and vectors using local indices from the # distributed test and trial spaces -struct LocalView{T,N,A,B} <:AbstractArray{T,N} +struct LocalView{T,N,A} <:AbstractArray{T,N} plids_to_value::A - d_to_lid_to_plid::B + d_to_lid_to_plid::NTuple{N,Vector{Int32}} local_size::NTuple{N,Int} function LocalView( - plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} + plids_to_value::AbstractArray{T,N}, d_to_lid_to_plid::NTuple{N,Vector{Int32}} + ) where {T,N} A = typeof(plids_to_value) - B = typeof(d_to_lid_to_plid) local_size = map(length,d_to_lid_to_plid) - new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) + new{T,N,A}(plids_to_value,d_to_lid_to_plid,local_size) end end Base.size(a::LocalView) = a.local_size Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + plids = map(getindex,a.d_to_lid_to_plid,lids) if all(i->i>0,plids) a.plids_to_value[plids...] else @@ -330,23 +128,18 @@ function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} end end function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + plids = map(getindex,a.d_to_lid_to_plid,lids) @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" a.plids_to_value[plids...] 
= v end -function _lid_to_plid(lid,lid_to_plid) - plid = lid_to_plid[lid] - plid -end - function local_views(a::PVector,new_rows::PRange) old_rows = axes(a,1) if partition(old_rows) === partition(new_rows) partition(a) else - map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows - LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) + map(partition(a),partition(old_rows),partition(new_rows)) do a,old_rows,new_rows + LocalView(a,(local_to_local_map(new_rows,old_rows),)) end end end @@ -356,12 +149,12 @@ function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) partition(a) else - map(partition(a), - partition(old_rows),partition(old_cols), - partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols - rl2lmap = find_local_to_local_map(new_rows,old_rows) - cl2lmap = find_local_to_local_map(new_cols,old_cols) - LocalView(matrix_partition,(rl2lmap,cl2lmap)) + map( + partition(a),partition(old_rows),partition(old_cols),partition(new_rows),partition(new_cols) + ) do a,old_rows,old_cols,new_rows,new_cols + rl2lmap = local_to_local_map(new_rows,old_rows) + cl2lmap = local_to_local_map(new_cols,old_cols) + LocalView(a,(rl2lmap,cl2lmap)) end end end @@ -377,867 +170,3 @@ function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange end |> to_parray_of_arrays return map(mortar,vals) end - -# PSparseMatrix assembly - -struct FullyAssembledRows end -struct SubAssembledRows end - -# For the moment we use COO format even though -# it is quite memory consuming. -# We need to implement communication in other formats in -# PartitionedArrays.jl - -struct PSparseMatrixBuilderCOO{T,B} - local_matrix_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter( - builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}) where A - test_dofs_gids_prange, trial_dofs_gids_prange = axs - counters = map(partition(test_dofs_gids_prange),partition(trial_dofs_gids_prange)) do r,c - axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) - Algebra.CounterCOO{A}(axs) - end - DistributedCounterCOO(builder.par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) -end - -function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv - T = eltype(Tv) - return PSparseMatrix{T} -end - - -""" -""" -struct DistributedCounterCOO{A,B,C,D} <: GridapType - par_strategy::A - counters::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D - function DistributedCounterCOO( - par_strategy, - counters::AbstractArray{<:Algebra.CounterCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) - A = typeof(par_strategy) - B = typeof(counters) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) - end -end - -function local_views(a::DistributedCounterCOO) - a.counters -end - -function local_views(a::DistributedCounterCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange - a.counters -end - -function Algebra.nz_allocation(a::DistributedCounterCOO) - allocs = map(nz_allocation,a.counters) - DistributedAllocationCOO(a.par_strategy,allocs,a.test_dofs_gids_prange,a.trial_dofs_gids_prange) -end - -struct DistributedAllocationCOO{A,B,C,D} <: 
GridapType - par_strategy::A - allocs::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D - function DistributedAllocationCOO( - par_strategy, - allocs::AbstractArray{<:Algebra.AllocationCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) - A = typeof(par_strategy) - B = typeof(allocs) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,allocs,test_dofs_gids_prange,trial_dofs_gids_prange) - end -end - -function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, - axes::Tuple{<:PRange,<:PRange}) where {A,B} - local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols - (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) - end - allocs = map(change_axes,a.allocs,local_axes) - DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) -end - -function change_axes(a::MatrixBlock{<:DistributedAllocationCOO}, - axes::Tuple{<:Vector,<:Vector}) - block_ids = CartesianIndices(a.array) - rows, cols = axes - array = map(block_ids) do I - change_axes(a[I],(rows[I[1]],cols[I[2]])) - end - return ArrayBlock(array,a.touched) -end - -function local_views(a::DistributedAllocationCOO) - a.allocs -end - -function local_views(a::DistributedAllocationCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange - a.allocs -end - -function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) - array = map(local_views,a.array) |> to_parray_of_arrays - return map(ai -> ArrayBlock(ai,a.touched),array) -end - -function get_allocations(a::DistributedAllocationCOO) - I,J,V = map(local_views(a)) do alloc - alloc.I, alloc.J, alloc.V - end |> tuple_of_arrays - return I,J,V -end - -function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) - tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays - return tuple_of_array_of_parrays -end - -get_test_gids(a::DistributedAllocationCOO) = a.test_dofs_gids_prange -get_trial_gids(a::DistributedAllocationCOO) = a.trial_dofs_gids_prange -get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) -get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array)) - -function Algebra.create_from_nz(a::PSparseMatrix) - # For FullyAssembledRows the underlying Exchanger should - # not have ghost layer making assemble! 
do nothing (TODO check) - assemble!(a) |> wait - a -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:FullyAssembledRows}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:FullyAssembledRows}}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function _fa_create_from_nz_with_callback(callback,a) - - # Recover some data - I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) - - rows = _setup_prange(test_dofs_gids_prange,I;ghost=false,ax=:rows) - b = callback(rows) - - # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) - - # Create the range for cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress local portions - values = map(create_from_nz,local_views(asys)) - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) - return A, b -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:SubAssembledRows}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:SubAssembledRows}}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function _sa_create_from_nz_with_callback(callback,async_callback,a,b) - # Recover some data - I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) - - # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) - - # Create the Prange for the rows - rows = _setup_prange(test_dofs_gids_prange,I;ax=:rows) - - # Move (I,J,V) triplets to the owner process of each row I. 
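(Aside: after this PR, this migration step lives in create_from_nz_assembled in src/Assembly.jl below. A condensed restatement; migrate_coo_to_row_owners! is a hypothetical name, but the callees are exactly the ones the new code uses:

    # hypothetical wrapper; see create_from_nz_assembled for the real sequencing
    function migrate_coo_to_row_owners!(I, J, V, test_ids)
        map(map_local_to_global!, I, test_ids)                # local -> global row ids
        rows = filter_and_replace_ghost(map(unpermute, test_ids), I)
        wait(PartitionedArrays.assemble_coo!(I, J, V, rows))  # ship triplets to row owners
        return rows
    end

In the real code the wait is deferred so that vector assembly can overlap the exchange, as the comments below explain.)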
- # The triplets are accompanyed which Jo which is the process column owner - Jo = get_gid_owners(J,trial_dofs_gids_prange;ax=:cols) - t = _assemble_coo!(I,J,V,rows;owners=Jo) - - # Here we can overlap computations - # This is a good place to overlap since - # sending the matrix rows is a lot of data - if !isa(b,Nothing) - bprange=_setup_prange_from_pvector_allocation(b) - b = callback(bprange) - end - - # Wait the transfer to finish - wait(t) - - # Create the Prange for the cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols,owners=Jo) - - # Overlap rhs communications with CSC compression - t2 = async_callback(b) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress the local matrices - values = map(create_from_nz,local_views(asys)) - - # Wait the transfer to finish - if !isa(t2,Nothing) - wait(t2) - end - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) - return A, b -end - - -# PVector assembly - -struct PVectorBuilder{T,B} - local_vector_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange}) - T = builder.local_vector_type - rows, = axs - counters = map(partition(rows)) do rows - axs = (Base.OneTo(local_length(rows)),) - nz_counter(ArrayBuilder(T),axs) - end - PVectorCounter(builder.par_strategy,counters,rows) -end - -function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv - T = eltype(Tv) - return PVector{T} -end - -struct PVectorCounter{A,B,C} - par_strategy::A - counters::B - test_dofs_gids_prange::C -end - -Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() - -function local_views(a::PVectorCounter) - a.counters -end - -function local_views(a::PVectorCounter,rows) - @check rows === a.test_dofs_gids_prange - a.counters -end - -function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows}) - dofs = a.test_dofs_gids_prange - values = map(nz_allocation,a.counters) - PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) -end - -struct PVectorAllocationTrackOnlyValues{A,B,C} - par_strategy::A - values::B - test_dofs_gids_prange::C -end - -function local_views(a::PVectorAllocationTrackOnlyValues) - a.values -end - -function local_views(a::PVectorAllocationTrackOnlyValues,rows) - @check rows === a.test_dofs_gids_prange - a.values -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - rows = _setup_prange_without_ghosts(a.test_dofs_gids_prange) - _rhs_callback(a,rows) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:SubAssembledRows}) - # This point MUST NEVER be reached. 
If reached there is an inconsistency - # in the parallel code in charge of vector assembly - @assert false -end - -function _rhs_callback(row_partitioned_vector_partition,rows) - # The ghost values in row_partitioned_vector_partition are - # aligned with the FESpace but not with the ghost values in the rows of A - b_fespace = PVector(row_partitioned_vector_partition.values, - partition(row_partitioned_vector_partition.test_dofs_gids_prange)) - - # This one is aligned with the rows of A - b = similar(b_fespace,eltype(b_fespace),(rows,)) - - # First transfer owned values - b .= b_fespace - - # Now transfer ghost - function transfer_ghost(b,b_fespace,ids,ids_fespace) - num_ghosts_vec = ghost_length(ids) - gho_to_loc_vec = ghost_to_local(ids) - loc_to_glo_vec = local_to_global(ids) - gid_to_lid_fe = global_to_local(ids_fespace) - for ghost_lid_vec in 1:num_ghosts_vec - lid_vec = gho_to_loc_vec[ghost_lid_vec] - gid = loc_to_glo_vec[lid_vec] - lid_fespace = gid_to_lid_fe[gid] - b[lid_vec] = b_fespace[lid_fespace] - end - end - map( - transfer_ghost, - partition(b), - partition(b_fespace), - b.index_partition, - b_fespace.index_partition) - - return b -end - -function Algebra.create_from_nz(a::PVector) - assemble!(a) |> wait - return a -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:FullyAssembledRows}, - c_fespace::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - - function callback(rows) - _rhs_callback(c_fespace,rows) - end - - A,b = _fa_create_from_nz_with_callback(callback,a) - return A,b -end - -struct PVectorAllocationTrackTouchedAndValues{A,B,C} - allocations::A - values::B - test_dofs_gids_prange::C -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:SubAssembledRows}, - b::PVectorAllocationTrackTouchedAndValues) - - function callback(rows) - _rhs_callback(b,rows) - end - - function async_callback(b) - # now we can assemble contributions - assemble!(b) - end - - A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) - return A,b -end - -struct ArrayAllocationTrackTouchedAndValues{A} - touched::Vector{Bool} - values::A -end - -Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() - - -function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) - @check rows === a.test_dofs_gids_prange - a.allocations -end - -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if i>0 - if !(a.touched[i]) - a.touched[i]=true - end - if !isa(v,Nothing) - a.values[i]=c(v,a.values[i]) - end - end - nothing -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if !isa(v,Nothing) - for (ve,ie) in zip(v,i) - Arrays.add_entry!(c,a,ve,ie) - end - else - for ie in i - Arrays.add_entry!(c,a,nothing,ie) - end - end - nothing -end - - -function _setup_touched_and_allocations_arrays(values) - touched = map(values) do values - fill!(Vector{Bool}(undef,length(values)),false) - end - allocations = map(values,touched) do values,touched - ArrayAllocationTrackTouchedAndValues(touched,values) - end - touched, allocations -end - -function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, - b::PVectorCounter{<:SubAssembledRows}) - A = nz_allocation(a) - dofs = 
b.test_dofs_gids_prange - values = map(nz_allocation,b.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) - return A,B -end - -function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows}) - dofs = a.test_dofs_gids_prange - values = map(nz_allocation,a.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) -end - -function local_views(a::PVectorAllocationTrackTouchedAndValues) - a.allocations -end - -function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) - - # Find the ghost rows - allocations = local_views(a.allocations) - indices = partition(a.test_dofs_gids_prange) - I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices - dofs_lids_touched = findall(allocation.touched) - loc_to_gho = local_to_ghost(indices) - n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) - I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) - cur = 1 - for lid in dofs_lids_touched - dof_lid = loc_to_gho[lid] - if dof_lid != 0 - I_ghost_lids[cur] = dof_lid - cur = cur+1 - end - end - I_ghost_lids - end - - ghost_to_global, ghost_to_owner = map( - find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays - - ngids = length(a.test_dofs_gids_prange) - _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) - rows = _setup_prange_from_pvector_allocation(a) - b = _rhs_callback(a,rows) - t2 = assemble!(b) - # Wait the transfer to finish - if t2 !== nothing - wait(t2) - end - return b -end - -# Common Assembly Utilities -function first_gdof_from_ids(ids) - if own_length(ids) == 0 - return 1 - end - lid_to_gid = local_to_global(ids) - owned_to_lid = own_to_local(ids) - return Int(lid_to_gid[first(owned_to_lid)]) -end - -function find_gid_and_owner(ighost_to_jghost,jindices) - jghost_to_local = ghost_to_local(jindices) - jlocal_to_global = local_to_global(jindices) - jlocal_to_owner = local_to_owner(jindices) - ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) - - ighost_to_global = jlocal_to_global[ighost_to_jlocal] - ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] - return ighost_to_global, ighost_to_owner -end - -# The given ids are assumed to be a sub-set of the lids -function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) - glo_to_loc = global_to_local(a) - loc_to_gho = local_to_ghost(a) - - # First pass: Allocate - i = 0 - ghost_lids_touched = fill(false,ghost_length(a)) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - i += 1 - end - end - gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) - - # Second pass: fill - i = 1 - fill!(ghost_lids_touched,false) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - gids_ghost_lid_to_ghost_lid[i] = ghost_lid - i += 1 - end - end - - return gids_ghost_lid_to_ghost_lid -end - -# Find the neighbours of partition1 trying -# to use those in partition2 as a hint -function _find_neighbours!(partition1, partition2) - partition2_snd, partition2_rcv = assembly_neighbors(partition2) - partition2_graph = ExchangeGraph(partition2_snd, 
partition2_rcv) - return assembly_neighbors(partition1; neighbors=partition2_graph) -end - -# to_global! & to_local! analogs, needed for dispatching in block assembly - -function to_local_indices!(I,ids::PRange;kwargs...) - map(to_local!,I,partition(ids)) -end - -function to_global_indices!(I,ids::PRange;kwargs...) - map(to_global!,I,partition(ids)) -end - -function get_gid_owners(I,ids::PRange;kwargs...) - map(I,partition(ids)) do I, indices - glo_to_loc = global_to_local(indices) - loc_to_own = local_to_owner(indices) - map(x->loc_to_own[glo_to_loc[x]], I) - end -end - -for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] - @eval begin - function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) - map($f,I,ids) - end - - function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) - @check ax ∈ [:rows,:cols] - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if ax == :rows - $f(I[i,j],ids[i]) - else - $f(I[i,j],ids[j]) - end - end - end - end -end - -# _setup_matrix : local matrices + global PRanges -> Global matrix - -function _setup_matrix(values,rows::PRange,cols::PRange) - return PSparseMatrix(values,partition(rows),partition(cols)) -end - -function _setup_matrix(values,rows::Vector{<:PRange},cols::Vector{<:PRange}) - block_ids = CartesianIndices((length(rows),length(cols))) - block_mats = map(block_ids) do I - block_values = map(v -> blocks(v)[I],values) - return _setup_matrix(block_values,rows[I[1]],cols[I[2]]) - end - return mortar(block_mats) -end - -# _assemble_coo! : local coo triplets + global PRange -> Global coo values - -function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) - if isa(owners,Nothing) - PArrays.assemble_coo!(I,J,V,partition(rows)) - else - assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) - end -end - -function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if isa(owners,Nothing) - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) - else - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) - end - end -end - -function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) - """ - Returns three JaggedArrays with the coo triplets - to be sent to the corresponding owner parts in parts_snd - """ - function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) - global_to_local_row = global_to_local(row_lids) - local_row_to_owner = local_to_owner(row_lids) - owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) - ptrs = zeros(Int32,length(parts_snd)+1) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - ptrs[owner_to_i[owner]+1] +=1 - end - end - PArrays.length_to_ptrs!(ptrs) - gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) - gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) - jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) - v_snd_data = zeros(eltype(k_v),ptrs[end]-1) - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - gj = k_gj[k] - v = k_v[k] - p = ptrs[owner_to_i[owner]] - gi_snd_data[p] = gi - gj_snd_data[p] = gj - jo_snd_data[p] = k_jo[k] - v_snd_data[p] = v - k_v[k] = zero(v) - ptrs[owner_to_i[owner]] += 1 - end - end - PArrays.rewind_ptrs!(ptrs) - gi_snd = JaggedArray(gi_snd_data,ptrs) - gj_snd = JaggedArray(gj_snd_data,ptrs) - jo_snd = 
JaggedArray(jo_snd_data,ptrs) - v_snd = JaggedArray(v_snd_data,ptrs) - gi_snd, gj_snd, jo_snd, v_snd - end - """ - Pushes to coo_entries_with_column_owner the tuples - gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes - """ - function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - current_n = length(k_gi) - new_n = current_n + length(gi_rcv.data) - resize!(k_gi,new_n) - resize!(k_gj,new_n) - resize!(k_jo,new_n) - resize!(k_v,new_n) - for p in 1:length(gi_rcv.data) - k_gi[current_n+p] = gi_rcv.data[p] - k_gj[current_n+p] = gj_rcv.data[p] - k_jo[current_n+p] = jo_rcv.data[p] - k_v[current_n+p] = v_rcv.data[p] - end - end - part = linear_indices(row_partition) - parts_snd, parts_rcv = assembly_neighbors(row_partition) - coo_entries_with_column_owner = map(tuple,I,J,Jown,V) - gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays - graph = ExchangeGraph(parts_snd,parts_rcv) - t1 = exchange(gi_snd,graph) - t2 = exchange(gj_snd,graph) - t3 = exchange(jo_snd,graph) - t4 = exchange(v_snd,graph) - @async begin - gi_rcv = fetch(t1) - gj_rcv = fetch(t2) - jo_rcv = fetch(t3) - v_rcv = fetch(t4) - map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - I,J,Jown,V - end -end - -# dofs_gids_prange can be either test_dofs_gids_prange or trial_dofs_gids_prange -# In the former case, gids is a vector of global test dof identifiers, while in the -# latter, a vector of global trial dof identifiers -function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) - if !ghost - _setup_prange_without_ghosts(dofs_gids_prange) - elseif isa(owners,Nothing) - _setup_prange_with_ghosts(dofs_gids_prange,gids) - else - _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) - end -end - -function _setup_prange( - dofs_gids_prange::AbstractVector{<:PRange}, - gids::AbstractMatrix; - ax=:rows,ghost=true,owners=nothing -) - @check ax ∈ (:rows,:cols) - block_ids = LinearIndices(dofs_gids_prange) - pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) - - gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange - gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] - gids_union = pvcat(gids_slice) - - owners_union = nothing - if !isnothing(owners) - owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] - owners_union = pvcat(owners_slice) - end - - return gids_union, owners_union - end |> tuple_of_arrays - - return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) -end - -# Create PRange for the rows of the linear system -# without local ghost dofs as per required in the -# FullyAssembledRows() parallel assembly strategy -function _setup_prange_without_ghosts(dofs_gids_prange::PRange) - ngdofs = length(dofs_gids_prange) - indices = map(partition(dofs_gids_prange)) do dofs_indices - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) - OwnAndGhostIndices(own_indices,ghost_indices) - end - return PRange(indices) -end - -# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition -# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
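(Aside: this superset assumption is what makes the _find_neighbours! hint above valid; the exchange graph already known for the dof partition covers the neighbours of the row partition, so the scalable discovery can be skipped. Distilled, with indices_dofs and indices_rows as hypothetical stand-ins for the two index partitions:

    snd, rcv = assembly_neighbors(indices_dofs)          # graph we already trust
    hint     = ExchangeGraph(snd, rcv)
    assembly_neighbors(indices_rows; neighbors = hint)   # reuse instead of rediscover

No such guarantee exists for the columns, which is why the hint is only applied to the rows.)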
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed -# is that each processor can determine locally with a single layer of ghost cells the global indices and associated -# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities -# (i.e., either cells or faces). -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> dof PRange ghost ids - gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition) |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner) -end - -# Here we cannot assume that the sparse communication graph underlying -# trial_dofs_gids_partition is a superset of the one underlying indices. -# Here we chould check whether it is included and call _find_neighbours!() -# if this is the case. At present, we are not taking advantage of this, -# but let the parallel scalable algorithm to compute the reciprocal to do the job. -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - gids,owners,dofs_gids_partition) do gids, owners, indices - ghost_touched = Dict{Int,Bool}() - ghost_to_global = Int64[] - ghost_to_owner = Int64[] - me = part_id(indices) - for (j,jo) in zip(gids,owners) - if jo != me - if !haskey(ghost_touched,j) - push!(ghost_to_global,j) - push!(ghost_to_owner,jo) - ghost_touched[j] = true - end - end - end - ghost_to_global, ghost_to_owner - end |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=false) -end - -function _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=true) - indices = map(dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner) - OwnAndGhostIndices(own_indices,ghost_indices) - end - if discover_neighbours - _find_neighbours!(indices,dofs_gids_partition) - end - return PRange(indices) -end diff --git a/src/Assembly.jl b/src/Assembly.jl new file mode 100644 index 0000000..41673c5 --- /dev/null +++ b/src/Assembly.jl @@ -0,0 +1,381 @@ + +# Parallel assembly strategies + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols) + DefaultAssemblyStrategy() +end + +# When LocallyAssembling, make sure that you also loop over ghost cells. 
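+# (Naming relative to the code removed from src/Algebra.jl, as far as this diff
+# shows: LocallyAssembled plays the role of the former FullyAssembledRows, and
+# Assembled that of SubAssembledRows, while SubAssembled is new and keeps the
+# matrix in sub-assembled form. The row filter below drops contributions to
+# ghost rows: e.g. with local_to_ghost(rows) == [0,0,0,1,2], only rows 1:3 pass.)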
+function local_assembly_strategy(::LocallyAssembled,rows,cols) + rows_local_to_ghost = local_to_ghost(rows) + GenericAssemblyStrategy( + identity, + identity, + row->iszero(rows_local_to_ghost[row]), + col->true + ) +end + +# PSparseMatrix and PVector builders + +struct DistributedArrayBuilder{T,N,B} + local_array_type::Type{T} + strategy::B + function DistributedArrayBuilder( + local_array_type::Type{T}, + strategy::AssemblyStrategy + ) where T + N = ndims(T) + B = typeof(strategy) + new{T,N,B}(local_array_type,strategy) + end +end + +const PVectorBuilder{T,B} = DistributedArrayBuilder{T,1,B} +const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T,2,B} + +function Algebra.get_array_type(::PSparseMatrixBuilder{Tm}) where Tm + T = eltype(Tm) + return PSparseMatrix{T} +end + +function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv + T = eltype(Tv) + return PVector{T} +end + +# Distributed counters and allocators + +""" + DistributedCounter{S,T,N} <: GridapType + +Distributed N-dimensional counter, with local counters of type T. +Follows assembly strategy S. +""" +struct DistributedCounter{S,T,N,A,B} <: GridapType + counters :: A + axes :: B + strategy :: S + function DistributedCounter( + counters :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: AssemblyStrategy + ) where {T,N} + A, B, S = typeof(counters), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(counters,axes,strategy) + end +end + +Base.axes(a::DistributedCounter) = a.axes +Base.axes(a::DistributedCounter,d::Integer) = a.axes[d] +local_views(a::DistributedCounter) = a.counters + +function local_views(a::DistributedCounter,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.counters +end + +const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} +Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() + +const PSparseMatrixCounter{S,T,A,B} = DistributedCounter{S,T,2,A,B} +Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop() + +""" + DistributedAllocation{S,T,N} <: GridapType + +Distributed N-dimensional allocator, with local allocators of type T. +Follows assembly strategy S. +""" +struct DistributedAllocation{S,T,N,A,B} <: GridapType + allocs :: A + axes :: B + strategy :: S + function DistributedAllocation( + allocs :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: AssemblyStrategy + ) where {T,N} + A, B, S = typeof(allocs), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(allocs,axes,strategy) + end +end + +Base.axes(a::DistributedAllocation) = a.axes +Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d] +local_views(a::DistributedAllocation) = a.allocs + +function local_views(a::DistributedAllocation,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.allocs +end + +function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N} + indices = map(partition,axes) + local_axes = map(indices...) do indices... 
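+    # one Base.OneTo axis per dimension, sized to the local length of the new index set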
+ map(ids -> Base.OneTo(local_length(ids)), indices) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocation(allocs,axes,a.strategy) +end + +const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} +const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} + +function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) where S + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + +function collect_touched_ids(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S + touched_ids = map(local_views(a),partition(axes(a,1))) do a, ids + n_global = global_length(ids) + rows = remove_ghost(unpermute(ids)) + + ghost_lids = ghost_to_local(ids) + touched_ghost_lids = collect(Int,filter(lid -> a.touched[lid], ghost_lids)) + touched_ghost_owners = local_to_owner(ids)[touched_ghost_lids] + touched_ghost_gids = to_global!(touched_ghost_lids, ids) + ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) + replace_ghost(rows,ghost) + end + return touched_ids +end + +# PSparseMatrix assembly chain +# +# 1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter +# 2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation +# 3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix + +function Algebra.nz_counter(builder::PSparseMatrixBuilder{MT},axs::NTuple{2,<:PRange}) where MT + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do rows, cols + local_axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) + Algebra.CounterCOO{MT}(local_axs) + end + DistributedCounter(counters,axs,builder.strategy) +end + +function Algebra.nz_allocation(a::PSparseMatrixCounter) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Algebra.create_from_nz(a::PSparseMatrix) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) + A, = create_from_nz_locally_assembled(a) + return A +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) + A, = create_from_nz_assembled(a) + return A +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:SubAssembled}) + A, = create_from_nz_subassembled(a) + return A +end + +# PVector assembly chain: +# +# 1 - nz_counter(PVectorBuilder) -> PVectorCounter +# 2 - nz_allocation(PVectorCounter) -> PVectorAllocation +# 3 - create_from_nz(PVectorAllocation) -> PVector + +function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::NTuple{1,<:PRange}) where VT + rows, = axs + counters = map(partition(rows)) do rows + axs = (Base.OneTo(local_length(rows)),) + nz_counter(ArrayBuilder(VT),axs) + end + DistributedCounter(counters,(rows,),builder.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}}) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + values = map(nz_allocation,local_views(a)) + allocs = map(TrackedArrayAllocation,values) + return DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector}) + # This point MUST NEVER be reached. 
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}}) + rows = remove_ghost(unpermute(axes(a,1))) + values = local_views(a) + b = rhs_callback(values,rows) + return b +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled}) + rows = collect_touched_ids(a) + values = map(ai -> ai.values, local_views(a)) + b = rhs_callback(values,rows) + t2 = assemble!(b) + if t2 !== nothing + wait(t2) + end + return b +end + +# PSystem assembly chain: +# +# When assembling a full system (matrix + vector), it is more efficient to +# overlap communications the assembly of the matrix and the vector. +# Not only it is faster, but also necessary to ensure identical ghost indices +# in both the matrix and vector rows. +# This is done by using the following specializations: + +function Arrays.nz_allocation( + a::PSparseMatrixCounter, b::PVectorCounter +) + return nz_allocation(a), nz_allocation(b) +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:LocallyAssembled}, + b::PVectorAllocation{<:LocallyAssembled} +) + function callback(rows) + values = local_views(b) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,rows) + end + A, B = create_from_nz_locally_assembled(a,callback) + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:Assembled}, + b::PVectorAllocation{<:Assembled} +) + function callback(rows) + new_indices = collect_touched_ids(b) + values = map(ai -> ai.values, local_views(b)) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,new_indices) + end + function async_callback(b) + assemble!(b) + end + A, B = create_from_nz_assembled(a,callback,async_callback) + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:SubAssembled}, + b::PVectorAllocation{<:SubAssembled} +) + function callback(rows) + @check PArrays.matching_local_indices(PRange(rows),axes(b,1)) + values = local_views(b) + PVector(values,partition(axes(b,1))) + end + A, B = create_from_nz_subassembled(a,callback) + return A, B +end + +# Low-level assembly methods + +function create_from_nz_locally_assembled( + a, + callback::Function = rows -> nothing +) + I,J,V = get_allocations(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) + + rows = map(remove_ghost,map(unpermute,test_ids)) + b = callback(rows) + + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) + + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) + + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) + + assembled = true + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,assembled) + + return A, b +end + +function create_from_nz_assembled( + a, + callback::Function = rows -> nothing, + async_callback::Function = b -> empty_async_task +) + I,J,V = get_allocations(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) + + # convert I and J to global dof ids + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) + + # Overlapped COO communication and vector assembly + rows = filter_and_replace_ghost(map(unpermute,test_ids),I) + t = PartitionedArrays.assemble_coo!(I,J,V,rows) + b = callback(rows) + wait(t) + + # Overlap rhs communications with CSC compression + t2 = 
async_callback(b) + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) + + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) + + assembled = true + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,assembled) + + wait(t2) + return A, b +end + +function create_from_nz_subassembled( + a, + callback::Function = rows -> nothing, +) + rows = partition(axes(a,1)) + cols = partition(axes(a,2)) + + b = callback(rows) + + assembled = false + values = map(create_from_nz,local_views(a)) + A = PSparseMatrix(values,rows,cols,assembled) + + return A, b +end diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl index 41e3f6d..942a7c2 100644 --- a/src/BlockPartitionedArrays.jl +++ b/src/BlockPartitionedArrays.jl @@ -271,8 +271,8 @@ function PartitionedArrays.partition(a::BlockPArray) return map(mortar,vals) end -function PartitionedArrays.to_trivial_partition(a::BlockPArray) - vals = map(PartitionedArrays.to_trivial_partition,blocks(a)) +function PartitionedArrays.centralize(a::BlockPArray) + vals = map(PartitionedArrays.centralize,blocks(a)) return mortar(vals) end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 319e218..f68b747 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -56,36 +56,25 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va return free_values, dirichlet_values end -function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange) - map(cell_wise_vector, - dof_wise_vector, - cell_to_ldofs, - partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices +function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) + map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = cwv.ptrs - data = cwv.data - cell_own_to_local = own_to_local(indices) - for cell in cell_own_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) - p = ptrs[cell]-1 + p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) if ldof > 0 - data[i+p] = dwv[ldof] + cwv.data[i+p] = dwv[ldof] end end end end end -function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range) - map(dof_wise_vector, - cell_wise_vector, - cell_to_ldofs, - partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices +function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) + map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) - for cell in cell_ghost_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) @@ -97,28 +86,19 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c end end -function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange) - cwv=map(cell_to_ldofs) do cell_to_ldofs - cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = Vector{Int32}(undef,ncells+1) - for cell in 1:ncells - ldofs = getindex!(cache,cell_to_ldofs,cell) - ptrs[cell+1] = length(ldofs) - end - PArrays.length_to_ptrs!(ptrs) - ndata = ptrs[end]-1 - data = Vector{Int}(undef,ndata) - data .= -1 - JaggedArray(data,ptrs) - end - dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange) - 
cwv +function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_ids) + cwv = map(cell_to_ldofs) do cell_to_ldofs + ptrs = generate_ptrs(cell_to_ldofs) + data = fill(-1,ptrs[end]-1) + JaggedArray(data,ptrs) + end + dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids) + return cwv end function fetch_vector_ghost_values_cache(vector_partition,partition) cache = PArrays.p_vector_cache(vector_partition,partition) - map(reverse,cache) + reverse(cache) end function fetch_vector_ghost_values!(vector_partition,cache) @@ -128,13 +108,15 @@ end function generate_gids( cell_range::PRange, cell_to_ldofs::AbstractArray{<:AbstractArray}, - nldofs::AbstractArray{<:Integer}) + nldofs::AbstractArray{<:Integer} +) + ranks = linear_indices(partition(cell_range)) # Find and count number owned dofs ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs ldof_to_owner = fill(Int32(0),nldofs) - cache = array_cache(cell_to_ldofs) lcell_to_owner = local_to_owner(indices) + cache = array_cache(cell_to_ldofs) for cell in 1:length(cell_to_ldofs) owner = lcell_to_owner[cell] ldofs = getindex!(cache,cell_to_ldofs,cell) @@ -151,102 +133,53 @@ function generate_gids( ldof_to_owner, nodofs end |> tuple_of_arrays - cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner, - cell_to_ldofs, - cell_range) - - # Note1 : this call potentially updates cell_prange with the - # info required to exchange info among nearest neighbours - # so that it can be re-used in the future for other exchanges - - # Note2 : we need to call reverse() as the senders and receivers are - # swapped in the AssemblyCache of partition(cell_range) - - # Exchange the dof owners - cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) - fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_owner, - cell_ldofs_to_part, - cell_to_ldofs, - cell_range) - # Find the global range of owned dofs first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs))) + + # Exchange the dof owners. Cell owner always has correct dof owner. 
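+  # The exchange rides on consistent!: packing the cell-wise data into a PVector
+  # over partition(cell_range) pulls each ghost cell's entries from that cell's
+  # owner, replacing the old fetch_vector_ghost_values! reversed-cache mechanism.
+  # Only owned cells are packed (own_to_local) and only ghost cells are read back
+  # (ghost_to_local), since owned entries are already authoritative.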
+ cell_ldofs_to_owner = dof_wise_to_cell_wise(ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_ldofs_to_owner,partition(cell_range))) |> wait + cell_wise_to_dof_wise!(ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)) - # Distribute gdofs to owned ones - ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices - me = part_id(indices) + # Fill owned gids + ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner + ldof_to_gdof = fill(0,length(ldof_to_owner)) offset = first_gdof-1 - ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner)) odof = 0 for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me + if owner == rank odof += 1 - ldof_to_gdof[ldof] = odof - else - ldof_to_gdof[ldof] = 0 - end - end - for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me - ldof_to_gdof[ldof] += offset + ldof_to_gdof[ldof] = odof + offset end end ldof_to_gdof end - # Create cell-wise global dofs - cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof, - cell_to_ldofs, - cell_range) - - # Exchange the global dofs - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait + # Exchange gids + cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_to_gdofs,partition(cell_range))) |> wait - # Distribute global dof ids also to ghost + # Fill ghost gids with exchanged information map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices - gdof = 0 cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) cell_local_to_owner = local_to_owner(indices) - for cell in cell_ghost_to_local + for cell in ghost_to_local(indices) ldofs = getindex!(cache,cell_to_ldofs,cell) + cell_owner = cell_local_to_owner[cell] p = cell_to_gdofs.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) - if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell] + if ldof > 0 && isequal(ldof_to_owner[ldof],cell_owner) ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p] end end end end - dof_wise_to_cell_wise!(cell_to_gdofs, - ldof_to_gdof, - cell_to_ldofs, - cell_range) - - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_gdof, - cell_to_gdofs, - cell_to_ldofs, - cell_range) - - # Setup DoFs LocalIndices - ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs))) - local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs, - indices, - ldof_to_gdof, - ldof_to_owner - me = part_id(indices) - LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner) - end - - # Setup dof range - dofs_range = PRange(local_indices) - - return dofs_range + # Setup DoFs AbstractLocalIndices + dof_ids = permuted_variable_partition( + nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof + ) + return PRange(dof_ids) end # FEFunction related @@ -571,193 +504,6 @@ function _add_distributed_constraint(F::DistributedFESpace,order::Integer,constr V end -# Assembly - -function FESpaces.collect_cell_matrix( - trial::DistributedFESpace, - test::DistributedFESpace, - a::DistributedDomainContribution) - map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace, a::DistributedDomainContribution) - map(collect_cell_vector,local_views(test),local_views(a)) -end - -function 
FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform)) -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution, - uhd) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform), - local_views(uhd)) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace,l::Number) - map(local_views(test)) do s - collect_cell_vector(s,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number) - map(local_views(trial),local_views(test),local_views(mat)) do u,v,m - collect_cell_matrix_and_vector(u,v,m,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number, - uhd) - map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f - collect_cell_matrix_and_vector(u,v,m,l,f) - end -end -""" -""" -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler - strategy::A - assems::B - matrix_builder::C - vector_builder::D - test_dofs_gids_prange::E - trial_dofs_gids_prange::F -end - -local_views(a::DistributedSparseMatrixAssembler) = a.assems - -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_dofs_gids_prange -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_dofs_gids_prange -FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder -FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder -FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy - -function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows = get_rows(a) - cols = get_cols(a) - map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows = get_rows(a) - cols = get_cols(a) - map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - Aviews=local_views(A,rows,cols) - bviews=local_views(b,rows) - aviews=local_views(a) - map(symbolic_loop_matrix_and_vector!,Aviews,bviews,aviews,data) -end - -function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end - -# Parallel Assembly strategies - -function local_assembly_strategy(::SubAssembledRows,rows,cols) 
-  DefaultAssemblyStrategy()
-end
-
-# When using this one, make sure that you also loop over ghost cells.
-# This is at your own risk.
-function local_assembly_strategy(::FullyAssembledRows,rows,cols)
-  rows_local_to_ghost = local_to_ghost(rows)
-  GenericAssemblyStrategy(
-    identity,
-    identity,
-    row->rows_local_to_ghost[row]==0,
-    col->true
-  )
-end
-
-# Assembler high level constructors
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  rows::PRange,
-  cols::PRange,
-  par_strategy=SubAssembledRows()
-)
-  assems = map(partition(rows),partition(cols)) do rows,cols
-    local_strategy = local_assembly_strategy(par_strategy,rows,cols)
-    FESpaces.GenericSparseMatrixAssembler(
-      SparseMatrixBuilder(local_mat_type),
-      ArrayBuilder(local_vec_type),
-      Base.OneTo(length(rows)),
-      Base.OneTo(length(cols)),
-      local_strategy
-    )
-  end
-  mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
-  vec_builder = PVectorBuilder(local_vec_type,par_strategy)
-  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
-)
-  rows = get_free_dof_ids(test)
-  cols = get_free_dof_ids(trial)
-  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
-)
-  Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial)))
-  T = eltype(Tv)
-  Tm = SparseMatrixCSC{T,Int}
-  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy)
-end
-
 # ZeroMean FESpace
 
 struct DistributedZeroMeanCache{A,B}
   dvol::A
@@ -917,3 +663,174 @@ function FESpaces.ConstantFESpace(
   vector_type = _find_vector_type(spaces,gids)
   return DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
 end
+
+# Assembly
+
+"""
+    DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+
+Assembler for distributed systems. It holds one local assembler per part
+(`assems`), the distributed matrix and vector builders, the parallel assembly
+`strategy`, and the test/trial dof id `PRange`s.
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type),
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+) + rows = get_free_dof_ids(test) + cols = get_free_dof_ids(trial) + SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) +end + +function FESpaces.SparseMatrixAssembler( + trial::DistributedFESpace, + test::DistributedFESpace, + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... +) + Tv = getany(map(get_vector_type,local_views(trial))) + Tm = SparseMatrixCSC{eltype(Tv),Int} + SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) +end + +function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.collect_cell_matrix( + trial::DistributedFESpace, + test::DistributedFESpace, + a::DistributedDomainContribution +) + map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace, a::DistributedDomainContribution +) + map(collect_cell_vector,local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform) + ) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution, + uhd +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform), + local_views(uhd) + ) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace,l::Number +) + map(local_views(test)) do s + collect_cell_vector(s,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number +) + map(local_views(trial),local_views(test),local_views(mat)) do u,v,m + collect_cell_matrix_and_vector(u,v,m,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number, + uhd +) + map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do 
u,v,m,f
+    collect_cell_matrix_and_vector(u,v,m,l,f)
+  end
+end
diff --git a/src/Geometry.jl b/src/Geometry.jl
index 57ec00a..f9ae43c 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -633,7 +633,7 @@ function filter_cells_when_needed(
 end
 
 function filter_cells_when_needed(
-  portion::FullyAssembledRows,
+  portion::LocallyAssembled,
   cell_gids::AbstractLocalIndices,
   trian::Triangulation)
 
@@ -641,7 +641,7 @@ function filter_cells_when_needed(
 end
 
 function filter_cells_when_needed(
-  portion::SubAssembledRows,
+  portion::Assembled,
   cell_gids::AbstractLocalIndices,
   trian::Triangulation)
 
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index 3e1832f..ed67a55 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -32,8 +32,8 @@ import Base: inv, abs, abs2, *, +, -, /, adjoint, transpose, real, imag, conj, g
 import Gridap.Fields: grad2curl
 import Gridap.CellData: Interpolable
 
-export FullyAssembledRows
-export SubAssembledRows
+export LocallyAssembled
+export Assembled
 
 export get_cell_gids
 export get_face_gids
@@ -45,8 +45,13 @@ export redistribute
 
 include("BlockPartitionedArrays.jl")
 
+include("GridapExtras.jl")
+include("PArraysExtras.jl")
+
 include("Algebra.jl")
 
+include("Assembly.jl")
+
 include("Geometry.jl")
 
 include("CellData.jl")
diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl
new file mode 100644
index 0000000..e072318
--- /dev/null
+++ b/src/GridapExtras.jl
@@ -0,0 +1,68 @@
+
+# Extensions to Gridap/Arrays/Tables.jl
+
+function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T
+  ptrs = Vector{Int32}(undef,length(vv)+1)
+  Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv)
+  Arrays.length_to_ptrs!(ptrs)
+  return ptrs
+end
+
+# New type of Vector allocation
+
+"""
+    TrackedArrayAllocation{T}
+
+Array that keeps track of which entries have been touched, for assembly purposes.
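+
+A minimal usage sketch (relying on the `add_entry!`/`create_from_nz` methods
+defined below):
+
+    a = TrackedArrayAllocation(zeros(4))
+    Arrays.add_entry!(+,a,1.0,2)       # accumulates into values[2] and marks it touched
+    Arrays.add_entry!(+,a,nothing,3)   # only marks values[3] as touched
+    Algebra.create_from_nz(a)          # returns the values vector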
+""" +struct TrackedArrayAllocation{T} + values :: Vector{T} + touched :: Vector{Bool} +end + +function TrackedArrayAllocation(values) + touched = fill(false,length(values)) + TrackedArrayAllocation(values,touched) +end + +Algebra.LoopStyle(::Type{<:TrackedArrayAllocation}) = Algebra.Loop() +Algebra.create_from_nz(a::TrackedArrayAllocation) = a.values + +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + if i > 0 + a.touched[i] = true + end + nothing +end +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v,i) + if i > 0 + a.touched[i] = true + a.values[i] = combine(v,a.values[i]) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + for ie in i + Arrays.add_entry!(combine,a,nothing,ie) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v,i) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(combine,a,ve,ie) + end + nothing +end + +# change_axes + +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end diff --git a/src/MultiField.jl b/src/MultiField.jl index 5fa989c..b0129ba 100644 --- a/src/MultiField.jl +++ b/src/MultiField.jl @@ -454,7 +454,7 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, - par_strategy=SubAssembledRows()) where {NB,SB,P} + par_strategy=Assembled()) where {NB,SB,P} block_idx = CartesianIndices((NB,NB)) block_rows = blocks(test.gids) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl new file mode 100644 index 0000000..302944a --- /dev/null +++ b/src/PArraysExtras.jl @@ -0,0 +1,400 @@ + +""" + permuted_variable_partition(n_own,gids,owners; n_global, start) + +Create indices which are a permuted version of a variable_partition. +The advantage of this w.r.t. the `LocalIndices` type, is that we can compute +dof owners with minimal communications, since we only need the size of the blocks. + +NOTE: This is the type for our FESpace dof_ids. 
+""" +function permuted_variable_partition( + n_own::AbstractArray{<:Integer}, + gids::AbstractArray{<:AbstractArray{<:Integer}}, + owners::AbstractArray{<:AbstractArray{<:Integer}}; + n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), + start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +) + ranks = linear_indices(n_own) + np = length(ranks) + map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners + n_local = length(gids) + n_ghost = n_local - n_own + perm = fill(zero(Int32),n_local) + ghost_gids = fill(zero(Int),n_ghost) + ghost_owners = fill(zero(Int32),n_ghost) + + n_ghost = 0 + for (lid,(gid,owner)) in enumerate(zip(gids,owners)) + if owner == rank + perm[lid] = gid-start+1 + else + n_ghost += 1 + ghost_gids[n_ghost] = gid + ghost_owners[n_ghost] = owner + perm[lid] = n_own + n_ghost + end + end + @assert n_ghost == n_local - n_own + + ghost = GhostIndices(n_global,ghost_gids,ghost_owners) + dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( + CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost + ) + permute_indices(dof_ids,perm) + end +end + +""" + unpermute(indices::AbstractLocalIndices) + +Given local indices, reorders them to (locally) have own indices first, +followed by ghost indices. +""" +function unpermute(indices::AbstractLocalIndices) + @notimplemented +end + +unpermute(indices::PermutedLocalIndices) = unpermute(indices.indices) +unpermute(indices::PartitionedArrays.LocalIndicesInBlockPartition) = indices +unpermute(indices::OwnAndGhostIndices) = indices + +function unpermute(indices::LocalIndices) + nglobal = global_length(indices) + rank = part_id(indices) + own = OwnIndices(nglobal,rank,own_to_global(indices)) + ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices)) + OwnAndGhostIndices(own,ghost,global_to_owner(indices)) +end + +""" + locally_repartition(v::PVector,new_indices) + +Map the values of a PVector to a new partitioning of the indices. + +Similar to `PartitionedArrays.repartition`, but without any communications. Instead, +it is assumed that the local-to-local mapping can be done locally. +""" +function locally_repartition(v::PVector,new_indices) + w = similar(v,PRange(new_indices)) + locally_repartition!(w,v) +end + +function locally_repartition!(w::PVector,v::PVector) + # Fill own values + map(copy!,own_values(w),own_values(v)) + + # Fill ghost values + new_indices = partition(axes(w,1)) + old_indices = partition(axes(v,1)) + map(partition(w),partition(v),new_indices,old_indices) do w,v,new_ids,old_ids + old_gid_to_lid = global_to_local(old_ids) + for (lid,gid) in zip(ghost_to_local(new_ids),ghost_to_global(new_ids)) + old_lid = old_gid_to_lid[gid] + w[lid] = v[old_lid] + end + end + + return w +end + +""" + filter_and_replace_ghost(indices,gids) + +Replace ghost ids in `indices` with the ghost ids within `gids`. + +NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether +they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we +would need. TODO: Make me better. +""" +function filter_and_replace_ghost(indices,gids) + owners = find_owner(indices,gids) + new_indices = map(indices,gids,owners) do indices, gids, owners + ghost_gids, ghost_owners = _filter_ghost(indices,gids,owners) + replace_ghost(indices, ghost_gids, ghost_owners) + end + return new_indices +end + +# Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that +# belong to `indices`. 
This could eventually be a flag in the original function. +function _filter_ghost(indices,gids,owners) + ghosts = Set{Int}() + part_owner = part_id(indices) + + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + push!(ghosts,gid) + end + end + + new_ghost_to_global = zeros(Int,n_ghost) + new_ghost_to_owner = zeros(Int32,n_ghost) + + empty!(ghosts) + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + new_ghost_to_global[n_ghost] = gid + new_ghost_to_owner[n_ghost] = owner + push!(ghosts,gid) + end + end + + return new_ghost_to_global, new_ghost_to_owner +end + +# function PArrays.remove_ghost(indices::PermutedLocalIndices) +# n_global = global_length(indices) +# own = OwnIndices(n_global,part_id(indices),own_to_global(indices)) +# ghost = GhostIndices(n_global,Int[],Int32[]) +# OwnAndGhostIndices(own,ghost) +# end + +function PArrays.remove_ghost(indices::PermutedLocalIndices) + perm = indices.perm + own_to_local = indices.own_to_local + display(own_to_local) + println(issorted(own_to_local)) + display(perm) + println("-------------------------------") + remove_ghost(indices.indices) +end + +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. +# Note that the haskey check is necessary because in the general case +# there might be gids in b which are not present in a +function local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) + a_lid_to_b_lid = fill(zero(Int32),local_length(a)) + + b_local_to_global = local_to_global(b) + a_global_to_local = global_to_local(a) + for b_lid in 1:local_length(b) + gid = b_local_to_global[b_lid] + a_lid = a_global_to_local[gid] + if !iszero(a_lid) + a_lid_to_b_lid[a_lid] = b_lid + end + end + a_lid_to_b_lid +end + +# SubSparseMatrix extensions + +function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) + I,J,V = findnz(A.parent) + rowmap, colmap = A.inv_indices + for k in eachindex(I) + I[k] = rowmap[I[k]] + J[k] = colmap[J[k]] + end + mask = map((i,j) -> (i > 0 && j > 0), I, J) + return I[mask], J[mask], V[mask] +end + +# Async tasks + +const empty_async_task = PartitionedArrays.FakeTask(() -> nothing) + +# Linear algebra + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1))) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... 
+ @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# Array of PArrays -> PArray of Arrays +# TODO: I think this is now implemented in PartitionedArrays.jl (check) +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# To local/to global for blocks + +function to_local_indices!(I,ids::PRange;kwargs...) + map(to_local!,I,partition(ids)) +end + +function to_global_indices!(I,ids::PRange;kwargs...) + map(to_global!,I,partition(ids)) +end +for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] + @eval begin + function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) + map($f,I,ids) + end + + function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) + @check ax ∈ [:rows,:cols] + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if ax == :rows + $f(I[i,j],ids[i]) + else + $f(I[i,j],ids[j]) + end + end + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +# Communication extras, subpartitioning extras + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end diff --git a/src/_Algebra_old.jl b/src/_Algebra_old.jl new file mode 100644 index 0000000..e5aefd6 --- /dev/null +++ b/src/_Algebra_old.jl @@ -0,0 +1,1238 @@ + +# Vector allocation + +function Algebra.allocate_vector(::Type{<:PVector{V}},ids::PRange) where {V} + PVector{V}(undef,partition(ids)) +end + +function Algebra.allocate_vector(::Type{<:BlockPVector{V}},ids::BlockPRange) where {V} + BlockPVector{V}(undef,ids) +end + +function Algebra.allocate_in_range(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(PVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(PVector{V},matrix) +end + +function Algebra.allocate_in_range(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(BlockPVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(BlockPVector{V},matrix) +end + +# PartitionedArrays extras + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check partition(axes(x,1)) === partition(axes(y,1)) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# TODO: Can we remove this? 
+# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.ArrayCounter,axes) + @notimplemented +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end + +# Array of PArrays -> PArray of Arrays +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end + +# local_views + +function local_views(a) + @abstractmethod +end + +function get_parts(a) + return linear_indices(local_views(a)) +end + +function local_views(a::AbstractVector,rows) + @notimplemented +end + +function local_views(a::AbstractMatrix,rows,cols) + @notimplemented +end + +local_views(a::AbstractArray) = a +local_views(a::PRange) = partition(a) +local_views(a::PVector) = partition(a) +local_views(a::PSparseMatrix) = partition(a) + +function local_views(a::BlockPRange) + map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays +end + +function local_views(a::BlockPArray) + vals = map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# change_ghost + +function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T + same_partition = (a.index_partition === partition(ids)) + a_new = same_partition ? a : change_ghost(T,a,ids) + if make_consistent && (!same_partition || !is_consistent) + consistent!(a_new) |> wait + end + return a_new +end + +function change_ghost(::Type{<:AbstractVector},a::PVector,ids::PRange) + a_new = similar(a,eltype(a),(ids,)) + # Equivalent to copy!(a_new,a) but does not check that owned indices match + map(copy!,own_values(a_new),own_values(a)) + return a_new +end + +function change_ghost(::Type{<:OwnAndGhostVectors},a::PVector,ids::PRange) + values = map(own_values(a),partition(ids)) do own_vals,ids + ghost_vals = fill(zero(eltype(a)),ghost_length(ids)) + perm = PartitionedArrays.local_permutation(ids) + OwnAndGhostVectors(own_vals,ghost_vals,perm) + end + return PVector(values,partition(ids)) +end + +function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_consistent=false) + vals = map(blocks(a),blocks(ids)) do a, ids + change_ghost(a,ids;is_consistent=is_consistent,make_consistent=make_consistent) + end + return BlockPVector(vals,ids) +end + +# TODO: Can we remove this? +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case +# there might be gids in b which are not present in a +function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) + a_local_to_b_local = fill(Int32(-1),local_length(a)) + b_local_to_global = local_to_global(b) + a_global_to_local = global_to_local(a) + for blid in 1:local_length(b) + gid = b_local_to_global[blid] + if a_global_to_local[gid] != zero(eltype(a_global_to_local)) + alid = a_global_to_local[gid] + a_local_to_b_local[alid] = blid + end + end + a_local_to_b_local +end + +# TODO: Can we remove this? +# This type is required in order to be able to access the local portion +# of distributed sparse matrices and vectors using local indices from the +# distributed test and trial spaces +struct LocalView{T,N,A,B} <:AbstractArray{T,N} + plids_to_value::A + d_to_lid_to_plid::B + local_size::NTuple{N,Int} + function LocalView( + plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} + A = typeof(plids_to_value) + B = typeof(d_to_lid_to_plid) + local_size = map(length,d_to_lid_to_plid) + new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) + end +end + +Base.size(a::LocalView) = a.local_size +Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() +function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + if all(i->i>0,plids) + a.plids_to_value[plids...] + else + zero(T) + end +end +function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" + a.plids_to_value[plids...] = v +end + +function _lid_to_plid(lid,lid_to_plid) + plid = lid_to_plid[lid] + plid +end + +function local_views(a::PVector,new_rows::PRange) + old_rows = axes(a,1) + if partition(old_rows) === partition(new_rows) + partition(a) + else + map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows + LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) + end + end +end + +function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) + old_rows, old_cols = axes(a) + if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) + partition(a) + else + map(partition(a), + partition(old_rows),partition(old_cols), + partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols + rl2lmap = find_local_to_local_map(new_rows,old_rows) + cl2lmap = find_local_to_local_map(new_cols,old_cols) + LocalView(matrix_partition,(rl2lmap,cl2lmap)) + end + end +end + +function local_views(a::BlockPVector,new_rows::BlockPRange) + vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays + return map(mortar,vals) +end + +function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) + vals = map(CartesianIndices(blocksize(a))) do I + local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# PSparseMatrix assembly + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +struct PSparseMatrixBuilderCOO{T,B} + local_matrix_type::Type{T} + par_strategy::B +end + +function Algebra.nz_counter( + builder::PSparseMatrixBuilderCOO{A}, 
axs::Tuple{<:PRange,<:PRange} +) where A + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do r,c + axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) + Algebra.CounterCOO{A}(axs) + end + DistributedCounterCOO(builder.par_strategy,counters,rows,cols) +end + +function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv + T = eltype(Tv) + return PSparseMatrix{T} +end + + +""" +""" +struct DistributedCounterCOO{A,B,C,D} <: GridapType + par_strategy::A + counters ::B + test_gids ::C + trial_gids ::D + function DistributedCounterCOO( + par_strategy, + counters::AbstractArray{<:Algebra.CounterCOO}, + test_gids::PRange, + trial_gids::PRange + ) + A = typeof(par_strategy) + B = typeof(counters) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) + end +end + +function local_views(a::DistributedCounterCOO) + return a.counters +end + +function local_views(a::DistributedCounterCOO,test_gids,trial_gids) + @check PArrays.matching_local_indices(test_gids,a.test_gids) + @check PArrays.matching_local_indices(trial_gids,a.trial_gids) + return a.counters +end + +function Algebra.nz_allocation(a::DistributedCounterCOO) + allocs = map(nz_allocation,a.counters) # Array{AllocationCOO} + DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) +end + +struct DistributedAllocationCOO{A,B,C,D} <: GridapType + par_strategy::A + allocs::B + test_gids::C + trial_gids::D + function DistributedAllocationCOO( + par_strategy, + allocs::AbstractArray{<:Algebra.AllocationCOO}, + test_gids::PRange, + trial_gids::PRange) + A = typeof(par_strategy) + B = typeof(allocs) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) + end +end + +function change_axes( + a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, + axes::Tuple{<:PRange,<:PRange} +) where {A,B} + local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols + (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) +end + +function change_axes( + a::MatrixBlock{<:DistributedAllocationCOO}, + axes::Tuple{<:Vector,<:Vector} +) + block_ids = CartesianIndices(a.array) + rows, cols = axes + array = map(block_ids) do I + change_axes(a[I],(rows[I[1]],cols[I[2]])) + end + return ArrayBlock(array,a.touched) +end + +function local_views(a::DistributedAllocationCOO) + a.allocs +end + +function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) + @check test_gids === a.test_gids + @check trial_gids === a.trial_gids + a.allocs +end + +function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) + array = map(local_views,a.array) |> to_parray_of_arrays + return map(ai -> ArrayBlock(ai,a.touched),array) +end + +function get_allocations(a::DistributedAllocationCOO) + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + +function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) + tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays + return tuple_of_array_of_parrays +end + +get_test_gids(a::DistributedAllocationCOO) = a.test_gids +get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids +get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) +get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = 
map(get_trial_gids,diag(a.array)) + +function Algebra.create_from_nz(a::PSparseMatrix) + # For LocallyAssembled the underlying Exchanger should + # not have ghost layer making assemble! do nothing (TODO check) + assemble!(a) |> wait + a +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:LocallyAssembled}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:LocallyAssembled}}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function _fa_create_from_nz_with_callback(callback,a) + + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) + b = callback(rows) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the range for cols + cols = _setup_prange(trial_gids,J;ax=:cols) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(I,J,V,rows,cols) + return A, b +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:Assembled}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:Assembled}}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function _sa_create_from_nz_with_callback(callback,async_callback,a,b) + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the Prange for the rows + rows = _setup_prange(test_gids,I;ax=:rows) + + # Move (I,J,V) triplets to the owner process of each row I. 
+  # The triplets are accompanied with Jo, which holds the owner process of each column
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  t = _assemble_coo!(I,J,V,rows;owners=Jo)
+
+  # Here we can overlap computations. This is a good place to do so,
+  # since sending the matrix rows involves a lot of data.
+  if !isa(b,Nothing)
+    bprange = _setup_prange_from_pvector_allocation(b)
+    b = callback(bprange)
+  end
+
+  # Wait for the transfer to finish
+  wait(t)
+
+  # Create the PRange for the cols
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+
+  # Convert I,J back to local numbering
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(a,I,J,V,rows,cols)
+
+  # Wait for the transfer to finish
+  if !isa(t2,Nothing)
+    wait(t2)
+  end
+
+  return A, b
+end
+
+# PVector assembly
+
+struct PVectorBuilder{T,B}
+  local_vector_type::Type{T}
+  par_strategy::B
+end
+
+function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange})
+  T = builder.local_vector_type
+  rows, = axs
+  counters = map(partition(rows)) do rows
+    axs = (Base.OneTo(local_length(rows)),)
+    nz_counter(ArrayBuilder(T),axs)
+  end
+  PVectorCounter(builder.par_strategy,counters,rows)
+end
+
+function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
+  T = eltype(Tv)
+  return PVector{T}
+end
+
+struct PVectorCounter{A,B,C}
+  par_strategy::A
+  counters::B
+  test_gids::C
+end
+
+Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
+
+function local_views(a::PVectorCounter)
+  a.counters
+end
+
+function local_views(a::PVectorCounter,rows)
+  @check rows === a.test_gids
+  a.counters
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled})
+  dofs = a.test_gids
+  values = map(nz_allocation,a.counters)
+  PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
+end
+
+struct PVectorAllocationTrackOnlyValues{A,B,C}
+  par_strategy::A
+  values::B
+  test_gids::C
+end
+
+function local_views(a::PVectorAllocationTrackOnlyValues)
+  a.values
+end
+
+function local_views(a::PVectorAllocationTrackOnlyValues,rows)
+  @check rows === a.test_gids
+  a.values
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:LocallyAssembled})
+  rows = _setup_prange_without_ghosts(a.test_gids)
+  _rhs_callback(a,rows)
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:Assembled})
+  # This point MUST NEVER be reached.
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function _rhs_callback(row_partitioned_vector_partition,rows) + # The ghost values in row_partitioned_vector_partition are + # aligned with the FESpace but not with the ghost values in the rows of A + b_fespace = PVector(row_partitioned_vector_partition.values, + partition(row_partitioned_vector_partition.test_gids)) + + # This one is aligned with the rows of A + b = similar(b_fespace,eltype(b_fespace),(rows,)) + + # First transfer owned values + b .= b_fespace + + # Now transfer ghost + function transfer_ghost(b,b_fespace,ids,ids_fespace) + num_ghosts_vec = ghost_length(ids) + gho_to_loc_vec = ghost_to_local(ids) + loc_to_glo_vec = local_to_global(ids) + gid_to_lid_fe = global_to_local(ids_fespace) + for ghost_lid_vec in 1:num_ghosts_vec + lid_vec = gho_to_loc_vec[ghost_lid_vec] + gid = loc_to_glo_vec[lid_vec] + lid_fespace = gid_to_lid_fe[gid] + b[lid_vec] = b_fespace[lid_fespace] + end + end + map( + transfer_ghost, + partition(b), + partition(b_fespace), + b.index_partition, + b_fespace.index_partition + ) + + return b +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:LocallyAssembled}, + c_fespace::PVectorAllocationTrackOnlyValues{<:LocallyAssembled} +) + + function callback(rows) + _rhs_callback(c_fespace,rows) + end + + A,b = _fa_create_from_nz_with_callback(callback,a) + return A,b +end + +struct PVectorAllocationTrackTouchedAndValues{A,B,C} + allocations::A + values::B + test_gids::C +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:Assembled}, + b::PVectorAllocationTrackTouchedAndValues +) + + function callback(rows) + _rhs_callback(b,rows) + end + + function async_callback(b) + # now we can assemble contributions + assemble!(b) + end + + A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + return A,b +end + +struct ArrayAllocationTrackTouchedAndValues{A} + touched::Vector{Bool} + values::A +end + +Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() + +function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) + @check PArrays.matching_local_indices(rows, a.test_gids) + a.allocations +end + +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if i > 0 + a.touched[i]=true + if !isnothing(v) + a.values[i]=c(v,a.values[i]) + end + end + nothing +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if !isa(v,Nothing) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(c,a,ve,ie) + end + else + for ie in i + Arrays.add_entry!(c,a,nothing,ie) + end + end + nothing +end + + +function _setup_touched_and_allocations_arrays(values) + touched = map(values) do values + fill!(Vector{Bool}(undef,length(values)),false) + end + allocations = map(values,touched) do values,touched + ArrayAllocationTrackTouchedAndValues(touched,values) + end + touched, allocations +end + +function Arrays.nz_allocation( + a::DistributedCounterCOO{<:Assembled}, + b::PVectorCounter{<:Assembled} +) + A = nz_allocation(a) + dofs = b.test_gids + values = map(nz_allocation,b.counters) + 
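+  # Wrap the local values in allocations that also record which entries are
+  # touched. The touched flags are used later to discover the ghost rows when
+  # building the row PRange (see _setup_prange_from_pvector_allocation).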
touched, allocations = _setup_touched_and_allocations_arrays(values) + B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) + return A, B +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + dofs = a.test_gids + values = map(nz_allocation,a.counters) + touched,allocations=_setup_touched_and_allocations_arrays(values) + return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) +end + +function local_views(a::PVectorAllocationTrackTouchedAndValues) + a.allocations +end + +function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) + + # Find the ghost rows + allocations = local_views(a.allocations) + indices = partition(a.test_gids) + I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices + dofs_lids_touched = findall(allocation.touched) + loc_to_gho = local_to_ghost(indices) + n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) + I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) + cur = 1 + for lid in dofs_lids_touched + dof_lid = loc_to_gho[lid] + if dof_lid != 0 + I_ghost_lids[cur] = dof_lid + cur = cur+1 + end + end + I_ghost_lids + end + + ghost_to_global, ghost_to_owner = map( + find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays + + ngids = length(a.test_gids) + _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) + rows = _setup_prange_from_pvector_allocation(a) + b = _rhs_callback(a,rows) + t2 = assemble!(b) + # Wait the transfer to finish + if t2 !== nothing + wait(t2) + end + return b +end + +# Common Assembly Utilities +function first_gdof_from_ids(ids) + if own_length(ids) == 0 + return 1 + end + lid_to_gid = local_to_global(ids) + owned_to_lid = own_to_local(ids) + return Int(lid_to_gid[first(owned_to_lid)]) +end + +function find_gid_and_owner(ighost_to_jghost,jindices) + jghost_to_local = ghost_to_local(jindices) + jlocal_to_global = local_to_global(jindices) + jlocal_to_owner = local_to_owner(jindices) + ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) + + ighost_to_global = jlocal_to_global[ighost_to_jlocal] + ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] + return ighost_to_global, ighost_to_owner +end + +# The given ids are assumed to be a sub-set of the lids +function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) + glo_to_loc = global_to_local(a) + loc_to_gho = local_to_ghost(a) + + # First pass: Allocate + i = 0 + ghost_lids_touched = fill(false,ghost_length(a)) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + i += 1 + end + end + gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) + + # Second pass: fill + i = 1 + fill!(ghost_lids_touched,false) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + gids_ghost_lid_to_ghost_lid[i] = ghost_lid + i += 1 + end + end + + return gids_ghost_lid_to_ghost_lid +end + +# Find the neighbours of partition1 trying +# to use those in partition2 as a hint +function _find_neighbours!(partition1, partition2) + partition2_snd, partition2_rcv = assembly_neighbors(partition2) + partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv) + return assembly_neighbors(partition1; neighbors=partition2_graph) +end + +# to_global! 
& to_local! analogs, needed for dispatching in block assembly + +function to_local_indices!(I,ids::PRange;kwargs...) + map(to_local!,I,partition(ids)) +end + +function to_global_indices!(I,ids::PRange;kwargs...) + map(to_global!,I,partition(ids)) +end + +function get_gid_owners(I,ids::PRange;kwargs...) + map(I,partition(ids)) do I, indices + glo_to_loc = global_to_local(indices) + loc_to_own = local_to_owner(indices) + map(x->loc_to_own[glo_to_loc[x]], I) + end +end + +for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] + @eval begin + function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) + map($f,I,ids) + end + + function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) + @check ax ∈ [:rows,:cols] + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if ax == :rows + $f(I[i,j],ids[i]) + else + $f(I[i,j],ids[j]) + end + end + end + end +end + + +function _setup_matrix( + a,I,J,V,rows::PRange,cols::PRange +) + _rows = PRange(map(remove_ghost,partition(rows))) + asys = change_axes(a,(_rows,cols)) + values = map(create_from_nz,local_views(asys)) + PSparseMatrix(values,partition(_rows),partition(cols),true) +end + +function _setup_matrix(a,I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) + block_ids = CartesianIndices(I) + block_mats = map(block_ids) do id + i = id[1]; j = id[2]; + _setup_matrix(a[i,j],I[i,j],J[i,j],V[i,j],rows[i],cols[j]) + end + return mortar(block_mats) +end + +# _assemble_coo! : local coo triplets + global PRange -> Global coo values + +function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) + if isa(owners,Nothing) + PArrays.assemble_coo!(I,J,V,partition(rows)) + else + assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) + end +end + +function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if isa(owners,Nothing) + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) + else + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) + end + end +end + +function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) + """ + Returns three JaggedArrays with the coo triplets + to be sent to the corresponding owner parts in parts_snd + """ + function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) + global_to_local_row = global_to_local(row_lids) + local_row_to_owner = local_to_owner(row_lids) + owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) + ptrs = zeros(Int32,length(parts_snd)+1) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + ptrs[owner_to_i[owner]+1] +=1 + end + end + PArrays.length_to_ptrs!(ptrs) + gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) + gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) + jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) + v_snd_data = zeros(eltype(k_v),ptrs[end]-1) + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + gj = k_gj[k] + v = k_v[k] + p = ptrs[owner_to_i[owner]] + gi_snd_data[p] = gi + gj_snd_data[p] = gj + jo_snd_data[p] = k_jo[k] + v_snd_data[p] = v + k_v[k] = zero(v) + ptrs[owner_to_i[owner]] += 1 + end + end + PArrays.rewind_ptrs!(ptrs) + gi_snd = JaggedArray(gi_snd_data,ptrs) + gj_snd = JaggedArray(gj_snd_data,ptrs) + jo_snd = JaggedArray(jo_snd_data,ptrs) + v_snd = JaggedArray(v_snd_data,ptrs) + gi_snd, gj_snd, jo_snd, v_snd + 
end + """ + Pushes to coo_entries_with_column_owner the tuples + gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes + """ + function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + current_n = length(k_gi) + new_n = current_n + length(gi_rcv.data) + resize!(k_gi,new_n) + resize!(k_gj,new_n) + resize!(k_jo,new_n) + resize!(k_v,new_n) + for p in 1:length(gi_rcv.data) + k_gi[current_n+p] = gi_rcv.data[p] + k_gj[current_n+p] = gj_rcv.data[p] + k_jo[current_n+p] = jo_rcv.data[p] + k_v[current_n+p] = v_rcv.data[p] + end + end + part = linear_indices(row_partition) + parts_snd, parts_rcv = assembly_neighbors(row_partition) + coo_entries_with_column_owner = map(tuple,I,J,Jown,V) + gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays + graph = ExchangeGraph(parts_snd,parts_rcv) + t1 = exchange(gi_snd,graph) + t2 = exchange(gj_snd,graph) + t3 = exchange(jo_snd,graph) + t4 = exchange(v_snd,graph) + @async begin + gi_rcv = fetch(t1) + gj_rcv = fetch(t2) + jo_rcv = fetch(t3) + v_rcv = fetch(t4) + map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + I,J,Jown,V + end +end + +# dofs_gids_prange can be either test_gids or trial_gids +# In the former case, gids is a vector of global test dof identifiers, while in the +# latter, a vector of global trial dof identifiers +function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) + if !ghost + _setup_prange_without_ghosts(dofs_gids_prange) + elseif isa(owners,Nothing) + _setup_prange_with_ghosts(dofs_gids_prange,gids) + else + _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) + end +end + +function _setup_prange( + dofs_gids_prange::AbstractVector{<:PRange}, + gids::AbstractMatrix; + ax=:rows,ghost=true,owners=nothing +) + @check ax ∈ (:rows,:cols) + block_ids = LinearIndices(dofs_gids_prange) + pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) + + gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange + gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] + gids_union = pvcat(gids_slice) + + owners_union = nothing + if !isnothing(owners) + owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] + owners_union = pvcat(owners_slice) + end + + return gids_union, owners_union + end |> tuple_of_arrays + + return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) +end + +# Create PRange for the rows of the linear system +# without local ghost dofs as per required in the +# LocallyAssembled() parallel assembly strategy +function _setup_prange_without_ghosts(dofs_gids_prange::PRange) + ngdofs = length(dofs_gids_prange) + indices = map(partition(dofs_gids_prange)) do dofs_indices + owner = part_id(dofs_indices) + own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) + ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) + OwnAndGhostIndices(own_indices,ghost_indices) + end + return PRange(indices) +end + +# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition +# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
+# The precondition required for the consistency of any parallel assembly process in GridapDistributed
+# is that each processor can determine locally, with a single layer of ghost cells, the global indices and
+# associated processor owners of the rows that it touches after assembly of integration terms posed on
+# locally-owned entities (i.e., either cells or faces).
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> dof PRange ghost ids
+  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition
+  ) |> tuple_of_arrays
+
+  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
+end
+
+# Here we cannot assume that the sparse communication graph underlying
+# trial_dofs_gids_partition is a superset of the one underlying indices.
+# We should check whether it is included and call _find_neighbours!()
+# if this is the case. At present, we are not taking advantage of this,
+# but let the scalable parallel algorithm that computes the reciprocal do the job.
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    gids,owners,dofs_gids_partition) do gids, owners, indices
+    ghost_touched = Dict{Int,Bool}()
+    ghost_to_global = Int64[]
+    ghost_to_owner = Int64[]
+    me = part_id(indices)
+    for (j,jo) in zip(gids,owners)
+      if jo != me
+        if !haskey(ghost_touched,j)
+          push!(ghost_to_global,j)
+          push!(ghost_to_owner,jo)
+          ghost_touched[j] = true
+        end
+      end
+    end
+    ghost_to_global, ghost_to_owner
+  end |> tuple_of_arrays
+
+  return _setup_prange_impl_(
+    ngdofs,
+    dofs_gids_partition,
+    gids_ghost_to_global,
+    gids_ghost_to_owner;
+    discover_neighbours=false
+  )
+end
+
+function _setup_prange_impl_(
+  ngdofs,
+  dofs_gids_partition,
+  gids_ghost_to_global,
+  gids_ghost_to_owner;
+  discover_neighbours=true
+)
+  indices = map(dofs_gids_partition,
+                gids_ghost_to_global,
+                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner
+    owner = part_id(dofs_indices)
+    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
+    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
+    OwnAndGhostIndices(own_indices,ghost_indices)
+  end
+  if discover_neighbours
+    _find_neighbours!(indices,dofs_gids_partition)
+  end
+  return PRange(indices)
+end
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index 3e81008..5ec6579 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -76,7 +76,7 @@ t = assemble!(__v)
 assemble!(__m) |> wait
 fetch(t);
 
-PartitionedArrays.to_trivial_partition(m)
+PartitionedArrays.centralize(m)
 
 partition(v)
 partition(m)
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc9..645d7dc 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index 3e81008..5ec6579 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -76,7 +76,7 @@
 t = assemble!(__v)
 assemble!(__m) |> wait
 fetch(t);
-PartitionedArrays.to_trivial_partition(m)
+PartitionedArrays.centralize(m)
 
 partition(v)
 partition(m)
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc9..645d7dc 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))
 
-  assem = SparseMatrixAssembler(X,Y,FullyAssembledRows())
+  assem = SparseMatrixAssembler(X,Y,LocallyAssembled())
   A1 = assemble_matrix(assem,matdata)
   b1 = assemble_vector(assem,vecdata)
   A2,b2 = assemble_matrix_and_vector(assem,data);
@@ -68,7 +68,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   bmatdata = collect_cell_matrix(Xb,Yb,biform(ub,vb))
   bvecdata = collect_cell_vector(Yb,liform(vb))
 
-  assem_blocks = SparseMatrixAssembler(Xb,Yb,FullyAssembledRows())
+  assem_blocks = SparseMatrixAssembler(Xb,Yb,LocallyAssembled())
   A1_blocks = assemble_matrix(assem_blocks,bmatdata);
   b1_blocks = assemble_vector(assem_blocks,bvecdata);
   @test is_same_vector(b1_blocks,b1,Yb,Y)
diff --git a/test/DivConformingTests.jl b/test/DivConformingTests.jl
index 2a1886b..3804456 100644
--- a/test/DivConformingTests.jl
+++ b/test/DivConformingTests.jl
@@ -63,7 +63,7 @@ function f(model,reffe)
   V = FESpace(model,reffe,conformity=:Hdiv)
   U = TrialFESpace(V)
 
-  das = FullyAssembledRows()
+  das = LocallyAssembled()
   trian = Triangulation(das,model)
   degree = 2
   dΩ = Measure(trian,degree)
diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl
index bf1e78e..22e3545 100644
--- a/test/FESpacesTests.jl
+++ b/test/FESpacesTests.jl
@@ -71,8 +71,8 @@ function assemble_tests(das,dΩ,dΩass,U,V)
 end
 
 function main(distribute,parts)
-  main(distribute,parts,SubAssembledRows())
-  main(distribute,parts,FullyAssembledRows())
+  main(distribute,parts,Assembled())
+  main(distribute,parts,LocallyAssembled())
 end
 
 function main(distribute,parts,das)
diff --git a/test/HeatEquationTests.jl b/test/HeatEquationTests.jl
index fa96dc1..89077ec 100644
--- a/test/HeatEquationTests.jl
+++ b/test/HeatEquationTests.jl
@@ -37,7 +37,7 @@ function main(distribute,parts)
 
   op = TransientFEOperator(res,jac,jac_t,U,V0)
 
-  assembler = SparseMatrixAssembler(U,V0,SubAssembledRows())
+  assembler = SparseMatrixAssembler(U,V0,Assembled())
   op_constant = TransientLinearFEOperator(
     (a,m),b,U,V0,constant_forms=(true,true),assembler=assembler
   )
diff --git a/test/PLaplacianTests.jl b/test/PLaplacianTests.jl
index a20a268..bf50135 100644
--- a/test/PLaplacianTests.jl
+++ b/test/PLaplacianTests.jl
@@ -8,8 +8,8 @@ using Test
 using SparseArrays
 
 function main(distribute,parts)
-  main(distribute,parts,FullyAssembledRows(),SparseMatrixCSR{0,Float64,Int},false)
-  main(distribute,parts,SubAssembledRows(),SparseMatrixCSC{Float64,Int},true)
+  main(distribute,parts,LocallyAssembled(),SparseMatrixCSR{0,Float64,Int},false)
+  main(distribute,parts,Assembled(),SparseMatrixCSC{Float64,Int},true)
 end
 
 function main(distribute,parts,strategy,local_matrix_type,autodiff)
diff --git a/test/PeriodicBCsTests.jl b/test/PeriodicBCsTests.jl
index 6177752..247cb6f 100644
--- a/test/PeriodicBCsTests.jl
+++ b/test/PeriodicBCsTests.jl
@@ -35,4 +35,6 @@
 end
 
+main(DebugArray,(2,2))
+
 end # module
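For reference, the renaming applied throughout these tests in one place. The sketch below is not part of the patch and uses a hypothetical mesh and space: Assembled() takes over the role of SubAssembledRows() (local contributions are exchanged and added at assembly time), while LocallyAssembled() takes over FullyAssembledRows() (owned rows are completed locally, which requires integrating on ghost cells).

using Gridap
using GridapDistributed
using PartitionedArrays

ranks = with_debug() do distribute
  distribute(LinearIndices((2,)))
end
model = CartesianDiscreteModel(ranks,(1,2),(0,1,0,1),(4,4))
V = TestFESpace(model,ReferenceFE(lagrangian,Float64,1))

# was: SparseMatrixAssembler(V,V,SubAssembledRows())
assem_as = SparseMatrixAssembler(V,V,GridapDistributed.Assembled())
# was: SparseMatrixAssembler(V,V,FullyAssembledRows())
assem_la = SparseMatrixAssembler(V,V,GridapDistributed.LocallyAssembled())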
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
new file mode 100644
index 0000000..ec10673
--- /dev/null
+++ b/test/parrays_update.jl
@@ -0,0 +1,64 @@
+
+using Gridap
+using PartitionedArrays
+using GridapDistributed
+
+using Gridap.FESpaces, Gridap.Algebra
+
+np = (1,2)
+ranks = with_debug() do distribute
+  distribute(LinearIndices((prod(np),)))
+end
+
+nc = (2,4)
+serial_model = CartesianDiscreteModel((0,1,0,1),nc)
+dist_model = CartesianDiscreteModel(ranks,np,(0,1,0,1),nc)
+
+cids = get_cell_gids(dist_model)
+
+reffe = ReferenceFE(lagrangian,Float64,1)
+serial_V = TestFESpace(serial_model,reffe)
+dist_V = TestFESpace(dist_model,reffe)
+
+serial_ids = get_free_dof_ids(serial_V)
+dist_ids = get_free_dof_ids(dist_V)
+
+serial_Ω = Triangulation(serial_model)
+serial_dΩ = Measure(serial_Ω,2)
+
+dist_Ω = Triangulation(dist_model)
+dist_dΩ = Measure(dist_Ω,2)
+
+dist_Ωg = Triangulation(GridapDistributed.with_ghost,dist_model)
+dist_dΩg = Measure(dist_Ωg,2)
+
+serial_a(u,v) = ∫(u⋅v)*serial_dΩ
+dist_a(u,v) = ∫(u⋅v)*dist_dΩ
+dist_ag(u,v) = ∫(u⋅v)*dist_dΩg
+
+serial_A = assemble_matrix(serial_a,serial_V,serial_V)
+
+# Assemble the same bilinear form with the three assembly strategies.
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
+dist_A_AS = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+# LocallyAssembled integrates on the ghost-including triangulation (dist_ag).
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.LocallyAssembled())
+dist_A_LA = assemble_matrix(dist_ag,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.SubAssembled())
+dist_A_SA = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+all(abs.(centralize(dist_A_AS) - serial_A) .< 1e-10)
+
+# Check that the three matrices define the same linear operator.
+x_AS = prand(partition(axes(dist_A_AS,2)))
+x_LA = GridapDistributed.change_ghost(x_AS,axes(dist_A_LA,2))
+x_SA = GridapDistributed.change_ghost(x_AS,axes(dist_A_SA,2))
+
+norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+############################################################################################
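As a usage note (not part of the patch), the same checks can be run on actual MPI ranks by swapping with_debug for PartitionedArrays' with_mpi and launching through mpiexec. The driver below is a minimal sketch; the file name and mesh sizes are made up.

using Gridap
using GridapDistributed
using PartitionedArrays

# Hypothetical MPI driver: run as `mpiexec -n 2 julia --project=. parrays_update_mpi.jl`
with_mpi() do distribute
  np = (1,2)
  ranks = distribute(LinearIndices((prod(np),)))
  model = CartesianDiscreteModel(ranks,np,(0,1,0,1),(2,4))
  V = TestFESpace(model,ReferenceFE(lagrangian,Float64,1))
  Ω = Triangulation(model)
  dΩ = Measure(Ω,2)
  a(u,v) = ∫(u⋅v)*dΩ
  assem = SparseMatrixAssembler(V,V,GridapDistributed.Assembled())
  A = assemble_matrix(a,assem,V,V)
end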