diff --git a/Project.toml b/Project.toml index f31b3b4..263d9ee 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ ForwardDiff = "0.10" Gridap = "0.18.7" LinearAlgebra = "1" MPI = "0.16, 0.17, 0.18, 0.19, 0.20" -PartitionedArrays = "0.3.3" +PartitionedArrays = "0.5" SparseArrays = "1.3" SparseMatricesCSR = "0.6.6" WriteVTK = "1.12.0" diff --git a/src/Algebra.jl b/src/Algebra.jl index 57bcbac..64410c9 100644 --- a/src/Algebra.jl +++ b/src/Algebra.jl @@ -29,192 +29,6 @@ function Algebra.allocate_in_domain(matrix::BlockPMatrix) allocate_in_domain(BlockPVector{V},matrix) end -# PSparseMatrix copy - -function Base.copy(a::PSparseMatrix) - mats = map(copy,partition(a)) - cache = map(PartitionedArrays.copy_cache,a.cache) - return PSparseMatrix(mats,partition(axes(a,1)),partition(axes(a,2)),cache) -end - -# PartitionedArrays extras - -function LinearAlgebra.axpy!(α,x::PVector,y::PVector) - @check partition(axes(x,1)) === partition(axes(y,1)) - map(partition(x),partition(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - consistent!(y) |> wait - return y -end - -function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) - map(blocks(x),blocks(y)) do x,y - LinearAlgebra.axpy!(α,x,y) - end - return y -end - -function Algebra.axpy_entries!( - α::Number, A::PSparseMatrix, B::PSparseMatrix; - check::Bool=true -) -# We should definitely check here that the index partitions are the same. -# However: Because the different matrices are assembled separately, the objects are not the -# same (i.e can't use ===). Checking the index partitions would then be costly... - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) - @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) - map(partition(A),partition(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -function Algebra.axpy_entries!( - α::Number, A::BlockPMatrix, B::BlockPMatrix; - check::Bool=true -) - map(blocks(A),blocks(B)) do A, B - Algebra.axpy_entries!(α,A,B;check) - end - return B -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.ArrayCounter,axes) - @notimplemented -end - -# This might go to Gridap in the future. We keep it here for the moment. -function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} - b = Algebra.CounterCOO{T}(axes) - b.nnz = a.nnz - b -end - -# This might go to Gridap in the future. We keep it here for the moment. 
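(Note: most of the helpers deleted in this hunk were stop-gaps written against PartitionedArrays 0.3 and fall away with the bump to 0.5 in Project.toml above; for instance, find_local_to_local_map gives way to local_to_local_map later in this diff. A minimal sketch of the ghost-consistency idiom encoded by the removed axpy! overload, assuming the PartitionedArrays >= 0.5 names uniform_partition, pones and pzeros with a sequential ranks driver:

    using PartitionedArrays, LinearAlgebra
    ranks = LinearIndices((4,))              # sequential stand-in for 4 parts
    rows  = uniform_partition(ranks, 20)     # 20 global rows, block-partitioned
    x, y  = pones(rows), pzeros(rows)
    map((xi, yi) -> axpy!(2.0, xi, yi), partition(x), partition(y))  # part-local y .+= 2x
    consistent!(y) |> wait                   # re-sync ghost rows afterwards

The trailing consistent! is the step a purely part-local map cannot provide on its own.)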
-function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} - counter = change_axes(a.counter,axes) - Algebra.AllocationCOO(counter,a.I,a.J,a.V) -end - -# Array of PArrays -> PArray of Arrays -function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - PartitionedArrays.getany(aj) - end - end -end - -function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) - indices = linear_indices(first(a)) - map(indices) do i - map(a) do aj - aj.items[i] - end - end -end - -# This type is required because MPIArray from PArrays -# cannot be instantiated with a NULL communicator -struct MPIVoidVector{T} <: AbstractVector{T} - comm::MPI.Comm - function MPIVoidVector(::Type{T}) where {T} - new{T}(MPI.COMM_NULL) - end -end - -Base.size(a::MPIVoidVector) = (0,) -Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() -function Base.getindex(a::MPIVoidVector,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.setindex!(a::MPIVoidVector,v,i::Int) - error("Indexing of MPIVoidVector not possible.") -end -function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) - println(io,"MPIVoidVector") -end - -function num_parts(comm::MPI.Comm) - if comm != MPI.COMM_NULL - nparts = MPI.Comm_size(comm) - else - nparts = -1 - end - nparts -end -@inline num_parts(comm::MPIArray) = num_parts(comm.comm) -@inline num_parts(comm::DebugArray) = length(comm.items) -@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) - -function get_part_id(comm::MPI.Comm) - if comm != MPI.COMM_NULL - id = MPI.Comm_rank(comm)+1 - else - id = -1 - end - id -end -@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) -@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) - -""" - i_am_in(comm::MPIArray) - i_am_in(comm::DebugArray) - - Returns `true` if the processor is part of the subcommunicator `comm`. -""" -function i_am_in(comm::MPI.Comm) - get_part_id(comm) >=0 -end -@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) -@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) -@inline i_am_in(comm::DebugArray) = true - -function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) - x_new = map(new_parts) do p - if isa(x,MPIArray) - PartitionedArrays.getany(x) - elseif isa(x,DebugArray) && (p <= length(x.items)) - x.items[p] - else - default - end - end - return x_new -end - -function generate_subparts(parts::MPIArray,new_comm_size) - root_comm = parts.comm - root_size = MPI.Comm_size(root_comm) - rank = MPI.Comm_rank(root_comm) - - @static if isdefined(MPI,:MPI_UNDEFINED) - mpi_undefined = MPI.MPI_UNDEFINED[] - else - mpi_undefined = MPI.API.MPI_UNDEFINED[] - end - - if root_size == new_comm_size - return parts - else - if rank < new_comm_size - comm = MPI.Comm_split(root_comm,0,0) - return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) - else - comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) - return MPIVoidVector(eltype(parts)) - end - end -end - -function generate_subparts(parts::DebugArray,new_comm_size) - DebugArray(LinearIndices((new_comm_size,))) -end - # local_views function local_views(a) @@ -254,9 +68,11 @@ end # change_ghost function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T - same_partition = (a.index_partition === partition(ids)) - a_new = same_partition ? 
a : change_ghost(T,a,ids) - if make_consistent && (!same_partition || !is_consistent) + if (a.index_partition === partition(ids)) + return a + end + a_new = change_ghost(T,a,ids) + if make_consistent && !is_consistent consistent!(a_new) |> wait end return a_new @@ -285,44 +101,26 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_ return BlockPVector(vals,ids) end -# This function computes a mapping among the local identifiers of a and b -# for which the corresponding global identifiers are both in a and b. -# Note that the haskey check is necessary because in the general case -# there might be gids in b which are not present in a -function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) - a_local_to_b_local = fill(Int32(-1),local_length(a)) - b_local_to_global = local_to_global(b) - a_global_to_local = global_to_local(a) - for blid in 1:local_length(b) - gid = b_local_to_global[blid] - if a_global_to_local[gid] != zero(eltype(a_global_to_local)) - alid = a_global_to_local[gid] - a_local_to_b_local[alid] = blid - end - end - a_local_to_b_local -end - # This type is required in order to be able to access the local portion # of distributed sparse matrices and vectors using local indices from the # distributed test and trial spaces -struct LocalView{T,N,A,B} <:AbstractArray{T,N} +struct LocalView{T,N,A} <:AbstractArray{T,N} plids_to_value::A - d_to_lid_to_plid::B + d_to_lid_to_plid::NTuple{N,Vector{Int32}} local_size::NTuple{N,Int} function LocalView( - plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} + plids_to_value::AbstractArray{T,N}, d_to_lid_to_plid::NTuple{N,Vector{Int32}} + ) where {T,N} A = typeof(plids_to_value) - B = typeof(d_to_lid_to_plid) local_size = map(length,d_to_lid_to_plid) - new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) + new{T,N,A}(plids_to_value,d_to_lid_to_plid,local_size) end end Base.size(a::LocalView) = a.local_size Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + plids = map(getindex,a.d_to_lid_to_plid,lids) if all(i->i>0,plids) a.plids_to_value[plids...] else @@ -330,23 +128,18 @@ function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} end end function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} - plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + plids = map(getindex,a.d_to_lid_to_plid,lids) @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" a.plids_to_value[plids...] 
= v end -function _lid_to_plid(lid,lid_to_plid) - plid = lid_to_plid[lid] - plid -end - function local_views(a::PVector,new_rows::PRange) old_rows = axes(a,1) if partition(old_rows) === partition(new_rows) partition(a) else - map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows - LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) + map(partition(a),partition(old_rows),partition(new_rows)) do a,old_rows,new_rows + LocalView(a,(local_to_local_map(new_rows,old_rows),)) end end end @@ -356,12 +149,12 @@ function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) partition(a) else - map(partition(a), - partition(old_rows),partition(old_cols), - partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols - rl2lmap = find_local_to_local_map(new_rows,old_rows) - cl2lmap = find_local_to_local_map(new_cols,old_cols) - LocalView(matrix_partition,(rl2lmap,cl2lmap)) + map( + partition(a),partition(old_rows),partition(old_cols),partition(new_rows),partition(new_cols) + ) do a,old_rows,old_cols,new_rows,new_cols + rl2lmap = local_to_local_map(new_rows,old_rows) + cl2lmap = local_to_local_map(new_cols,old_cols) + LocalView(a,(rl2lmap,cl2lmap)) end end end @@ -377,867 +170,3 @@ function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange end |> to_parray_of_arrays return map(mortar,vals) end - -# PSparseMatrix assembly - -struct FullyAssembledRows end -struct SubAssembledRows end - -# For the moment we use COO format even though -# it is quite memory consuming. -# We need to implement communication in other formats in -# PartitionedArrays.jl - -struct PSparseMatrixBuilderCOO{T,B} - local_matrix_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter( - builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}) where A - test_dofs_gids_prange, trial_dofs_gids_prange = axs - counters = map(partition(test_dofs_gids_prange),partition(trial_dofs_gids_prange)) do r,c - axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) - Algebra.CounterCOO{A}(axs) - end - DistributedCounterCOO(builder.par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) -end - -function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv - T = eltype(Tv) - return PSparseMatrix{T} -end - - -""" -""" -struct DistributedCounterCOO{A,B,C,D} <: GridapType - par_strategy::A - counters::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D - function DistributedCounterCOO( - par_strategy, - counters::AbstractArray{<:Algebra.CounterCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) - A = typeof(par_strategy) - B = typeof(counters) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange) - end -end - -function local_views(a::DistributedCounterCOO) - a.counters -end - -function local_views(a::DistributedCounterCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange - a.counters -end - -function Algebra.nz_allocation(a::DistributedCounterCOO) - allocs = map(nz_allocation,a.counters) - DistributedAllocationCOO(a.par_strategy,allocs,a.test_dofs_gids_prange,a.trial_dofs_gids_prange) -end - -struct DistributedAllocationCOO{A,B,C,D} <: 
GridapType - par_strategy::A - allocs::B - test_dofs_gids_prange::C - trial_dofs_gids_prange::D - function DistributedAllocationCOO( - par_strategy, - allocs::AbstractArray{<:Algebra.AllocationCOO}, - test_dofs_gids_prange::PRange, - trial_dofs_gids_prange::PRange) - A = typeof(par_strategy) - B = typeof(allocs) - C = typeof(test_dofs_gids_prange) - D = typeof(trial_dofs_gids_prange) - new{A,B,C,D}(par_strategy,allocs,test_dofs_gids_prange,trial_dofs_gids_prange) - end -end - -function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, - axes::Tuple{<:PRange,<:PRange}) where {A,B} - local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols - (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) - end - allocs = map(change_axes,a.allocs,local_axes) - DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) -end - -function change_axes(a::MatrixBlock{<:DistributedAllocationCOO}, - axes::Tuple{<:Vector,<:Vector}) - block_ids = CartesianIndices(a.array) - rows, cols = axes - array = map(block_ids) do I - change_axes(a[I],(rows[I[1]],cols[I[2]])) - end - return ArrayBlock(array,a.touched) -end - -function local_views(a::DistributedAllocationCOO) - a.allocs -end - -function local_views(a::DistributedAllocationCOO,test_dofs_gids_prange,trial_dofs_gids_prange) - @check test_dofs_gids_prange === a.test_dofs_gids_prange - @check trial_dofs_gids_prange === a.trial_dofs_gids_prange - a.allocs -end - -function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) - array = map(local_views,a.array) |> to_parray_of_arrays - return map(ai -> ArrayBlock(ai,a.touched),array) -end - -function get_allocations(a::DistributedAllocationCOO) - I,J,V = map(local_views(a)) do alloc - alloc.I, alloc.J, alloc.V - end |> tuple_of_arrays - return I,J,V -end - -function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) - tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays - return tuple_of_array_of_parrays -end - -get_test_gids(a::DistributedAllocationCOO) = a.test_dofs_gids_prange -get_trial_gids(a::DistributedAllocationCOO) = a.trial_dofs_gids_prange -get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) -get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array)) - -function Algebra.create_from_nz(a::PSparseMatrix) - # For FullyAssembledRows the underlying Exchanger should - # not have ghost layer making assemble! 
do nothing (TODO check) - assemble!(a) |> wait - a -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:FullyAssembledRows}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:FullyAssembledRows}}) - f(x) = nothing - A, = _fa_create_from_nz_with_callback(f,a) - return A -end - -function _fa_create_from_nz_with_callback(callback,a) - - # Recover some data - I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) - - rows = _setup_prange(test_dofs_gids_prange,I;ghost=false,ax=:rows) - b = callback(rows) - - # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) - - # Create the range for cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress local portions - values = map(create_from_nz,local_views(asys)) - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) - return A, b -end - -function Algebra.create_from_nz(a::DistributedAllocationCOO{<:SubAssembledRows}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:SubAssembledRows}}) - f(x) = nothing - A, = _sa_create_from_nz_with_callback(f,f,a,nothing) - return A -end - -function _sa_create_from_nz_with_callback(callback,async_callback,a,b) - # Recover some data - I,J,V = get_allocations(a) - test_dofs_gids_prange = get_test_gids(a) - trial_dofs_gids_prange = get_trial_gids(a) - - # convert I and J to global dof ids - to_global_indices!(I,test_dofs_gids_prange;ax=:rows) - to_global_indices!(J,trial_dofs_gids_prange;ax=:cols) - - # Create the Prange for the rows - rows = _setup_prange(test_dofs_gids_prange,I;ax=:rows) - - # Move (I,J,V) triplets to the owner process of each row I. 
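(Aside: after this PR, this migration step lives in create_from_nz_assembled in src/Assembly.jl below. A condensed restatement; migrate_coo_to_row_owners! is a hypothetical name, but the callees are exactly the ones the new code uses:

    # hypothetical wrapper; see create_from_nz_assembled for the real sequencing
    function migrate_coo_to_row_owners!(I, J, V, test_ids)
        map(map_local_to_global!, I, test_ids)                # local -> global row ids
        rows = filter_and_replace_ghost(map(unpermute, test_ids), I)
        wait(PartitionedArrays.assemble_coo!(I, J, V, rows))  # ship triplets to row owners
        return rows
    end

In the real code the wait is deferred so that vector assembly can overlap the exchange, as the comments below explain.)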
- # The triplets are accompanyed which Jo which is the process column owner - Jo = get_gid_owners(J,trial_dofs_gids_prange;ax=:cols) - t = _assemble_coo!(I,J,V,rows;owners=Jo) - - # Here we can overlap computations - # This is a good place to overlap since - # sending the matrix rows is a lot of data - if !isa(b,Nothing) - bprange=_setup_prange_from_pvector_allocation(b) - b = callback(bprange) - end - - # Wait the transfer to finish - wait(t) - - # Create the Prange for the cols - cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols,owners=Jo) - - # Overlap rhs communications with CSC compression - t2 = async_callback(b) - - # Convert again I,J to local numeration - to_local_indices!(I,rows;ax=:rows) - to_local_indices!(J,cols;ax=:cols) - - # Adjust local matrix size to linear system's index sets - asys = change_axes(a,(rows,cols)) - - # Compress the local matrices - values = map(create_from_nz,local_views(asys)) - - # Wait the transfer to finish - if !isa(t2,Nothing) - wait(t2) - end - - # Finally build the matrix - A = _setup_matrix(values,rows,cols) - return A, b -end - - -# PVector assembly - -struct PVectorBuilder{T,B} - local_vector_type::Type{T} - par_strategy::B -end - -function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange}) - T = builder.local_vector_type - rows, = axs - counters = map(partition(rows)) do rows - axs = (Base.OneTo(local_length(rows)),) - nz_counter(ArrayBuilder(T),axs) - end - PVectorCounter(builder.par_strategy,counters,rows) -end - -function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv - T = eltype(Tv) - return PVector{T} -end - -struct PVectorCounter{A,B,C} - par_strategy::A - counters::B - test_dofs_gids_prange::C -end - -Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() - -function local_views(a::PVectorCounter) - a.counters -end - -function local_views(a::PVectorCounter,rows) - @check rows === a.test_dofs_gids_prange - a.counters -end - -function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows}) - dofs = a.test_dofs_gids_prange - values = map(nz_allocation,a.counters) - PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs) -end - -struct PVectorAllocationTrackOnlyValues{A,B,C} - par_strategy::A - values::B - test_dofs_gids_prange::C -end - -function local_views(a::PVectorAllocationTrackOnlyValues) - a.values -end - -function local_views(a::PVectorAllocationTrackOnlyValues,rows) - @check rows === a.test_dofs_gids_prange - a.values -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - rows = _setup_prange_without_ghosts(a.test_dofs_gids_prange) - _rhs_callback(a,rows) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:SubAssembledRows}) - # This point MUST NEVER be reached. 
If reached there is an inconsistency - # in the parallel code in charge of vector assembly - @assert false -end - -function _rhs_callback(row_partitioned_vector_partition,rows) - # The ghost values in row_partitioned_vector_partition are - # aligned with the FESpace but not with the ghost values in the rows of A - b_fespace = PVector(row_partitioned_vector_partition.values, - partition(row_partitioned_vector_partition.test_dofs_gids_prange)) - - # This one is aligned with the rows of A - b = similar(b_fespace,eltype(b_fespace),(rows,)) - - # First transfer owned values - b .= b_fespace - - # Now transfer ghost - function transfer_ghost(b,b_fespace,ids,ids_fespace) - num_ghosts_vec = ghost_length(ids) - gho_to_loc_vec = ghost_to_local(ids) - loc_to_glo_vec = local_to_global(ids) - gid_to_lid_fe = global_to_local(ids_fespace) - for ghost_lid_vec in 1:num_ghosts_vec - lid_vec = gho_to_loc_vec[ghost_lid_vec] - gid = loc_to_glo_vec[lid_vec] - lid_fespace = gid_to_lid_fe[gid] - b[lid_vec] = b_fespace[lid_fespace] - end - end - map( - transfer_ghost, - partition(b), - partition(b_fespace), - b.index_partition, - b_fespace.index_partition) - - return b -end - -function Algebra.create_from_nz(a::PVector) - assemble!(a) |> wait - return a -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:FullyAssembledRows}, - c_fespace::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows}) - - function callback(rows) - _rhs_callback(c_fespace,rows) - end - - A,b = _fa_create_from_nz_with_callback(callback,a) - return A,b -end - -struct PVectorAllocationTrackTouchedAndValues{A,B,C} - allocations::A - values::B - test_dofs_gids_prange::C -end - -function Algebra.create_from_nz( - a::DistributedAllocationCOO{<:SubAssembledRows}, - b::PVectorAllocationTrackTouchedAndValues) - - function callback(rows) - _rhs_callback(b,rows) - end - - function async_callback(b) - # now we can assemble contributions - assemble!(b) - end - - A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) - return A,b -end - -struct ArrayAllocationTrackTouchedAndValues{A} - touched::Vector{Bool} - values::A -end - -Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() - - -function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) - @check rows === a.test_dofs_gids_prange - a.allocations -end - -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if i>0 - if !(a.touched[i]) - a.touched[i]=true - end - if !isa(v,Nothing) - a.values[i]=c(v,a.values[i]) - end - end - nothing -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) - @notimplemented -end -@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) - if !isa(v,Nothing) - for (ve,ie) in zip(v,i) - Arrays.add_entry!(c,a,ve,ie) - end - else - for ie in i - Arrays.add_entry!(c,a,nothing,ie) - end - end - nothing -end - - -function _setup_touched_and_allocations_arrays(values) - touched = map(values) do values - fill!(Vector{Bool}(undef,length(values)),false) - end - allocations = map(values,touched) do values,touched - ArrayAllocationTrackTouchedAndValues(touched,values) - end - touched, allocations -end - -function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows}, - b::PVectorCounter{<:SubAssembledRows}) - A = nz_allocation(a) - dofs = 
b.test_dofs_gids_prange - values = map(nz_allocation,b.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) - return A,B -end - -function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows}) - dofs = a.test_dofs_gids_prange - values = map(nz_allocation,a.counters) - touched,allocations=_setup_touched_and_allocations_arrays(values) - return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) -end - -function local_views(a::PVectorAllocationTrackTouchedAndValues) - a.allocations -end - -function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) - - # Find the ghost rows - allocations = local_views(a.allocations) - indices = partition(a.test_dofs_gids_prange) - I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices - dofs_lids_touched = findall(allocation.touched) - loc_to_gho = local_to_ghost(indices) - n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) - I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) - cur = 1 - for lid in dofs_lids_touched - dof_lid = loc_to_gho[lid] - if dof_lid != 0 - I_ghost_lids[cur] = dof_lid - cur = cur+1 - end - end - I_ghost_lids - end - - ghost_to_global, ghost_to_owner = map( - find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays - - ngids = length(a.test_dofs_gids_prange) - _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) -end - -function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) - rows = _setup_prange_from_pvector_allocation(a) - b = _rhs_callback(a,rows) - t2 = assemble!(b) - # Wait the transfer to finish - if t2 !== nothing - wait(t2) - end - return b -end - -# Common Assembly Utilities -function first_gdof_from_ids(ids) - if own_length(ids) == 0 - return 1 - end - lid_to_gid = local_to_global(ids) - owned_to_lid = own_to_local(ids) - return Int(lid_to_gid[first(owned_to_lid)]) -end - -function find_gid_and_owner(ighost_to_jghost,jindices) - jghost_to_local = ghost_to_local(jindices) - jlocal_to_global = local_to_global(jindices) - jlocal_to_owner = local_to_owner(jindices) - ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) - - ighost_to_global = jlocal_to_global[ighost_to_jlocal] - ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] - return ighost_to_global, ighost_to_owner -end - -# The given ids are assumed to be a sub-set of the lids -function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) - glo_to_loc = global_to_local(a) - loc_to_gho = local_to_ghost(a) - - # First pass: Allocate - i = 0 - ghost_lids_touched = fill(false,ghost_length(a)) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - i += 1 - end - end - gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) - - # Second pass: fill - i = 1 - fill!(ghost_lids_touched,false) - for gid in gids - lid = glo_to_loc[gid] - ghost_lid = loc_to_gho[lid] - if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] - ghost_lids_touched[ghost_lid] = true - gids_ghost_lid_to_ghost_lid[i] = ghost_lid - i += 1 - end - end - - return gids_ghost_lid_to_ghost_lid -end - -# Find the neighbours of partition1 trying -# to use those in partition2 as a hint -function _find_neighbours!(partition1, partition2) - partition2_snd, partition2_rcv = assembly_neighbors(partition2) - partition2_graph = ExchangeGraph(partition2_snd, 
partition2_rcv) - return assembly_neighbors(partition1; neighbors=partition2_graph) -end - -# to_global! & to_local! analogs, needed for dispatching in block assembly - -function to_local_indices!(I,ids::PRange;kwargs...) - map(to_local!,I,partition(ids)) -end - -function to_global_indices!(I,ids::PRange;kwargs...) - map(to_global!,I,partition(ids)) -end - -function get_gid_owners(I,ids::PRange;kwargs...) - map(I,partition(ids)) do I, indices - glo_to_loc = global_to_local(indices) - loc_to_own = local_to_owner(indices) - map(x->loc_to_own[glo_to_loc[x]], I) - end -end - -for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] - @eval begin - function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) - map($f,I,ids) - end - - function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) - @check ax ∈ [:rows,:cols] - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if ax == :rows - $f(I[i,j],ids[i]) - else - $f(I[i,j],ids[j]) - end - end - end - end -end - -# _setup_matrix : local matrices + global PRanges -> Global matrix - -function _setup_matrix(values,rows::PRange,cols::PRange) - return PSparseMatrix(values,partition(rows),partition(cols)) -end - -function _setup_matrix(values,rows::Vector{<:PRange},cols::Vector{<:PRange}) - block_ids = CartesianIndices((length(rows),length(cols))) - block_mats = map(block_ids) do I - block_values = map(v -> blocks(v)[I],values) - return _setup_matrix(block_values,rows[I[1]],cols[I[2]]) - end - return mortar(block_mats) -end - -# _assemble_coo! : local coo triplets + global PRange -> Global coo values - -function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) - if isa(owners,Nothing) - PArrays.assemble_coo!(I,J,V,partition(rows)) - else - assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) - end -end - -function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) - block_ids = CartesianIndices(I) - map(block_ids) do id - i = id[1]; j = id[2]; - if isa(owners,Nothing) - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) - else - _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) - end - end -end - -function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) - """ - Returns three JaggedArrays with the coo triplets - to be sent to the corresponding owner parts in parts_snd - """ - function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) - global_to_local_row = global_to_local(row_lids) - local_row_to_owner = local_to_owner(row_lids) - owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) - ptrs = zeros(Int32,length(parts_snd)+1) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - ptrs[owner_to_i[owner]+1] +=1 - end - end - PArrays.length_to_ptrs!(ptrs) - gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) - gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) - jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) - v_snd_data = zeros(eltype(k_v),ptrs[end]-1) - for k in 1:length(k_gi) - gi = k_gi[k] - li = global_to_local_row[gi] - owner = local_row_to_owner[li] - if owner != part - gj = k_gj[k] - v = k_v[k] - p = ptrs[owner_to_i[owner]] - gi_snd_data[p] = gi - gj_snd_data[p] = gj - jo_snd_data[p] = k_jo[k] - v_snd_data[p] = v - k_v[k] = zero(v) - ptrs[owner_to_i[owner]] += 1 - end - end - PArrays.rewind_ptrs!(ptrs) - gi_snd = JaggedArray(gi_snd_data,ptrs) - gj_snd = JaggedArray(gj_snd_data,ptrs) - jo_snd = 
JaggedArray(jo_snd_data,ptrs) - v_snd = JaggedArray(v_snd_data,ptrs) - gi_snd, gj_snd, jo_snd, v_snd - end - """ - Pushes to coo_entries_with_column_owner the tuples - gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes - """ - function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner - current_n = length(k_gi) - new_n = current_n + length(gi_rcv.data) - resize!(k_gi,new_n) - resize!(k_gj,new_n) - resize!(k_jo,new_n) - resize!(k_v,new_n) - for p in 1:length(gi_rcv.data) - k_gi[current_n+p] = gi_rcv.data[p] - k_gj[current_n+p] = gj_rcv.data[p] - k_jo[current_n+p] = jo_rcv.data[p] - k_v[current_n+p] = v_rcv.data[p] - end - end - part = linear_indices(row_partition) - parts_snd, parts_rcv = assembly_neighbors(row_partition) - coo_entries_with_column_owner = map(tuple,I,J,Jown,V) - gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays - graph = ExchangeGraph(parts_snd,parts_rcv) - t1 = exchange(gi_snd,graph) - t2 = exchange(gj_snd,graph) - t3 = exchange(jo_snd,graph) - t4 = exchange(v_snd,graph) - @async begin - gi_rcv = fetch(t1) - gj_rcv = fetch(t2) - jo_rcv = fetch(t3) - v_rcv = fetch(t4) - map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) - I,J,Jown,V - end -end - -# dofs_gids_prange can be either test_dofs_gids_prange or trial_dofs_gids_prange -# In the former case, gids is a vector of global test dof identifiers, while in the -# latter, a vector of global trial dof identifiers -function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) - if !ghost - _setup_prange_without_ghosts(dofs_gids_prange) - elseif isa(owners,Nothing) - _setup_prange_with_ghosts(dofs_gids_prange,gids) - else - _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) - end -end - -function _setup_prange( - dofs_gids_prange::AbstractVector{<:PRange}, - gids::AbstractMatrix; - ax=:rows,ghost=true,owners=nothing -) - @check ax ∈ (:rows,:cols) - block_ids = LinearIndices(dofs_gids_prange) - pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) - - gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange - gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] - gids_union = pvcat(gids_slice) - - owners_union = nothing - if !isnothing(owners) - owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] - owners_union = pvcat(owners_slice) - end - - return gids_union, owners_union - end |> tuple_of_arrays - - return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) -end - -# Create PRange for the rows of the linear system -# without local ghost dofs as per required in the -# FullyAssembledRows() parallel assembly strategy -function _setup_prange_without_ghosts(dofs_gids_prange::PRange) - ngdofs = length(dofs_gids_prange) - indices = map(partition(dofs_gids_prange)) do dofs_indices - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) - OwnAndGhostIndices(own_indices,ghost_indices) - end - return PRange(indices) -end - -# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition -# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
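(Aside: this superset assumption is what makes the _find_neighbours! hint above valid; the exchange graph already known for the dof partition covers the neighbours of the row partition, so the scalable discovery can be skipped. Distilled, with indices_dofs and indices_rows as hypothetical stand-ins for the two index partitions:

    snd, rcv = assembly_neighbors(indices_dofs)          # graph we already trust
    hint     = ExchangeGraph(snd, rcv)
    assembly_neighbors(indices_rows; neighbors = hint)   # reuse instead of rediscover

No such guarantee exists for the columns, which is why the hint is only applied to the rows.)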
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed -# is that each processor can determine locally with a single layer of ghost cells the global indices and associated -# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities -# (i.e., either cells or faces). -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> dof PRange ghost ids - gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition) |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner) -end - -# Here we cannot assume that the sparse communication graph underlying -# trial_dofs_gids_partition is a superset of the one underlying indices. -# Here we chould check whether it is included and call _find_neighbours!() -# if this is the case. At present, we are not taking advantage of this, -# but let the parallel scalable algorithm to compute the reciprocal to do the job. -function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners) - ngdofs = length(dofs_gids_prange) - dofs_gids_partition = partition(dofs_gids_prange) - - # Selected ghost ids -> [global dof ids, owner processor ids] - gids_ghost_to_global, gids_ghost_to_owner = map( - gids,owners,dofs_gids_partition) do gids, owners, indices - ghost_touched = Dict{Int,Bool}() - ghost_to_global = Int64[] - ghost_to_owner = Int64[] - me = part_id(indices) - for (j,jo) in zip(gids,owners) - if jo != me - if !haskey(ghost_touched,j) - push!(ghost_to_global,j) - push!(ghost_to_owner,jo) - ghost_touched[j] = true - end - end - end - ghost_to_global, ghost_to_owner - end |> tuple_of_arrays - - return _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=false) -end - -function _setup_prange_impl_(ngdofs, - dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner; - discover_neighbours=true) - indices = map(dofs_gids_partition, - gids_ghost_to_global, - gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner - owner = part_id(dofs_indices) - own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) - ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner) - OwnAndGhostIndices(own_indices,ghost_indices) - end - if discover_neighbours - _find_neighbours!(indices,dofs_gids_partition) - end - return PRange(indices) -end diff --git a/src/Assembly.jl b/src/Assembly.jl new file mode 100644 index 0000000..41673c5 --- /dev/null +++ b/src/Assembly.jl @@ -0,0 +1,381 @@ + +# Parallel assembly strategies + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols) + DefaultAssemblyStrategy() +end + +# When LocallyAssembling, make sure that you also loop over ghost cells. 
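+# (Naming relative to the code removed from src/Algebra.jl, as far as this diff
+# shows: LocallyAssembled plays the role of the former FullyAssembledRows, and
+# Assembled that of SubAssembledRows, while SubAssembled is new and keeps the
+# matrix in sub-assembled form. The row filter below drops contributions to
+# ghost rows: e.g. with local_to_ghost(rows) == [0,0,0,1,2], only rows 1:3 pass.)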
+function local_assembly_strategy(::LocallyAssembled,rows,cols) + rows_local_to_ghost = local_to_ghost(rows) + GenericAssemblyStrategy( + identity, + identity, + row->iszero(rows_local_to_ghost[row]), + col->true + ) +end + +# PSparseMatrix and PVector builders + +struct DistributedArrayBuilder{T,N,B} + local_array_type::Type{T} + strategy::B + function DistributedArrayBuilder( + local_array_type::Type{T}, + strategy::AssemblyStrategy + ) where T + N = ndims(T) + B = typeof(strategy) + new{T,N,B}(local_array_type,strategy) + end +end + +const PVectorBuilder{T,B} = DistributedArrayBuilder{T,1,B} +const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T,2,B} + +function Algebra.get_array_type(::PSparseMatrixBuilder{Tm}) where Tm + T = eltype(Tm) + return PSparseMatrix{T} +end + +function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv + T = eltype(Tv) + return PVector{T} +end + +# Distributed counters and allocators + +""" + DistributedCounter{S,T,N} <: GridapType + +Distributed N-dimensional counter, with local counters of type T. +Follows assembly strategy S. +""" +struct DistributedCounter{S,T,N,A,B} <: GridapType + counters :: A + axes :: B + strategy :: S + function DistributedCounter( + counters :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: AssemblyStrategy + ) where {T,N} + A, B, S = typeof(counters), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(counters,axes,strategy) + end +end + +Base.axes(a::DistributedCounter) = a.axes +Base.axes(a::DistributedCounter,d::Integer) = a.axes[d] +local_views(a::DistributedCounter) = a.counters + +function local_views(a::DistributedCounter,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.counters +end + +const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B} +Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop() + +const PSparseMatrixCounter{S,T,A,B} = DistributedCounter{S,T,2,A,B} +Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop() + +""" + DistributedAllocation{S,T,N} <: GridapType + +Distributed N-dimensional allocator, with local allocators of type T. +Follows assembly strategy S. +""" +struct DistributedAllocation{S,T,N,A,B} <: GridapType + allocs :: A + axes :: B + strategy :: S + function DistributedAllocation( + allocs :: AbstractArray{T}, + axes :: NTuple{N,<:PRange}, + strategy :: AssemblyStrategy + ) where {T,N} + A, B, S = typeof(allocs), typeof(axes), typeof(strategy) + new{S,T,N,A,B}(allocs,axes,strategy) + end +end + +Base.axes(a::DistributedAllocation) = a.axes +Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d] +local_views(a::DistributedAllocation) = a.allocs + +function local_views(a::DistributedAllocation,axes::PRange...) + @check all(map(PArrays.matching_local_indices,axes,a.axes)) + return a.allocs +end + +function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N} + indices = map(partition,axes) + local_axes = map(indices...) do indices... 
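+    # one Base.OneTo axis per dimension, sized to the local length of the new index set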
+ map(ids -> Base.OneTo(local_length(ids)), indices) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocation(allocs,axes,a.strategy) +end + +const PVectorAllocation{S,T} = DistributedAllocation{S,T,1} +const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2} + +function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) where S + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + +function collect_touched_ids(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S + touched_ids = map(local_views(a),partition(axes(a,1))) do a, ids + n_global = global_length(ids) + rows = remove_ghost(unpermute(ids)) + + ghost_lids = ghost_to_local(ids) + touched_ghost_lids = collect(Int,filter(lid -> a.touched[lid], ghost_lids)) + touched_ghost_owners = local_to_owner(ids)[touched_ghost_lids] + touched_ghost_gids = to_global!(touched_ghost_lids, ids) + ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners) + replace_ghost(rows,ghost) + end + return touched_ids +end + +# PSparseMatrix assembly chain +# +# 1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter +# 2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation +# 3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix + +function Algebra.nz_counter(builder::PSparseMatrixBuilder{MT},axs::NTuple{2,<:PRange}) where MT + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do rows, cols + local_axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols))) + Algebra.CounterCOO{MT}(local_axs) + end + DistributedCounter(counters,axs,builder.strategy) +end + +function Algebra.nz_allocation(a::PSparseMatrixCounter) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Algebra.create_from_nz(a::PSparseMatrix) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled}) + A, = create_from_nz_locally_assembled(a) + return A +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled}) + A, = create_from_nz_assembled(a) + return A +end + +function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:SubAssembled}) + A, = create_from_nz_subassembled(a) + return A +end + +# PVector assembly chain: +# +# 1 - nz_counter(PVectorBuilder) -> PVectorCounter +# 2 - nz_allocation(PVectorCounter) -> PVectorAllocation +# 3 - create_from_nz(PVectorAllocation) -> PVector + +function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::NTuple{1,<:PRange}) where VT + rows, = axs + counters = map(partition(rows)) do rows + axs = (Base.OneTo(local_length(rows)),) + nz_counter(ArrayBuilder(VT),axs) + end + DistributedCounter(counters,(rows,),builder.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}}) + allocs = map(nz_allocation,local_views(a)) + DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + values = map(nz_allocation,local_views(a)) + allocs = map(TrackedArrayAllocation,values) + return DistributedAllocation(allocs,a.axes,a.strategy) +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector}) + # This point MUST NEVER be reached. 
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}}) + rows = remove_ghost(unpermute(axes(a,1))) + values = local_views(a) + b = rhs_callback(values,rows) + return b +end + +function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled}) + rows = collect_touched_ids(a) + values = map(ai -> ai.values, local_views(a)) + b = rhs_callback(values,rows) + t2 = assemble!(b) + if t2 !== nothing + wait(t2) + end + return b +end + +# PSystem assembly chain: +# +# When assembling a full system (matrix + vector), it is more efficient to +# overlap communications the assembly of the matrix and the vector. +# Not only it is faster, but also necessary to ensure identical ghost indices +# in both the matrix and vector rows. +# This is done by using the following specializations: + +function Arrays.nz_allocation( + a::PSparseMatrixCounter, b::PVectorCounter +) + return nz_allocation(a), nz_allocation(b) +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:LocallyAssembled}, + b::PVectorAllocation{<:LocallyAssembled} +) + function callback(rows) + values = local_views(b) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,rows) + end + A, B = create_from_nz_locally_assembled(a,callback) + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:Assembled}, + b::PVectorAllocation{<:Assembled} +) + function callback(rows) + new_indices = collect_touched_ids(b) + values = map(ai -> ai.values, local_views(b)) + b_fespace = PVector(values,partition(axes(b,1))) + locally_repartition(b_fespace,new_indices) + end + function async_callback(b) + assemble!(b) + end + A, B = create_from_nz_assembled(a,callback,async_callback) + return A, B +end + +function Algebra.create_from_nz( + a::PSparseMatrixAllocation{<:SubAssembled}, + b::PVectorAllocation{<:SubAssembled} +) + function callback(rows) + @check PArrays.matching_local_indices(PRange(rows),axes(b,1)) + values = local_views(b) + PVector(values,partition(axes(b,1))) + end + A, B = create_from_nz_subassembled(a,callback) + return A, B +end + +# Low-level assembly methods + +function create_from_nz_locally_assembled( + a, + callback::Function = rows -> nothing +) + I,J,V = get_allocations(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) + + rows = map(remove_ghost,map(unpermute,test_ids)) + b = callback(rows) + + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) + + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) + + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) + + assembled = true + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,assembled) + + return A, b +end + +function create_from_nz_assembled( + a, + callback::Function = rows -> nothing, + async_callback::Function = b -> empty_async_task +) + I,J,V = get_allocations(a) + test_ids = partition(axes(a,1)) + trial_ids = partition(axes(a,2)) + + # convert I and J to global dof ids + map(map_local_to_global!,I,test_ids) + map(map_local_to_global!,J,trial_ids) + + # Overlapped COO communication and vector assembly + rows = filter_and_replace_ghost(map(unpermute,test_ids),I) + t = PartitionedArrays.assemble_coo!(I,J,V,rows) + b = callback(rows) + wait(t) + + # Overlap rhs communications with CSC compression + t2 = 
async_callback(b) + cols = filter_and_replace_ghost(map(unpermute,trial_ids),J) + + map(map_global_to_local!,I,rows) + map(map_global_to_local!,J,cols) + + assembled = true + a_sys = change_axes(a,(PRange(rows),PRange(cols))) + values = map(create_from_nz,local_views(a_sys)) + A = PSparseMatrix(values,rows,cols,assembled) + + wait(t2) + return A, b +end + +function create_from_nz_subassembled( + a, + callback::Function = rows -> nothing, +) + rows = partition(axes(a,1)) + cols = partition(axes(a,2)) + + b = callback(rows) + + assembled = false + values = map(create_from_nz,local_views(a)) + A = PSparseMatrix(values,rows,cols,assembled) + + return A, b +end diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl index 41e3f6d..942a7c2 100644 --- a/src/BlockPartitionedArrays.jl +++ b/src/BlockPartitionedArrays.jl @@ -271,8 +271,8 @@ function PartitionedArrays.partition(a::BlockPArray) return map(mortar,vals) end -function PartitionedArrays.to_trivial_partition(a::BlockPArray) - vals = map(PartitionedArrays.to_trivial_partition,blocks(a)) +function PartitionedArrays.centralize(a::BlockPArray) + vals = map(PartitionedArrays.centralize,blocks(a)) return mortar(vals) end diff --git a/src/FESpaces.jl b/src/FESpaces.jl index 319e218..f68b747 100644 --- a/src/FESpaces.jl +++ b/src/FESpaces.jl @@ -56,36 +56,25 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va return free_values, dirichlet_values end -function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange) - map(cell_wise_vector, - dof_wise_vector, - cell_to_ldofs, - partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices +function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) + map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = cwv.ptrs - data = cwv.data - cell_own_to_local = own_to_local(indices) - for cell in cell_own_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) - p = ptrs[cell]-1 + p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) if ldof > 0 - data[i+p] = dwv[ldof] + cwv.data[i+p] = dwv[ldof] end end end end end -function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range) - map(dof_wise_vector, - cell_wise_vector, - cell_to_ldofs, - partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices +function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) + map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) - for cell in cell_ghost_to_local + for cell in cell_ids ldofs = getindex!(cache,cell_to_ldofs,cell) p = cwv.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) @@ -97,28 +86,19 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c end end -function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange) - cwv=map(cell_to_ldofs) do cell_to_ldofs - cache = array_cache(cell_to_ldofs) - ncells = length(cell_to_ldofs) - ptrs = Vector{Int32}(undef,ncells+1) - for cell in 1:ncells - ldofs = getindex!(cache,cell_to_ldofs,cell) - ptrs[cell+1] = length(ldofs) - end - PArrays.length_to_ptrs!(ptrs) - ndata = ptrs[end]-1 - data = Vector{Int}(undef,ndata) - data .= -1 - JaggedArray(data,ptrs) - end - dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange) - 
cwv +function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_ids) + cwv = map(cell_to_ldofs) do cell_to_ldofs + ptrs = generate_ptrs(cell_to_ldofs) + data = fill(-1,ptrs[end]-1) + JaggedArray(data,ptrs) + end + dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids) + return cwv end function fetch_vector_ghost_values_cache(vector_partition,partition) cache = PArrays.p_vector_cache(vector_partition,partition) - map(reverse,cache) + reverse(cache) end function fetch_vector_ghost_values!(vector_partition,cache) @@ -128,13 +108,15 @@ end function generate_gids( cell_range::PRange, cell_to_ldofs::AbstractArray{<:AbstractArray}, - nldofs::AbstractArray{<:Integer}) + nldofs::AbstractArray{<:Integer} +) + ranks = linear_indices(partition(cell_range)) # Find and count number owned dofs ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs ldof_to_owner = fill(Int32(0),nldofs) - cache = array_cache(cell_to_ldofs) lcell_to_owner = local_to_owner(indices) + cache = array_cache(cell_to_ldofs) for cell in 1:length(cell_to_ldofs) owner = lcell_to_owner[cell] ldofs = getindex!(cache,cell_to_ldofs,cell) @@ -151,102 +133,53 @@ function generate_gids( ldof_to_owner, nodofs end |> tuple_of_arrays - cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner, - cell_to_ldofs, - cell_range) - - # Note1 : this call potentially updates cell_prange with the - # info required to exchange info among nearest neighbours - # so that it can be re-used in the future for other exchanges - - # Note2 : we need to call reverse() as the senders and receivers are - # swapped in the AssemblyCache of partition(cell_range) - - # Exchange the dof owners - cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range)) - fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_owner, - cell_ldofs_to_part, - cell_to_ldofs, - cell_range) - # Find the global range of owned dofs first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs))) + + # Exchange the dof owners. Cell owner always has correct dof owner. 
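+  # The exchange rides on consistent!: packing the cell-wise data into a PVector
+  # over partition(cell_range) pulls each ghost cell's entries from that cell's
+  # owner, replacing the old fetch_vector_ghost_values! reversed-cache mechanism.
+  # Only owned cells are packed (own_to_local) and only ghost cells are read back
+  # (ghost_to_local), since owned entries are already authoritative.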
+ cell_ldofs_to_owner = dof_wise_to_cell_wise(ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_ldofs_to_owner,partition(cell_range))) |> wait + cell_wise_to_dof_wise!(ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)) - # Distribute gdofs to owned ones - ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices - me = part_id(indices) + # Fill owned gids + ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner + ldof_to_gdof = fill(0,length(ldof_to_owner)) offset = first_gdof-1 - ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner)) odof = 0 for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me + if owner == rank odof += 1 - ldof_to_gdof[ldof] = odof - else - ldof_to_gdof[ldof] = 0 - end - end - for (ldof,owner) in enumerate(ldof_to_owner) - if owner == me - ldof_to_gdof[ldof] += offset + ldof_to_gdof[ldof] = odof + offset end end ldof_to_gdof end - # Create cell-wise global dofs - cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof, - cell_to_ldofs, - cell_range) - - # Exchange the global dofs - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait + # Exchange gids + cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)) + consistent!(PVector(cell_to_gdofs,partition(cell_range))) |> wait - # Distribute global dof ids also to ghost + # Fill ghost gids with exchanged information map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices - gdof = 0 cache = array_cache(cell_to_ldofs) - cell_ghost_to_local = ghost_to_local(indices) cell_local_to_owner = local_to_owner(indices) - for cell in cell_ghost_to_local + for cell in ghost_to_local(indices) ldofs = getindex!(cache,cell_to_ldofs,cell) + cell_owner = cell_local_to_owner[cell] p = cell_to_gdofs.ptrs[cell]-1 for (i,ldof) in enumerate(ldofs) - if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell] + if ldof > 0 && isequal(ldof_to_owner[ldof],cell_owner) ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p] end end end end - dof_wise_to_cell_wise!(cell_to_gdofs, - ldof_to_gdof, - cell_to_ldofs, - cell_range) - - fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait - - cell_wise_to_dof_wise!(ldof_to_gdof, - cell_to_gdofs, - cell_to_ldofs, - cell_range) - - # Setup DoFs LocalIndices - ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs))) - local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs, - indices, - ldof_to_gdof, - ldof_to_owner - me = part_id(indices) - LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner) - end - - # Setup dof range - dofs_range = PRange(local_indices) - - return dofs_range + # Setup DoFs AbstractLocalIndices + dof_ids = permuted_variable_partition( + nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof + ) + return PRange(dof_ids) end # FEFunction related @@ -571,193 +504,6 @@ function _add_distributed_constraint(F::DistributedFESpace,order::Integer,constr V end -# Assembly - -function FESpaces.collect_cell_matrix( - trial::DistributedFESpace, - test::DistributedFESpace, - a::DistributedDomainContribution) - map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace, a::DistributedDomainContribution) - map(collect_cell_vector,local_views(test),local_views(a)) -end - -function 
FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform)) -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - biform::DistributedDomainContribution, - liform::DistributedDomainContribution, - uhd) - map(collect_cell_matrix_and_vector, - local_views(trial), - local_views(test), - local_views(biform), - local_views(liform), - local_views(uhd)) -end - -function FESpaces.collect_cell_vector( - test::DistributedFESpace,l::Number) - map(local_views(test)) do s - collect_cell_vector(s,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number) - map(local_views(trial),local_views(test),local_views(mat)) do u,v,m - collect_cell_matrix_and_vector(u,v,m,l) - end -end - -function FESpaces.collect_cell_matrix_and_vector( - trial::DistributedFESpace, - test::DistributedFESpace, - mat::DistributedDomainContribution, - l::Number, - uhd) - map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f - collect_cell_matrix_and_vector(u,v,m,l,f) - end -end -""" -""" -struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler - strategy::A - assems::B - matrix_builder::C - vector_builder::D - test_dofs_gids_prange::E - trial_dofs_gids_prange::F -end - -local_views(a::DistributedSparseMatrixAssembler) = a.assems - -FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_dofs_gids_prange -FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_dofs_gids_prange -FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder -FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder -FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy - -function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows = get_rows(a) - cols = get_cols(a) - map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) - rows = get_rows(a) - cols = get_cols(a) - map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) -end - -function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) - rows = get_rows(a) - map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) -end - -function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - Aviews=local_views(A,rows,cols) - bviews=local_views(b,rows) - aviews=local_views(a) - map(symbolic_loop_matrix_and_vector!,Aviews,bviews,aviews,data) -end - -function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) - rows = get_rows(a) - cols = get_cols(a) - map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) -end - -# Parallel Assembly strategies - -function local_assembly_strategy(::SubAssembledRows,rows,cols) 
-  DefaultAssemblyStrategy()
-end
-
-# When using this one, make sure that you also loop over ghost cells.
-# This is at your own risk.
-function local_assembly_strategy(::FullyAssembledRows,rows,cols)
-  rows_local_to_ghost = local_to_ghost(rows)
-  GenericAssemblyStrategy(
-    identity,
-    identity,
-    row->rows_local_to_ghost[row]==0,
-    col->true
-  )
-end
-
-# Assembler high level constructors
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  rows::PRange,
-  cols::PRange,
-  par_strategy=SubAssembledRows()
-)
-  assems = map(partition(rows),partition(cols)) do rows,cols
-    local_strategy = local_assembly_strategy(par_strategy,rows,cols)
-    FESpaces.GenericSparseMatrixAssembler(
-      SparseMatrixBuilder(local_mat_type),
-      ArrayBuilder(local_vec_type),
-      Base.OneTo(length(rows)),
-      Base.OneTo(length(cols)),
-      local_strategy
-    )
-  end
-  mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
-  vec_builder = PVectorBuilder(local_vec_type,par_strategy)
-  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
-)
-  rows = get_free_dof_ids(test)
-  cols = get_free_dof_ids(trial)
-  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
-)
-  Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial)))
-  T = eltype(Tv)
-  Tm = SparseMatrixCSC{T,Int}
-  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy)
-end
-
 # ZeroMean FESpace
 
 struct DistributedZeroMeanCache{A,B}
   dvol::A
@@ -917,3 +663,174 @@ function FESpaces.ConstantFESpace(
   vector_type = _find_vector_type(spaces,gids)
   return DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
 end
+
+# Assembly
+
+"""
+    DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+
+Assembler for distributed systems. It holds one local assembler per part
+(`assems`), the distributed matrix and vector builders, the parallel assembly
+`strategy`, and the test/trial dof id `PRange`s.
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type),
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+) + rows = get_free_dof_ids(test) + cols = get_free_dof_ids(trial) + SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...) +end + +function FESpaces.SparseMatrixAssembler( + trial::DistributedFESpace, + test::DistributedFESpace, + par_strategy::FESpaces.AssemblyStrategy=Assembled(); + kwargs... +) + Tv = getany(map(get_vector_type,local_views(trial))) + Tm = SparseMatrixCSC{eltype(Tv),Int} + SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...) +end + +function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata) +end + +function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata) + rows = get_rows(a) + map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata) +end + +function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data) + rows, cols = get_rows(a), get_cols(a) + map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data) +end + +function FESpaces.collect_cell_matrix( + trial::DistributedFESpace, + test::DistributedFESpace, + a::DistributedDomainContribution +) + map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace, a::DistributedDomainContribution +) + map(collect_cell_vector,local_views(test),local_views(a)) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform) + ) +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + biform::DistributedDomainContribution, + liform::DistributedDomainContribution, + uhd +) + map(collect_cell_matrix_and_vector, + local_views(trial), + local_views(test), + local_views(biform), + local_views(liform), + local_views(uhd) + ) +end + +function FESpaces.collect_cell_vector( + test::DistributedFESpace,l::Number +) + map(local_views(test)) do s + collect_cell_vector(s,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number +) + map(local_views(trial),local_views(test),local_views(mat)) do u,v,m + collect_cell_matrix_and_vector(u,v,m,l) + end +end + +function FESpaces.collect_cell_matrix_and_vector( + trial::DistributedFESpace, + test::DistributedFESpace, + mat::DistributedDomainContribution, + l::Number, + uhd +) + map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do 
u,v,m,f
+    collect_cell_matrix_and_vector(u,v,m,l,f)
+  end
+end
diff --git a/src/Geometry.jl b/src/Geometry.jl
index 57ec00a..f9ae43c 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -633,7 +633,7 @@ function filter_cells_when_needed(
 end
 
 function filter_cells_when_needed(
-  portion::FullyAssembledRows,
+  portion::LocallyAssembled,
   cell_gids::AbstractLocalIndices,
   trian::Triangulation)
 
@@ -641,7 +641,7 @@ function filter_cells_when_needed(
 end
 
 function filter_cells_when_needed(
-  portion::SubAssembledRows,
+  portion::Assembled,
   cell_gids::AbstractLocalIndices,
   trian::Triangulation)
 
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index 3e1832f..ed67a55 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -32,8 +32,8 @@ import Base: inv, abs, abs2, *, +, -, /, adjoint, transpose, real, imag, conj, g
 import Gridap.Fields: grad2curl
 import Gridap.CellData: Interpolable
 
-export FullyAssembledRows
-export SubAssembledRows
+export LocallyAssembled
+export Assembled
 
 export get_cell_gids
 export get_face_gids
@@ -45,8 +45,13 @@ export redistribute
 
 include("BlockPartitionedArrays.jl")
 
+include("GridapExtras.jl")
+include("PArraysExtras.jl")
+
 include("Algebra.jl")
 
+include("Assembly.jl")
+
 include("Geometry.jl")
 
 include("CellData.jl")
diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl
new file mode 100644
index 0000000..e072318
--- /dev/null
+++ b/src/GridapExtras.jl
@@ -0,0 +1,68 @@
+
+# Extensions to Gridap/Arrays/Tables.jl
+
+function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T
+  ptrs = Vector{Int32}(undef,length(vv)+1)
+  Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv)
+  Arrays.length_to_ptrs!(ptrs)
+  return ptrs
+end
+
+# New type of Vector allocation
+
+"""
+    TrackedArrayAllocation{T}
+
+Array that keeps track of which entries have been touched, for assembly purposes.
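+
+A minimal usage sketch (relying on the `add_entry!`/`create_from_nz` methods
+defined below):
+
+    a = TrackedArrayAllocation(zeros(4))
+    Arrays.add_entry!(+,a,1.0,2)       # accumulates into values[2] and marks it touched
+    Arrays.add_entry!(+,a,nothing,3)   # only marks values[3] as touched
+    Algebra.create_from_nz(a)          # returns the values vector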
+""" +struct TrackedArrayAllocation{T} + values :: Vector{T} + touched :: Vector{Bool} +end + +function TrackedArrayAllocation(values) + touched = fill(false,length(values)) + TrackedArrayAllocation(values,touched) +end + +Algebra.LoopStyle(::Type{<:TrackedArrayAllocation}) = Algebra.Loop() +Algebra.create_from_nz(a::TrackedArrayAllocation) = a.values + +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + if i > 0 + a.touched[i] = true + end + nothing +end +@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v,i) + if i > 0 + a.touched[i] = true + a.values[i] = combine(v,a.values[i]) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i) + for ie in i + Arrays.add_entry!(combine,a,nothing,ie) + end + nothing +end +@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v,i) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(combine,a,ve,ie) + end + nothing +end + +# change_axes + +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end diff --git a/src/MultiField.jl b/src/MultiField.jl index 5fa989c..b0129ba 100644 --- a/src/MultiField.jl +++ b/src/MultiField.jl @@ -454,7 +454,7 @@ function FESpaces.SparseMatrixAssembler( local_vec_type, trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}}, - par_strategy=SubAssembledRows()) where {NB,SB,P} + par_strategy=Assembled()) where {NB,SB,P} block_idx = CartesianIndices((NB,NB)) block_rows = blocks(test.gids) diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl new file mode 100644 index 0000000..302944a --- /dev/null +++ b/src/PArraysExtras.jl @@ -0,0 +1,400 @@ + +""" + permuted_variable_partition(n_own,gids,owners; n_global, start) + +Create indices which are a permuted version of a variable_partition. +The advantage of this w.r.t. the `LocalIndices` type, is that we can compute +dof owners with minimal communications, since we only need the size of the blocks. + +NOTE: This is the type for our FESpace dof_ids. 
+""" +function permuted_variable_partition( + n_own::AbstractArray{<:Integer}, + gids::AbstractArray{<:AbstractArray{<:Integer}}, + owners::AbstractArray{<:AbstractArray{<:Integer}}; + n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))), + start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own))) +) + ranks = linear_indices(n_own) + np = length(ranks) + map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners + n_local = length(gids) + n_ghost = n_local - n_own + perm = fill(zero(Int32),n_local) + ghost_gids = fill(zero(Int),n_ghost) + ghost_owners = fill(zero(Int32),n_ghost) + + n_ghost = 0 + for (lid,(gid,owner)) in enumerate(zip(gids,owners)) + if owner == rank + perm[lid] = gid-start+1 + else + n_ghost += 1 + ghost_gids[n_ghost] = gid + ghost_owners[n_ghost] = owner + perm[lid] = n_own + n_ghost + end + end + @assert n_ghost == n_local - n_own + + ghost = GhostIndices(n_global,ghost_gids,ghost_owners) + dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize( + CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost + ) + permute_indices(dof_ids,perm) + end +end + +""" + unpermute(indices::AbstractLocalIndices) + +Given local indices, reorders them to (locally) have own indices first, +followed by ghost indices. +""" +function unpermute(indices::AbstractLocalIndices) + @notimplemented +end + +unpermute(indices::PermutedLocalIndices) = unpermute(indices.indices) +unpermute(indices::PartitionedArrays.LocalIndicesInBlockPartition) = indices +unpermute(indices::OwnAndGhostIndices) = indices + +function unpermute(indices::LocalIndices) + nglobal = global_length(indices) + rank = part_id(indices) + own = OwnIndices(nglobal,rank,own_to_global(indices)) + ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices)) + OwnAndGhostIndices(own,ghost,global_to_owner(indices)) +end + +""" + locally_repartition(v::PVector,new_indices) + +Map the values of a PVector to a new partitioning of the indices. + +Similar to `PartitionedArrays.repartition`, but without any communications. Instead, +it is assumed that the local-to-local mapping can be done locally. +""" +function locally_repartition(v::PVector,new_indices) + w = similar(v,PRange(new_indices)) + locally_repartition!(w,v) +end + +function locally_repartition!(w::PVector,v::PVector) + # Fill own values + map(copy!,own_values(w),own_values(v)) + + # Fill ghost values + new_indices = partition(axes(w,1)) + old_indices = partition(axes(v,1)) + map(partition(w),partition(v),new_indices,old_indices) do w,v,new_ids,old_ids + old_gid_to_lid = global_to_local(old_ids) + for (lid,gid) in zip(ghost_to_local(new_ids),ghost_to_global(new_ids)) + old_lid = old_gid_to_lid[gid] + w[lid] = v[old_lid] + end + end + + return w +end + +""" + filter_and_replace_ghost(indices,gids) + +Replace ghost ids in `indices` with the ghost ids within `gids`. + +NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether +they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we +would need. TODO: Make me better. +""" +function filter_and_replace_ghost(indices,gids) + owners = find_owner(indices,gids) + new_indices = map(indices,gids,owners) do indices, gids, owners + ghost_gids, ghost_owners = _filter_ghost(indices,gids,owners) + replace_ghost(indices, ghost_gids, ghost_owners) + end + return new_indices +end + +# Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that +# belong to `indices`. 
This could eventually be a flag in the original function. +function _filter_ghost(indices,gids,owners) + ghosts = Set{Int}() + part_owner = part_id(indices) + + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + push!(ghosts,gid) + end + end + + new_ghost_to_global = zeros(Int,n_ghost) + new_ghost_to_owner = zeros(Int32,n_ghost) + + empty!(ghosts) + n_ghost = 0 + for (gid,owner) in zip(gids,owners) + if gid < 1 + continue + end + if (owner != part_owner) && !(gid in ghosts) + n_ghost += 1 + new_ghost_to_global[n_ghost] = gid + new_ghost_to_owner[n_ghost] = owner + push!(ghosts,gid) + end + end + + return new_ghost_to_global, new_ghost_to_owner +end + +# function PArrays.remove_ghost(indices::PermutedLocalIndices) +# n_global = global_length(indices) +# own = OwnIndices(n_global,part_id(indices),own_to_global(indices)) +# ghost = GhostIndices(n_global,Int[],Int32[]) +# OwnAndGhostIndices(own,ghost) +# end + +function PArrays.remove_ghost(indices::PermutedLocalIndices) + perm = indices.perm + own_to_local = indices.own_to_local + display(own_to_local) + println(issorted(own_to_local)) + display(perm) + println("-------------------------------") + remove_ghost(indices.indices) +end + +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. +# Note that the haskey check is necessary because in the general case +# there might be gids in b which are not present in a +function local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) + a_lid_to_b_lid = fill(zero(Int32),local_length(a)) + + b_local_to_global = local_to_global(b) + a_global_to_local = global_to_local(a) + for b_lid in 1:local_length(b) + gid = b_local_to_global[b_lid] + a_lid = a_global_to_local[gid] + if !iszero(a_lid) + a_lid_to_b_lid[a_lid] = b_lid + end + end + a_lid_to_b_lid +end + +# SubSparseMatrix extensions + +function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix) + I,J,V = findnz(A.parent) + rowmap, colmap = A.inv_indices + for k in eachindex(I) + I[k] = rowmap[I[k]] + J[k] = colmap[J[k]] + end + mask = map((i,j) -> (i > 0 && j > 0), I, J) + return I[mask], J[mask], V[mask] +end + +# Async tasks + +const empty_async_task = PartitionedArrays.FakeTask(() -> nothing) + +# Linear algebra + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1))) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... 
+ @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# Array of PArrays -> PArray of Arrays +# TODO: I think this is now implemented in PartitionedArrays.jl (check) +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# To local/to global for blocks + +function to_local_indices!(I,ids::PRange;kwargs...) + map(to_local!,I,partition(ids)) +end + +function to_global_indices!(I,ids::PRange;kwargs...) + map(to_global!,I,partition(ids)) +end +for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] + @eval begin + function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) + map($f,I,ids) + end + + function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) + @check ax ∈ [:rows,:cols] + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if ax == :rows + $f(I[i,j],ids[i]) + else + $f(I[i,j],ids[j]) + end + end + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +# Communication extras, subpartitioning extras + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end diff --git a/src/_Algebra_old.jl b/src/_Algebra_old.jl new file mode 100644 index 0000000..e5aefd6 --- /dev/null +++ b/src/_Algebra_old.jl @@ -0,0 +1,1238 @@ + +# Vector allocation + +function Algebra.allocate_vector(::Type{<:PVector{V}},ids::PRange) where {V} + PVector{V}(undef,partition(ids)) +end + +function Algebra.allocate_vector(::Type{<:BlockPVector{V}},ids::BlockPRange) where {V} + BlockPVector{V}(undef,ids) +end + +function Algebra.allocate_in_range(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(PVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::PSparseMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(PVector{V},matrix) +end + +function Algebra.allocate_in_range(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_range(BlockPVector{V},matrix) +end + +function Algebra.allocate_in_domain(matrix::BlockPMatrix) + V = Vector{eltype(matrix)} + allocate_in_domain(BlockPVector{V},matrix) +end + +# PartitionedArrays extras + +function LinearAlgebra.axpy!(α,x::PVector,y::PVector) + @check partition(axes(x,1)) === partition(axes(y,1)) + map(partition(x),partition(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + consistent!(y) |> wait + return y +end + +function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector) + map(blocks(x),blocks(y)) do x,y + LinearAlgebra.axpy!(α,x,y) + end + return y +end + +function Algebra.axpy_entries!( + α::Number, A::PSparseMatrix, B::PSparseMatrix; + check::Bool=true +) +# We should definitely check here that the index partitions are the same. +# However: Because the different matrices are assembled separately, the objects are not the +# same (i.e can't use ===). Checking the index partitions would then be costly... + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1)))) + @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2)))) + map(partition(A),partition(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +function Algebra.axpy_entries!( + α::Number, A::BlockPMatrix, B::BlockPMatrix; + check::Bool=true +) + map(blocks(A),blocks(B)) do A, B + Algebra.axpy_entries!(α,A,B;check) + end + return B +end + +# TODO: Can we remove this? 
+# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.ArrayCounter,axes) + @notimplemented +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A} + b = Algebra.CounterCOO{T}(axes) + b.nnz = a.nnz + b +end + +# This might go to Gridap in the future. We keep it here for the moment. +function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A} + counter = change_axes(a.counter,axes) + Algebra.AllocationCOO(counter,a.I,a.J,a.V) +end + +# Array of PArrays -> PArray of Arrays +function to_parray_of_arrays(a::AbstractArray{<:MPIArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + PartitionedArrays.getany(aj) + end + end +end + +function to_parray_of_arrays(a::AbstractArray{<:DebugArray}) + indices = linear_indices(first(a)) + map(indices) do i + map(a) do aj + aj.items[i] + end + end +end + +# This type is required because MPIArray from PArrays +# cannot be instantiated with a NULL communicator +struct MPIVoidVector{T} <: AbstractVector{T} + comm::MPI.Comm + function MPIVoidVector(::Type{T}) where {T} + new{T}(MPI.COMM_NULL) + end +end + +Base.size(a::MPIVoidVector) = (0,) +Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear() +function Base.getindex(a::MPIVoidVector,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.setindex!(a::MPIVoidVector,v,i::Int) + error("Indexing of MPIVoidVector not possible.") +end +function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector) + println(io,"MPIVoidVector") +end + +function num_parts(comm::MPI.Comm) + if comm != MPI.COMM_NULL + nparts = MPI.Comm_size(comm) + else + nparts = -1 + end + nparts +end +@inline num_parts(comm::MPIArray) = num_parts(comm.comm) +@inline num_parts(comm::DebugArray) = length(comm.items) +@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm) + +function get_part_id(comm::MPI.Comm) + if comm != MPI.COMM_NULL + id = MPI.Comm_rank(comm)+1 + else + id = -1 + end + id +end +@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm) +@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm) + +""" + i_am_in(comm::MPIArray) + i_am_in(comm::DebugArray) + + Returns `true` if the processor is part of the subcommunicator `comm`. 
+""" +function i_am_in(comm::MPI.Comm) + get_part_id(comm) >=0 +end +@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm) +@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm) +@inline i_am_in(comm::DebugArray) = true + +function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing) + x_new = map(new_parts) do p + if isa(x,MPIArray) + PartitionedArrays.getany(x) + elseif isa(x,DebugArray) && (p <= length(x.items)) + x.items[p] + else + default + end + end + return x_new +end + +function generate_subparts(parts::MPIArray,new_comm_size) + root_comm = parts.comm + root_size = MPI.Comm_size(root_comm) + rank = MPI.Comm_rank(root_comm) + + @static if isdefined(MPI,:MPI_UNDEFINED) + mpi_undefined = MPI.MPI_UNDEFINED[] + else + mpi_undefined = MPI.API.MPI_UNDEFINED[] + end + + if root_size == new_comm_size + return parts + else + if rank < new_comm_size + comm = MPI.Comm_split(root_comm,0,0) + return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false) + else + comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined) + return MPIVoidVector(eltype(parts)) + end + end +end + +function generate_subparts(parts::DebugArray,new_comm_size) + DebugArray(LinearIndices((new_comm_size,))) +end + +# local_views + +function local_views(a) + @abstractmethod +end + +function get_parts(a) + return linear_indices(local_views(a)) +end + +function local_views(a::AbstractVector,rows) + @notimplemented +end + +function local_views(a::AbstractMatrix,rows,cols) + @notimplemented +end + +local_views(a::AbstractArray) = a +local_views(a::PRange) = partition(a) +local_views(a::PVector) = partition(a) +local_views(a::PSparseMatrix) = partition(a) + +function local_views(a::BlockPRange) + map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays +end + +function local_views(a::BlockPArray) + vals = map(blocks(a)) do a + local_views(a) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# change_ghost + +function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T + same_partition = (a.index_partition === partition(ids)) + a_new = same_partition ? a : change_ghost(T,a,ids) + if make_consistent && (!same_partition || !is_consistent) + consistent!(a_new) |> wait + end + return a_new +end + +function change_ghost(::Type{<:AbstractVector},a::PVector,ids::PRange) + a_new = similar(a,eltype(a),(ids,)) + # Equivalent to copy!(a_new,a) but does not check that owned indices match + map(copy!,own_values(a_new),own_values(a)) + return a_new +end + +function change_ghost(::Type{<:OwnAndGhostVectors},a::PVector,ids::PRange) + values = map(own_values(a),partition(ids)) do own_vals,ids + ghost_vals = fill(zero(eltype(a)),ghost_length(ids)) + perm = PartitionedArrays.local_permutation(ids) + OwnAndGhostVectors(own_vals,ghost_vals,perm) + end + return PVector(values,partition(ids)) +end + +function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_consistent=false) + vals = map(blocks(a),blocks(ids)) do a, ids + change_ghost(a,ids;is_consistent=is_consistent,make_consistent=make_consistent) + end + return BlockPVector(vals,ids) +end + +# TODO: Can we remove this? +# This function computes a mapping among the local identifiers of a and b +# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case +# there might be gids in b which are not present in a +function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices) + a_local_to_b_local = fill(Int32(-1),local_length(a)) + b_local_to_global = local_to_global(b) + a_global_to_local = global_to_local(a) + for blid in 1:local_length(b) + gid = b_local_to_global[blid] + if a_global_to_local[gid] != zero(eltype(a_global_to_local)) + alid = a_global_to_local[gid] + a_local_to_b_local[alid] = blid + end + end + a_local_to_b_local +end + +# TODO: Can we remove this? +# This type is required in order to be able to access the local portion +# of distributed sparse matrices and vectors using local indices from the +# distributed test and trial spaces +struct LocalView{T,N,A,B} <:AbstractArray{T,N} + plids_to_value::A + d_to_lid_to_plid::B + local_size::NTuple{N,Int} + function LocalView( + plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N} + A = typeof(plids_to_value) + B = typeof(d_to_lid_to_plid) + local_size = map(length,d_to_lid_to_plid) + new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size) + end +end + +Base.size(a::LocalView) = a.local_size +Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian() +function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + if all(i->i>0,plids) + a.plids_to_value[plids...] + else + zero(T) + end +end +function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N} + plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid) + @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion" + a.plids_to_value[plids...] = v +end + +function _lid_to_plid(lid,lid_to_plid) + plid = lid_to_plid[lid] + plid +end + +function local_views(a::PVector,new_rows::PRange) + old_rows = axes(a,1) + if partition(old_rows) === partition(new_rows) + partition(a) + else + map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows + LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),)) + end + end +end + +function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange) + old_rows, old_cols = axes(a) + if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) ) + partition(a) + else + map(partition(a), + partition(old_rows),partition(old_cols), + partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols + rl2lmap = find_local_to_local_map(new_rows,old_rows) + cl2lmap = find_local_to_local_map(new_cols,old_cols) + LocalView(matrix_partition,(rl2lmap,cl2lmap)) + end + end +end + +function local_views(a::BlockPVector,new_rows::BlockPRange) + vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays + return map(mortar,vals) +end + +function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange) + vals = map(CartesianIndices(blocksize(a))) do I + local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I]) + end |> to_parray_of_arrays + return map(mortar,vals) +end + +# PSparseMatrix assembly + +struct Assembled <: FESpaces.AssemblyStrategy end +struct SubAssembled <: FESpaces.AssemblyStrategy end +struct LocallyAssembled <: FESpaces.AssemblyStrategy end + +struct PSparseMatrixBuilderCOO{T,B} + local_matrix_type::Type{T} + par_strategy::B +end + +function Algebra.nz_counter( + builder::PSparseMatrixBuilderCOO{A}, 
axs::Tuple{<:PRange,<:PRange} +) where A + rows, cols = axs # test ids, trial ids + counters = map(partition(rows),partition(cols)) do r,c + axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c))) + Algebra.CounterCOO{A}(axs) + end + DistributedCounterCOO(builder.par_strategy,counters,rows,cols) +end + +function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv + T = eltype(Tv) + return PSparseMatrix{T} +end + + +""" +""" +struct DistributedCounterCOO{A,B,C,D} <: GridapType + par_strategy::A + counters ::B + test_gids ::C + trial_gids ::D + function DistributedCounterCOO( + par_strategy, + counters::AbstractArray{<:Algebra.CounterCOO}, + test_gids::PRange, + trial_gids::PRange + ) + A = typeof(par_strategy) + B = typeof(counters) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids) + end +end + +function local_views(a::DistributedCounterCOO) + return a.counters +end + +function local_views(a::DistributedCounterCOO,test_gids,trial_gids) + @check PArrays.matching_local_indices(test_gids,a.test_gids) + @check PArrays.matching_local_indices(trial_gids,a.trial_gids) + return a.counters +end + +function Algebra.nz_allocation(a::DistributedCounterCOO) + allocs = map(nz_allocation,a.counters) # Array{AllocationCOO} + DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids) +end + +struct DistributedAllocationCOO{A,B,C,D} <: GridapType + par_strategy::A + allocs::B + test_gids::C + trial_gids::D + function DistributedAllocationCOO( + par_strategy, + allocs::AbstractArray{<:Algebra.AllocationCOO}, + test_gids::PRange, + trial_gids::PRange) + A = typeof(par_strategy) + B = typeof(allocs) + C = typeof(test_gids) + D = typeof(trial_gids) + new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids) + end +end + +function change_axes( + a::DistributedAllocationCOO{A,B,<:PRange,<:PRange}, + axes::Tuple{<:PRange,<:PRange} +) where {A,B} + local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols + (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols))) + end + allocs = map(change_axes,a.allocs,local_axes) + DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2]) +end + +function change_axes( + a::MatrixBlock{<:DistributedAllocationCOO}, + axes::Tuple{<:Vector,<:Vector} +) + block_ids = CartesianIndices(a.array) + rows, cols = axes + array = map(block_ids) do I + change_axes(a[I],(rows[I[1]],cols[I[2]])) + end + return ArrayBlock(array,a.touched) +end + +function local_views(a::DistributedAllocationCOO) + a.allocs +end + +function local_views(a::DistributedAllocationCOO,test_gids,trial_gids) + @check test_gids === a.test_gids + @check trial_gids === a.trial_gids + a.allocs +end + +function local_views(a::MatrixBlock{<:DistributedAllocationCOO}) + array = map(local_views,a.array) |> to_parray_of_arrays + return map(ai -> ArrayBlock(ai,a.touched),array) +end + +function get_allocations(a::DistributedAllocationCOO) + I,J,V = map(local_views(a)) do alloc + alloc.I, alloc.J, alloc.V + end |> tuple_of_arrays + return I,J,V +end + +function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO}) + tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays + return tuple_of_array_of_parrays +end + +get_test_gids(a::DistributedAllocationCOO) = a.test_gids +get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids +get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_test_gids,diag(a.array)) +get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = 
map(get_trial_gids,diag(a.array)) + +function Algebra.create_from_nz(a::PSparseMatrix) + # For LocallyAssembled the underlying Exchanger should + # not have ghost layer making assemble! do nothing (TODO check) + assemble!(a) |> wait + a +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:LocallyAssembled}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:LocallyAssembled}}) + f(x) = nothing + A, = _fa_create_from_nz_with_callback(f,a) + return A +end + +function _fa_create_from_nz_with_callback(callback,a) + + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + rows = _setup_prange(test_gids,I;ghost=false,ax=:rows) + b = callback(rows) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the range for cols + cols = _setup_prange(trial_gids,J;ax=:cols) + + # Convert again I,J to local numeration + to_local_indices!(I,rows;ax=:rows) + to_local_indices!(J,cols;ax=:cols) + + A = _setup_matrix(I,J,V,rows,cols) + return A, b +end + +function Algebra.create_from_nz(a::DistributedAllocationCOO{<:Assembled}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:Assembled}}) + f(x) = nothing + A, = _sa_create_from_nz_with_callback(f,f,a,nothing) + return A +end + +function _sa_create_from_nz_with_callback(callback,async_callback,a,b) + # Recover some data + I,J,V = get_allocations(a) + test_gids = get_test_gids(a) + trial_gids = get_trial_gids(a) + + # convert I and J to global dof ids + to_global_indices!(I,test_gids;ax=:rows) + to_global_indices!(J,trial_gids;ax=:cols) + + # Create the Prange for the rows + rows = _setup_prange(test_gids,I;ax=:rows) + + # Move (I,J,V) triplets to the owner process of each row I. 
+  # The triplets are accompanied with Jo, which holds the owner process of each column
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  t = _assemble_coo!(I,J,V,rows;owners=Jo)
+
+  # Here we can overlap computations. This is a good place to do so,
+  # since sending the matrix rows involves a lot of data.
+  if !isa(b,Nothing)
+    bprange = _setup_prange_from_pvector_allocation(b)
+    b = callback(bprange)
+  end
+
+  # Wait for the transfer to finish
+  wait(t)
+
+  # Create the PRange for the cols
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+
+  # Convert I,J back to local numbering
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(a,I,J,V,rows,cols)
+
+  # Wait for the transfer to finish
+  if !isa(t2,Nothing)
+    wait(t2)
+  end
+
+  return A, b
+end
+
+# PVector assembly
+
+struct PVectorBuilder{T,B}
+  local_vector_type::Type{T}
+  par_strategy::B
+end
+
+function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange})
+  T = builder.local_vector_type
+  rows, = axs
+  counters = map(partition(rows)) do rows
+    axs = (Base.OneTo(local_length(rows)),)
+    nz_counter(ArrayBuilder(T),axs)
+  end
+  PVectorCounter(builder.par_strategy,counters,rows)
+end
+
+function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
+  T = eltype(Tv)
+  return PVector{T}
+end
+
+struct PVectorCounter{A,B,C}
+  par_strategy::A
+  counters::B
+  test_gids::C
+end
+
+Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
+
+function local_views(a::PVectorCounter)
+  a.counters
+end
+
+function local_views(a::PVectorCounter,rows)
+  @check rows === a.test_gids
+  a.counters
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled})
+  dofs = a.test_gids
+  values = map(nz_allocation,a.counters)
+  PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
+end
+
+struct PVectorAllocationTrackOnlyValues{A,B,C}
+  par_strategy::A
+  values::B
+  test_gids::C
+end
+
+function local_views(a::PVectorAllocationTrackOnlyValues)
+  a.values
+end
+
+function local_views(a::PVectorAllocationTrackOnlyValues,rows)
+  @check rows === a.test_gids
+  a.values
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:LocallyAssembled})
+  rows = _setup_prange_without_ghosts(a.test_gids)
+  _rhs_callback(a,rows)
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:Assembled})
+  # This point MUST NEVER be reached.
If reached there is an inconsistency + # in the parallel code in charge of vector assembly + @assert false +end + +function _rhs_callback(row_partitioned_vector_partition,rows) + # The ghost values in row_partitioned_vector_partition are + # aligned with the FESpace but not with the ghost values in the rows of A + b_fespace = PVector(row_partitioned_vector_partition.values, + partition(row_partitioned_vector_partition.test_gids)) + + # This one is aligned with the rows of A + b = similar(b_fespace,eltype(b_fespace),(rows,)) + + # First transfer owned values + b .= b_fespace + + # Now transfer ghost + function transfer_ghost(b,b_fespace,ids,ids_fespace) + num_ghosts_vec = ghost_length(ids) + gho_to_loc_vec = ghost_to_local(ids) + loc_to_glo_vec = local_to_global(ids) + gid_to_lid_fe = global_to_local(ids_fespace) + for ghost_lid_vec in 1:num_ghosts_vec + lid_vec = gho_to_loc_vec[ghost_lid_vec] + gid = loc_to_glo_vec[lid_vec] + lid_fespace = gid_to_lid_fe[gid] + b[lid_vec] = b_fespace[lid_fespace] + end + end + map( + transfer_ghost, + partition(b), + partition(b_fespace), + b.index_partition, + b_fespace.index_partition + ) + + return b +end + +function Algebra.create_from_nz(a::PVector) + assemble!(a) |> wait + return a +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:LocallyAssembled}, + c_fespace::PVectorAllocationTrackOnlyValues{<:LocallyAssembled} +) + + function callback(rows) + _rhs_callback(c_fespace,rows) + end + + A,b = _fa_create_from_nz_with_callback(callback,a) + return A,b +end + +struct PVectorAllocationTrackTouchedAndValues{A,B,C} + allocations::A + values::B + test_gids::C +end + +function Algebra.create_from_nz( + a::DistributedAllocationCOO{<:Assembled}, + b::PVectorAllocationTrackTouchedAndValues +) + + function callback(rows) + _rhs_callback(b,rows) + end + + function async_callback(b) + # now we can assemble contributions + assemble!(b) + end + + A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b) + return A,b +end + +struct ArrayAllocationTrackTouchedAndValues{A} + touched::Vector{Bool} + values::A +end + +Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop() + +function local_views(a::PVectorAllocationTrackTouchedAndValues,rows) + @check PArrays.matching_local_indices(rows, a.test_gids) + a.allocations +end + +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if i > 0 + a.touched[i]=true + if !isnothing(v) + a.values[i]=c(v,a.values[i]) + end + end + nothing +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j) + @notimplemented +end +@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i) + if !isa(v,Nothing) + for (ve,ie) in zip(v,i) + Arrays.add_entry!(c,a,ve,ie) + end + else + for ie in i + Arrays.add_entry!(c,a,nothing,ie) + end + end + nothing +end + + +function _setup_touched_and_allocations_arrays(values) + touched = map(values) do values + fill!(Vector{Bool}(undef,length(values)),false) + end + allocations = map(values,touched) do values,touched + ArrayAllocationTrackTouchedAndValues(touched,values) + end + touched, allocations +end + +function Arrays.nz_allocation( + a::DistributedCounterCOO{<:Assembled}, + b::PVectorCounter{<:Assembled} +) + A = nz_allocation(a) + dofs = b.test_gids + values = map(nz_allocation,b.counters) + 
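+  # Wrap the local values in allocations that also record which entries are
+  # touched. The touched flags are used later to discover the ghost rows when
+  # building the row PRange (see _setup_prange_from_pvector_allocation).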
touched, allocations = _setup_touched_and_allocations_arrays(values) + B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) + return A, B +end + +function Arrays.nz_allocation(a::PVectorCounter{<:Assembled}) + dofs = a.test_gids + values = map(nz_allocation,a.counters) + touched,allocations=_setup_touched_and_allocations_arrays(values) + return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs) +end + +function local_views(a::PVectorAllocationTrackTouchedAndValues) + a.allocations +end + +function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues) + + # Find the ghost rows + allocations = local_views(a.allocations) + indices = partition(a.test_gids) + I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices + dofs_lids_touched = findall(allocation.touched) + loc_to_gho = local_to_ghost(indices) + n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched) + I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids) + cur = 1 + for lid in dofs_lids_touched + dof_lid = loc_to_gho[lid] + if dof_lid != 0 + I_ghost_lids[cur] = dof_lid + cur = cur+1 + end + end + I_ghost_lids + end + + ghost_to_global, ghost_to_owner = map( + find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays + + ngids = length(a.test_gids) + _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner) +end + +function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues) + rows = _setup_prange_from_pvector_allocation(a) + b = _rhs_callback(a,rows) + t2 = assemble!(b) + # Wait the transfer to finish + if t2 !== nothing + wait(t2) + end + return b +end + +# Common Assembly Utilities +function first_gdof_from_ids(ids) + if own_length(ids) == 0 + return 1 + end + lid_to_gid = local_to_global(ids) + owned_to_lid = own_to_local(ids) + return Int(lid_to_gid[first(owned_to_lid)]) +end + +function find_gid_and_owner(ighost_to_jghost,jindices) + jghost_to_local = ghost_to_local(jindices) + jlocal_to_global = local_to_global(jindices) + jlocal_to_owner = local_to_owner(jindices) + ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost)) + + ighost_to_global = jlocal_to_global[ighost_to_jlocal] + ighost_to_owner = jlocal_to_owner[ighost_to_jlocal] + return ighost_to_global, ighost_to_owner +end + +# The given ids are assumed to be a sub-set of the lids +function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer}) + glo_to_loc = global_to_local(a) + loc_to_gho = local_to_ghost(a) + + # First pass: Allocate + i = 0 + ghost_lids_touched = fill(false,ghost_length(a)) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + i += 1 + end + end + gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i) + + # Second pass: fill + i = 1 + fill!(ghost_lids_touched,false) + for gid in gids + lid = glo_to_loc[gid] + ghost_lid = loc_to_gho[lid] + if ghost_lid > 0 && !ghost_lids_touched[ghost_lid] + ghost_lids_touched[ghost_lid] = true + gids_ghost_lid_to_ghost_lid[i] = ghost_lid + i += 1 + end + end + + return gids_ghost_lid_to_ghost_lid +end + +# Find the neighbours of partition1 trying +# to use those in partition2 as a hint +function _find_neighbours!(partition1, partition2) + partition2_snd, partition2_rcv = assembly_neighbors(partition2) + partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv) + return assembly_neighbors(partition1; neighbors=partition2_graph) +end + +# to_global! 
& to_local! analogs, needed for dispatching in block assembly + +function to_local_indices!(I,ids::PRange;kwargs...) + map(to_local!,I,partition(ids)) +end + +function to_global_indices!(I,ids::PRange;kwargs...) + map(to_global!,I,partition(ids)) +end + +function get_gid_owners(I,ids::PRange;kwargs...) + map(I,partition(ids)) do I, indices + glo_to_loc = global_to_local(indices) + loc_to_own = local_to_owner(indices) + map(x->loc_to_own[glo_to_loc[x]], I) + end +end + +for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners] + @eval begin + function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...) + map($f,I,ids) + end + + function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows) + @check ax ∈ [:rows,:cols] + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if ax == :rows + $f(I[i,j],ids[i]) + else + $f(I[i,j],ids[j]) + end + end + end + end +end + + +function _setup_matrix( + a,I,J,V,rows::PRange,cols::PRange +) + _rows = PRange(map(remove_ghost,partition(rows))) + asys = change_axes(a,(_rows,cols)) + values = map(create_from_nz,local_views(asys)) + PSparseMatrix(values,partition(_rows),partition(cols),true) +end + +function _setup_matrix(a,I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange}) + block_ids = CartesianIndices(I) + block_mats = map(block_ids) do id + i = id[1]; j = id[2]; + _setup_matrix(a[i,j],I[i,j],J[i,j],V[i,j],rows[i],cols[j]) + end + return mortar(block_mats) +end + +# _assemble_coo! : local coo triplets + global PRange -> Global coo values + +function _assemble_coo!(I,J,V,rows::PRange;owners=nothing) + if isa(owners,Nothing) + PArrays.assemble_coo!(I,J,V,partition(rows)) + else + assemble_coo_with_column_owner!(I,J,V,partition(rows),owners) + end +end + +function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing) + block_ids = CartesianIndices(I) + map(block_ids) do id + i = id[1]; j = id[2]; + if isa(owners,Nothing) + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i]) + else + _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j]) + end + end +end + +function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown) + """ + Returns three JaggedArrays with the coo triplets + to be sent to the corresponding owner parts in parts_snd + """ + function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner) + global_to_local_row = global_to_local(row_lids) + local_row_to_owner = local_to_owner(row_lids) + owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) )) + ptrs = zeros(Int32,length(parts_snd)+1) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + ptrs[owner_to_i[owner]+1] +=1 + end + end + PArrays.length_to_ptrs!(ptrs) + gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1) + gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1) + jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1) + v_snd_data = zeros(eltype(k_v),ptrs[end]-1) + for k in 1:length(k_gi) + gi = k_gi[k] + li = global_to_local_row[gi] + owner = local_row_to_owner[li] + if owner != part + gj = k_gj[k] + v = k_v[k] + p = ptrs[owner_to_i[owner]] + gi_snd_data[p] = gi + gj_snd_data[p] = gj + jo_snd_data[p] = k_jo[k] + v_snd_data[p] = v + k_v[k] = zero(v) + ptrs[owner_to_i[owner]] += 1 + end + end + PArrays.rewind_ptrs!(ptrs) + gi_snd = JaggedArray(gi_snd_data,ptrs) + gj_snd = JaggedArray(gj_snd_data,ptrs) + jo_snd = JaggedArray(jo_snd_data,ptrs) + v_snd = JaggedArray(v_snd_data,ptrs) + gi_snd, gj_snd, jo_snd, v_snd + 
end + """ + Pushes to coo_entries_with_column_owner the tuples + gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes + """ + function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner + current_n = length(k_gi) + new_n = current_n + length(gi_rcv.data) + resize!(k_gi,new_n) + resize!(k_gj,new_n) + resize!(k_jo,new_n) + resize!(k_v,new_n) + for p in 1:length(gi_rcv.data) + k_gi[current_n+p] = gi_rcv.data[p] + k_gj[current_n+p] = gj_rcv.data[p] + k_jo[current_n+p] = jo_rcv.data[p] + k_v[current_n+p] = v_rcv.data[p] + end + end + part = linear_indices(row_partition) + parts_snd, parts_rcv = assembly_neighbors(row_partition) + coo_entries_with_column_owner = map(tuple,I,J,Jown,V) + gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays + graph = ExchangeGraph(parts_snd,parts_rcv) + t1 = exchange(gi_snd,graph) + t2 = exchange(gj_snd,graph) + t3 = exchange(jo_snd,graph) + t4 = exchange(v_snd,graph) + @async begin + gi_rcv = fetch(t1) + gj_rcv = fetch(t2) + jo_rcv = fetch(t3) + v_rcv = fetch(t4) + map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv) + I,J,Jown,V + end +end + +# dofs_gids_prange can be either test_gids or trial_gids +# In the former case, gids is a vector of global test dof identifiers, while in the +# latter, a vector of global trial dof identifiers +function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...) + if !ghost + _setup_prange_without_ghosts(dofs_gids_prange) + elseif isa(owners,Nothing) + _setup_prange_with_ghosts(dofs_gids_prange,gids) + else + _setup_prange_with_ghosts(dofs_gids_prange,gids,owners) + end +end + +function _setup_prange( + dofs_gids_prange::AbstractVector{<:PRange}, + gids::AbstractMatrix; + ax=:rows,ghost=true,owners=nothing +) + @check ax ∈ (:rows,:cols) + block_ids = LinearIndices(dofs_gids_prange) + pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x)) + + gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange + gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id] + gids_union = pvcat(gids_slice) + + owners_union = nothing + if !isnothing(owners) + owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id] + owners_union = pvcat(owners_slice) + end + + return gids_union, owners_union + end |> tuple_of_arrays + + return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union) +end + +# Create PRange for the rows of the linear system +# without local ghost dofs as per required in the +# LocallyAssembled() parallel assembly strategy +function _setup_prange_without_ghosts(dofs_gids_prange::PRange) + ngdofs = length(dofs_gids_prange) + indices = map(partition(dofs_gids_prange)) do dofs_indices + owner = part_id(dofs_indices) + own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices)) + ghost_indices = GhostIndices(ngdofs,Int64[],Int32[]) + OwnAndGhostIndices(own_indices,ghost_indices) + end + return PRange(indices) +end + +# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition +# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system. 
+# The precondition required for the consistency of any parallel assembly process in GridapDistributed
+# is that each processor can determine locally, with a single layer of ghost cells, the global indices and
+# associated processor owners of the rows that it touches after assembly of integration terms posed on
+# locally-owned entities (i.e., either cells or faces).
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> dof PRange ghost ids
+  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition
+  ) |> tuple_of_arrays
+
+  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
+end
+
+# Here we cannot assume that the sparse communication graph underlying
+# trial_dofs_gids_partition is a superset of the one underlying indices.
+# We should check whether it is included and call _find_neighbours!()
+# if this is the case. At present, we are not taking advantage of this,
+# but let the scalable parallel algorithm that computes the reciprocal do the job.
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    gids,owners,dofs_gids_partition) do gids, owners, indices
+    ghost_touched = Dict{Int,Bool}()
+    ghost_to_global = Int64[]
+    ghost_to_owner = Int64[]
+    me = part_id(indices)
+    for (j,jo) in zip(gids,owners)
+      if jo != me
+        if !haskey(ghost_touched,j)
+          push!(ghost_to_global,j)
+          push!(ghost_to_owner,jo)
+          ghost_touched[j] = true
+        end
+      end
+    end
+    ghost_to_global, ghost_to_owner
+  end |> tuple_of_arrays
+
+  return _setup_prange_impl_(
+    ngdofs,
+    dofs_gids_partition,
+    gids_ghost_to_global,
+    gids_ghost_to_owner;
+    discover_neighbours=false
+  )
+end
+
+function _setup_prange_impl_(
+  ngdofs,
+  dofs_gids_partition,
+  gids_ghost_to_global,
+  gids_ghost_to_owner;
+  discover_neighbours=true
+)
+  indices = map(dofs_gids_partition,
+                gids_ghost_to_global,
+                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner
+    owner = part_id(dofs_indices)
+    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
+    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
+    OwnAndGhostIndices(own_indices,ghost_indices)
+  end
+  if discover_neighbours
+    _find_neighbours!(indices,dofs_gids_partition)
+  end
+  return PRange(indices)
+end
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index 3e81008..5ec6579 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -76,7 +76,7 @@ t = assemble!(__v)
 assemble!(__m) |> wait
 fetch(t);
 
-PartitionedArrays.to_trivial_partition(m)
+PartitionedArrays.centralize(m)
 
 partition(v)
 partition(m)
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc9..645d7dc 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index 3e81008..5ec6579 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -76,7 +76,7 @@
 t = assemble!(__v)
 assemble!(__m) |> wait
 fetch(t);
-PartitionedArrays.to_trivial_partition(m)
+PartitionedArrays.centralize(m)
 
 partition(v)
 partition(m)
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc9..645d7dc 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))
 
-  assem = SparseMatrixAssembler(X,Y,FullyAssembledRows())
+  assem = SparseMatrixAssembler(X,Y,LocallyAssembled())
   A1 = assemble_matrix(assem,matdata)
   b1 = assemble_vector(assem,vecdata)
   A2,b2 = assemble_matrix_and_vector(assem,data);
@@ -68,7 +68,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   bmatdata = collect_cell_matrix(Xb,Yb,biform(ub,vb))
   bvecdata = collect_cell_vector(Yb,liform(vb))
 
-  assem_blocks = SparseMatrixAssembler(Xb,Yb,FullyAssembledRows())
+  assem_blocks = SparseMatrixAssembler(Xb,Yb,LocallyAssembled())
   A1_blocks = assemble_matrix(assem_blocks,bmatdata);
   b1_blocks = assemble_vector(assem_blocks,bvecdata);
   @test is_same_vector(b1_blocks,b1,Yb,Y)
diff --git a/test/DivConformingTests.jl b/test/DivConformingTests.jl
index 2a1886b..3804456 100644
--- a/test/DivConformingTests.jl
+++ b/test/DivConformingTests.jl
@@ -63,7 +63,7 @@ function f(model,reffe)
   V = FESpace(model,reffe,conformity=:Hdiv)
   U = TrialFESpace(V)
 
-  das = FullyAssembledRows()
+  das = LocallyAssembled()
   trian = Triangulation(das,model)
   degree = 2
   dΩ = Measure(trian,degree)
diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl
index bf1e78e..22e3545 100644
--- a/test/FESpacesTests.jl
+++ b/test/FESpacesTests.jl
@@ -71,8 +71,8 @@ function assemble_tests(das,dΩ,dΩass,U,V)
 end
 
 function main(distribute,parts)
-  main(distribute,parts,SubAssembledRows())
-  main(distribute,parts,FullyAssembledRows())
+  main(distribute,parts,Assembled())
+  main(distribute,parts,LocallyAssembled())
 end
 
 function main(distribute,parts,das)
diff --git a/test/HeatEquationTests.jl b/test/HeatEquationTests.jl
index fa96dc1..89077ec 100644
--- a/test/HeatEquationTests.jl
+++ b/test/HeatEquationTests.jl
@@ -37,7 +37,7 @@ function main(distribute,parts)
 
   op = TransientFEOperator(res,jac,jac_t,U,V0)
 
-  assembler = SparseMatrixAssembler(U,V0,SubAssembledRows())
+  assembler = SparseMatrixAssembler(U,V0,Assembled())
   op_constant = TransientLinearFEOperator(
     (a,m),b,U,V0,constant_forms=(true,true),assembler=assembler
   )
diff --git a/test/PLaplacianTests.jl b/test/PLaplacianTests.jl
index a20a268..bf50135 100644
--- a/test/PLaplacianTests.jl
+++ b/test/PLaplacianTests.jl
@@ -8,8 +8,8 @@ using Test
 using SparseArrays
 
 function main(distribute,parts)
-  main(distribute,parts,FullyAssembledRows(),SparseMatrixCSR{0,Float64,Int},false)
-  main(distribute,parts,SubAssembledRows(),SparseMatrixCSC{Float64,Int},true)
+  main(distribute,parts,LocallyAssembled(),SparseMatrixCSR{0,Float64,Int},false)
+  main(distribute,parts,Assembled(),SparseMatrixCSC{Float64,Int},true)
 end
 
 function main(distribute,parts,strategy,local_matrix_type,autodiff)
diff --git a/test/PeriodicBCsTests.jl b/test/PeriodicBCsTests.jl
index 6177752..247cb6f 100644
--- a/test/PeriodicBCsTests.jl
+++ b/test/PeriodicBCsTests.jl
@@ -35,4 +35,6 @@
 end
 
+main(DebugArray,(2,2))
+
 end # module
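For reference, the renaming applied throughout these tests in one place. The sketch below is not part of the patch and uses a hypothetical mesh and space: Assembled() takes over the role of SubAssembledRows() (local contributions are exchanged and added at assembly time), while LocallyAssembled() takes over FullyAssembledRows() (owned rows are completed locally, which requires integrating on ghost cells).

using Gridap
using GridapDistributed
using PartitionedArrays

ranks = with_debug() do distribute
  distribute(LinearIndices((2,)))
end
model = CartesianDiscreteModel(ranks,(1,2),(0,1,0,1),(4,4))
V = TestFESpace(model,ReferenceFE(lagrangian,Float64,1))

# was: SparseMatrixAssembler(V,V,SubAssembledRows())
assem_as = SparseMatrixAssembler(V,V,GridapDistributed.Assembled())
# was: SparseMatrixAssembler(V,V,FullyAssembledRows())
assem_la = SparseMatrixAssembler(V,V,GridapDistributed.LocallyAssembled())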
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
new file mode 100644
index 0000000..ec10673
--- /dev/null
+++ b/test/parrays_update.jl
@@ -0,0 +1,64 @@
+
+using Gridap
+using PartitionedArrays
+using GridapDistributed
+
+using Gridap.FESpaces, Gridap.Algebra
+
+np = (1,2)
+ranks = with_debug() do distribute
+  distribute(LinearIndices((prod(np),)))
+end
+
+nc = (2,4)
+serial_model = CartesianDiscreteModel((0,1,0,1),nc)
+dist_model = CartesianDiscreteModel(ranks,np,(0,1,0,1),nc)
+
+cids = get_cell_gids(dist_model)
+
+reffe = ReferenceFE(lagrangian,Float64,1)
+serial_V = TestFESpace(serial_model,reffe)
+dist_V = TestFESpace(dist_model,reffe)
+
+serial_ids = get_free_dof_ids(serial_V)
+dist_ids = get_free_dof_ids(dist_V)
+
+serial_Ω = Triangulation(serial_model)
+serial_dΩ = Measure(serial_Ω,2)
+
+dist_Ω = Triangulation(dist_model)
+dist_dΩ = Measure(dist_Ω,2)
+
+dist_Ωg = Triangulation(GridapDistributed.with_ghost,dist_model)
+dist_dΩg = Measure(dist_Ωg,2)
+
+serial_a(u,v) = ∫(u⋅v)*serial_dΩ
+dist_a(u,v) = ∫(u⋅v)*dist_dΩ
+dist_ag(u,v) = ∫(u⋅v)*dist_dΩg
+
+serial_A = assemble_matrix(serial_a,serial_V,serial_V)
+
+# Assemble the same bilinear form with the three assembly strategies.
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
+dist_A_AS = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+# LocallyAssembled integrates on the ghost-including triangulation (dist_ag).
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.LocallyAssembled())
+dist_A_LA = assemble_matrix(dist_ag,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.SubAssembled())
+dist_A_SA = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+all(abs.(centralize(dist_A_AS) - serial_A) .< 1e-10)
+
+# Check that the three matrices define the same linear operator.
+x_AS = prand(partition(axes(dist_A_AS,2)))
+x_LA = GridapDistributed.change_ghost(x_AS,axes(dist_A_LA,2))
+x_SA = GridapDistributed.change_ghost(x_AS,axes(dist_A_SA,2))
+
+norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+############################################################################################
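As a usage note (not part of the patch), the same checks can be run on actual MPI ranks by swapping with_debug for PartitionedArrays' with_mpi and launching through mpiexec. The driver below is a minimal sketch; the file name and mesh sizes are made up.

using Gridap
using GridapDistributed
using PartitionedArrays

# Hypothetical MPI driver: run as `mpiexec -n 2 julia --project=. parrays_update_mpi.jl`
with_mpi() do distribute
  np = (1,2)
  ranks = distribute(LinearIndices((prod(np),)))
  model = CartesianDiscreteModel(ranks,np,(0,1,0,1),(2,4))
  V = TestFESpace(model,ReferenceFE(lagrangian,Float64,1))
  Ω = Triangulation(model)
  dΩ = Measure(Ω,2)
  a(u,v) = ∫(u⋅v)*dΩ
  assem = SparseMatrixAssembler(V,V,GridapDistributed.Assembled())
  A = assemble_matrix(a,assem,V,V)
end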