diff --git a/include/picongpu/plugins/openPMD/NDScalars.hpp b/include/picongpu/plugins/openPMD/NDScalars.hpp index 7ef07f6235..d027417ed1 100644 --- a/include/picongpu/plugins/openPMD/NDScalars.hpp +++ b/include/picongpu/plugins/openPMD/NDScalars.hpp @@ -174,11 +174,17 @@ namespace picongpu DataSpace gridPos = Environment::get().GridController().getPosition(); ::openPMD::Offset start; ::openPMD::Extent count; + ::openPMD::Extent extent = mrc.getExtent(); start.reserve(ndim); count.reserve(ndim); for(int d = 0; d < ndim; ++d) { - start.push_back(gridPos.revert()[d]); + /* + * When restarting with more parallel processes than the checkpoint had originally been written + * with, we must take care not to index past the dataset boundaries. Just loop around to the start + * in that case. Not the finest way, but it does the job for now.. + */ + start.push_back(gridPos.revert()[d] % extent[d]); count.push_back(1); } diff --git a/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp b/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp index b835de9627..5f5921f4e3 100644 --- a/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp +++ b/include/picongpu/plugins/openPMD/openPMDWriter.x.cpp @@ -1024,6 +1024,12 @@ make sure that environment variable OPENPMD_BP_BACKEND is not set to ADIOS1. { loadRngStatesImpl(&mThreadParams, restartStep); } + catch(std::exception const& e) + { + log("openPMD: loading RNG states failed, they will be re-initialized " + "instead. Original error:\n\t%1%") + % e.what(); + } catch(...) { log( diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp index d7487bbda4..d1fb263f0f 100644 --- a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp +++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp @@ -35,8 +35,10 @@ # include +# include # include # include +# include # include @@ -46,6 +48,84 @@ namespace picongpu { using namespace pmacc; + template + struct RedistributeFilteredParticles + { + template + HINLINE void operator()( + FrameType& frame, + FilterType const& filter, + RemapType const& remap, + uint64_t const numParticlesCurrentBatch, + char const filterRemove) + { + using Identifier = T_Identifier; + using ValueType = typename pmacc::traits::Resolve::type::type; + using ComponentType = typename GetComponentsType::type; + + ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer(); + + for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; ++particleIndex) + { + if(filter[particleIndex] == filterRemove) + { + continue; + } + dataPtr[remap[particleIndex]] = dataPtr[particleIndex]; + } + } + }; + + struct KernelFilterParticles + { + template + DINLINE void operator()( + T_Worker const& worker, + FrameType&& loadedData, + MemIdxType size, + DataSpace patchTotalOffset, + DataSpace patchUpperCorner, + char const filterKeep, + char const filterRemove, + FilterType&& filterOut) const + { + // DataSpace<1> particleIndex = worker.blockDomIdxND(); + constexpr uint32_t blockDomSize = T_Worker::blockDomSize(); + auto numDataBlocks = (size + blockDomSize - 1u) / blockDomSize; + auto position_ = loadedData.getIdentifier(position()).getPointer(); + auto positionOffset = loadedData.getIdentifier(totalCellIdx()).getPointer(); + + // grid-strided loop over the chunked data + for(uint32_t dataBlock = worker.blockDomIdx(); dataBlock < numDataBlocks; + dataBlock += worker.gridDomSize()) + { + auto dataBlockOffset = dataBlock * blockDomSize; + auto forEach = 
pmacc::lockstep::makeForEach(worker); + forEach( + [&](uint32_t const inBlockIdx) + { + auto idx = dataBlockOffset + inBlockIdx; + if(idx < size) + { + auto& positionVec = position_[idx]; + auto& positionOffsetVec = positionOffset[idx]; + char filterCurrent = filterKeep; + for(size_t d = 0; d < simDim; ++d) + { + auto positionInD = positionVec[d] + positionOffsetVec[d]; + if(positionInD < patchTotalOffset[d] || positionInD >= patchUpperCorner[d]) + { + filterCurrent = filterRemove; + break; + } + } + filterOut[idx] = filterCurrent; + } + }); + } + } + }; + /** Load species from openPMD checkpoint storage * * @tparam T_Species type of species @@ -70,6 +150,227 @@ namespace picongpu using NewParticleDescription = typename ReplaceValueTypeSeq::type; + struct LoadParams + { + using FrameType = Frame; + using BufferType = Frame; + + std::string const& speciesName; + ::openPMD::ParticleSpecies const& particleSpecies; + std::shared_ptr const& speciesTmp; + uint64_t* patchNumParticles; + uint64_t* patchNumParticlesOffset; + DataSpace const& cellOffsetToTotalDomain; + uint32_t restartChunkSize; + + auto numParticlesAndChunkSize(std::deque const& matches) const -> std::pair + { + uint64_t totalNumParticles = std::transform_reduce( + matches.begin(), + matches.end(), + 0, + /* reduce = */ [](uint64_t left, uint64_t right) { return left + right; }, + /* transform = */ [this](size_t patchIdx) { return patchNumParticles[patchIdx]; }); + uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); + return std::make_pair(totalNumParticles, maxChunkSize); + } + + template + void loadMatchesGeneric( + std::deque const& matches, + ThreadParams* threadParams, + uint64_t totalNumParticles, + uint64_t maxChunkSize, + Functor&& forEachPatch) const + { + BufferType buffers; + FrameType mappedFrame; + log("openPMD: malloc mapped memory: %1%") % speciesName; + /*malloc mapped memory*/ + meta::ForEach> + mallocMem; + mallocMem(buffers, mappedFrame, maxChunkSize); + + for(size_t const patchIdx : matches) + { + uint64_t particleOffset = patchNumParticlesOffset[patchIdx]; + uint64_t numParticles = patchNumParticles[patchIdx]; + + log("openPMD: Loading %1% particles from offset %2%") + % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset; + + + uint32_t const numLoadIterations + = maxChunkSize == 0u ? 
0u : alpaka::core::divCeil(numParticles, maxChunkSize); + + for(uint64_t loadRound = 0u; loadRound < numLoadIterations; ++loadRound) + { + auto particleLoadOffset = particleOffset + loadRound * maxChunkSize; + auto numLeftParticles = numParticles - loadRound * maxChunkSize; + + auto numParticlesCurrentBatch = std::min(numLeftParticles, maxChunkSize); + + log("openPMD: (begin) load species %1% round: %2%/%3%") % speciesName + % (loadRound + 1) % numLoadIterations; + if(numParticlesCurrentBatch != 0) + { + std::forward(forEachPatch)( + loadRound, + numParticlesCurrentBatch, + mappedFrame, + particleLoadOffset); + } + log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName + % (loadRound + 1) % numLoadIterations; + } + } + } + + void loadFullMatches(std::deque const& fullMatches, ThreadParams* threadParams) const + { + auto [totalNumParticles, maxChunkSize] = numParticlesAndChunkSize(fullMatches); + loadMatchesGeneric( + fullMatches, + threadParams, + totalNumParticles, + maxChunkSize, + /* forEachPatch = */ + [this, threadParams]( + uint64_t loadRound, + uint64_t numParticlesCurrentBatch, + FrameType& mappedFrame, + uint64_t particleLoadOffset) + { + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + LoadParticleAttributesFromOpenPMD> + loadAttributes; + loadAttributes( + threadParams, + mappedFrame, + particleSpecies, + particleLoadOffset, + numParticlesCurrentBatch); + + pmacc::particles::operations::splitIntoListOfFrames( + *speciesTmp, + mappedFrame, + numParticlesCurrentBatch, + cellOffsetToTotalDomain, + totalCellIdx_, + *threadParams->cellDescription, + picLog::INPUT_OUTPUT()); + }); + } + + void loadPartialMatches(std::deque const& partialMatches, ThreadParams* threadParams) const + { + auto [totalNumParticles, maxChunkSize] = numParticlesAndChunkSize(partialMatches); + + SubGrid const& subGrid = Environment::get().SubGrid(); + pmacc::Selection const localDomain = subGrid.getLocalDomain(); + pmacc::Selection const globalDomain = subGrid.getGlobalDomain(); + /* Offset to transform local particle offsets into total offsets for all particles within the + * current local domain. + * @attention A window can be the full simulation domain or the moving window. + */ + DataSpace localToTotalDomainOffset(localDomain.offset + globalDomain.offset); + + /* params->localWindowToDomainOffset is in PIConGPU for a restart zero but to stay generic we take + * the variable into account. 
+ */ + DataSpace const patchTotalOffset + = localToTotalDomainOffset + threadParams->localWindowToDomainOffset; + DataSpace const patchExtent = threadParams->window.localDimensions.size; + DataSpace const patchUpperCorner = patchTotalOffset + patchExtent; + + constexpr bool isMappedMemorySupported + = alpaka::hasMappedBufSupport<::alpaka::Platform>; + PMACC_VERIFY_MSG(isMappedMemorySupported, "Device must support mapped memory!"); + auto filter = alpaka::allocMappedBuf( + manager::Device::get().current(), + manager::Device::get().getPlatform(), + MemSpace(maxChunkSize).toAlpakaMemVec()); + auto remap = alpaka::allocMappedBuf( + manager::Device::get().current(), + manager::Device::get().getPlatform(), + MemSpace(maxChunkSize).toAlpakaMemVec()); + + loadMatchesGeneric( + partialMatches, + threadParams, + totalNumParticles, + maxChunkSize, /* forEachPatch = */ + [this, threadParams, &filter, &remap, &patchTotalOffset, &patchExtent, &patchUpperCorner]( + uint64_t loadRound, + uint64_t numParticlesCurrentBatch, + FrameType& mappedFrame, + uint64_t particleLoadOffset) + { + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + LoadParticleAttributesFromOpenPMD> + loadAttributes; + loadAttributes( + threadParams, + mappedFrame, + particleSpecies, + particleLoadOffset, + numParticlesCurrentBatch); + + constexpr char filterKeep{1}, filterRemove{0}; + PMACC_LOCKSTEP_KERNEL(KernelFilterParticles{}) + .config(pmacc::math::Vector{numParticlesCurrentBatch})( + mappedFrame, + numParticlesCurrentBatch, + patchTotalOffset, + patchUpperCorner, + filterKeep, + filterRemove, + alpaka::getPtrNative(filter)); + eventSystem::getTransactionEvent().waitForFinished(); + + // For simplicity, do the remapping on the CPU again + MemIdxType remapCurrent = 0; + for(size_t particleIndex = 0; particleIndex < numParticlesCurrentBatch; ++particleIndex) + { + if(filter[particleIndex] == filterKeep) + { + remap[particleIndex] = remapCurrent++; + } + else + { + remap[particleIndex] = std::numeric_limits::max(); + } + } + + meta::ForEach< + typename NewParticleDescription::ValueTypeSeq, + RedistributeFilteredParticles> + redistributeFilteredParticles; + redistributeFilteredParticles( + mappedFrame, + filter, + remap, + numParticlesCurrentBatch, + filterRemove); + + log( + "openPMD: Keeping %1% of the current batch's %2% particles after filtering.") + % remapCurrent % numParticlesCurrentBatch; + + pmacc::particles::operations::splitIntoListOfFrames( + *speciesTmp, + mappedFrame, + remapCurrent, // !! not numParticlesCurrentBatch, filtered vs. 
unfiltered number + cellOffsetToTotalDomain, + totalCellIdx_, + *threadParams->cellDescription, + picLog::INPUT_OUTPUT()); + }); + } + }; + /** Load species from openPMD checkpoint storage * * @param params thread params @@ -81,7 +382,6 @@ namespace picongpu std::string const speciesName = FrameType::getName(); log("openPMD: (begin) load species: %1%") % speciesName; DataConnector& dc = Environment<>::get().DataConnector(); - GridController& gc = Environment::get().GridController(); ::openPMD::Series& series = *params->openPMDSeries; ::openPMD::Container<::openPMD::ParticleSpecies>& particles @@ -99,93 +399,52 @@ namespace picongpu // openPMD eventSystem::getTransactionEvent().waitForFinished(); - auto numRanks = gc.getGlobalSize(); - - size_t patchIdx = getPatchIdx(params, particleSpecies, numRanks); + auto [fullMatches, partialMatches] = getPatchIdx(params, particleSpecies); - std::shared_ptr fullParticlesInfoShared - = particleSpecies.particlePatches["numParticles"][::openPMD::RecordComponent::SCALAR] - .load(); + std::shared_ptr numParticlesShared + = particleSpecies.particlePatches["numParticles"].load(); + std::shared_ptr numParticlesOffsetShared + = particleSpecies.particlePatches["numParticlesOffset"].load(); particles.seriesFlush(); - uint64_t* fullParticlesInfo = fullParticlesInfoShared.get(); - - /* Run a prefix sum over the numParticles[0] element in - * particlesInfo to retreive the offset of particles - */ - uint64_t particleOffset = 0u; - /* count total number of particles on the device */ - uint64_t totalNumParticles = 0u; - - assert(patchIdx < numRanks); - - for(size_t i = 0u; i <= patchIdx; ++i) - { - if(i < patchIdx) - particleOffset += fullParticlesInfo[i]; - if(i == patchIdx) - totalNumParticles = fullParticlesInfo[i]; - } - - log("openPMD: Loading %1% particles from offset %2%") - % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset; + uint64_t* patchNumParticles = numParticlesShared.get(); + uint64_t* patchNumParticlesOffset = numParticlesOffsetShared.get(); + + LoadParams lp{ + speciesName, + particleSpecies, + speciesTmp, + patchNumParticles, + patchNumParticlesOffset, + cellOffsetToTotalDomain, + restartChunkSize}; + lp.loadFullMatches(fullMatches, params); + lp.loadPartialMatches(partialMatches, params); - log("openPMD: malloc mapped memory: %1%") % speciesName; - - using FrameType = Frame; - using BufferType = Frame; - - BufferType buffers; - FrameType mappedFrame; - - uint64_t maxChunkSize = std::min(static_cast(restartChunkSize), totalNumParticles); - - /*malloc mapped memory*/ - meta::ForEach> - mallocMem; - mallocMem(buffers, mappedFrame, maxChunkSize); + log("openPMD: ( end ) load species: %1%") % speciesName; + } - uint32_t const numLoadIterations - = maxChunkSize == 0u ? 
0u : alpaka::core::divCeil(totalNumParticles, maxChunkSize); + private: + // o: offset, e: extent, u: upper corner (= o+e) + static std::pair, DataSpace> intersect( + DataSpace const& o1, + DataSpace const& e1, + DataSpace const& o2, + DataSpace const& e2) + { + // Convert extents into upper coordinates + auto u1 = o1 + e1; + auto u2 = o2 + e2; - for(uint64_t loadRound = 0u; loadRound < numLoadIterations; ++loadRound) + DataSpace intersect_o, intersect_u, intersect_e; + for(unsigned d = 0; d < simDim; ++d) { - auto particleLoadOffset = particleOffset + loadRound * maxChunkSize; - auto numLeftParticles = totalNumParticles - loadRound * maxChunkSize; - - auto numParticlesCurrentBatch = std::min(numLeftParticles, maxChunkSize); - - log("openPMD: (begin) load species %1% round: %2%/%3%") % speciesName - % (loadRound + 1) % numLoadIterations; - if(numParticlesCurrentBatch != 0) - { - meta::ForEach< - typename NewParticleDescription::ValueTypeSeq, - LoadParticleAttributesFromOpenPMD> - loadAttributes; - loadAttributes( - params, - mappedFrame, - particleSpecies, - particleLoadOffset, - numParticlesCurrentBatch); - - - pmacc::particles::operations::splitIntoListOfFrames( - *speciesTmp, - mappedFrame, - numParticlesCurrentBatch, - cellOffsetToTotalDomain, - totalCellIdx_, - *(params->cellDescription), - picLog::INPUT_OUTPUT()); - } - log("openPMD: ( end ) load species %1% round: %2%/%3%") % speciesName - % (loadRound + 1) % numLoadIterations; + intersect_o[d] = std::max(o1[d], o2[d]); + intersect_u[d] = std::min(u1[d], u2[d]); + intersect_e[d] = intersect_u[d] > intersect_o[d] ? intersect_u[d] - intersect_o[d] : 0; } - log("openPMD: ( end ) load species: %1%") % speciesName; + return {intersect_o, intersect_e}; } - private: /** get index for particle data within the openPMD patch data * * It is not possible to assume that we can use the MPI rank to load the particle data. 
@@ -197,13 +456,16 @@ namespace picongpu * * @return index of the particle patch within the openPMD data */ - HINLINE size_t - getPatchIdx(ThreadParams* params, ::openPMD::ParticleSpecies particleSpecies, size_t numRanks) + HINLINE std::pair, std::deque> getPatchIdx( + ThreadParams* params, + ::openPMD::ParticleSpecies particleSpecies) { std::string const name_lookup[] = {"x", "y", "z"}; - std::vector> offsets(numRanks); - std::vector> extents(numRanks); + size_t patches = particleSpecies.particlePatches["numParticles"].getExtent()[0]; + + std::vector> offsets(patches); + std::vector> extents(patches); // transform openPMD particle patch data into PIConGPU data objects for(uint32_t d = 0; d < simDim; ++d) @@ -213,7 +475,7 @@ namespace picongpu std::shared_ptr patchExtentsInfoShared = particleSpecies.particlePatches["extent"][name_lookup[d]].load(); particleSpecies.seriesFlush(); - for(size_t i = 0; i < numRanks; ++i) + for(size_t i = 0; i < patches; ++i) { offsets[i][d] = patchOffsetsInfoShared.get()[i]; extents[i][d] = patchExtentsInfoShared.get()[i]; @@ -235,18 +497,47 @@ namespace picongpu DataSpace const patchTotalOffset = localToTotalDomainOffset + params->localWindowToDomainOffset; DataSpace const patchExtent = params->window.localDimensions.size; + math::Vector true_; + for(unsigned d = 0; d < simDim; ++d) + { + true_[d] = true; + } // search the patch index based on the offset and extents of local domain size - for(size_t i = 0; i < numRanks; ++i) + std::deque fullMatches; + std::deque partialMatches; + size_t noMatches = 0; + for(size_t i = 0; i < patches; ++i) { - if(patchTotalOffset == offsets[i] && patchExtent == extents[i]) - return i; + // std::cout << "Comp.: " << patchTotalOffset << " - " << (patchTotalOffset + patchExtent) + // << "\tAGAINST " << offsets[i] << " - " << (offsets[i] + extents[i]) + // << "\toffsets: " << (patchTotalOffset <= offsets[i]) + // << ", extents: " << ((offsets[i] + extents[i]) <= (patchTotalOffset + + // patchExtent)) << + // '\n'; + if((patchTotalOffset <= offsets[i]) == true_ + && ((offsets[i] + extents[i]) <= (patchTotalOffset + patchExtent)) == true_) + { + fullMatches.emplace_back(i); + } + else if( + intersect(offsets[i], extents[i], patchTotalOffset, patchExtent).second.productOfComponents() + != 0) + { + partialMatches.emplace_back(i); + } + else + { + ++noMatches; + } } - // If no patch fits the conditions, something went wrong before - throw std::runtime_error( - "Error while restarting: no particle patch matches the required offset and extent"); - // Fake return still needed to avoid warnings - return 0; + + log( + "openPMD: Found %1% fully and %2% partially matching particle patch(es). 
%3% " + "patch was / patches were not matched.") + % fullMatches.size() % partialMatches.size() % noMatches; + + return std::make_pair(std::move(fullMatches), std::move(partialMatches)); } }; diff --git a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp index 939d65b3e0..ec66a6ee0b 100644 --- a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp +++ b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp @@ -77,6 +77,9 @@ namespace picongpu DataSpace local_domain_size = params->window.localDimensions.size; bool useLinearIdxAsDestination = false; + ::openPMD::Series& series = *params->openPMDSeries; + ::openPMD::Mesh& mesh = series.iterations[currentStep].open().meshes[objectName]; + /* Patch for non-domain-bound fields * This is an ugly fix to allow output of reduced 1d PML buffers */ @@ -85,6 +88,7 @@ namespace picongpu auto const field_layout = field.getGridLayout(); auto const field_no_guard = field_layout.sizeWithoutGuardND(); auto const elementCount = field_no_guard.productOfComponents(); + uint64_t pmlTotalSize = 0; /* Scan the PML buffer local size along all local domains * This code is symmetric to one in Field::writeField() @@ -112,9 +116,20 @@ namespace picongpu { if(localSizes.at(2u * r + 1u) < rank) domainOffset += localSizes.at(2u * r); + pmlTotalSize += localSizes.at(2u * r); } log("openPMD: (end) collect PML sizes for %1%") % objectName; + if(auto const& extentOnDisk = mesh.begin()->second.getExtent(); + extentOnDisk != ::openPMD::Extent{1, 1, pmlTotalSize}) + { + log( + "openPMD: Skip loading for PML fields. Expecting extent %1%, found extent %2% on disk. " + "This may happen when restarting with a different domain decomposition.") + % pmlTotalSize % extentOnDisk.at(2); + return; + } + domain_offset = DataSpace::create(0); domain_offset[0] = domainOffset; local_domain_size = DataSpace::create(1); @@ -122,9 +137,6 @@ namespace picongpu useLinearIdxAsDestination = true; } - ::openPMD::Series& series = *params->openPMDSeries; - ::openPMD::Container<::openPMD::Mesh>& meshes = series.iterations[currentStep].open().meshes; - auto destBox = field.getHostBuffer().getDataBox(); for(uint32_t n = 0; n < numComponents; ++n) { @@ -133,9 +145,8 @@ namespace picongpu // data. log("openPMD: Read from domain: offset=%1% size=%2%") % domain_offset % local_domain_size; - ::openPMD::RecordComponent rc = numComponents > 1 - ? meshes[objectName][name_lookup_tpl[n]] - : meshes[objectName][::openPMD::RecordComponent::SCALAR]; + ::openPMD::RecordComponent rc + = numComponents > 1 ? mesh[name_lookup_tpl[n]] : mesh[::openPMD::RecordComponent::SCALAR]; log("openPMD: Read from field '%1%'") % objectName; @@ -158,7 +169,7 @@ namespace picongpu std::shared_ptr field_container = rc.loadChunk(start, count); /* start a blocking read of all scheduled variables */ - meshes.seriesFlush(); + mesh.seriesFlush(); int const elementCount = local_domain_size.productOfComponents(); @@ -219,6 +230,7 @@ namespace picongpu /* load from openPMD */ bool const isDomainBound = traits::IsFieldDomainBound::value; + RestartFieldLoader::loadField( field->getGridBuffer(), (uint32_t) T_Field::numComponents,
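The chunked-load bookkeeping in loadMatchesGeneric (rounds of at most restartChunkSize particles, with a shorter final round) reduces to the following host-side sketch. divCeil is written out explicitly here instead of alpaka::core::divCeil, and the particle counts are arbitrary example numbers.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main()
{
    std::uint64_t const numParticles = 10'500;
    std::uint64_t const maxChunkSize = 4'096;

    // zero chunk size means there is nothing to load for this patch
    std::uint64_t const rounds
        = maxChunkSize == 0u ? 0u : (numParticles + maxChunkSize - 1u) / maxChunkSize;
    for(std::uint64_t round = 0; round < rounds; ++round)
    {
        auto const offset = round * maxChunkSize;
        auto const batch = std::min(numParticles - offset, maxChunkSize);
        std::cout << "round " << (round + 1) << '/' << rounds << ": load " << batch
                  << " particles starting at offset " << offset << '\n';
    }
}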
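The heart of the new getPatchIdx is the classification of each checkpoint particle patch against the local window: fully contained, partially overlapping, or disjoint. Below is a minimal self-contained sketch of that logic; Box, classify and the example coordinates are illustrative stand-ins, not PIConGPU types.

#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>

constexpr unsigned simDim = 3u;
using Vec = std::array<std::int64_t, simDim>;

struct Box
{
    Vec offset;
    Vec extent;
};

enum class Match { full, partial, none };

// Component-wise intersection of two axis-aligned boxes; a zero extent means no overlap.
Box intersect(Box const& a, Box const& b)
{
    Box r{};
    for(unsigned d = 0; d < simDim; ++d)
    {
        auto lo = std::max(a.offset[d], b.offset[d]);
        auto hi = std::min(a.offset[d] + a.extent[d], b.offset[d] + b.extent[d]);
        r.offset[d] = lo;
        r.extent[d] = hi > lo ? hi - lo : 0;
    }
    return r;
}

Match classify(Box const& patch, Box const& window)
{
    bool fullyInside = true;
    for(unsigned d = 0; d < simDim; ++d)
        fullyInside = fullyInside && window.offset[d] <= patch.offset[d]
            && patch.offset[d] + patch.extent[d] <= window.offset[d] + window.extent[d];
    if(fullyInside)
        return Match::full;

    auto overlap = intersect(patch, window);
    std::int64_t cells = 1;
    for(unsigned d = 0; d < simDim; ++d)
        cells *= overlap.extent[d];
    return cells != 0 ? Match::partial : Match::none;
}

int main()
{
    Box const window{{0, 0, 0}, {64, 64, 64}};
    std::cout << int(classify(Box{{0, 0, 0}, {32, 64, 64}}, window)) << '\n';  // 0: full
    std::cout << int(classify(Box{{48, 0, 0}, {32, 64, 64}}, window)) << '\n'; // 1: partial
    std::cout << int(classify(Box{{64, 0, 0}, {32, 64, 64}}, window)) << '\n'; // 2: none
}

Fully contained patches can be loaded wholesale as before, while partially overlapping ones are the new case that requires per-particle filtering; disjoint patches are only counted for the diagnostic log message.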
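For partially matching patches, the diff applies a filter/remap/redistribute scheme: flag each loaded particle as keep or remove depending on whether its total cell index lies inside the local patch, build a remap table from a running count of kept particles, then compact every attribute array in place. The following host-only sketch shows the same three steps on simplified data; the names and the single 1D attribute are stand-ins for the frame/identifier machinery, and the GPU filter kernel is replaced by a serial loop.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

int main()
{
    constexpr char filterKeep = 1, filterRemove = 0;
    std::uint64_t const patchLower = 10, patchUpper = 20; // local patch as [lower, upper) in one dimension

    std::vector<std::uint64_t> totalCellIdx{3, 12, 19, 25, 11}; // loaded particle cell indices
    std::vector<double> weighting{1., 2., 3., 4., 5.};          // one attribute per particle
    std::size_t const n = totalCellIdx.size();

    // 1) filter: flag particles that fall inside the local patch
    std::vector<char> filter(n);
    for(std::size_t i = 0; i < n; ++i)
        filter[i] = (totalCellIdx[i] >= patchLower && totalCellIdx[i] < patchUpper) ? filterKeep : filterRemove;

    // 2) remap: running index over kept particles (the serial prefix count done on the CPU in the diff)
    std::vector<std::size_t> remap(n);
    std::size_t kept = 0;
    for(std::size_t i = 0; i < n; ++i)
        remap[i] = (filter[i] == filterKeep) ? kept++ : std::numeric_limits<std::size_t>::max();

    // 3) redistribute: compact each attribute in place; kept entries only move to lower
    //    indices, so copying front to back never overwrites data that is still needed
    for(std::size_t i = 0; i < n; ++i)
        if(filter[i] == filterKeep)
        {
            totalCellIdx[remap[i]] = totalCellIdx[i];
            weighting[remap[i]] = weighting[i];
        }

    std::cout << "kept " << kept << " of " << n << " particles\n"; // kept 3 of 5 particles
}

After compaction, only the first `kept` entries of the mapped frame are handed to splitIntoListOfFrames, which is why the diff passes remapCurrent rather than numParticlesCurrentBatch there.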
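Finally, the RestartFieldLoader change guards the PML load with an extent check: the expected flattened 1D size is summed over all ranks and compared against the dataset extent found in the checkpoint, and the load is skipped on mismatch so the PML buffers keep their freshly initialized values. A small sketch of that comparison, with Extent standing in for ::openPMD::Extent and made-up per-rank sizes:

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

using Extent = std::vector<std::uint64_t>;

bool pmlExtentMatches(std::vector<std::uint64_t> const& perRankPmlSizes, Extent const& extentOnDisk)
{
    std::uint64_t const pmlTotalSize
        = std::accumulate(perRankPmlSizes.begin(), perRankPmlSizes.end(), std::uint64_t{0});
    // PIConGPU stores the reduced PML buffers as a {1, 1, N} dataset
    return extentOnDisk == Extent{1, 1, pmlTotalSize};
}

int main()
{
    std::vector<std::uint64_t> perRankPmlSizes{1200, 1180, 1210, 1190};
    std::cout << std::boolalpha;
    std::cout << pmlExtentMatches(perRankPmlSizes, Extent{1, 1, 4780}) << '\n'; // true: safe to load
    std::cout << pmlExtentMatches(perRankPmlSizes, Extent{1, 1, 5000}) << '\n'; // false: skip the load
}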