Skip to content

Commit 6548e1e

Browse files
committed
[df] Support MT snapshotting to RNTuple
Switch the existing code to use the RNTupleParallelWriter with one RNTupleFillContext per slot. This should be (almost) as efficient as the RNTupleWriter (one additional cloned RNTupleModel for the only fill context), but save quite a bit of code duplication and in testing effort.
1 parent ff493cd commit 6548e1e

File tree

4 files changed

+65
-40
lines changed

4 files changed

+65
-40
lines changed

tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -293,19 +293,13 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr<SnapshotHelperA
293293

294294
std::unique_ptr<RActionBase> actionPtr;
295295
if (snapHelperArgs->fToNTuple) {
296-
if (!ROOT::IsImplicitMTEnabled()) {
297-
// single-thread snapshot
298-
using Helper_t = UntypedSnapshotRNTupleHelper;
299-
using Action_t = RActionSnapshot<Helper_t, PrevNodeType>;
296+
// We use the same helper for single- and multi-thread snapshot.
297+
using Helper_t = UntypedSnapshotRNTupleHelper;
298+
using Action_t = RActionSnapshot<Helper_t, PrevNodeType>;
300299

301-
actionPtr.reset(new Action_t(
302-
Helper_t(filename, dirname, treename, colNames, outputColNames, options, inputLM, outputLM, colTypeIDs),
303-
colNames, colTypeIDs, prevNode, colRegister));
304-
} else {
305-
// multi-thread snapshot to RNTuple is not yet supported
306-
// TODO(fdegeus) Add MT snapshotting
307-
throw std::runtime_error("Snapshot: Snapshotting to RNTuple with IMT enabled is not supported yet.");
308-
}
300+
actionPtr.reset(new Action_t(Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options,
301+
inputLM, outputLM, colTypeIDs),
302+
colNames, colTypeIDs, prevNode, colRegister));
309303
} else {
310304
if (!ROOT::IsImplicitMTEnabled()) {
311305
// single-thread snapshot

tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@ class TBranch;
3030
class TFile;
3131

3232
namespace ROOT {
33-
class RNTupleWriter;
33+
namespace Experimental {
34+
class RNTupleFillContext;
35+
class RNTupleParallelWriter;
36+
} // namespace Experimental
3437
class REntry;
3538
class RFieldToken;
3639
class TBufferMerger;
@@ -64,16 +67,18 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RAction
6467
ROOT::Detail::RDF::RLoopManager *fOutputLoopManager;
6568
ColumnNames_t fInputFieldNames; // This contains the resolved aliases
6669
ColumnNames_t fOutputFieldNames;
67-
std::unique_ptr<ROOT::RNTupleWriter> fWriter;
70+
std::unique_ptr<ROOT::Experimental::RNTupleParallelWriter> fWriter;
6871
std::vector<ROOT::RFieldToken> fFieldTokens;
6972

70-
std::unique_ptr<ROOT::REntry> fOutputEntry;
73+
unsigned int fNSlots;
74+
std::vector<std::shared_ptr<ROOT::Experimental::RNTupleFillContext>> fFillContexts;
75+
std::vector<std::unique_ptr<ROOT::REntry>> fEntries;
7176

7277
std::vector<const std::type_info *> fInputColumnTypeIDs; // Types for the input columns
7378

7479
public:
75-
UntypedSnapshotRNTupleHelper(std::string_view filename, std::string_view dirname, std::string_view ntuplename,
76-
const ColumnNames_t &vfnames, const ColumnNames_t &fnames,
80+
UntypedSnapshotRNTupleHelper(unsigned int nSlots, std::string_view filename, std::string_view dirname,
81+
std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames,
7782
const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM,
7883
ROOT::Detail::RDF::RLoopManager *outputLM,
7984
const std::vector<const std::type_info *> &colTypeIDs);
@@ -86,9 +91,11 @@ public:
8691

8792
void Initialize();
8893

89-
void InitTask(TTreeReader *, unsigned int /* slot */) {}
94+
void Exec(unsigned int slot, const std::vector<void *> &values);
95+
96+
void InitTask(TTreeReader *, unsigned int slot);
9097

91-
void Exec(unsigned int /* slot */, const std::vector<void *> &values);
98+
void FinalizeTask(unsigned int slot);
9299

93100
void Finalize();
94101

tree/dataframe/src/RDFSnapshotHelpers.cxx

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323
#include <ROOT/RFieldToken.hxx>
2424
#include <ROOT/RNTuple.hxx>
2525
#include <ROOT/RNTupleDS.hxx>
26-
#include <ROOT/RNTupleWriter.hxx>
26+
#include <ROOT/RNTupleFillContext.hxx>
27+
#include <ROOT/RNTupleParallelWriter.hxx>
2728
#include <ROOT/RTTreeDS.hxx>
2829
#include <ROOT/TBufferMerger.hxx>
2930

@@ -800,9 +801,10 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::MakeNew(void *newName, std::s
800801
}
801802

802803
ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper(
803-
std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames,
804-
const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM,
805-
ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector<const std::type_info *> &colTypeIDs)
804+
unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename,
805+
const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options,
806+
ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM,
807+
const std::vector<const std::type_info *> &colTypeIDs)
806808
: fFileName(filename),
807809
fDirName(dirname),
808810
fNTupleName(ntuplename),
@@ -811,6 +813,9 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper(
811813
fOutputLoopManager(outputLM),
812814
fInputFieldNames(vfnames),
813815
fOutputFieldNames(ReplaceDotWithUnderscore(fnames)),
816+
fNSlots(nSlots),
817+
fFillContexts(nSlots),
818+
fEntries(nSlots),
814819
fInputColumnTypeIDs(colTypeIDs)
815820
{
816821
EnsureValidSnapshotRNTupleOutput(fOptions, fNTupleName, fFileName);
@@ -845,7 +850,6 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize()
845850
fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]);
846851
}
847852
model->Freeze();
848-
fOutputEntry = model->CreateBareEntry();
849853

850854
ROOT::RNTupleWriteOptions writeOptions;
851855
writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
@@ -864,20 +868,41 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize()
864868
outputDir = fOutputFile->mkdir(fDirName.c_str());
865869
}
866870

867-
fWriter = ROOT::RNTupleWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
871+
fWriter = ROOT::Experimental::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
872+
}
873+
874+
void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::InitTask(TTreeReader *, unsigned int slot)
875+
{
876+
if (!fFillContexts[slot]) {
877+
fFillContexts[slot] = fWriter->CreateFillContext();
878+
fEntries[slot] = fFillContexts[slot]->GetModel().CreateBareEntry();
879+
}
868880
}
869881

870-
void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int /* slot */, const std::vector<void *> &values)
882+
void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int slot, const std::vector<void *> &values)
871883
{
884+
auto &fillContext = fFillContexts[slot];
885+
auto &outputEntry = fEntries[slot];
872886
assert(values.size() == fFieldTokens.size());
873887
for (decltype(values.size()) i = 0; i < values.size(); i++) {
874-
fOutputEntry->BindRawPtr(fFieldTokens[i], values[i]);
888+
outputEntry->BindRawPtr(fFieldTokens[i], values[i]);
875889
}
876-
fWriter->Fill(*fOutputEntry);
890+
fillContext->Fill(*outputEntry);
891+
}
892+
893+
void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::FinalizeTask(unsigned int slot)
894+
{
895+
// In principle we would not need to flush a cluster here, but we want to benefit from parallelism for compression.
896+
// NB: RNTupleFillContext::FlushCluster() is a nop if there is no new entry since the last flush.
897+
fFillContexts[slot]->FlushCluster();
877898
}
878899

879900
void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Finalize()
880901
{
902+
// First clear and destroy all entries, which were created from the RNTupleFillContexts.
903+
fEntries.clear();
904+
fFillContexts.clear();
905+
// Then destroy the RNTupleParallelWriter and write the metadata.
881906
fWriter.reset();
882907
// We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
883908
fOutputLoopManager->SetDataSource(std::make_unique<ROOT::RDF::RNTupleDS>(fDirName + "/" + fNTupleName, fFileName));
@@ -898,7 +923,7 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper
898923
ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::MakeNew(void *newName)
899924
{
900925
const std::string finalName = *reinterpret_cast<const std::string *>(newName);
901-
return UntypedSnapshotRNTupleHelper{finalName, fDirName, fNTupleName,
902-
fInputFieldNames, fOutputFieldNames, fOptions,
903-
fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs};
926+
return UntypedSnapshotRNTupleHelper{
927+
fNSlots, finalName, fDirName, fNTupleName, fInputFieldNames,
928+
fOutputFieldNames, fOptions, fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs};
904929
}

tree/dataframe/test/dataframe_snapshot_ntuple.cxx

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -614,23 +614,22 @@ struct TIMTEnabler {
614614
~TIMTEnabler() { ROOT::DisableImplicitMT(); }
615615
};
616616

617-
TEST(RDFSnapshotRNTuple, ThrowIfMT)
617+
TEST(RDFSnapshotRNTuple, WithMT)
618618
{
619619
TIMTEnabler _(4);
620620

621-
FileRAII fileGuard{"RDFSnapshotRNTuple_throw_if_mt.root"};
621+
FileRAII fileGuard{"RDFSnapshotRNTuple_mt.root"};
622622

623623
auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
624624

625625
RSnapshotOptions opts;
626626
opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
627627

628-
try {
629-
auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts);
630-
*sdf;
631-
FAIL() << "MT snapshotting to RNTuple is not supported yet";
632-
} catch (const std::runtime_error &err) {
633-
EXPECT_STREQ(err.what(), "Snapshot: Snapshotting to RNTuple with IMT enabled is not supported yet.");
634-
}
628+
auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts);
629+
*sdf;
630+
631+
auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
632+
EXPECT_EQ(25, reader->GetNEntries());
633+
// There should be more than one cluster, but this is not guaranteed because of scheduling...
635634
}
636635
#endif // R__USE_IMT

0 commit comments

Comments
 (0)