diff --git a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx index 0c892b00b9297..4b3221e725fce 100644 --- a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx @@ -44,6 +44,7 @@ #include // std::index_sequence #include #include // std::accumulate in MeanHelper +#include class TCollection; class TStatistic; @@ -1316,6 +1317,193 @@ public: } }; +// Implements Welford's Online Algorithm for Skewness +class R__CLING_PTRCHECK(off) SkewnessHelper : public RActionImpl { + unsigned int fNSlots; + std::shared_ptr fResultSkewness; + + // Accumulators per slot + std::vector fCounts; + std::vector fMeans; + std::vector fM2; // Sum of squares of differences + std::vector fM3; // Sum of cubed differences + +public: + SkewnessHelper(const std::shared_ptr &resPtr, const unsigned int nSlots) + : fNSlots(nSlots), + fResultSkewness(resPtr), + fCounts(fNSlots, 0), + fMeans(fNSlots, 0.), + fM2(fNSlots, 0.), + fM3(fNSlots, 0.) + { + } + + SkewnessHelper(SkewnessHelper &&) = default; + SkewnessHelper(const SkewnessHelper &) = delete; + + void InitTask(TTreeReader *, unsigned int) {} + + void Exec(unsigned int slot, double val) + { + ULong64_t n = ++fCounts[slot]; + double delta = val - fMeans[slot]; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1); + + fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot]; + fM2[slot] += term1; + fMeans[slot] += delta_n; + } + + template ::value, int> = 0> + void Exec(unsigned int slot, const T &vs) + { + for (auto &&v : vs) { + Exec(slot, v); + } + } + + void Initialize() { /* noop */ } + + void Finalize() + { + // Merge all slots into slot 0 using Chan et al. parallel algorithm + for (unsigned int i = 1; i < fNSlots; ++i) { + if (fCounts[i] == 0) + continue; + + ULong64_t n1 = fCounts[0]; + ULong64_t n2 = fCounts[i]; + ULong64_t n = n1 + n2; + + double delta = fMeans[i] - fMeans[0]; + double delta2 = delta * delta; + double delta3 = delta * delta2; + + fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n; + fM2[0] += fM2[i] + delta2 * n1 * n2 / n; + fMeans[0] += delta * n2 / n; + fCounts[0] = n; + } + + if (fCounts[0] > 2 && fM2[0] > 0) { + // Cast required to resolve ambiguity with RVec math wrappers + *fResultSkewness = (std::sqrt(static_cast(fCounts[0])) * fM3[0]) / std::pow(fM2[0], 1.5); + } else { + *fResultSkewness = 0.0; + } + } + + std::string GetActionName() { return "Skewness"; } + + SkewnessHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal") + { + auto &result = *static_cast *>(newResult); + return SkewnessHelper(result, fCounts.size()); + } + + std::unique_ptr GetMergeableValue() const final { return nullptr; } +}; + +// Implements Welford's Online Algorithm for Kurtosis +class R__CLING_PTRCHECK(off) KurtosisHelper : public RActionImpl { + unsigned int fNSlots; + std::shared_ptr fResultKurtosis; + + // Accumulators per slot + std::vector fCounts; + std::vector fMeans; + std::vector fM2; + std::vector fM3; + std::vector fM4; + +public: + KurtosisHelper(const std::shared_ptr &resPtr, const unsigned int nSlots) + : fNSlots(nSlots), + fResultKurtosis(resPtr), + fCounts(fNSlots, 0), + fMeans(fNSlots, 0.), + fM2(fNSlots, 0.), + fM3(fNSlots, 0.), + fM4(fNSlots, 0.) + { + } + + KurtosisHelper(KurtosisHelper &&) = default; + KurtosisHelper(const KurtosisHelper &) = delete; + + void InitTask(TTreeReader *, unsigned int) {} + + void Exec(unsigned int slot, double val) + { + ULong64_t n = ++fCounts[slot]; + double delta = val - fMeans[slot]; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1); + + fM4[slot] += + term1 * delta_n * delta_n * (n * n - 3 * n + 3) + 6 * delta_n * delta_n * fM2[slot] - 4 * delta_n * fM3[slot]; + fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot]; + fM2[slot] += term1; + fMeans[slot] += delta_n; + } + + template ::value, int> = 0> + void Exec(unsigned int slot, const T &vs) + { + for (auto &&v : vs) { + Exec(slot, v); + } + } + + void Initialize() { /* noop */ } + + void Finalize() + { + for (unsigned int i = 1; i < fNSlots; ++i) { + if (fCounts[i] == 0) + continue; + + ULong64_t n1 = fCounts[0]; + ULong64_t n2 = fCounts[i]; + ULong64_t n = n1 + n2; + + double delta = fMeans[i] - fMeans[0]; + double delta2 = delta * delta; + double delta3 = delta * delta2; + double delta4 = delta2 * delta2; + + fM4[0] += fM4[i] + delta4 * n1 * n2 * (n1 * n1 - n1 * n2 + n2 * n2) / (n * n * n) + + 6.0 * delta2 * (n1 * n1 * fM2[i] + n2 * n2 * fM2[0]) / (n * n) + + 4.0 * delta * (n1 * fM3[i] - n2 * fM3[0]) / n; + + fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n; + fM2[0] += fM2[i] + delta2 * n1 * n2 / n; + fMeans[0] += delta * n2 / n; + fCounts[0] = n; + } + + if (fCounts[0] > 3 && fM2[0] > 0) { + // Calculate Excess Kurtosis: (N*M4) / (M2^2) - 3 + double n = static_cast(fCounts[0]); + *fResultKurtosis = (n * fM4[0]) / (fM2[0] * fM2[0]) - 3.0; + } else { + *fResultKurtosis = -3.0; + } + } + + std::string GetActionName() { return "Kurtosis"; } + + KurtosisHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal") + { + auto &result = *static_cast *>(newResult); + return KurtosisHelper(result, fCounts.size()); + } + + std::unique_ptr GetMergeableValue() const final { return nullptr; } +}; + template class R__CLING_PTRCHECK(off) DisplayHelper : public RActionImpl> { private: diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index 6f4a428640a6f..538441f27ddfa 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -100,6 +100,8 @@ struct Sum{}; struct Mean{}; struct Fill{}; struct StdDev{}; +struct Skewness{}; +struct Kurtosis{}; struct Display{}; struct Snapshot{}; struct Book{}; @@ -246,6 +248,28 @@ BuildAction(const ColumnNames_t &bl, const std::shared_ptr &stdDeviation return std::make_unique(Helper_t(stdDeviationV, nSlots), bl, prevNode, colRegister); } +// Skewness action +template +std::unique_ptr +BuildAction(const ColumnNames_t &bl, const std::shared_ptr &skewnessV, const unsigned int nSlots, + std::shared_ptr prevNode, ActionTags::Skewness, const RColumnRegister &colRegister) +{ + using Helper_t = SkewnessHelper; + using Action_t = RAction>; + return std::make_unique(Helper_t(skewnessV, nSlots), bl, prevNode, colRegister); +} + +// Kurtosis action +template +std::unique_ptr +BuildAction(const ColumnNames_t &bl, const std::shared_ptr &kurtosisV, const unsigned int nSlots, + std::shared_ptr prevNode, ActionTags::Kurtosis, const RColumnRegister &colRegister) +{ + using Helper_t = KurtosisHelper; + using Action_t = RAction>; + return std::make_unique(Helper_t(kurtosisV, nSlots), bl, prevNode, colRegister); +} + using displayHelperArgs_t = std::pair>; // Display action diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index 242a34bd94f67..d2817e45c55a1 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -2906,7 +2906,60 @@ public: return CreateAction(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr); } - // clang-format off + //////////////////////////////////////////////////////////////////////////// + /// \brief Return the skewness of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the skewness of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column type (this invocation needs jitting internally) + /// auto skew = df.Skewness("values"); + /// ~~~ + /// + /// \ingroup dataframe_actions + template + RResultPtr Skewness(std::string_view columnName = "") + { + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto skewnessV = std::make_shared(0); + return CreateAction(userColumns, skewnessV, skewnessV, fProxiedPtr); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Return the excess kurtosis of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the excess kurtosis of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column type (this invocation needs jitting internally) + /// auto kurt = df.Kurtosis("values"); + /// ~~~ + /// + /// \ingroup dataframe_actions + template + RResultPtr Kurtosis(std::string_view columnName = "") + { + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto kurtosisV = std::make_shared(0); + return CreateAction(userColumns, kurtosisV, kurtosisV, fProxiedPtr); + } + //////////////////////////////////////////////////////////////////////////// /// \brief Return the sum of processed column values (*lazy action*). /// \tparam T The type of the branch/column.