Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 188 additions & 0 deletions tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include <utility> // std::index_sequence
#include <vector>
#include <numeric> // std::accumulate in MeanHelper
#include <cmath>

class TCollection;
class TStatistic;
Expand Down Expand Up @@ -1316,6 +1317,193 @@ public:
}
};

// Implements Welford's Online Algorithm for Skewness
class R__CLING_PTRCHECK(off) SkewnessHelper : public RActionImpl<SkewnessHelper> {
unsigned int fNSlots;
std::shared_ptr<double> fResultSkewness;

// Accumulators per slot
std::vector<ULong64_t> fCounts;
std::vector<double> fMeans;
std::vector<double> fM2; // Sum of squares of differences
std::vector<double> fM3; // Sum of cubed differences

public:
SkewnessHelper(const std::shared_ptr<double> &resPtr, const unsigned int nSlots)
: fNSlots(nSlots),
fResultSkewness(resPtr),
fCounts(fNSlots, 0),
fMeans(fNSlots, 0.),
fM2(fNSlots, 0.),
fM3(fNSlots, 0.)
{
}

SkewnessHelper(SkewnessHelper &&) = default;
SkewnessHelper(const SkewnessHelper &) = delete;

void InitTask(TTreeReader *, unsigned int) {}

void Exec(unsigned int slot, double val)
{
ULong64_t n = ++fCounts[slot];
double delta = val - fMeans[slot];
double delta_n = delta / n;
double term1 = delta * delta_n * (n - 1);

fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot];
fM2[slot] += term1;
fMeans[slot] += delta_n;
}

template <typename T, std::enable_if_t<IsDataContainer<T>::value, int> = 0>
void Exec(unsigned int slot, const T &vs)
{
for (auto &&v : vs) {
Exec(slot, v);
}
}

void Initialize() { /* noop */ }

void Finalize()
{
// Merge all slots into slot 0 using Chan et al. parallel algorithm
for (unsigned int i = 1; i < fNSlots; ++i) {
if (fCounts[i] == 0)
continue;

ULong64_t n1 = fCounts[0];
ULong64_t n2 = fCounts[i];
ULong64_t n = n1 + n2;

double delta = fMeans[i] - fMeans[0];
double delta2 = delta * delta;
double delta3 = delta * delta2;

fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n;
fM2[0] += fM2[i] + delta2 * n1 * n2 / n;
fMeans[0] += delta * n2 / n;
fCounts[0] = n;
}

if (fCounts[0] > 2 && fM2[0] > 0) {
// Cast required to resolve ambiguity with RVec math wrappers
*fResultSkewness = (std::sqrt(static_cast<double>(fCounts[0])) * fM3[0]) / std::pow(fM2[0], 1.5);
} else {
*fResultSkewness = 0.0;
}
}

std::string GetActionName() { return "Skewness"; }

SkewnessHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal")
{
auto &result = *static_cast<std::shared_ptr<double> *>(newResult);
return SkewnessHelper(result, fCounts.size());
}

std::unique_ptr<RMergeableValueBase> GetMergeableValue() const final { return nullptr; }
};

// Implements Welford's Online Algorithm for Kurtosis
class R__CLING_PTRCHECK(off) KurtosisHelper : public RActionImpl<KurtosisHelper> {
unsigned int fNSlots;
std::shared_ptr<double> fResultKurtosis;

// Accumulators per slot
std::vector<ULong64_t> fCounts;
std::vector<double> fMeans;
std::vector<double> fM2;
std::vector<double> fM3;
std::vector<double> fM4;

public:
KurtosisHelper(const std::shared_ptr<double> &resPtr, const unsigned int nSlots)
: fNSlots(nSlots),
fResultKurtosis(resPtr),
fCounts(fNSlots, 0),
fMeans(fNSlots, 0.),
fM2(fNSlots, 0.),
fM3(fNSlots, 0.),
fM4(fNSlots, 0.)
{
}

KurtosisHelper(KurtosisHelper &&) = default;
KurtosisHelper(const KurtosisHelper &) = delete;

void InitTask(TTreeReader *, unsigned int) {}

void Exec(unsigned int slot, double val)
{
ULong64_t n = ++fCounts[slot];
double delta = val - fMeans[slot];
double delta_n = delta / n;
double term1 = delta * delta_n * (n - 1);

fM4[slot] +=
term1 * delta_n * delta_n * (n * n - 3 * n + 3) + 6 * delta_n * delta_n * fM2[slot] - 4 * delta_n * fM3[slot];
fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot];
fM2[slot] += term1;
fMeans[slot] += delta_n;
}

template <typename T, std::enable_if_t<IsDataContainer<T>::value, int> = 0>
void Exec(unsigned int slot, const T &vs)
{
for (auto &&v : vs) {
Exec(slot, v);
}
}

void Initialize() { /* noop */ }

void Finalize()
{
for (unsigned int i = 1; i < fNSlots; ++i) {
if (fCounts[i] == 0)
continue;

ULong64_t n1 = fCounts[0];
ULong64_t n2 = fCounts[i];
ULong64_t n = n1 + n2;

double delta = fMeans[i] - fMeans[0];
double delta2 = delta * delta;
double delta3 = delta * delta2;
double delta4 = delta2 * delta2;

fM4[0] += fM4[i] + delta4 * n1 * n2 * (n1 * n1 - n1 * n2 + n2 * n2) / (n * n * n) +
6.0 * delta2 * (n1 * n1 * fM2[i] + n2 * n2 * fM2[0]) / (n * n) +
4.0 * delta * (n1 * fM3[i] - n2 * fM3[0]) / n;

fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n;
fM2[0] += fM2[i] + delta2 * n1 * n2 / n;
fMeans[0] += delta * n2 / n;
fCounts[0] = n;
}

if (fCounts[0] > 3 && fM2[0] > 0) {
// Calculate Excess Kurtosis: (N*M4) / (M2^2) - 3
double n = static_cast<double>(fCounts[0]);
*fResultKurtosis = (n * fM4[0]) / (fM2[0] * fM2[0]) - 3.0;
} else {
*fResultKurtosis = -3.0;
}
}

std::string GetActionName() { return "Kurtosis"; }

KurtosisHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal")
{
auto &result = *static_cast<std::shared_ptr<double> *>(newResult);
return KurtosisHelper(result, fCounts.size());
}

std::unique_ptr<RMergeableValueBase> GetMergeableValue() const final { return nullptr; }
};

template <typename PrevNodeType>
class R__CLING_PTRCHECK(off) DisplayHelper : public RActionImpl<DisplayHelper<PrevNodeType>> {
private:
Expand Down
24 changes: 24 additions & 0 deletions tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ struct Sum{};
struct Mean{};
struct Fill{};
struct StdDev{};
struct Skewness{};
struct Kurtosis{};
struct Display{};
struct Snapshot{};
struct Book{};
Expand Down Expand Up @@ -246,6 +248,28 @@ BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &stdDeviation
return std::make_unique<Action_t>(Helper_t(stdDeviationV, nSlots), bl, prevNode, colRegister);
}

// Skewness action
template <typename ColType, typename PrevNodeType>
std::unique_ptr<RActionBase>
BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &skewnessV, const unsigned int nSlots,
std::shared_ptr<PrevNodeType> prevNode, ActionTags::Skewness, const RColumnRegister &colRegister)
{
using Helper_t = SkewnessHelper;
using Action_t = RAction<Helper_t, PrevNodeType, TTraits::TypeList<ColType>>;
return std::make_unique<Action_t>(Helper_t(skewnessV, nSlots), bl, prevNode, colRegister);
}

// Kurtosis action
template <typename ColType, typename PrevNodeType>
std::unique_ptr<RActionBase>
BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &kurtosisV, const unsigned int nSlots,
std::shared_ptr<PrevNodeType> prevNode, ActionTags::Kurtosis, const RColumnRegister &colRegister)
{
using Helper_t = KurtosisHelper;
using Action_t = RAction<Helper_t, PrevNodeType, TTraits::TypeList<ColType>>;
return std::make_unique<Action_t>(Helper_t(kurtosisV, nSlots), bl, prevNode, colRegister);
}

using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<ROOT::RDF::RDisplay>>;

// Display action
Expand Down
55 changes: 54 additions & 1 deletion tree/dataframe/inc/ROOT/RDF/RInterface.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -2906,7 +2906,60 @@ public:
return CreateAction<RDFInternal::ActionTags::StdDev, T>(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr);
}

// clang-format off
////////////////////////////////////////////////////////////////////////////
/// \brief Return the skewness of processed column values (*lazy action*).
/// \tparam T The type of the branch/column.
/// \param[in] columnName The name of the branch/column to be treated.
/// \return the skewness of the selected column wrapped in a RResultPtr.
///
/// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
/// template specialization of this method.
///
/// This action is *lazy*: upon invocation of this method the calculation is
/// booked but not executed. Also see RResultPtr.
///
/// ### Example usage:
/// ~~~{.cpp}
/// // Deduce column type (this invocation needs jitting internally)
/// auto skew = df.Skewness("values");
/// ~~~
///
/// \ingroup dataframe_actions
template <typename T = RDFDetail::RInferredType>
RResultPtr<double> Skewness(std::string_view columnName = "")
{
const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
auto skewnessV = std::make_shared<double>(0);
return CreateAction<RDFInternal::ActionTags::Skewness, T>(userColumns, skewnessV, skewnessV, fProxiedPtr);
}

////////////////////////////////////////////////////////////////////////////
/// \brief Return the excess kurtosis of processed column values (*lazy action*).
/// \tparam T The type of the branch/column.
/// \param[in] columnName The name of the branch/column to be treated.
/// \return the excess kurtosis of the selected column wrapped in a RResultPtr.
///
/// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
/// template specialization of this method.
///
/// This action is *lazy*: upon invocation of this method the calculation is
/// booked but not executed. Also see RResultPtr.
///
/// ### Example usage:
/// ~~~{.cpp}
/// // Deduce column type (this invocation needs jitting internally)
/// auto kurt = df.Kurtosis("values");
/// ~~~
///
/// \ingroup dataframe_actions
template <typename T = RDFDetail::RInferredType>
RResultPtr<double> Kurtosis(std::string_view columnName = "")
{
const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
auto kurtosisV = std::make_shared<double>(0);
return CreateAction<RDFInternal::ActionTags::Kurtosis, T>(userColumns, kurtosisV, kurtosisV, fProxiedPtr);
}

////////////////////////////////////////////////////////////////////////////
/// \brief Return the sum of processed column values (*lazy action*).
/// \tparam T The type of the branch/column.
Expand Down