Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
02767e8
add xsimd dependency
Nov 4, 2025
e9c12f3
load and store functions added to all array like data types
Nov 4, 2025
d4fc2d3
implementation of type traits and concepts to check whether an expres…
Nov 4, 2025
c588306
implemented for_each with SIMD chunks and assign arrays in SIMD chunk…
Nov 4, 2025
8a90043
vectorized the existing algorithms
Nov 4, 2025
7175c24
added march=native flag when compiling tests
Nov 4, 2025
ef1969a
added mock_simd class and updated Load concept to include mock_simd
Nov 4, 2025
c3784f8
added tests for load and store functions for every data types and upd…
Nov 4, 2025
2583a37
added tests for is_simd_enabled trait for different expressions and a…
Nov 4, 2025
3bff50e
vectorize the existing math functions using xsimd
Nov 5, 2025
7b3c944
correctly enable -march=native on test runs
Nov 5, 2025
201745f
add simd cost model and basic expression tree cost calculator for sta…
Nov 6, 2025
8724b3a
add dispatch policy and divide is_simd_enabled trait into 4 smaller i…
Nov 6, 2025
54b3954
Refactor load function to use tagged dispatch and cost-based emulatio…
Nov 7, 2025
2bf0588
add some tests to newly added traits for is_simd_enabled and correct …
Nov 7, 2025
a8dcb34
add tests for expression cost model and dispatch policy
Nov 7, 2025
ff56a0a
use FORCEINLINE instead of [[gnu::always_inline]]
Nov 10, 2025
b6de1ce
merge load function inside expr with auto simd_tag to avoid duplicate…
Nov 10, 2025
849547a
simplify mapped functions to use single lambda function and make them…
Nov 10, 2025
5124560
Convert trait structs (expression_cost, has_same_layout, has_vectoriz…
Nov 11, 2025
47e936f
move forward declaration to simd_fwd.hpp file
Nov 11, 2025
4c4bc70
make simd_cost_model more clear and understandable with two different…
Nov 11, 2025
d34c353
remove if constexpr by replacing it with xsimd::fma and fix a typo fr…
Nov 11, 2025
537f9f5
add more comments on diagonal_simd lambda inside expr class to make i…
Nov 11, 2025
3d9c14a
comment out the fold function
Nov 11, 2025
cf8d83f
make the template naming in simd_cost consistent
Nov 11, 2025
2641d1e
update the apply_function inside mock_simd class to force loop unrolling
Nov 11, 2025
6a92fe9
replace else if constexpr with if constexpr in simd_cost file
Nov 12, 2025
2bfbabb
add FORCEINLINE to some of the functions
Nov 12, 2025
d038a6e
fix static_assert message
Nov 12, 2025
0ee0309
add bound checks to load and store functions in basic_array/basic_arr…
Nov 12, 2025
8377d0a
remove duplication in for_each_static_impl
Nov 13, 2025
ab7339e
remove redundant using std::xxx in mapped_functions as xsimd has alre…
Nov 13, 2025
5aa1931
replace assert with throw for out of bounds check for load function
Nov 13, 2025
2d1df02
remove duplicated load functions with auto in arrays and expr_unary
Nov 13, 2025
b139894
update cmake files
Nov 14, 2025
1065880
add licenses to newly created files
Nov 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ doc/html
.*.swo
.*.swn
.claude
.idea
3 changes: 3 additions & 0 deletions c++/nda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ if(OpenMPSupport)
target_compile_definitions(${PROJECT_NAME}_c PUBLIC NDA_HAVE_OPENMP)
endif ()

#XSIMD
target_link_libraries(${PROJECT_NAME}_c PUBLIC xsimd)

# ========= Blas / Lapack ==========

message(STATUS "-------- Lapack detection -------------")
Expand Down
43 changes: 42 additions & 1 deletion c++/nda/_impl_basic_array_view_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,39 @@ FORCEINLINE decltype(auto) operator()(Ts const &...idxs) && noexcept(has_no_boun
return call<Algebra, true>(*this, idxs...);
}

private:
// Bounds check for SIMD loads/stores. SIMD access is currently only implemented for
// contiguous layouts; if that restriction is ever relaxed, this check must be updated
// to account for the actual stride pattern.
void assert_simd_access_bounds(const long offset) const noexcept(has_no_boundcheck) {
  static_assert(
     has_contiguous_layout<self_t>,
     "This function should only be called for contiguous layouts. It can only fail if the rules of vectorization are relaxed, in which case this function needs to be updated");
  if constexpr (!has_no_boundcheck) {
    // A full SIMD lane starting at 'offset' must fit inside the flat data range.
    if (offset + native_simd<ValueType>::size > this->size()) {
      throw std::runtime_error("Index out of bounds for SIMD access.\n");
    }
  }
}

public:

template <typename... Args>
FORCEINLINE native_simd<ValueType> load(auto simd_tag, Args... idx) const noexcept(has_no_boundcheck) {
  // Only the vectorize/emulate tags are meaningful here; scalar access goes through operator().
  using tag_t = decltype(simd_tag);
  static_assert(std::is_same_v<tag_t, simd::vectorize_t> or std::is_same_v<tag_t, simd::emulate_t>,
                "Load tag can only be vectorize or emulate");
  static_assert(Vectorizable<ValueType>, "Load function is called with a type that is not a vectorizable type");
  // Map the multi-dimensional index to a flat offset, verify a full SIMD lane fits,
  // then perform an unaligned load from the underlying storage.
  auto const flat = lay(idx...);
  assert_simd_access_bounds(flat);
  return native_simd<ValueType>::load_unaligned(data() + flat);
}

template <typename... Args>
FORCEINLINE void store(const native_simd<ValueType> &value, Args... idx) noexcept(has_no_boundcheck) {
  static_assert(Vectorizable<ValueType>, "Store function is called with a type that is not a vectorizable type");
  // Flatten the index, bound-check the full SIMD lane, then write unaligned.
  auto const flat = lay(idx...);
  assert_simd_access_bounds(flat);
  value.store_unaligned(data() + flat);
}

/**
* @brief Subscript operator to access the 1-dimensional view/array.
*
Expand Down Expand Up @@ -496,7 +529,15 @@ void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept {
if constexpr (mem::on_device<self_t> || mem::on_device<RHS>) {
NDA_RUNTIME_ERROR << "Error in assign_from_ndarray: Fallback to elementwise assignment not implemented for arrays/views on the GPU";
}
nda::for_each(shape(), [this, &rhs](auto const &...args) { (*this)(args...) = rhs(args...); });
using dispatch_t = simd::dispatch_policy_t<RHS, ValueType>;
if constexpr (same_stride_order
and is_simd_enabled_v<self_t> and (std::is_same_v<dispatch_t, simd::vectorize_t> or std::is_same_v<dispatch_t, simd::emulate_t>)) {
nda::for_each_static<0, get_layout_info<self_t>.stride_order, native_simd<ValueType>::size>(
shape(), [this, &rhs](auto const &...args) { (*this).store(rhs.load(dispatch_t{}, args...), args...); },
[this, &rhs](auto const &...args) { (*this)(args...) = rhs(args...); });
} else {
nda::for_each(shape(), [this, &rhs](auto const &...args) { (*this)(args...) = rhs(args...); });
}
}

// Implementation to fill a view/array with a constant scalar value.
Expand Down
139 changes: 116 additions & 23 deletions c++/nda/algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,20 @@ namespace nda {
return fold(std::move(f), a, get_value_t<A>{});
}

//TODO: Maybe add another fold function that can interact with SIMD types.
// template <Array A, typename F_SIMD, typename F_SCALAR, Vectorizable R>
// requires(std::is_same_v<simd::dispatch_policy_t<A, R>, simd::vectorize_t> or std::is_same_v<simd::dispatch_policy_t<A, R>, simd::emulate_t>)
// auto fold(F_SIMD f_simd, F_SCALAR f_scalar, A const &a, native_simd<R> r_simd, R r_scalar) {
// nda::for_each_static<0, get_layout_info<A>.stride_order, native_simd<R>::size>(
// a.shape(),
// [&a, &r_simd, &f_simd](auto &&...args) { r_simd = f_simd(r_simd, native_simd<R>(a.load(simd::dispatch_policy_t<A, R>{}, args...))); },
// [&a, &r_scalar, &f_scalar](auto &&...args) { r_scalar = f_scalar(r_scalar, a(args...)); });
// alignas(native_simd<R>::arch_type::alignment()) std::array<R, r_simd.size()> res;
// r_simd.store(res.data());
// for (int i = 0; i < r_simd.size(); i++) { r_scalar = f_scalar(r_scalar, res[i]); }
// return r_scalar;
// }

/**
* @brief Does any of the elements of the array evaluate to true?
*
Expand Down Expand Up @@ -122,12 +136,24 @@ namespace nda {
*/
template <Array A>
auto max_element(A const &a) {
  using dispatch_t = simd::dispatch_policy_t<A>;
  if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
    // Scalar fallback: plain fold with max (ADL-enabled via 'using std::max').
    return fold(
       [](auto const &x, auto const &y) {
         using std::max;
         return max(x, y);
       },
       a, get_first_element(a));
  } else {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // Seed both accumulators with an existing element so the result is always
    // a value that actually occurs in the array.
    simd_t max_simd(get_first_element(a));
    value_t max_scalar = get_first_element(a);
    // Full SIMD chunks go through f_simd, the leftover tail through f_scalar.
    auto f_simd   = [&a, &max_simd](auto &&...args) { max_simd = xsimd::max(max_simd, a.load(dispatch_t{}, args...)); };
    auto f_scalar = [&a, &max_scalar](auto &&...args) { max_scalar = std::max(max_scalar, a(args...)); };
    nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
    // Combine the horizontal SIMD maximum with the scalar tail maximum.
    return std::max(max_scalar, xsimd::reduce_max(max_simd));
  }
}

/**
Expand All @@ -141,12 +167,24 @@ namespace nda {
*/
template <Array A>
auto min_element(A const &a) {
  using dispatch_t = simd::dispatch_policy_t<A>;
  if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
    // Scalar fallback: plain fold with min (ADL-enabled via 'using std::min').
    return fold(
       [](auto const &x, auto const &y) {
         using std::min;
         return min(x, y);
       },
       a, get_first_element(a));
  } else {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // Seed both accumulators with an existing element so the result is always
    // a value that actually occurs in the array.
    simd_t min_simd(get_first_element(a));
    value_t min_scalar = get_first_element(a);
    // Full SIMD chunks go through f_simd, the leftover tail through f_scalar.
    auto f_simd   = [&a, &min_simd](auto &&...args) { min_simd = xsimd::min(min_simd, a.load(dispatch_t{}, args...)); };
    auto f_scalar = [&a, &min_scalar](auto &&...args) { min_scalar = std::min(min_scalar, a(args...)); };
    nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
    // Combine the horizontal SIMD minimum with the scalar tail minimum.
    return std::min(min_scalar, xsimd::reduce_min(min_simd));
  }
}

/**
Expand All @@ -159,12 +197,31 @@ namespace nda {
*/
template <ArrayOfRank<2> A>
double frobenius_norm(A const &a) {
  using dispatch_t = simd::dispatch_policy_t<A>;
  if constexpr (std::is_same_v<dispatch_t, simd::scalar_t> or is_complex_v<get_value_t<A>>) {
    // Scalar path (also used for complex values, where std::abs yields a double).
    // Note: local renamed from 'abs' to 'ab' to avoid shadowing std::abs.
    return std::sqrt(fold(
       [](double r, auto const &x) -> double {
         auto ab = std::abs(x);
         return xsimd::fma(ab, ab, r);
       },
       a, double(0)));
  } else {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // SIMD accumulator for full chunks; scalar accumulator for the tail.
    simd_t r_simd(value_t(0));
    auto f_simd = [&a, &r_simd](auto &&...args) {
      simd_t x  = a.load(dispatch_t{}, args...);
      simd_t ab = xsimd::abs(x);
      r_simd    = xsimd::fma(ab, ab, r_simd);
    };
    double r      = 0;
    auto f_scalar = [&a, &r](auto &&...args) {
      auto ab = std::abs(a(args...));
      r       = xsimd::fma(ab, ab, r);
    };
    nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
    // Horizontal sum of the SIMD lanes plus the scalar tail, then the square root.
    return std::sqrt(static_cast<double>(xsimd::reduce_add(r_simd)) + r);
  }
}

/**
Expand All @@ -179,8 +236,21 @@ namespace nda {
requires(nda::Scalar<Value> or nda::Array<Value>)
{
  if constexpr (nda::Scalar<Value>) {
    using dispatch_t = simd::dispatch_policy_t<A>;
    if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
      // Scalar fallback: fold with +, starting from a value-initialized (zero) accumulator.
      return fold(std::plus<>{}, a);
    } else {
      using value_t = get_value_t<A>;
      using simd_t  = native_simd<value_t>;
      // SIMD accumulator for full chunks; scalar accumulator for the tail.
      simd_t sum_simd(value_t{0});
      value_t sum_scalar{0};
      auto f_simd   = [&a, &sum_simd](auto &&...args) { sum_simd += a.load(dispatch_t{}, args...); };
      auto f_scalar = [&a, &sum_scalar](auto &&...args) { sum_scalar += a(args...); };
      nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
      // Horizontal SIMD sum plus the scalar tail.
      return sum_scalar + xsimd::reduce_add(sum_simd);
    }
  } else {
    // Array<Value>: element-wise sum of sub-arrays, starting from a zero array of matching shape.
    return fold(std::plus<>{}, a, Value::zeros(get_first_element(a).shape()));
  }
}
Expand All @@ -197,8 +267,21 @@ namespace nda {
requires(nda::Scalar<Value> or nda::Array<Value>)
{
  if constexpr (nda::Scalar<Value>) {
    using dispatch_t = simd::dispatch_policy_t<A>;
    if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
      // BUG FIX: the two-argument fold(f, a) value-initializes the accumulator,
      // i.e. starts from 0, which makes every product vanish. A multiplicative
      // reduction must start from 1.
      return fold(std::multiplies<>{}, a, get_value_t<A>{1});
    } else {
      using value_t = get_value_t<A>;
      using simd_t  = native_simd<value_t>;
      // SIMD accumulator (all lanes start at 1) for full chunks; scalar accumulator for the tail.
      simd_t product_simd(value_t{1});
      value_t product_scalar{1};
      auto f_simd   = [&a, &product_simd](auto &&...args) { product_simd *= a.load(dispatch_t{}, args...); };
      auto f_scalar = [&a, &product_scalar](auto &&...args) { product_scalar *= a(args...); };
      nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
      // Horizontal SIMD product times the scalar tail.
      return product_scalar * xsimd::reduce_mul(product_simd);
    }
  } else {
    // Array<Value>: element-wise product of sub-arrays, starting from a ones array of matching shape.
    return fold(std::multiplies<>{}, a, Value::ones(get_first_element(a).shape()));
  }
}
Expand All @@ -215,7 +298,17 @@ namespace nda {
template <Array A, Array B>
  requires(nda::get_rank<A> == nda::get_rank<B>)
[[nodiscard]] constexpr auto hadamard(A &&a, B &&b) {
  if constexpr (is_simd_enabled_v<A> and is_simd_enabled_v<B> and std::is_same_v<get_value_t<A>, get_value_t<B>>) {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // Functor providing both a scalar call operator and a SIMD 'load' overload,
    // so nda::map can dispatch element-wise or chunk-wise.
    struct mul {
      value_t operator()(const value_t &x, const value_t &y) const { return x * y; }
      simd_t load(const simd_t &x, const simd_t &y) const { return x * y; }
    };
    return nda::map(mul{})(std::forward<A>(a), std::forward<B>(b));
  } else {
    // Fallback: plain element-wise product via a generic lambda.
    return nda::map([](auto const &x, auto const &y) { return x * y; })(std::forward<A>(a), std::forward<B>(b));
  }
}

/**
Expand Down
Loading
Loading