Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
02767e8
add xsimd dependency
Nov 4, 2025
e9c12f3
load and store functions added to all array like data types
Nov 4, 2025
d4fc2d3
implementation of type traits and concepts to check whether an expres…
Nov 4, 2025
c588306
implemented for_each with SIMD chunks and assign arrays in SIMD chunk…
Nov 4, 2025
8a90043
vectorized the existing algorithms
Nov 4, 2025
7175c24
added march=native flag when compiling tests
Nov 4, 2025
ef1969a
added mock_simd class and updated Load concept to include mock_simd
Nov 4, 2025
c3784f8
added tests for load and store functions for every data types and upd…
Nov 4, 2025
2583a37
added tests for is_simd_enabled trait for different expressions and a…
Nov 4, 2025
3bff50e
vectorize the existing math functions using xsimd
Nov 5, 2025
7b3c944
correctly enable -march=native on test runs
Nov 5, 2025
201745f
add simd cost model and basic expression tree cost calculator for sta…
Nov 6, 2025
8724b3a
add dispatch policy and divide is_simd_enabled trait into 4 smaller i…
Nov 6, 2025
54b3954
Refactor load function to use tagged dispatch and cost-based emulatio…
Nov 7, 2025
2bf0588
add some tests to newly added traits for is_simd_enabled and correct …
Nov 7, 2025
a8dcb34
add tests for expression cost model and dispatch policy
Nov 7, 2025
ff56a0a
use FORCEINLINE instead of [[gnu::always_inline]]
Nov 10, 2025
b6de1ce
merge load function inside expr with auto simd_tag to avoid duplicate…
Nov 10, 2025
849547a
simplify mapped functions to use single lambda function and make them…
Nov 10, 2025
5124560
Convert trait structs (expression_cost, has_same_layout, has_vectoriz…
Nov 11, 2025
47e936f
move forward declaration to simd_fwd.hpp file
Nov 11, 2025
4c4bc70
make simd_cost_model more clear and understandable with two different…
Nov 11, 2025
d34c353
remove if constexpr by replacing it with xsimd::fma and fix a typo fr…
Nov 11, 2025
537f9f5
add more comments on diagonal_simd lambda inside expr class to make i…
Nov 11, 2025
3d9c14a
comment out the fold function
Nov 11, 2025
cf8d83f
make the template naming in simd_cost consistent
Nov 11, 2025
2641d1e
update the apply_function inside mock_simd class to force loop unrolling
Nov 11, 2025
6a92fe9
replace else if constexpr with if constexpr in simd_cost file
Nov 12, 2025
2bfbabb
add FORCEINLINE to some of the functions
Nov 12, 2025
d038a6e
fix static_assert message
Nov 12, 2025
0ee0309
add bound checks to load and store functions in basic_array/basic_arr…
Nov 12, 2025
8377d0a
remove duplication in for_each_static_impl
Nov 13, 2025
ab7339e
remove redundant using std::xxx in mapped_functions as xsimd has alre…
Nov 13, 2025
5aa1931
replace assert with throw for out of bounds check for load function
Nov 13, 2025
2d1df02
remove duplicated load functions with auto in arrays and expr_unary
Nov 13, 2025
b139894
update cmake files
Nov 14, 2025
1065880
add licenses to newly created files
Nov 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ doc/html
.*.swo
.*.swn
.claude
.idea
3 changes: 3 additions & 0 deletions c++/nda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ if(OpenMPSupport)
target_compile_definitions(${PROJECT_NAME}_c PUBLIC NDA_HAVE_OPENMP)
endif ()

#XSIMD
target_link_libraries(${PROJECT_NAME}_c PUBLIC xsimd)

# ========= Blas / Lapack ==========

message(STATUS "-------- Lapack detection -------------")
Expand Down
43 changes: 42 additions & 1 deletion c++/nda/_impl_basic_array_view_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,39 @@ FORCEINLINE decltype(auto) operator()(Ts const &...idxs) && noexcept(has_no_boun
return call<Algebra, true>(*this, idxs...);
}

private:
// Bounds check for SIMD loads/stores. SIMD access is currently only implemented for
// contiguous layouts; if that restriction is ever relaxed, this check must be updated
// to account for the actual stride pattern.
void assert_simd_access_bounds(const long offset) const noexcept(has_no_boundcheck) {
  static_assert(
     has_contiguous_layout<self_t>,
     "This function should only be called for contiguous layouts. It can only fail if the rules of vectorization are relaxed, in which case this function needs to be updated");
  if constexpr (!has_no_boundcheck) {
    // A full SIMD lane starting at 'offset' must fit inside the flat data range.
    if (offset + native_simd<ValueType>::size > this->size()) {
      throw std::runtime_error("Index out of bounds for SIMD access.\n");
    }
  }
}

public:

template <typename... Args>
FORCEINLINE native_simd<ValueType> load(auto simd_tag, Args... idx) const noexcept(has_no_boundcheck) {
  // Only the vectorize/emulate tags are meaningful here; scalar access goes through operator().
  using tag_t = decltype(simd_tag);
  static_assert(std::is_same_v<tag_t, simd::vectorize_t> or std::is_same_v<tag_t, simd::emulate_t>,
                "Load tag can only be vectorize or emulate");
  static_assert(Vectorizable<ValueType>, "Load function is called with a type that is not a vectorizable type");
  // Map the multi-dimensional index to a flat offset, verify a full SIMD lane fits,
  // then perform an unaligned load from the underlying storage.
  auto const flat = lay(idx...);
  assert_simd_access_bounds(flat);
  return native_simd<ValueType>::load_unaligned(data() + flat);
}

template <typename... Args>
FORCEINLINE void store(const native_simd<ValueType> &value, Args... idx) noexcept(has_no_boundcheck) {
  static_assert(Vectorizable<ValueType>, "Store function is called with a type that is not a vectorizable type");
  // Flatten the index, bound-check the full SIMD lane, then write unaligned.
  auto const flat = lay(idx...);
  assert_simd_access_bounds(flat);
  value.store_unaligned(data() + flat);
}

/**
* @brief Subscript operator to access the 1-dimensional view/array.
*
Expand Down Expand Up @@ -496,7 +529,15 @@ void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept {
if constexpr (mem::on_device<self_t> || mem::on_device<RHS>) {
NDA_RUNTIME_ERROR << "Error in assign_from_ndarray: Fallback to elementwise assignment not implemented for arrays/views on the GPU";
}
nda::for_each(shape(), [this, &rhs](auto const &...args) { (*this)(args...) = rhs(args...); });
using dispatch_t = simd::dispatch_policy_t<RHS, ValueType>;
if constexpr (same_stride_order
and is_simd_enabled_v<self_t> and (std::is_same_v<dispatch_t, simd::vectorize_t> or std::is_same_v<dispatch_t, simd::emulate_t>)) {
nda::for_each_static<0, get_layout_info<self_t>.stride_order, native_simd<ValueType>::size>(
shape(), [this, &rhs](auto const &...args) { (*this).store(rhs.load(dispatch_t{}, args...), args...); },
[this, &rhs](auto const &...args) { (*this)(args...) = rhs(args...); });
} else {
nda::for_each(shape(), [this, &rhs](auto const &...args) { (*this)(args...) = rhs(args...); });
}
}

// Implementation to fill a view/array with a constant scalar value.
Expand Down
139 changes: 116 additions & 23 deletions c++/nda/algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,20 @@ namespace nda {
return fold(std::move(f), a, get_value_t<A>{});
}

//TODO: Maybe add another fold function that can interact with SIMD types.
// template <Array A, typename F_SIMD, typename F_SCALAR, Vectorizable R>
// requires(std::is_same_v<simd::dispatch_policy_t<A, R>, simd::vectorize_t> or std::is_same_v<simd::dispatch_policy_t<A, R>, simd::emulate_t>)
// auto fold(F_SIMD f_simd, F_SCALAR f_scalar, A const &a, native_simd<R> r_simd, R r_scalar) {
// nda::for_each_static<0, get_layout_info<A>.stride_order, native_simd<R>::size>(
// a.shape(),
// [&a, &r_simd, &f_simd](auto &&...args) { r_simd = f_simd(r_simd, native_simd<R>(a.load(simd::dispatch_policy_t<A, R>{}, args...))); },
// [&a, &r_scalar, &f_scalar](auto &&...args) { r_scalar = f_scalar(r_scalar, a(args...)); });
// alignas(native_simd<R>::arch_type::alignment()) std::array<R, r_simd.size()> res;
// r_simd.store(res.data());
// for (int i = 0; i < r_simd.size(); i++) { r_scalar = f_scalar(r_scalar, res[i]); }
// return r_scalar;
// }

/**
* @brief Does any of the elements of the array evaluate to true?
*
Expand Down Expand Up @@ -122,12 +136,24 @@ namespace nda {
*/
template <Array A>
auto max_element(A const &a) {
  using dispatch_t = simd::dispatch_policy_t<A>;
  if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
    // Scalar fallback: plain fold with max (ADL-enabled via 'using std::max').
    return fold(
       [](auto const &x, auto const &y) {
         using std::max;
         return max(x, y);
       },
       a, get_first_element(a));
  } else {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // Seed both accumulators with an existing element so the result is always
    // a value that actually occurs in the array.
    simd_t max_simd(get_first_element(a));
    value_t max_scalar = get_first_element(a);
    // Full SIMD chunks go through f_simd, the leftover tail through f_scalar.
    auto f_simd   = [&a, &max_simd](auto &&...args) { max_simd = xsimd::max(max_simd, a.load(dispatch_t{}, args...)); };
    auto f_scalar = [&a, &max_scalar](auto &&...args) { max_scalar = std::max(max_scalar, a(args...)); };
    nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
    // Combine the horizontal SIMD maximum with the scalar tail maximum.
    return std::max(max_scalar, xsimd::reduce_max(max_simd));
  }
}

/**
Expand All @@ -141,12 +167,24 @@ namespace nda {
*/
template <Array A>
auto min_element(A const &a) {
  using dispatch_t = simd::dispatch_policy_t<A>;
  if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
    // Scalar fallback: plain fold with min (ADL-enabled via 'using std::min').
    return fold(
       [](auto const &x, auto const &y) {
         using std::min;
         return min(x, y);
       },
       a, get_first_element(a));
  } else {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // Seed both accumulators with an existing element so the result is always
    // a value that actually occurs in the array.
    simd_t min_simd(get_first_element(a));
    value_t min_scalar = get_first_element(a);
    // Full SIMD chunks go through f_simd, the leftover tail through f_scalar.
    auto f_simd   = [&a, &min_simd](auto &&...args) { min_simd = xsimd::min(min_simd, a.load(dispatch_t{}, args...)); };
    auto f_scalar = [&a, &min_scalar](auto &&...args) { min_scalar = std::min(min_scalar, a(args...)); };
    nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
    // Combine the horizontal SIMD minimum with the scalar tail minimum.
    return std::min(min_scalar, xsimd::reduce_min(min_simd));
  }
}

/**
Expand All @@ -159,12 +197,31 @@ namespace nda {
*/
template <ArrayOfRank<2> A>
double frobenius_norm(A const &a) {
  using dispatch_t = simd::dispatch_policy_t<A>;
  if constexpr (std::is_same_v<dispatch_t, simd::scalar_t> or is_complex_v<get_value_t<A>>) {
    // Scalar path (also used for complex values, where std::abs yields a double).
    // Note: local renamed from 'abs' to 'ab' to avoid shadowing std::abs.
    return std::sqrt(fold(
       [](double r, auto const &x) -> double {
         auto ab = std::abs(x);
         return xsimd::fma(ab, ab, r);
       },
       a, double(0)));
  } else {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // SIMD accumulator for full chunks; scalar accumulator for the tail.
    simd_t r_simd(value_t(0));
    auto f_simd = [&a, &r_simd](auto &&...args) {
      simd_t x  = a.load(dispatch_t{}, args...);
      simd_t ab = xsimd::abs(x);
      r_simd    = xsimd::fma(ab, ab, r_simd);
    };
    double r      = 0;
    auto f_scalar = [&a, &r](auto &&...args) {
      auto ab = std::abs(a(args...));
      r       = xsimd::fma(ab, ab, r);
    };
    nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
    // Horizontal sum of the SIMD lanes plus the scalar tail, then the square root.
    return std::sqrt(static_cast<double>(xsimd::reduce_add(r_simd)) + r);
  }
}

/**
Expand All @@ -179,8 +236,21 @@ namespace nda {
requires(nda::Scalar<Value> or nda::Array<Value>)
{
  if constexpr (nda::Scalar<Value>) {
    using dispatch_t = simd::dispatch_policy_t<A>;
    if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
      // Scalar fallback: fold with +, starting from a value-initialized (zero) accumulator.
      return fold(std::plus<>{}, a);
    } else {
      using value_t = get_value_t<A>;
      using simd_t  = native_simd<value_t>;
      // SIMD accumulator for full chunks; scalar accumulator for the tail.
      simd_t sum_simd(value_t{0});
      value_t sum_scalar{0};
      auto f_simd   = [&a, &sum_simd](auto &&...args) { sum_simd += a.load(dispatch_t{}, args...); };
      auto f_scalar = [&a, &sum_scalar](auto &&...args) { sum_scalar += a(args...); };
      nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
      // Horizontal SIMD sum plus the scalar tail.
      return sum_scalar + xsimd::reduce_add(sum_simd);
    }
  } else {
    // Array<Value>: element-wise sum of sub-arrays, starting from a zero array of matching shape.
    return fold(std::plus<>{}, a, Value::zeros(get_first_element(a).shape()));
  }
}
Expand All @@ -197,8 +267,21 @@ namespace nda {
requires(nda::Scalar<Value> or nda::Array<Value>)
{
  if constexpr (nda::Scalar<Value>) {
    using dispatch_t = simd::dispatch_policy_t<A>;
    if constexpr (std::is_same_v<dispatch_t, simd::scalar_t>) {
      // BUG FIX: the two-argument fold(f, a) value-initializes the accumulator,
      // i.e. starts from 0, which makes every product vanish. A multiplicative
      // reduction must start from 1.
      return fold(std::multiplies<>{}, a, get_value_t<A>{1});
    } else {
      using value_t = get_value_t<A>;
      using simd_t  = native_simd<value_t>;
      // SIMD accumulator (all lanes start at 1) for full chunks; scalar accumulator for the tail.
      simd_t product_simd(value_t{1});
      value_t product_scalar{1};
      auto f_simd   = [&a, &product_simd](auto &&...args) { product_simd *= a.load(dispatch_t{}, args...); };
      auto f_scalar = [&a, &product_scalar](auto &&...args) { product_scalar *= a(args...); };
      nda::for_each_static<0, get_layout_info<A>.stride_order, simd_t::size>(a.shape(), std::move(f_simd), std::move(f_scalar));
      // Horizontal SIMD product times the scalar tail.
      return product_scalar * xsimd::reduce_mul(product_simd);
    }
  } else {
    // Array<Value>: element-wise product of sub-arrays, starting from a ones array of matching shape.
    return fold(std::multiplies<>{}, a, Value::ones(get_first_element(a).shape()));
  }
}
Expand All @@ -215,7 +298,17 @@ namespace nda {
template <Array A, Array B>
  requires(nda::get_rank<A> == nda::get_rank<B>)
[[nodiscard]] constexpr auto hadamard(A &&a, B &&b) {
  if constexpr (is_simd_enabled_v<A> and is_simd_enabled_v<B> and std::is_same_v<get_value_t<A>, get_value_t<B>>) {
    using value_t = get_value_t<A>;
    using simd_t  = native_simd<value_t>;
    // Functor providing both a scalar call operator and a SIMD 'load' overload,
    // so nda::map can dispatch element-wise or chunk-wise.
    struct mul {
      value_t operator()(const value_t &x, const value_t &y) const { return x * y; }
      simd_t load(const simd_t &x, const simd_t &y) const { return x * y; }
    };
    return nda::map(mul{})(std::forward<A>(a), std::forward<B>(b));
  } else {
    // Fallback: plain element-wise product via a generic lambda.
    return nda::map([](auto const &x, auto const &y) { return x * y; })(std::forward<A>(a), std::forward<B>(b));
  }
}

/**
Expand Down
Loading
Loading