1 | 1 | /**
2 | 2 |  * Copyright 2017-2023 by XGBoost Contributors
| 3 | + * Copyright 2024 FUJITSU LIMITED
3 | 4 |  * \file hist_util.cc
4 | 5 |  */
5 | 6 | #include "hist_util.h"

15 | 16 | #include "xgboost/context.h"  // for Context
16 | 17 | #include "xgboost/data.h"  // for SparsePage, SortedCSCPage
17 | 18 |
| 19 | +#if defined(SVE_SUPPORT_DETECTED)
| 20 | +#include <arm_sve.h>  // to leverage SVE intrinsics
| 21 | +#endif
| 22 | +
18 | 23 | #if defined(XGBOOST_MM_PREFETCH_PRESENT)
19 | 24 | #include <xmmintrin.h>
20 | 25 | #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
@@ -252,13 +257,55 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair, Span<bst_idx_t cons
252 | 257 |
253 | 258 |     // The trick with pgh_t buffer helps the compiler to generate faster binary.
254 | 259 |     const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]};
255 |     | -     for (size_t j = 0; j < row_size; ++j) {
256 |     | -       const uint32_t idx_bin =
257 |     | -           two * (static_cast<uint32_t>(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j]));
258 |     | -       auto hist_local = hist_data + idx_bin;
259 |     | -       *(hist_local) += pgh_t[0];
260 |     | -       *(hist_local + 1) += pgh_t[1];
261 |     | -     }
| 260 | +
| 261 | +     #if defined(SVE_SUPPORT_DETECTED)
| 262 | +     svfloat64_t pgh_t0_vec = svdup_n_f64(pgh_t[0]);
| 263 | +     svfloat64_t pgh_t1_vec = svdup_n_f64(pgh_t[1]);
| 264 | +
| 265 | +     for (size_t j = 0; j < row_size; j += svcntw()) {  // svcntw() = 32-bit lanes per vector
| 266 | +       svbool_t pg32 = svwhilelt_b32(j, row_size);  // active 32-bit lanes this iteration
| 267 | +       svbool_t pg64_lo = svwhilelt_b64(j, row_size);              // low half after unpacking
| 268 | +       svbool_t pg64_hi = svwhilelt_b64(j + svcntd(), row_size);   // high half after unpacking
| 269 | +       svuint32_t gr_index_vec = svld1ub_u32(pg32, reinterpret_cast<const uint8_t *>(&gr_index_local[j]));
| 270 | +       svuint32_t offsets_vec = svld1(pg32, &offsets[j]);
| 271 | +       // Per-lane bin index: two * (gr_index + offset), or two * gr_index when kAnyMissing.
| 272 | +       svuint32_t idx_bin_vec;
| 273 | +       if (kAnyMissing) {
| 274 | +         idx_bin_vec = svmul_n_u32_x(pg32, gr_index_vec, two);
| 275 | +       } else {
| 276 | +         svuint32_t temp = svadd_u32_m(pg32, gr_index_vec, offsets_vec);
| 277 | +         idx_bin_vec = svmul_n_u32_x(pg32, temp, two);
| 278 | +       }
| 279 | +       // Widen the 32-bit indices into 64-bit low/high halves for the double-precision gathers.
| 280 | +       svuint64_t idx_bin_vec0_0 = svunpklo_u64(idx_bin_vec);
| 281 | +       svuint64_t idx_bin_vec0_1 = svunpkhi_u64(idx_bin_vec);
| 282 | +       svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64_lo, idx_bin_vec0_0, 1);
| 283 | +       svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64_hi, idx_bin_vec0_1, 1);
| 284 | +       // Gather the current gradient/hessian sums for each bin.
| 285 | +       svfloat64_t hist0_vec0 = svld1_gather_index(pg64_lo, hist_data, idx_bin_vec0_0);
| 286 | +       svfloat64_t hist0_vec1 = svld1_gather_index(pg64_hi, hist_data, idx_bin_vec0_1);
| 287 | +       svfloat64_t hist1_vec0 = svld1_gather_index(pg64_lo, hist_data, idx_bin_vec1_0);
| 288 | +       svfloat64_t hist1_vec1 = svld1_gather_index(pg64_hi, hist_data, idx_bin_vec1_1);
| 289 | +
| 290 | +       hist0_vec0 = svadd_f64_m(pg64_lo, hist0_vec0, pgh_t0_vec);
| 291 | +       hist0_vec1 = svadd_f64_m(pg64_hi, hist0_vec1, pgh_t0_vec);
| 292 | +       hist1_vec0 = svadd_f64_m(pg64_lo, hist1_vec0, pgh_t1_vec);
| 293 | +       hist1_vec1 = svadd_f64_m(pg64_hi, hist1_vec1, pgh_t1_vec);
| 294 | +       // Scatter the updated sums back into the histogram.
| 295 | +       svst1_scatter_index(pg64_lo, hist_data, idx_bin_vec0_0, hist0_vec0);
| 296 | +       svst1_scatter_index(pg64_hi, hist_data, idx_bin_vec0_1, hist0_vec1);
| 297 | +       svst1_scatter_index(pg64_lo, hist_data, idx_bin_vec1_0, hist1_vec0);
| 298 | +       svst1_scatter_index(pg64_hi, hist_data, idx_bin_vec1_1, hist1_vec1);
| 299 | +     }
| 300 | +     #else
| 301 | +     for (size_t j = 0; j < row_size; ++j) {
| 302 | +       const uint32_t idx_bin =
| 303 | +           two * (static_cast<uint32_t>(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j]));
| 304 | +       auto hist_local = hist_data + idx_bin;
| 305 | +       *(hist_local) += pgh_t[0];
| 306 | +       *(hist_local + 1) += pgh_t[1];
| 307 | +     }
| 308 | +     #endif
262 | 309 |   }
263 | 310 | }
264 | 311 |
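
Note on the vectorized branch above: each iteration handles svcntw() entries of a row, builds the doubled bin indices in 32-bit lanes, widens them into two 64-bit halves, then gathers the current gradient/hessian sums, adds the broadcast pgh_t values, and scatters the results back, with svwhilelt predicates masking the ragged tail of the row. The standalone sketch below illustrates that gather/accumulate/scatter pattern in isolation; it is not part of the patch, the names (toy_hist, bins, kN) are made up for the example, and it assumes an SVE-enabled toolchain (e.g. g++ -O2 -march=armv8-a+sve). The scatter of accumulated values is only safe because the indices within one vector are distinct, which holds in the kernel since each non-missing entry of a row falls in a different feature's bin range.

// sve_hist_sketch.cc -- illustrative only, not part of hist_util.cc.
#include <arm_sve.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  double toy_hist[16] = {0.0};               // interleaved (grad, hess) accumulators, 8 bins
  const uint32_t bins[] = {0, 2, 3, 5, 7};   // one distinct bin index per entry
  const std::size_t kN = 5;                  // number of entries in the "row"
  const double grad = 1.5, hess = 0.5;

  svfloat64_t grad_vec = svdup_n_f64(grad);  // broadcast, like pgh_t0_vec / pgh_t1_vec
  svfloat64_t hess_vec = svdup_n_f64(hess);

  for (std::size_t j = 0; j < kN; j += svcntd()) {   // svcntd(): 64-bit lanes per vector
    svbool_t pg = svwhilelt_b64(j, kN);              // mask off lanes past the end of the row
    // Load 32-bit indices zero-extended to 64 bits and scale by 2 (grad/hess pairs).
    svuint64_t idx_g = svld1uw_u64(pg, &bins[j]);
    idx_g = svmul_n_u64_x(pg, idx_g, 2);
    svuint64_t idx_h = svadd_n_u64_x(pg, idx_g, 1);
    // Gather the current sums, accumulate, and scatter them back.
    svfloat64_t g = svld1_gather_index(pg, toy_hist, idx_g);
    svfloat64_t h = svld1_gather_index(pg, toy_hist, idx_h);
    g = svadd_f64_x(pg, g, grad_vec);
    h = svadd_f64_x(pg, h, hess_vec);
    svst1_scatter_index(pg, toy_hist, idx_g, g);
    svst1_scatter_index(pg, toy_hist, idx_h, h);
  }

  for (int b = 0; b < 8; ++b) {
    std::printf("bin %d: grad=%.1f hess=%.1f\n", b, toy_hist[2 * b], toy_hist[2 * b + 1]);
  }
  return 0;
}

Gather/scatter is used because each entry's bin index is data dependent, so the accumulators cannot be updated with contiguous vector loads; with the predicated loop, bins 0, 2, 3, 5 and 7 each receive exactly one (1.5, 0.5) update and the remaining bins stay at zero.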