Commit 8137ba6

Author: Thomas Grützmacher
Merge adaptation to the size_type change in the Accessor
The Accessor inside Ginkgo was recently changed to use std::int64_t instead of std::size_t as its size_type. This PR adopts that change in this repository as well.
2 parents 1db0470 + f3b2631 commit 8137ba6

10 files changed: +73 -68 lines
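
Throughout the diff below the pattern is the same: sizes, strides, and loop indices that used to be std::size_t are routed through the signed alias matrix_info::size_type (or gko::acc::size_type where an array feeds the Accessor). A minimal sketch of the idea, assuming a simplified matrix_info — the real struct in this repository exposes the same members used in the diff, but is not reproduced here:

#include <array>
#include <cstdint>

// Sketch only: a reduced matrix_info mirroring the fields used in the diff.
struct matrix_info {
    using size_type = std::int64_t;  // signed, matching the Accessor's new size_type
    std::array<size_type, 2> size;   // {rows, cols}
    size_type stride;
};

int main()
{
    using size_type = matrix_info::size_type;
    const matrix_info m_info{{100, 100}, 100};
    // Signed indices keep differences such as (max_size - start) well behaved
    // even when negative; with std::size_t they would wrap around instead.
    for (size_type i = 0; i < m_info.size[0]; ++i) { }
}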

cuda/dot_benchmark.cu
Lines changed: 25 additions & 24 deletions

@@ -21,13 +21,14 @@ int main(int argc, char **argv)
 {
     using ar_type = double;
     using st_type = float;
+    using size_type = matrix_info::size_type;
 
-    constexpr std::size_t min_size{1'000'000};
-    constexpr std::size_t default_max_size{535 * 1000 * 1000};
+    constexpr size_type min_size{1'000'000};
+    constexpr size_type default_max_size{535 * 1000 * 1000};
     constexpr char DELIM{';'};
 
     bool detailed_error{false};
-    std::size_t max_size{default_max_size};
+    size_type max_size{default_max_size};
 
     const std::string use_error_string("--error");
     const std::string set_size_string("--size");
@@ -84,7 +85,7 @@ int main(int argc, char **argv)
         return static_cast<ar_type>(st_data.get_result());
     };
 
-    constexpr std::size_t benchmark_reference{0};
+    constexpr size_type benchmark_reference{0};
     using benchmark_info_t =
         std::tuple<std::string, std::function<void(matrix_info, matrix_info)>,
                    std::function<ar_type()>>;
@@ -139,7 +140,7 @@ int main(int argc, char **argv)
                        st_data.gpu_y(), st_data.gpu_res());
         },
         st_get_result}};
-    const std::size_t benchmark_num{benchmark_info.size()};
+    const size_type benchmark_num{static_cast<size_type>(benchmark_info.size())};
 
 
     std::cout << "Vector Size";
@@ -163,32 +164,32 @@ int main(int argc, char **argv)
     };
 
     // Number of elements of a vector at the start of the benchmark
-    const std::size_t start = std::min(max_size, min_size);
+    const size_type start = std::min(max_size, min_size);
     // Increase in number of elements between consecutive benchmark runs
-    constexpr std::size_t row_incr = 2'000'000;
+    constexpr size_type row_incr = 2'000'000;
     // Number of benchmark runs (ignoring randomization)
-    const std::size_t steps =
+    const size_type steps =
         (max_size < start) ? 0 : (max_size - start) / row_incr;
     // Number of benchmark restarts with a different randomization for vectors
     // Only used for a detailed error run
-    constexpr std::size_t max_randomize_num{10};
+    constexpr size_type max_randomize_num{10};
 
-    std::vector<std::size_t> benchmark_vec_size((steps + 1));
+    std::vector<size_type> benchmark_vec_size((steps + 1));
     std::vector<double> benchmark_time((steps + 1) * benchmark_num);
     // std::vector<ar_type> benchmark_error((steps + 1) * benchmark_num);
    // stores the result for all different benchmark runs to compute the error
     const auto actual_randomize_num = detailed_error ? max_randomize_num : 1;
     std::vector<ar_type> raw_result(actual_randomize_num * (steps + 1) *
                                     benchmark_num);
     const auto get_raw_idx = [benchmark_num, actual_randomize_num](
-                                 std::size_t rnd, std::size_t step,
-                                 std::size_t bi) {
+                                 size_type rnd, size_type step,
+                                 size_type bi) {
         return step * actual_randomize_num * benchmark_num +
                bi * actual_randomize_num + rnd;
     };
 
     // Run all benchmarks and collect the raw data here
-    for (std::size_t randomize = 0; randomize < actual_randomize_num;
+    for (size_type randomize = 0; randomize < actual_randomize_num;
          ++randomize) {
         if (randomize != 0) {
             write_random({{max_size, 1}}, vector_dist, rengine,
@@ -198,14 +199,14 @@ int main(int argc, char **argv)
             ar_data.copy_cpu_to_gpu();
             st_data.convert_from(ar_data);
         }
-        for (std::size_t vec_size = start, i = 0; vec_size <= max_size;
+        for (size_type vec_size = start, i = 0; vec_size <= max_size;
              vec_size += row_incr, ++i) {
             benchmark_vec_size.at(i) = vec_size;
             const matrix_info x_info{{vec_size, 1}};
             const matrix_info y_info{{vec_size, 1}};
 
-            for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
-                const std::size_t idx = i * benchmark_num + bi;
+            for (size_type bi = 0; bi < benchmark_num; ++bi) {
+                const size_type idx = i * benchmark_num + bi;
                 auto curr_lambda = [&]() {
                     std::get<1>(benchmark_info[bi])(x_info, y_info);
                 };
@@ -218,26 +219,26 @@ int main(int argc, char **argv)
     }
 
     // Print the evaluated results
-    for (std::size_t i = 0; i <= steps; ++i) {
+    for (size_type i = 0; i <= steps; ++i) {
         if (!detailed_error) {
             std::cout << benchmark_vec_size[i];
-            for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+            for (size_type bi = 0; bi < benchmark_num; ++bi) {
                 std::cout << DELIM << benchmark_time[i * benchmark_num + bi];
             }
             const auto result_ref =
                 raw_result[get_raw_idx(0, i, benchmark_reference)];
-            for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+            for (size_type bi = 0; bi < benchmark_num; ++bi) {
                 std::cout << DELIM
                           << get_error(raw_result[i * benchmark_num + bi],
                                        result_ref);
             }
             std::cout << '\n';
         } else {
             std::cout << benchmark_vec_size[i];
-            for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+            for (size_type bi = 0; bi < benchmark_num; ++bi) {
                 // sort and compute the median
                 std::array<ar_type, max_randomize_num> local_error;
-                for (std::size_t rnd = 0; rnd < actual_randomize_num; ++rnd) {
+                for (size_type rnd = 0; rnd < actual_randomize_num; ++rnd) {
                     const auto result_ref =
                         raw_result[get_raw_idx(rnd, i, benchmark_reference)];
                     local_error[rnd] = get_error(
@@ -270,11 +271,11 @@ int main(int argc, char **argv)
         std::cout << DELIM << "Result " << std::get<0>(info);
     }
     std::cout << '\n';
-    for (std::size_t i = 0; i <= steps; ++i) {
-        for (std::size_t randomize = 0; randomize < actual_randomize_num;
+    for (size_type i = 0; i <= steps; ++i) {
+        for (size_type randomize = 0; randomize < actual_randomize_num;
              ++randomize) {
             std::cout << randomize << DELIM << benchmark_vec_size[i];
-            for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+            for (size_type bi = 0; bi < benchmark_num; ++bi) {
                 std::cout << DELIM << raw_result[get_raw_idx(randomize, i, bi)];
             }
             std::cout << '\n';
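
A side effect of the signed size_type shows up in the benchmark_num line above: std::vector::size() returns std::size_t, and brace-initializing a signed type from it is a narrowing conversion that compilers must reject, hence the new static_cast. A self-contained illustration (names are placeholders, not code from this repository):

#include <cstdint>
#include <vector>

int main()
{
    using size_type = std::int64_t;
    const std::vector<int> v(3);
    // const size_type n{v.size()};  // error: narrowing std::size_t -> std::int64_t
    const size_type n{static_cast<size_type>(v.size())};  // OK: explicit conversion
    return static_cast<int>(n);
}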

cuda/dot_kernels.cuh
Lines changed: 2 additions & 2 deletions

@@ -232,8 +232,8 @@ void acc_dot(myBlasHandle *handle, const matrix_info x_info, const StType *x,
 
     // Accessor Setup
     constexpr std::size_t dimensionality{2};
-    std::array<std::size_t, dimensionality - 1> x_stride{x_info.stride};
-    std::array<std::size_t, dimensionality - 1> y_stride{y_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> x_stride{x_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> y_stride{y_info.stride};
 
     using accessor =
         gko::acc::reduced_row_major<dimensionality, ArType, StType>;

cuda/dot_memory.cuh
Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ public:
      * @param engine random engine used to generate the values
      */
     template <typename VectDist, typename RndEngine>
-    DotMemory(std::size_t size, VectDist &&vect_dist, RndEngine &&engine)
+    DotMemory(matrix_info::size_type size, VectDist &&vect_dist, RndEngine &&engine)
         : x_info_{{size, 1}},
           y_info_{{size, 1}},
           cpu_x_(gen_mtx<ValueType>(x_info_, vect_dist, engine)),

cuda/gemv_benchmark.cu
Lines changed: 8 additions & 7 deletions

@@ -20,15 +20,16 @@ int main(int argc, char **argv)
 {
     using ar_type = double;
     using st_type = float;
+    using size_type = matrix_info::size_type;
 
     constexpr ar_type ar_alpha{1.0};
     constexpr ar_type ar_beta{1.0};
     constexpr st_type st_alpha{static_cast<st_type>(ar_alpha)};
     constexpr st_type st_beta{static_cast<st_type>(ar_beta)};
 
-    constexpr std::size_t default_max_size{24500};
-    constexpr std::size_t min_size{100};
-    std::size_t max_size{default_max_size};
+    constexpr size_type default_max_size{24500};
+    constexpr size_type min_size{100};
+    size_type max_size{default_max_size};
 
     bool measure_error{false};
 
@@ -122,7 +123,7 @@ int main(int argc, char **argv)
         return error / res_ref_norm;
     };
 
-    constexpr std::size_t benchmark_reference{0};
+    constexpr size_type benchmark_reference{0};
     using benchmark_info_t =
         std::tuple<std::string,
                    std::function<void(matrix_info, matrix_info, matrix_info)>,
@@ -190,7 +191,7 @@ int main(int argc, char **argv)
         },
         st_compute_error},
     };
-    const std::size_t benchmark_num{benchmark_info.size()};
+    const size_type benchmark_num{static_cast<size_type>(benchmark_info.size())};
 
     std::cout << "Num rows";
     for (const auto &info : benchmark_info) {
@@ -209,7 +210,7 @@ int main(int argc, char **argv)
     std::vector<ar_type> local_res(benchmark_num);
     constexpr auto start = min_size;
     constexpr auto row_incr = start;
-    for (std::size_t num_rows = start; num_rows <= max_size;
+    for (size_type num_rows = start; num_rows <= max_size;
         num_rows += row_incr) {
         const matrix_info m_info{{num_rows, num_rows}, max_size};
         const matrix_info x_info{{num_rows, 1}};
@@ -229,7 +230,7 @@ int main(int argc, char **argv)
                 ar_data.gpu_res_memory().copy_from(ar_cpu_res_init);
             }
         }
-        for (std::size_t i = 0; i < benchmark_num; ++i) {
+        for (size_type i = 0; i < benchmark_num; ++i) {
             auto local_func = [&]() {
                 std::get<1>(benchmark_info[i])(m_info, x_info, res_info);
             };

cuda/gemv_kernels.cuh
Lines changed: 3 additions & 3 deletions

@@ -176,9 +176,9 @@ void acc_gemv(const matrix_info m_info, ArType alpha, const StType *mtx,
 
     // Accessor Setup
     constexpr std::size_t dimensionality{2};
-    std::array<std::size_t, dimensionality - 1> m_stride{m_info.stride};
-    std::array<std::size_t, dimensionality - 1> x_stride{x_info.stride};
-    std::array<std::size_t, dimensionality - 1> res_stride{res_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> m_stride{m_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> x_stride{x_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> res_stride{res_info.stride};
 
     using accessor =
         gko::acc::reduced_row_major<dimensionality, ArType, StType>;

cuda/gemv_memory.cuh
Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ public:
      * @param engine random engine used to generate the values
      */
     template <typename MtxDist, typename VectDist, typename RndEngine>
-    GemvMemory(std::size_t max_size, MtxDist &&mtx_dist,
+    GemvMemory(matrix_info::size_type max_size, MtxDist &&mtx_dist,
                VectDist &&vect_dist, RndEngine &&engine)
         : m_info_{{max_size, max_size}},
           x_info_{{max_size, 1}},

cuda/trsv_benchmark.cu
Lines changed: 6 additions & 5 deletions

@@ -21,14 +21,15 @@ int main(int argc, char **argv)
 {
     using ar_type = double;
     using st_type = float;
+    using size_type = matrix_info::size_type;
 
     constexpr tmtx_t t_matrix_type = tmtx_t::upper;
     constexpr dmtx_t d_matrix_type = dmtx_t::unit;
 
-    constexpr std::size_t default_max_size{24 * 1000};
-    constexpr std::size_t min_size{100};
+    constexpr size_type default_max_size{24 * 1000};
+    constexpr size_type min_size{100};
 
-    std::size_t max_size{default_max_size};
+    auto max_size{default_max_size};
     bool measure_error{false};
 
     const std::string use_error_string("--error");
@@ -196,8 +197,8 @@ int main(int argc, char **argv)
 
     std::vector<ar_type> local_res(benchmark_num);
 
-    const std::size_t start = std::min(max_size, min_size);
-    const std::size_t row_incr = start;
+    const auto start = std::min(max_size, min_size);
+    const auto row_incr = start;
 
     for (auto num_rows = start; num_rows <= max_size; num_rows += row_incr) {
         const matrix_info m_info{{num_rows, num_rows}, max_size};

cuda/trsv_kernels.cuh
Lines changed: 8 additions & 8 deletions

@@ -86,7 +86,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void lower_trsv(
 
     // stores the trianglular system in column major
     __shared__ ValueType triang[swarp_size * triang_stride];
-    __shared__ std::uint32_t shared_row_block_idx;
+    __shared__ std::int32_t shared_row_block_idx;
     __shared__ ValueType x_correction[swarp_size];
 
     const auto group = cg::this_thread_block();
@@ -280,7 +280,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void upper_trsv(
 
     // stores the trianglular system in column major
     __shared__ ValueType triang[swarp_size * triang_stride];
-    __shared__ std::uint32_t shared_row_block_idx;
+    __shared__ std::int32_t shared_row_block_idx;
     __shared__ ValueType x_correction[swarp_size];
 
     const auto group = cg::this_thread_block();
@@ -461,7 +461,7 @@ void trsv(const matrix_info m_info, tmtx_t ttype, dmtx_t dtype,
     constexpr std::int32_t swarps_per_block{4};
     const dim3 block_solve(subwarp_size, swarps_per_block, 1);
     const dim3 grid_solve(
-        ceildiv(m_info.size[0], static_cast<std::size_t>(subwarp_size)), 1, 1);
+        ceildiv(m_info.size[0], static_cast<std::int64_t>(subwarp_size)), 1, 1);
 
     kernel::trsv_init<<<1, 1>>>(trsv_helper);
     if (dtype == dmtx_t::unit) {
@@ -544,7 +544,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void acc_lower_trsv(
 
     // stores the trianglular system in column major
     __shared__ ar_type triang[swarp_size * triang_stride];
-    __shared__ std::uint32_t shared_row_block_idx;
+    __shared__ std::int32_t shared_row_block_idx;
     __shared__ ar_type x_correction[swarp_size];
 
     const auto group = cg::this_thread_block();
@@ -742,7 +742,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void acc_upper_trsv(
 
     // stores the trianglular system in column major
     __shared__ ar_type triang[swarp_size * triang_stride];
-    __shared__ std::uint32_t shared_row_block_idx;
+    __shared__ std::int32_t shared_row_block_idx;
     __shared__ ar_type x_correction[swarp_size];
 
     const auto group = cg::this_thread_block();
@@ -922,8 +922,8 @@ void acc_trsv(const matrix_info m_info, tmtx_t ttype, dmtx_t dtype,
 {
     // Accessor Setup
     constexpr std::size_t dimensionality{2};
-    std::array<std::size_t, dimensionality - 1> m_stride{m_info.stride};
-    std::array<std::size_t, dimensionality - 1> x_stride{x_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> m_stride{m_info.stride};
+    std::array<gko::acc::size_type, dimensionality - 1> x_stride{x_info.stride};
 
     using accessor =
         gko::acc::reduced_row_major<dimensionality, ArType, StType>;
@@ -936,7 +936,7 @@ void acc_trsv(const matrix_info m_info, tmtx_t ttype, dmtx_t dtype,
     constexpr std::int32_t swarps_per_block{4};
     const dim3 block_solve(subwarp_size, swarps_per_block, 1);
     const dim3 grid_solve(
-        ceildiv(m_info.size[0], static_cast<std::size_t>(subwarp_size)), 1, 1);
+        ceildiv(m_info.size[0], static_cast<std::int64_t>(subwarp_size)), 1, 1);
 
     kernel::trsv_init<<<1, 1>>>(trsv_helper);
     if (dtype == dmtx_t::unit) {
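
The ceildiv change above follows directly from m_info.size[0] becoming signed: if the helper deduces one common type for both operands (an assumption about this repository's ceildiv, which is not shown in the diff), the second argument must now be cast to std::int64_t instead of std::size_t. A hedged sketch of that failure mode:

#include <cstdint>

// Assumed shape of the helper: a single deduced type for both operands,
// so mixing std::int64_t and std::int32_t arguments would fail to deduce T.
template <typename T>
constexpr T ceildiv(T num, T den)
{
    return (num + den - 1) / den;
}

int main()
{
    const std::int64_t num_rows{24'000};
    constexpr std::int32_t subwarp_size{32};
    // ceildiv(num_rows, subwarp_size);  // deduction conflict: int64_t vs int32_t
    const auto num_blocks = ceildiv(num_rows, static_cast<std::int64_t>(subwarp_size));
    return num_blocks == 750 ? 0 : 1;
}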

cuda/trsv_memory.cuh
Lines changed: 2 additions & 2 deletions

@@ -108,7 +108,7 @@ public:
      * initializing)
      */
     template <typename MtxGen, typename VectGen>
-    TrsvMemory(std::size_t max_size, MtxGen &&cpu_mtx_gen,
+    TrsvMemory(matrix_info::size_type max_size, MtxGen &&cpu_mtx_gen,
               VectGen &&cpu_vect_gen)
         : m_info_{{max_size, max_size}},
           x_info_{{max_size, 1}},
@@ -139,7 +139,7 @@ public:
         const auto pivot_size = std::max(m_info_.size[0], m_info_.size[1]);
         Memory<int> cpu_pivot(Memory<int>::Device::cpu, pivot_size);
         Memory<int> gpu_pivot(Memory<int>::Device::gpu, pivot_size);
-        for (std::size_t i = 0; i < pivot_size; ++i) {
+        for (matrix_info::size_type i = 0; i < pivot_size; ++i) {
            cpu_pivot.data()[i] = i;
         }
         gpu_pivot = cpu_pivot;
