diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..9efdb98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,18 @@ cmake_minimum_required(VERSION 3.12) -project(hellocmake LANGUAGES CXX) +project(benchtest LANGUAGES CXX) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_BUILD_TYPE Release) if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() + add_executable(main main.cpp) +find_package(OpenMP REQUIRED) +target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) +target_compile_options(main PUBLIC -ffast-math -march=native) + +add_executable(baseline baseline.cpp) +target_link_libraries(baseline PUBLIC OpenMP::OpenMP_CXX) +target_compile_options(baseline PUBLIC -ffast-math -march=native) diff --git a/baseline.cpp b/baseline.cpp new file mode 100644 index 0000000..3298423 --- /dev/null +++ b/baseline.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include + +float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} + +struct Star { + float px, py, pz; + float vx, vy, vz; + float mass; +}; + +std::vector stars; + +void init() { + for (int i = 0; i < 48; i++) { + stars.push_back({ + frand(), frand(), frand(), + frand(), frand(), frand(), + frand() + 1, + }); + } +} + +float G = 0.001; +float eps = 0.001; +float dt = 0.01; + +void step() { + for (auto &star: stars) { + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + d2 *= sqrt(d2); + star.vx += dx * other.mass * G * dt / d2; + star.vy += dy * other.mass * G * dt / d2; + star.vz += dz * other.mass * G * dt / d2; + } + } + for (auto &star: stars) { + star.px += star.vx * dt; + star.py += star.vy * dt; + star.pz += star.vz * dt; + } +} + +float calc() { + float energy = 0; + for (auto &star: stars) { + float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; + energy += star.mass * v2 / 2; + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + energy -= other.mass * star.mass * G / sqrt(d2) / 2; + } + } + return energy; +} + +template +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (int i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + return 0; +} \ No newline at end of file diff --git a/main.cpp b/main.cpp index cf6369b..3a7401b 100644 --- a/main.cpp +++ b/main.cpp @@ -3,64 +3,138 @@ #include #include #include +#include +constexpr float rand_tmp = (float)1/RAND_MAX*2; float frand() { - return (float)rand() / RAND_MAX * 2 - 1; + return (float)rand()*rand_tmp - 1; } -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; +struct alignas(32) Star { + float px[48], py[48], pz[48]; + float vx[48], vy[48], vz[48]; + float mass[48]; }; -std::vector stars; +Star stars; void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); + float rand_array[48*7]; + for(size_t i=0;i<48*7;i++) + rand_array[i]=frand(); + for(size_t i=0;i<48;i++) + { + stars.px[i]=rand_array[i]; + } + for(size_t i=0;i<48;i++) + { + stars.py[i]=rand_array[48+i]; + } + for(size_t i=0;i<48;i++) + { + stars.pz[i]=rand_array[48*2+i]; + } + for(size_t i=0;i<48;i++) + { + stars.vx[i]=rand_array[48*3+i]; + } + for(size_t i=0;i<48;i++) + { + stars.vy[i]=rand_array[48*4+i]; + } + for(size_t i=0;i<48;i++) + { + stars.vz[i]=rand_array[48*5+i]; + } + for(size_t i=0;i<48;i++) + { + stars.mass[i]=rand_array[48*6+i]+1; } } -float G = 0.001; -float eps = 0.001; -float dt = 0.01; +constexpr const float G = 0.001; +constexpr const float eps = 0.001; +constexpr const float dt = 0.01; void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; + float Gdt=G * dt; + #pragma omp simd + for (size_t i=0;i<48;i++) { + float dx[48],dy[48],dz[48],d2[48],tmp[48]; + #pragma omp simd + for(size_t j=0;j<48;j++) + { + dx[j] = stars.px[j] - stars.px[i]; + dy[j] = stars.py[j] - stars.py[i]; + dz[j] = stars.pz[j] - stars.pz[i]; + } + #pragma omp simd + for(size_t j=0;j<48;j++) + { + d2[j]=eps * eps; + d2[j] += dx[j] * dx[j]; + d2[j] += dy[j] * dy[j]; + d2[j] += dz[j] * dz[j]; + } + #pragma omp simd + for(size_t j=0;j<48;j++) + { + d2[j] *= std::sqrt(d2[j]); + tmp[j] = 1/d2[j]; + } + + #pragma omp simd + for (size_t j=0;j<48;j++) { + stars.vx[i] += dx[j] * stars.mass[j] * Gdt * tmp[j]; + stars.vy[i] += dy[j] * stars.mass[j] * Gdt * tmp[j]; + stars.vz[i] += dz[j] * stars.mass[j] * Gdt * tmp[j]; } } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; + #pragma omp simd + for (size_t i=0;i<48;i++) { + stars.px[i] += stars.vx[i] * dt; + stars.py[i] += stars.vy[i] * dt; + stars.pz[i] += stars.vz[i] * dt; } } float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; + float v2[48],dx[48],dy[48],dz[48],d2[48],tmp[48]; + #pragma omp simd + for(size_t i=0;i<48;i++) + { + v2[i] = stars.vx[i] * stars.vx[i]; + v2[i] += stars.vy[i] * stars.vy[i]; + v2[i] += stars.vz[i] * stars.vz[i]; + } + float tmp_energy=0; + for(size_t i=0;i<48;i++) + { + tmp_energy += stars.mass[i] * v2[i]; + } + energy+=tmp_energy*0.5; + #pragma omp simd + for (size_t i=0;i<48;i++) { + #pragma omp simd + for (size_t j=0;j<48;j++) { + dx[j] = stars.px[j] - stars.px[i]; + dy[j] = stars.py[j] - stars.py[i]; + dz[j] = stars.pz[j] - stars.pz[i]; + } + #pragma omp simd + for(size_t j=0;j<48;j++) + { + d2[j] = eps * eps; + d2[j] += dx[j] * dx[j]; + d2[j] += dy[j] * dy[j]; + d2[j] += dz[j] * dz[j]; + } + #pragma omp simd + for(size_t j=0;j<48;j++) + { + tmp[j] = 1/std::sqrt(d2[j])*0.5; + energy -= stars.mass[j] * stars.mass[i] * G * tmp[j]; } } return energy; @@ -79,10 +153,21 @@ int main() { init(); printf("Initial energy: %f\n", calc()); auto dt = benchmark([&] { - for (int i = 0; i < 100000; i++) + for (size_t i = 0; i < 100000; i++) step(); }); printf("Final energy: %f\n", calc()); printf("Time elapsed: %ld ms\n", dt); return 0; } +/* +baseline +Initial energy: -8.571528 +Final energy: -8.511633 +Time elapsed: 1546 ms + +simd optimized +Initial energy: -9.936085 +Final energy: -9.926659 +Time elapsed: 198 ms +*/