From cc441c3bcd3e0c282a87dff4c515a36ae98cbe9b Mon Sep 17 00:00:00 2001 From: Hebangwen Date: Sat, 27 Aug 2022 16:11:34 +0800 Subject: [PATCH 1/5] feat: add `-ffmath` and `-march=native` --- CMakeLists.txt | 1 + main.cpp | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..6c85c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,3 +7,4 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) +target_compile_options(main PUBLIC -ffast-math -march=native) diff --git a/main.cpp b/main.cpp index cf6369b..144c498 100644 --- a/main.cpp +++ b/main.cpp @@ -31,16 +31,18 @@ float eps = 0.001; float dt = 0.01; void step() { + float Gdt = G * dt; + float eps_square = eps * eps; for (auto &star: stars) { for (auto &other: stars) { float dx = other.px - star.px; float dy = other.py - star.py; float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; + float d2 = dx * dx + dy * dy + dz * dz + eps_square; + d2 *= std::sqrt(d2); + star.vx += dx * other.mass * Gdt / d2; + star.vy += dy * other.mass * Gdt / d2; + star.vz += dz * other.mass * Gdt / d2; } } for (auto &star: stars) { From 86eaa76ded1c08c17005c70d2681c05000ad6aed Mon Sep 17 00:00:00 2001 From: Hebangwen Date: Sat, 27 Aug 2022 16:57:28 +0800 Subject: [PATCH 2/5] feat: add omp and soa --- CMakeLists.txt | 2 ++ main.cpp | 81 +++++++++++++++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c85c06..4b861c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,5 +6,7 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +find_package(OpenMP REQUIRED) add_executable(main main.cpp) +target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) target_compile_options(main PUBLIC -ffast-math -march=native) diff --git a/main.cpp b/main.cpp index 144c498..2e3a844 100644 --- a/main.cpp +++ b/main.cpp @@ -8,21 +8,25 @@ float frand() { return (float)rand() / RAND_MAX * 2 - 1; } -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; +const int N = 48; + +struct StarField { + std::vector px, py, pz; + std::vector vx, vy, vz; + std::vector mass; }; -std::vector stars; +StarField stars; void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); + for (int i = 0; i < N; i++) { + stars.px.push_back(frand()); + stars.py.push_back(frand()); + stars.pz.push_back(frand()); + stars.vx.push_back(frand()); + stars.vy.push_back(frand()); + stars.vz.push_back(frand()); + stars.mass.push_back(frand() + 1); } } @@ -33,36 +37,51 @@ float dt = 0.01; void step() { float Gdt = G * dt; float eps_square = eps * eps; - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; + + // 原代码是内层不变, 我的代码是内层变 + for (int i = 0; i < N; i++) { + float px = stars.px[i]; + float py = stars.py[i]; + float pz = stars.pz[i]; + float mass_Gdt = stars.mass[i] * Gdt; + + #pragma omp simd + for (int j = 0; j < N; j++) { + float dx = px - stars.px[j]; + float dy = py - stars.py[j]; + float dz = pz - stars.pz[j]; float d2 = dx * dx + dy * dy + dz * dz + eps_square; d2 *= std::sqrt(d2); - star.vx += dx * other.mass * Gdt / d2; - star.vy += dy * other.mass * Gdt / d2; - star.vz += dz * other.mass * Gdt / d2; + stars.vx[j] += dx * mass_Gdt / d2; + stars.vy[j] += dy * mass_Gdt / d2; + stars.vz[j] += dz * mass_Gdt / d2; } } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; + + #pragma omp simd + for (int i = 0; i < N; i++) { + stars.px[i] += stars.vx[i] * dt; + stars.py[i] += stars.vy[i] * dt; + stars.pz[i] += stars.vz[i] * dt; } } float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; + + for (int i = 0; i < N; i++) { + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += stars.mass[i] * v2 * 0.5; + float px = stars.px[i]; + float py = stars.py[i]; + float pz = stars.pz[i]; + float mass_G_half = stars.mass[i] * G * 0.5; + for (int j = 0; j < N; j++) { + float dx = stars.px[j] - px; + float dy = stars.py[j] - py; + float dz = stars.pz[j] - pz; float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; + energy -= stars.mass[j] * mass_G_half / std::sqrt(d2); } } return energy; From dc279c0eea01c1755873e8d230ed21e1829916b6 Mon Sep 17 00:00:00 2001 From: Hebangwen Date: Sat, 27 Aug 2022 17:39:19 +0800 Subject: [PATCH 3/5] fix: remove omp --- CMakeLists.txt | 2 -- main.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b861c5..6c85c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,5 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() -find_package(OpenMP REQUIRED) add_executable(main main.cpp) -target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) target_compile_options(main PUBLIC -ffast-math -march=native) diff --git a/main.cpp b/main.cpp index 2e3a844..b22777c 100644 --- a/main.cpp +++ b/main.cpp @@ -45,7 +45,6 @@ void step() { float pz = stars.pz[i]; float mass_Gdt = stars.mass[i] * Gdt; - #pragma omp simd for (int j = 0; j < N; j++) { float dx = px - stars.px[j]; float dy = py - stars.py[j]; @@ -58,7 +57,6 @@ void step() { } } - #pragma omp simd for (int i = 0; i < N; i++) { stars.px[i] += stars.vx[i] * dt; stars.py[i] += stars.vy[i] * dt; From e88442fade471cf675973514b9e0a8d03fe491b1 Mon Sep 17 00:00:00 2001 From: hebangwen Date: Sat, 27 Aug 2022 20:41:34 +0800 Subject: [PATCH 4/5] feat: use array and unroll --- main.cpp | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/main.cpp b/main.cpp index b22777c..0ddd29d 100644 --- a/main.cpp +++ b/main.cpp @@ -3,6 +3,7 @@ #include #include #include +#include float frand() { return (float)rand() / RAND_MAX * 2 - 1; @@ -11,22 +12,22 @@ float frand() { const int N = 48; struct StarField { - std::vector px, py, pz; - std::vector vx, vy, vz; - std::vector mass; + std::array px, py, pz; + std::array vx, vy, vz; + std::array mass; }; StarField stars; void init() { for (int i = 0; i < N; i++) { - stars.px.push_back(frand()); - stars.py.push_back(frand()); - stars.pz.push_back(frand()); - stars.vx.push_back(frand()); - stars.vy.push_back(frand()); - stars.vz.push_back(frand()); - stars.mass.push_back(frand() + 1); + stars.px.fill(frand()); + stars.py.fill(frand()); + stars.pz.fill(frand()); + stars.vx.fill(frand()); + stars.vy.fill(frand()); + stars.vz.fill(frand()); + stars.mass.fill(frand() + 1); } } @@ -39,13 +40,14 @@ void step() { float eps_square = eps * eps; // 原代码是内层不变, 我的代码是内层变 - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < N; i++) { float px = stars.px[i]; float py = stars.py[i]; float pz = stars.pz[i]; float mass_Gdt = stars.mass[i] * Gdt; - for (int j = 0; j < N; j++) { + #pragma GCC unroll 2 + for (size_t j = 0; j < N; j++) { float dx = px - stars.px[j]; float dy = py - stars.py[j]; float dz = pz - stars.pz[j]; @@ -57,7 +59,8 @@ void step() { } } - for (int i = 0; i < N; i++) { + #pragma GCC unroll 3 + for (size_t i = 0; i < N; i++) { stars.px[i] += stars.vx[i] * dt; stars.py[i] += stars.vy[i] * dt; stars.pz[i] += stars.vz[i] * dt; From 5cbe243ecd86b3da557441b241f9c4726fedd3aa Mon Sep 17 00:00:00 2001 From: hebangwen Date: Sun, 28 Aug 2022 01:08:09 +0800 Subject: [PATCH 5/5] feat: follow the original assignment rules --- main.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/main.cpp b/main.cpp index 0ddd29d..4923931 100644 --- a/main.cpp +++ b/main.cpp @@ -39,24 +39,26 @@ void step() { float Gdt = G * dt; float eps_square = eps * eps; - // 原代码是内层不变, 我的代码是内层变 + // 修改为外层是变化的 for (size_t i = 0; i < N; i++) { - float px = stars.px[i]; - float py = stars.py[i]; - float pz = stars.pz[i]; - float mass_Gdt = stars.mass[i] * Gdt; - + float px = stars.px[i], py = stars.py[i], pz = stars.pz[i]; + float vx = stars.vx[i], vy = stars.vy[i], vz = stars.vz[i]; + #pragma GCC unroll 2 for (size_t j = 0; j < N; j++) { - float dx = px - stars.px[j]; - float dy = py - stars.py[j]; - float dz = pz - stars.pz[j]; + float dx = stars.px[j] - px; + float dy = stars.py[j] - py; + float dz = stars.pz[j] - pz; float d2 = dx * dx + dy * dy + dz * dz + eps_square; d2 *= std::sqrt(d2); - stars.vx[j] += dx * mass_Gdt / d2; - stars.vy[j] += dy * mass_Gdt / d2; - stars.vz[j] += dz * mass_Gdt / d2; + vx += dx * stars.mass[j] * Gdt / d2; + vy += dy * stars.mass[j] * Gdt / d2; + vz += dz * stars.mass[j] * Gdt / d2; } + + stars.vx[i] = vx; + stars.vy[i] = vy; + stars.vz[i] = vz; } #pragma GCC unroll 3