
Commit 16505f7: Tabs --> Spaces
Parent: 4a83027

6 files changed: +59, -59 lines. Every change converts tab indentation to spaces, so each removed (-) line and its added (+) counterpart below differ only in leading whitespace, which this rendering does not show.


08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp

Lines changed: 10 additions & 10 deletions

@@ -293,13 +293,13 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclGroupEnd());
 CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
 #else
-MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-std::swap(a_new, a);
+std::swap(a_new, a);
 }
 POP_RANGE

@@ -326,7 +326,7 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
 compute_stream);

 launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,

@@ -346,7 +346,7 @@ int main(int argc, char* argv[]) {
 const int bottom = (rank + 1) % size;

 // Apply periodic boundary conditions
-//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
 // using the nccl communicator and push_stream.
 // Remember to use ncclGroupStart() and ncclGroupEnd()
 #ifdef SOLUTION

@@ -358,14 +358,14 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
 NCCL_CALL(ncclGroupEnd());
 #else
-PUSH_RANGE("MPI", 5)
+PUSH_RANGE("MPI", 5)
 MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 if (calculate_norm) {

@@ -410,13 +410,13 @@ int main(int argc, char* argv[]) {

 if (rank == 0 && result_correct) {
 if (csv) {
-//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
 #ifdef SOLUTION
 printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #else
-printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #endif
-(stop - start), runtime_serial);
+(stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);
 printf(
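For reference, the grouped send/recv pattern that this file's TODO points at (and that the solution file implements) looks roughly like the sketch below. It is not the workshop's exact code: the second recv/send pair, the `ncclFloat` data type, and the bare-bones `NCCL_CHECK` macro are assumptions standing in for the repo's `NCCL_CALL` and `NCCL_REAL_TYPE`.

```cpp
#include <cstdio>
#include <cstdlib>
#include <nccl.h>

// Minimal error-check macro standing in for the workshop's NCCL_CALL.
#define NCCL_CHECK(call)                                                 \
    do {                                                                 \
        ncclResult_t st = (call);                                        \
        if (st != ncclSuccess) {                                         \
            fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(st)); \
            exit(1);                                                     \
        }                                                                \
    } while (0)

// Periodic halo exchange: send our first interior row up and our last interior
// row down, receiving the matching halo rows. Wrapping all four point-to-point
// calls in one group lets NCCL schedule them together and avoids the
// send/recv ordering deadlocks that unmatched pairs could cause.
void exchange_halos_nccl(float* a_new, int nx, int iy_start, int iy_end,
                         int top, int bottom, ncclComm_t comm,
                         cudaStream_t stream) {
    NCCL_CHECK(ncclGroupStart());
    NCCL_CHECK(ncclRecv(a_new + iy_end * nx, nx, ncclFloat, bottom, comm, stream));
    NCCL_CHECK(ncclSend(a_new + iy_start * nx, nx, ncclFloat, top, comm, stream));
    NCCL_CHECK(ncclRecv(a_new, nx, ncclFloat, top, comm, stream));
    NCCL_CHECK(ncclSend(a_new + (iy_end - 1) * nx, nx, ncclFloat, bottom, comm, stream));
    NCCL_CHECK(ncclGroupEnd());
}
```

Unlike the blocking MPI_Sendrecv path it replaces, these calls only enqueue work on `stream`, which is why the surrounding code records the `push_done` event on the stream instead of synchronizing the host.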

08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu

Lines changed: 12 additions & 12 deletions

@@ -341,25 +341,25 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

-launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
-
-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

-CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

 if (calculate_norm) {
-CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
 CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
 compute_stream));
 }

-//TODO: Replace MPI communication with Host initiated NVSHMEM calls
+//TODO: Replace MPI communication with Host initiated NVSHMEM calls
 // Apply periodic boundary conditions
 #ifdef SOLUTION
-PUSH_RANGE("NVSHMEM", 5)
-nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
+PUSH_RANGE("NVSHMEM", 5)
+nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
 nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
 #else
 PUSH_RANGE("MPI", 5)

@@ -369,12 +369,12 @@ int main(int argc, char* argv[]) {
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

-//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
 #ifdef SOLUTION
 nvshmemx_barrier_all_on_stream(compute_stream);
 #endif

@@ -421,7 +421,7 @@ int main(int argc, char* argv[]) {
 if (csv) {
 //TODO: Replace MPI with NVSHMEM for your output
 #ifdef SOLUTION
-printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #else
 printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #endif
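The host-initiated NVSHMEM path in the SOLUTION branch follows the shape sketched below. This is a minimal sketch, assuming `a_new` lives in NVSHMEM symmetric memory (allocated with `nvshmem_malloc`) and that the two halo-row indices are computed as in the workshop code; their exact values do not appear in this diff.

```cpp
#include <nvshmem.h>
#include <nvshmemx.h>

// One-sided halo push: write our boundary rows directly into the neighbors'
// halo rows. Because a_new is symmetric memory, the same offset is valid on
// every PE, so the destination pointer addresses the remote PE's copy.
void exchange_halos_nvshmem(float* a_new, int nx, int iy_start, int iy_end,
                            int iy_top_lower_boundary_idx,
                            int iy_bottom_upper_boundary_idx,
                            int top, int bottom, cudaStream_t push_stream) {
    // Put our top boundary row into the lower halo row of PE `top`.
    nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx,
                                 a_new + iy_start * nx, nx, top, push_stream);
    // Put our bottom boundary row into the upper halo row of PE `bottom`.
    nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx,
                                 a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
}
```

Because puts are one-sided, the target PE gets no notification that data arrived; that is what the `nvshmemx_barrier_all_on_stream(compute_stream)` in the second hunk is for: every PE must reach the barrier before any PE reads its halo rows in the next iteration.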

08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp

Lines changed: 6 additions & 6 deletions

@@ -281,7 +281,7 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream));
 NCCL_CALL(ncclGroupEnd());
 CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
-std::swap(a_new, a);
+std::swap(a_new, a);
 }
 POP_RANGE

@@ -308,7 +308,7 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
 compute_stream);

 launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,

@@ -328,7 +328,7 @@ int main(int argc, char* argv[]) {
 const int bottom = (rank + 1) % size;

 // Apply periodic boundary conditions
-//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
 // using the nccl communicator and push_stream.
 // Remember to use ncclGroupStart() and ncclGroupEnd()
 PUSH_RANGE("NCCL_LAUNCH", 5)

@@ -338,7 +338,7 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
 NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
 NCCL_CALL(ncclGroupEnd());
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 if (calculate_norm) {

@@ -383,9 +383,9 @@ int main(int argc, char* argv[]) {

 if (rank == 0 && result_correct) {
 if (csv) {
-//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
 printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
-(stop - start), runtime_serial);
+(stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);
 printf(

08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu

Lines changed: 12 additions & 12 deletions

@@ -328,31 +328,31 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

-launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
-
-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

-CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

 if (calculate_norm) {
-CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
 CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
 compute_stream));
 }

-//TODO: Replace MPI communication with Host initiated NVSHMEM calls
+//TODO: Replace MPI communication with Host initiated NVSHMEM calls
 // Apply periodic boundary conditions
-PUSH_RANGE("NVSHMEM", 5)
-nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
+PUSH_RANGE("NVSHMEM", 5)
+nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
 nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

-//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
 nvshmemx_barrier_all_on_stream(compute_stream);

 if (calculate_norm) {

@@ -396,7 +396,7 @@ int main(int argc, char* argv[]) {
 if (rank == 0 && result_correct) {
 if (csv) {
 //TODO: Replace MPI with NVSHMEM for your output
-printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 (stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);

08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp

Lines changed: 10 additions & 10 deletions

@@ -260,12 +260,12 @@ int main(int argc, char* argv[]) {
 // on the compute_stream.
 // Remeber that a group of ncclRecv and ncclSend should be within a ncclGroupStart() and ncclGroupEnd()
 // Also, Rember to stream synchronize on the compute_stream at the end
-MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-std::swap(a_new, a);
+MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+std::swap(a_new, a);
 }
 POP_RANGE

@@ -292,7 +292,7 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
 compute_stream);

 launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,

@@ -312,16 +312,16 @@ int main(int argc, char* argv[]) {
 const int bottom = (rank + 1) % size;

 // Apply periodic boundary conditions
-//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
 // using the nccl communicator and push_stream.
 // Remember to use ncclGroupStart() and ncclGroupEnd()
-PUSH_RANGE("MPI", 5)
+PUSH_RANGE("MPI", 5)
 MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 if (calculate_norm) {

@@ -366,9 +366,9 @@ int main(int argc, char* argv[]) {

 if (rank == 0 && result_correct) {
 if (csv) {
-//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
-printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
-(stop - start), runtime_serial);
+//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+(stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);
 printf(
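For comparison, the MPI baseline that the task files start from pairs each send with the opposite receive in a single call, which cannot deadlock the way separately matched MPI_Send/MPI_Recv pairs can. A minimal sketch, with a hypothetical `MPI_CHECK` macro in place of the workshop's `MPI_CALL` and `MPI_FLOAT` in place of `MPI_REAL_TYPE`:

```cpp
#include <mpi.h>
#include <cstdio>

// Minimal error-check macro standing in for the workshop's MPI_CALL.
#define MPI_CHECK(call)                            \
    do {                                           \
        int st = (call);                           \
        if (st != MPI_SUCCESS) {                   \
            fprintf(stderr, "MPI error %d\n", st); \
            MPI_Abort(MPI_COMM_WORLD, st);         \
        }                                          \
    } while (0)

// Periodic halo exchange with blocking MPI_Sendrecv: send the first interior
// row to `top` while receiving the bottom halo row from `bottom`, then the
// mirror-image exchange. The calls block the host until both halves complete.
void exchange_halos_mpi(float* a_new, int nx, int iy_start, int iy_end,
                        int top, int bottom) {
    MPI_CHECK(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
                           a_new + iy_end * nx, nx, MPI_FLOAT, bottom, 0,
                           MPI_COMM_WORLD, MPI_STATUS_IGNORE));
    MPI_CHECK(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0,
                           a_new, nx, MPI_FLOAT, top, 0,
                           MPI_COMM_WORLD, MPI_STATUS_IGNORE));
}
```

With a CUDA-aware MPI, as the workshop code assumes, these buffers can be device pointers; the host-blocking behavior is exactly what the NCCL and NVSHMEM variants remove.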

08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu

Lines changed: 9 additions & 9 deletions

@@ -319,34 +319,34 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

-launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
-
-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

-CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

 if (calculate_norm) {
-CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
 CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
 compute_stream));
 }

-//TODO: Replace MPI communication with Host initiated NVSHMEM calls
+//TODO: Replace MPI communication with Host initiated NVSHMEM calls
 // Apply periodic boundary conditions
 PUSH_RANGE("MPI", 5)
 MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

-//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)

 if (calculate_norm) {
 CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
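All six files share the overlap skeleton these reindented lines belong to: boundary rows are computed and exchanged on `push_stream` while the interior update runs on `compute_stream`, with CUDA events ordering the two. A compressed sketch of that skeleton; the commented-out kernel launches stand in for the workshop's `launch_jacobi_kernel` calls:

```cpp
#include <cuda_runtime.h>

// Two-stream overlap pattern used throughout the jacobi codes: events express
// the cross-stream dependencies without blocking the host inside the loop.
void jacobi_step(cudaStream_t compute_stream, cudaStream_t push_stream,
                 cudaEvent_t push_prep_done, cudaEvent_t push_done) {
    // launch_interior_kernel(..., compute_stream);  // bulk of the work
    // launch_boundary_kernels(..., push_stream);    // top and bottom rows

    // Boundary rows are ready: mark the point after which they may be sent.
    cudaEventRecord(push_prep_done, push_stream);

    // ... halo exchange enqueued on push_stream (MPI, NCCL, or NVSHMEM) ...
    cudaEventRecord(push_done, push_stream);

    // The next interior update must not read the halo rows before the
    // exchange finishes; wait on the event rather than the whole stream.
    cudaStreamWaitEvent(compute_stream, push_done, 0);
}
```

The only host-blocking call in the loop body is the `cudaStreamSynchronize(compute_stream)` guarding the norm check, and it runs only on iterations where `calculate_norm` is true.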
