
Commit 16505f7: Tabs --> Spaces
Parent: 4a83027

6 files changed: +59, -59 lines. Every change converts tab indentation to spaces, so each removed (-) line and its added (+) counterpart below differ only in leading whitespace, which this rendering does not show.


08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp

Lines changed: 10 additions & 10 deletions

@@ -293,13 +293,13 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclGroupEnd());
 CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
 #else
-MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-std::swap(a_new, a);
+std::swap(a_new, a);
 }
 POP_RANGE

@@ -326,7 +326,7 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
 compute_stream);

 launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,

@@ -346,7 +346,7 @@ int main(int argc, char* argv[]) {
 const int bottom = (rank + 1) % size;

 // Apply periodic boundary conditions
-//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
 // using the nccl communicator and push_stream.
 // Remember to use ncclGroupStart() and ncclGroupEnd()
 #ifdef SOLUTION

@@ -358,14 +358,14 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
 NCCL_CALL(ncclGroupEnd());
 #else
-PUSH_RANGE("MPI", 5)
+PUSH_RANGE("MPI", 5)
 MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 if (calculate_norm) {

@@ -410,13 +410,13 @@ int main(int argc, char* argv[]) {

 if (rank == 0 && result_correct) {
 if (csv) {
-//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
 #ifdef SOLUTION
 printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #else
-printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #endif
-(stop - start), runtime_serial);
+(stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);
 printf(
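For reference, the grouped send/recv pattern that this file's TODO points at (and that the solution file implements) looks roughly like the sketch below. It is not the workshop's exact code: the second recv/send pair, the `ncclFloat` data type, and the bare-bones `NCCL_CHECK` macro are assumptions standing in for the repo's `NCCL_CALL` and `NCCL_REAL_TYPE`.

```cpp
#include <cstdio>
#include <cstdlib>
#include <nccl.h>

// Minimal error-check macro standing in for the workshop's NCCL_CALL.
#define NCCL_CHECK(call)                                                 \
    do {                                                                 \
        ncclResult_t st = (call);                                        \
        if (st != ncclSuccess) {                                         \
            fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(st)); \
            exit(1);                                                     \
        }                                                                \
    } while (0)

// Periodic halo exchange: send our first interior row up and our last interior
// row down, receiving the matching halo rows. Wrapping all four point-to-point
// calls in one group lets NCCL schedule them together and avoids the
// send/recv ordering deadlocks that unmatched pairs could cause.
void exchange_halos_nccl(float* a_new, int nx, int iy_start, int iy_end,
                         int top, int bottom, ncclComm_t comm,
                         cudaStream_t stream) {
    NCCL_CHECK(ncclGroupStart());
    NCCL_CHECK(ncclRecv(a_new + iy_end * nx, nx, ncclFloat, bottom, comm, stream));
    NCCL_CHECK(ncclSend(a_new + iy_start * nx, nx, ncclFloat, top, comm, stream));
    NCCL_CHECK(ncclRecv(a_new, nx, ncclFloat, top, comm, stream));
    NCCL_CHECK(ncclSend(a_new + (iy_end - 1) * nx, nx, ncclFloat, bottom, comm, stream));
    NCCL_CHECK(ncclGroupEnd());
}
```

Unlike the blocking MPI_Sendrecv path it replaces, these calls only enqueue work on `stream`, which is why the surrounding code records the `push_done` event on the stream instead of synchronizing the host.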

08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu

Lines changed: 12 additions & 12 deletions

@@ -341,25 +341,25 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

-launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
-
-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

-CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

 if (calculate_norm) {
-CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
 CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
 compute_stream));
 }

-//TODO: Replace MPI communication with Host initiated NVSHMEM calls
+//TODO: Replace MPI communication with Host initiated NVSHMEM calls
 // Apply periodic boundary conditions
 #ifdef SOLUTION
-PUSH_RANGE("NVSHMEM", 5)
-nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
+PUSH_RANGE("NVSHMEM", 5)
+nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
 nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
 #else
 PUSH_RANGE("MPI", 5)

@@ -369,12 +369,12 @@ int main(int argc, char* argv[]) {
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

-//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
 #ifdef SOLUTION
 nvshmemx_barrier_all_on_stream(compute_stream);
 #endif

@@ -421,7 +421,7 @@ int main(int argc, char* argv[]) {
 if (csv) {
 //TODO: Replace MPI with NVSHMEM for your output
 #ifdef SOLUTION
-printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #else
 printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #endif
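The host-initiated NVSHMEM path in the SOLUTION branch follows the shape sketched below. This is a minimal sketch, assuming `a_new` lives in NVSHMEM symmetric memory (allocated with `nvshmem_malloc`) and that the two halo-row indices are computed as in the workshop code; their exact values do not appear in this diff.

```cpp
#include <nvshmem.h>
#include <nvshmemx.h>

// One-sided halo push: write our boundary rows directly into the neighbors'
// halo rows. Because a_new is symmetric memory, the same offset is valid on
// every PE, so the destination pointer addresses the remote PE's copy.
void exchange_halos_nvshmem(float* a_new, int nx, int iy_start, int iy_end,
                            int iy_top_lower_boundary_idx,
                            int iy_bottom_upper_boundary_idx,
                            int top, int bottom, cudaStream_t push_stream) {
    // Put our top boundary row into the lower halo row of PE `top`.
    nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx,
                                 a_new + iy_start * nx, nx, top, push_stream);
    // Put our bottom boundary row into the upper halo row of PE `bottom`.
    nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx,
                                 a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
}
```

Because puts are one-sided, the target PE gets no notification that data arrived; that is what the `nvshmemx_barrier_all_on_stream(compute_stream)` in the second hunk is for: every PE must reach the barrier before any PE reads its halo rows in the next iteration.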

08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp

Lines changed: 6 additions & 6 deletions

@@ -281,7 +281,7 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream));
 NCCL_CALL(ncclGroupEnd());
 CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
-std::swap(a_new, a);
+std::swap(a_new, a);
 }
 POP_RANGE

@@ -308,7 +308,7 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
 compute_stream);

 launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,

@@ -328,7 +328,7 @@ int main(int argc, char* argv[]) {
 const int bottom = (rank + 1) % size;

 // Apply periodic boundary conditions
-//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
 // using the nccl communicator and push_stream.
 // Remember to use ncclGroupStart() and ncclGroupEnd()
 PUSH_RANGE("NCCL_LAUNCH", 5)

@@ -338,7 +338,7 @@ int main(int argc, char* argv[]) {
 NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
 NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
 NCCL_CALL(ncclGroupEnd());
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 if (calculate_norm) {

@@ -383,9 +383,9 @@ int main(int argc, char* argv[]) {

 if (rank == 0 && result_correct) {
 if (csv) {
-//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
 printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
-(stop - start), runtime_serial);
+(stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);
 printf(

08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu

Lines changed: 12 additions & 12 deletions

@@ -328,31 +328,31 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

-launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
-
-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

-CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

 if (calculate_norm) {
-CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
 CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
 compute_stream));
 }

-//TODO: Replace MPI communication with Host initiated NVSHMEM calls
+//TODO: Replace MPI communication with Host initiated NVSHMEM calls
 // Apply periodic boundary conditions
-PUSH_RANGE("NVSHMEM", 5)
-nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
+PUSH_RANGE("NVSHMEM", 5)
+nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
 nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

-//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
 nvshmemx_barrier_all_on_stream(compute_stream);

 if (calculate_norm) {

@@ -396,7 +396,7 @@ int main(int argc, char* argv[]) {
 if (rank == 0 && result_correct) {
 if (csv) {
 //TODO: Replace MPI with NVSHMEM for your output
-printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 (stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);

08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp

Lines changed: 10 additions & 10 deletions

@@ -260,12 +260,12 @@ int main(int argc, char* argv[]) {
 // on the compute_stream.
 // Remeber that a group of ncclRecv and ncclSend should be within a ncclGroupStart() and ncclGroupEnd()
 // Also, Rember to stream synchronize on the compute_stream at the end
-MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-std::swap(a_new, a);
+MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+std::swap(a_new, a);
 }
 POP_RANGE

@@ -292,7 +292,7 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
 compute_stream);

 launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,

@@ -312,16 +312,16 @@ int main(int argc, char* argv[]) {
 const int bottom = (rank + 1) % size;

 // Apply periodic boundary conditions
-//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+//TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
 // using the nccl communicator and push_stream.
 // Remember to use ncclGroupStart() and ncclGroupEnd()
-PUSH_RANGE("MPI", 5)
+PUSH_RANGE("MPI", 5)
 MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 if (calculate_norm) {

@@ -366,9 +366,9 @@ int main(int argc, char* argv[]) {

 if (rank == 0 && result_correct) {
 if (csv) {
-//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
-printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
-(stop - start), runtime_serial);
+//TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+(stop - start), runtime_serial);
 } else {
 printf("Num GPUs: %d.\n", size);
 printf(
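For comparison, the MPI baseline that the task files start from pairs each send with the opposite receive in a single call, which cannot deadlock the way separately matched MPI_Send/MPI_Recv pairs can. A minimal sketch, with a hypothetical `MPI_CHECK` macro in place of the workshop's `MPI_CALL` and `MPI_FLOAT` in place of `MPI_REAL_TYPE`:

```cpp
#include <mpi.h>
#include <cstdio>

// Minimal error-check macro standing in for the workshop's MPI_CALL.
#define MPI_CHECK(call)                            \
    do {                                           \
        int st = (call);                           \
        if (st != MPI_SUCCESS) {                   \
            fprintf(stderr, "MPI error %d\n", st); \
            MPI_Abort(MPI_COMM_WORLD, st);         \
        }                                          \
    } while (0)

// Periodic halo exchange with blocking MPI_Sendrecv: send the first interior
// row to `top` while receiving the bottom halo row from `bottom`, then the
// mirror-image exchange. The calls block the host until both halves complete.
void exchange_halos_mpi(float* a_new, int nx, int iy_start, int iy_end,
                        int top, int bottom) {
    MPI_CHECK(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
                           a_new + iy_end * nx, nx, MPI_FLOAT, bottom, 0,
                           MPI_COMM_WORLD, MPI_STATUS_IGNORE));
    MPI_CHECK(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0,
                           a_new, nx, MPI_FLOAT, top, 0,
                           MPI_COMM_WORLD, MPI_STATUS_IGNORE));
}
```

With a CUDA-aware MPI, as the workshop code assumes, these buffers can be device pointers; the host-blocking behavior is exactly what the NCCL and NVSHMEM variants remove.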

08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu

Lines changed: 9 additions & 9 deletions

@@ -319,34 +319,34 @@ int main(int argc, char* argv[]) {
 CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
 calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

-launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
-
-launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+
+launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

-CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

 if (calculate_norm) {
-CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
 CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
 compute_stream));
 }

-//TODO: Replace MPI communication with Host initiated NVSHMEM calls
+//TODO: Replace MPI communication with Host initiated NVSHMEM calls
 // Apply periodic boundary conditions
 PUSH_RANGE("MPI", 5)
 MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
 a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
 MPI_STATUS_IGNORE));
 MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
 MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
 POP_RANGE

 CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

-//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+//TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)

 if (calculate_norm) {
 CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
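All six files share the overlap skeleton these reindented lines belong to: boundary rows are computed and exchanged on `push_stream` while the interior update runs on `compute_stream`, with CUDA events ordering the two. A compressed sketch of that skeleton; the commented-out kernel launches stand in for the workshop's `launch_jacobi_kernel` calls:

```cpp
#include <cuda_runtime.h>

// Two-stream overlap pattern used throughout the jacobi codes: events express
// the cross-stream dependencies without blocking the host inside the loop.
void jacobi_step(cudaStream_t compute_stream, cudaStream_t push_stream,
                 cudaEvent_t push_prep_done, cudaEvent_t push_done) {
    // launch_interior_kernel(..., compute_stream);  // bulk of the work
    // launch_boundary_kernels(..., push_stream);    // top and bottom rows

    // Boundary rows are ready: mark the point after which they may be sent.
    cudaEventRecord(push_prep_done, push_stream);

    // ... halo exchange enqueued on push_stream (MPI, NCCL, or NVSHMEM) ...
    cudaEventRecord(push_done, push_stream);

    // The next interior update must not read the halo rows before the
    // exchange finishes; wait on the event rather than the whole stream.
    cudaStreamWaitEvent(compute_stream, push_done, 0);
}
```

The only host-blocking call in the loop body is the `cudaStreamSynchronize(compute_stream)` guarding the norm check, and it runs only on iterations where `calculate_norm` is true.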
