8 changes: 8 additions & 0 deletions docs/env_vars.rst
@@ -90,3 +90,11 @@ The following CSV files are generated:
Each CSV file includes grid configuration information as comments at the top, followed by performance data in comma-separated format.

Default setting is unset (no CSV files written). Setting this variable to a directory path will enable CSV file output.

CUDECOMP_USE_COL_MAJOR_RANK_ORDER
--------------------------------------
(since v0.6.0)

:code:`CUDECOMP_USE_COL_MAJOR_RANK_ORDER` controls the rank assignment order in the process grid. By default, ranks are assigned in row-major order for consistency with :code:`MPI_Cart_*` routines. When enabled, ranks are assigned in column-major order.

Default setting is off (:code:`0`). Setting this variable to :code:`1` will enable column-major rank assignment.
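
As a minimal standalone sketch of the two orderings (the function and variable names here are illustrative, not part of the library API), the mapping from a global rank to its process-grid coordinates looks like this:

#include <cstdio>

// Sketch: map a global rank to process-grid coordinates (pidx[0], pidx[1]) on a
// pdims[0] x pdims[1] (P_row x P_col) grid, under both orderings.
static void rankToPidx(int rank, const int pdims[2], bool col_major, int pidx[2]) {
  if (col_major) {
    // Column-major: the row index varies fastest, so consecutive ranks fill a column.
    pidx[0] = rank % pdims[0];
    pidx[1] = rank / pdims[0];
  } else {
    // Row-major (default): the column index varies fastest, so consecutive ranks fill a row.
    pidx[0] = rank / pdims[1];
    pidx[1] = rank % pdims[1];
  }
}

int main() {
  const int pdims[2] = {2, 3}; // 2 x 3 process grid, 6 ranks
  for (int rank = 0; rank < pdims[0] * pdims[1]; ++rank) {
    int rm[2], cm[2];
    rankToPidx(rank, pdims, false, rm);
    rankToPidx(rank, pdims, true, cm);
    std::printf("rank %d: row-major (%d,%d), column-major (%d,%d)\n", rank, rm[0], rm[1], cm[0], cm[1]);
  }
  return 0;
}

With this 2 x 3 grid, row-major order places ranks 0, 1, 2 along the first row, while column-major order places ranks 0, 1 down the first column.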
4 changes: 2 additions & 2 deletions docs/overview.rst
@@ -33,8 +33,8 @@ cuDecomp can distribute 3D Cartesian domains with dimensions :math:`[X, Y, Z]`,
the *global grid*. The global grid is decomposed
across :math:`N_{\text{GPU}}` processes in a 2D **process grid** with dimensions :math:`P_{\text{row}} \times P_{\text{col}}`. The processes
are logically grouped by column and row index into :math:`P_{\text{row}}` *row* communicators and :math:`P_{\text{col}}` *column* communicators.
For consistency with :code:`MPI_Cart_*` routines, the ranks are assigned in a row-major ordering (i.e. row communicators
are composed of sequential ranks).
By default, for consistency with :code:`MPI_Cart_*` routines, the ranks are assigned in a row-major ordering (i.e. row communicators
are composed of sequential ranks). This can be changed to column-major ordering using the :code:`CUDECOMP_USE_COL_MAJOR_RANK_ORDER` environment variable (see :ref:`env-var-section-ref`).
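
To make this grouping concrete, here is a minimal sketch of how the communicators relate to the grid coordinates; the row split by pidx[0] mirrors the color_row split in src/cudecomp.cc further below, while the column split by pidx[1] is assumed by symmetry and the function name is illustrative:

#include <mpi.h>

// Sketch: build row and column communicators from a rank's process-grid
// coordinates. pidx[0] is the row index, pidx[1] the column index.
static void buildGridComms(MPI_Comm comm, const int pidx[2],
                           MPI_Comm* row_comm, MPI_Comm* col_comm) {
  int rank;
  MPI_Comm_rank(comm, &rank);
  // Ranks sharing a row index form a row communicator; with the default
  // row-major ordering these are sequential global ranks.
  MPI_Comm_split(comm, /*color=*/pidx[0], /*key=*/rank, row_comm);
  // Ranks sharing a column index form a column communicator; with
  // column-major ordering these are the sequential ones instead.
  MPI_Comm_split(comm, /*color=*/pidx[1], /*key=*/rank, col_comm);
}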

cuDecomp will distribute the global domain data so that each process is assigned a unique *pencil* of data, with three different
pencil configurations corresponding to different transposed configurations of the global domain. The domain can be
12 changes: 6 additions & 6 deletions include/internal/comm_routines.h
@@ -113,7 +113,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
for (int i = 1; i < send_counts.size(); ++i) {
int src_rank, dst_rank;
getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; }

params.send_offsets[count] = send_offsets[dst_rank];
@@ -140,7 +140,7 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
for (int i = 1; i < send_counts.size(); ++i) {
int src_rank, dst_rank;
getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
if (nvshmem_ptr(recv_buff, dst_rank_global)) {

if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 &&
@@ -242,7 +242,7 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD

CHECK_NCCL(ncclGroupStart());
for (int i = 0; i < send_counts.size(); ++i) {
int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
if (comm_info.ngroups == 1) { peer_rank_global = handle->rank_to_clique_rank[peer_rank_global]; }
if (send_counts[i] != 0) {
CHECK_NCCL(ncclSend(send_buff + send_offsets[i], send_counts[i] * sizeof(T), ncclChar, peer_rank_global, comm,
@@ -400,7 +400,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
synced = true;
}

int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
// Need to chunk host API calls due to 2 GiB limitation in API
size_t send_bytes = send_counts[dst_rank] * sizeof(T);
int nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
@@ -463,8 +463,8 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
CHECK_NCCL(ncclGroupStart());
group_started = true;
}
int src_rank_global = getGlobalRank(grid_desc, comm_axis, src_rank);
int dst_rank_global = getGlobalRank(grid_desc, comm_axis, dst_rank);
int src_rank_global = getGlobalRank(handle, grid_desc, comm_axis, src_rank);
int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
if (comm_info.ngroups == 1) {
src_rank_global = handle->rank_to_clique_rank[src_rank_global];
dst_rank_global = handle->rank_to_clique_rank[dst_rank_global];
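
The chunking arithmetic in the pipelined path above is a plain ceiling division; a minimal sketch, with the chunk size assumed for illustration (the actual CUDECOMP_NVSHMEM_CHUNK_SZ value is not shown in this diff):

#include <cstddef>

// Ceiling division used to split a transfer into chunks below a host-API size
// limit (the 2 GiB cap noted in the comment above). The chunk size here is an
// assumed placeholder, not the library's CUDECOMP_NVSHMEM_CHUNK_SZ value.
constexpr std::size_t kChunkSz = std::size_t(1) << 30; // 1 GiB, for illustration

inline int numChunks(std::size_t send_bytes) {
  return static_cast<int>((send_bytes + kChunkSz - 1) / kChunkSz);
}
// numChunks(0) == 0, numChunks(kChunkSz) == 1, numChunks(kChunkSz + 1) == 2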
21 changes: 15 additions & 6 deletions include/internal/common.h
@@ -125,7 +125,8 @@ struct cudecompHandle {
""; // directory to write CSV performance reports, empty means no file writing

// Miscellaneous
int32_t device_p2p_ce_count = 0; // number of P2P CEs available
int32_t device_p2p_ce_count = 0; // number of P2P CEs available
bool use_col_major_rank_order = false; // Flag to control whether to use column-major rank order
};

// Structure with information about row/column communicator
@@ -224,9 +225,17 @@ using comm_count_t = int32_t;
enum cudecompCommAxis { CUDECOMP_COMM_COL = 0, CUDECOMP_COMM_ROW = 1 };

// Helper function to convert row or column rank to global rank
static inline int getGlobalRank(const cudecompGridDesc_t grid_desc, cudecompCommAxis axis, int axis_rank) {
return (axis == CUDECOMP_COMM_ROW) ? grid_desc->config.pdims[1] * grid_desc->pidx[0] + axis_rank
: grid_desc->pidx[1] + axis_rank * grid_desc->config.pdims[1];
static inline int getGlobalRank(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc,
cudecompCommAxis axis, int axis_rank) {
if (handle->use_col_major_rank_order) {
// Column-major rank order
return (axis == CUDECOMP_COMM_ROW) ? grid_desc->pidx[0] + axis_rank * grid_desc->config.pdims[0]
: grid_desc->config.pdims[0] * grid_desc->pidx[1] + axis_rank;
} else {
// Row-major rank order (default)
return (axis == CUDECOMP_COMM_ROW) ? grid_desc->config.pdims[1] * grid_desc->pidx[0] + axis_rank
: grid_desc->pidx[1] + axis_rank * grid_desc->config.pdims[1];
}
}

// Helper function to return maximum pencil size across all processes for a given axis
@@ -320,7 +329,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
// Count occurrences of hostname in row/col communicator
std::map<std::string, int> host_counts;
for (int i = 0; i < info.nranks; ++i) {
int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
std::string hostname = std::string(handle->hostnames[peer_rank_global].data());
host_counts[hostname]++;
}
@@ -338,7 +347,7 @@
// For MNNVL configurations, count occurrences of clique in row/col communicator
std::map<unsigned int, int> clique_counts;
for (int i = 0; i < info.nranks; ++i) {
int peer_rank_global = getGlobalRank(grid_desc, comm_axis, i);
int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i);
unsigned int clique = handle->rank_to_clique[peer_rank_global];
clique_counts[clique]++;
}
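
As a sanity check on the new getGlobalRank logic, here is a standalone restatement (illustrative names, not the library API): mapping a rank's own position along an axis back to a global rank must reproduce that rank, under either ordering.

#include <cassert>

// Standalone restatement of the two branches in getGlobalRank: convert a rank
// within a row or column communicator to a global rank, given this process's
// grid coordinates pidx and the grid dimensions pdims.
enum Axis { COL = 0, ROW = 1 };

static int axisToGlobal(const int pidx[2], const int pdims[2], Axis axis, int axis_rank, bool col_major) {
  if (col_major) {
    return (axis == ROW) ? pidx[0] + axis_rank * pdims[0]   // peer in my row, at column axis_rank
                         : pdims[0] * pidx[1] + axis_rank;  // peer in my column, at row axis_rank
  }
  return (axis == ROW) ? pdims[1] * pidx[0] + axis_rank     // peer in my row, at column axis_rank
                       : pidx[1] + axis_rank * pdims[1];    // peer in my column, at row axis_rank
}

int main() {
  // For every rank in a 2 x 3 grid, mapping its own row/column position back
  // through axisToGlobal must reproduce the rank, for both orderings.
  const int pdims[2] = {2, 3};
  for (int col_major = 0; col_major <= 1; ++col_major) {
    for (int rank = 0; rank < pdims[0] * pdims[1]; ++rank) {
      int pidx[2];
      if (col_major) {
        pidx[0] = rank % pdims[0];
        pidx[1] = rank / pdims[0];
      } else {
        pidx[0] = rank / pdims[1];
        pidx[1] = rank % pdims[1];
      }
      assert(axisToGlobal(pidx, pdims, ROW, pidx[1], col_major) == rank);
      assert(axisToGlobal(pidx, pdims, COL, pidx[0], col_major) == rank);
    }
  }
  return 0;
}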
18 changes: 14 additions & 4 deletions src/autotune.cc
@@ -155,8 +155,13 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
for (auto& pdim1 : pdim1_list) {
grid_desc->config.pdims[0] = handle->nranks / pdim1;
grid_desc->config.pdims[1] = pdim1;
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
if (handle->use_col_major_rank_order) {
grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
} else {
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
}

cudecompPencilInfo_t pinfo_x0, pinfo_x3;
cudecompPencilInfo_t pinfo_y0, pinfo_y1, pinfo_y2, pinfo_y3;
@@ -583,8 +588,13 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
for (auto& pdim1 : pdim1_list) {
grid_desc->config.pdims[0] = handle->nranks / pdim1;
grid_desc->config.pdims[1] = pdim1;
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
if (handle->use_col_major_rank_order) {
grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
} else {
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
}

cudecompPencilInfo_t pinfo;
CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents,
24 changes: 19 additions & 5 deletions src/cudecomp.cc
@@ -374,6 +374,10 @@ static void getCudecompEnvVars(cudecompHandle_t& handle) {
// Check CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR (Directory for CSV performance reports)
const char* performance_write_dir_str = std::getenv("CUDECOMP_PERFORMANCE_REPORT_WRITE_DIR");
if (performance_write_dir_str) { handle->performance_report_write_dir = std::string(performance_write_dir_str); }

// Check CUDECOMP_USE_COL_MAJOR_RANK_ORDER (Column-major rank assignment)
const char* col_major_rank_str = std::getenv("CUDECOMP_USE_COL_MAJOR_RANK_ORDER");
if (col_major_rank_str) { handle->use_col_major_rank_order = std::strtol(col_major_rank_str, nullptr, 10) == 1; }
}

#ifdef ENABLE_NVSHMEM
@@ -634,8 +638,13 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
if (grid_desc->config.pdims[0] > 0 && grid_desc->config.pdims[1] > 0) {
// If pdims are set, temporarily set up comm info structures to determine if we need to create a local NCCL
// communicator
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
if (handle->use_col_major_rank_order) {
grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
} else {
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
}
int color_row = grid_desc->pidx[0];
MPI_Comm row_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
@@ -721,8 +730,13 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
THROW_NOT_SUPPORTED("No valid decomposition found during autotuning with provided arguments.");
}

grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
if (handle->use_col_major_rank_order) {
grid_desc->pidx[0] = handle->rank % grid_desc->config.pdims[0];
grid_desc->pidx[1] = handle->rank / grid_desc->config.pdims[0];
} else {
grid_desc->pidx[0] = handle->rank / grid_desc->config.pdims[1];
grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1];
}

// Setup final row and column communicators
int color_row = grid_desc->pidx[0];
@@ -1341,7 +1355,7 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes
*shifted_rank = -1; // "null" case
} else {
int comm_peer = (shifted + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis];
int global_peer = getGlobalRank(grid_desc, comm_axis, comm_peer);
int global_peer = getGlobalRank(handle, grid_desc, comm_axis, comm_peer);
*shifted_rank = global_peer;
}

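
A small note on the unchanged modulo logic visible in cudecompGetShiftedRank above: it implements a periodic wrap along a process-grid axis, sketched here with an illustrative helper name.

// Periodic wraparound along a process-grid axis, as in
//   (shifted + grid_desc->config.pdims[comm_axis]) % grid_desc->config.pdims[comm_axis]
// above. Adding the axis length before taking the modulo keeps the result
// non-negative for shifts of up to one full axis length in the negative direction.
inline int wrapShift(int shifted, int pdims_axis) {
  return (shifted + pdims_axis) % pdims_axis;
}
// With pdims_axis = 4: wrapShift(-1, 4) == 3 and wrapShift(4, 4) == 0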