
Commit c98e4b4

Merge pull request #9 from tskisner/comm_reuse
Support passing in pre-created node and node-rank communicators
2 parents ba9ce2e + a737123

File tree

2 files changed: +95, -17 lines


pshmem/shmem.py

Lines changed: 52 additions & 10 deletions
@@ -12,8 +12,7 @@
 
 
 class MPIShared(object):
-    """
-    Create a shared memory buffer that is replicated across nodes.
+    """Create a shared memory buffer that is replicated across nodes.
 
     For the given array dimensions and datatype, the original communicator
     is split into groups of processes that can share memory (i.e. that are
@@ -32,13 +31,19 @@ class MPIShared(object):
     If comm is None, a simple local numpy array is used.
 
     Args:
-        shape (tuple): the dimensions of the array.
-        dtype (np.dtype): the data type of the array.
-        comm (MPI.Comm): the full communicator to use. This may span
+        shape (tuple): The dimensions of the array.
+        dtype (np.dtype): The data type of the array.
+        comm (MPI.Comm): The full communicator to use. This may span
             multiple nodes, and each node will have a copy of the data.
+        comm_node (MPI.Comm): The communicator of processes within the
+            same node. If None, the node communicator will be created.
+        comm_node_rank (MPI.Comm): The communicator of processes with
+            the same rank across all nodes. If None, this will be
+            created.
+
     """
 
-    def __init__(self, shape, dtype, comm):
+    def __init__(self, shape, dtype, comm, comm_node=None, comm_node_rank=None):
         # Copy the datatype in order to support arguments that are aliases,
         # like "numpy.float64".
         self._dtype = np.dtype(dtype)
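
For orientation, here is a rough usage sketch of the new keyword arguments, mirroring the construction pattern used in the test suite further down. The variable names and the top-level import of MPIShared from the pshmem package are illustrative assumptions, not part of this diff.

from mpi4py import MPI
import numpy as np
from pshmem import MPIShared

comm = MPI.COMM_WORLD

# Split the world communicator once and re-use the pieces for every
# MPIShared buffer, instead of paying for a new split per instance.
node_comm = comm.Split_type(MPI.COMM_TYPE_SHARED, 0)
node_rank_comm = comm.Split(node_comm.rank, comm.rank // node_comm.size)

shm = MPIShared(
    (100, 50),
    np.float64,
    comm,
    comm_node=node_comm,
    comm_node_rank=node_rank_comm,
)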
@@ -79,14 +84,43 @@ def __init__(self, shape, dtype, comm):
         if self._comm is not None:
             from mpi4py import MPI
 
-            self._nodecomm = self._comm.Split_type(MPI.COMM_TYPE_SHARED, 0)
+            self._free_comm_node = False
+            if comm_node is None:
+                # Create it
+                self._nodecomm = self._comm.Split_type(MPI.COMM_TYPE_SHARED, 0)
+                self._free_comm_node = True
+            else:
+                # Check it
+                if self._procs % comm_node.size != 0:
+                    msg = "Node communicator size ({}) does not divide ".format(
+                        comm_node.size
+                    )
+                    msg += "evenly into the total number of processes ({})".format(
+                        self._procs
+                    )
+                    raise ValueError(msg)
+                self._nodecomm = comm_node
             self._noderank = self._nodecomm.rank
             self._nodeprocs = self._nodecomm.size
             self._nodes = self._procs // self._nodeprocs
             if self._nodes * self._nodeprocs < self._procs:
                 self._nodes += 1
             self._mynode = self._rank // self._nodeprocs
-            self._rankcomm = self._comm.Split(self._noderank, self._mynode)
+
+            self._free_comm_node_rank = False
+            if comm_node_rank is None:
+                # Create it
+                self._rankcomm = self._comm.Split(self._noderank, self._mynode)
+                self._free_comm_node_rank = True
+            else:
+                # Check it
+                if comm_node_rank.size != self._nodes:
+                    msg = "Node rank communicator size ({}) does not match ".format(
+                        comm_node_rank.size
+                    )
+                    msg += "the number of nodes ({})".format(self._nodes)
+                    raise ValueError(msg)
+                self._rankcomm = comm_node_rank
 
             # Consider a corner case of the previous calculation. Imagine that
             # the number of processes is not evenly divisible by the number of
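
In plain terms, the two checks above require that a supplied comm_node tile the full communicator evenly and that comm_node_rank contain exactly one process per node. A minimal standalone sketch of the same validation (a hypothetical helper for illustration, not part of the package):

def check_reused_comms(comm, comm_node, comm_node_rank):
    # The node communicator must evenly divide the total process count.
    if comm.size % comm_node.size != 0:
        raise ValueError("comm_node size does not divide the world size evenly")
    nodes = comm.size // comm_node.size
    # The node-rank communicator must have exactly one member per node.
    if comm_node_rank.size != nodes:
        raise ValueError("comm_node_rank size does not equal the number of nodes")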
@@ -291,10 +325,18 @@ def close(self):
             self._win.Free()
         self._win = None
         # Free other communicators if needed
-        if hasattr(self, "_rankcomm") and (self._rankcomm is not None):
+        if (
+            hasattr(self, "_rankcomm")
+            and (self._rankcomm is not None)
+            and self._free_comm_node_rank
+        ):
             self._rankcomm.Free()
             self._rankcomm = None
-        if hasattr(self, "_nodecomm") and (self._nodecomm is not None):
+        if (
+            hasattr(self, "_nodecomm")
+            and (self._nodecomm is not None)
+            and self._free_comm_node
+        ):
             self._nodecomm.Free()
             self._nodecomm = None
         return
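
A consequence of the new _free_comm_node and _free_comm_node_rank flags is that close() only frees communicators the class created itself; anything passed in remains owned by the caller. A hedged sketch of the expected caller-side cleanup (variable names are illustrative, continuing the example above):

shm = MPIShared(
    (100, 50), np.float64, comm, comm_node=node_comm, comm_node_rank=node_rank_comm
)
try:
    pass  # ... read and write the shared buffer ...
finally:
    shm.close()            # frees the MPI window, but not the passed-in communicators
    node_rank_comm.Free()  # the caller is still responsible for these
    node_comm.Free()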

pshmem/test.py

Lines changed: 43 additions & 7 deletions
@@ -27,7 +27,7 @@
 try:
     import mpi4py.MPI as MPI
 except ImportError:
-    raise ImportError("Cannot import mpi4py, will only test serial functionality.")
+    print("Cannot import mpi4py, will only test serial functionality.")
 
 
 class ShmemTest(unittest.TestCase):
@@ -44,7 +44,7 @@ def setUp(self):
     def tearDown(self):
         pass
 
-    def read_write(self, comm):
+    def read_write(self, comm, comm_node=None, comm_node_rank=None):
         """Run a sequence of various access tests."""
         rank = 0
         procs = 1
@@ -76,7 +76,13 @@ def read_write(self, comm):
         # object has no dangling reference counts after leaving the context,
         # and will ensure that the shared memory is freed properly.
 
-        with MPIShared(local.shape, local.dtype, comm) as shm:
+        with MPIShared(
+            local.shape,
+            local.dtype,
+            comm,
+            comm_node=comm_node,
+            comm_node_rank=comm_node_rank,
+        ) as shm:
             for p in range(procs):
                 # Every process takes turns writing to the buffer.
                 setdata = None
@@ -94,7 +100,7 @@ def read_write(self, comm):
                 try:
                     # All processes call set(), but only data on rank p matters.
                     shm.set(setdata, setoffset, fromrank=p)
-                except:
+                except (RuntimeError, ValueError):
                     print(
                         "proc {} threw exception during set()".format(rank),
                         flush=True,
@@ -117,7 +123,7 @@ def read_write(self, comm):
                         setoffset[1] : setoffset[1] + setdata.shape[1],
                         setoffset[2] : setoffset[2] + setdata.shape[2],
                     ] = setdata
-                except:
+                except (RuntimeError, ValueError):
                     print(
                         "proc {} threw exception during __setitem__".format(
                             rank
@@ -221,6 +227,36 @@ def test_comm_self(self):
         # Every process does the operations on COMM_SELF
         self.read_write(MPI.COMM_SELF)
 
+    def test_comm_reuse(self):
+        if self.comm is not None:
+            if self.comm.rank == 0:
+                print("Testing MPIShared with re-used node comm...", flush=True)
+            nodecomm = self.comm.Split_type(MPI.COMM_TYPE_SHARED, 0)
+            noderank = nodecomm.rank
+            nodeprocs = nodecomm.size
+            nodes = self.comm.size // nodeprocs
+            mynode = self.comm.rank // nodeprocs
+            rankcomm = self.comm.Split(noderank, mynode)
+
+            self.read_write(self.comm, comm_node=nodecomm, comm_node_rank=rankcomm)
+
+            if nodes > 1 and nodeprocs > 2:
+                # We have at least one node, test passing in an incorrect
+                # communicator for the node comm.
+                evenoddcomm = self.comm.Split(self.comm.rank % 2, self.comm.rank // 2)
+                try:
+                    test_shared = MPIShared(
+                        (10, 5),
+                        np.float64,
+                        self.comm,
+                        comm_node=evenoddcomm,
+                        comm_node_rank=evenoddcomm,
+                    )
+                    print("Failed to catch construction with bad node comm")
+                    self.assertTrue(False)
+                except ValueError:
+                    print("Successfully caught construction with bad node comm")
+
     def test_shape(self):
         good_dims = [
             (2, 5, 10),
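
To see why the even/odd split is rejected in the test above, consider, say, 2 nodes of 4 processes each (8 total): evenoddcomm then has size 4. Treated as comm_node it implies 8 // 4 = 2 nodes, so the same size-4 communicator fails the comm_node_rank check (4 != 2) and the constructor raises ValueError, which is exactly what the test expects to catch.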
@@ -245,7 +281,7 @@ def test_shape(self):
                 if self.rank == 0:
                     print("successful creation with shape {}".format(dims), flush=True)
                 del shm
-            except Exception:
+            except (RuntimeError, ValueError):
                 if self.rank == 0:
                     print(
                         "unsuccessful creation with shape {}".format(dims), flush=True
@@ -256,7 +292,7 @@ def test_shape(self):
                 if self.rank == 0:
                     print("unsuccessful rejection of shape {}".format(dims), flush=True)
                 del shm
-            except Exception:
+            except (RuntimeError, ValueError):
                 if self.rank == 0:
                     print("successful rejection of shape {}".format(dims), flush=True)
 