From 7c9d7d8bc55db72c887334c17c6ee6e591e9f51a Mon Sep 17 00:00:00 2001 From: zhengchenyu Date: Wed, 24 Sep 2025 18:49:02 +0800 Subject: [PATCH] Avoiding socket file conflicts --- dlrover/python/common/multi_process.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dlrover/python/common/multi_process.py b/dlrover/python/common/multi_process.py index 900bd40dc..c2888b96a 100644 --- a/dlrover/python/common/multi_process.py +++ b/dlrover/python/common/multi_process.py @@ -26,6 +26,7 @@ import _posixshmem +from . import env_utils from .constants import NodeEnv from .log import default_logger as logger @@ -217,10 +218,11 @@ def _create_socket_path(self): """Create a file path for the local socket.""" fname = self.__class__.__name__.lower() + "_" + self._name + ".sock" job_name = os.getenv(NodeEnv.TORCHELASTIC_RUN_ID, "") + node_rank = env_utils.get_node_rank() if job_name: - root_dir = os.path.join(SOCKET_TMP_DIR, job_name) + root_dir = os.path.join(SOCKET_TMP_DIR, job_name, str(node_rank)) else: - root_dir = SOCKET_TMP_DIR + root_dir = os.path.join(SOCKET_TMP_DIR, str(node_rank)) os.makedirs(root_dir, exist_ok=True) return os.path.join(root_dir, fname)