Skip to content

Commit 1d99280

Browse files
tushar00jain authored and facebook-github-bot committed
reset flight recorder trace (#283)
Summary:
- Call the Flight Recorder (FR) API to reset the trace after every quorum.
- We reset so that every quorum starts with a fresh FR trace, since the process groups (PGs) could have changed and the FR trace from previous errors has already been dumped.
- After every quorum, update the environment variable (`TORCH_FR_DUMP_TEMP_FILE`) that determines the FR dump file.

Differential Revision: D84260745
1 parent b3be7ad commit 1d99280

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

torchft/manager.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@
8888
# crash if call to quorum fails, all replicas will crash.
8989
QUORUM_RETRIES_ENV: str = "TORCHFT_QUORUM_RETRIES"
9090

91+
# Name of the environment variable that determines the Flight Recorder (FR)
# dump file. The manager captures its initial value at construction time and
# rewrites it with a per-quorum, per-replica value before reconfiguring the
# process group.
TORCH_FR_DUMP_TEMP_FILE_ENV: str = "TORCH_FR_DUMP_TEMP_FILE"
92+
9193
T = TypeVar("T")
9294

9395

@@ -223,6 +225,9 @@ def __init__(
223225
self._load_state_dict_fns: Dict[str, Callable[[object], None]] = {}
224226
self._user_state_dicts: Dict[str, Callable[[], object]] = {}
225227

228+
self._original_fr_dump_temp_file: Optional[str] = os.environ.get(
229+
TORCH_FR_DUMP_TEMP_FILE_ENV
230+
)
226231
self._replica_id = replica_id
227232

228233
# Protects state dict
@@ -666,8 +671,16 @@ def _async_quorum(
666671
# We use the replica rank and world as we want all replicas in the PG.
667672
try:
668673
with torch.profiler.record_function("torchft::manager::_pg::configure"):
674+
# Reset GPU state for Flight Recorder
669675
if torch.accelerator.is_available():
670676
torch.accelerator.synchronize()
677+
torch._C._distributed_c10d._reset_fr_recording_nccl()
678+
679+
if self._original_fr_dump_temp_file is not None:
680+
os.environ[TORCH_FR_DUMP_TEMP_FILE_ENV] = (
681+
f"{self._original_fr_dump_temp_file}_quorum_{quorum_id}/replica_{self._replica_id}_"
682+
)
683+
671684
self._pg.configure(
672685
store_prefixed_addr,
673686
self._replica_id if self._replica_id is not None else "0",

0 commit comments

Comments
 (0)