File tree Expand file tree Collapse file tree 3 files changed +16
-6
lines changed Expand file tree Collapse file tree 3 files changed +16
-6
lines changed Original file line number Diff line number Diff line change 2525
2626          sudo apt-get install -y protobuf-compiler 
2727
28+           pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 
2829          pip install .[dev] -v 
2930
3031          pip install -r docs/requirements.txt 
Original file line number Diff line number Diff line change 1010    init_device_mesh ,
1111    ProcessGroup  as  BaseProcessGroup ,
1212)
13+ from  torch .distributed ._mesh_layout  import  _MeshLayout 
1314from  torch .distributed .tensor .device_mesh  import  _mesh_resources 
1415
1516from  torchft .manager  import  Manager 
@@ -69,12 +70,15 @@ def __init__(
6970        self .replicate_dim_name : str  =  mesh_dim_names [replicate_dim ]
7071        self .parent  =  parent 
7172        self .flatten_meshes : Dict [str , DeviceMesh ] =  {}
73+         self ._flatten_mapping : Dict [str , "DeviceMesh" ] =  {}
7274        self ._device_type : str 
7375        if  mesh  is  not   None :
7476            self ._device_type  =  mesh .device_type 
77+             self ._layout : _MeshLayout  =  mesh ._layout 
7578        else :
7679            assert  parent  is  not   None 
7780            self ._device_type  =  parent .device_type 
81+             self ._layout : _MeshLayout  =  parent ._layout 
7882        self ._flatten_mesh_list : tuple [DeviceMesh , ...] =  tuple ()
7983        self ._thread_id : Optional [int ] =  None 
8084        self ._hash : Optional [int ] =  None 
Original file line number Diff line number Diff line change @@ -1253,14 +1253,19 @@ def _assert_same_stream(self) -> None:
12531253    def  wait (self , timeout : Optional [timedelta ] =  None ) ->  bool :
12541254        self ._assert_same_stream ()
12551255
1256-         with  get_stream_context (self ._stream ):
1257-             self ._work .wait ()
1258-             self ._set_future_callback ()
1256+         try :
1257+             with  get_stream_context (self ._stream ):
1258+                 self ._work .wait ()
1259+                 self ._set_future_callback ()
12591260
1260-         with  get_stream_context (self ._stream ):
1261-             self ._managed_fut_tail .wait ()
1261+              with  get_stream_context (self ._stream ):
1262+                  self ._managed_fut_tail .wait ()
12621263
1263-         return  True 
1264+             return  True 
1265+         except  Exception  as  e :
1266+             self ._manager ._logger .exception (f"got exception waiting for work { e }  " )
1267+             self ._manager .report_error (e )
1268+             return  False 
12641269
12651270    def  block_current_stream (self , timeout : Optional [timedelta ] =  None ) ->  None :
12661271        self ._assert_same_stream ()
    
 
   
 
     
   
   
          
     
  
    
     
 
    
      
     
 
     
    You can’t perform that action at this time.
  
 
    
  
     
    
      
        
     
 
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments