@@ -175,11 +175,14 @@ def async_save_tf_savables(
175175 When this call returns, `value_map` can be safely mutated, but saving to `dir` will not
176176 complete unless the returned future is set.
177177 """
178+ logging .info ("******* DEBUG Saving TF savables to %s async" , dir )
178179 # pylint: disable-next=consider-using-with
179180 f = tempfile .TemporaryDirectory ()
180181 for path , value in utils .flatten_items (value_map ):
181182 tf_checkpoint = tf .train .Checkpoint (value )
183+ logging .info ("******* DEBUG Writing %s to path %s" , f .name , path )
182184 tf_checkpoint .write (os .path .join (f .name , path ))
185+ logging .info ("******* DEBUG Done writing %s to path %s" , f .name , path )
183186 return executor .submit (_upload_dir , f , dst_dir = dir )
184187
185188
@@ -399,6 +402,7 @@ def __init__(self, cfg: Config):
399402 # TODO(markblee): Consider making BoundedDataShardedAsyncCheckpointManager
400403 # the default once stable.
401404 if cfg .max_concurrent_gb is not None or cfg .max_data_shard_degree :
405+ logging .info ("******* DEBUG Using BoundedDataShardedAsyncCheckpointManager" )
402406 self ._manager = BoundedDataShardedAsyncCheckpointManager (
403407 max_concurrent_gb = cfg .max_concurrent_gb ,
404408 timeout_secs = cfg .timeout_secs ,
@@ -411,6 +415,7 @@ def __init__(self, cfg: Config):
411415 f"shard_threshold_bytes is set to { cfg .shard_threshold_bytes } , but "
412416 "max_data_shard_degree is not set. It will not take any effect."
413417 )
418+ logging .info ("******* DEBUG Using GlobalAsyncCheckpointManager" )
414419 self ._manager = GlobalAsyncCheckpointManager (timeout_secs = cfg .timeout_secs )
415420 if cfg .max_concurrent_restore_gb is not None and cfg .max_concurrent_restore_gb <= 0 :
416421 raise ValueError (
@@ -514,8 +519,12 @@ def save_to_dir(
514519 logging .info ("Creating directories: %s" , dirs )
515520 list (self ._executor .map (fs .makedirs , dirs ))
516521 logging .info ("All directories created" )
522+
523+ logging .info ("******* DEBUG starting sync_global_devices" )
517524 # Wait for directory and index creation.
518525 multihost_utils .sync_global_devices (ckpt_dir )
526+ logging .info ("******* DEBUG finished sync_global_devices" )
527+
519528 # Each worker writes its tf checkpoints under a different path.
520529 save_tf_future = async_save_tf_savables (
521530 spec .tf_ckpt_map ,
@@ -527,6 +536,7 @@ def save_to_dir(
527536 )
528537
529538 def commit ():
539+ logging .info ("******* DEBUG starting on_commit_callback" )
530540 on_commit_callback (ckpt_dir = ckpt_dir , index = spec .index )
531541 logging .info (
532542 "Serialization of %s completed in %s seconds." ,
@@ -538,6 +548,9 @@ def commit():
538548 logging .debug (
539549 "array_values=%s tensorstore=%s" , utils .shapes (spec .gda_values ), spec .tensorstore_specs
540550 )
551+ logging .info (
552+ "array_values=%s tensorstore=%s" , utils .shapes (spec .gda_values ), spec .tensorstore_specs
553+ )
541554 self ._manager .serialize (
542555 spec .gda_values ,
543556 spec .tensorstore_specs ,
0 commit comments