
Commit 97b3707

Update params to work for number of devices
1 parent 035d471 commit 97b3707

File tree: 12 files changed, +253 −27 lines

Dockerfile

Lines changed: 9 additions & 1 deletion
@@ -88,7 +88,15 @@ ARG EXTRAS=
 ENV UV_FIND_LINKS=https://storage.googleapis.com/jax-releases/libtpu_releases.html
 # Ensure we install the TPU version, even if building locally.
 # Jax will fallback to CPU when run on a machine without TPU.
-RUN uv pip install --prerelease=allow .[core,tpu] && uv cache clean
+COPY libtpu.so /root/libtpu.so
+RUN uv pip install --prerelease=allow .[core,gcp,tpu] && uv cache clean
+RUN uv pip install libtpu==0.0.14
+
+# Add this line to print the installed version of libtpu.
+RUN pip show libtpu | grep Version
+RUN pip show jax | grep Version
+RUN pip show jaxlib | grep Version
+
 RUN if [ -n "$EXTRAS" ]; then uv pip install .[$EXTRAS] && uv cache clean; fi
 COPY . .

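The image now bundles a local libtpu.so, adds the gcp extra, pins libtpu==0.0.14, and prints the installed libtpu/jax/jaxlib versions at build time. A minimal runtime probe (not part of this commit; the /root/libtpu.so path mirrors the COPY above, everything else is a generic check) to confirm JAX actually sees TPU devices rather than silently falling back to CPU:

# tpu_probe.py -- illustrative sanity check only.
import os

os.environ.setdefault("TPU_LIBRARY_PATH", "/root/libtpu.so")  # Matches the COPY destination.

import jax  # noqa: E402  # Imported after setting the env var so the bundled libtpu is picked up.

print("jax:", jax.__version__, "backend:", jax.default_backend())
print("device_count:", jax.device_count())  # Reports CPU devices if no TPU is attached.
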
axlearn/cloud/gcp/jobs/launch.py

Lines changed: 1 addition & 1 deletion
@@ -780,6 +780,6 @@ def _wrapped_usage(
 
 
 if __name__ == "__main__":
-    configure_logging(logging.INFO)
+    configure_logging(logging.DEBUG)
     _private_flags()
     app.run(main)

axlearn/cloud/gcp/jobset_utils.py

Lines changed: 13 additions & 15 deletions
@@ -4,7 +4,6 @@
 
 import io
 import logging
-import math
 import os
 from dataclasses import dataclass
 from typing import Any, Optional, Sequence
@@ -27,10 +26,7 @@
 )
 from axlearn.cloud.gcp.config import gcp_settings
 from axlearn.cloud.gcp.node_pool import PRE_PROVISIONER_LABEL
-from axlearn.cloud.gcp.system_characteristics import (
-    GCE_MACHINE_TYPE_TO_MEMORY_CHARACTERISTICS,
-    USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS,
-)
+from axlearn.cloud.gcp.system_characteristics import USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS
 from axlearn.cloud.gcp.tpu import get_default_env, infer_tpu_workers
 from axlearn.cloud.gcp.utils import validate_jobset_name
 from axlearn.common.compiler_options import infer_tpu_type
@@ -451,15 +447,17 @@ def _build_container(self) -> Nested[Any]:
         if cfg.enable_tpu_ici_resiliency is not None:
             env_vars["ENABLE_ICI_RESILIENCY"] = str(cfg.enable_tpu_ici_resiliency).lower()
 
+        env_vars["TPU_LIBRARY_PATH"] = "/root/libtpu.so"
+
         resources = {"limits": {"google.com/tpu": system.chips_per_vm}}
-        # Set request memory by host machine type.
-        machine_memory_gi = GCE_MACHINE_TYPE_TO_MEMORY_CHARACTERISTICS.get(
-            system.gce_machine_type, None
-        )
-        if machine_memory_gi is not None:
-            request_memory_gi = machine_memory_gi * _MEMORY_REQUEST_PERCENTAGE
-            resources["limits"]["memory"] = f"{machine_memory_gi}Gi"
-            resources["requests"] = {"memory": f"{math.floor(request_memory_gi)}Gi"}
+        # # Set request memory by host machine type.
+        # machine_memory_gi = GCE_MACHINE_TYPE_TO_MEMORY_CHARACTERISTICS.get(
+        #     system.gce_machine_type, None
+        # )
+        # if machine_memory_gi is not None:
+        #     request_memory_gi = machine_memory_gi * _MEMORY_REQUEST_PERCENTAGE
+        #     resources["limits"]["memory"] = f"{machine_memory_gi}Gi"
+        #     resources["requests"] = {"memory": f"{math.floor(request_memory_gi)}Gi"}
 
         k8s_env_vars = [dict(name=k, value=str(v)) for k, v in env_vars.items()]
         k8s_env_vars.append(
@@ -509,8 +507,8 @@ def _build_uploader_container(
     interval_s = 60
     sync_command = f"while true; do gsutil -m rsync -r {src} {dst}; sleep {interval_s}; done"
     resources = {
-        "requests": {"cpu": "100m", "memory": "128Mi"},
-        "limits": {"cpu": "500m", "memory": "256Mi"},
+        # "requests": {"cpu": "100m", "memory": "128Mi"},
+        # "limits": {"cpu": "500m", "memory": "256Mi"},
     }
     return dict(
         name="output-uploader",

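The container builder now injects TPU_LIBRARY_PATH pointing at the bundled libtpu and drops the host-memory requests/limits, leaving only the TPU chip limit. A rough sketch of the resulting resources/env structures (illustrative only; chips_per_vm=4 is a placeholder, not taken from the commit):

# Mirrors the dict shapes built in _build_container above.
chips_per_vm = 4  # Placeholder value.
env_vars = {"TPU_LIBRARY_PATH": "/root/libtpu.so"}

resources = {"limits": {"google.com/tpu": chips_per_vm}}  # No memory requests/limits anymore.
k8s_env_vars = [dict(name=k, value=str(v)) for k, v in env_vars.items()]

print(resources)
print(k8s_env_vars)  # [{'name': 'TPU_LIBRARY_PATH', 'value': '/root/libtpu.so'}]
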
axlearn/cloud/gcp/tpu.py

Lines changed: 3 additions & 1 deletion
@@ -13,12 +13,14 @@
 
 def get_default_env(*, tpu_type: str, num_tpu_slices: int, job_name: str) -> dict[str, Any]:
     """Gets the default environment for TPU pods."""
+    del job_name  # Unused.
     return dict(
         # Use a large refresh to mitigate DNS timeout issues until tf>2.12 upgrade.
         GCS_RESOLVE_REFRESH_SECS=600,
         TPU_TYPE=tpu_type,
         NUM_TPU_SLICES=num_tpu_slices,
-        XLA_FLAGS=f"--xla_dump_to=/output/{job_name}/xla",
+        XLA_FLAGS="",
+        # XLA_FLAGS=f"--xla_dump_to=/output/{job_name}/xla",
         TF_CPP_MIN_LOG_LEVEL=0,
         # Necessary for surfacing FATAL TPU errors.
         TPU_STDERR_LOG_LEVEL=0,

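get_default_env now ignores job_name and ships an empty XLA_FLAGS, so per-job XLA dumping under /output/{job_name}/xla is disabled. A usage sketch, assuming axlearn is importable in your environment and with placeholder argument values:

from axlearn.cloud.gcp.tpu import get_default_env

env = get_default_env(tpu_type="v5p-8", num_tpu_slices=1, job_name="ignored")  # Placeholders.
assert env["XLA_FLAGS"] == ""  # The XLA dump directory is no longer set.
assert env["NUM_TPU_SLICES"] == 1
print(env["TPU_TYPE"])
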
axlearn/common/array_serialization.py

Lines changed: 13 additions & 0 deletions
@@ -559,21 +559,34 @@ def serialize(
 
         # pylint: disable-next=redefined-outer-name
         async def _run_serializer():
+            logging.info(
+                "******* DEBUG GlobalAsyncCheckpointManager _run_serializer "
+                "with number of commit_futures: %s",
+                len(commit_futures),
+            )
             future_writer = jax.tree.map(
                 serialization.async_serialize, arrays, tensorstore_specs, commit_futures
             )
+            logging.info("******* DEBUG GlobalAsyncCheckpointManager _run_serializer Completed")
             return await asyncio.gather(*future_writer)
 
+        # Is this the problem?
+        logging.info("******* DEBUG Starting to run _run_serializer")
+
         # Note: We need to run the coroutine in another event loop driven by a separate thread.
         # The current event loop might be already running an async function when `serialize` is
         # invoked from a coroutine, in which case asyncio.get_running_loop().run_until_complete()
         # would not be able to execute another coroutine to completion.
         asyncio.run_coroutine_threadsafe(_run_serializer(), self._loop).result()
 
+        logging.info("******* DEBUG Starting to run _run_serializer")
+
         self._add_futures(
             jax.tree_util.tree_flatten(commit_futures)[0] + (additional_futures or [])
         )
 
+        logging.info("******* DEBUG Starting to run async_commit")
+
         # Used in wait_until_finished to check on process != 0, if the checkpoint
         # has finished writing.
         self._start_async_commit(on_commit_callback)

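The added logging brackets the call that drives _run_serializer on a separate event-loop thread. That pattern, reduced to a standalone example (nothing below comes from axlearn beyond the asyncio.run_coroutine_threadsafe call itself):

import asyncio
import threading

# A background event loop driven by its own thread, as in the manager's self._loop.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def _work():
    await asyncio.sleep(0.1)  # Stand-in for gathering async serialization futures.
    return "done"

# .result() blocks the calling thread until the coroutine completes on `loop`,
# even if the caller is itself already inside a running event loop.
print(asyncio.run_coroutine_threadsafe(_work(), loop).result())
loop.call_soon_threadsafe(loop.stop)
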
axlearn/common/checkpointer.py

Lines changed: 13 additions & 0 deletions
@@ -175,11 +175,14 @@ def async_save_tf_savables(
     When this call returns, `value_map` can be safely mutated, but saving to `dir` will not
     complete unless the returned future is set.
     """
+    logging.info("******* DEBUG Saving TF savables to %s async", dir)
     # pylint: disable-next=consider-using-with
     f = tempfile.TemporaryDirectory()
     for path, value in utils.flatten_items(value_map):
         tf_checkpoint = tf.train.Checkpoint(value)
+        logging.info("******* DEBUG Writing %s to path %s", f.name, path)
         tf_checkpoint.write(os.path.join(f.name, path))
+        logging.info("******* DEBUG Done writing %s to path %s", f.name, path)
     return executor.submit(_upload_dir, f, dst_dir=dir)
 
 
@@ -399,6 +402,7 @@ def __init__(self, cfg: Config):
         # TODO(markblee): Consider making BoundedDataShardedAsyncCheckpointManager
         # the default once stable.
         if cfg.max_concurrent_gb is not None or cfg.max_data_shard_degree:
+            logging.info("******* DEBUG Using BoundedDataShardedAsyncCheckpointManager")
             self._manager = BoundedDataShardedAsyncCheckpointManager(
                 max_concurrent_gb=cfg.max_concurrent_gb,
                 timeout_secs=cfg.timeout_secs,
@@ -411,6 +415,7 @@ def __init__(self, cfg: Config):
                 f"shard_threshold_bytes is set to {cfg.shard_threshold_bytes}, but "
                 "max_data_shard_degree is not set. It will not take any effect."
             )
+            logging.info("******* DEBUG Using GlobalAsyncCheckpointManager")
             self._manager = GlobalAsyncCheckpointManager(timeout_secs=cfg.timeout_secs)
         if cfg.max_concurrent_restore_gb is not None and cfg.max_concurrent_restore_gb <= 0:
             raise ValueError(
@@ -514,8 +519,12 @@ def save_to_dir(
         logging.info("Creating directories: %s", dirs)
         list(self._executor.map(fs.makedirs, dirs))
         logging.info("All directories created")
+
+        logging.info("******* DEBUG starting sync_global_devices")
         # Wait for directory and index creation.
         multihost_utils.sync_global_devices(ckpt_dir)
+        logging.info("******* DEBUG finished sync_global_devices")
+
         # Each worker writes its tf checkpoints under a different path.
         save_tf_future = async_save_tf_savables(
             spec.tf_ckpt_map,
@@ -527,6 +536,7 @@ def save_to_dir(
         )
 
         def commit():
+            logging.info("******* DEBUG starting on_commit_callback")
             on_commit_callback(ckpt_dir=ckpt_dir, index=spec.index)
             logging.info(
                 "Serialization of %s completed in %s seconds.",
@@ -538,6 +548,9 @@ def commit():
         logging.debug(
             "array_values=%s tensorstore=%s", utils.shapes(spec.gda_values), spec.tensorstore_specs
         )
+        logging.info(
+            "array_values=%s tensorstore=%s", utils.shapes(spec.gda_values), spec.tensorstore_specs
+        )
         self._manager.serialize(
             spec.gda_values,
             spec.tensorstore_specs,

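The new DEBUG logs bracket the local tf.train.Checkpoint.write into a temporary directory before the upload future is submitted. A minimal sketch of that write step (the variable names and the checkpointed value are placeholders; only the Checkpoint.write call mirrors the code above):

import os
import tempfile

import tensorflow as tf

tmp = tempfile.TemporaryDirectory()
ckpt = tf.train.Checkpoint(v=tf.Variable(1.0))  # Placeholder savable.
prefix = os.path.join(tmp.name, "state")
ckpt.write(prefix)  # Local write; the upload to the checkpoint dir happens in the executor.
print(sorted(os.listdir(tmp.name)))  # e.g. ['state.data-00000-of-00001', 'state.index']
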
axlearn/common/compiler_options.py

Lines changed: 31 additions & 3 deletions
@@ -59,6 +59,12 @@ def default_xla_options(
         # further if you see "Allocator failed to allocate". A feature
         # to dynamically allocate may come later: b/380514965
         megascale_grpc_premap_memory_bytes=17179869184,
+        # DEBUGGING ONLY: RapidEye output directory for debugging purposes,
+        megascale_rapid_eye_error_digest_log_path="/output/rapideye/",
+        # megascale_jax_offset_launch_id_by_module_name="false",
+        # megascale_jax_use_device_set_based_launch_id="false",
+        # enable megascale debug port.
+        megascale_debug_port=8081,
         # Flag controlling the maximum number of overlapping host offloadings.
         xla_tpu_host_transfer_overlap_limit=24,
         # Flag controlling the maximum number of overlapping cross-DCN send/recv.
@@ -149,12 +155,20 @@ def default_xla_options(
         # Similar to megascale_error_reporter_abort_on_hang but for unrecoverable errors.
         megascale_error_reporter_abort_on_error="true",
         # Increase the timeout at which a hang is detected/reported, default is 5m.
-        megascale_graph_hang_threshold="10m",
+        megascale_graph_hang_threshold="60m",
         # Similar to megascale_graph_hang_threshold but specific to within a launch_id.
         # Default is 1m.
-        megascale_graph_within_launch_hang_threshold="10m",
+        megascale_graph_within_launch_hang_threshold="60m",
         # TODO(ethanli): temporary workaround to avoid memory leak in megascale.
         megascale_grpc_enable_xor_tracer="false",
+        # # The duration of missing heartbeats before shutting down.
+        # jax_heartbeat_timeout="100s",
+        # # JAX gRPC timeout duration.
+        # jax_rpc_timeout="120s",
+        # # JAX distributed initialization timeout.
+        # jax_distributed_initialization_timeout="3600s",
+        # # JAX shutdown timeout duration
+        # jax_distributed_shutdown_timeout="5m",
     )
 
     # Validate options. Will never fail if this function is implemented correctly.
@@ -163,7 +177,20 @@ def default_xla_options(
             int(v)
             continue
         except ValueError:
-            assert v in [True, False, "true", "false", "megachip_tccontrol", "10m"], (k, v)
+            assert v in [
+                True,
+                False,
+                "true",
+                "false",
+                "megachip_tccontrol",
+                "10m",
+                "60m",
+                "100s",
+                "120s",
+                "3600s",
+                "5m",
+                "/output/rapideye/",
+            ], (k, v)
 
     return options
 
@@ -302,6 +329,7 @@ def infer_xla_performance_flags(
     if current_configuration in mesh_configurations_for_sparse_core_offloading:
         flags = dict(
             # Must disable continuation fusion to enable sparse core offloading.
+            # AXLEARN TESTING NOTE: We are disabling this to test for SparseCore related issues.
            xla_tpu_enable_async_collective_fusion_fuse_all_gather="false",
            xla_tpu_enable_async_collective_fusion_fuse_all_reduce="false",
            xla_tpu_enable_async_collective_fusion_fuse_reduce_scatter="false",

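The commit adds a RapidEye log path and a megascale debug port, relaxes the hang thresholds to 60m, and widens the value whitelist in the validation loop to match. That validation logic, restated as a standalone loop over a few of the new values (the surrounding default_xla_options machinery is omitted):

options = dict(
    megascale_debug_port=8081,
    megascale_graph_hang_threshold="60m",
    megascale_rapid_eye_error_digest_log_path="/output/rapideye/",
    megascale_error_reporter_abort_on_error="true",
)
allowed = [True, False, "true", "false", "megachip_tccontrol",
           "10m", "60m", "100s", "120s", "3600s", "5m", "/output/rapideye/"]
for k, v in options.items():
    try:
        int(v)  # Numeric values (ports, byte counts, ...) pass unchecked.
        continue
    except ValueError:
        assert v in allowed, (k, v)  # Non-numeric values must be whitelisted.
print("all options valid")
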
axlearn/common/trainer.py

Lines changed: 2 additions & 2 deletions
@@ -626,7 +626,7 @@ def run(
                 )
                 self.vlog(3, "Done step %s", self.step)
                 num_steps += 1
-                if num_steps % 100 == 0:
+                if num_steps % 10 == 0:
                     now = time.perf_counter()
                     average_step_time = (now - start_time) / num_steps
                     self._step_log("Average step time: %s seconds", average_step_time)
@@ -1099,7 +1099,7 @@ def _run_step(
         # Run the compiled function.
         self._trainer_state, outputs = compiled_train_step_fn(self.trainer_state, input_batch)
 
-        if self.step % 100 == 0 or 0 <= self.step <= 5:
+        if self.step % 10 == 0 or 0 <= self.step <= 5:
             self._step_log(
                 "loss=%s aux=%s",
                 outputs["loss"],

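Both logging cadences drop from every 100 steps to every 10, so average step time and loss are reported ten times more often. The cadence pattern in isolation (all values below are illustrative, not from the trainer):

import time

start_time = time.perf_counter()
num_steps = 0
for _ in range(30):
    time.sleep(0.001)  # Stand-in for one training step.
    num_steps += 1
    if num_steps % 10 == 0:  # Previously % 100.
        average_step_time = (time.perf_counter() - start_time) / num_steps
        print(f"Average step time: {average_step_time:.4f} seconds")
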
axlearn/common/utils_spmd.py

Lines changed: 10 additions & 0 deletions
@@ -88,6 +88,16 @@ def setup(
             coordinator_address=distributed_coordinator,
             num_processes=num_processes,
             process_id=process_id,
+            # The duration of missing heartbeats before shutting down.
+            heartbeat_timeout="120s",
+            # JAX distributed initialization timeout.
+            initialization_timeout="3600s",
+            # JAX distributed shutdown timeout.
+            shutdown_timeout="3600s",
+            # RPC timeout.
+            rpc_timeout="3600s",
+            # RPC timeout for heartbeat.
+            coordinator_rpc_timeout="3600s",
         )
         if jax_backend == "gpu":
             # jax 0.4.34 introduced a change to cluster auto-detection behavior, supplying

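setup() now threads much longer heartbeat, initialization, shutdown, and RPC timeouts into the distributed runtime. A reduced sketch of a distributed JAX initialization with a longer init timeout; only the kwargs shown below are standard jax.distributed.initialize parameters, and the additional timeout kwargs added by this commit are assumed to be accepted by the JAX version pinned in this image:

import jax

# Blocks until all processes join (or the timeout expires); values are placeholders.
jax.distributed.initialize(
    coordinator_address="10.0.0.1:1234",  # Placeholder coordinator address.
    num_processes=2,
    process_id=0,
    initialization_timeout=3600,  # Seconds; mirrors the "3600s" setting above.
)
print(jax.process_count())
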