 import itertools
 from typing import Any, List, NamedTuple, Optional, Union
 
+import jax
+from absl import logging
 from jax.ad_checkpoint import checkpoint_policies as jax_remat_policies
 
 from axlearn.common import causal_lm, config
@@ -252,7 +254,6 @@ def get_trainer_kwargs(
     max_step = TOTAL_TOKENS[version][model_size] // tokens_per_batch
     max_sequence_length = MAX_SEQUENCE_LENGTH[version]
     train_batch_size = tokens_per_batch // max_sequence_length
-
     # Whether to use grouped query attention.
     num_kv_heads = None
     if version in (Version.V3, Version.V3_TIKTOKEN):
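The hunk above only rewires how these quantities are derived from one another, so the relationships are easy to sanity-check by hand. A minimal sketch with assumed token budgets (the concrete numbers are illustrative, not the values any Fuji config actually uses):

    # Illustrative values only; the real ones come from TOTAL_TOKENS and MAX_SEQUENCE_LENGTH.
    total_tokens = 2 * (1024**4)        # assumed ~2.2e12 training tokens
    tokens_per_batch = 4 * (1024**2)    # assumed 4M-token global batch
    max_sequence_length = 4096

    max_step = total_tokens // tokens_per_batch                  # 524_288 steps
    train_batch_size = tokens_per_batch // max_sequence_length   # 1_024 sequences per step
    assert train_batch_size * max_sequence_length == tokens_per_batch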
@@ -813,6 +814,67 @@ def get_trainer_kwargs(
             ),
         )
     elif model_size == "150B":
+        ##################################################################################
+        max_sequence_length = MAX_SEQUENCE_LENGTH[Version.V2]  # 4096
+
+        # model_parallelism * fsdp == num_chips_in_trillium (256)
+        model_parallelism = 4
+        fsdp = 64
+
+        current_pdbs = 0.5
+        train_batch_size = int(current_pdbs * len(jax.devices()))
+
+        # 16 * (1024**2) / 4096 = 4096
+        tokens_per_batch = int(train_batch_size * max_sequence_length)
+
+        # 32M tokens is the max global tokens we can train on.
+        # We must modify either the pdbs or the model sharding to accommodate 128 slices.
+        if tokens_per_batch > 32 * (1024**2):
+            tokens_per_batch = 32 * (1024**2)
+            # if we want to modify the pdbs:
+            # current_pdbs = 0.25
+
+            # otherwise we can modify the model sharding.
+            model_parallelism = 8
+            fsdp = 32
+
+        # 32M tokens is the max global tokens we can train on.
+        assert tokens_per_batch <= 32 * (1024**2)
+        assert fsdp * model_parallelism == 256
+
+        # 1 / model_parallelism = 1 / 4 = 0.25
+        min_pdbs = 1 / model_parallelism
+        max_pdbs = 1
+
+        # More than 1 pdbs causes an OOM.
+        assert current_pdbs < max_pdbs
+        assert current_pdbs >= min_pdbs
+
+        # maximum number of devices we can use this config on =
+        # train_batch_size // min_pdbs = 4096 / 0.25 = 16384
+        max_devices = int(train_batch_size // min_pdbs)
+
+        assert isinstance(train_batch_size, int)
+        assert isinstance(tokens_per_batch, int)
+
+        logging.info(
+            (
+                "******* DEBUGGING: max_sequence_length: %s, model_parallelism: %s,"
+                " fsdp: %s, current_pdbs: %s, train_batch_size: %s,"
+                " tokens_per_batch: %s, min_pdbs: %s, max_pdbs: %s, max_devices: %s"
+            ),
+            max_sequence_length,
+            model_parallelism,
+            fsdp,
+            current_pdbs,
+            train_batch_size,
+            tokens_per_batch,
+            min_pdbs,
+            max_pdbs,
+            max_devices,
+        )
+        ##################################################################################
+
         trainer_kwargs = dict(
             model_kwargs=dict(
                 num_layers=80,
@@ -828,8 +890,9 @@ def get_trainer_kwargs(
             learner_kwargs=dict(peak_lr=1.5e-4, weight_decay=0.1),
             max_sequence_length=max_sequence_length,
             train_batch_size=train_batch_size,
-            max_step=max_step,
-            mesh_shape=mesh_shape_from_axes(data=-1, fsdp=64, model=4),
+            max_step=100_000,  # max_step,
+            save_every_n_steps=100,
+            mesh_shape=mesh_shape_from_axes(data=-1, fsdp=fsdp, model=model_parallelism),
             mesh_rules=(
                 (
                     # Target per-device token count = 4k.
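For reference, the arithmetic the new 150B branch performs can be reproduced in isolation. The sketch below assumes 128 Trillium slices of 256 chips each (32,768 devices) and mirrors the branch's logic; `plan_150b` is a hypothetical helper written for illustration, not part of axlearn:

    # Hypothetical helper mirroring the 150B debug branch above (assumes 256-chip slices).
    def plan_150b(num_devices: int, pdbs: float = 0.5, seq_len: int = 4096) -> dict:
        model_parallelism, fsdp = 4, 64              # model * fsdp must equal one slice (256 chips)
        train_batch_size = int(pdbs * num_devices)
        tokens_per_batch = train_batch_size * seq_len
        if tokens_per_batch > 32 * (1024**2):        # 32M-token global cap
            tokens_per_batch = 32 * (1024**2)
            model_parallelism, fsdp = 8, 32          # reshard instead of lowering pdbs
        assert fsdp * model_parallelism == 256
        return dict(model=model_parallelism, fsdp=fsdp,
                    batch=train_batch_size, tokens=tokens_per_batch)

    # 128 slices x 256 chips: batch = 16_384, raw tokens = 64M (clamped to 32M), mesh -> model=8, fsdp=32.
    print(plan_150b(128 * 256))

As in the diff itself, the clamped token count only feeds the assertion; train_batch_size is passed through to trainer_kwargs unchanged.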
@@ -971,6 +1034,12 @@ def trainer_configs(
         if model_size not in TOTAL_TOKENS[version]:  # This combination does not exist.
             continue
         vocab_size = VOCAB_SIZE[version]
+        logging.info(
+            "******* DEBUGGING: version: %s, model_size: %s, flash_attention: %s",
+            version,
+            model_size,
+            flash_attention,
+        )
         config_name = make_config_name(
             arch=arch,
             model_size=model_size,