pathways images finalized

lkolluru05 · lkolluru05 · commit 697380861545 · 2025-10-07T21:12:01.000Z
diff --git a/axlearn/cloud/gcp/bundler.py b/axlearn/cloud/gcp/bundler.py
@@ -48,7 +48,6 @@
 
 import os
 import subprocess
-import time
 from typing import Optional
 
 from absl import app, flags, logging
@@ -59,10 +58,10 @@
 from axlearn.cloud.common.bundler import register_bundler
 from axlearn.cloud.common.docker import registry_from_repo
 from axlearn.cloud.common.utils import canonicalize_to_list, to_bool
-from axlearn.cloud.gcp.cloud_build import get_cloud_build_status
+from axlearn.cloud.gcp.cloud_build import wait_for_cloud_build
 from axlearn.cloud.gcp.config import gcp_settings
 from axlearn.cloud.gcp.utils import common_flags
-from axlearn.common.config import REQUIRED, Required, config_class, maybe_set_config, config_for_class
+from axlearn.common.config import REQUIRED, Required, config_class, maybe_set_config
 
 FLAGS = flags.FLAGS
 
@@ -105,6 +104,8 @@ class Config(DockerBundler.Config):
 
         Attributes:
             colocated_image_required: Bool to build a colocated image
+            colocated_image_name: Name of the colocated image to build.
+            colocated_dockerfile: Dockerfile used to build the colocated image.
         """
         # Build image asynchronously.
         colocated_image_required: bool = False
@@ -129,7 +130,6 @@ def _build_and_push(self, *args, **kwargs):
             check=True,
         )
 
-        print("actual",cfg)
         actual_name = cfg.image
         actual_dockerfile=cfg.dockerfile
         actual_target=cfg.target
@@ -138,17 +138,14 @@ def _build_and_push(self, *args, **kwargs):
             cfg.dockerfile=cfg.colocated_dockerfile
             cfg.image=cfg.colocated_image_name
             cfg.target=None
-            print("updated config: ",cfg)
+            
             colocated_bundler_class = ColocatedArtifactRegistryBundler(cfg=cfg)
             colocated_image_name = colocated_bundler_class.bundle(tag="latest")
-            print(colocated_image_name)
         
             cfg.dockerfile=actual_dockerfile
             cfg.image=actual_name
             cfg.target=actual_target
-
-
-            
+  
         return super()._build_and_push(*args, **kwargs)
 
 
@@ -164,7 +161,6 @@ def from_spec(cls, spec: list[str], *, fv: Optional[flags.FlagValues]) -> Docker
 
     def _build_and_push(self, *args, **kwargs):
         cfg = self.config
-        print("colocated",cfg)
         subprocess.run(
             ["gcloud", "auth", "configure-docker", registry_from_repo(cfg.repo)],
             check=True,
@@ -296,36 +292,14 @@ def wait_until_finished(self, name: str, wait_timeout=3600):
             TimeoutError: If the build does not complete within the overall timeout.
             ValueError: If the async build fails.
         """
-        start_time = time.perf_counter()
         cfg: CloudBuildBundler.Config = self.config
-        while cfg.is_async:
-            elapsed_time = time.perf_counter() - start_time
-            if elapsed_time > wait_timeout:
-                timeout_msg = (
-                    f"Timed out waiting for CloudBuild to finish for more than "
-                    f"{wait_timeout} seconds."
-                )
-                logging.error(timeout_msg)
-                raise TimeoutError(timeout_msg)
-            try:
-                build_status = get_cloud_build_status(
-                    project_id=cfg.project, image_name=self.id(name), tags=[name]
-                )
-            except Exception as e:  # pylint: disable=broad-except
-                # TODO(liang-he,markblee): Distinguish transient from non-transient errors.
-                logging.warning("Failed to get the CloudBuild status, will retry: %s", e)
-            else:
-                if not build_status:
-                    logging.warning("CloudBuild for %s does not exist yet.", name)
-                elif build_status.is_pending():
-                    logging.info("CloudBuild for %s is pending: %s.", name, build_status)
-                elif build_status.is_success():
-                    logging.info("CloudBuild for %s is successful: %s.", name, build_status)
-                    return
-                else:
-                    # Unknown status is also considered a failure.
-                    raise RuntimeError(f"CloudBuild for {name} failed: {build_status}.")
-            time.sleep(30)
+        if cfg.is_async:
+            wait_for_cloud_build(
+                project_id=cfg.project,
+                image_id=self.id(name),
+                tags=[name],
+                wait_timeout=wait_timeout,
+            )
 
 
 def with_tpu_extras(bundler: Bundler.Config) -> Bundler.Config:
diff --git a/axlearn/cloud/gcp/pathways_utils.py b/axlearn/cloud/gcp/pathways_utils.py
@@ -21,7 +21,11 @@
     _LoadBalancer,
 )
 from axlearn.cloud.gcp.lws_utils import BaseLeaderWorkerTemplate, TPULeaderWorkerTemplate
-from axlearn.cloud.gcp.system_characteristics import USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS
+from axlearn.cloud.gcp.system_characteristics import (
+    GCE_MACHINE_TYPE_TO_MEMORY_CHARACTERISTICS,
+    USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS,
+    support_twisted_topology,
+)
 from axlearn.cloud.gcp.tpu import infer_tpu_workers
 from axlearn.cloud.gcp.utils import validate_jobset_name
 from axlearn.common.compiler_options import (
@@ -47,30 +51,20 @@
 # There is no guarantee that this image will work with newer Jax releases.
 # This image version extends GRPC timeout for long context models, based on jax-0.5.3-patch060625
 # This image extends GRPC timeout for long context models.
-_PATHWAYS_IMAGE_TAG = "disable_settings_20250701"
+_PATHWAYS_IMAGE_TAG = "2025-10-03"
 
 # The docker image used by pathways proxy container.
 # pylint: disable=line-too-long
-# _PATHWAYS_PROXY_IMAGE = "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/gke/ksadi/unsanitized_proxy_server_maxtext:latest"
 _PATHWAYS_PROXY_IMAGE = (
     # pylint: disable=line-too-long
-    "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/gke/ksadi/unsanitized_proxy_server:latest"
+    f"us-docker.pkg.dev/cloud-tpu-v2-images/pathways-colocated-python/proxy_server:{_PATHWAYS_IMAGE_TAG}"
 )
 # The docker image used by pathways resource manager container and worker container.
 _PATHWAYS_SERVER_IMAGE = (
     # pylint: disable=line-too-long
-    # "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/gke/ksadi/unsanitized_server@sha256:fde763e2bae514d0fa758840e501b71a9ea48781dddafa5d8ed3a0fa316fd1ae"
-    "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/gke/ksadi/unsanitized_server:latest"
-    # "us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/gke/ksadi/unsanitized_server_maxtext:latest"
-)
-_COLOCATED_PYTHON_IMAGE = (
-    # "gcr.io/cloud-tpu-multipod-dev/ksadi_sidecar_maxtext:latest"
-    # pylint: disable=line-too-long
-    #"us-docker.pkg.dev/cloud-tpu-multipod-dev/colocated-images/sam:v6"
-    "us-docker.pkg.dev/cloud-tpu-multipod-dev/axlearn/colocated-img13:latest"
-    # "us-docker.pkg.dev/cloud-tpu-multipod-dev/colocated-images/lk-colocated-image:latest"
-    # "gcr.io/cloud-tpu-multipod-dev/sujinesh_sidecar_debug@sha256:19abcd94addb6ff2749c299d6b0cc4748f27a4ab8759a18b466d0bdd3e5b71e8"
+    f"us-docker.pkg.dev/cloud-tpu-v2-images/pathways-colocated-python/server:{_PATHWAYS_IMAGE_TAG}"
 )
+
 # The container name of pathways resourcemanager.
 _PATHWAYS_RESOURCE_MANAGER_CONTAINER_NAME = "pathways-rm"
 # The container name of pathways proxy.
@@ -95,24 +89,19 @@
 
 def get_colocated_python_image(colocated_image_name, fv: flags.FlagValues = FLAGS) -> str:
     repo = gcp_settings("docker_repo", required=False, fv=fv)
-    print(repo)
-    print(colocated_image_name)
-    print(repo+colocated_image_name+":latest")
     return repo+"/"+colocated_image_name+":latest"
 
 
 def parse_xla_flag_value(value: str) -> Union[int, bool, str]:
-    """Attempts to convert an XLA flag string value to int, then bool.
+    """Attempts to convert an XLA flag string value to int.
 
     If conversion fails, returns the original string (stripped).
     """
-    bool_mapper = {"true": True, "false": False}
     stripped_value_str = value.strip()
     try:
         return int(stripped_value_str)
     except ValueError:
-        # Not an integer, try boolean conversion.
-        return bool_mapper.get(stripped_value_str.lower(), stripped_value_str)
+        return stripped_value_str
 
 
 def get_pathways_tpu_version(gke_machine_type: str) -> str:
@@ -162,6 +151,25 @@ def get_xla_options(
     """
     return {k: v for k, v in xla_options.items() if k.startswith("xla_")}
 
+def round_up_to_power_of_2(n):
+    """Rounds a positive integer up to the nearest power of 2.
+
+    Args:
+        n (int): The number to round up. Must be a positive integer; checked with an assert.
+
+    Returns:
+        int: The smallest power of 2 that is greater than or equal to n.
+
+    Examples:
+        round_up_to_power_of_2(7)   -> 8
+        round_up_to_power_of_2(8)   -> 8
+        round_up_to_power_of_2(9)   -> 16
+        round_up_to_power_of_2(32)  -> 32
+    """
+    assert isinstance(n, int) and n > 0
+    # (n - 1).bit_length() is the exponent of the smallest power of 2 that is >= n.
+    return 1 << (n - 1).bit_length()
+
 
 class PathwaysReplicatedJob(BaseReplicatedJob):
     """Builds a replicated jobspec for Pathways on TPU, to be used with JobSet API."""
@@ -311,7 +319,12 @@ def _build_pathways_head_container(self) -> dict:
         # In Jax 0.6.2 and beyond this flag can be renamed to
         # IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS as well.
         self._update_env_list(env_list, "TEST_UNDECLARED_OUTPUTS_DIR", "true")
-
+        # Threshold for using shared memory between the Jax client and the Pathways proxy.
+        # Set to 1 byte so that effectively every Jax device_put uses shared memory.
+        self._update_env_list(env_list, "IFRT_PROXY_LARGE_TRANSFER_THRESHOLD", "1")
+        self._update_env_list(
+            env_list, "IFRT_PROXY_LARGE_TRANSFER_OPTIMIZATION_DIRECTORY", "/tmp/ifrt_proxy"
+        )
         env_list.append(
             {
                 "name": "HOST_ADDRESS",
@@ -351,10 +364,13 @@ def _build_pathways_head_container(self) -> dict:
         mem_req = f"{self.config.pathways_head_mem}Gi"
         resources = {
             "requests": {"cpu": cpu_req, "memory": mem_req},
-            "limits": {"cpu": cpu_req, "memory": mem_req},
         }
         head_container["resources"] = resources
 
+        volume_mounts = head_container.get("volumeMounts", [])
+        volume_mounts.append(dict(name="shared-memory", mountPath="/tmp/ifrt_proxy"))
+        head_container["volumeMounts"] = volume_mounts
+
         return head_container
 
     def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
@@ -384,10 +400,9 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
         ]
         cmd_args.extend(xla_flags_from_options(self._xla_options).split())
 
-        # This is required for GKE Workload Identity and Mac Jax Client support.
-        # TODO(samos123): Remove this once this becomes the default.
-        proxy_env = [{"name": "IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS", "value": "true"}]
-
+        instance_type = f"{pathways_tpu_version}:{system.topology}"
+        if support_twisted_topology(self._tpu_type):
+            instance_type = f"{instance_type}_untwisted"
         return [
             dict(
                 name=_PATHWAYS_PROXY_CONTAINER_NAME,
@@ -396,8 +411,21 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
                 # SideCar container is an init container with restartPolicy as "Always".
                 restartPolicy="Always",
                 args=cmd_args,
-                env=proxy_env,
+                env=[
+                    # This is required for GKE Workload Identity and Mac Jax Client support.
+                    # TODO(samos123): Remove this once this becomes the default.
+                    {"name": "IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS", "value": "true"},
+                    {"name": "XLA_FLAGS", "value": f"--xla_dump_to=/output/{cfg.name}/xla"},
+                    {
+                        "name": "IFRT_PROXY_LARGE_TRANSFER_OPTIMIZATION_DIRECTORY",
+                        "value": "/tmp/ifrt_proxy",
+                    },
+                ],
                 ports=[dict(containerPort=_PATHWAYS_PROXY_PORT)],
+                volumeMounts=[
+                    dict(name="shared-output", mountPath="/output"),
+                    dict(name="shared-memory", mountPath="/tmp/ifrt_proxy"),
+                ],
             ),
             dict(
                 name=_PATHWAYS_RESOURCE_MANAGER_CONTAINER_NAME,
@@ -415,17 +443,18 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
                     f"--server_port={_PATHWAYS_RESOURCE_MANAGER_PORT}",
                     "--node_type=resource_manager",
                     f"--instance_count={pathways_instance_count}",
-                    f"--instance_type={pathways_tpu_version}:{system.topology}",
+                    f"--instance_type={instance_type}",
                     f"--gcs_scratch_location={staging_location}",
                 ],
+                volumeMounts=[dict(name="shared-output", mountPath="/output")],
             ),
         ]
 
     def _colocated_python_container(self):
         cfg: PathwaysReplicatedJob.Config = self.config
         return dict(
             name=_COLOCATED_PYTHON_SIDECAR_NAME,
-            image=get_colocated_python_image(cfg.colocated_image), #_COLOCATED_PYTHON_IMAGE,
+            image=get_colocated_python_image(cfg.colocated_image),
             restartPolicy="Always",
             env=[
                 {
@@ -450,6 +479,7 @@ def _build_pathways_head_pod(self) -> Nested[Any]:
             labels.update({BASTION_JOB_VERSION_LABEL: os.environ.get(BASTION_JOB_VERSION_ENV_VAR)})
 
         volumes.append(dict(name="shared-output", emptyDir={}))
+        volumes.append(dict(name="shared-memory", emptyDir=dict(medium="Memory")))
 
         if cfg.gcsfuse_mount:
             annotations.update(
@@ -466,7 +496,11 @@ def _build_pathways_head_pod(self) -> Nested[Any]:
         }
 
         head_container = self._build_pathways_head_container()
-        init_containers = self._build_pathways_head_sidecar_containers()
+        init_containers = [
+            *self._build_pathways_head_sidecar_containers(),
+            # pylint: disable-next=protected-access
+            self._inner._build_uploader_container(),
+        ]
 
         # Hardcode metadata.google.internal ip address to avoid transient DNS resolution issue.
         metadata_host_alias = dict(
@@ -524,6 +558,8 @@ def _build_pathways_worker_container(
     ) -> dict:
         """Build the container for the 'pathways-worker' role."""
         cfg: TPUReplicatedJob.Config = self._inner.config
+        system = USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS[self._tpu_type]
+        host_memory = GCE_MACHINE_TYPE_TO_MEMORY_CHARACTERISTICS[system.gce_machine_type]
         # pylint: disable-next=protected-access
         container = self._inner._build_container()
 
@@ -583,9 +619,9 @@ def _build_pathways_worker_container(
             # Recycling host memory gives a slight increase in performance.
             "--tpu_pinned_host_allocation_recycle=true",
             # The flag below is needed for better H2D performance.
-            # Rule of thumb: 3x the shard size. So 128GB to be safe.
-            # Decrease if you start running out of host memory on TPU VMs.
-            "--tpu_premapped_buffer_size=137438953472",
+            # Use 1/4 of the host memory, rounded up to a power of 2, as the premapped buffer.
+            # Note that the pathways worker requires this flag to be a power of 2.
+            f"--tpu_premapped_buffer_size={round_up_to_power_of_2(host_memory//4)*(1<<30)}",
         ]
         mega_scale_args = xla_flags_from_options(self._mxla_options).split()
         worker_container["args"].extend(mega_scale_args)
@@ -608,6 +644,7 @@ def _build_pathways_worker_pod(
     ) -> Nested[Any]:
         """Converts a worker pod to a new pod for the 'pathways-workers' role."""
         cfg: TPUReplicatedJob.Config = self._inner.config
+        pathways_cfg: PathwaysReplicatedJob.Config = self.config
         # pylint: disable-next=protected-access
         pod = self._inner._build_pod()
         worker_pod = copy.deepcopy(pod)
@@ -623,7 +660,9 @@ def _build_pathways_worker_pod(
         pod_spec["containers"] = [
             self._build_pathways_worker_container(pathways_worker_replicated_job_index)
         ]
-        pod_spec["initContainers"] = [self._colocated_python_container()]
+
+        if pathways_cfg.colocated_image:
+            pod_spec["initContainers"] = [self._colocated_python_container()]
 
         worker_pod["spec"] = pod_spec
 
@@ -965,7 +1004,7 @@ def _build_head_container(self) -> dict:
         }
         return dict(
             name=cfg.name,
-            image=self._bundler.id(cfg.name),
+            image=cfg.image_id or self._bundler.id(cfg.name),
             command=["bash", "-c", cfg.command],
             env=[
                 {
diff --git a/colocated_commands.txt b/colocated_commands.txt
@@ -3,6 +3,7 @@
 
 export NAME=axlearn-img
 export COLOCATED_NAME=colocated-img
+export CKPT_BUCKET_NAME=<>
 
 axlearn gcp bundle --name=$NAME \
         --bundler_spec=allow_dirty=True \
@@ -27,4 +28,4 @@ axlearn gcp launch run --cluster=mlperf-v5p  \
         --bundler_spec=dockerfile=Dockerfile  \
         --bundler_spec=target=tpu \
         --colocated_image=$COLOCATED_NAME   \  
-        -- TPU_PREMAPPED_BUFFER_SIZE=34359738368   python3 test_benchmark.py --ckpt_path gs://cloud-tpu-multipod-dev-euw4/axlearn-fuji-v3-70b/checkpoints/step_00000100
+        -- TPU_PREMAPPED_BUFFER_SIZE=34359738368   python3 test_benchmark.py --ckpt_path $CKPT_BUCKET_NAME