From a26a868c38eeab4983025644ca730c5eca4e9338 Mon Sep 17 00:00:00 2001 From: Shaurya Date: Sun, 2 Mar 2025 22:05:40 +0000 Subject: [PATCH] Add startupPolicy: InOrder to the pathways containers along with streamz changes Minor edit to the interactive workload connection command --- src/xpk/commands/workload.py | 210 +++++++++++++++++------------------ 1 file changed, 105 insertions(+), 105 deletions(-) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 94aa7727e..db3c1744d 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -258,97 +258,9 @@ operator: "All" targetReplicatedJobs: - {args.targetReplicatedJob} + startupPolicy: + startupPolicyOrder: InOrder replicatedJobs: - - name: worker - replicas: {args.num_slices} - template: - metadata: - annotations: - alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool - labels: - xpk.google.com/workload: {args.workload} - spec: - backoffLimit: {backoff_limit} - completions: {system.vms_per_slice} - parallelism: {system.vms_per_slice} - template: - metadata: - annotations: - {storage_annotations} - spec: - terminationGracePeriodSeconds: {args.termination_grace_period_seconds} - serviceAccountName: {service_account} - containers: - - args: - {pathways_worker_args} - image: {args.server_image} - imagePullPolicy: Always - name: pathways-worker - ports: - - containerPort: 29001 - - containerPort: 8471 - - containerPort: 8080 - resources: - limits: - {resource_type}: {system.chips_per_vm} - securityContext: - privileged: true - volumeMounts: - - mountPath: /tmp - name: shared-tmp - {storage_volume_mounts} - env: - - name: PROJECT_ID - value: {args.project} - - name: LOCATION - value: {args.zone} - - name: CLUSTER_NAME - value: {args.cluster} - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: CONTAINER_NAME - value: "pathways-worker" - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - # Workaround for 
v6e - - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER - value: "false" - - name: MEGASCALE_NUM_SLICES - valueFrom: - fieldRef: - fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']" - - name: JOBSET_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] - - name: REPLICATED_JOB_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] - - name: MEGASCALE_SLICE_ID - valueFrom: - fieldRef: - fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']" - - name: MEGASCALE_COORDINATOR_ADDRESS - value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)" - {pathways_sidecar_container} - nodeSelector: - {accelerator_label} - {machine_label} - {autoprovisioning_args} - priorityClassName: {args.priority} - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - volumes: - - hostPath: - path: /tmp - type: DirectoryOrCreate - name: shared-tmp - {storage_volumes} - name: rm replicas: 1 template: @@ -365,6 +277,18 @@ - args: {pathways_rm_args} env: + - name: REPLICATED_JOB_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] + - name: JOBSET_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] + - name: HOST_ADDRESS + value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) + - name: TPU_SKIP_MDS_QUERY + value: "true" - name: PROJECT_ID value: {args.project} - name: LOCATION @@ -381,19 +305,6 @@ valueFrom: fieldRef: fieldPath: metadata.namespace - - name: REPLICATED_JOB_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] - - name: JOBSET_NAME - valueFrom: - fieldRef: - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] - - name: HOST_ADDRESS - value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) - - name: TPU_SKIP_MDS_QUERY - value: "true" - image: {args.server_image} 
imagePullPolicy: Always name: pathways-rm ports: @@ -454,6 +365,96 @@ nodeSelector: cloud.google.com/gke-nodepool: cpu-proxy-np {user_workload} + - name: worker + replicas: {args.num_slices} + template: + metadata: + annotations: + alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool + labels: + xpk.google.com/workload: {args.workload} + spec: + backoffLimit: {backoff_limit} + completions: {system.vms_per_slice} + parallelism: {system.vms_per_slice} + template: + metadata: + annotations: + {storage_annotations} + spec: + terminationGracePeriodSeconds: {args.termination_grace_period_seconds} + serviceAccountName: {service_account} + containers: + - args: + {pathways_worker_args} + image: {args.server_image} + imagePullPolicy: Always + name: pathways-worker + ports: + - containerPort: 29001 + - containerPort: 8471 + - containerPort: 8080 + resources: + limits: + {resource_type}: {system.chips_per_vm} + securityContext: + privileged: true + volumeMounts: + - mountPath: /tmp + name: shared-tmp + {storage_volume_mounts} + env: + - name: PROJECT_ID + value: {args.project} + - name: LOCATION + value: {args.zone} + - name: CLUSTER_NAME + value: {args.cluster} + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: CONTAINER_NAME + value: "pathways-worker" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + # Workaround for v6e + - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER + value: "false" + - name: MEGASCALE_NUM_SLICES + valueFrom: + fieldRef: + fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']" + - name: JOBSET_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] + - name: REPLICATED_JOB_NAME + valueFrom: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] + - name: MEGASCALE_SLICE_ID + valueFrom: + fieldRef: + fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']" + - name: 
MEGASCALE_COORDINATOR_ADDRESS + value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)" + {pathways_sidecar_container} + nodeSelector: + {accelerator_label} + {machine_label} + {autoprovisioning_args} + priorityClassName: {args.priority} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + volumes: + - hostPath: + path: /tmp + type: DirectoryOrCreate + name: shared-tmp + {storage_volumes} """ @@ -742,8 +743,7 @@ def workload_create(args) -> None: ' done! ******* ' ) xpk_print( - 'Steps to connect to the proxy: kubectl get pods | grep proxy ;' - ' kubectl port-forward 29000:29000; ' + f"Steps to connect to the proxy: kubectl get pods | grep {args.workload}-proxy-0 | awk '{{print $1}}' | xargs -I {{}} kubectl port-forward {{}} 29000:29000 &" ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;' " python -c 'import pathwaysutils; import jax; print(jax.devices())'" )