From 3931c1597a59fd40c284eb8e60a190137e90e91f Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Fri, 23 May 2025 19:16:03 +0000 Subject: [PATCH] Update Docker Resource Limits for Pathways Main Workload --- src/xpk/core/cluster.py | 16 ++++++++-------- src/xpk/core/docker_resources.py | 4 ++-- src/xpk/core/nodepool.py | 22 +++++++++++++++------- src/xpk/core/pathways.py | 20 ++++++++++++++++---- src/xpk/parser/cluster.py | 6 ++++++ 5 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 2bf428139..687b9409f 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -617,14 +617,14 @@ def update_cluster_with_workload_identity_if_necessary(args) -> int: if is_workload_identity_enabled_on_cluster(args): return 0 - cluster_update_return_code = ( - update_gke_cluster_with_workload_identity_enabled(args) - ) - if cluster_update_return_code > 0: - xpk_print( - 'Updating GKE cluster to enable Workload Identity Federation failed!' - ) - return cluster_update_return_code + # cluster_update_return_code = ( + # update_gke_cluster_with_workload_identity_enabled(args) + # ) + # if cluster_update_return_code > 0: + # xpk_print( + # 'Updating GKE cluster to enable Workload Identity Federation failed!' + # ) + # return cluster_update_return_code return 0 diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py index a95c55735..c946c7b41 100644 --- a/src/xpk/core/docker_resources.py +++ b/src/xpk/core/docker_resources.py @@ -34,8 +34,8 @@ def get_main_container_resources( Workload resources port as a YAML string """ # Resources requirements for Pathways workload containers are known. - resources_yaml = """cpu: "24" - memory: 100G""" + resources_yaml = f"""cpu: "60" + memory: 490G""" if args.use_pathways: return resources_yaml diff --git a/src/xpk/core/nodepool.py b/src/xpk/core/nodepool.py index 6c649b6b4..4609f9fb3 100644 --- a/src/xpk/core/nodepool.py +++ b/src/xpk/core/nodepool.py @@ -310,18 +310,26 @@ def run_gke_node_pool_create_command( create_commands.append(command) create_task_names.append(task) - desired_pw_cpu_node_pools = ['cpu-np'] + desired_pw_cpu_node_pools = ['cpu-np', 'highmem-cpu-np'] if args.enable_pathways: # Pathways needs CPU nodepools in addition to TPU nodepools for node_pool_name in desired_pw_cpu_node_pools: if node_pool_name in existing_node_pool_names: continue - command = ( - 'gcloud beta container node-pools create' - f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1' - f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling' - ' --min-nodes=1 --max-nodes=20' - ) + if node_pool_name == 'cpu-np': + command = ( + 'gcloud beta container node-pools create' + f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1' + f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling' + ' --min-nodes=1 --max-nodes=20' + ) + else: + command = ( + 'gcloud beta container node-pools create' + f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1' + f' --machine-type={args.pathways_highmem_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling' + ' --min-nodes=1 --max-nodes=20' + ) task = f'NodepoolCreate-{node_pool_name}' create_commands.append(command) create_task_names.append(task) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 6e238c01f..ec8238f0b 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -32,6 +32,14 @@ def add_pw_resource_flavors(args): spec: nodeLabels: cloud.google.com/gke-nodepool: cpu-np +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: highmem-cpu-user +spec: + nodeLabels: + cloud.google.com/gke-nodepool: highmem-cpu-np ---""" if args.enable_pathways: return resource_flavor_yaml @@ -47,7 +55,13 @@ def add_pw_resources_to_kueue(args): - name: "cpu" nominalQuota: 480 - name: "memory" - nominalQuota: 2000G""" + nominalQuota: 2000G + - name: highmem-cpu-user + resources: + - name: "cpu" + nominalQuota: 480 + - name: "memory" + nominalQuota: 4000G""" if args.enable_pathways: return resources_yaml return '' @@ -79,7 +93,7 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool: # Ensure the cluster and CPU nodepools were created with create-pathways all_node_pools = get_all_nodepools_programmatic(args) - desired_pw_cpu_node_pools = {'cpu-np'} + desired_pw_cpu_node_pools = {'cpu-np', 'highmem-cpu-np'} if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])): xpk_print( 'Cluster needs to be created with `xpk create-pathways` to run' @@ -263,8 +277,6 @@ def get_user_workload_for_pathways( spec: containers: {container} - nodeSelector: - cloud.google.com/gke-nodepool: cpu-np hostNetwork: true dnsPolicy: ClusterFirstWithHostNet restartPolicy: Never diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index 11aee9c22..7b7a469bf 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -528,6 +528,12 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser): default='n2-standard-64', help='The CPU type for Pathways CPU nodepools', ) + parser.add_argument( + '--pathways-highmem-gce-machine-type', + type=str, + default='c4-highmem-192', + help='The highmem CPU type for Pathways CPU nodepools', + ) parser.add_argument( '--default-pool-cpu-machine-type', type=str,