Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions src/xpk/core/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,14 +617,14 @@ def update_cluster_with_workload_identity_if_necessary(args) -> int:

if is_workload_identity_enabled_on_cluster(args):
return 0
cluster_update_return_code = (
update_gke_cluster_with_workload_identity_enabled(args)
)
if cluster_update_return_code > 0:
xpk_print(
'Updating GKE cluster to enable Workload Identity Federation failed!'
)
return cluster_update_return_code
# cluster_update_return_code = (
# update_gke_cluster_with_workload_identity_enabled(args)
# )
# if cluster_update_return_code > 0:
# xpk_print(
# 'Updating GKE cluster to enable Workload Identity Federation failed!'
# )
# return cluster_update_return_code

return 0

Expand Down
4 changes: 2 additions & 2 deletions src/xpk/core/docker_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def get_main_container_resources(
Workload resources port as a YAML string
"""
# Resource requirements for Pathways workload containers are known.
resources_yaml = """cpu: "24"
memory: 100G"""
resources_yaml = f"""cpu: "60"
memory: 490G"""
if args.use_pathways:
return resources_yaml

Expand Down
22 changes: 15 additions & 7 deletions src/xpk/core/nodepool.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,18 +310,26 @@ def run_gke_node_pool_create_command(
create_commands.append(command)
create_task_names.append(task)

desired_pw_cpu_node_pools = ['cpu-np']
desired_pw_cpu_node_pools = ['cpu-np', 'highmem-cpu-np']
if args.enable_pathways:
# Pathways needs CPU nodepools in addition to TPU nodepools
for node_pool_name in desired_pw_cpu_node_pools:
if node_pool_name in existing_node_pool_names:
continue
command = (
'gcloud beta container node-pools create'
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
' --min-nodes=1 --max-nodes=20'
)
if node_pool_name == 'cpu-np':
command = (
'gcloud beta container node-pools create'
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
' --min-nodes=1 --max-nodes=20'
)
else:
command = (
'gcloud beta container node-pools create'
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
f' --machine-type={args.pathways_highmem_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
' --min-nodes=1 --max-nodes=20'
)
task = f'NodepoolCreate-{node_pool_name}'
create_commands.append(command)
create_task_names.append(task)
Expand Down
20 changes: 16 additions & 4 deletions src/xpk/core/pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ def add_pw_resource_flavors(args):
spec:
nodeLabels:
cloud.google.com/gke-nodepool: cpu-np
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: highmem-cpu-user
spec:
nodeLabels:
cloud.google.com/gke-nodepool: highmem-cpu-np
---"""
if args.enable_pathways:
return resource_flavor_yaml
Expand All @@ -47,7 +55,13 @@ def add_pw_resources_to_kueue(args):
- name: "cpu"
nominalQuota: 480
- name: "memory"
nominalQuota: 2000G"""
nominalQuota: 2000G
- name: highmem-cpu-user
resources:
- name: "cpu"
nominalQuota: 480
- name: "memory"
nominalQuota: 4000G"""
if args.enable_pathways:
return resources_yaml
return ''
Expand Down Expand Up @@ -79,7 +93,7 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:

# Ensure the cluster and CPU nodepools were created with create-pathways
all_node_pools = get_all_nodepools_programmatic(args)
desired_pw_cpu_node_pools = {'cpu-np'}
desired_pw_cpu_node_pools = {'cpu-np', 'highmem-cpu-np'}
if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
xpk_print(
'Cluster needs to be created with `xpk create-pathways` to run'
Expand Down Expand Up @@ -263,8 +277,6 @@ def get_user_workload_for_pathways(
spec:
containers:
{container}
nodeSelector:
cloud.google.com/gke-nodepool: cpu-np
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
restartPolicy: Never
Expand Down
6 changes: 6 additions & 0 deletions src/xpk/parser/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,12 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
default='n2-standard-64',
help='The CPU type for Pathways CPU nodepools',
)
parser.add_argument(
'--pathways-highmem-gce-machine-type',
type=str,
default='c4-highmem-192',
help='The highmem CPU type for Pathways CPU nodepools',
)
parser.add_argument(
'--default-pool-cpu-machine-type',
type=str,
Expand Down
Loading