Commit 0c3ebb0

Merge pull request #6313 from kiryl-filatau:feature/kubernetes-scale-to-1-gpu
PiperOrigin-RevId: 859584009
2 parents 99a91b4 + 5597d07 commit 0c3ebb0

5 files changed (+116, -13 lines)

perfkitbenchmarker/container_service/kubernetes_cluster.py

Lines changed: 1 addition & 1 deletion

@@ -189,7 +189,7 @@ def _ModifyPodSpecPlacementYaml(
     del name
     node_selectors = self.GetNodeSelectors(machine_type)
     if node_selectors:
-      pod_spec_yaml['nodeSelector'].update(node_selectors)
+      pod_spec_yaml.setdefault('nodeSelector', {}).update(node_selectors)

   @property
   def _ingress_manifest_path(self) -> str:
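The setdefault() change above is the crux of this file: the old code assumed a nodeSelector key was already present in the pod spec. A minimal standalone sketch of the failure mode and the fix (the selector value is an assumed example, not taken from the commit):

# Sketch: a pod spec with no existing 'nodeSelector' key raised KeyError
# under the old code; setdefault() creates the key before updating it.
pod_spec = {'containers': []}  # no 'nodeSelector' key yet
selectors = {'karpenter.sh/nodepool': 'gpu'}  # assumed example value

# Old behavior: pod_spec['nodeSelector'].update(selectors)  -> KeyError
pod_spec.setdefault('nodeSelector', {}).update(selectors)
assert pod_spec['nodeSelector'] == {'karpenter.sh/nodepool': 'gpu'}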
perfkitbenchmarker/data/container/kubernetes_scale/aws-gpu-nodepool.yaml.j2

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: {{ gpu_nodepool_name | default('gpu') }}
+spec:
+  disruption:
+    consolidateAfter: {{ gpu_consolidate_after | default('1m') }}
+    consolidationPolicy: {{ gpu_consolidation_policy | default('WhenEmptyOrUnderutilized') }}
+  limits:
+    cpu: {{ gpu_nodepool_cpu_limit | default(1000) }}
+  template:
+    metadata:
+      labels:
+        pkb_nodepool: {{ gpu_nodepool_label | default('gpu') }}
+    spec:
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: {{ karpenter_ec2nodeclass_name | default('default') }}
+      requirements:
+        - key: kubernetes.io/arch
+          operator: In
+          values: {{ gpu_arch | default(['amd64']) }}
+        - key: kubernetes.io/os
+          operator: In
+          values: {{ gpu_os | default(['linux']) }}
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: {{ gpu_capacity_types | default(['on-demand']) }}
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values: {{ gpu_instance_categories | default(['g']) }}
+        - key: karpenter.k8s.aws/instance-family
+          operator: In
+          values: {{ gpu_instance_families | default(['g6','g6e']) }}
+      taints:
+        - key: {{ gpu_taint_key | default('nvidia.com/gpu') }}
+          effect: NoSchedule
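Every variable in this template is guarded by a Jinja2 default filter, so it renders to a complete NodePool even with no caller-supplied values. A minimal sketch, using the jinja2 library directly rather than PKB's own manifest machinery, of how the list-valued defaults come out as YAML flow sequences:

# Sketch: Jinja2 renders a Python list with its repr(), which YAML accepts
# as a flow sequence, so default(['g6', 'g6e']) yields a valid value.
import jinja2

tpl = jinja2.Template(
    "name: {{ gpu_nodepool_name | default('gpu') }}\n"
    "families: {{ gpu_instance_families | default(['g6', 'g6e']) }}"
)
print(tpl.render())
# name: gpu
# families: ['g6', 'g6e']
print(tpl.render(gpu_instance_families=['g5']))
# name: gpu
# families: ['g5']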

perfkitbenchmarker/data/container/kubernetes_scale/kubernetes_scale.yaml.j2

Lines changed: 12 additions & 0 deletions

@@ -20,6 +20,13 @@ spec:
       command: {{ Command }}
       {%- endif %}
       resources:
+        requests:
+          cpu: {{ CpuRequest }}
+          memory: {{ MemoryRequest }}
+          ephemeral-storage: {{ EphemeralStorageRequest }}
+          {%- if NvidiaGpuRequest %}
+          nvidia.com/gpu: {{ NvidiaGpuRequest }}
+          {%- endif %}
         limits:
           cpu: {{ CpuRequest }}
           memory: {{ MemoryRequest }}
@@ -53,3 +60,8 @@ spec:
         operator: "Exists"
        effect: "NoExecute"
         tolerationSeconds: {{ PodTimeout }}
+      {%- if GpuTaintKey %}
+      - key: {{ GpuTaintKey }}
+        operator: Exists
+        effect: NoSchedule
+      {%- endif %}
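The {%- if NvidiaGpuRequest %} guard keeps CPU-only renders identical to the old template; the GPU request line appears only when a value is passed. A minimal sketch of the two render paths, again using jinja2 directly with assumed values:

# Sketch: the nvidia.com/gpu line is emitted only when NvidiaGpuRequest is
# set, so CPU-only runs render the requests block unchanged.
import jinja2

snippet = jinja2.Template(
    "requests:\n"
    "  cpu: {{ CpuRequest }}\n"
    "{%- if NvidiaGpuRequest %}\n"
    "  nvidia.com/gpu: {{ NvidiaGpuRequest }}\n"
    "{%- endif %}"
)
print(snippet.render(CpuRequest='500m'))
# requests:
#   cpu: 500m
print(snippet.render(CpuRequest='500m', NvidiaGpuRequest=1))
# requests:
#   cpu: 500m
#   nvidia.com/gpu: 1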

perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py

Lines changed: 54 additions & 5 deletions

@@ -89,16 +89,48 @@ def GetConfig(user_config):
   return config


+def _IsEksKarpenterAwsGpu(cluster: container_service.KubernetesCluster) -> bool:
+  return bool(
+      virtual_machine.GPU_COUNT.value
+      and FLAGS.cloud.lower() == 'aws'
+      and getattr(cluster, 'CLUSTER_TYPE', None) == 'Karpenter'
+  )
+
+
+def _EnsureEksKarpenterGpuNodepool(
+    cluster: container_service.KubernetesCluster,
+) -> None:
+  """Ensures a GPU NodePool exists for EKS Karpenter before applying workloads."""
+  if not _IsEksKarpenterAwsGpu(cluster):
+    return
+  cluster.ApplyManifest(
+      'container/kubernetes_scale/aws-gpu-nodepool.yaml.j2',
+      gpu_nodepool_name='gpu',
+      gpu_nodepool_label='gpu',
+      karpenter_ec2nodeclass_name='default',
+      gpu_instance_categories=['g'],
+      gpu_instance_families=['g6', 'g6e'],
+      gpu_capacity_types=['on-demand'],
+      gpu_arch=['amd64'],
+      gpu_os=['linux'],
+      gpu_taint_key='nvidia.com/gpu',
+      gpu_consolidate_after='1m',
+      gpu_consolidation_policy='WhenEmptyOrUnderutilized',
+      gpu_nodepool_cpu_limit=1000,
+  )
+
+
 def Prepare(bm_spec: benchmark_spec.BenchmarkSpec):
   """Sets additional spec attributes."""
   bm_spec.always_call_cleanup = True
+  assert bm_spec.container_cluster
+  _EnsureEksKarpenterGpuNodepool(bm_spec.container_cluster)


 def _GetRolloutCreationTime(rollout_name: str) -> int:
   """Returns the time when the rollout was created."""
   out, _, _ = container_service.RunRetryableKubectlCommand([
-      'rollout',
-      'history',
+      'get',
       rollout_name,
       '-o',
       'jsonpath={.metadata.creationTimestamp}',
@@ -122,6 +154,7 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
   assert bm_spec.container_cluster
   cluster = bm_spec.container_cluster
   assert isinstance(cluster, container_service.KubernetesCluster)
+  cluster: container_service.KubernetesCluster = cluster

   # Warm up the cluster by creating a single pod. This compensates for
   # differences between Standard & Autopilot, where Standard already has 1 node
@@ -180,8 +213,10 @@ def ScaleUpPods(
   max_wait_time = _GetScaleTimeout()
   resource_timeout = max_wait_time + 60 * 5  # 5 minutes after waiting to avoid
   # pod delete events from polluting data collection.
-  yaml_docs = cluster.ConvertManifestToYamlDicts(
-      MANIFEST_TEMPLATE,
+
+  is_eks_karpenter_aws_gpu = _IsEksKarpenterAwsGpu(cluster)
+
+  manifest_kwargs = dict(
       Name='kubernetes-scaleup',
       Replicas=num_new_pods,
       CpuRequest=CPUS_PER_POD.value,
@@ -192,12 +227,26 @@ def ScaleUpPods(
       EphemeralStorageRequest='10Mi',
       RolloutTimeout=max_wait_time,
       PodTimeout=resource_timeout,
+      Cloud=FLAGS.cloud.lower(),
+      GpuTaintKey=None,
+  )
+
+  # GpuTaintKey is still needed for tolerations in the yaml template
+  if is_eks_karpenter_aws_gpu:
+    manifest_kwargs['GpuTaintKey'] = 'nvidia.com/gpu'
+
+  yaml_docs = cluster.ConvertManifestToYamlDicts(
+      MANIFEST_TEMPLATE,
+      **manifest_kwargs,
   )
+
+  # Use ModifyPodSpecPlacementYaml to add nodeSelectors via GetNodeSelectors()
   cluster.ModifyPodSpecPlacementYaml(
       yaml_docs,
       'kubernetes-scaleup',
       cluster.default_nodepool.machine_type,
   )
+
   resource_names = cluster.ApplyYaml(yaml_docs)

   assert resource_names
@@ -366,7 +415,7 @@ def GetStatusConditionsForResourceType(
 def ConvertToEpochTime(timestamp: str) -> int:
   """Converts a timestamp to epoch time."""
   # Example: 2024-11-08T23:44:36Z
-  return parser.parse(timestamp).timestamp()
+  return int(parser.parse(timestamp).timestamp())


 def ParseStatusChanges(
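Two behavioral fixes ride along in this file: _GetRolloutCreationTime switches from 'kubectl rollout history' to 'kubectl get', which supports -o jsonpath output, and ConvertToEpochTime truncates the float from .timestamp() to int so it matches the declared return type. A standalone sketch of the combined effect (hypothetical wrapper name, not PKB's helper):

# Hypothetical standalone equivalent of the fixed helpers (not PKB code).
import subprocess
from dateutil import parser


def rollout_creation_epoch(rollout_name: str) -> int:
    # 'kubectl get <resource> -o jsonpath=...' can print creationTimestamp;
    # 'kubectl rollout history' has no such output option, hence the switch.
    out = subprocess.run(
        ['kubectl', 'get', rollout_name,
         '-o', 'jsonpath={.metadata.creationTimestamp}'],
        capture_output=True, text=True, check=True,
    ).stdout
    # int() matches the '-> int' annotation; .timestamp() returns a float.
    return int(parser.parse(out).timestamp())


# e.g. parser.parse('2024-11-08T23:44:36Z').timestamp() == 1731109476.0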

perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py

Lines changed: 11 additions & 7 deletions

@@ -313,9 +313,7 @@ def _ingress_manifest_path(self) -> str:
     """The path to the ingress manifest template file."""
     return 'container/ingress.yaml.j2'

-  def _WaitForIngress(
-      self, name: str, namespace: str, port: int
-  ) -> str:
+  def _WaitForIngress(self, name: str, namespace: str, port: int) -> str:
     """Waits for an Ingress resource to be deployed to the cluster."""
     del port
     self.WaitForResource(
@@ -1283,10 +1281,16 @@ def ResizeNodePool(

   def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]:
     """Gets the node selectors section of a yaml for the provider."""
-    machine_family = util.GetMachineFamily(machine_type)
-    if machine_family:
-      return {'karpenter.k8s.aws/instance-family': machine_family}
-    return {}
+    selectors = {}
+    # If GPU is requested, use the GPU nodepool
+    if virtual_machine.GPU_TYPE.value:
+      selectors['karpenter.sh/nodepool'] = 'gpu'
+    else:
+      # Otherwise, use instance-family selector if machine_type is specified
+      machine_family = util.GetMachineFamily(machine_type)
+      if machine_family:
+        selectors['karpenter.k8s.aws/instance-family'] = machine_family
+    return selectors

   def GetNodePoolNames(self) -> list[str]:
     """Gets node pool names for the cluster.
