redhat-ai-services · shebistar · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
@@ -0,0 +1,6 @@
+---
+# Combines the shared base (two levels up) with the cluster overlay for
+# rhoai-stable-2.22-aws-gpu-time-sliced.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+  - ../../../clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced
@@ -0,0 +1,58 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Namespace applied to the resources built by this kustomization.
+namespace: openshift-gitops
+
+resources:
+  - ../../base
+  - ../../../components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced
+
+patches:
+  # Manual-sync patches: an entry that is active (uncommented) disables
+  # automatic sync for its target.
+  - path: patch-applicationset-manual-sync.yaml
+    target:
+      group: argoproj.io
+      version: v1alpha1
+      kind: ApplicationSet
+      name: tenants
+  # - path: patch-application-manual-sync.yaml
+  #   target:
+  #     group: argoproj.io
+  #     version: v1alpha1
+  #     kind: Application
+  - path: patch-remove-common-overlay.yaml
+    target:
+      group: argoproj.io
+      version: v1alpha1
+      kind: ApplicationSet
+      name: tenants
+
+replacements:
+  # Repo URL: read from the Application and written into every
+  # ApplicationSet template, plus the git generators of "tenants".
+  - source:
+      kind: Application
+      fieldPath: spec.source.repoURL
+    targets:
+      - select:
+          kind: ApplicationSet
+        fieldPaths:
+          - spec.template.spec.source.repoURL
+      - select:
+          kind: ApplicationSet
+          name: tenants
+        fieldPaths:
+          - spec.generators.*.git.repoURL
+  # Branch/revision: read from the Application and written into every
+  # ApplicationSet template, plus the git generators of "tenants".
+  - source:
+      kind: Application
+      fieldPath: spec.source.targetRevision
+    targets:
+      - select:
+          kind: ApplicationSet
+        fieldPaths:
+          - spec.template.spec.source.targetRevision
+      - select:
+          kind: ApplicationSet
+          name: tenants
+        fieldPaths:
+          - spec.generators.*.git.revision
@@ -0,0 +1,2 @@
+# JSON patch: drop the whole syncPolicy stanza from the target's spec.
+- path: /spec/syncPolicy
+  op: remove
@@ -0,0 +1,2 @@
+# JSON patch: drop syncPolicy from the spec of the target's template.
+- path: /spec/template/spec/syncPolicy
+  op: remove
@@ -0,0 +1,3 @@
+# JSON patch: set the path of the second git directory entry (index 1) of the
+# first generator to an empty string.
+- path: /spec/generators/0/git/directories/1/path
+  op: replace
+  value: ''
@@ -12,8 +12,8 @@ spec:
   project: cluster-config
   source:
     path: patch-me-see-overlays
-    repoURL: https://github.com/redhat-ai-services/ai-accelerator.git
-    targetRevision: main
+    repoURL: https://github.com/shebistar/ai-accelerator.git
+    targetRevision: rhoai-2.22-gpu-as-a-service-overlay
   syncPolicy:
     automated:
       prune: false

@@ -0,0 +1,23 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+# One patch file per Argo CD object, selected by kind and name.
+patches:
+  # App-of-apps Application
+  - path: patch-cluster-config-app-of-apps.yaml
+    target:
+      kind: Application
+      name: cluster-config-app-of-apps
+  # Operators ApplicationSet
+  - path: patch-operators-list.yaml
+    target:
+      kind: ApplicationSet
+      name: cluster-operators
+  # Cluster configs ApplicationSet
+  - path: patch-configs-list.yaml
+    target:
+      kind: ApplicationSet
+      name: cluster-configs
+  # Tenants ApplicationSet
+  - path: patch-tenants-applicationset.yaml
+    target:
+      kind: ApplicationSet
+      name: tenants
@@ -0,0 +1,7 @@
+---
+# Sets the source path of the cluster-config-app-of-apps Application to the
+# rhoai-stable-2.22-aws-gpu-time-sliced cluster overlay.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: cluster-config-app-of-apps
+spec:
+  source:
+    path: clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced
@@ -0,0 +1,18 @@
+---
+# List-generator elements for the cluster-configs ApplicationSet.
+# Each element carries a target cluster/url plus a name and repo path.
+apiVersion: argoproj.io/v1alpha1
+kind: ApplicationSet
+metadata:
+  name: cluster-configs
+spec:
+  generators:
+    - list:
+        elements:
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: user-workload-monitoring
+              path: components/cluster-configs/user-workload-monitoring/overlays/default
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: cluster-autoscaling
+              path: components/cluster-configs/autoscaling/overlays/default
@@ -0,0 +1,59 @@
+---
+# List-generator elements for the cluster-operators ApplicationSet.
+# Each element carries a target cluster/url plus an operator name and the
+# repo path of the overlay to use for it.
+apiVersion: argoproj.io/v1alpha1
+kind: ApplicationSet
+metadata:
+  name: cluster-operators
+spec:
+  generators:
+    - list:
+        elements:
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: authorino-operator
+              path: components/operators/authorino-operator/operator/overlays/stable
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: nvidia-gpu-operator
+              path: components/operators/gpu-operator-certified/aggregate/overlays/aws-time-sliced
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: nfd-operator
+              path: components/operators/nfd/aggregate/overlays/default
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: openshift-ai-operator
+              path: components/operators/openshift-ai/aggregate/overlays/stable-2.22-nvidia-gpu-time-sliced
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: openshift-gitops-operator
+              path: components/operators/openshift-gitops/aggregate/overlays/rhdp
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: openshift-pipelines-operator
+              path: components/operators/openshift-pipelines/operator/overlays/latest
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: openshift-serverless-operator
+              path: components/operators/openshift-serverless/operator/overlays/stable
+          - cluster: local
+            url: https://kubernetes.default.svc
+            values:
+              name: openshift-servicemesh-operator
+              path: components/operators/openshift-servicemesh/operator/overlays/stable
+
+          # Disabled entries, kept for reference:
+          # - cluster: local
+          #   url: https://kubernetes.default.svc
+          #   values:
+          #     name: openshift-logging-operator
+          #     path: components/operators/openshift-logging/aggregate/overlays/default
+          # - cluster: local
+          #   url: https://kubernetes.default.svc
+          #   values:
+          #     name: web-terminal-operator
+          #     path: components/operators/web-terminal-operator/operator/overlays/fast
@@ -0,0 +1,3 @@
+# JSON patch: point the first git directory entry (index 0) of the first
+# generator at the rhoai-stable-2.22-aws-gpu-time-sliced tenant overlays.
+- path: /spec/generators/0/git/directories/0/path
+  op: replace
+  value: tenants/*/*/overlays/rhoai-stable-2.22-aws-gpu-time-sliced
@@ -0,0 +1,9 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Aggregates the operator (stable overlay) with its instance
+# (aws-time-sliced overlay).
+resources:
+  - ../../../operator/overlays/stable
+  - ../../../instance/overlays/aws-time-sliced
+
+# Annotation added to every resource produced by this kustomization.
+commonAnnotations:
+  argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
@@ -1,11 +1,12 @@
-# time-sliced
+# aws-gpu-machineset
 
 ## Purpose
 
-This component is designed to enable to enable time slicing on GPUs.
+This component is designed to set up a MachineSet with GPUs on an AWS-based OpenShift cluster.
 
-To learn more about the monitoring dashboard, please refer to the official [docs](
-https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/time-slicing-gpus-in-openshift.html)
+This component triggers a job that creates a MachineSet based on your current MachineSet.
+
+This component has been tested using AWS-based OpenShift instances provisioned by demo.redhat.com.
 
 ## Usage
 
@@ -22,6 +23,5 @@ components:
   - ../../components/time-sliced
 ```
 
-This component is intended to be used with additional configurations to set the number of replicas.
 
-Please refer to [time-sliced-2](../time-sliced-2) and [time-sliced-4](../time-sliced-4) for complete implementations of the time slicing configuration.
+You can customize the taint applied to the GPU nodes by updating the [machineset-patch.yaml](./machineset-patch.yaml) file.
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091
+
+set -e
+
+ocp_aws_cluster(){
+  TARGET_NS=kube-system
+  OBJ=secret/aws-creds
+  echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace"
+  oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1
+  echo "AWS cluster detected"
+}
+
+ocp_aws_create_gpu_machineset(){
+  # https://aws.amazon.com/ec2/instance-types/g4
+  # single gpu: g4dn.{2,4,8,16}xlarge
+  # multi gpu:  g4dn.12xlarge
+  # practical:  g4ad.4xlarge
+  # a100 (MIG): p4d.24xlarge
+  # h100 (MIG): p5.48xlarge
+
+  # https://aws.amazon.com/ec2/instance-types/dl1
+  # 8 x gaudi:  dl1.24xlarge
+
+  INSTANCE_TYPE=${1:-g4dn.2xlarge}
+
+  ocp_aws_clone_machineset "${INSTANCE_TYPE}"
+
+  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
+
+  PATCH_FILE="$(dirname "$0")/machineset-patch.yaml"
+
+  if [ -f ${PATCH_FILE} ]; then
+    echo "Patching ${MACHINE_SET_TYPE} with ${PATCH_FILE}."
+    oc -n openshift-machine-api \
+      patch "${MACHINE_SET_TYPE}" \
+      --type=merge --patch-file ${PATCH_FILE}
+  else
+    echo "Unable to taint nodes, patch file ${PATCH_FILE} not found."
+    exit 1
+  fi
+
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
+}
+
+ocp_aws_clone_machineset(){
+  [ -z "${1}" ] && \
+  echo "
+    usage: ocp_aws_create_gpu_machineset < instance type, default g4dn.4xlarge >
+  "
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)
+
+  # check for an existing instance machine set
+  if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then
+    echo "Exists: machineset - ${INSTANCE_TYPE}"
+  else
+    echo "Creating: machineset - ${INSTANCE_TYPE}"
+    oc -n openshift-machine-api \
+      get "${MACHINE_SET}" -o yaml | \
+        sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
+          /name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g
+          s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
+          s/replicas.*/replicas: 2/' | \
+      oc apply -f -
+  fi
+}
+
+ocp_create_machineset_autoscale(){
+  MACHINE_MIN=${1:-2}
+  MACHINE_MAX=${2:-4}
+  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}
+
+  for set in ${MACHINE_SETS}
+  do
+cat << YAML | oc apply -f -
+apiVersion: "autoscaling.openshift.io/v1beta1"
+kind: "MachineAutoscaler"
+metadata:
+  name: "${set}"
+  namespace: "openshift-machine-api"
+spec:
+  minReplicas: ${MACHINE_MIN}
+  maxReplicas: ${MACHINE_MAX}
+  scaleTargetRef:
+    apiVersion: machine.openshift.io/v1beta1
+    kind: MachineSet
+    name: "${set}"
+YAML
+  done
+}
+
+# Instance type is taken from the environment, falling back to g4dn.4xlarge.
+INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}
+
+# Not an AWS cluster: nothing to do, so finish successfully.
+ocp_aws_cluster || exit 0
+# Quoted so the value is passed as exactly one argument.
+ocp_aws_create_gpu_machineset "${INSTANCE_TYPE}"
+ocp_create_machineset_autoscale
@@ -0,0 +1,40 @@
+---
+# Job that runs /scripts/job.sh, mounted from the job-aws-gpu-machineset
+# ConfigMap, under the job-aws-gpu-machineset service account.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  # generateName only takes effect when "name" is absent; it is kept so the
+  # manifest can be switched to a generated name by removing "name".
+  generateName: job-aws-gpu-machineset-
+  name: job-aws-gpu-machineset
+  namespace: nvidia-gpu-operator
+  # annotations:
+  #   argocd.argoproj.io/hook: Sync
+  #   argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  template:
+    spec:
+      containers:
+        - name: job-aws-gpu-machineset
+          # image: image-registry.openshift-image-registry.svc:5000/openshift/tools:latest
+          image: registry.redhat.io/openshift4/ose-cli
+          env:
+            # Instance type handed to the script through its environment.
+            - name: INSTANCE_TYPE
+              value: "g5.2xlarge"
+            # Namespace the pod runs in, exposed via the downward API.
+            - name: NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          command:
+            - /bin/bash
+            - -c
+            - /scripts/job.sh
+          volumeMounts:
+            - name: scripts
+              mountPath: /scripts
+      volumes:
+        - name: scripts
+          configMap:
+            name: job-aws-gpu-machineset
+            # Octal file mode so the mounted script is executable.
+            defaultMode: 0755
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 30
+      # The deprecated "serviceAccount" alias is dropped; serviceAccountName
+      # is the supported field and carried the same value.
+      serviceAccountName: job-aws-gpu-machineset
@@ -0,0 +1,11 @@
+---
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+patches:
+  # Applied to every ClusterPolicy (no name filter).
+  - path: patch-gpu-cluster-policy.yaml
+    target:
+      kind: ClusterPolicy
+  # Applied only to the ConfigMap named device-plugin-config.
+  - path: patch-device-plugin-config.yaml
+    target:
+      kind: ConfigMap
+      name: device-plugin-config
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		- op: remove
		path: /spec/template/spec/syncPolicy