diff --git a/bootstrap/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml b/bootstrap/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml new file mode 100644 index 000000000..e10632c76 --- /dev/null +++ b/bootstrap/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + - ../../../clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced diff --git a/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml new file mode 100644 index 000000000..98a053193 --- /dev/null +++ b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml @@ -0,0 +1,58 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: openshift-gitops + +resources: + - ../../base + - ../../../components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced + +patches: + # Uncomment patches to disable automatic sync + - path: patch-applicationset-manual-sync.yaml + target: + group: argoproj.io + version: v1alpha1 + kind: ApplicationSet + name: tenants + # - path: patch-application-manual-sync.yaml + # target: + # group: argoproj.io + # kind: Application + # version: v1alpha1 + - path: patch-remove-common-overlay.yaml + target: + group: argoproj.io + kind: ApplicationSet + version: v1alpha1 + name: tenants + +replacements: + # copy the repo from the application to the applicationsets + - source: + kind: Application + fieldPath: spec.source.repoURL + targets: + - select: + kind: ApplicationSet + fieldPaths: + - spec.template.spec.source.repoURL + - select: + kind: ApplicationSet + name: tenants + fieldPaths: + - spec.generators.*.git.repoURL + # copy the branch from the application to the applicationsets + - source: + kind: Application + fieldPath: spec.source.targetRevision + targets: + - select: + kind: 
ApplicationSet + fieldPaths: + - spec.template.spec.source.targetRevision + - select: + kind: ApplicationSet + name: tenants + fieldPaths: + - spec.generators.*.git.revision diff --git a/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-application-manual-sync.yaml b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-application-manual-sync.yaml new file mode 100644 index 000000000..d289dd4a3 --- /dev/null +++ b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-application-manual-sync.yaml @@ -0,0 +1,2 @@ +- op: remove + path: /spec/syncPolicy diff --git a/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-applicationset-manual-sync.yaml b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-applicationset-manual-sync.yaml new file mode 100644 index 000000000..a60ffc013 --- /dev/null +++ b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-applicationset-manual-sync.yaml @@ -0,0 +1,2 @@ +- op: remove + path: /spec/template/spec/syncPolicy diff --git a/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-remove-common-overlay.yaml b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-remove-common-overlay.yaml new file mode 100644 index 000000000..c023490a1 --- /dev/null +++ b/clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-remove-common-overlay.yaml @@ -0,0 +1,3 @@ +- op: replace + path: /spec/generators/0/git/directories/1/path + value: "" diff --git a/components/argocd/apps/base/cluster-config-app-of-apps.yaml b/components/argocd/apps/base/cluster-config-app-of-apps.yaml index 3eb16d8ed..966a7b444 100644 --- a/components/argocd/apps/base/cluster-config-app-of-apps.yaml +++ b/components/argocd/apps/base/cluster-config-app-of-apps.yaml @@ -12,8 +12,8 @@ spec: project: cluster-config source: path: patch-me-see-overlays - repoURL: https://github.com/redhat-ai-services/ai-accelerator.git - targetRevision: main + repoURL: https://github.com/shebistar/ai-accelerator.git + 
targetRevision: rhoai-2.22-gpu-as-a-service-overlay syncPolicy: automated: prune: false diff --git a/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml new file mode 100644 index 000000000..36329f400 --- /dev/null +++ b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/kustomization.yaml @@ -0,0 +1,23 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +patches: + - path: patch-cluster-config-app-of-apps.yaml + target: + kind: Application + name: cluster-config-app-of-apps + - path: patch-operators-list.yaml + target: + kind: ApplicationSet + name: cluster-operators + - path: patch-configs-list.yaml + target: + kind: ApplicationSet + name: cluster-configs + - path: patch-tenants-applicationset.yaml + target: + kind: ApplicationSet + name: tenants diff --git a/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-cluster-config-app-of-apps.yaml b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-cluster-config-app-of-apps.yaml new file mode 100644 index 000000000..7124efd13 --- /dev/null +++ b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-cluster-config-app-of-apps.yaml @@ -0,0 +1,7 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cluster-config-app-of-apps +spec: + source: + path: clusters/overlays/rhoai-stable-2.22-aws-gpu-time-sliced diff --git a/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-configs-list.yaml b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-configs-list.yaml new file mode 100644 index 000000000..173fe3a17 --- /dev/null +++ b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-configs-list.yaml @@ -0,0 +1,18 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet 
+metadata: + name: cluster-configs +spec: + generators: + - list: + elements: + - cluster: local + url: https://kubernetes.default.svc + values: + name: user-workload-monitoring + path: components/cluster-configs/user-workload-monitoring/overlays/default + - cluster: local + url: https://kubernetes.default.svc + values: + name: cluster-autoscaling + path: components/cluster-configs/autoscaling/overlays/default diff --git a/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-operators-list.yaml b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-operators-list.yaml new file mode 100644 index 000000000..1e24f2785 --- /dev/null +++ b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-operators-list.yaml @@ -0,0 +1,59 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: cluster-operators +spec: + generators: + - list: + elements: + - cluster: local + url: https://kubernetes.default.svc + values: + name: authorino-operator + path: components/operators/authorino-operator/operator/overlays/stable + - cluster: local + url: https://kubernetes.default.svc + values: + name: nvidia-gpu-operator + path: components/operators/gpu-operator-certified/aggregate/overlays/aws-time-sliced + - cluster: local + url: https://kubernetes.default.svc + values: + name: nfd-operator + path: components/operators/nfd/aggregate/overlays/default + - cluster: local + url: https://kubernetes.default.svc + values: + name: openshift-ai-operator + path: components/operators/openshift-ai/aggregate/overlays/stable-2.22-nvidia-gpu-time-sliced + - cluster: local + url: https://kubernetes.default.svc + values: + name: openshift-gitops-operator + path: components/operators/openshift-gitops/aggregate/overlays/rhdp + - cluster: local + url: https://kubernetes.default.svc + values: + name: openshift-pipelines-operator + path: components/operators/openshift-pipelines/operator/overlays/latest + - cluster: local + 
url: https://kubernetes.default.svc + values: + name: openshift-serverless-operator + path: components/operators/openshift-serverless/operator/overlays/stable + - cluster: local + url: https://kubernetes.default.svc + values: + name: openshift-servicemesh-operator + path: components/operators/openshift-servicemesh/operator/overlays/stable + + # - cluster: local + # url: https://kubernetes.default.svc + # values: + # name: openshift-logging-operator + # path: components/operators/openshift-logging/aggregate/overlays/default + # - cluster: local + # url: https://kubernetes.default.svc + # values: + # name: web-terminal-operator + # path: components/operators/web-terminal-operator/operator/overlays/fast diff --git a/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-tenants-applicationset.yaml b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-tenants-applicationset.yaml new file mode 100644 index 000000000..fcd04bf2c --- /dev/null +++ b/components/argocd/apps/overlays/rhoai-stable-2.22-aws-gpu-time-sliced/patch-tenants-applicationset.yaml @@ -0,0 +1,3 @@ +- op: replace + path: /spec/generators/0/git/directories/0/path + value: tenants/*/*/overlays/rhoai-stable-2.22-aws-gpu-time-sliced diff --git a/components/operators/gpu-operator-certified/aggregate/overlays/aws-time-sliced/kustomization.yaml b/components/operators/gpu-operator-certified/aggregate/overlays/aws-time-sliced/kustomization.yaml new file mode 100644 index 000000000..43fee81c9 --- /dev/null +++ b/components/operators/gpu-operator-certified/aggregate/overlays/aws-time-sliced/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonAnnotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + +resources: + - ../../../operator/overlays/stable + - ../../../instance/overlays/aws-time-sliced diff --git 
a/components/operators/gpu-operator-certified/instance/components/time-sliced/README.md b/components/operators/gpu-operator-certified/instance/components/time-sliced/README.md
index bce39de46..41bfc0059 100644
--- a/components/operators/gpu-operator-certified/instance/components/time-sliced/README.md
+++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/README.md
@@ -1,11 +1,14 @@
 # time-sliced
 
 ## Purpose
 
-This component is designed to enable to enable time slicing on GPUs.
+This component is designed to enable time slicing on GPUs.
 
-To learn more about the monitoring dashboard, please refer to the official [docs](
-https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/time-slicing-gpus-in-openshift.html)
+To learn more about GPU time slicing, please refer to the official [docs](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/time-slicing-gpus-in-openshift.html).
+
+This component also sets up a MachineSet with GPUs on an AWS based OpenShift cluster: it triggers a job that creates a MachineSet based on your current MachineSet.
+
+This component has been tested using AWS based OpenShift instances provisioned by demo.redhat.com.
 
 ## Usage
 
@@ -22,6 +25,8 @@ components:
   - ../../components/time-sliced
 ```
 
 This component is intended to be used with additional configurations to set the number of replicas.
 Please refer to [time-sliced-2](../time-sliced-2) and [time-sliced-4](../time-sliced-4) for complete implementations of the time slicing configuration.
+
+You can customize the taint applied to the GPU nodes by updating the [machineset-patch.yaml](./machineset-patch.yaml) file.
 
diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/job.sh b/components/operators/gpu-operator-certified/instance/components/time-sliced/job.sh
new file mode 100755
index 000000000..be3cb1f85
--- /dev/null
+++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/job.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091
+
+set -e
+
+# Succeed only when the kube-system/aws-creds secret can be read (AWS cluster).
+ocp_aws_cluster(){
+  TARGET_NS=kube-system
+  OBJ=secret/aws-creds
+  echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace"
+  oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1
+  echo "AWS cluster detected"
+}
+
+# Clone a MachineSet for the instance type ($1), then merge-patch it with
+# machineset-patch.yaml (located next to this script) and set its instanceType.
+ocp_aws_create_gpu_machineset(){
+  # https://aws.amazon.com/ec2/instance-types/g4
+  # single gpu: g4dn.{2,4,8,16}xlarge
+  # multi gpu:  g4dn.12xlarge
+  # practical:  g4ad.4xlarge
+  # a100 (MIG): p4d.24xlarge
+  # h100 (MIG): p5.48xlarge
+
+  # https://aws.amazon.com/ec2/instance-types/dl1
+  # 8 x gaudi:  dl1.24xlarge
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+
+  ocp_aws_clone_machineset "${INSTANCE_TYPE}"
+
+  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
+
+  PATCH_FILE="$(dirname "$0")/machineset-patch.yaml"
+
+  if [ -f "${PATCH_FILE}" ]; then
+    echo "Patching ${MACHINE_SET_TYPE} with ${PATCH_FILE}."
+    oc -n openshift-machine-api \
+      patch "${MACHINE_SET_TYPE}" \
+      --type=merge --patch-file "${PATCH_FILE}"
+  else
+    echo "Unable to taint nodes, patch file ${PATCH_FILE} not found."
+    exit 1
+  fi
+
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
+}
+
+# Copy the first "worker" MachineSet (2 replicas) unless one for the instance family already exists.
+ocp_aws_clone_machineset(){
+  [ -z "${1}" ] && \
+  echo "
+    usage: ocp_aws_create_gpu_machineset < instance type, default g4dn.4xlarge >
+  "
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)
+
+  # check for an existing instance machine set
+  if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then
+    echo "Exists: machineset - ${INSTANCE_TYPE}"
+  else
+    echo "Creating: machineset - ${INSTANCE_TYPE}"
+    oc -n openshift-machine-api \
+      get "${MACHINE_SET}" -o yaml | \
+        sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
+          /name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g
+          s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
+          s/replicas.*/replicas: 2/' | \
+      oc apply -f -
+  fi
+}
+
+# Apply a MachineAutoscaler (min $1, max $2) to each MachineSet named in $3;
+# by default every MachineSet in openshift-machine-api.
+ocp_create_machineset_autoscale(){
+  MACHINE_MIN=${1:-2}
+  MACHINE_MAX=${2:-4}
+  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}
+
+  for set in ${MACHINE_SETS}
+  do
+cat << YAML | oc apply -f -
+apiVersion: "autoscaling.openshift.io/v1beta1"
+kind: "MachineAutoscaler"
+metadata:
+  name: "${set}"
+  namespace: "openshift-machine-api"
+spec:
+  minReplicas: ${MACHINE_MIN}
+  maxReplicas: ${MACHINE_MAX}
+  scaleTargetRef:
+    apiVersion: machine.openshift.io/v1beta1
+    kind: MachineSet
+    name: "${set}"
+YAML
+  done
+}
+
+INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}
+
+# Not an AWS cluster: nothing to do, finish successfully.
+ocp_aws_cluster || exit 0
+ocp_aws_create_gpu_machineset "${INSTANCE_TYPE}"
+ocp_create_machineset_autoscale
diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/job.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced/job.yaml
new file mode 100644
index 
000000000..a990611ee
--- /dev/null
+++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/job.yaml
@@ -0,0 +1,41 @@
+---
+# Runs /scripts/job.sh, mounted from the job-aws-gpu-machineset ConfigMap,
+# with INSTANCE_TYPE set; the pod is never restarted (restartPolicy: Never).
+apiVersion: batch/v1
+kind: Job
+metadata:
+  generateName: job-aws-gpu-machineset-
+  name: job-aws-gpu-machineset
+  namespace: nvidia-gpu-operator
+  # annotations:
+  #   argocd.argoproj.io/hook: Sync
+  #   argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  template:
+    spec:
+      containers:
+        - name: job-aws-gpu-machineset
+          # image: image-registry.openshift-image-registry.svc:5000/openshift/tools:latest
+          image: registry.redhat.io/openshift4/ose-cli
+          env:
+            - name: INSTANCE_TYPE
+              value: "g5.2xlarge"
+            - name: NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          command:
+            - /bin/bash
+            - -c
+            - /scripts/job.sh
+          volumeMounts:
+            - name: scripts
+              mountPath: /scripts
+      volumes:
+        - name: scripts
+          configMap:
+            name: job-aws-gpu-machineset
+            defaultMode: 0755
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 30
+      serviceAccountName: job-aws-gpu-machineset
diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml
index 6b9a229f9..6e0215da6 100644
--- 
a/components/operators/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml +++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/kustomization.yaml @@ -1,6 +1,20 @@ apiVersion: kustomize.config.k8s.io/v1alpha1 kind: Component +resources: + # - ../../../../../../scripts/library + - job.yaml + - rbac.yaml + +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: job-aws-gpu-machineset + namespace: nvidia-gpu-operator + files: + - job.sh + - machineset-patch.yaml patches: - path: patch-gpu-cluster-policy.yaml target: diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/machineset-patch.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced/machineset-patch.yaml new file mode 100644 index 000000000..c28cf3547 --- /dev/null +++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/machineset-patch.yaml @@ -0,0 +1,16 @@ +apiVersion: machine.openshift.io/v1beta1 +kind: MachineSet +metadata: + labels: + cluster-api/accelerator: "nvidia-gpu" +spec: + template: + spec: + metadata: + labels: + node-role.kubernetes.io/gpu: "" + cluster-api/accelerator: "nvidia-gpu" + nvidia.com/device-plugin.config: "time-sliced" + taints: + - key: nvidia.com/gpu + effect: NoSchedule diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml index 29ad51288..2af92a5ed 100644 --- a/components/operators/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml +++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml @@ -3,10 +3,10 @@ kind: ConfigMap metadata: name: device-plugin-config data: - no-time-sliced: |- + time-sliced: |- version: v1 sharing: timeSlicing: resources: - name: 
nvidia.com/gpu
-            replicas: 0
+            replicas: 8
diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/rbac.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced/rbac.yaml
new file mode 100644
index 000000000..4efbb245e
--- /dev/null
+++ b/components/operators/gpu-operator-certified/instance/components/time-sliced/rbac.yaml
@@ -0,0 +1,58 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: job-aws-gpu-machineset
+  namespace: nvidia-gpu-operator
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: job-aws-gpu-machineset
+rules:
+# job.sh lists, reads, applies and patches MachineSets
+- apiGroups:
+  - machine.openshift.io
+  resources:
+  - machinesets
+  verbs:
+  - get
+  - list
+  - create
+  - patch
+# job.sh applies MachineAutoscalers
+- apiGroups:
+  - autoscaling.openshift.io
+  resources:
+  - machineautoscalers
+  verbs:
+  - get
+  - create
+  - patch
+# job.sh only reads the single aws-creds secret by name, so "get" is enough;
+# "list" is not usefully restricted by resourceNames
+- apiGroups:
+  - ''
+  resources:
+  - secrets
+  resourceNames:
+  - aws-creds
+  verbs:
+  - get
+# - nonResourceURLs:
+#   - '*'
+#   verbs:
+#   - '*'
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: job-aws-gpu-machineset
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: job-aws-gpu-machineset
+subjects:
+  - kind: ServiceAccount
+    name: job-aws-gpu-machineset
+    namespace: nvidia-gpu-operator
diff --git a/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml b/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml
index bdf06964c..ddd91fe7e 100644
--- a/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml
+++ b/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-2/kustomization.yaml
@@ -5,6 +5,5 @@ resources:
   - ../../base
 
 components:
-  - ../../components/aws-gpu-machineset
   - ../../components/schedule-on-tainted-nodes
   - ../../components/time-sliced-2
diff --git 
a/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml b/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml
index 7afd01425..f3070c5ce 100644
--- a/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml
+++ b/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced-4/kustomization.yaml
@@ -5,6 +5,5 @@ resources:
   - ../../base
 
 components:
-  - ../../components/aws-gpu-machineset
   - ../../components/schedule-on-tainted-nodes
   - ../../components/time-sliced-4
diff --git a/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced/kustomization.yaml b/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced/kustomization.yaml
new file mode 100644
index 000000000..52936a5eb
--- /dev/null
+++ b/components/operators/gpu-operator-certified/instance/overlays/aws-time-sliced/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - ../../components/schedule-on-tainted-nodes
+  - ../../components/time-sliced
diff --git a/components/operators/gpu-operator-certified/instance/overlays/time-sliced/kustomization.yaml b/components/operators/gpu-operator-certified/instance/overlays/time-sliced/kustomization.yaml
new file mode 100644
index 000000000..52936a5eb
--- /dev/null
+++ b/components/operators/gpu-operator-certified/instance/overlays/time-sliced/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - ../../components/schedule-on-tainted-nodes
+  - ../../components/time-sliced
diff --git a/components/operators/openshift-ai/aggregate/overlays/stable-2.22-nvidia-gpu-time-sliced/kustomization.yaml b/components/operators/openshift-ai/aggregate/overlays/stable-2.22-nvidia-gpu-time-sliced/kustomization.yaml
new file mode 100644
index 000000000..8db5b3782
--- 
/dev/null
+++ b/components/operators/openshift-ai/aggregate/overlays/stable-2.22-nvidia-gpu-time-sliced/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+commonAnnotations:
+  argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
+
+resources:
+  - ../../../operator/overlays/stable-2.22
+  - ../../../instance/overlays/stable-2.22-nvidia-gpu-time-sliced
diff --git a/components/operators/openshift-ai/instance/components/kueue-operator/README.md b/components/operators/openshift-ai/instance/components/kueue-operator/README.md
new file mode 100644
index 000000000..b39f388e0
--- /dev/null
+++ b/components/operators/openshift-ai/instance/components/kueue-operator/README.md
@@ -0,0 +1,25 @@
+# kueue-operator
+
+## Purpose
+This component is designed to help configure the distributed compute specific components by setting the following `managementState` values on the `DataScienceCluster`:
+
+- CodeFlare: `Managed`
+- Ray: `Managed`
+- Kueue: `Removed`
+
+The Distributed Compute Components are Generally Available as of RHOAI 2.9.
+
+## Usage
+
+This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:
+
+```
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - ../../components/kueue-operator
+```
diff --git a/components/operators/openshift-ai/instance/components/kueue-operator/kustomization.yaml b/components/operators/openshift-ai/instance/components/kueue-operator/kustomization.yaml
new file mode 100644
index 000000000..f5a4caf20
--- /dev/null
+++ b/components/operators/openshift-ai/instance/components/kueue-operator/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component
+
+patches:
+  - path: patch-datasciencecluster.yaml
+    target:
+      kind: DataScienceCluster
diff --git a/components/operators/openshift-ai/instance/components/kueue-operator/patch-datasciencecluster.yaml b/components/operators/openshift-ai/instance/components/kueue-operator/patch-datasciencecluster.yaml
new file mode 100644
index 000000000..8a8e81221
--- /dev/null
+++ b/components/operators/openshift-ai/instance/components/kueue-operator/patch-datasciencecluster.yaml
@@ -0,0 +1,12 @@
+kind: DataScienceCluster
+apiVersion: datasciencecluster.opendatahub.io/v1
+metadata:
+  name: default
+spec:
+  components:
+    codeflare:
+      managementState: Managed
+    kueue:
+      managementState: Removed
+    ray:
+      managementState: Managed
diff --git a/components/operators/openshift-ai/instance/overlays/stable-2.22-nvidia-gpu-time-sliced/kustomization.yaml b/components/operators/openshift-ai/instance/overlays/stable-2.22-nvidia-gpu-time-sliced/kustomization.yaml
new file mode 100644
index 000000000..98c179891
--- /dev/null
+++ b/components/operators/openshift-ai/instance/overlays/stable-2.22-nvidia-gpu-time-sliced/kustomization.yaml
@@ -0,0 +1,21 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - 
../../components/auth-with-authorino + - ../../components/kueue-operator + - ../../components/components-kserve + - ../../components/components-modelmesh + - ../../components/components-training + - ../../components/components-trustyai + - ../../components/default-notebook-pvc-size + - ../../components/dashboard-feature-hardware-profiles + - ../../components/nvidia-gpu-hardware-profile + - ../../components/idle-notebook-culling + - ../../components/notebook-pod-sizes + - ../../components/make-kubeadmin-cluster-admin + - ../../components/model-server-pod-sizes + - ../../components/rhoai-auth