diff --git a/.gitignore b/.gitignore
index 9c186b89..e9d84683 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,5 @@ out
 gen
 .idea/
 .idea/workspace.xml
-
+*.log
+*.orig
diff --git a/ansible/.gitignore b/ansible/.gitignore
index 2768362a..f2310062 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -1 +1,2 @@
 smcipmitool.tar.gz
+*.sw*
diff --git a/ansible/copy-pull-secret.yml b/ansible/copy-pull-secret.yml
new file mode 100644
index 00000000..061a3039
--- /dev/null
+++ b/ansible/copy-pull-secret.yml
@@ -0,0 +1,15 @@
+---
+# Copy pull secret playbook
+#
+# This playbook copies the pull secret to the nodes in the cluster.
+# It updates the pull secret on the nodes so they can pull images from the Red Hat registry.
+#
+# Example Usage:
+#
+# ansible-playbook ansible/copy-pull-secret.yml
+#
+
+- name: Copy pull secret to nodes
+  hosts: hv_vm
+  roles:
+    - copy-pull-secret
diff --git a/ansible/hv-vm-start-one.yml b/ansible/hv-vm-start-one.yml
new file mode 100644
index 00000000..f4f64662
--- /dev/null
+++ b/ansible/hv-vm-start-one.yml
@@ -0,0 +1,6 @@
+---
+- name: Start one VM on each hypervisor
+  gather_facts: false
+  hosts: hv
+  roles:
+    - hv-vm-start
diff --git a/ansible/hv-vm-stop-all.yml b/ansible/hv-vm-stop-all.yml
new file mode 100644
index 00000000..ee8db9a4
--- /dev/null
+++ b/ansible/hv-vm-stop-all.yml
@@ -0,0 +1,5 @@
+---
+- name: Stop (destroy) all VMs
+  hosts: hv
+  roles:
+    - hv-vm-destroy
diff --git a/ansible/mno-add-vm-workers.yml b/ansible/mno-add-vm-workers.yml
new file mode 100644
index 00000000..142bd1d3
--- /dev/null
+++ b/ansible/mno-add-vm-workers.yml
@@ -0,0 +1,121 @@
+---
+# Add VM worker nodes to an existing cluster with the Assisted Installer
+#
+# Example Usage:
+#
+# ansible-playbook -i ansible/inventory/cloud42.local ansible/mno-add-vm-workers.yml
+#
+
+- name: Prep cluster to add hosts
+  hosts: bastion
+  vars_files:
+    - vars/lab.yml
+    - vars/all.yml
+  gather_facts: false
+  tasks:
+    - name: Set assisted installer connection
+      set_fact:
+        assisted_installer_host: "{{ groups['bastion'][0] }}"
+        assisted_installer_port: "8090"
+
+    - name: Get cluster status
+      uri:
+        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}"
+        method: GET
+        body_format: json
+        status_code: [200]
+        return_content: true
+      register: cluster_data
+      failed_when: cluster_data.json.status not in ['installed', 'adding-hosts']
+
+    - name: Set cluster status to adding-hosts
+      uri:
+        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}/actions/allow-add-workers"
+        method: POST
+        body_format: json
+        status_code: [201, 202]
+      when: cluster_data.json.status == 'installed'
+
+    - name: Get infra-env
+      uri:
+        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}"
+        method: GET
+        body_format: json
+        status_code: [200]
+        return_content: true
+      register: infra_env_return
+
+    - name: Set ai_infraenv_id
+      set_fact:
+        ai_infraenv_id: "{{ infra_env_return.json.hosts[0].infra_env_id }}"
+
+    - name: Get infra-env static_network_config
+      uri:
+        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/infra-envs/{{ ai_infraenv_id }}"
+        method: GET
+        body_format: json
+        status_code: [200]
+        return_content: true
+      register: infra_env_return
+
+    - name: Set ai_infraenv_static_config
+      set_fact:
+        ai_infraenv_static_config: "{{ infra_env_return.json.static_network_config }}"
+
+    - name: Set empty static network configuration
+      set_fact:
+        static_network_config: []
+
+    - name: Generate Static Network Config for VMs
+      ansible.builtin.include_role:
+        name: create-ai-cluster
+        tasks_from: static_network_config
+      vars:
+        hybrid_worker_count: "{{ add_worker_count }}"
+      loop: "{{ groups['hv_vm'][:hybrid_worker_count | int] }}"
+
+    - name: show ai_infraenv_static_config
+      debug:
+        var: ai_infraenv_static_config
+
+    - name: show static_network_config
+      debug:
+        var: static_network_config
+
+    - name: Set static network composite
+      set_fact:
+        static_network_config_comp: "{{ static_network_config + ai_infraenv_static_config }}"
+
+    - name: show static_network_config composite
+      debug:
+        var: static_network_config_comp
+
+    - name: Update static config
+      uri:
+        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/infra-envs/{{ ai_infraenv_id }}"
+        body: {
+          "static_network_config": "{{ static_network_config + ai_infraenv_static_config }}"
+        }
+        method: PATCH
+        body_format: json
+        status_code: [201]
+        return_content: true
+
+
+- name: Boot / Install VMs
+  hosts: bastion
+  vars_files:
+    - vars/lab.yml
+    - vars/all.yml
+  roles:
+    - generate-discovery-iso
+    - role: boot-iso
+      vars:
+        inventory_group: hv_vm
+        index: "{{ add_worker_count }}"
+        virtual_media_iso: "discovery.iso"
+    - role: wait-hosts-discovered
+      vars:
+        inventory_nodes: "{{ groups['hv_vm'][:add_worker_count|int] }}"
+        discover_nodes: "{{ groups['hv_vm'][:add_worker_count|int] }}"
+    - add-hosts-install
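The play above consumes an `add_worker_count` variable that is never set in the play itself, so it presumably has to come from a vars file or the command line. A hedged invocation sketch (inventory path and count are illustrative):

```console
# Illustrative: pass add_worker_count explicitly if it is not defined in your vars files
ansible-playbook -i ansible/inventory/cloud42.local ansible/mno-add-vm-workers.yml -e add_worker_count=3
```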
diff --git a/ansible/roles/copy-pull-secret/tasks/main.yml b/ansible/roles/copy-pull-secret/tasks/main.yml
new file mode 100644
index 00000000..078e04fb
--- /dev/null
+++ b/ansible/roles/copy-pull-secret/tasks/main.yml
@@ -0,0 +1,11 @@
+---
+- name: Copy pull secret
+  copy:
+    src: pull-secret.json
+    dest: "/var/lib/kubelet/config.json"
+  become: true
+- name: Touch machine-config-daemon force file
+  file:
+    path: /run/machine-config-daemon-force
+    state: touch
+  become: true
diff --git a/ansible/roles/hv-install/defaults/main.yml b/ansible/roles/hv-install/defaults/main.yml
new file mode 100644
index 00000000..bfd2265b
--- /dev/null
+++ b/ansible/roles/hv-install/defaults/main.yml
@@ -0,0 +1,17 @@
+---
+# hv-install default vars
+
+# Hugepages configuration for hypervisors
+enable_hugepages: false
+
+# Hugepage size: 2M or 1G
+hugepage_size: "1G"
+
+# Number of hugepages to allocate (e.g., 32 for 32GB of 1G hugepages)
+hugepage_count: 32
+
+# Additional kernel parameters for performance tuning
+additional_kernel_params: []
+
+# Number of 1G hugepages to reserve per NUMA node (e.g., the total hugepage count / 2 on a two-node host)
+hugepages_count_per_node: 190
diff --git a/ansible/roles/hv-install/tasks/main.yml b/ansible/roles/hv-install/tasks/main.yml
index 4451e0c4..a73d5ee9 100644
--- a/ansible/roles/hv-install/tasks/main.yml
+++ b/ansible/roles/hv-install/tasks/main.yml
@@ -21,6 +21,55 @@
     name: sushy-tools
     version: 1.2.0
 
+- name: Configure hugepages support
+  when: enable_hugepages
+  block:
+
+    - name: Run grubby to add hugepages arguments
+      command: grubby --update-kernel=ALL --args="default_hugepagesz={{ hugepage_size }} hugepagesz={{ hugepage_size }}"
+      register: grub_updated
+
+    - name: Set reboot required flag
+      set_fact:
+        hugepages_reboot_required: true
+      when: grub_updated.changed
+
+    - name: Create hugetlb-gigantic-pages.service file
+      copy:
+        dest: /usr/lib/systemd/system/hugetlb-gigantic-pages.service
+        content: |
+          [Unit]
+          Description=HugeTLB Gigantic Pages Reservation
+          DefaultDependencies=no
+          Before=dev-hugepages.mount
+          ConditionPathExists=/sys/devices/system/node
+          ConditionKernelCommandLine=hugepagesz=1G
+
+          [Service]
+          Type=oneshot
+          RemainAfterExit=yes
+          ExecStart=/usr/lib/systemd/hugetlb-reserve-pages.sh
+
+          [Install]
+          WantedBy=sysinit.target
+
+    - name: Create hugetlb-reserve-pages.sh
+      template:
+        src: hugetlb-reserve-pages.sh.j2
+        dest: /usr/lib/systemd/hugetlb-reserve-pages.sh
+        mode: "0755"
+      register: hugetlb_script
+
+    - name: Set reboot required flag
+      set_fact:
+        hugepages_reboot_required: true
+      when: hugetlb_script.changed
+
+    - name: Enable hugetlb-gigantic-pages.service
+      systemd:
+        enabled: true
+        name: hugetlb-gigantic-pages.service
+
 - name: Get coredns
   get_url:
     validate_certs: false
@@ -65,3 +114,21 @@
     state: started
     enabled: true
     name: ksmtuned
+
+- name: Reboot hypervisor for hugepages configuration
+  when:
+    - enable_hugepages
+    - hugepages_reboot_required | default(false)
+  block:
+    - name: Reboot hypervisor
+      reboot:
+        msg: "Rebooting to apply hugepages configuration"
+        reboot_timeout: 600
+
+    - name: Verify hugepages are configured
+      shell: cat /proc/meminfo | grep -E "HugePages_Total|HugePages_Free|Hugepagesize"
+      register: hugepages_status
+
+    - name: Display hugepages status
+      debug:
+        msg: "{{ hugepages_status.stdout_lines }}"
diff --git a/ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2 b/ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2
new file mode 100644
index 00000000..5bed529d
--- /dev/null
+++ b/ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+nodes_path=/sys/devices/system/node/
+if [ ! -d $nodes_path ]; then
+  echo "ERROR: $nodes_path does not exist"
+  exit 1
+fi
+
+reserve_pages()
+{
+  echo $1 > $nodes_path/$2/hugepages/hugepages-1048576kB/nr_hugepages
+}
+
+reserve_pages {{ hugepages_count_per_node }} node0
+reserve_pages {{ hugepages_count_per_node }} node1
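For reference, with the default `hugepages_count_per_node: 190` the rendered script writes 190 into each NUMA node's 1G pool, i.e. roughly 380 1G hugepages (380 GiB) across a two-node host. Rendered result shown for illustration:

```sh
# What hugetlb-reserve-pages.sh contains after templating with hugepages_count_per_node=190
echo 190 > /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages
echo 190 > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
```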
diff --git a/ansible/roles/hv-vm-create/defaults/main.yml b/ansible/roles/hv-vm-create/defaults/main.yml
index 7585df82..527ba186 100644
--- a/ansible/roles/hv-vm-create/defaults/main.yml
+++ b/ansible/roles/hv-vm-create/defaults/main.yml
@@ -10,7 +10,7 @@ vnuma_enabled: false
 vnuma_memory_placement: "static"
 vnuma_cpu_placement: "static"
 
-# Manual vNUMA configuration 
+# Manual vNUMA configuration
 # vnuma_nodes:
 #   - id: 0
 #     cpus: "0-3"
@@ -21,4 +21,19 @@ vnuma_cpu_placement: "static"
 
 # vNUMA topology settings
 vnuma_memory_mode: "strict"  # strict, preferred, interleave
-vnuma_cpu_mode: "strict"  # strict, preferred
+vnuma_cpu_mode: "strict"  # strict, preferred
+
+# Hugepages configuration for VMs
+vm_hugepages: false
+
+# Hugepage size for VMs: 2M or 1G
+vm_hugepage_size: "1G"
+
+# Number of hugepages to allocate per VM (auto-calculated based on VM memory if not specified)
+vm_hugepage_count:
+
+# Hugepage mount path in VMs
+vm_hugepage_mount: "/mnt/hugepages"
+
+# Enable IGB NICs for VMs
+vm_igb_nics: false
\ No newline at end of file
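A minimal vars sketch for hugepage-backed VMs using these defaults (values illustrative; leaving `vm_hugepage_count` unset lets the create role calculate it from the VM's memory):

```yaml
vm_hugepages: true
vm_hugepage_size: "1G"
# vm_hugepage_count: 16   # optional override; otherwise auto-calculated
vm_igb_nics: false
```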
diff --git a/ansible/roles/hv-vm-create/tasks/main.yml b/ansible/roles/hv-vm-create/tasks/main.yml
index 875407a8..f78add51 100644
--- a/ansible/roles/hv-vm-create/tasks/main.yml
+++ b/ansible/roles/hv-vm-create/tasks/main.yml
@@ -20,6 +20,33 @@
   set_fact:
     hv_vm_cpu_count: "{{ hostvars[inventory_hostname]['cpus'] }}"
 
+- name: Configure VM hugepages
+  when: vm_hugepages
+  block:
+    - name: Calculate hugepages needed for VM if not specified
+      set_fact:
+        calculated_vm_hugepage_count: "{{ (hostvars[inventory_hostname]['memory'] | int) // (vm_hugepage_size[:-1] | int) }}"
+      when: vm_hugepage_count is not defined or vm_hugepage_count == ""
+
+    - name: Set hugepage count for VM
+      set_fact:
+        vm_hugepages_needed: "{{ vm_hugepage_count if vm_hugepage_count is defined and vm_hugepage_count != '' else calculated_vm_hugepage_count }}"
+
+    - name: Check host hugepages availability
+      shell: |
+        grep -E "HugePages_Free.*{{ vm_hugepage_size }}" /proc/meminfo | awk '{print $2}' || echo "0"
+      register: host_hugepages_free
+      delegate_to: "{{ hostvars[inventory_hostname]['ansible_host'] }}"
+
+    - name: Validate sufficient hugepages available
+      fail:
+        msg: "Not enough {{ vm_hugepage_size }} hugepages available on host {{ hostvars[inventory_hostname]['ansible_host'] }}. Need: {{ vm_hugepages_needed }}, Available: {{ host_hugepages_free.stdout }}"
+      when: (host_hugepages_free.stdout | int) < (vm_hugepages_needed | int)
+
+    - name: Display hugepages configuration for VM
+      debug:
+        msg: "VM {{ inventory_hostname }} will use {{ vm_hugepages_needed }} {{ vm_hugepage_size }} hugepages ({{ (vm_hugepages_needed | int) * (vm_hugepage_size[:-1] | int) }}G total)"
+
 - name: Set vNUMA configuration tasks
   when: vnuma_enabled
   block:
diff --git a/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 b/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2
index df33d85a..197e2fca 100644
--- a/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2
+++ b/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2
@@ -3,6 +3,13 @@
   {{ hostvars[inventory_hostname]['domain_uuid'] }}
   {{ hostvars[inventory_hostname]['memory'] }}
   {{ hostvars[inventory_hostname]['memory'] }}
+{% if vm_hugepages %}
+
+
+
+
+
+{% endif %}
   {{ hv_vm_cpu_count | int }}
     hvm
@@ -11,6 +18,9 @@
+{% if vm_igb_nics | default(false) %}
+
+{% endif %}
 {% if vnuma_enabled %}
@@ -125,6 +135,20 @@
 {% endif %}
+{% if vm_igb_nics | default(false) %}
+{% for i in range(1, 6) %}
+
+{% set mac_prefix = "%s:%02x" | format('52:54:00',i) %}
+
+
+
+
+{% endfor %}
+
+
+
+{% endif %}
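Only the Jinja control lines of these template hunks are visible above; the XML elements themselves did not survive into the flattened diff. For orientation, a typical libvirt stanza for 1G hugepage-backed guest memory (not necessarily the exact markup used in this template) looks like:

```xml
<memoryBacking>
  <hugepages>
    <page size="1" unit="G"/>
  </hugepages>
</memoryBacking>
```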
diff --git a/ansible/roles/hv-vm-destroy/tasks/main.yml b/ansible/roles/hv-vm-destroy/tasks/main.yml
new file mode 100644
index 00000000..28049f8f
--- /dev/null
+++ b/ansible/roles/hv-vm-destroy/tasks/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Stop all vms
+  shell:
+    for i in $(virsh list --all --name | grep vm) ; do virsh destroy $i ; done
+  become: true
diff --git a/ansible/roles/hv-vm-start/tasks/main.yml b/ansible/roles/hv-vm-start/tasks/main.yml
new file mode 100644
index 00000000..e278efff
--- /dev/null
+++ b/ansible/roles/hv-vm-start/tasks/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Start one vm
+  shell:
+    for i in $(virsh list --all --name --state-shutoff | grep vm | head -1) ; do virsh start $i ; done
+  become: true
diff --git a/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml b/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml
index f2be9be4..853ff866 100644
--- a/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml
+++ b/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml
@@ -1,15 +1,5 @@
 ---
-- name: Set Facts to recurse with
-  set_fact:
-    r_qry: "{{ qry }}"
-    r_worker_counter: "{{ worker_counter }}"
-
-- name: Set KUBECONFIG path based on cluster type
-  set_fact:
-    cluster_kubeconfig: "{{ bastion_cluster_config_dir }}/{{ 'kubeconfig' if cluster_type != 'sno' else groups['sno'][0] + '/kubeconfig' }}"
-
-- name: approve CSRs and check if nodes have joined the cluster
-  block:
+- block:
   - name: Increment the retry count
     set_fact:
       retry: "{{ 0 if retry is undefined else retry | int + 1 }}"
@@ -19,35 +9,21 @@
       seconds: "30"
     when: retry|int > 0
 
-  - name: Get CSRs
+  - name: Get Pending CSRs
     shell: |
-      KUBECONFIG={{ cluster_kubeconfig }} oc get csr -o json
+      KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc get csr --no-headers | grep Pending | awk '{ print $1 }'
     register: oc_get_csr
 
   - name: Approve pending CSRs
     shell: |
-      KUBECONFIG={{ cluster_kubeconfig }} oc adm certificate approve {{ item.metadata.name }}
-    loop: "{{ oc_get_csr.stdout | from_json | json_query(qry) }}"
-    loop_control:
-      label: "{{ item.metadata.name }}"
-
-  - name: Get worker node count
-    shell: |
-      KUBECONFIG={{ cluster_kubeconfig }} oc get nodes | {{ worker_counter }}
-    register: oc_get_nodes_workers
-
-  - name: Current Worker Node Count
-    debug:
-      var: oc_get_nodes_workers.stdout
-
-  - name: Waiting for Worker Node Count
-    debug:
-      msg: "{{ current_worker_count+scale_out_count }}"
+      KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc adm certificate approve {{ item }}
+    loop: "{{ oc_get_csr.stdout_lines }}"
+    when: oc_get_csr.stdout_lines | length > 0
 
-  - name: Raise fail to trigger retry if all nodes didn't meet requirments
+  - name: Raise fail to trigger retry if CSRs still Pending
     fail:
-      msg: All nodes have not met check requirements
-    when: oc_get_nodes_workers.stdout|int < current_worker_count+scale_out_count
+      msg: CSRs still pending. Try again
+    when: oc_get_csr.stdout_lines | length > 0
   rescue:
     - name: Fail on maximum retry count
       fail:
@@ -56,6 +32,3 @@
 
     - name: Retry the check
       include_tasks: check_nodes_joined.yml
-      vars:
-        qry: "{{ r_qry }}"
-        worker_counter: "{{ r_worker_counter }}"
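As a rough illustration of what the simplified retry loop above operates on, the pending-CSR query and approval look like this when run by hand (CSR names and output are illustrative):

```console
$ KUBECONFIG=/path/to/kubeconfig oc get csr --no-headers | grep Pending | awk '{ print $1 }'
csr-8vnps
csr-kq7zd
$ KUBECONFIG=/path/to/kubeconfig oc adm certificate approve csr-8vnps csr-kq7zd
```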
diff --git a/ansible/vars/hv.sample.yml b/ansible/vars/hv.sample.yml
index b50b2551..288e4959 100644
--- a/ansible/vars/hv.sample.yml
+++ b/ansible/vars/hv.sample.yml
@@ -48,3 +48,38 @@ hv_vm_manifest_acm_cr: true
 use_bastion_registry: false
 # Provide pull-secret for connected manifests
 pull_secret: "{{ lookup('file', '../pull-secret.txt') | b64encode }}"
+
+################################################################################
+# Hugepages Configuration
+################################################################################
+
+# Enable hugepages on hypervisors
+enable_hugepages: false
+
+# Hugepage size for hypervisors: 2M or 1G
+hugepage_size: "1G"
+
+# Number of hugepages to allocate on hypervisors (e.g., 64 for 64GB of 1G hugepages)
+# Calculate based on total memory and VM requirements
+hugepage_count: 64
+
+# Additional kernel parameters for performance tuning
+additional_kernel_params:
+  - "intel_iommu=on"
+  - "iommu=pt"
+  - "isolcpus=2-15,18-31"
+
+# Enable hugepages for VMs
+vm_hugepages: false
+
+# Hugepage size for VMs (should match hypervisor hugepage_size)
+vm_hugepage_size: "1G"
+
+# Number of hugepages per VM (auto-calculated based on VM memory if not specified)
+# vm_hugepage_count: 18
+
+# Enable vNUMA for performance (recommended with hugepages)
+vnuma_enabled: false
+
+# Enable IGB NICs for VMs
+vm_igb_nics: false
\ No newline at end of file
diff --git a/ansible/vars/lab.yml b/ansible/vars/lab.yml
index 7f7ca2d4..81d54250 100644
--- a/ansible/vars/lab.yml
+++ b/ansible/vars/lab.yml
@@ -218,7 +218,7 @@ hw_vm_counts:
       nvme0n1: 12
     r650:
       default: 4
-      nvme0n1: 23
+      nvme0n1: 16
     r660:
       default: 4
       nvme0n1: 23
diff --git a/ansible/vars/scale_out.sample.yml b/ansible/vars/scale_out.sample.yml
index 30133fef..4b0b7990 100644
--- a/ansible/vars/scale_out.sample.yml
+++ b/ansible/vars/scale_out.sample.yml
@@ -3,9 +3,9 @@
 # This assumes they are all listed in the worker inventory
 # group. This varable is an offset used to skip worker node
 # records in the worker inventory group.
-current_worker_count: 120
+current_worker_count: 3
 
 # Set this to the number of worker nodes being added to the
 # cluster. At minimum, current_worker_count + scale_out_count
 # inventory records must exist in the inventory file.
-scale_out_count: 100
+scale_out_count: 3
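A quick worked example of the offset logic described in the comments above: with `current_worker_count: 3` and `scale_out_count: 3`, the scale-out play skips the first 3 records in the `[worker]` inventory group and adds the next 3, so the inventory must contain at least 6 worker records.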
diff --git a/ansible/vm-sriov-disable.yml b/ansible/vm-sriov-disable.yml
new file mode 100644
index 00000000..9fdba3c2
--- /dev/null
+++ b/ansible/vm-sriov-disable.yml
@@ -0,0 +1,25 @@
+---
+# Disables igb VFs from attempting to connect, which never succeeds and thus drives up CPU across all the workers.
+#
+# Expects an inventory that contains only the [worker] group. In the normally generated inventory, workers appear
+# under both [worker] and [hv_vm], and the [hv_vm] records carry variables that change how the node is accessed.
+#
+# Example Usage:
+#
+# ansible-playbook -i ansible/inventory/cloud42.local ansible/vm-sriov-disable.yml
+#
+
+- name: Disable all fake sr-iov devices and connections
+  gather_facts: false
+  hosts: worker
+  tasks:
+    - name: devices down
+      shell:
+        for i in {5..9} ; do for j in {0..6} ; do nmcli d down enp${i}s0v${j} ; done ; done
+      become: true
+      ignore_errors: true
+
+    - name: connections autoconnect off
+      shell:
+        for i in $( nmcli conn show | grep "Wired connection" | awk '{ print $4 }' ) ; do nmcli conn mod $i connection.autoconnect no ; done
+      become: true
+      ignore_errors: true
diff --git a/docs/deploy-vmno.md b/docs/deploy-vmno.md
index 109a0a1f..2bae6e56 100644
--- a/docs/deploy-vmno.md
+++ b/docs/deploy-vmno.md
@@ -123,9 +123,21 @@ hw_vm_counts:
     nvme0n1: 7
 ```
 
+When mixing different machine models, the `hw_vm_counts` values may be adjusted per model so that every hypervisor hosts the same number of VMs. For example, when mixing Dell r640 and r650 in ScaleLab, the following counts were used:
+
+```yaml
+hw_vm_counts:
+  scalelab:
+    r650:
+      default: 4
+      nvme0n1: 16
+```
+
 > [!NOTE]
 > Depending upon your hardware, you may have to parition and format a 2nd disk to help store VM disk files.
 
+In some VM scenarios, hugepages may be required. Enable hugepages on the hypervisors with the variable `enable_hugepages` and tune the specifics with the related variables in `ansible/roles/hv-install/defaults/main.yml`; hugepage backing for the VMs themselves is controlled by `vm_hugepages` in `ansible/roles/hv-vm-create/defaults/main.yml`.
+
 ## Configure Ansible vars in `hv.yml`
 
 ```console
@@ -484,3 +496,25 @@ vm00008   Ready    worker   1d   v1.31.7
 (.ansible) [root@ jetlag]# cat /root/vmno/kubeadmin-password
 xxxxx-xxxxx-xxxxx-xxxxx
 ```
+
+## Additional helper playbooks for VM management
+
+If VMs become unresponsive, sometimes destroying and restarting them is the only remedy. Because garbage collection of the pods from every VM on a single hypervisor at once can cause stalling, it can also be useful to start one VM per hypervisor at a time. Playbooks have been added for both tasks.
+
+See the following playbooks to help in these cases:
+```
+ansible/hv-vm-stop-all.yml
+ansible/hv-vm-start-one.yml
+```
+
+## Disabling NetworkManager devices and connections for SR-IOV devices on VMs
+
+One option for creating SR-IOV capable interfaces in a VM is to create them using the Intel IGB driver.
+This may be achieved by setting the variable `vm_igb_nics: true` in your variables.
+
+**Please note:** When VMs are created with SR-IOV devices using the IGB driver, the devices and connections never fully initialize. NetworkManager repeatedly attempts to start them, which results in a large amount of churn on the VMs. A workaround is to force the devices down and turn autoconnect off for the connections created for these interfaces.
+
+See the following playbook:
+```
+ansible/vm-sriov-disable.yml
+```
\ No newline at end of file
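The helper playbooks above follow the same invocation pattern as the other playbooks in this repo; a hedged example (inventory path illustrative):

```console
ansible-playbook -i ansible/inventory/cloud42.local ansible/hv-vm-stop-all.yml
ansible-playbook -i ansible/inventory/cloud42.local ansible/hv-vm-start-one.yml
```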
diff --git a/docs/hypervisors.md b/docs/hypervisors.md
index d050123b..1d605a49 100644
--- a/docs/hypervisors.md
+++ b/docs/hypervisors.md
@@ -112,16 +112,18 @@ Check if the servers in your allocation support NUMA config:
 ```
 
 Example output indicating NUMA support:
+```console
 NUMA node(s):           2
 NUMA node0 CPU(s):      0-11,24-35
 NUMA node1 CPU(s):      12-23,36-47
+```
 
-Add this var to your ansible/vars/all.yml file to enable vnuma config for virtual deployments:
+Add this var to your `ansible/vars/all.yml` file to enable vnuma config for virtual deployments:
 
 ```yaml
 vnuma_enabled: true
 ```
 
-Refer to ansible/roles/hv-vm-create/defaults/main.yml for other vNUMA configuration options. 
+Refer to `ansible/roles/hv-vm-create/defaults/main.yml` for other vNUMA configuration options.
 
 ## Create/Delete/Replace VMs
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 2d47c9a1..c3bc5ba4 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -12,6 +12,7 @@ _**Table of Contents**_
   - [Failed on Wait for cluster to be ready](#failed-on-wait-for-cluster-to-be-ready)
   - [Failed on Adjust by-path selected install disk](#failed-on-adjust-by-path-selected-install-disk)
   - [Failed on Insert Virtual Media](#failed-on-insert-virtual-media)
+  - [Failing ImagePull due to Pull Secret](#failing-imagepull-due-to-deactivated-pull-secret)
 - [Bastion](#bastion)
   - [Accessing services](#accessing-services)
   - [Clean all container services / podman pods](#clean-all-container-services--podman-pods)
@@ -274,6 +275,18 @@ racadm>>set iDRAC.VirtualMedia.Attached
 Object value modified successfully
 ```
 
+## Failing ImagePull due to Deactivated Pull Secret
+
+If a cluster has been running for some time or has changed hands between owners, there is a chance the pull secret supplied at install time has expired.
+Updating the pull secret by the standard means (e.g. `oc edit -n openshift-config secret/pull-secret`) will not work in this state.
+
+If the cluster is degraded enough as a result, the control plane will not be able to roll the new pull secret out to the kubelets automatically.
+
+For this emergency scenario, a helper playbook copies a fresh pull secret directly to the nodes:
+```
+ansible-playbook ansible/copy-pull-secret.yml
+```
+
 # Bastion
 
 ## Accessing services
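After running the playbook, one way to confirm that the refreshed secret reached a node is to check the modification time of the kubelet config file the role writes (node name illustrative):

```console
oc debug node/vm00001 -- chroot /host stat -c '%y' /var/lib/kubelet/config.json
```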