Skip to content

Commit a68cc2d

Browse files
committed
Changes from 500 VMs Hybrid work
Adds VM recovery playbooks Enable hugepages on the hypervisor and VM configuration. Add playbook to disable devices created for virtual functions Working changes to configure hugetlb Complaint of a missing var, this import seems required Improve CSR approve and node Ready wait loop Add interfaces that can be used as virtual functions But they seemed to generate a lot of iowait activity on the VMs, so I don't know whether something is wrong with them or not Some changes were generated using Cursor and the claude-4-sonnet model. Signed-off-by: Andrew Collins <[email protected]>
1 parent a6e9f30 commit a68cc2d

File tree

27 files changed

+528
-54
lines changed

27 files changed

+528
-54
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ out
1515
gen
1616
.idea/
1717
.idea/workspace.xml
18-
18+
*.log
19+
*.orig

ansible.cfg

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
[defaults]
2-
interpreter_python=auto
2+
interpreter_python=auto_silent
33
callbacks_enabled = profile_tasks
4+
deprecation_warnings = False
5+
log_path = ~/.ansible/jetlag-ansible.log
6+
display_args_to_stdout = True

ansible/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
smcipmitool.tar.gz
2+
*.sw*

ansible/copy-pull-secret.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
2+
# Copy pull secret playbook
3+
#
4+
# This playbook is used to copy the pull secret to the nodes in the cluster.
5+
# It is used to update the pull secret on nodes to pull images from the Red Hat registry.
6+
#
7+
# Example Usage:
8+
#
9+
# ansible-playbook ansible/copy-pull-secret.yml
10+
#
11+
12+
# Apply the copy-pull-secret role to every host in the hv_vm inventory group.
- name: Copies pull secret to nodes
  hosts: hv_vm
  roles:
    - role: copy-pull-secret

ansible/hv-vm-start-one.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
# Run the hv-vm-start role on the hypervisors (hv group); fact gathering is
# skipped for this play.
- name: start one VMs
  hosts: hv
  gather_facts: false
  roles:
    - role: hv-vm-start

ansible/hv-vm-stop-all.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
# Run the hv-vm-destroy role on the hypervisors (hv group).
- name: destroy all VMs
  hosts: hv
  roles:
    - role: hv-vm-destroy

ansible/mno-add-vm-workers.yml

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
---
2+
# Add VM workers to an existing cluster installed with the Assisted Installer
3+
#
4+
# Example Usage:
5+
#
6+
# ansible-playbook -i ansible/inventory/cloud42.local ansible/mno-add-vm-workers.yml
7+
#
8+
9+
# Switch the cluster to accept additional workers and extend the infra-env's
# static network configuration with entries for the VMs being added.
- name: Prep cluster to add hosts
  hosts: bastion
  vars_files:
    - vars/lab.yml
    - vars/all.yml
  gather_facts: false
  tasks:
    - name: Set assisted installer connection
      set_fact:
        assisted_installer_host: "{{ groups['bastion'][0] }}"
        assisted_installer_port: "8090"

    # Fails unless the cluster is already installed or already accepting hosts.
    - name: Get cluster status
      uri:
        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}"
        method: GET
        body_format: json
        status_code: [200]
        return_content: true
      register: cluster_data
      failed_when: cluster_data.json.status not in ['installed', 'adding-hosts']

    # Skipped when the cluster is already in the adding-hosts state.
    - name: Set cluster status to adding-hosts
      uri:
        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}/actions/allow-add-workers"
        method: POST
        body_format: json
        status_code: [201, 202]
      when: cluster_data.json.status == 'installed'

    # The infra-env id is read from the first host listed on the cluster.
    - name: Get infra-env
      uri:
        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}"
        method: GET
        body_format: json
        status_code: [200]
        return_content: true
      register: infra_env_return

    - name: Set ai_infraenv_id
      set_fact:
        ai_infraenv_id: "{{ infra_env_return.json.hosts[0].infra_env_id }}"

    # Note: this overwrites infra_env_return with the infra-env response.
    - name: Get infra-env static_network_config
      uri:
        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/infra-envs/{{ ai_infraenv_id }}"
        method: GET
        body_format: json
        status_code: [200]
        return_content: true
      register: infra_env_return

    # Fall back to an empty list when the infra-env has no static network
    # config (missing, null or empty), so the concatenation below cannot fail.
    - name: Set ai_infraenv_static_config
      set_fact:
        ai_infraenv_static_config: "{{ infra_env_return.json.static_network_config | default([], true) }}"

    - name: Set empty static network configuration
      set_fact:
        static_network_config: []

    # Runs once per VM for the first add_worker_count hosts of hv_vm.
    # Presumably each iteration appends to static_network_config; confirm in
    # the create-ai-cluster role.
    - name: Generate Static Network Config for VMs
      ansible.builtin.include_role:
        name: create-ai-cluster
        tasks_from: static_network_config
      vars:
        hybrid_worker_count: "{{ add_worker_count }}"
      loop: "{{ groups['hv_vm'][:hybrid_worker_count | int] }}"

    - name: show ai_infraenv_static_config
      debug:
        var: ai_infraenv_static_config

    - name: show static_network_config
      debug:
        var: static_network_config

    # New VM entries first, followed by what the infra-env already had.
    - name: Set static network composite
      set_fact:
        static_network_config_comp: "{{ static_network_config + ai_infraenv_static_config }}"

    - name: show static_network_config composite
      debug:
        var: static_network_config_comp

    # Send the composite computed above rather than recomputing it.
    - name: Update static config
      uri:
        url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/infra-envs/{{ ai_infraenv_id }}"
        body:
          static_network_config: "{{ static_network_config_comp }}"
        method: PATCH
        body_format: json
        status_code: [201]
        return_content: true
103+
104+
105+
# Generate the discovery ISO, boot the first add_worker_count VMs of hv_vm
# from it, wait for them to be discovered, then install them as added hosts.
- name: Boot / Install VMs
  hosts: bastion
  vars_files:
    - vars/lab.yml
    - vars/all.yml
  roles:
    - role: generate-discovery-iso
    - role: boot-iso
      vars:
        inventory_group: hv_vm
        index: "{{ add_worker_count }}"
        virtual_media_iso: "discovery.iso"
    # Both lists select the same slice of the hv_vm group.
    - role: wait-hosts-discovered
      vars:
        inventory_nodes: "{{ groups['hv_vm'][:add_worker_count|int] }}"
        discover_nodes: "{{ groups['hv_vm'][:add_worker_count|int] }}"
    - role: add-hosts-install
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
# The pull secret contains registry credentials, so install it owned by root
# and readable by root only instead of relying on default permissions.
- name: Copy pull secret
  copy:
    src: pull-secret.json
    dest: "/var/lib/kubelet/config.json"
    owner: root
    group: root
    mode: "0600"
  become: true

# NOTE(review): presumably this marker file makes the machine-config-daemon
# accept the changed file and force an update; confirm.
- name: touch force update
  file:
    path: /run/machine-config-daemon-force
    state: touch
  become: true
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
---
2+
# hv-install default vars
3+
4+
# Hugepages configuration for hypervisors
5+
enable_hugepages: false
6+
7+
# Hugepage size: 2M or 1G
8+
hugepage_size: "1G"
9+
10+
# Number of hugepages to allocate (e.g., 32 for 32GB of 1G hugepages)
11+
hugepage_count: 32
12+
13+
# Additional kernel parameters for performance tuning
14+
additional_kernel_params: []
15+
16+
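# Number of hugepages to reserve per node (e.g. total / 2)
# NOTE(review): the default of 190 does not match hugepage_count (32) / 2; confirm which value is intended.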
17+
hugepages_count_per_node: 190

ansible/roles/hv-install/tasks/main.yml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,55 @@
2121
name: sushy-tools
2222
version: 1.2.0
2323

24+
# Hugepages setup, executed only when enable_hugepages is true:
#   1. add the hugepage kernel arguments with grubby (only when missing),
#   2. install the boot-time reservation unit and its helper script,
#   3. set hugepages_reboot_required when a change needs a reboot,
#   4. enable the reservation unit.
- name: Configure hugepages support
  when: enable_hugepages
  block:

    # Read-only lookup of the default kernel entry. It lets the grubby update
    # below be skipped when the arguments are already present; otherwise the
    # command always reports "changed" and forces a reboot on every run.
    - name: Check default kernel arguments for hugepages
      command: grubby --info=DEFAULT
      register: grubby_info
      changed_when: false
      check_mode: false

    - name: Run grubby to add hugepages arguments
      command: grubby --update-kernel=ALL --args="default_hugepagesz={{ hugepage_size }} hugepagesz={{ hugepage_size }}"
      register: grub_updated
      when: ('default_hugepagesz=' ~ hugepage_size) not in grubby_info.stdout

    # NOTE(review): ConditionKernelCommandLine is hard-coded to 1G, so the
    # unit is skipped at boot when hugepage_size is set to anything else.
    # Confirm whether 2M pages are meant to be reserved by this unit too.
    - name: Create hugetlb-gigantic-pages.service file
      copy:
        dest: /usr/lib/systemd/system/hugetlb-gigantic-pages.service
        mode: "0644"
        content: |
          [Unit]
          Description=HugeTLB Gigantic Pages Reservation
          DefaultDependencies=no
          Before=dev-hugepages.mount
          ConditionPathExists=/sys/devices/system/node
          ConditionKernelCommandLine=hugepagesz=1G

          [Service]
          Type=oneshot
          RemainAfterExit=yes
          ExecStart=/usr/lib/systemd/hugetlb-reserve-pages.sh

          [Install]
          WantedBy=sysinit.target

    - name: Create hugetlb-reserve-pages.sh
      template:
        src: hugetlb-reserve-pages.sh.j2
        dest: /usr/lib/systemd/hugetlb-reserve-pages.sh
        mode: "0755"
      register: hugetlb_script

    # One flag task replaces the two identically named ones: a reboot is
    # needed when the kernel arguments or the reservation script changed.
    - name: Set reboot required flag
      set_fact:
        hugepages_reboot_required: true
      when: grub_updated.changed or hugetlb_script.changed

    # daemon_reload makes systemd pick up the unit file written above.
    - name: Enable hugetlb-gigantic-pages.service
      systemd:
        enabled: true
        daemon_reload: true
        name: hugetlb-gigantic-pages.service
72+
2473
- name: Get coredns
2574
get_url:
2675
validate_certs: false
@@ -65,3 +114,21 @@
65114
state: started
66115
enabled: true
67116
name: ksmtuned
117+
118+
# Reboot only when hugepages are enabled and an earlier task set
# hugepages_reboot_required; then report the resulting hugepage counters.
- name: Reboot hypervisor for hugepages configuration
  when:
    - enable_hugepages
    - hugepages_reboot_required | default(false)
  block:
    - name: Reboot hypervisor
      reboot:
        msg: "Rebooting to apply hugepages configuration"
        reboot_timeout: 600

    # Read-only check: grep the file directly (no shell pipeline needed) and
    # never report a change.
    - name: Verify hugepages are configured
      command: grep -E "HugePages_Total|HugePages_Free|Hugepagesize" /proc/meminfo
      register: hugepages_status
      changed_when: false

    - name: Display hugepages status
      debug:
        msg: "{{ hugepages_status.stdout_lines }}"

0 commit comments

Comments
 (0)