Skip to content

Commit eabf59b

Browse files
authored
Add support for Node Health Checks (#654)
* add nhc * change nhc configuration from replace to lineinfile * move things around * revert doca move * revert slurm changes * tweak NHC docs * remove debugging stop * rename final playbook and let dnf repos work for post-hook in both site and fatimage * note returntoservice bug * revert slurm playbook changes * revert ordering change for repo disable - see issue 708 * bump CI image * fix exporting NHC config * remember compute-init config is synced to /var/tmp * enable NHC during rebuild for stackhpc * fix path to shared compute-init files for NHC * don't write compute-init share into fstab for reliability - it is unmounted once synced * bump CI image * change NHC to use templating instead of autoconfiguration * fix nhc task file from compute-init * bump CI image * fix NHC configuration directory * bump CI image * fix NHC nodename/hostname mismatch * remove un-needed nhc conf dir tasks * bump CI image * Revert "bump CI image" 4def7a5 This reverts commit 4def7a5. * Revert "remove un-needed nhc conf dir tasks" c38f9fd This reverts commit c38f9fd. * fix nhc mid-upgrade * bump CI image
1 parent 6a19aa8 commit eabf59b

File tree

24 files changed

+210
-38
lines changed

24 files changed

+210
-38
lines changed

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,5 @@ roles/*
9090
!roles/gateway/**
9191
!roles/alertmanager/
9292
!roles/alertmanager/**
93+
!roles/nhc/
94+
!roles/nhc/**

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134

135135
- hosts: dnf_repos
136136
become: yes
137+
tags: dnf_repos
137138
tasks:
138139
- name: Check that creds won't be leaked to users
139140
ansible.builtin.assert:

ansible/disable-repos.yml

Lines changed: 0 additions & 8 deletions
This file was deleted.

ansible/extras.yml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,6 @@
5858
- import_role:
5959
name: persist_hostkeys
6060

61-
62-
- name: Setup NFS export for compute node configuration
63-
hosts: compute_init:!builder
64-
# NB: has to be after eeesi and os-manila-mount
65-
tags: compute_init
66-
become: yes
67-
tasks:
68-
- include_role:
69-
name: compute_init
70-
tasks_from: export.yml
71-
7261
- name: Install k9s
7362
become: yes
7463
hosts: k9s

ansible/fatimage.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@
257257
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
258258
when: hook_path | exists
259259

260-
- import_playbook: disable-repos.yml
260+
- import_playbook: final.yml
261261

262262
- hosts: builder
263263
become: yes

ansible/final.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
- hosts: dnf_repos
2+
become: yes
3+
tags: dnf_repos
4+
tasks:
5+
- name: Disable pulp repos
6+
ansible.builtin.include_role:
7+
name: dnf_repos
8+
tasks_from: disable_repos.yml
9+
10+
- name: Setup NFS export for compute_init
11+
hosts: compute_init:!builder
12+
# NB: done last so other roles can prepare configuration etc
13+
tags: compute_init
14+
become: yes
15+
tasks:
16+
- include_role:
17+
name: compute_init
18+
tasks_from: export.yml

ansible/roles/compute_init/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ it also requires an image build with the role name added to the
8484
| slurm.yml | openhpc [10] | All slurmd functionality | No |
8585
| slurm.yml | (set memory limits) | Fully supported | No |
8686
| slurm.yml | (block ssh) | Fully supported | No |
87+
| slurm.yml | nhc | Fully supported | No |
8788
| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a |
8889
| portal.yml | (openondemand vnc desktop) | None required - use image build | No |
8990
| portal.yml | (openondemand jupyter server) | None required - use image build | No |

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
2020
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
2121
enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
22+
enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"
2223

2324
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2425
resolv_conf_nameservers: []
@@ -63,12 +64,12 @@
6364
mode: u=rX,g=rwX,o=
6465

6566
- name: Mount /mnt/cluster
66-
mount:
67+
ansible.posix.mount:
6768
path: /mnt/cluster
6869
src: "{{ server_node_ip }}:/exports/cluster"
6970
fstype: nfs
7071
opts: ro,sync
71-
state: mounted
72+
state: ephemeral # will be unmounted after sync, don't want it in fstab
7273
register: _mount_mnt_cluster
7374
ignore_errors: true
7475
# exits from playbook if this failed below, allowing ansible-init to
@@ -350,6 +351,11 @@
350351
enabled: true
351352
state: started
352353

354+
- name: Provide NHC configuration
355+
ansible.builtin.include_role:
356+
name: nhc
357+
tasks_from: boot.yml
358+
when: enable_nhc
353359

354360
- name: Ensure node is resumed
355361
# TODO: consider if this is always safe for all job states?

ansible/roles/compute_init/tasks/export.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,9 @@
9898
name: sshd
9999
tasks_from: export.yml
100100
when: "'sshd' in group_names"
101+
102+
- name: Export generated NHC config
103+
import_role:
104+
name: nhc
105+
tasks_from: export.yml
106+
when: "'nhc' in group_names"

ansible/roles/compute_init/tasks/install.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
dest: roles/
5050
- src: ../../lustre
5151
dest: roles/
52+
- src: ../../nhc
53+
dest: roles/
5254

5355
- name: Add filter_plugins to ansible.cfg
5456
lineinfile:

0 commit comments

Comments
 (0)