Skip to content

Commit 3df7f67

Browse files
author
Bharat Kunwar
authored
Merge pull request #10 from stackhpc/drain-resume
Drain/resume toggle (v0.4.0)
2 parents 3eadb46 + 54509da commit 3df7f67

File tree

5 files changed

+139
-21
lines changed

5 files changed

+139
-21
lines changed

README.md

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,42 @@ Role Variables
1919

2020
`openhpc_packages`: additional OpenHPC packages to install
2121

22-
`openhpc_enable`:
22+
`openhpc_enable`:
2323
* `control`: whether to enable control host
24-
* `batch`: whether to enable compute nodes
24+
* `batch`: whether to enable compute nodes
2525
* `runtime`: whether to enable OpenHPC runtime
26+
* `drain`: whether to drain compute nodes
27+
* `resume`: whether to resume compute nodes
2628

27-
Example Playbook
29+
Example Inventory
30+
-----------------
31+
32+
And an Ansible inventory like this:
33+
34+
[openhpc_login]
35+
openhpc-login-0 ansible_host=10.60.253.40 ansible_user=centos
36+
37+
[openhpc_compute]
38+
openhpc-compute-0 ansible_host=10.60.253.31 ansible_user=centos
39+
openhpc-compute-1 ansible_host=10.60.253.32 ansible_user=centos
40+
41+
[cluster_login:children]
42+
openhpc_login
43+
44+
[cluster_control:children]
45+
openhpc_login
46+
47+
[cluster_batch:children]
48+
openhpc_compute
49+
50+
Example Playbooks
2851
----------------
29-
52+
3053
To deploy, create a playbook which looks like this:
3154

3255
---
3356
- hosts:
57+
- cluster_login
3458
- cluster_control
3559
- cluster_batch
3660
become: yes
@@ -53,19 +77,52 @@ To deploy, create a playbook which looks like this:
5377
openhpc_packages: []
5478
...
5579

56-
Example Inventory
57-
-----------------
58-
59-
And an Ansible inventory as this:
60-
61-
[openhpc_login]
62-
openhpc-login-0 ansible_host=10.60.253.40 ansible_user=centos
6380

64-
[openhpc_compute]
65-
openhpc-compute-0 ansible_host=10.60.253.33 ansible_user=centos
81+
To drain nodes, for example, before scaling down the cluster to 6 nodes:
6682

67-
[cluster_control:children]
68-
openhpc_login
83+
---
84+
- hosts: openstack
85+
gather_facts: false
86+
vars:
87+
partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
88+
openhpc_slurm_partitions:
89+
- name: "compute"
90+
flavor: "compute-A"
91+
image: "CentOS7.5-OpenHPC"
92+
num_nodes: 6
93+
user: "centos"
94+
openhpc_cluster_name: openhpc
95+
roles:
96+
# Our stackhpc.cluster-infra role can be invoked in `query` mode which
97+
# looks up the state of the cluster by querying the Heat API.
98+
- role: stackhpc.cluster-infra
99+
cluster_name: "{{ cluster_name }}"
100+
cluster_state: query
101+
cluster_params:
102+
cluster_groups: "{{ cluster_groups }}"
103+
tasks:
104+
# Given that the original cluster that was created had 8 nodes and the
105+
# cluster we want to create has 6 nodes, the computed desired_state
106+
# variable stores the list of instances to leave untouched.
107+
- name: Count the number of compute nodes per slurm partition
108+
set_fact:
109+
desired_state: "{{ (( partition | first).nodes | map(attribute='name') | list )[:item.num_nodes] + desired_state | default([]) }}"
110+
when: partition | length > 0
111+
with_items: "{{ openhpc_slurm_partitions }}"
112+
- debug: var=desired_state
113+
114+
- hosts: cluster_batch
115+
become: yes
116+
vars:
117+
desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
118+
roles:
119+
# Now, the stackhpc.openhpc role is invoked in drain/resume modes where
120+
# the instances in desired_state are resumed if in a drained state and
121+
# drained if in a resumed state.
122+
- role: stackhpc.openhpc
123+
openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
124+
openhpc_enable:
125+
drain: "{{ inventory_hostname not in desired_state }}"
126+
resume: "{{ inventory_hostname in desired_state }}"
127+
...
69128

70-
[cluster_batch:children]
71-
openhpc_compute

defaults/main.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
---
22
openhpc_slurm_service_enabled: true
33
openhpc_slurm_service:
4-
openhpc_slurm_control_host:
4+
openhpc_slurm_control_host: "{{ inventory_hostname }}"
55
openhpc_slurm_partitions: []
66
openhpc_cluster_name:
77
openhpc_packages: []
8+
openhpc_drain_timeout: 86400
9+
openhpc_resume_timeout: 300
10+
openhpc_retry_delay: 10
811
openhpc_enable:
912
control: false
1013
batch: false
1114
runtime: false
15+
drain: false
16+
resume: false

tasks/drain.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
---
2+
# Ansible tasks to drain a Slurm compute node. Waits for the compute node to be
3+
# drained for up to a day by default.
4+
#
5+
# Variables:
6+
# - node_to_drain: compute node to drain
7+
# - drain_timeout: seconds to wait for node to drain, default is 86400.
8+
9+
- name: Get nodes in DRAINED state
10+
command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
11+
register: drained_nodes_results
12+
changed_when: false
13+
14+
- name: Drain compute node
15+
command: "scontrol update nodename={{ inventory_hostname }} state=DRAIN reason='maintenance'"
16+
when: inventory_hostname not in drained_nodes_results.stdout_lines
17+
18+
- name: Check node has drained
19+
command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
20+
register: drained_nodes
21+
until: "inventory_hostname in drained_nodes.stdout_lines"
22+
delay: "{{ openhpc_retry_delay }}"
23+
retries: "{{ (openhpc_drain_timeout/openhpc_retry_delay) | int }}"
24+
changed_when: false

tasks/main.yml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
---
22
- include: control.yml
3-
when: openhpc_enable.control | bool
3+
when: openhpc_enable.control | default(false) | bool
44

55
- include: compute.yml
6-
when: openhpc_enable.batch | bool
6+
when: openhpc_enable.batch | default(false) | bool
77

88
- include: runtime.yml
9-
when: openhpc_enable.runtime | bool
9+
when: openhpc_enable.runtime | default(false) | bool
10+
11+
- include: drain.yml
12+
when: openhpc_enable.drain | default(false) | bool
13+
delegate_to: "{{ openhpc_slurm_control_host }}"
14+
15+
- include: resume.yml
16+
when: openhpc_enable.resume | default(false) | bool
17+
delegate_to: "{{ openhpc_slurm_control_host }}"
1018
...

tasks/resume.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
---
2+
# Ansible tasks to resume a Slurm compute node. Waits for the compute node to
3+
# change state for up to 5 minutes by default.
4+
#
5+
# Variables:
6+
# - nodes_to_resume: compute node to resume
7+
# - resume_timeout: seconds to wait for node to resume, default is 300.
8+
9+
- name: Get nodes in ALLOC,IDLE states
10+
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
11+
register: resumed_nodes_results
12+
changed_when: false
13+
14+
- name: Resume compute node
15+
command: "scontrol update nodename={{ inventory_hostname }} state=RESUME"
16+
when: inventory_hostname not in resumed_nodes_results.stdout_lines
17+
18+
- name: Check node has resumed
19+
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
20+
register: resumed_nodes
21+
until: "inventory_hostname in resumed_nodes.stdout_lines"
22+
delay: "{{ openhpc_retry_delay }}"
23+
retries: "{{ (openhpc_resume_timeout/openhpc_retry_delay) | int }}"
24+
changed_when: false

0 commit comments

Comments
 (0)