From fc48709f18241f1053d823a35167ccf1069bc0cd Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 9 Jul 2025 15:13:14 +0100 Subject: [PATCH 01/10] Added topology aware scheduled for compute VMs --- ansible/.gitignore | 2 + ansible/roles/topology/defaults/main.yml | 2 + ansible/roles/topology/library/map_hosts.py | 98 +++++++++++++++++++ ansible/roles/topology/tasks/main.yml | 16 +++ .../roles/topology/templates/topology.conf.j2 | 13 +++ ansible/slurm.yml | 3 + .../inventory/group_vars/all/openhpc.yml | 1 + 7 files changed, 135 insertions(+) create mode 100644 ansible/roles/topology/defaults/main.yml create mode 100644 ansible/roles/topology/library/map_hosts.py create mode 100644 ansible/roles/topology/tasks/main.yml create mode 100644 ansible/roles/topology/templates/topology.conf.j2 diff --git a/ansible/.gitignore b/ansible/.gitignore index 6ae64c72e..62c9a543c 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -96,3 +96,5 @@ roles/* !roles/nhc/** !roles/eessi/ !roles/eessi/** +!roles/topology/ +!roles/topology/** diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml new file mode 100644 index 000000000..b44e63ac9 --- /dev/null +++ b/ansible/roles/topology/defaults/main.yml @@ -0,0 +1,2 @@ +# If set to non-empty string, will override topology.conf file auto-detected from OpenStack project +topology_topology_override: "" diff --git a/ansible/roles/topology/library/map_hosts.py b/ansible/roles/topology/library/map_hosts.py new file mode 100644 index 000000000..8c7f4431e --- /dev/null +++ b/ansible/roles/topology/library/map_hosts.py @@ -0,0 +1,98 @@ +#!/usr/bin/python + +# Copyright: (c) 2025, StackHPC +# Apache 2 License + +from ansible.module_utils.basic import AnsibleModule +import openstack + +DOCUMENTATION = """ +--- +module: map_hosts +short_description: Creates map of OpenStack VM network topology +description: + - Creates map representing the network topology tree of an OpenStack project with a 
heirarchy + of: Availability Zone -> Hypervisors/Baremetal nodes -> VMs/Baremetal instances +options: + compute_vms: + description: + - List of VM names within the target OpenStack project to include in the tree + required: true + type: str +author: + - Steve Brasier, William Tripp, StackHPC +""" + +RETURN = """ +topology: + description: + Map representing tree of project topology. Top level keys are AZ names, their values + are maps of shortened unique identifiers of hosts UUIDs to lists of VM names + returned: success + type: dict[str, dict[str,list[str]]] + sample: + "nova-az": + "afe9": + - "mycluster-compute-0" + - "mycluster-compute-1" + "00f9": + - "mycluster-compute-vm-on-other-hypervisor" +""" + +EXAMPLES = """ +- name: Get topology map + map_hosts: + compute_vms: + - mycluster-compute-0 + - mycluster-compute-1 +""" + +def min_prefix(uuids, start=4): + """ Take a list of uuids and return the smallest length >= start which keeps them unique """ + for length in range(start, len(uuids[0])): + prefixes = set(uuid[:length] for uuid in uuids) + if len(prefixes) == len(uuids): + return length + +def run_module(): + module_args = dict( + compute_vms=dict(type='list', elements='str', required=True) + ) + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + conn = openstack.connection.from_config() + + servers = [s for s in conn.compute.servers() if s["name"] in module.params["compute_vms"]] + + topo = {} + all_host_ids = [] + for s in servers: + az = s['availability_zone'] + host_id = s['host_id'] + if host_id != '': + all_host_ids.append(host_id) + if az not in topo: + topo[az] = {} + if host_id not in topo[az]: + topo[az][host_id] = [] + topo[az][host_id].append(s['name']) + + uuid_len = min_prefix(list(set(all_host_ids))) + + for az in topo: + topo[az] = dict((k[:uuid_len], v) for (k, v) in topo[az].items()) + + result = { + "changed": True, + "topology": topo, + } + + module.exit_json(**result) + + +def main(): + run_module() + + +if 
__name__ == "__main__": + main() diff --git a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml new file mode 100644 index 000000000..a0901093b --- /dev/null +++ b/ansible/roles/topology/tasks/main.yml @@ -0,0 +1,16 @@ +- name: Map instances to hosts + become: false + map_hosts: + compute_vms: "{{ groups['compute'] }}" + register: _topology + delegate_to: localhost + run_once: true + +- name: Template topology.conf + become: true + ansible.builtin.template: + src: templates/topology.conf.j2 + dest: /etc/slurm/topology.conf + owner: root + group: root + mode: 0644 diff --git a/ansible/roles/topology/templates/topology.conf.j2 b/ansible/roles/topology/templates/topology.conf.j2 new file mode 100644 index 000000000..e6eab52ed --- /dev/null +++ b/ansible/roles/topology/templates/topology.conf.j2 @@ -0,0 +1,13 @@ +# topology.conf +# Switch Configuration +{% if topology_topology_override != '' %} +{{ topology_topology_override }} +{% else %} +{% for az in _topology.topology.keys() %} +{% for instance_host in _topology.topology[az].keys() %} +SwitchName={{ instance_host }} Nodes={{ _topology.topology[az][instance_host] | join(",") }} +{% endfor %} +SwitchName={{ az }} Switches={{ _topology.topology[az].keys() | join(",") }} +{% endfor %} +SwitchName=master Switches={{ _topology.topology.keys() | join(",") }} +{% endif %} diff --git a/ansible/slurm.yml b/ansible/slurm.yml index fd3424023..4422bce6d 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -61,6 +61,9 @@ tags: - openhpc tasks: + - include_role: + name: topology + when: appliances_mode == 'configure' - include_role: name: stackhpc.openhpc tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 9c6aca272..a599176e7 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ 
b/environments/common/inventory/group_vars/all/openhpc.yml @@ -57,6 +57,7 @@ openhpc_config_default: - enable_configless TaskPlugin: task/cgroup,task/affinity ReturnToService: 2 # workaround for templating bug TODO: Remove once on stackhpc.openhpc v1.2.0 + TopologyPlugin: topology/tree # default additional slurm.conf parameters when "rebuild" enabled: openhpc_config_rebuild: From 2d80af3030449f280467f0ab3552e11b8ae05d82 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 11 Jul 2025 16:19:52 +0100 Subject: [PATCH 02/10] compute nodes must now be marked as available for topo-aware scheduling --- ansible/slurm.yml | 4 +++- environments/common/inventory/groups | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 4422bce6d..92668798f 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -63,7 +63,9 @@ tasks: - include_role: name: topology - when: appliances_mode == 'configure' + # Gated on topology group having compute nodes but role also + # needs to run on control and login nodes + when: appliances_mode == 'configure' and (groups['topology'] | length) > 0 - include_role: name: stackhpc.openhpc tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 1cc5523fb..3183649b6 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -24,6 +24,12 @@ openhpc [builder] # Do not add hosts here manually - used as part of Packer image build pipeline. See packer/README.md. 
+[topology] +# Compute nodes to be included in the Slurm topology plugin's topology tree +# Should be set to `compute` if enabled +# Note that this feature currently assumes all compute nodes are VMs, enabling +# when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour + [podman:children] # Hosts running containers for below services: opensearch From 3c9d5097a05865cecbc44083bb54c796e32fb1ba Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Jul 2025 09:54:16 +0100 Subject: [PATCH 03/10] Now allows AZ to be specified for non-BM instances --- .../{{cookiecutter.environment}}/tofu/node_group/nodes.tf | 6 ++++-- .../tofu/node_group/variables.tf | 5 ++--- .../skeleton/{{cookiecutter.environment}}/tofu/variables.tf | 6 ++++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf index 818dd95ab..f3f9ab91f 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf @@ -25,6 +25,8 @@ locals { } ) } + + baremetal_az = var.availability_zone != null ? var.availability_zone : "nova" } resource "openstack_blockstorage_volume_v3" "compute" { @@ -115,7 +117,7 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { fqdn: ${local.fqdns[each.key]} EOF - availability_zone = var.match_ironic_node ? "${var.availability_zone}::${var.baremetal_nodes[each.key]}" : null + availability_zone = var.match_ironic_node ? "${local.baremetal_az}::${var.baremetal_nodes[each.key]}" : var.availability_zone lifecycle { ignore_changes = [ @@ -170,7 +172,7 @@ resource "openstack_compute_instance_v2" "compute" { fqdn: ${local.fqdns[each.key]} EOF - availability_zone = var.match_ironic_node ? 
"${var.availability_zone}::${var.baremetal_nodes[each.key]}" : null + availability_zone = var.match_ironic_node ? "${local.baremetal_az}::${var.baremetal_nodes[each.key]}" : var.availability_zone } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index 44c862fe5..ae2e50196 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -150,9 +150,8 @@ variable "match_ironic_node" { variable "availability_zone" { type = string - description = "Name of availability zone - ignored unless match_ironic_node is true" - default = "nova" - nullable = false + description = "Name of availability zone. If undefined, defaults to 'nova' if match_ironic_node is true, defered to OpenStack otherwise" + default = null } variable "baremetal_nodes" { diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index 592119c39..856c51bb9 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -79,7 +79,8 @@ variable "login" { For any networks not specified here the cloud will select addresses. match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node - availability_zone: Name of availability zone - ignored unless match_ironic_node is true (default: "nova") + availability_zone: Name of availability zone"Name of availability zone. 
If undefined, defaults to 'nova' + if match_ironic_node is true, defered to OpenStack otherwise" gateway_ip: Address to add default route via nodename_template: Overrides variable cluster_nodename_template EOF @@ -122,7 +123,8 @@ variable "compute" { For any networks not specified here the cloud will select addresses. match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node - availability_zone: Name of availability zone - ignored unless match_ironic_node is true (default: "nova") + availability_zone: Name of availability zone. "Name of availability zone. If undefined, defaults to 'nova' + if match_ironic_node is true, defered to OpenStack otherwise" gateway_ip: Address to add default route via nodename_template: Overrides variable cluster_nodename_template From f1d8fb0255c483829a4c4ce48a16089eb101c9d8 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Jul 2025 10:12:59 +0100 Subject: [PATCH 04/10] refactor + added to CI groups --- ansible/roles/topology/defaults/main.yml | 7 +++++-- ansible/roles/topology/tasks/main.yml | 2 +- ansible/roles/topology/templates/topology.conf.j2 | 2 +- environments/common/inventory/group_vars/all/topology.yml | 1 + environments/common/layouts/everything | 3 +++ 5 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/topology.yml diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml index b44e63ac9..801c5af82 100644 --- a/ansible/roles/topology/defaults/main.yml +++ b/ansible/roles/topology/defaults/main.yml @@ -1,2 +1,5 @@ -# If set to non-empty string, will override topology.conf file auto-detected from OpenStack project -topology_topology_override: "" +# Nodes to be included in topology tree, must include all Slurm compute nodes +topology_topology_nodes: [] + +# If set, will override topology.conf file auto-detected from OpenStack project +# topology_topology_override: diff --git 
a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml index a0901093b..bb0f1c3e0 100644 --- a/ansible/roles/topology/tasks/main.yml +++ b/ansible/roles/topology/tasks/main.yml @@ -1,7 +1,7 @@ - name: Map instances to hosts become: false map_hosts: - compute_vms: "{{ groups['compute'] }}" + compute_vms: "{{ topology_topology_nodes }}" register: _topology delegate_to: localhost run_once: true diff --git a/ansible/roles/topology/templates/topology.conf.j2 b/ansible/roles/topology/templates/topology.conf.j2 index e6eab52ed..4bf1ea38d 100644 --- a/ansible/roles/topology/templates/topology.conf.j2 +++ b/ansible/roles/topology/templates/topology.conf.j2 @@ -1,6 +1,6 @@ # topology.conf # Switch Configuration -{% if topology_topology_override != '' %} +{% if topology_topology_override is defined %} {{ topology_topology_override }} {% else %} {% for az in _topology.topology.keys() %} diff --git a/environments/common/inventory/group_vars/all/topology.yml b/environments/common/inventory/group_vars/all/topology.yml new file mode 100644 index 000000000..30fd6b53d --- /dev/null +++ b/environments/common/inventory/group_vars/all/topology.yml @@ -0,0 +1 @@ +topology_topology_groups: "{{ groups['topology'] }}" diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index b7a7035e6..6d0f571a2 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -135,3 +135,6 @@ builder [nhc:children] # Hosts to configure for node health checks compute + +[topology:children] +compute From 96c13de39dde252e40ddeb95f5adb64ac18c7898 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Jul 2025 11:26:27 +0100 Subject: [PATCH 05/10] typo --- environments/common/inventory/group_vars/all/topology.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/topology.yml b/environments/common/inventory/group_vars/all/topology.yml index 
30fd6b53d..43cfabfff 100644 --- a/environments/common/inventory/group_vars/all/topology.yml +++ b/environments/common/inventory/group_vars/all/topology.yml @@ -1 +1 @@ -topology_topology_groups: "{{ groups['topology'] }}" +topology_topology_nodes: "{{ groups['topology'] }}" From 28457a7bd7e35aa6ff4f332e5a65a21b5421ba4e Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Jul 2025 11:35:55 +0100 Subject: [PATCH 06/10] added readme --- ansible/roles/topology/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 ansible/roles/topology/README.md diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md new file mode 100644 index 000000000..cbfcae559 --- /dev/null +++ b/ansible/roles/topology/README.md @@ -0,0 +1,14 @@ +topology +======== + +Templates out /etc/slurm/topology.conf file based on an Openstack project for use by +Slurm's [topology/tree plugin.](https://slurm.schedmd.com/topology.html) Models +project as tree with a heirarchy of: + +Project -> Availability Zones -> Hypervisors -> VMs + +Role Variables +-------------- + +- `topology_topology_nodes: []`: Required list[str]. List of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. +- `topology_topology_override:`: Optional str. If set, will override templating and be provided as custom topology.conf content. Undefined by default. 
\ No newline at end of file From b00188ef123d96671e5c9f74b4fa9eca96b01a9b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Jul 2025 12:34:56 +0100 Subject: [PATCH 07/10] docs updates + review suggestions + refactor template override --- ansible/roles/topology/README.md | 9 +++++---- ansible/roles/topology/defaults/main.yml | 4 ++-- ansible/roles/topology/library/map_hosts.py | 2 +- ansible/roles/topology/tasks/main.yml | 2 +- ansible/roles/topology/templates/topology.conf.j2 | 4 ---- ansible/slurm.yml | 4 +++- environments/common/inventory/groups | 2 +- environments/common/layouts/everything | 4 ++++ .../tofu/node_group/variables.tf | 2 +- .../{{cookiecutter.environment}}/tofu/variables.tf | 8 ++++---- 10 files changed, 22 insertions(+), 19 deletions(-) diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md index cbfcae559..7ab0e994e 100644 --- a/ansible/roles/topology/README.md +++ b/ansible/roles/topology/README.md @@ -3,12 +3,13 @@ topology Templates out /etc/slurm/topology.conf file based on an Openstack project for use by Slurm's [topology/tree plugin.](https://slurm.schedmd.com/topology.html) Models -project as tree with a heirarchy of: +cluster as tree with a heirarchy of: -Project -> Availability Zones -> Hypervisors -> VMs +Top-level inter-rack Switch -> Availability Zones -> Hypervisors -> VMs Role Variables -------------- -- `topology_topology_nodes: []`: Required list[str]. List of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. -- `topology_topology_override:`: Optional str. If set, will override templating and be provided as custom topology.conf content. Undefined by default. \ No newline at end of file +- `topology_topology_nodes:`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. +- `topology_conf_template`: Optional str. 
Path to Jinja2 template of topology.conf file. Default + `templates/topology.conf.j2` \ No newline at end of file diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml index 801c5af82..6e1382387 100644 --- a/ansible/roles/topology/defaults/main.yml +++ b/ansible/roles/topology/defaults/main.yml @@ -1,5 +1,5 @@ # Nodes to be included in topology tree, must include all Slurm compute nodes topology_topology_nodes: [] -# If set, will override topology.conf file auto-detected from OpenStack project -# topology_topology_override: +# Override to use custom topology.conf template +topology_conf_template: templates/topology.conf.j2 diff --git a/ansible/roles/topology/library/map_hosts.py b/ansible/roles/topology/library/map_hosts.py index 8c7f4431e..aae9c0067 100644 --- a/ansible/roles/topology/library/map_hosts.py +++ b/ansible/roles/topology/library/map_hosts.py @@ -12,7 +12,7 @@ short_description: Creates map of OpenStack VM network topology description: - Creates map representing the network topology tree of an OpenStack project with a heirarchy - of: Availability Zone -> Hypervisors/Baremetal nodes -> VMs/Baremetal instances + of: Availability Zone -> Hypervisors -> VMs/Baremetal instances options: compute_vms: description: diff --git a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml index bb0f1c3e0..2f58fab63 100644 --- a/ansible/roles/topology/tasks/main.yml +++ b/ansible/roles/topology/tasks/main.yml @@ -9,7 +9,7 @@ - name: Template topology.conf become: true ansible.builtin.template: - src: templates/topology.conf.j2 + src: "{{ topology_conf_template }}" dest: /etc/slurm/topology.conf owner: root group: root diff --git a/ansible/roles/topology/templates/topology.conf.j2 b/ansible/roles/topology/templates/topology.conf.j2 index 4bf1ea38d..92e22c808 100644 --- a/ansible/roles/topology/templates/topology.conf.j2 +++ b/ansible/roles/topology/templates/topology.conf.j2 @@ -1,8 +1,5 @@ # 
topology.conf # Switch Configuration -{% if topology_topology_override is defined %} -{{ topology_topology_override }} -{% else %} {% for az in _topology.topology.keys() %} {% for instance_host in _topology.topology[az].keys() %} SwitchName={{ instance_host }} Nodes={{ _topology.topology[az][instance_host] | join(",") }} @@ -10,4 +7,3 @@ SwitchName={{ instance_host }} Nodes={{ _topology.topology[az][instance_host] | SwitchName={{ az }} Switches={{ _topology.topology[az].keys() | join(",") }} {% endfor %} SwitchName=master Switches={{ _topology.topology.keys() | join(",") }} -{% endif %} diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 92668798f..35297559f 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -65,7 +65,9 @@ name: topology # Gated on topology group having compute nodes but role also # needs to run on control and login nodes - when: appliances_mode == 'configure' and (groups['topology'] | length) > 0 + when: + - appliances_mode == 'configure' + - groups['topology'] | length > 0 - include_role: name: stackhpc.openhpc tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 3183649b6..cc4b57dce 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -25,7 +25,7 @@ openhpc # Do not add hosts here manually - used as part of Packer image build pipeline. See packer/README.md. [topology] -# Compute nodes to be included in the Slurm topology plugin's topology tree +# Compute nodes to be included in the Slurm topology plugin's topology tree. 
See ansible/roles/topology # Should be set to `compute` if enabled # Note that this feature currently assumes all compute nodes are VMs, enabling # when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 6d0f571a2..8c63247e6 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -137,4 +137,8 @@ builder compute [topology:children] +# Compute nodes to be included in the Slurm topology plugin's topology tree. See ansible/roles/topology +# Should be set to `compute` if enabled +# Note that this feature currently assumes all compute nodes are VMs, enabling +# when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour compute diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index ae2e50196..deb174b91 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -150,7 +150,7 @@ variable "match_ironic_node" { variable "availability_zone" { type = string - description = "Name of availability zone. If undefined, defaults to 'nova' if match_ironic_node is true, defered to OpenStack otherwise" + description = "Name of availability zone. 
If undefined, defaults to 'nova' if match_ironic_node is true, deferred to OpenStack otherwise" default = null } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index 856c51bb9..a6626ab6e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -79,8 +79,8 @@ variable "login" { For any networks not specified here the cloud will select addresses. match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node - availability_zone: Name of availability zone"Name of availability zone. If undefined, defaults to 'nova' - if match_ironic_node is true, defered to OpenStack otherwise" + availability_zone: Name of availability zone. If undefined, defaults to 'nova' + if match_ironic_node is true, deferred to OpenStack otherwise gateway_ip: Address to add default route via nodename_template: Overrides variable cluster_nodename_template EOF @@ -123,8 +123,8 @@ variable "compute" { For any networks not specified here the cloud will select addresses. match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node - availability_zone: Name of availability zone. "Name of availability zone. If undefined, defaults to 'nova' - if match_ironic_node is true, defered to OpenStack otherwise" + availability_zone: Name of availability zone. 
If undefined, defaults to 'nova' + if match_ironic_node is true, deferred to OpenStack otherwise gateway_ip: Address to add default route via nodename_template: Overrides variable cluster_nodename_template From 80ce744f94ae61e0c542a5e0e380f8bebd7a8b66 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Jul 2025 15:39:47 +0100 Subject: [PATCH 08/10] add top level topology override + gate plugin on group being enabled --- ansible/roles/topology/README.md | 15 ++++++++++++++- ansible/roles/topology/defaults/main.yml | 3 +++ ansible/roles/topology/templates/topology.conf.j2 | 4 ++++ .../common/inventory/group_vars/all/openhpc.yml | 2 +- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md index 7ab0e994e..61e54c2e6 100644 --- a/ansible/roles/topology/README.md +++ b/ansible/roles/topology/README.md @@ -12,4 +12,17 @@ Role Variables - `topology_topology_nodes:`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. - `topology_conf_template`: Optional str. Path to Jinja2 template of topology.conf file. Default - `templates/topology.conf.j2` \ No newline at end of file + `templates/topology.conf.j2` +- `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if + you wish to partition racks further under different logical switches. New switches above should be + defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing + rack Availability Zones under that switch in their `Switches` fields. These switches must themselves + be under a top level switch. 
e.g + ``` + topology_above_rack_topology: | + SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2 + SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4 + SwitchName=top-level Switches=rack-group-1,rack-group-2 + ``` + Defaults to an empty string, which causes all AZs to be put under a + single top level switch. \ No newline at end of file diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml index 6e1382387..0d8e45507 100644 --- a/ansible/roles/topology/defaults/main.yml +++ b/ansible/roles/topology/defaults/main.yml @@ -3,3 +3,6 @@ topology_topology_nodes: [] # Override to use custom topology.conf template topology_conf_template: templates/topology.conf.j2 + +topology_above_rack_topology: "" + diff --git a/ansible/roles/topology/templates/topology.conf.j2 b/ansible/roles/topology/templates/topology.conf.j2 index 92e22c808..690b3c98e 100644 --- a/ansible/roles/topology/templates/topology.conf.j2 +++ b/ansible/roles/topology/templates/topology.conf.j2 @@ -6,4 +6,8 @@ SwitchName={{ instance_host }} Nodes={{ _topology.topology[az][instance_host] | {% endfor %} SwitchName={{ az }} Switches={{ _topology.topology[az].keys() | join(",") }} {% endfor %} +{% if topology_above_rack_topology == '' %} SwitchName=master Switches={{ _topology.topology.keys() | join(",") }} +{% else %} +{{ topology_above_rack_topology }} +{% endif %} diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index a599176e7..41131f8a4 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -57,7 +57,7 @@ openhpc_config_default: - enable_configless TaskPlugin: task/cgroup,task/affinity ReturnToService: 2 # workaround for templating bug TODO: Remove once on stackhpc.openhpc v1.2.0 - TopologyPlugin: topology/tree + TopologyPlugin: "topology/{{ 'tree' if (topology_topology_nodes | length) > 0 else 'flat' }}" 
# default additional slurm.conf parameters when "rebuild" enabled: openhpc_config_rebuild: From 68d55f4d6206ade491de67d1a39bb133ea90986a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Jul 2025 15:52:12 +0100 Subject: [PATCH 09/10] typos + renames + added reconfigure warning to docs --- ansible/roles/topology/README.md | 11 ++++++++--- ansible/roles/topology/defaults/main.yml | 2 +- ansible/roles/topology/tasks/main.yml | 2 +- .../common/inventory/group_vars/all/openhpc.yml | 2 +- .../common/inventory/group_vars/all/topology.yml | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md index 61e54c2e6..057134490 100644 --- a/ansible/roles/topology/README.md +++ b/ansible/roles/topology/README.md @@ -1,16 +1,21 @@ topology ======== -Templates out /etc/slurm/topology.conf file based on an Openstack project for use by +Templates out /etc/slurm/topology.conf file based on an OpenStack project for use by Slurm's [topology/tree plugin.](https://slurm.schedmd.com/topology.html) Models -cluster as tree with a heirarchy of: +cluster as tree with a hierarchy of: Top-level inter-rack Switch -> Availability Zones -> Hypervisors -> VMs +Warning: This role doesn't currently trigger a restart of Slurm so will therefore not +reconfigure an already running cluster after an `ansible/site.yml` run. You will therefore need +to run the `ansible/adhoc/restart-slurm.yml` playbook for changes to topology.conf to be +recognised. + Role Variables -------------- -- `topology_topology_nodes:`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. +- `topology_nodes`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. - `topology_conf_template`: Optional str. 
Path to Jinja2 template of topology.conf file. Default `templates/topology.conf.j2` - `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml index 0d8e45507..6b6224302 100644 --- a/ansible/roles/topology/defaults/main.yml +++ b/ansible/roles/topology/defaults/main.yml @@ -1,5 +1,5 @@ # Nodes to be included in topology tree, must include all Slurm compute nodes -topology_topology_nodes: [] +topology_nodes: [] # Override to use custom topology.conf template topology_conf_template: templates/topology.conf.j2 diff --git a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml index 2f58fab63..8debddeab 100644 --- a/ansible/roles/topology/tasks/main.yml +++ b/ansible/roles/topology/tasks/main.yml @@ -1,7 +1,7 @@ - name: Map instances to hosts become: false map_hosts: - compute_vms: "{{ topology_topology_nodes }}" + compute_vms: "{{ topology_nodes }}" register: _topology delegate_to: localhost run_once: true diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 41131f8a4..e4b3df670 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -57,7 +57,7 @@ openhpc_config_default: - enable_configless TaskPlugin: task/cgroup,task/affinity ReturnToService: 2 # workaround for templating bug TODO: Remove once on stackhpc.openhpc v1.2.0 - TopologyPlugin: "topology/{{ 'tree' if (topology_topology_nodes | length) > 0 else 'flat' }}" + TopologyPlugin: "topology/{{ 'tree' if (topology_nodes | length) > 0 else 'flat' }}" # default additional slurm.conf parameters when "rebuild" enabled: openhpc_config_rebuild: diff --git a/environments/common/inventory/group_vars/all/topology.yml b/environments/common/inventory/group_vars/all/topology.yml index 43cfabfff..233eecbe8 
100644 --- a/environments/common/inventory/group_vars/all/topology.yml +++ b/environments/common/inventory/group_vars/all/topology.yml @@ -1 +1 @@ -topology_topology_nodes: "{{ groups['topology'] }}" +topology_nodes: "{{ groups['topology'] }}" From 025827eeb811faa22faaa013438506926acddc5a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 24 Jul 2025 08:28:07 +0100 Subject: [PATCH 10/10] set changed false + comments --- ansible/roles/topology/library/map_hosts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/topology/library/map_hosts.py b/ansible/roles/topology/library/map_hosts.py index aae9c0067..196113261 100644 --- a/ansible/roles/topology/library/map_hosts.py +++ b/ansible/roles/topology/library/map_hosts.py @@ -69,7 +69,7 @@ def run_module(): for s in servers: az = s['availability_zone'] host_id = s['host_id'] - if host_id != '': + if host_id != '': # empty string if e.g. server is shelved all_host_ids.append(host_id) if az not in topo: topo[az] = {} @@ -83,7 +83,7 @@ def run_module(): topo[az] = dict((k[:uuid_len], v) for (k, v) in topo[az].items()) result = { - "changed": True, + "changed": False, "topology": topo, }