diff --git a/ansible/.gitignore b/ansible/.gitignore
index 6ae64c72e..62c9a543c 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -96,3 +96,5 @@ roles/*
 !roles/nhc/**
 !roles/eessi/
 !roles/eessi/**
+!roles/topology/
+!roles/topology/**
diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md
new file mode 100644
index 000000000..057134490
--- /dev/null
+++ b/ansible/roles/topology/README.md
@@ -0,0 +1,33 @@
+topology
+========
+
+Templates out the /etc/slurm/topology.conf file based on an OpenStack project, for use by
+Slurm's [topology/tree plugin](https://slurm.schedmd.com/topology.html). Models the
+cluster as a tree with a hierarchy of:
+
+Top-level inter-rack Switch -> Availability Zones -> Hypervisors -> VMs
+
+Warning: This role doesn't currently trigger a restart of Slurm, so it will not
+reconfigure an already-running cluster after an `ansible/site.yml` run. You will need
+to run the `ansible/adhoc/restart-slurm.yml` playbook for changes to topology.conf to be
+recognised.
+
+Role Variables
+--------------
+
+- `topology_nodes`: Required list of strs. Inventory hostnames of nodes to include in the topology tree. Must include all compute nodes in the Slurm cluster. Default `[]`.
+- `topology_conf_template`: Optional str. Path to the Jinja2 template for the topology.conf file. Default
+  `templates/topology.conf.j2`.
+- `topology_above_rack_topology`: Optional multiline str. Defines the topology above racks/AZs if
+  you wish to partition racks further under different logical switches. New switches should be
+  defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing the
+  rack Availability Zones under that switch in their `Switches` fields. These switches must themselves
+  be under a single top-level switch, e.g.:
+  ```
+  topology_above_rack_topology: |
+    SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2
+    SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4
+    SwitchName=top-level Switches=rack-group-1,rack-group-2
+  ```
+  Defaults to an empty string, which causes all AZs to be put under a
+  single top-level switch.
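+
+For illustration, with the default (empty) `topology_above_rack_topology` and the
+example hostnames from the `map_hosts` module documentation, the rendered
+topology.conf would look something like:
+
+```
+SwitchName=afe9 Nodes=mycluster-compute-0,mycluster-compute-1
+SwitchName=00f9 Nodes=mycluster-compute-vm-on-other-hypervisor
+SwitchName=nova-az Switches=afe9,00f9
+SwitchName=master Switches=nova-az
+```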
\ No newline at end of file
diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml
new file mode 100644
index 000000000..6b6224302
--- /dev/null
+++ b/ansible/roles/topology/defaults/main.yml
@@ -0,0 +1,8 @@
+# Nodes to be included in the topology tree; must include all Slurm compute nodes
+topology_nodes: []
+
+# Override to use a custom topology.conf template
+topology_conf_template: templates/topology.conf.j2
+
+topology_above_rack_topology: ""
+
diff --git a/ansible/roles/topology/library/map_hosts.py b/ansible/roles/topology/library/map_hosts.py
new file mode 100644
index 000000000..196113261
--- /dev/null
+++ b/ansible/roles/topology/library/map_hosts.py
@@ -0,0 +1,98 @@
+#!/usr/bin/python
+
+# Copyright: (c) 2025, StackHPC
+# Apache 2 License
+
+from ansible.module_utils.basic import AnsibleModule
+import openstack
+
+DOCUMENTATION = """
+---
+module: map_hosts
+short_description: Creates map of OpenStack VM network topology
+description:
+  - Creates a map representing the network topology tree of an OpenStack project with a hierarchy
+    of Availability Zone -> Hypervisors -> VMs/Baremetal instances
+options:
+  compute_vms:
+    description:
+      - List of VM names within the target OpenStack project to include in the tree
+    required: true
+    type: list
+    elements: str
+author:
+  - Steve Brasier, William Tripp, StackHPC
+"""
+
+RETURN = """
+topology:
+  description:
+    Map representing the tree of project topology. Top-level keys are AZ names; their values
+    are maps from shortened unique prefixes of host UUIDs to lists of VM names.
+  returned: success
+  type: dict[str, dict[str, list[str]]]
+  sample:
+    "nova-az":
+      "afe9":
+        - "mycluster-compute-0"
+        - "mycluster-compute-1"
+      "00f9":
+        - "mycluster-compute-vm-on-other-hypervisor"
+"""
+
+EXAMPLES = """
+- name: Get topology map
+  map_hosts:
+    compute_vms:
+      - mycluster-compute-0
+      - mycluster-compute-1
+"""
+
+def min_prefix(uuids, start=4):
+    """ Take a list of uuids and return the smallest length >= start which keeps them unique """
+    # +1 so the full ID length can be returned if nothing shorter is unique
+    for length in range(start, len(uuids[0]) + 1):
+        prefixes = set(uuid[:length] for uuid in uuids)
+        if len(prefixes) == len(uuids):
+            return length
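+
+# Illustrative behaviour (hypothetical host IDs, not from a real cloud):
+#   min_prefix(["afe91b2c", "00f95d6e"]) == 4  # default start length is already unique
+#   min_prefix(["afe91b2c", "afe95d6e"]) == 5  # a longer prefix is needed to distinguish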
+
+def run_module():
+    module_args = dict(
+        compute_vms=dict(type='list', elements='str', required=True)
+    )
+    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
+
+    conn = openstack.connection.from_config()
+
+    servers = [s for s in conn.compute.servers() if s["name"] in module.params["compute_vms"]]
+
+    topo = {}
+    all_host_ids = []
+    for s in servers:
+        az = s['availability_zone']
+        host_id = s['host_id']
+        if host_id != '':  # empty string if e.g. server is shelved
+            all_host_ids.append(host_id)
+            if az not in topo:
+                topo[az] = {}
+            if host_id not in topo[az]:
+                topo[az][host_id] = []
+            topo[az][host_id].append(s['name'])
+
+    # shorten host IDs to the minimum prefix length that keeps them unique
+    uuid_len = min_prefix(list(set(all_host_ids)))
+
+    for az in topo:
+        topo[az] = dict((k[:uuid_len], v) for (k, v) in topo[az].items())
+
+    result = {
+        "changed": False,
+        "topology": topo,
+    }
+
+    module.exit_json(**result)
+
+
+def main():
+    run_module()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml
new file mode 100644
index 000000000..8debddeab
--- /dev/null
+++ b/ansible/roles/topology/tasks/main.yml
@@ -0,0 +1,16 @@
+- name: Map instances to hosts
+  become: false
+  map_hosts:
+    compute_vms: "{{ topology_nodes }}"
+  register: _topology
+  delegate_to: localhost
+  run_once: true
+
+- name: Template topology.conf
+  become: true
+  ansible.builtin.template:
+    src: "{{ topology_conf_template }}"
+    dest: /etc/slurm/topology.conf
+    owner: root
+    group: root
+    mode: "0644"
diff --git a/ansible/roles/topology/templates/topology.conf.j2 b/ansible/roles/topology/templates/topology.conf.j2
new file mode 100644
index 000000000..690b3c98e
--- /dev/null
+++ b/ansible/roles/topology/templates/topology.conf.j2
@@ -0,0 +1,13 @@
+# topology.conf
+# Switch Configuration
+{% for az in _topology.topology.keys() %}
+{% for instance_host in _topology.topology[az].keys() %}
+SwitchName={{ instance_host }} Nodes={{ _topology.topology[az][instance_host] | join(",") }}
+{% endfor %}
+SwitchName={{ az }} Switches={{ _topology.topology[az].keys() | join(",") }}
+{% endfor %}
+{% if topology_above_rack_topology == '' %}
+SwitchName=master Switches={{ _topology.topology.keys() | join(",") }}
+{% else %}
+{{ topology_above_rack_topology }}
+{% endif %}
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index fd3424023..35297559f 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -61,6 +61,13 @@
       tags:
         - openhpc
       tasks:
+        - include_role:
+            name: topology
+          # Gated on the topology group having compute nodes, but the role also
+          # needs to run on control and login nodes
+          when:
+            - appliances_mode == 'configure'
+            - groups['topology'] | length > 0
         - include_role:
             name: stackhpc.openhpc
             tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index 9c6aca272..e4b3df670 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -57,6 +57,7 @@ openhpc_config_default:
     - enable_configless
   TaskPlugin: task/cgroup,task/affinity
   ReturnToService: 2 # workaround for templating bug TODO: Remove once on stackhpc.openhpc v1.2.0
+  TopologyPlugin: "topology/{{ 'tree' if (topology_nodes | length) > 0 else 'flat' }}"
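+  # topology/flat is Slurm's default, so clusters with no hosts in the
+  # topology group keep the standard, non-topology-aware scheduling behaviour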

 # default additional slurm.conf parameters when "rebuild" enabled:
 openhpc_config_rebuild:
diff --git a/environments/common/inventory/group_vars/all/topology.yml b/environments/common/inventory/group_vars/all/topology.yml
new file mode 100644
index 000000000..233eecbe8
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/topology.yml
@@ -0,0 +1 @@
+topology_nodes: "{{ groups['topology'] }}"
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 1cc5523fb..cc4b57dce 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -24,6 +24,12 @@ openhpc
 [builder]
 # Do not add hosts here manually - used as part of Packer image build pipeline. See packer/README.md.

+[topology]
+# Compute nodes to be included in the Slurm topology plugin's topology tree. See ansible/roles/topology.
+# Should be set to `compute` if enabled.
+# Note that this feature currently assumes all compute nodes are VMs; enabling it
+# when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour.
+
 [podman:children]
 # Hosts running containers for below services:
 opensearch
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index b7a7035e6..8c63247e6 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -135,3 +135,10 @@ builder
 [nhc:children]
 # Hosts to configure for node health checks
 compute
+
+[topology:children]
+# Compute nodes to be included in the Slurm topology plugin's topology tree. See ansible/roles/topology.
+# Should be set to `compute` if enabled.
+# Note that this feature currently assumes all compute nodes are VMs; enabling it
+# when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour.
+compute
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf
index 818dd95ab..f3f9ab91f 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf
@@ -25,6 +25,8 @@ locals {
       }
     )
   }
+
+  baremetal_az = var.availability_zone != null ? var.availability_zone : "nova"
 }

 resource "openstack_blockstorage_volume_v3" "compute" {
@@ -115,7 +117,7 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" {
     fqdn: ${local.fqdns[each.key]}
   EOF

-  availability_zone = var.match_ironic_node ? "${var.availability_zone}::${var.baremetal_nodes[each.key]}" : null
+  availability_zone = var.match_ironic_node ? "${local.baremetal_az}::${var.baremetal_nodes[each.key]}" : var.availability_zone

   lifecycle {
     ignore_changes = [
@@ -170,7 +172,7 @@ resource "openstack_compute_instance_v2" "compute" {
     fqdn: ${local.fqdns[each.key]}
   EOF

-  availability_zone = var.match_ironic_node ? "${var.availability_zone}::${var.baremetal_nodes[each.key]}" : null
+  availability_zone = var.match_ironic_node ? "${local.baremetal_az}::${var.baremetal_nodes[each.key]}" : var.availability_zone
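+
+  # Illustrative resolution of the expression above ("bm0" is a hypothetical
+  # Ironic node name from var.baremetal_nodes):
+  #   match_ironic_node = true,  availability_zone = null   -> "nova::bm0" (via local.baremetal_az)
+  #   match_ironic_node = true,  availability_zone = "az1"  -> "az1::bm0"
+  #   match_ironic_node = false, availability_zone = "az1"  -> "az1"
+  #   match_ironic_node = false, availability_zone = null   -> null (AZ choice deferred to OpenStack)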
 }
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf
index 44c862fe5..deb174b91 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf
@@ -150,9 +150,8 @@ variable "match_ironic_node" {

 variable "availability_zone" {
   type        = string
-  description = "Name of availability zone - ignored unless match_ironic_node is true"
-  default     = "nova"
-  nullable    = false
+  description = "Name of availability zone. If undefined, defaults to 'nova' when match_ironic_node is true; otherwise the AZ choice is deferred to OpenStack"
+  default     = null
 }

 variable "baremetal_nodes" {
diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf
index 592119c39..a6626ab6e 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf
@@ -79,7 +79,8 @@ variable "login" {
           For any networks not specified here the cloud will select addresses.
       match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node
-      availability_zone: Name of availability zone - ignored unless match_ironic_node is true (default: "nova")
+      availability_zone: Name of availability zone. If undefined, defaults to 'nova'
+        when match_ironic_node is true; otherwise the AZ choice is deferred to OpenStack
       gateway_ip: Address to add default route via
       nodename_template: Overrides variable cluster_nodename_template
   EOF
@@ -122,7 +123,8 @@ variable "compute" {
           For any networks not specified here the cloud will select addresses.
       match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node
-      availability_zone: Name of availability zone - ignored unless match_ironic_node is true (default: "nova")
+      availability_zone: Name of availability zone. If undefined, defaults to 'nova'
+        when match_ironic_node is true; otherwise the AZ choice is deferred to OpenStack
       gateway_ip: Address to add default route via
       nodename_template: Overrides variable cluster_nodename_template