diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 27559952c..b418a5a30 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -311,14 +311,31 @@ - include_role: name: ofed -- hosts: ansible_init +- hosts: doca:&builder + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + +- hosts: ansible_init:&builder gather_facts: yes become: yes tags: linux_ansible_init tasks: - - include_role: + - name: Install ansible-init + ansible.builtin.include_role: name: azimuth_cloud.image_utils.linux_ansible_init +- hosts: gateway:&builder + become: yes + tags: gateway + tasks: + - name: Install ansible-init gateway playbook + ansible.builtin.include_role: + name: gateway + - hosts: k3s:&builder become: yes tags: k3s diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml deleted file mode 100644 index 3e8022965..000000000 --- a/ansible/disable-repos.yml +++ /dev/null @@ -1,7 +0,0 @@ -- hosts: dnf_repos - become: yes - tasks: - - name: Disable pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index c7cacb877..8e3248d3f 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -58,17 +58,6 @@ - import_role: name: persist_hostkeys - -- name: Setup NFS export for compute node configuration - hosts: compute_init:!builder - # NB: has to be after eeesi and os-manila-mount - tags: compute_init - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: export.yml - - name: Install k9s become: yes hosts: k9s diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 0b4335b14..db7fde901 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -1,15 +1,5 @@ # Builder version of site.yml just installing binaries -- hosts: builder - become: no - gather_facts: no - tasks: - - name: Report hostname (= final image name) - command: hostname - - name: Report inventory groups - debug: - var: 
group_names - - name: Run pre.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -25,18 +15,9 @@ tasks_from: sync.yml apply: delegate_to: localhost - when: appliances_mode != 'configure' - import_playbook: bootstrap.yml -- hosts: doca - become: yes - gather_facts: yes - tasks: - - name: Install NVIDIA DOCA - import_role: - name: doca - - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -44,22 +25,12 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: iam.yml + - hosts: builder become: yes gather_facts: yes tasks: - # - import_playbook: iam.yml - - name: Install FreeIPA client - import_role: - name: freeipa - tasks_from: client-install.yml - when: "'freeipa_client' in group_names" - - name: Install sssd - import_role: - name: sssd - tasks_from: install.yml - when: "'sssd' in group_names" - # - import_playbook: filesystems.yml: - name: Install nfs packages dnf: @@ -77,43 +48,12 @@ when: "'lustre' in group_names" - import_playbook: extras.yml - -# TODO: is this the right place? 
-- name: Install compute_init playbook - hosts: compute_init - tags: compute_init # tagged to allow running on cluster instances for dev - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: install.yml - -- name: Install gateway playbook - hosts: gateway - tags: gateway - become: yes - gather_facts: no - tasks: - - include_role: - name: gateway +- import_playbook: slurm.yml - hosts: builder become: yes gather_facts: yes tasks: - # - import_playbook: slurm.yml: - - name: Setup DB - include_role: - name: mysql - tasks_from: install.yml - when: "'mysql' in group_names" - - - name: OpenHPC - import_role: - name: stackhpc.openhpc - tasks_from: install.yml - when: "'openhpc' in group_names" - # - import_playbook: portal.yml - name: Open Ondemand server (packages) include_role: @@ -257,15 +197,4 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- import_playbook: disable-repos.yml - -- hosts: builder - become: yes - gather_facts: yes - tags: finalise - tasks: - - name: Cleanup image - import_tasks: cleanup.yml - - - name: Shutdown Packer VM - community.general.shutdown: +- import_playbook: final.yml diff --git a/ansible/final.yml b/ansible/final.yml new file mode 100644 index 000000000..8a12d12c3 --- /dev/null +++ b/ansible/final.yml @@ -0,0 +1,38 @@ +- hosts: compute_init + tags: compute_init + become: yes + tasks: + - name: Install compute_init playbook + ansible.builtin.include_role: + name: compute_init + tasks_from: 'install.yml' + when: appliances_mode == 'build' + # conditional used instead of compute_init!builder to make dev easier + +- hosts: compute_init:!builder + tags: compute_init + become: yes + tasks: + - name: Setup NFS export for compute node configuration + ansible.builtin.include_role: + name: compute_init + tasks_from: export.yml + +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: 
disable_repos.yml + +- hosts: builder + become: yes + gather_facts: yes + tags: finalise + tasks: + - name: Cleanup image + import_tasks: cleanup.yml + + - name: Shutdown Packer VM + community.general.shutdown: diff --git a/ansible/iam.yml b/ansible/iam.yml index 857b8f840..a0c59df0f 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -1,4 +1,4 @@ -- hosts: freeipa_client +- hosts: freeipa_client:!builder tags: - freeipa - freeipa_server # as this is only relevant if using freeipa_server @@ -23,12 +23,20 @@ import_role: name: freeipa tasks_from: client-install.yml + +- hosts: freeipa_client:!builder + tags: + - freeipa + - freeipa_client + gather_facts: yes + become: yes + tasks: - name: Enrol FreeIPA client import_role: name: freeipa tasks_from: enrol.yml -- hosts: freeipa_server +- hosts: freeipa_server:!builder tags: - freeipa - freeipa_server @@ -47,5 +55,6 @@ tags: sssd tasks: - name: Configure sssd - import_role: + ansible.builtin.include_role: name: sssd + tasks_from: "{{ {'build':'install.yml', 'configure':'configure.yml'}[appliances_mode] | default('main.yml') }}" diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 81a62bade..0d470ee1b 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -143,35 +143,32 @@ a new image: additionally configure the control node to export compute hostvars over NFS. Check the cluster is up. -2. Reimage the compute nodes: +2. Optionally, reimage the compute nodes to reset services etc.: ansible-playbook --limit compute ansible/adhoc/rebuild.yml -3. Add metadata to a compute node e.g. via Horizon to turn on compute-init - playbook functionality. +3. Add metadata to a compute node (directly via Horizon or via OpenTofu) to + enable the new compute-init playbook functionality. -4. Stop ansible-init from running +4. Stop ansible-init from running: ansible all -ba "systemctl stop ansible-init" -5. 
Fake an image build to deploy the compute-init playbook: +5. Fake an image build and a rerun of the `site.yml` playbook: - ansible-playbook ansible/fatimage.yml --tags compute_init + ansible-playbook ansible/final.yml --tags compute_init - NB: This will also re-export the compute hostvars, as the nodes are not - in the builder group, which conveniently means any changes made to that - play also get picked up. + This both re-installs the compute-init playbook and re-configures the NFS + share with exported compute hostvars etc. -6. Fake a reimage of compute to run ansible-init and the updated compute-init playbook: +6. Fake a reimage of compute nodes to re-run ansible-init and the updated + compute-init playbook: ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" - Use `systemctl status ansible-init` to view stdout/stderr from Ansible. - -Steps 4/5/6 can be repeated with changes to the compute script. If required, -reimage the compute node(s) first as in step 2 and/or add additional metadata -as in step 3. +7. Use `systemctl status ansible-init` to view stdout/stderr from Ansible. +Steps 4-7 can be repeated with changes to the compute script until it works. 
## Design notes - Duplicating code in roles into the `compute-init` script is unfortunate, but diff --git a/ansible/roles/mysql/tasks/configure.yml b/ansible/roles/mysql/tasks/configure.yml index d4dd4cd54..05550276f 100644 --- a/ansible/roles/mysql/tasks/configure.yml +++ b/ansible/roles/mysql/tasks/configure.yml @@ -11,9 +11,12 @@ - name: Ensure mysql service state systemd: name: mysql - state: "{{ mysql_state | default('restarted' if _mysql_unitfile.changed else 'started') }}" + state: "{{ mysql_state | default('restarted' if _mysql_unitfile_changed else 'started') }}" enabled: "{{ mysql_systemd_service_enabled }}" - daemon_reload: "{{ _mysql_unitfile.changed }}" + daemon_reload: "{{ _mysql_unitfile_changed }}" + vars: + # cope with not having run install tasks + _mysql_unitfile_changed: "{{ _mysql_unitfile.changed | default(false) }}" - block: - name: Wait for mysql to initialise diff --git a/ansible/site.yml b/ansible/site.yml index d973d9cb3..faeca23fd 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,7 +27,6 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml -- import_playbook: disable-repos.yml - name: Run post.yml hook vars: @@ -37,12 +36,6 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- name: Clean up and shutdown Packer VM - hosts: builder - gather_facts: no - become: yes - tasks: - - import_tasks: cleanup.yml - - community.general.shutdown: +- import_playbook: final.yml ... 
\ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index d1bb93a9f..9445208e0 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -3,11 +3,11 @@ - name: Setup DB hosts: mysql become: true - tags: - - mysql + tags: mysql tasks: - include_role: name: mysql + tasks_from: "{{ {'build':'install.yml', 'configure':'configure.yml'}[appliances_mode] | default('main.yml') }}" - name: Setup slurm-driven rebuild hosts: rebuild:!builder @@ -20,12 +20,9 @@ name: rebuild - name: Set locked memory limits on user-facing nodes - hosts: - - compute - - login + hosts: compute:login:!builder become: yes - tags: - - openhpc + tags: openhpc tasks: - name: set memory limits lineinfile: @@ -34,10 +31,9 @@ line: "* soft memlock unlimited" - name: Block ssh to compute nodes for non-privileged users without running jobs - hosts: compute + hosts: compute:!builder become: yes - tags: - - openhpc + tags: openhpc tasks: - name: Configure sshd pam module blockinfile: @@ -57,9 +53,8 @@ - name: Setup slurm hosts: openhpc become: yes - tags: - - openhpc + tags: openhpc tasks: - include_role: name: stackhpc.openhpc - tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" + tasks_from: "{{ {'build':'install.yml', 'configure':'runtime.yml'}[appliances_mode] | default('main.yml') }}" diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 305713a61..7ccf50b3a 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -1,3 +1,13 @@ +- hosts: builder + become: no + gather_facts: no + tasks: + - name: Report hostname (= final image name) + command: hostname + - name: Report inventory groups + debug: + var: group_names + - hosts: control:!builder become: yes gather_facts: false diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index f9117a26a..120d2e09b 100644 --- 
a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250506-1259-abb6394b", - "RL9": "openhpc-RL9-250506-1259-abb6394b" + "RL8": "openhpc-RL8-250507-0759-a6321302", + "RL9": "openhpc-RL9-250507-0759-a6321302" } } diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 1fc2a8424..d8ac9704c 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -63,7 +63,8 @@ mysql cluster [rebuild] -# Enable rebuild of nodes on an OpenStack cloud; add 'control' group. +# Add 'control' group to enable slurm-controlled rebuild of compute nodes +# NB: Compute nodes need compute_init enabled [update] # All hosts to (optionally) run yum update on. @@ -133,8 +134,10 @@ freeipa_client [tuned] # Hosts to run TuneD configuration -[ansible_init] -# Hosts to run linux-anisble-init +[ansible_init:children] +# Hosts to run linux-ansible-init +compute_init +gateway [sssd] # Hosts to configure sssd on @@ -143,7 +146,10 @@ freeipa_client # Hosts where the OpenSSH server daemon should be configured [compute_init] -# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +# EXPERIMENTAL: Compute hosts which should rejoin the cluster after rebuild +# without running site.yml playbook. +# NB: Additional configuration is required and not all functionality is +# currently supported - see ansible/roles/compute_init/README.md [k3s:children] # Hosts to run k3s server/agent @@ -181,4 +187,5 @@ extra_packages # Hosts where crony configuration is applied. See docs/chrony.md for more details. [gateway] -# Add builder to this group to install gateway ansible-init playbook into image +# Hosts to install gateway configuration functionality on during image build. +# The actual configuration is performed on boot using ansible-init. 
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index e3c3f763d..2a419a05a 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -25,7 +25,9 @@ control [filebeat:children] slurm_stats -# NB: [rebuild] not defined here as likely to need features not currently supported +[rebuild] +# Add 'control' group to enable slurm-controlled rebuild of compute nodes +# NB: Compute nodes need compute_init enabled [update:children] @@ -83,10 +85,6 @@ openondemand [tuned:children] # Hosts to run TuneD configuration -[ansible_init:children] -# Hosts to run linux-anisble-init -cluster - [sssd] # Hosts to configure sssd on @@ -94,7 +92,10 @@ cluster # Hosts where the OpenSSH server daemon should be configured [compute_init] -# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +# EXPERIMENTAL: Compute hosts which should rejoin the cluster after rebuild +# without running site.yml playbook. +# NB: Additional configuration is required and not all functionality is +# currently supported - see ansible/roles/compute_init/README.md [k3s_server:children] # Hosts to run k3s server (should only be single node i.e control node) @@ -123,5 +124,7 @@ builder # Hosts where crony configuration is applied. See docs/chrony.md for more details. [gateway:children] -# Add builder to this group to install gateway ansible-init playbook into image -builder +# Hosts to install gateway configuration functionality on during image build. +# The actual configuration is performed on boot using ansible-init. +# Default `cluster` means this is installed during "fat image" builds. +cluster