From f41358cd749e152756ee5220495b44217201311b Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 22 Mar 2024 10:44:22 +0000 Subject: [PATCH 1/2] Add StackHPC Ironic tunings --- doc/source/configuration/index.rst | 1 + doc/source/configuration/ironic.rst | 31 ++++++++++ etc/kayobe/kolla/config/ironic-inspector.conf | 13 ++++ .../kolla/config/ironic/ironic-api.conf | 6 ++ .../kolla/config/ironic/ironic-conductor.conf | 60 +++++++++++++++++++ etc/kayobe/kolla/config/neutron.conf | 12 ++++ etc/kayobe/kolla/config/nova.conf | 13 ++++ .../config/nova/nova-compute-ironic.conf | 13 ++++ etc/kayobe/kolla/globals.yml | 16 +++++ 9 files changed, 165 insertions(+) create mode 100644 doc/source/configuration/ironic.rst create mode 100644 etc/kayobe/kolla/config/ironic-inspector.conf create mode 100644 etc/kayobe/kolla/config/ironic/ironic-api.conf create mode 100644 etc/kayobe/kolla/config/ironic/ironic-conductor.conf create mode 100644 etc/kayobe/kolla/config/neutron.conf diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index bb0e1a9fe..78fe0d542 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -11,6 +11,7 @@ the various features provided. walled-garden release-train host-images + ironic lvm swap cephadm diff --git a/doc/source/configuration/ironic.rst b/doc/source/configuration/ironic.rst new file mode 100644 index 000000000..9bcdf121c --- /dev/null +++ b/doc/source/configuration/ironic.rst @@ -0,0 +1,31 @@ +====== +Ironic +====== + +Cleaning +======== + +Storage +------- + +Hardware assisted secure erase, i.e. the ``erase_devices`` clean step, is +enabled by default. This is normally dependent on the `Hardware Manager +<https://docs.openstack.org/ironic-python-agent/latest/admin/hardware_managers.html>`__ +in use. For example, when using the GenericHardwareManager the priority would +be 10, whereas if using the `ProliantHardwareManager +<https://docs.openstack.org/ironic/latest/admin/drivers/ilo.html>`__ +it would be 0. 
The idea is that we will prevent the catastrophic case where +data could be leaked to another tenant, forcing you to have to explicitly relax +this setting if this is a risk you want to take. This can be customised by +editing the following variables: + +.. code-block:: + :caption: $KAYOBE_CONFIG_PATH/kolla/config/ironic/ironic-conductor.conf + + [deploy] + erase_devices_priority=10 + erase_devices_metadata_priority=0 + +See `Ironic documentation +<https://docs.openstack.org/ironic/latest/admin/cleaning.html>`__ for more +details. diff --git a/etc/kayobe/kolla/config/ironic-inspector.conf b/etc/kayobe/kolla/config/ironic-inspector.conf new file mode 100644 index 000000000..7b8635793 --- /dev/null +++ b/etc/kayobe/kolla/config/ironic-inspector.conf @@ -0,0 +1,13 @@ +[DEFAULT] +timeout = 0 +{% if "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. 
+rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/kolla/config/ironic/ironic-api.conf b/etc/kayobe/kolla/config/ironic/ironic-api.conf new file mode 100644 index 000000000..117d6d3d7 --- /dev/null +++ b/etc/kayobe/kolla/config/ironic/ironic-api.conf @@ -0,0 +1,6 @@ +[DEFAULT] +# Avoid some timeouts of heartbeats and vif deletes +rpc_response_timeout = 360 + +[neutron] +timeout = 300 diff --git a/etc/kayobe/kolla/config/ironic/ironic-conductor.conf b/etc/kayobe/kolla/config/ironic/ironic-conductor.conf new file mode 100644 index 000000000..ad03c56f2 --- /dev/null +++ b/etc/kayobe/kolla/config/ironic/ironic-conductor.conf @@ -0,0 +1,60 @@ +[DEFAULT] +# Make direct deploy faster, transfer sparse qcow2 images +force_raw_images = False +# Avoid some rpc timeouts +rpc_response_timeout = 360 + +[conductor] +automated_clean=true +# We have busy conductors failing to heartbeat +# Default is 10 secs +heartbeat_interval = 30 +# Default is 60 seconds +heartbeat_timeout = 360 +sync_local_state_interval = 360 + +# Normally this is 100. We see eventlet threads +# not making much progress, so for safety reduce +# this by half, should leave work on rabbit queue +workers_pool_size = 50 +# Normally this is 8, keep it the same +period_max_workers = 8 + +# Increase power sync interval to reduce load +sync_power_state_interval = 120 +power_failure_recovery_interval = 120 +# Stop checking for orphan allocations for now +check_allocations_interval = 120 + +# Wait much longer before provision timeout check, to reduce background load +# The default is 60 seconds +check_provision_state_interval = 120 +check_rescue_state_interval = 120 + +[database] +# Usually this is 50, reduce to stop DB connection timeouts +# and instead just make eventlet threads wait a bit longer +max_overflow = 5 +# By default this is 30 seconds, but as we reduce +# the pool overflow, some people will need to wait longer +pool_timeout = 60 + +[deploy] +# Force Hardware assisted secure erase by default. 
+erase_devices_priority=10 +erase_devices_metadata_priority=0 + +[pxe] +# Increase cache size to 120GB and TTL to 28 hours +image_cache_size = 122880 +image_cache_ttl = 100800 + +[neutron] +# Increase the neutron client timeout to allow for the slow management +# switches. +timeout = 300 +request_timeout = 300 + +[glance] +# Retry image download at least once on failure +num_retries = 1 diff --git a/etc/kayobe/kolla/config/neutron.conf b/etc/kayobe/kolla/config/neutron.conf new file mode 100644 index 000000000..60587aecb --- /dev/null +++ b/etc/kayobe/kolla/config/neutron.conf @@ -0,0 +1,12 @@ +[DEFAULT] +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. +rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/kolla/config/nova.conf b/etc/kayobe/kolla/config/nova.conf index d1000be56..585bd402c 100644 --- a/etc/kayobe/kolla/config/nova.conf +++ b/etc/kayobe/kolla/config/nova.conf @@ -1,2 +1,15 @@ +[DEFAULT] +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. 
+rpc_response_timeout = 360 +{% endif %} + [libvirt] hw_machine_type = x86_64=q35 diff --git a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf index 9f6db7a55..987ac0351 100644 --- a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf +++ b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf @@ -2,3 +2,16 @@ [DEFAULT] host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }} {% endif %} +# Don't limit the number of concurrent builds for the nova ironic compute +# service. +max_concurrent_builds = 35 + +force_config_drive = True + +[ironic] +# Ramp up maximum retries to allow time for baremetal node reboot and switch configs +api_max_retries = 720 + +[compute] +# Don't disable the compute service due to failed builds. +consecutive_build_service_disable_threshold = 0 diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index c0663d939..e0d8922de 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -53,3 +53,19 @@ prometheus_instance_label: "{% raw %}{{ ansible_facts.hostname }}{% endraw %}" # in Yoga. This is required to include a valid value for the flavor_id label on # openstack_nova_server_status metrics. prometheus_openstack_exporter_compute_api_version: "2.1" + +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# NOTE: We are increasing the HAProxy timeouts to 5m30 due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. 
+haproxy_client_timeout: 5m30 +haproxy_server_timeout: 5m30 +# If using Neutron backend TLS: +neutron_tls_proxy_client_timeout: 5m30 +neutron_tls_proxy_server_timeout: 5m30 +{% endif %} From 49e5d7dbbf922cc5e99d76f0b5ce31017d0ed5ff Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Tue, 4 Feb 2025 18:11:09 +0000 Subject: [PATCH 2/2] Leo's suggestion from code review --- etc/kayobe/kolla/config/nova/nova-compute-ironic.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf index 987ac0351..ce046f3fc 100644 --- a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf +++ b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf @@ -1,5 +1,5 @@ -{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %} [DEFAULT] +{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %} host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }} {% endif %} # Don't limit the number of concurrent builds for the nova ironic compute