Skip to content

Commit fedc257

Browse files
committed
[Performance] Add chef attribute cluster/in_place_update_on_fleet_enabled to disable in-place updates on compute and login nodes by disabling cfn-hup on those nodes.
As a consequence, it also disables the cluster readiness checks executed by the head node on cluster update. Disabling cfn-hup mitigates a significant performance degradation that may occur with tightly coupled workloads at scale.
1 parent 7a280df commit fedc257

File tree

8 files changed

+143
-4
lines changed

8 files changed

+143
-4
lines changed

cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
dcv_port: node['cluster']['dcv_port'],
3333
dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'],
3434
dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'],
35-
dcv_auth_user: node['cluster']['dcv']['authenticator']['user']
35+
dcv_auth_user: node['cluster']['dcv']['authenticator']['user'],
36+
cfnhup_enabled: cfnhup_enabled?
3637
)
3738
end

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,28 @@
5757
end
5858
end
5959

60+
context "when head node and cfn-hup disabled on fleet" do
61+
cached(:chef_run) do
62+
runner = runner(platform: platform, version: version) do |node|
63+
node.override['cluster']['node_type'] = 'HeadNode'
64+
node.override['cluster']['dcv_enabled'] = 'head_node'
65+
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
66+
allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true)
67+
end
68+
runner.converge(described_recipe)
69+
end
70+
cached(:node) { chef_run.node }
71+
72+
it 'has the correct content' do
73+
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
74+
.with_content("[program:cfn-hup]")
75+
.with_content("[program:clustermgtd]")
76+
.with_content("[program:clusterstatusmgtd]")
77+
.with_content("[program:pcluster_dcv_authenticator]")
78+
.with_content("--port 8444")
79+
end
80+
end
81+
6082
context "when compute fleet" do
6183
cached(:chef_run) do
6284
runner = runner(platform: platform, version: version) do |node|
@@ -77,6 +99,25 @@
7799
.with_content("[program:pcluster_dcv_authenticator]")
78100
end
79101
end
102+
103+
context "when compute fleet with cfn-hup disabled on fleet" do
104+
cached(:chef_run) do
105+
runner = runner(platform: platform, version: version) do |node|
106+
node.override['cluster']['node_type'] = 'ComputeFleet'
107+
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
108+
end
109+
runner.converge(described_recipe)
110+
end
111+
cached(:node) { chef_run.node }
112+
113+
it 'has the correct content' do
114+
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
115+
.with_content("[program:computemgtd]")
116+
117+
is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
118+
.with_content("[program:cfn-hup]")
119+
end
120+
end
80121
context "when login node and dcv configured" do
81122
cached(:chef_run) do
82123
runner = runner(platform: platform, version: version) do |node|
@@ -109,12 +150,32 @@
109150

110151
it 'has the correct content' do
111152
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
153+
.with_content("[program:cfn-hup]")
112154
.with_content("[program:loginmgtd]")
113155

114156
is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
115157
.with_content("[program:pcluster_dcv_authenticator]")
116158
end
117159
end
160+
161+
context "when login node with cfn-hup disabled on fleet" do
162+
cached(:chef_run) do
163+
runner = runner(platform: platform, version: version) do |node|
164+
node.override['cluster']['node_type'] = 'LoginNode'
165+
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
166+
end
167+
runner.converge(described_recipe)
168+
end
169+
cached(:node) { chef_run.node }
170+
171+
it 'has the correct content' do
172+
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
173+
.with_content("[program:loginmgtd]")
174+
175+
is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
176+
.with_content("[program:cfn-hup]")
177+
end
178+
end
118179
end
119180
end
120181
end

cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
# Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%>
22
# Local modifications could be overwritten.
33
<%# HeadNode, ComputeFleet, LoginNode -%>
4-
<% case node['cluster']['node_type'] -%>
5-
<% when 'HeadNode', 'ComputeFleet', 'LoginNode' -%>
4+
<% if @cfnhup_enabled -%>
65
[program:cfn-hup]
76
command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh
87
autorestart = true

cookbooks/aws-parallelcluster-shared/attributes/cluster.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,6 @@
3434

3535
# Default NFS mount options
3636
default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime'
37+
38+
# Cluster Updates
39+
default['cluster']['in_place_update_on_fleet_enabled'] = 'true'

cookbooks/aws-parallelcluster-shared/libraries/helpers.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,14 @@ def wait_sync_file(path)
106106
timeout 5
107107
end
108108
end
109+
110+
def cfnhup_enabled?
111+
# cfn-hup is always enabled on the head node, as it is required to perform cluster updates.
112+
# cfn-hup can be disabled on compute nodes and login nodes, limiting the cluster update in the sense that
113+
# live updates on compute and login nodes are not possible.
114+
node['cluster']['node_type'] == 'HeadNode' || node['cluster']['in_place_update_on_fleet_enabled'] == 'true'
115+
end
116+
117+
def cluster_readiness_check_on_update_enabled?
118+
node['cluster']['in_place_update_on_fleet_enabled'] == 'true'
119+
end
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
require_relative '../../../libraries/helpers'
2+
require 'spec_helper'
3+
4+
describe 'cfnhup_enabled?' do
5+
let(:node) { Chef::Node.new }
6+
7+
context 'when node type is HeadNode' do
8+
before { node.override['cluster']['node_type'] = 'HeadNode' }
9+
10+
it 'returns true regardless of in_place_update_on_fleet_enabled setting' do
11+
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
12+
expect(cfnhup_enabled?).to be true
13+
end
14+
end
15+
16+
%w(ComputeFleet LoginNode).each do |node_type|
17+
context "when node type is #{node_type}" do
18+
before { node.override['cluster']['node_type'] = node_type }
19+
20+
it 'returns true when in_place_update_on_fleet_enabled is true' do
21+
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'true'
22+
expect(cfnhup_enabled?).to be true
23+
end
24+
25+
it 'returns false when in_place_update_on_fleet_enabled is false' do
26+
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
27+
expect(cfnhup_enabled?).to be false
28+
end
29+
end
30+
end
31+
end
32+
33+
describe 'cluster_readiness_check_on_update_enabled?' do
34+
let(:node) { Chef::Node.new }
35+
36+
[true, false].each do |in_place_update_on_fleet_enabled|
37+
it "returns #{in_place_update_on_fleet_enabled} when in_place_update_on_fleet_enabled is #{in_place_update_on_fleet_enabled}" do
38+
node.override['cluster']['in_place_update_on_fleet_enabled'] = in_place_update_on_fleet_enabled.to_s
39+
expect(cluster_readiness_check_on_update_enabled?).to be in_place_update_on_fleet_enabled
40+
end
41+
end
42+
end

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def update_nodes_in_queue(strategy, queues)
272272

273273
chef_sleep '15'
274274

275-
wait_cluster_ready
275+
wait_cluster_ready if cluster_readiness_check_on_update_enabled?
276276

277277
execute 'start clustermgtd' do
278278
command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(are_mount_or_unmount_required)
1616
allow_any_instance_of(Object).to receive(:dig).and_return(true)
1717
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path)
18+
allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(true)
1819
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
1920

2021
node.override['cluster']['stack_name'] = cluster_name
@@ -58,6 +59,27 @@
5859
end
5960
end
6061
end
62+
63+
context 'when cluster readiness check is disabled' do
64+
cached(:chef_run) do
65+
runner = runner(platform: platform, version: version) do |node|
66+
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false)
67+
allow_any_instance_of(Object).to receive(:dig).and_return(true)
68+
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path)
69+
allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(false)
70+
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
71+
72+
node.override['cluster']['stack_name'] = cluster_name
73+
node.override['cluster']['region'] = region
74+
node.override['cluster']['cluster_config_version'] = cluster_config_version
75+
node.override['cluster']['scripts_dir'] = scripts_dir
76+
end
77+
runner.converge(described_recipe)
78+
end
79+
it 'does not check cluster readiness' do
80+
is_expected.not_to run_execute("Check cluster readiness")
81+
end
82+
end
6183
end
6284
end
6385
end

0 commit comments

Comments
 (0)