Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Libfabric-aws: libfabric-aws-2.3.1-1
- Rdma-core: rdma-core-60.0-1
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11
- Upgrade NVIDIA driver to version 580.105.08 (from 570.172.08) for all OSs except Amazon Linux 2.
- Upgrade GDRCopy to version 2.5.1 (from 2.4.4).
- Upgrade DCV to version 2025.0-20103 (from 2024.0-19030).
- Upgrade CUDA Toolkit to version 13.0.2 (from 12.8.1) for all OSs except Amazon Linux 2.
- Upgrade NVIDIA Fabric Manager to version 580.105.08 for all OSs except Amazon Linux 2.
- Upgrade Python to 3.14.2 (from 3.12.11) for all OSs except Amazon Linux 2.
- Upgrade aws-cfn-bootstrap to version 2.0-38 (from 2.0-33).

Expand Down
12 changes: 6 additions & 6 deletions cookbooks/aws-parallelcluster-platform/attributes/platform.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# NVidia
# Default attributes for the NVIDIA stack: the enabled flag, the driver version and the DCGM version.
default['cluster']['nvidia']['enabled'] = 'no'
# NOTE(review): driver_version is assigned twice in a row; the later assignment ('580.105.08') is the
# value that takes effect. The '570.172.08' line looks like the pre-change side of a diff view — confirm.
default['cluster']['nvidia']['driver_version'] = '570.172.08'
default['cluster']['nvidia']['driver_version'] = '580.105.08'
default['cluster']['nvidia']['dcgm_version'] = '4.4.1-1'
if platform?('amazon') && node['platform_version'] == "2"
default['cluster']['nvidia']['driver_version'] = '550.127.08'
Expand All @@ -40,13 +40,13 @@
default['cluster']['dcv']['authenticator']['private_key'] = "#{node['cluster']['etc_dir']}/ext-auth-private-key.pem"
default['cluster']['dcv']['authenticator']['virtualenv_name'] = "dcv_authenticator_virtualenv"
default['cluster']['dcv']['authenticator']['virtualenv_path'] = "#{node['cluster']['system_pyenv_root']}/versions/#{node['cluster']['python-version']}/envs/#{node['cluster']['dcv']['authenticator']['virtualenv_name']}"
# DCV bundle version and listening port.
# NOTE(review): every DCV version attribute below is assigned twice (a 2024.0 value, then a 2025.0 value);
# the later assignment wins. The 2024.0 lines look like the pre-change side of a diff view — confirm.
default['cluster']['dcv']['version'] = '2024.0-19030'
default['cluster']['dcv']['version'] = '2025.0-20103'
default['cluster']['dcv_port'] = 8443

# Versions of the individual DCV components (server, xdcv, gl, web viewer).
default['cluster']['dcv']['server']['version'] = '2024.0.19030-1'
default['cluster']['dcv']['xdcv']['version'] = '2024.0.654-1'
default['cluster']['dcv']['gl']['version'] = '2024.0.1096-1'
default['cluster']['dcv']['web_viewer']['version'] = '2024.0.19030-1'
default['cluster']['dcv']['server']['version'] = '2025.0.20103-1'
default['cluster']['dcv']['xdcv']['version'] = '2025.0.688-1'
default['cluster']['dcv']['gl']['version'] = '2025.0.1112-1'
default['cluster']['dcv']['web_viewer']['version'] = '2025.0.20103-1'

# OpenSSH settings for AWS ParallelCluster instances
default['openssh']['server']['protocol'] = '2'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@

# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
# Cuda installer naming: cuda_11.8.0_520.61.05_linux
# The locals below hold the pieces of that name: major.minor version, patch level, and the
# driver-version suffix; cuda_samples_version is tracked separately.
# NOTE(review): cuda_version, cuda_patch, cuda_version_suffix and cuda_samples_version are each assigned
# twice; the later value wins, so cuda_complete_version evaluates to '13.0.2'. The 12.8 / 570.124.06
# lines look like the pre-change side of a diff view — confirm.
cuda_version = '12.8'
cuda_patch = '1'
cuda_version = '13.0'
cuda_patch = '2'
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
cuda_version_suffix = '570.124.06'
cuda_samples_version = '12.8'
cuda_version_suffix = '580.95.05'
cuda_samples_version = '13.0'
if platform?('amazon') && node['platform_version'] == "2"
cuda_version = '12.4'
cuda_patch = '1'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

# Returns a SHA-256 hex digest string, chosen by whether arm_instance? is true
# (presumably the expected checksum of the DCV download for that architecture — confirm against caller).
# NOTE(review): each branch contains two string literals; only the last one in a branch is returned,
# the first is evaluated and discarded. The first literal looks like the pre-change value — confirm.
def dcv_sha256sum
if arm_instance?
'065f7f63b8bf92a062c85ea749d7bdbaff66acb4d6404cf31200889f1461b624'
'770058467e36686e77b4cd6a3d9a953f6714862182e2c9ccd72958d59f35df5b'
else
'd631d48e8b268d91c55cc3c56f59c9aeaba0217bc1f649f8c6c75957d41e011b'
'acfc339c9e57be9800f25734cb18dec87da2b0457b3cfd2582fc57f05de7c792'
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

# Returns a SHA-256 hex digest string, chosen by whether arm_instance? is true
# (presumably the expected checksum of the DCV download for that architecture — confirm against caller).
# NOTE(review): each branch contains two string literals; only the last one in a branch is returned,
# the first is evaluated and discarded. The first literal looks like the pre-change value — confirm.
def dcv_sha256sum
if arm_instance?
'eddd8ef8afbd3e960641b0bde4d3f76faf9e5a1c9b5b40c50da98af62cb53635'
'47205acfab5d35a7f6fe18f9311295aee51189636a2a76d14e0eca39f780192d'
else
'fbbe1157bed43d0da2c2f0da8c13645649d8eb7d722d9855f052b32c382c9f64'
'a39374d39f2d849bd13ee101970bb9eea15a8c5ec743799b7cbb7f562ece9e17'
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -18,33 +18,33 @@ def dcv_sha256sum
case el_string
when "amzn2"
# ALINUX2
'4b77afb807c4aa87e0ac958223f12887d4fc2f1e95adf313cf42025b94adfed8'
'e706e6531a979ce325a8ffd5eeb4a377f7c897f0df9a614e6d59526bc188c3b3'
when "amzn2023"
# ALINUX2023
"60001ea60e91513b5c5018c38c2178cb0fac5cd0f15875ccf694ab95d7cfe661"
"8c9d29b41ee5f9fdfced4ae257c8e6444298a61da62beb2a38add1783c2e3858"
when "el8"
# RHEL and Rocky8
'1f59654f27e5f6c148bdc8520994fe2a150a84650af3bc9fefce7f07ff7d310d'
'89fcb456ee47464ff1fd4657e50814e6a9b18dd7a1fc29ba89b6649239103eda'
when "el9"
# RHEL and Rocky9
'59ed3e6b2698aad03112d759f8bf9a6ffa6850fdf1072fa4afb4756e7314e19d'
'90b33e27e149ad3ca2ebaf8b562c86ba9115c8c282e5d87bd75cfb8ba3054419'
else
''
end
else
case el_string
when "amzn2"
# ALINUX2
'3b9a0ad9c9d521b8a9f6d5c2db0640bd97413d34fe32d418a8a7fd9cae7cc828'
'81d120b639963dff9d7c60f73dda254e2adeb42fcdb3390e1396a40395812865'
when "amzn2023"
# ALINUX2023
"35128b988dee4f1f4582bd912dc4764b8712c1f0e3a35082a5da7e039eb7ff92"
"d98eb986f3b547af22a7732ca26cb6541c3842b9ed57218f503c9acc3b29e7e2"
when "el8"
# RHEL and Rocky8
'b9d24624b857d4315bcd5d90047d18d4924940153d98828b67ae78521916dd83'
'a3038cb0119c9e287c08afb84c687e48896cb4e7af2f9c8a7724b5ae9226e718'
when "el9"
# RHEL and Rocky9
'473b439f95a3354c99718d97338256a280431c7103b5d4bed0d8d63dfc8f6312'
'830e8113d63c11ae663886b4f85f55fc5ae7b64bc24ec485cba71fa304d87ddf'
else
''
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@
use 'partial/_fabric_manager_common.rb'
use 'partial/_fabric_manager_install_rhel.rb'

# Returns the Fabric Manager package name for this platform: 'nvidia-fabric-manager'.
def fabric_manager_package
'nvidia-fabric-manager'
end

# Returns the Fabric Manager version, which is whatever _nvidia_driver_version returns.
def fabric_manager_version
_nvidia_driver_version
end

# Returns the fixed platform identifier 'rhel9'
# (presumably consumed by the shared install partial — confirm against caller).
def platform
'rhel9'
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,10 @@
use 'partial/_fabric_manager_common.rb'
use 'partial/_fabric_manager_install_rhel.rb'

# Returns the Fabric Manager package name for this platform: 'nvidia-fabric-manager'.
def fabric_manager_package
'nvidia-fabric-manager'
end

# Returns the Fabric Manager version, which is whatever _nvidia_driver_version returns.
def fabric_manager_version
_nvidia_driver_version
end

# Returns the fixed platform identifier 'rhel7'
# (presumably consumed by the shared install partial — confirm against caller).
def platform
'rhel7'
end

# Returns the hyphenated package name 'nvidia-fabric-manager' for this ('rhel7') platform.
# NOTE(review): this differs from the 'nvidia-fabricmanager' name returned by the shared partial's
# definition, so any caller that reuses this value as a service name gets the hyphenated form — confirm.
def fabric_manager_package
'nvidia-fabric-manager'
end
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@
use 'partial/_fabric_manager_common.rb'
use 'partial/_fabric_manager_install_rhel.rb'

# Returns the Fabric Manager package name for this platform: 'nvidia-fabric-manager'.
def fabric_manager_package
'nvidia-fabric-manager'
end

# Returns the Fabric Manager version, which is whatever _nvidia_driver_version returns.
def fabric_manager_version
_nvidia_driver_version
end

# Builds the platform identifier as 'rhel' followed by the integer part of node['platform_version'];
# String#to_i keeps only the leading digits, so e.g. '9.4' yields 'rhel9'.
def platform
"rhel#{node['platform_version'].to_i}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@
use 'partial/_fabric_manager_common.rb'
use 'partial/_fabric_manager_install_rhel.rb'

# Returns the Fabric Manager package name for this platform: 'nvidia-fabric-manager'.
def fabric_manager_package
'nvidia-fabric-manager'
end

# Returns the Fabric Manager version, which is whatever _nvidia_driver_version returns.
def fabric_manager_version
_nvidia_driver_version
end

# Builds the platform identifier as 'rhel' followed by the integer part of node['platform_version'];
# String#to_i keeps only the leading digits, so e.g. '8.10' yields 'rhel8'.
def platform
"rhel#{node['platform_version'].to_i}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@
use 'partial/_fabric_manager_common.rb'
use 'partial/_fabric_manager_install_debian.rb'

# Returns the Fabric Manager package name 'nvidia-fabricmanager-570'; the name embeds
# a driver major version (570), so it has to be kept in step with the driver version by hand.
def fabric_manager_package
'nvidia-fabricmanager-570'
end

# Returns _nvidia_driver_version converted to a String by interpolation.
def fabric_manager_version
"#{_nvidia_driver_version}"
end

# Builds the platform identifier as 'ubuntu' followed by node['platform_version'] with the dots removed,
# e.g. '22.04' yields 'ubuntu2204'.
def platform
"ubuntu#{node['platform_version'].delete('.')}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
action :configure do
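# Start nvidia fabric manager on NVSwitch enabled systems, except for GB200 which does not need it
# NOTE(review): the service name below is taken from fabric_manager_package, so a platform that overrides
# that method with 'nvidia-fabric-manager' will address a service of that name rather than
# 'nvidia-fabricmanager' — confirm the unit name installed on that platform.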
if get_nvswitches > 1 && !is_gb200_node?
service 'nvidia-fabricmanager' do
service "#{fabric_manager_package}" do
action %i(start enable)
supports status: true
end
Expand All @@ -52,6 +52,14 @@ def _nvidia_driver_version
nvidia_driver_version || node['cluster']['nvidia']['driver_version']
end

# Default Fabric Manager package name: 'nvidia-fabricmanager'.
# Platform-specific resource files may define their own fabric_manager_package returning another name.
def fabric_manager_package
'nvidia-fabricmanager'
end

# Default Fabric Manager version: whatever _nvidia_driver_version returns.
def fabric_manager_version
_nvidia_driver_version
end

# Get number of nv switches
def get_nvswitches
# We sum the count for all these deviceIds as output of lspci command will be >0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ def gdrcopy_arch
# Returns the shell script (as a squiggly-heredoc String) that builds the GDRCopy .deb packages with
# CUDA=/usr/local/cuda and then installs them with dpkg. File names are assembled from gdrcopy_version,
# gdrcopy_arch and gdrcopy_platform.
# NOTE(review): the script lists each of the four packages twice — once as "<version>_<arch>" and once
# as "<version>-1_<arch>". The un-suffixed lines look like the pre-change side of a diff view; as written
# both sets of dpkg commands are part of the returned script — confirm which set is intended.
def installation_code
<<~COMMAND
CUDA=/usr/local/cuda ./build-deb-packages.sh
dpkg -i gdrdrv-dkms_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb
dpkg -i libgdrapi_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb
dpkg -i gdrcopy-tests_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}+cuda*.deb
dpkg -i gdrcopy_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb
dpkg -i gdrdrv-dkms_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb
dpkg -i libgdrapi_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb
dpkg -i gdrcopy-tests_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}+cuda*.deb
dpkg -i gdrcopy_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb
COMMAND
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and limitations under the License.

# Returns the GDRCopy version string.
# NOTE(review): two literals appear in the body; only the last ('2.5.1') is returned, the first is
# evaluated and discarded. '2.4.4' looks like the pre-change value — confirm.
def gdrcopy_version
'2.4.4'
'2.5.1'
end

# Returns a 64-character hex digest (presumably the SHA-256 of the GDRCopy source archive — confirm).
# NOTE(review): two literals appear in the body; only the last one is returned, the first is
# evaluated and discarded. The first literal looks like the pre-change value — confirm.
def gdrcopy_checksum
'8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8'
'c6d5ebb7dabb89d798f27609511735595004da73af28d93ac041bb5290c4cbec'
end

unified_mode true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,7 @@ def _nvidia_dcgm_enabled
# Returns the fixed platform identifier 'rhel7'.
def platform
'rhel7'
end

# Returns the DCGM 4 package name for this ('rhel7') platform: dcgm_package with the suffix
# '-4-cuda12' appended, i.e. the CUDA 12 build.
def dcgm4_package
"#{dcgm_package}-4-cuda12"
end
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def dcgm_package
end

# Returns the DCGM 4 package name: dcgm_package followed by a '-4-cuda<N>' suffix.
# NOTE(review): two interpolated strings appear in the body; only the last ('-4-cuda13') is returned.
# The '-4-cuda12' line looks like the pre-change value — confirm.
def dcgm4_package
"#{dcgm_package}-4-cuda12"
"#{dcgm_package}-4-cuda13"
end

def dcgm4_core_package
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def dcgm_package
end

# Returns the DCGM 4 package name: dcgm_package followed by a '-4-cuda<N>' suffix.
# NOTE(review): two interpolated strings appear in the body; only the last ('-4-cuda13') is returned.
# The '-4-cuda12' line looks like the pre-change value — confirm.
def dcgm4_package
"#{dcgm_package}-4-cuda12"
"#{dcgm_package}-4-cuda13"
end

def dcgm4_core_package
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ def imex_installed?
# :configure action with an intentionally empty body.
action :configure do
# Do nothing
end

# Returns the IMEX package name built as "<nvidia_imex_service>-<nvidia_driver_major_version>",
# i.e. the service name with the driver major version appended.
def nvidia_imex_package
"#{nvidia_imex_service}-#{nvidia_driver_major_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
end

# Returns the IMEX package name.
# NOTE(review): two interpolated strings appear in the body; only the last one is returned, so the
# result is nvidia_imex_service as a String with no driver-major-version suffix. The first line looks
# like the pre-change value — confirm.
def nvidia_imex_package
"#{nvidia_imex_service}-#{nvidia_driver_major_version}"
"#{nvidia_imex_service}"
end

def nvidia_driver_major_version
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
require 'spec_helper'

describe 'aws-parallelcluster-platform::cuda' do
cached(:cuda_version) { '12.8' }
cached(:cuda_patch) { '1' }
cached(:cuda_version) { '13.0' }
cached(:cuda_patch) { '2' }
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
cached(:cuda_version_suffix) { '570.124.06' }
cached(:cuda_version_suffix) { '580.95.05' }

context 'when nvidia not enabled' do
cached(:chef_run) do
Expand All @@ -20,7 +20,7 @@
context 'when on arm' do
cached(:cuda_arch) { 'linux_sbsa' }
cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" }
cached(:cuda_samples_version) { '12.8' }
cached(:cuda_samples_version) { '13.0' }
cached(:cuda_samples_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" }

cached(:chef_run) do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ def self.configure(chef_run)

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-570' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }
cached(:fabric_manager_version) { nvidia_driver_version }
cached(:fabric_manager_package) { platform == 'amazon' && version == '2' ? 'nvidia-fabric-manager' : 'nvidia-fabricmanager' }

context 'when fabric manager is to install' do
cached(:chef_run) do
Expand Down Expand Up @@ -204,7 +204,7 @@ def self.configure(chef_run)
end

it 'installs fabric manager' do
is_expected.to run_bash("Install nvidia-fabric-manager")
is_expected.to run_bash("Install #{fabric_manager_package}")
.with(user: 'root')
.with_retries(3)
.with_retry_delay(5)
Expand All @@ -222,9 +222,8 @@ def self.configure(chef_run)
[true, false].each do |is_gb200|
for_all_oses do |platform, version|
context "on #{platform}#{version} on #{is_gb200} gb200 node" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }

cached(:fabric_manager_version) { nvidia_driver_version }
cached(:fabric_manager_package) { platform == 'amazon' && version == '2' ? 'nvidia-fabric-manager' : 'nvidia-fabricmanager' }
context('when nvswithes are > 1') do
cached(:chef_run) do
stubs_for_provider('fabric_manager') do |res|
Expand All @@ -243,13 +242,13 @@ def self.configure(chef_run)

if is_gb200
it 'does not start nvidia-fabricmanager service' do
is_expected.not_to start_service('nvidia-fabricmanager')
is_expected.not_to start_service("#{fabric_manager_package}")
.with_action(%i(start enable))
.with_supports({ status: true })
end
else
it 'starts nvidia-fabricmanager service' do
is_expected.to start_service('nvidia-fabricmanager')
is_expected.to start_service("#{fabric_manager_package}")
.with_action(%i(start enable))
.with_supports({ status: true })
end
Expand All @@ -269,7 +268,7 @@ def self.configure(chef_run)
end

it "doesn't start nvidia-fabricmanager service" do
is_expected.not_to start_service('nvidia-fabricmanager')
is_expected.not_to start_service("#{fabric_manager_package}")
end
end
end
Expand Down
Loading
Loading