diff --git a/CHANGELOG.md b/CHANGELOG.md index 54a1c85b0d..2b1cc1b849 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,11 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: libfabric-aws-2.3.1-1 - Rdma-core: rdma-core-60.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.8-11 +- Upgrade NVIDIA driver to version 580.105.08 (from 570.172.08) for all OSs except Amazon Linux 2. +- Upgrade GDRCopy to version 2.5.1 (from 2.4.4). +- Upgrade DCV to version 2025.0-20103 (from 2024.0-19030). +- Upgrade CUDA Toolkit to version 13.0.2 (from 12.8.1) for all OSs except Amazon Linux 2. +- Upgrade NVIDIA Fabric Manager to version 580.105.08 (from 570.172.08) for all OSs except Amazon Linux 2. - Upgrade Python to 3.14.2 (from 3.12.11) for all OSs except Amazon Linux 2. - Upgrade aws-cfn-bootstrap to version 2.0-38 (from 2.0-33). diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 56d97ec2bd..e6fe183221 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -16,7 +16,7 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' -default['cluster']['nvidia']['driver_version'] = '570.172.08' +default['cluster']['nvidia']['driver_version'] = '580.105.08' default['cluster']['nvidia']['dcgm_version'] = '4.4.1-1' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['nvidia']['driver_version'] = '550.127.08' @@ -40,13 +40,13 @@ default['cluster']['dcv']['authenticator']['private_key'] = "#{node['cluster']['etc_dir']}/ext-auth-private-key.pem" default['cluster']['dcv']['authenticator']['virtualenv_name'] = "dcv_authenticator_virtualenv" default['cluster']['dcv']['authenticator']['virtualenv_path'] = 
"#{node['cluster']['system_pyenv_root']}/versions/#{node['cluster']['python-version']}/envs/#{node['cluster']['dcv']['authenticator']['virtualenv_name']}" -default['cluster']['dcv']['version'] = '2024.0-19030' +default['cluster']['dcv']['version'] = '2025.0-20103' default['cluster']['dcv_port'] = 8443 -default['cluster']['dcv']['server']['version'] = '2024.0.19030-1' -default['cluster']['dcv']['xdcv']['version'] = '2024.0.654-1' -default['cluster']['dcv']['gl']['version'] = '2024.0.1096-1' -default['cluster']['dcv']['web_viewer']['version'] = '2024.0.19030-1' +default['cluster']['dcv']['server']['version'] = '2025.0.20103-1' +default['cluster']['dcv']['xdcv']['version'] = '2025.0.688-1' +default['cluster']['dcv']['gl']['version'] = '2025.0.1112-1' +default['cluster']['dcv']['web_viewer']['version'] = '2025.0.20103-1' # OpenSSH settings for AWS ParallelCluster instances default['openssh']['server']['protocol'] = '2' diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index a311ab0ba9..f551d7b04e 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -19,11 +19,11 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux -cuda_version = '12.8' -cuda_patch = '1' +cuda_version = '13.0' +cuda_patch = '2' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '570.124.06' -cuda_samples_version = '12.8' +cuda_version_suffix = '580.95.05' +cuda_samples_version = '13.0' if platform?('amazon') && node['platform_version'] == "2" cuda_version = '12.4' cuda_patch = '1' diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu22.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu22.rb index 705d906e3a..b5dcfe086c 100644 --- 
a/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu22.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu22.rb @@ -21,8 +21,8 @@ def dcv_sha256sum if arm_instance? - '065f7f63b8bf92a062c85ea749d7bdbaff66acb4d6404cf31200889f1461b624' + '770058467e36686e77b4cd6a3d9a953f6714862182e2c9ccd72958d59f35df5b' else - 'd631d48e8b268d91c55cc3c56f59c9aeaba0217bc1f649f8c6c75957d41e011b' + 'acfc339c9e57be9800f25734cb18dec87da2b0457b3cfd2582fc57f05de7c792' end end diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu24.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu24.rb index 21ad4d3b2f..2ed745f450 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu24.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/dcv_ubuntu24.rb @@ -21,8 +21,8 @@ def dcv_sha256sum if arm_instance? - 'eddd8ef8afbd3e960641b0bde4d3f76faf9e5a1c9b5b40c50da98af62cb53635' + '47205acfab5d35a7f6fe18f9311295aee51189636a2a76d14e0eca39f780192d' else - 'fbbe1157bed43d0da2c2f0da8c13645649d8eb7d722d9855f052b32c382c9f64' + 'a39374d39f2d849bd13ee101970bb9eea15a8c5ec743799b7cbb7f562ece9e17' end end diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_rhel_common.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_rhel_common.rb index 2d474b9282..49023108ae 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_rhel_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_rhel_common.rb @@ -18,16 +18,16 @@ def dcv_sha256sum case el_string when "amzn2" # ALINUX2 - '4b77afb807c4aa87e0ac958223f12887d4fc2f1e95adf313cf42025b94adfed8' + 'e706e6531a979ce325a8ffd5eeb4a377f7c897f0df9a614e6d59526bc188c3b3' when "amzn2023" # ALINUX2023 - "60001ea60e91513b5c5018c38c2178cb0fac5cd0f15875ccf694ab95d7cfe661" + "8c9d29b41ee5f9fdfced4ae257c8e6444298a61da62beb2a38add1783c2e3858" when "el8" # RHEL and Rocky8 - 
'1f59654f27e5f6c148bdc8520994fe2a150a84650af3bc9fefce7f07ff7d310d' + '89fcb456ee47464ff1fd4657e50814e6a9b18dd7a1fc29ba89b6649239103eda' when "el9" # RHEL and Rocky9 - '59ed3e6b2698aad03112d759f8bf9a6ffa6850fdf1072fa4afb4756e7314e19d' + '90b33e27e149ad3ca2ebaf8b562c86ba9115c8c282e5d87bd75cfb8ba3054419' else '' end @@ -35,16 +35,16 @@ def dcv_sha256sum case el_string when "amzn2" # ALINUX2 - '3b9a0ad9c9d521b8a9f6d5c2db0640bd97413d34fe32d418a8a7fd9cae7cc828' + '81d120b639963dff9d7c60f73dda254e2adeb42fcdb3390e1396a40395812865' when "amzn2023" # ALINUX2023 - "35128b988dee4f1f4582bd912dc4764b8712c1f0e3a35082a5da7e039eb7ff92" + "d98eb986f3b547af22a7732ca26cb6541c3842b9ed57218f503c9acc3b29e7e2" when "el8" # RHEL and Rocky8 - 'b9d24624b857d4315bcd5d90047d18d4924940153d98828b67ae78521916dd83' + 'a3038cb0119c9e287c08afb84c687e48896cb4e7af2f9c8a7724b5ae9226e718' when "el9" # RHEL and Rocky9 - '473b439f95a3354c99718d97338256a280431c7103b5d4bed0d8d63dfc8f6312' + '830e8113d63c11ae663886b4f85f55fc5ae7b64bc24ec485cba71fa304d87ddf' else '' end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb index 46c891ed43..c4705c1f1b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_alinux2023.rb @@ -19,14 +19,6 @@ use 'partial/_fabric_manager_common.rb' use 'partial/_fabric_manager_install_rhel.rb' -def fabric_manager_package - 'nvidia-fabric-manager' -end - -def fabric_manager_version - _nvidia_driver_version -end - def platform 'rhel9' end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb index 375dcb02ce..347687b3da 100644 --- 
a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb @@ -17,14 +17,10 @@ use 'partial/_fabric_manager_common.rb' use 'partial/_fabric_manager_install_rhel.rb' -def fabric_manager_package - 'nvidia-fabric-manager' -end - -def fabric_manager_version - _nvidia_driver_version -end - def platform 'rhel7' end + +def fabric_manager_package + 'nvidia-fabric-manager' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb index 223cabaf89..8c5f33f0ab 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb @@ -19,14 +19,6 @@ use 'partial/_fabric_manager_common.rb' use 'partial/_fabric_manager_install_rhel.rb' -def fabric_manager_package - 'nvidia-fabric-manager' -end - -def fabric_manager_version - _nvidia_driver_version -end - def platform "rhel#{node['platform_version'].to_i}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb index c0d76676c2..9c979560b0 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb @@ -19,14 +19,6 @@ use 'partial/_fabric_manager_common.rb' use 'partial/_fabric_manager_install_rhel.rb' -def fabric_manager_package - 'nvidia-fabric-manager' -end - -def fabric_manager_version - _nvidia_driver_version -end - def platform "rhel#{node['platform_version'].to_i}" end diff --git 
a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu22+.rb index a19faad54e..dd22386b20 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu22+.rb @@ -19,14 +19,6 @@ use 'partial/_fabric_manager_common.rb' use 'partial/_fabric_manager_install_debian.rb' -def fabric_manager_package - 'nvidia-fabricmanager-570' -end - -def fabric_manager_version - "#{_nvidia_driver_version}" -end - def platform "ubuntu#{node['platform_version'].delete('.')}" end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index e615f07c89..dcadd80617 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -32,7 +32,7 @@ action :configure do # Start nvidia fabric manager on NVSwitch enabled systems, except for GB200 which does not need it if get_nvswitches > 1 && !is_gb200_node? 
- service 'nvidia-fabricmanager' do + service "#{fabric_manager_package}" do action %i(start enable) supports status: true end @@ -52,6 +52,14 @@ def _nvidia_driver_version nvidia_driver_version || node['cluster']['nvidia']['driver_version'] end +def fabric_manager_package + 'nvidia-fabricmanager' +end + +def fabric_manager_version + _nvidia_driver_version +end + # Get number of nv switches def get_nvswitches # We sum the count for all these deviceIds as output of lscpi command will be >0 diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu22+.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu22+.rb index ef51f0e04c..e352ad119c 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu22+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/gdrcopy_ubuntu22+.rb @@ -29,10 +29,10 @@ def gdrcopy_arch def installation_code <<~COMMAND CUDA=/usr/local/cuda ./build-deb-packages.sh - dpkg -i gdrdrv-dkms_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb - dpkg -i libgdrapi_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb - dpkg -i gdrcopy-tests_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}+cuda*.deb - dpkg -i gdrcopy_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb + dpkg -i gdrdrv-dkms_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb + dpkg -i libgdrapi_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb + dpkg -i gdrcopy-tests_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}+cuda*.deb + dpkg -i gdrcopy_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb COMMAND end diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb index e7cce185a6..87ec76f2a6 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb +++ 
b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and limitations under the License. def gdrcopy_version - '2.4.4' + '2.5.1' end def gdrcopy_checksum - '8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8' + 'c6d5ebb7dabb89d798f27609511735595004da73af28d93ac041bb5290c4cbec' end unified_mode true diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb index 293fc1bc78..2ade08c58a 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb @@ -24,3 +24,7 @@ def _nvidia_dcgm_enabled def platform 'rhel7' end + +def dcgm4_package + "#{dcgm_package}-4-cuda12" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb index e4882101ad..5cf41d11c0 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -45,7 +45,7 @@ def dcgm_package end def dcgm4_package - "#{dcgm_package}-4-cuda12" + "#{dcgm_package}-4-cuda13" end def dcgm4_core_package diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb index c22f791e39..925d6c2b6c 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -47,7 +47,7 @@ def dcgm_package end def 
dcgm4_package - "#{dcgm_package}-4-cuda12" + "#{dcgm_package}-4-cuda13" end def dcgm4_core_package diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb index 5f0c765bb7..d46666e068 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_amazon2.rb @@ -25,3 +25,7 @@ def imex_installed? action :configure do # Do nothing end + +def nvidia_imex_package + "#{nvidia_imex_service}-#{nvidia_driver_major_version}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index 454ea6e99b..205d6c6fe7 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -80,7 +80,7 @@ end def nvidia_imex_package - "#{nvidia_imex_service}-#{nvidia_driver_major_version}" + "#{nvidia_imex_service}" end def nvidia_driver_major_version diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 27a001ff05..4c07ca73bc 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -1,10 +1,10 @@ require 'spec_helper' describe 'aws-parallelcluster-platform::cuda' do - cached(:cuda_version) { '12.8' } - cached(:cuda_patch) { '1' } + cached(:cuda_version) { '13.0' } + cached(:cuda_patch) { '2' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '570.124.06' } + cached(:cuda_version_suffix) { '580.95.05' } context 'when 
nvidia not enabled' do cached(:chef_run) do @@ -20,7 +20,7 @@ context 'when on arm' do cached(:cuda_arch) { 'linux_sbsa' } cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" } - cached(:cuda_samples_version) { '12.8' } + cached(:cuda_samples_version) { '13.0' } cached(:cuda_samples_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" } cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index 9fa1ce4fde..b675aa4416 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -168,8 +168,8 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-570' : 'nvidia-fabric-manager' } - cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version } + cached(:fabric_manager_version) { nvidia_driver_version } + cached(:fabric_manager_package) { platform == 'amazon' && version == '2' ? 
'nvidia-fabric-manager' : 'nvidia-fabricmanager' } context 'when fabric manager is to install' do cached(:chef_run) do @@ -204,7 +204,7 @@ def self.configure(chef_run) end it 'installs fabric manager' do - is_expected.to run_bash("Install nvidia-fabric-manager") + is_expected.to run_bash("Install #{fabric_manager_package}") .with(user: 'root') .with_retries(3) .with_retry_delay(5) @@ -222,9 +222,8 @@ def self.configure(chef_run) [true, false].each do |is_gb200| for_all_oses do |platform, version| context "on #{platform}#{version} on #{is_gb200} gb200 node" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } - cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version } - + cached(:fabric_manager_version) { nvidia_driver_version } + cached(:fabric_manager_package) { platform == 'amazon' && version == '2' ? 'nvidia-fabric-manager' : 'nvidia-fabricmanager' } context('when nvswithes are > 1') do cached(:chef_run) do stubs_for_provider('fabric_manager') do |res| @@ -243,13 +242,13 @@ def self.configure(chef_run) if is_gb200 it 'does not start nvidia-fabricmanager service' do - is_expected.not_to start_service('nvidia-fabricmanager') + is_expected.not_to start_service("#{fabric_manager_package}") .with_action(%i(start enable)) .with_supports({ status: true }) end else it 'starts nvidia-fabricmanager service' do - is_expected.to start_service('nvidia-fabricmanager') + is_expected.to start_service("#{fabric_manager_package}") .with_action(%i(start enable)) .with_supports({ status: true }) end @@ -269,7 +268,7 @@ def self.configure(chef_run) end it "doesn't start nvidia-fabricmanager service" do - is_expected.not_to start_service('nvidia-fabricmanager') + is_expected.not_to start_service("#{fabric_manager_package}") end end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb 
b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb index be61bcf74a..776ef0a1e9 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb @@ -136,7 +136,7 @@ def self.configure(chef_run) expected_gdrcopy_version = if platform == "centos" "2.3.1" else - "2.4.4" + "2.5.1" end expect(resource.gdrcopy_version).to eq(expected_gdrcopy_version) end @@ -160,7 +160,7 @@ def self.configure(chef_run) expected_gdrcopy_checksum = if platform == "centos" "59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8" else - "8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8" + "c6d5ebb7dabb89d798f27609511735595004da73af28d93ac041bb5290c4cbec" end expect(resource.gdrcopy_checksum).to eq(expected_gdrcopy_checksum) end @@ -186,12 +186,12 @@ def self.configure(chef_run) context "on #{platform}#{version} when gdrcopy enabled" do cached(:sources_dir) { 'sources_dir' } - cached(:gdrcopy_version) { platform == 'centos' ? '2.3.1' : '2.4.4' } + cached(:gdrcopy_version) { platform == 'centos' ? '2.3.1' : '2.5.1' } cached(:gdrcopy_checksum) do if platform == 'centos' '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8' else - '8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8' + 'c6d5ebb7dabb89d798f27609511735595004da73af28d93ac041bb5290c4cbec' end end cached(:gdrcopy_service) { platform == 'ubuntu' ? 
'gdrdrv' : 'gdrcopy' } @@ -274,10 +274,10 @@ def self.configure(chef_run) if platform == 'ubuntu' expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-deb-packages.sh}) - expect(installation_code).to match(/dpkg -i gdrdrv-dkms_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) - expect(installation_code).to match(/dpkg -i libgdrapi_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) - expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}\+cuda\*.deb/) - expect(installation_code).to match(/dpkg -i gdrcopy_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) + expect(installation_code).to match(/dpkg -i gdrdrv-dkms_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) + expect(installation_code).to match(/dpkg -i libgdrapi_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) + expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}\+cuda\*.deb/) + expect(installation_code).to match(/dpkg -i gdrcopy_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/) elsif platform == 'centos' expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh}) expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms || rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.noarch.#{gdrcopy_platform}.rpm/) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb index 08e45803d1..965354466d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb @@ -171,7 +171,7 @@ def self.setup(chef_run, nvidia_enabled: nil) else it 'installs datacenter gpu manager' do is_expected.to run_bash('Install datacenter-gpu-manager-4-core') - is_expected.to 
run_bash('Install datacenter-gpu-manager-4-cuda12') + is_expected.to run_bash('Install datacenter-gpu-manager-4-cuda13') end end end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 44da506152..a0a780fc9d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -202,7 +202,7 @@ def self.configure(chef_run) %w(aarch64 x86_64).each do |arm_or_x86| context "when nvidia is enabled on #{arm_or_x86}" do cached(:nvidia_imex_version) { "1.2.3-1" } - cached(:nvidia_imex_package) { "nvidia-imex-1" } + cached(:nvidia_imex_package) { "nvidia-imex" } cached(:nvidia_imex_name) do if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' "#{nvidia_imex_package}-#{nvidia_imex_version}" diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb index 29d8179436..27d1863403 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb @@ -19,7 +19,7 @@ it { should be_installed } end else - describe package('datacenter-gpu-manager-4-cuda12') do + describe package('datacenter-gpu-manager-4-cuda13') do it { should be_installed } end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_gdrcopy_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_gdrcopy_spec.rb index 276891b82b..9a6d2b1988 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_gdrcopy_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_gdrcopy_spec.rb @@ -15,7 +15,7 @@ (node['cluster']['nvidia']['enabled'] == 'yes' || node['cluster']['nvidia']['enabled'] == true) end 
- expected_gdrcopy_version = "2.4" + expected_gdrcopy_version = "2.5" describe "gdrcopy version is expected to be #{expected_gdrcopy_version}" do subject { command('modinfo -F version gdrdrv').stdout.strip() }