Skip to content

Commit 633cd28

Browse files
Merge pull request #60 from oracle-quickstart/2.11.2
What's Changed - Improved monitoring and alerting - Lustre Support - B200 support - Add Slurm Rest API support - Fix home on existing FSS - New Healthchecks - Bug fixes - ... Release notes: https://github.com/oracle-quickstart/oci-hpc/releases/tag/v2.11.2 Readme: https://github.com/oracle-quickstart/oci-hpc/blob/v2.11.2/README.md
2 parents a9f4a4d + 90e8205 commit 633cd28

File tree

168 files changed

+7873
-842
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

168 files changed

+7873
-842
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc/archive/refs/heads/master.zip)
44

5+
## Create a dynamic group
6+
For customer tenancies, check for an existing dynamic group or create one. In the OCI Console, navigate to Identity->Domains->Default domain->Dynamic groups and create a dynamic group, e.g. `instance_principal`, with the matching rule `Any {instance.compartment.id = 'ocid1.compartment.oc1.example-ocid'}`. This rule grants the instance principal broad access, so it is recommended to narrow the scope according to the customer's security posture requirements. If you change the name of the dynamic group, be sure to make the same change in the policy examples that follow this section; they all currently assume that the dynamic group is named `instance_principal`.
57

68
## Policies to deploy the stack:
79
```
@@ -24,6 +26,7 @@ Allow dynamic-group instance_principal to manage instance-family in compartment
2426
Allow dynamic-group instance_principal to use virtual-network-family in compartment compartmentName
2527
Allow dynamic-group instance_principal to use volumes in compartment compartmentName
2628
Allow dynamic-group instance_principal to manage dns in compartment compartmentName
29+
Allow dynamic-group instance_principal to read metrics in compartment compartmentName
2730
```
2831
or:
2932

autoscaling/tf_init/controller_update.tf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,21 @@ resource "local_file" "inventory" {
4040
home_nfs = var.home_nfs,
4141
create_fss = var.create_fss,
4242
home_fss = var.home_fss,
43+
mount_target_count = var.mount_target_count,
44+
nfs_list_of_mount_target_IPs = var.nfs_list_of_mount_target_IPs,
45+
manual_multiple_mount_target = var.manual_multiple_mount_target,
4346
add_nfs = var.add_nfs,
4447
slurm_nfs_path = var.slurm_nfs_path,
4548
rack_aware = var.rack_aware,
4649
nfs_target_path = var.nfs_target_path,
4750
nfs_source_IP = var.nfs_source_IP,
4851
nfs_source_path = var.nfs_source_path,
4952
nfs_options = var.nfs_options,
53+
add_lfs = var.add_lfs,
54+
lfs_target_path = var.lfs_target_path,
55+
lfs_source_IP = var.lfs_source_IP,
56+
lfs_source_path = var.lfs_source_path,
57+
lfs_options = var.lfs_options,
5058
localdisk = var.localdisk,
5159
log_vol = var.log_vol,
5260
redundancy = var.redundancy,

autoscaling/tf_init/inventory.tpl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ scratch_nfs = ${scratch_nfs}
3232
home_nfs = ${home_nfs}
3333
create_fss = ${create_fss}
3434
home_fss = ${home_fss}
35+
mount_target_count = ${mount_target_count}
36+
nfs_list_of_mount_target_IPs = ${nfs_list_of_mount_target_IPs}
37+
manual_multiple_mount_target = ${manual_multiple_mount_target}
3538
cluster_nfs = ${cluster_nfs}
3639
cluster_nfs_path = ${cluster_nfs_path}
3740
slurm_nfs_path = ${slurm_nfs_path}
@@ -81,3 +84,8 @@ healthchecks=${healthchecks}
8184
change_hostname=${change_hostname}
8285
hostname_convention=${hostname_convention}
8386
ons_topic_ocid=${ons_topic_ocid}
87+
add_lfs=${add_lfs}
88+
lfs_target_path=${lfs_target_path}
89+
lfs_source_IP=${lfs_source_IP}
90+
lfs_source_path=${lfs_source_path}
91+
lfs_options=${lfs_options}

autoscaling/tf_init/versions.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ terraform {
33
required_providers {
44
oci = {
55
source = "oracle/oci"
6-
version = "6.9.0"
6+
version = "7.1.0"
77
}
88
}
99
}

bin/controller.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ fi
171171
ansible-galaxy collection install ansible.netcommon:=2.5.1 --force > /dev/null
172172
ansible-galaxy collection install community.general:=4.8.1 --force > /dev/null
173173
ansible-galaxy collection install ansible.posix --force > /dev/null
174-
ansible-galaxy collection install community.crypto --force > /dev/null
174+
ansible-galaxy collection install community.crypto:=2.26.3 --force > /dev/null
175175

176176
threads=$(nproc)
177177
forks=$(($threads * 8))

conf/variables.tpl

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,10 @@ variable "marketplace_listing" {
5353
variable "marketplace_version_id" {
5454
type = map(string)
5555
default = {
56-
"HPC_OL8" = "Oracle-Linux-8.10-2025.02.28-0-OCA-RHCK-OFED-24.10-1.1.4.0-2025.03.27-0"
57-
"GPU_OL8_NV550" = "Oracle-Linux-8.10-2025.02.28-0-OCA-RHCK-OFED-24.10-1.1.4.0-GPU-550-CUDA-12.4-2025.03.27-0"
58-
"GPU_OL8_NV560" = "Oracle-Linux-8.10-2025.02.28-0-OCA-RHCK-OFED-24.10-1.1.4.0-GPU-550-CUDA-12.4-2025.03.27-0"
59-
"GPU_OL8_NV570" = "Oracle-Linux-8.10-2025.02.28-0-OCA-RHCK-OFED-24.10-1.1.4.0-GPU-550-CUDA-12.4-2025.03.27-0"
60-
"GPU_OL8_AMD632" = "Oracle-Linux-8.10-2025.02.28-0-OCA-RHCK-OFED-24.10-1.1.4.0-AMD-ROCM-632-2025.03.28-0"
56+
"HPC_OL8" = "Oracle-Linux-8.10-2025.06.17-0-RHCK-OFED-24.10-1.1.4.0-2025.07.19-0"
57+
"GPU_OL8_NV550" = "Oracle-Linux-8.10-2025.06.17-0-RHCK-OFED-24.10-1.1.4.0-GPU-550-CUDA-12.4-2025.07.19-0"
58+
"GPU_OL8_NV570" = "Oracle-Linux-8.10-2025.06.17-0-RHCK-OFED-24.10-1.1.4.0-GPU-570-OPEN-CUDA-12.8-2025.07.18-0"
59+
"GPU_OL8_AMD632" = "Oracle-Linux-8.10-2025.06.17-0-RHCK-OFED-24.10-1.1.4.0-AMD-ROCM-632-2025.07.20-0"
6160
}
6261
}
6362

@@ -106,6 +105,9 @@ variable "controller_mount_ip" {default = "${controller_mount_ip}"}
106105
variable "login_mount_ip" {default = "${login_mount_ip}"}
107106
variable "home_nfs" { default = ${home_nfs} }
108107
variable "home_fss" { default = ${home_fss} }
108+
variable "mount_target_count" { default = ${mount_target_count} }
109+
variable "nfs_list_of_mount_target_IPs" { default = "${nfs_list_of_mount_target_IPs}" }
110+
variable "manual_multiple_mount_target" { default = ${manual_multiple_mount_target} }
109111
variable "latency_check" { default = ${latency_check} }
110112
variable "create_fss" { default = ${create_fss} }
111113
variable "configure" { default = true }
@@ -166,4 +168,24 @@ variable "hostname_convention" {
166168
}
167169
variable "ons_topic_ocid" {
168170
default = "${ons_topic_ocid}"
169-
}
171+
}
172+
variable "add_lfs" {
173+
default = "${add_lfs}"
174+
type = bool
175+
}
176+
variable "lfs_target_path" {
177+
default = "${lfs_target_path}"
178+
type = string
179+
}
180+
variable "lfs_source_IP" {
181+
default = "${lfs_source_IP}"
182+
type = string
183+
}
184+
variable "lfs_source_path" {
185+
default = "${lfs_source_path}"
186+
type = string
187+
}
188+
variable "lfs_options" {
189+
default = "${lfs_options}"
190+
type = string
191+
}

controller.tf

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,9 @@ resource "null_resource" "cluster" {
258258
home_nfs = var.home_nfs,
259259
create_fss = var.create_fss,
260260
home_fss = var.home_fss,
261+
mount_target_count = var.mount_target_count,
262+
nfs_list_of_mount_target_IPs = local.nfs_list_of_mount_target_IPs,
263+
manual_multiple_mount_target = var.manual_multiple_mount_target,
261264
scratch_nfs = var.use_scratch_nfs && var.node_count > 0,
262265
cluster_nfs = var.use_cluster_nfs,
263266
cluster_nfs_path = var.cluster_nfs_path,
@@ -267,6 +270,11 @@ resource "null_resource" "cluster" {
267270
nfs_source_IP = local.nfs_source_IP,
268271
nfs_source_path = var.nfs_source_path,
269272
nfs_options = var.nfs_options,
273+
add_lfs = var.add_lfs,
274+
lfs_target_path = var.lfs_target_path,
275+
lfs_source_IP = local.luster_IP,
276+
lfs_source_path = var.lfs_source_path,
277+
lfs_options = var.lfs_options,
270278
localdisk = var.localdisk,
271279
log_vol = var.log_vol,
272280
redundancy = var.redundancy,
@@ -436,11 +444,19 @@ resource "null_resource" "cluster" {
436444
home_nfs = var.home_nfs,
437445
create_fss = var.create_fss,
438446
home_fss = var.home_fss,
447+
mount_target_count = var.mount_target_count,
448+
nfs_list_of_mount_target_IPs = local.nfs_list_of_mount_target_IPs,
449+
manual_multiple_mount_target = var.manual_multiple_mount_target,
439450
add_nfs = var.add_nfs,
440451
nfs_target_path = var.nfs_target_path,
441452
nfs_source_IP = local.nfs_source_IP,
442453
nfs_source_path = var.nfs_source_path,
443454
nfs_options = var.nfs_options,
455+
add_lfs = var.add_lfs,
456+
lfs_target_path = var.lfs_target_path,
457+
lfs_source_IP = local.luster_IP,
458+
lfs_source_path = var.lfs_source_path,
459+
lfs_options = var.lfs_options,
444460
localdisk = var.localdisk,
445461
log_vol = var.log_vol,
446462
redundancy = var.redundancy,
@@ -596,4 +612,4 @@ resource "oci_dns_rrset" "rrset-controller" {
596612
}
597613
scope = "PRIVATE"
598614
view_id = data.oci_dns_views.dns_views.views[0].id
599-
}
615+
}

inventory.tpl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ scratch_nfs = ${scratch_nfs}
2828
home_nfs = ${home_nfs}
2929
create_fss = ${create_fss}
3030
home_fss = ${home_fss}
31+
mount_target_count = ${mount_target_count}
32+
nfs_list_of_mount_target_IPs = ${nfs_list_of_mount_target_IPs}
33+
manual_multiple_mount_target = ${manual_multiple_mount_target}
3134
cluster_nfs = ${cluster_nfs}
3235
cluster_nfs_path = ${cluster_nfs_path}
3336
slurm_nfs_path = ${slurm_nfs_path}
@@ -86,4 +89,8 @@ healthchecks=${healthchecks}
8689
change_hostname=${change_hostname}
8790
hostname_convention=${hostname_convention}
8891
ons_topic_ocid=${ons_topic_ocid}
89-
92+
add_lfs=${add_lfs}
93+
lfs_target_path=${lfs_target_path}
94+
lfs_source_IP=${lfs_source_IP}
95+
lfs_source_path=${lfs_source_path}
96+
lfs_options=${lfs_options}

locals.tf

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ locals {
2727
subnet_id = var.private_deployment ? var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 1) : var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0)
2828

2929
nfs_source_IP = var.create_fss ? oci_dns_rrset.fss-dns-round-robin[0].domain : var.nfs_source_IP
30-
nfs_list_of_mount_target_IPs = var.create_fss ? "[\"${join("\",\"",oci_file_storage_mount_target.FSSMountTarget.*.ip_address)}\"]" : var.nfs_source_IP
30+
nfs_list_of_mount_target_IPs = var.create_fss ? join(",",oci_file_storage_mount_target.FSSMountTarget.*.ip_address) : var.nfs_source_IP
3131

3232
// subnet id derived either from created subnet or existing if specified
3333
// controller_subnet_id = var.use_existing_vcn ? var.public_subnet_id : element(concat(oci_core_subnet.public-subnet.*.id, [""]), 0)
@@ -83,4 +83,7 @@ locals {
8383
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"
8484

8585
topic_id = var.alerting ? oci_ons_notification_topic.grafana_alerts[0].id : ""
86+
87+
// Lustre IP.
88+
luster_IP = var.create_lfs ? oci_lustre_file_storage_lustre_file_system.lustre_file_system[0].management_service_address : var.lfs_source_IP
8689
}

lustre.tf

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
resource "oci_lustre_file_storage_lustre_file_system" "lustre_file_system" {
2+
#Required
3+
count = var.create_lfs ? 1 : 0
4+
availability_domain = var.ad
5+
capacity_in_gbs = var.lfs_capacity_in_gbs
6+
compartment_id = var.lfs_compartment
7+
file_system_name = var.lfs_source_path # Mount name
8+
performance_tier = var.lfs_perf_tier
9+
10+
root_squash_configuration {
11+
12+
# #Optional
13+
# client_exceptions = var.lfs_root_squash_configuration_client_exceptions
14+
identity_squash = "NONE"
15+
# squash_gid = var.lfs_root_squash_configuration_squash_gid
16+
# squash_uid = var.lfs_root_squash_configuration_squash_uid
17+
}
18+
subnet_id = local.subnet_id
19+
20+
#Optional
21+
# cluster_placement_group_id = oci_cluster_placement_groups_cluster_placement_group.test_cluster_placement_group.id
22+
# defined_tags = {"Operations.CostCenter"= "42"}
23+
display_name = "${local.cluster_name}-lfs" # File system name
24+
#file_system_description = var.lfs_file_system_description # File system description Optional
25+
freeform_tags = {
26+
"lfs_cluster_name" = local.cluster_name
27+
}
28+
# kms_key_id = oci_kms_key.test_key.id
29+
# nsg_ids = var.lustre_file_system_nsg_ids
30+
}

0 commit comments

Comments
 (0)