Skip to content

Commit aebeabf

Browse files
authored
Merge branch 'develop' into develop-tag-update
2 parents 8a4a7d1 + 33b737b commit aebeabf

File tree

17 files changed

+646
-29
lines changed

17 files changed

+646
-29
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ CHANGELOG
1212
- Add support for p6-b300 instances for all OSs except AL2.
1313
- Add alarm on missing clustermgtd heartbeat.
1414
- Support updates of `Tags` during cluster-updates.
15+
- Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource.
16+
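- This overrides the ParallelCluster defaults using a shallow merge: each top-level property set in the referenced launch template version replaces the generated one as a whole.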
1517

1618
**BUG FIXES**
1719
- Add validation to block updates that change tag order. Blocking such change prevents update failures.

cli/src/pcluster/aws/ec2.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,3 +683,27 @@ def get_instance_type_and_reservation_type_from_capacity_reservation(
683683
reservation_type = reservation.reservation_type()
684684

685685
return instance_type, reservation_type
686+
687+
@AWSExceptionHandler.handle_client_exception
def describe_launch_template_version(self, launch_template_id: str, version: str):
    """
    Fetch the data of a single version of an EC2 launch template.

    Args:
        launch_template_id: The launch template ID (e.g., 'lt-0abc123def456')
        version: The version to fetch; it is converted to a string before being sent to EC2

    Returns:
        dict: The "LaunchTemplateData" of the first version returned by EC2,
            or an empty dict when that entry carries no such key

    Raises:
        AWSClientError: when EC2 returns no version for the given ID/version pair
    """
    matching_versions = self._client.describe_launch_template_versions(
        LaunchTemplateId=launch_template_id, Versions=[str(version)]
    ).get("LaunchTemplateVersions", [])

    # Guard clause: nothing came back, so there is no data to hand to the caller.
    if not matching_versions:
        raise AWSClientError(
            function_name="describe_launch_template_versions",
            message=f"Launch template {launch_template_id} version {version} not found",
        )

    return matching_versions[0].get("LaunchTemplateData", {})

cli/src/pcluster/config/cluster_config.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@
188188
InstancesEFAValidator,
189189
InstancesMemorySchedulingWarningValidator,
190190
InstancesNetworkingValidator,
191+
LaunchTemplateValidator,
191192
)
192193
from pcluster.validators.kms_validators import KmsKeyIdEncryptedValidator, KmsKeyValidator
193194
from pcluster.validators.monitoring_validators import DetailedMonitoringValidator, LogRotationValidator
@@ -878,6 +879,15 @@ def __init__(self, enabled: bool = None, gdr_support: bool = None, **kwargs):
878879
self.gdr_support = Resource.init_param(gdr_support, default=False)
879880

880881

882+
class LaunchTemplateOverrides(Resource):
    """
    Represent the Launch Template Overrides configuration.

    Holds the reference (ID and version) to the launch template that a compute resource
    points at through its LaunchTemplateOverrides section.
    """

    def __init__(self, launch_template_id: str = None, version: int = None, **kwargs):
        super().__init__(**kwargs)
        # Both values are wrapped with Resource.init_param, like the other config parameters.
        self.launch_template_id = Resource.init_param(launch_template_id)
        self.version = Resource.init_param(version)
889+
890+
881891
# ---------------------- Health Checks ---------------------- #
882892

883893

@@ -2230,6 +2240,7 @@ def __init__(
22302240
tags: List[Tag] = None,
22312241
static_node_priority: int = None,
22322242
dynamic_node_priority: int = None,
2243+
launch_template_overrides: LaunchTemplateOverrides = None,
22332244
**kwargs,
22342245
):
22352246
super().__init__(**kwargs)
@@ -2250,6 +2261,7 @@ def __init__(
22502261
self.tags = tags
22512262
self.static_node_priority = Resource.init_param(static_node_priority, default=1)
22522263
self.dynamic_node_priority = Resource.init_param(dynamic_node_priority, default=1000)
2264+
self.launch_template_overrides = launch_template_overrides
22532265

22542266
@abstractmethod
22552267
def is_flexible(self) -> bool:
@@ -2357,6 +2369,13 @@ def _register_validators(self, context: ValidatorContext = None):
23572369
ec2memory=min_memory,
23582370
instance_type=smallest_type,
23592371
)
2372+
if self.launch_template_overrides:
2373+
self._register_validator(
2374+
LaunchTemplateValidator,
2375+
compute_resource_name=self.name,
2376+
launch_template_id=self.launch_template_overrides.launch_template_id,
2377+
launch_template_version=self.launch_template_overrides.version,
2378+
)
23602379

23612380
def is_flexible(self):
23622381
"""Return True because the ComputeResource can contain multiple instance types."""
@@ -2449,6 +2468,13 @@ def _register_validators(self, context: ValidatorContext = None):
24492468
ec2memory=self._instance_type_info.ec2memory_size_in_mib(),
24502469
instance_type=self.instance_type,
24512470
)
2471+
if self.launch_template_overrides:
2472+
self._register_validator(
2473+
LaunchTemplateValidator,
2474+
compute_resource_name=self.name,
2475+
launch_template_id=self.launch_template_overrides.launch_template_id,
2476+
launch_template_version=self.launch_template_overrides.version,
2477+
)
24522478

24532479
@property
24542480
def architecture(self) -> str:

cli/src/pcluster/schemas/cluster_schema.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
Image,
6565
Imds,
6666
IntelSoftware,
67+
LaunchTemplateOverrides,
6768
LocalStorage,
6869
LoginNodes,
6970
LoginNodesIam,
@@ -829,6 +830,31 @@ def make_resource(self, data, **kwargs):
829830
return Efa(**data)
830831

831832

833+
class LaunchTemplateOverridesSchema(BaseSchema):
    """
    Represent the schema of LaunchTemplateOverrides for a Compute Resource.

    Both the launch template ID and its version are mandatory, and the version must be a
    positive integer. Changing either value follows the queue update strategy.
    """

    # The "version is mandatory" rule is enforced at field level through required=True.
    # A schema-level (@validates_schema) check for it would be unreachable: with marshmallow's
    # default skip_on_field_errors=True such a hook only runs after every field has validated,
    # i.e. when version is already present and >= 1.
    launch_template_id = fields.Str(
        required=True,
        metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY},
    )
    version = fields.Int(
        validate=validate.Range(min=1),
        required=True,
        metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY},
    )

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate resource."""
        return LaunchTemplateOverrides(**data)
856+
857+
832858
# ---------------------- Monitoring ---------------------- #
833859

834860

@@ -1585,6 +1611,9 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema):
15851611
validate=validate.Range(min=MIN_SLURM_NODE_PRIORITY, max=MAX_SLURM_NODE_PRIORITY),
15861612
metadata={"update_policy": UpdatePolicy.SUPPORTED},
15871613
)
1614+
launch_template_overrides = fields.Nested(
1615+
LaunchTemplateOverridesSchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
1616+
)
15881617

15891618
@validates_schema
15901619
def no_coexist_instance_type_flexibility(self, data, **kwargs):

cli/src/pcluster/templates/queues_stack.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,31 @@
3737
from pcluster.utils import get_attr, get_http_tokens_setting
3838

3939

40+
def _apply_launch_template_overrides(launch_template, compute_resource):
    """
    Overlay the user-referenced launch template onto the generated launch template.

    The data of the referenced launch template version is retrieved from EC2 and every
    top-level property found there is set on the generated template through
    add_property_override, replacing that property as a whole.

    Nothing happens when the compute resource has no launch template overrides or the
    overrides carry no launch template ID.

    Args:
        launch_template: The CDK CfnLaunchTemplate construct
        compute_resource: The compute resource configuration
    """
    overrides = getattr(compute_resource, "launch_template_overrides", None)
    if overrides and overrides.launch_template_id:
        requested_version = str(overrides.version)
        ec2_client = AWSApi.instance().ec2
        override_data = ec2_client.describe_launch_template_version(overrides.launch_template_id, requested_version)

        # One override per top-level property of the user's launch template data.
        for property_name, property_value in override_data.items():
            launch_template.add_property_override(f"LaunchTemplateData.{property_name}", property_value)
63+
64+
4065
class QueuesStack(NestedStack):
4166
"""Stack encapsulating a set of queues and the associated resources."""
4267

@@ -364,6 +389,8 @@ def _add_compute_resource_launch_template(
364389
),
365390
)
366391

392+
_apply_launch_template_overrides(launch_template, compute_resource)
393+
367394
return launch_template
368395

369396

cli/src/pcluster/validators/instances_validators.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from enum import Enum
1212
from typing import Callable, Dict
1313

14+
from pcluster.aws.aws_api import AWSApi
1415
from pcluster.aws.aws_resources import InstanceTypeInfo
1516
from pcluster.config import cluster_config
1617
from pcluster.constants import MIN_MEMORY_ABSOLUTE_DIFFERENCE, MIN_MEMORY_PRECENTAGE_DIFFERENCE
@@ -335,3 +336,42 @@ def _min_max_memory(self, instance_types_info: Dict[str, InstanceTypeInfo]):
335336
# EC2 API should return valid values, but since it's really cheap better add a check
336337
available_memory = [value for value in available_memory if value is not None]
337338
return min(available_memory), max(available_memory)
339+
340+
341+
class LaunchTemplateValidator(Validator):
    """
    Validate the launch template referenced by a compute resource.

    An error is reported when the launch template version cannot be retrieved, or when its
    data contains any top-level property outside ALLOWED_PROPERTIES.
    """

    # Top-level LaunchTemplateData properties the referenced launch template may define
    ALLOWED_PROPERTIES = {"NetworkInterfaces"}

    def _validate(
        self,
        compute_resource_name: str,
        launch_template_id: str,
        launch_template_version: int,
        **kwargs,
    ):
        """
        Check that the launch template version is retrievable and only defines allowed properties.

        Args:
            compute_resource_name: name of the compute resource, used in the failure messages
            launch_template_id: ID of the launch template; validation is skipped when empty
            launch_template_version: version of the launch template to inspect
        """
        if not launch_template_id:
            return

        version = str(launch_template_version)
        try:
            template_data = AWSApi.instance().ec2.describe_launch_template_version(launch_template_id, version)
        except Exception as error:
            # Any retrieval problem is surfaced as a validation error instead of propagating.
            self._add_failure(
                f"Launch template '{launch_template_id}' version '{version}' specified in Compute Resource "
                f"'{compute_resource_name}' could not be found or accessed: {error}",
                FailureLevel.ERROR,
            )
        else:
            unexpected = sorted(name for name in template_data.keys() if name not in self.ALLOWED_PROPERTIES)
            if unexpected:
                self._add_failure(
                    f"Launch template '{launch_template_id}' in Compute Resource '{compute_resource_name}' contains "
                    f"properties that are not allowed: {', '.join(unexpected)}. "
                    f"Only NetworkInterfaces is allowed.",
                    FailureLevel.ERROR,
                )

cli/tests/pcluster/aws/dummy_aws_api.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ def __init__(self):
136136
}
137137
self.security_groups_cache = {}
138138

139+
def describe_launch_template_version(self, launch_template_id, version):
    """Return canned launch template data, whatever template ID and version are requested."""
    primary_interface = {"DeviceIndex": 0, "SubnetId": "subnet-123"}
    return {"NetworkInterfaces": [primary_interface]}
142+
139143
def get_official_image_id(self, os, architecture, filters=None):
140144
return "dummy-ami-id"
141145

@@ -430,5 +434,9 @@ def mock_aws_api(mocker, mock_instance_type_info=True):
430434
"pcluster.aws.ec2.Ec2Client.describe_image",
431435
return_value=ImageInfo({"BlockDeviceMappings": [{"DeviceName": "/dev/sda1", "Ebs": {"VolumeSize": 35}}]}),
432436
)
437+
mocker.patch(
438+
"pcluster.aws.ec2.Ec2Client.describe_launch_template_version",
439+
return_value={"NetworkInterfaces": [{"DeviceIndex": 0, "SubnetId": "subnet-123"}]},
440+
)
433441
if mock_instance_type_info:
434442
mocker.patch("pcluster.aws.ec2.Ec2Client.get_instance_type_info", side_effect=_DummyInstanceTypeInfo)

cli/tests/pcluster/example_configs/slurm.full.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ Scheduling:
188188
Efa:
189189
Enabled: true
190190
GdrSupport: false
191+
LaunchTemplateOverrides:
192+
LaunchTemplateId: lt-01234567890abcdef
193+
Version: 1
191194
Iam:
192195
InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile
193196
Image:

0 commit comments

Comments
 (0)