Skip to content

Commit f6c81f5

Browse files
Merge branch 'main' into health_check_extension
2 parents 64c58c3 + 92115e8 commit f6c81f5

File tree

76 files changed

+2440
-317
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+2440
-317
lines changed
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT
3+
name: "Install/Remove Helm Charts After Scaling"

on:
  # Automatic path: run after the "EKS Cluster Scaling" workflow completes on main.
  workflow_run:
    workflows:
      - "EKS Cluster Scaling"
    types: [completed]
    branches: [main]

  # Manual path: the workflow can still be started by hand.
  workflow_dispatch:
    inputs:
      # ---- Core settings (required) ----
      cluster_name:
        description: "EKS Cluster Name"
        type: string
        required: true
        default: "eks-performance"
      region:
        description: "AWS Region"
        type: string
        required: true
        default: "us-west-2"

      # ---- Optional overrides ----
      cloudwatch_agent_repository:
        description: "CloudWatch Agent Repository"
        type: string
      cloudwatch_agent_tag:
        description: "CloudWatch Agent Tag"
        type: string
      cloudwatch_agent_operator_repository:
        description: "CloudWatch Agent Operator Repository"
        type: string
      cloudwatch_agent_operator_tag:
        description: "CloudWatch Agent Operator Tag"
        type: string
      helm-charts-branch:
        description: "Branch of the helm charts to test"
        type: string
        default: "main"
      operator-branch:
        description: "Branch of the operator to test"
        type: string
        default: "main"
      terraform_assume_role:
        description: "AWS IAM Role to assume"
        type: string

# One run per workflow + ref; a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  # Cluster settings. Each value falls back to a default when no dispatch
  # input is supplied.
  AWS_REGION: ${{ inputs.region || 'us-west-2' }}
  CLUSTER_NAME: ${{ inputs.cluster_name || 'eks-performance' }}
  TERRAFORM_AWS_ASSUME_ROLE: ${{ inputs.terraform_assume_role || vars.TERRAFORM_AWS_ASSUME_ROLE }}
  TERRAFORM_AWS_ASSUME_ROLE_DURATION: 3600  # seconds (1 hour)

  # ECR repositories used as the default image sources.
  AGENT_ECR_TEST_REPO: "cwagent-integration-test"
  OPERATOR_ECR_TEST_REPO: "cwagent-operator-pre-release"

  # GitHub repository that hosts the operator source.
  OPERATOR_GITHUB_REPO_NAME: "aws/amazon-cloudwatch-agent-operator"
70+
71+
jobs:
  # Gate job: runs only for a manual dispatch, or for a workflow_run whose
  # upstream run succeeded and was itself started by a schedule event.
  check-trigger:
    runs-on: ubuntu-latest
    if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'schedule') }}
    outputs:
      should_continue: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'schedule') }}
    steps:
      - name: Check trigger type
        id: check-trigger
        run: |
          if [ "${{ github.event_name }}" == "workflow_run" ]; then
            echo "Triggered by workflow_run from a scheduled event"
          else
            echo "Triggered manually via workflow_dispatch"
          fi

  # Resolve the HEAD commit of the operator branch so the operator image can
  # be built and tagged with that exact SHA.
  GetLatestOperatorCommitSHA:
    needs: check-trigger
    if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
    runs-on: ubuntu-latest
    outputs:
      operator_commit_sha: ${{ steps.get_latest_sha.outputs.operator_sha }}
      operator_repo_name: ${{ env.OPERATOR_GITHUB_REPO_NAME }}
    steps:
      - name: Checkout the target repo
        uses: actions/checkout@v4
        with:
          repository: ${{ env.OPERATOR_GITHUB_REPO_NAME }}
          ref: ${{ inputs.operator-branch || 'main' }}
          path: operator-repo

      - name: Get latest commit SHA
        id: get_latest_sha
        run: |
          latest_sha=$(git -C operator-repo rev-parse HEAD)
          echo "operator_sha=$latest_sha" >> "$GITHUB_OUTPUT"

  # Build and upload the agent image to the ECR repo.
  BuildAgent:
    needs: check-trigger
    if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
    uses: ./.github/workflows/build-test-artifacts.yml
    concurrency:
      group: "Build-Test-Artifacts-${{ github.ref_name }}"
      cancel-in-progress: true
    secrets: inherit
    permissions:
      id-token: write
      contents: read
    with:
      test-image-before-upload: false

  # Build and upload the operator image to the ECR repo.
  BuildOperator:
    needs: [check-trigger, GetLatestOperatorCommitSHA]
    if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
    uses: aws/amazon-cloudwatch-agent-operator/.github/workflows/build-and-upload.yml@main
    concurrency:
      group: ${{ github.workflow }}-operator-${{ inputs.operator-branch || 'main' }}
      cancel-in-progress: true
    secrets: inherit
    with:
      tag: ${{ needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }}
      target-sha: ${{ needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }}
      repository: ${{ needs.GetLatestOperatorCommitSHA.outputs.operator_repo_name }}
      test-image-before-upload: false

  # Install/upgrade the chart when the cluster has nodes; uninstall it when the
  # cluster has been scaled to zero.
  install-helm:
    needs: [check-trigger, BuildAgent, BuildOperator, GetLatestOperatorCommitSHA]
    if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
          aws-region: ${{ env.AWS_REGION }}
          role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}

      - name: Login ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Install kubectl
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'

      - name: Install Helm
        uses: azure/setup-helm@v3
        with:
          version: 'latest'

      - name: Update kubeconfig
        run: |
          aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION"

      - name: Clone Helm Charts Repository
        env:
          # Passed through the environment instead of being spliced into the
          # script text, so the branch name cannot be interpreted as shell.
          HELM_CHARTS_BRANCH: ${{ inputs.helm-charts-branch || 'main' }}
        run: |
          rm -rf ./helm-charts
          git clone -b "$HELM_CHARTS_BRANCH" https://github.com/aws-observability/helm-charts.git ./helm-charts

      - name: Check node count and manage Helm chart
        env:
          # Dispatch inputs win; otherwise use the images built by this run.
          AGENT_REPOSITORY: ${{ inputs.cloudwatch_agent_repository || env.AGENT_ECR_TEST_REPO }}
          AGENT_TAG: ${{ inputs.cloudwatch_agent_tag || github.sha }}
          MANAGER_REPOSITORY: ${{ inputs.cloudwatch_agent_operator_repository || env.OPERATOR_ECR_TEST_REPO }}
          MANAGER_TAG: ${{ inputs.cloudwatch_agent_operator_tag || needs.GetLatestOperatorCommitSHA.outputs.operator_commit_sha }}
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          # pipefail is required: without it a failing `kubectl` is masked by
          # `wc -l`, NODE_COUNT becomes 0 and the chart would be uninstalled
          # because of an API/auth error rather than an empty cluster.
          set -euo pipefail

          NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)

          if [ "$NODE_COUNT" -eq 0 ]; then
            echo "Node count is 0, removing Helm chart"
            helm uninstall amazon-cloudwatch-observability -n amazon-cloudwatch || echo "Chart not found or already removed"
          else
            echo "Node count is $NODE_COUNT, installing/updating Helm chart"

            # Echo all variables being passed to helm
            echo "CLUSTER_NAME: $CLUSTER_NAME"
            echo "REGION: $AWS_REGION"
            echo "AGENT_REPOSITORY: $AGENT_REPOSITORY"
            echo "AGENT_TAG: $AGENT_TAG"
            echo "AGENT_REPOSITORY_DOMAIN: $ECR_REGISTRY"
            echo "MANAGER_REPOSITORY: $MANAGER_REPOSITORY"
            echo "MANAGER_TAG: $MANAGER_TAG"
            echo "MANAGER_REPOSITORY_DOMAIN: $ECR_REGISTRY"

            helm upgrade --install amazon-cloudwatch-observability \
              ./helm-charts/charts/amazon-cloudwatch-observability \
              --namespace amazon-cloudwatch \
              --create-namespace \
              --set "clusterName=$CLUSTER_NAME" \
              --set "region=$AWS_REGION" \
              --set "agent.image.repository=$AGENT_REPOSITORY" \
              --set "agent.image.tag=$AGENT_TAG" \
              --set "agent.image.repositoryDomainMap.public=$ECR_REGISTRY" \
              --set "manager.image.repository=$MANAGER_REPOSITORY" \
              --set "manager.image.tag=$MANAGER_TAG" \
              --set "manager.image.repositoryDomainMap.public=$ECR_REGISTRY"
          fi

  # Best-effort cleanup: removes the chart when the run failed or was cancelled.
  cleanup-on-failure:
    if: ${{ failure() || cancelled() }}
    runs-on: ubuntu-latest
    needs: [install-helm]
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
          aws-region: ${{ env.AWS_REGION }}
          role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}

      - name: Install kubectl
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'

      - name: Install Helm
        uses: azure/setup-helm@v3
        with:
          version: 'latest'

      - name: Update kubeconfig
        run: |
          aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION"

      - name: Uninstall Helm chart
        run: |
          echo "Test was cancelled or failed. Cleaning up resources..."
          # `|| echo` keeps the cleanup green when the release does not exist.
          helm uninstall amazon-cloudwatch-observability -n amazon-cloudwatch || echo "Chart not found or already removed"
          echo "Cleanup completed"
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT
3+
name: EKS Cluster Scaling

on:
  schedule:
    # GitHub evaluates cron schedules in UTC. The scaling steps compare
    # github.event.schedule against these exact strings, so keep both in sync.
    - cron: '0 9 * * 0'   # Scale up: runs every Sunday at 09:00 UTC
    - cron: '0 21 * * 1'  # Scale down: runs every Monday at 21:00 UTC
  workflow_dispatch:
    inputs:
      region:
        description: 'AWS Region'
        required: true
        type: string
        default: 'us-west-2'
      cluster_name:
        description: 'EKS Cluster Name'
        required: true
        type: string
        default: 'eks-performance'
      desired_capacity_per_nodegroup:
        description: 'Desired capacity for each node group'
        required: true
        type: number
        default: 500
      node_group_count:
        description: 'Count of node groups'
        type: number
        default: 10

env:
  # Each value falls back to a default when no dispatch input is supplied.
  AWS_REGION: ${{ inputs.region || 'us-west-2' }}
  CLUSTER_NAME: ${{ inputs.cluster_name || 'eks-performance' }}
  NODE_GROUP_COUNT: ${{ inputs.node_group_count || 10 }}
  # A manual input of 0 (scale to zero) is falsy in GitHub expressions, so a
  # plain `inputs.x || 500` would silently turn it into 500. Converting the
  # number to a string first keeps an explicit "0", while non-dispatch runs
  # still receive the default of 500.
  DESIRED_CAPACITY_PER_NODEGROUP: ${{ github.event_name == 'workflow_dispatch' && format('{0}', inputs.desired_capacity_per_nodegroup) || '500' }}
  TERRAFORM_AWS_ASSUME_ROLE: ${{ vars.TERRAFORM_AWS_ASSUME_ROLE }}
  TERRAFORM_AWS_ASSUME_ROLE_DURATION: 3600  # seconds (1 hour)
  CWA_GITHUB_TEST_REPO_NAME: "aws/amazon-cloudwatch-agent-test"
  CWA_GITHUB_TEST_REPO_URL: "https://github.com/aws/amazon-cloudwatch-agent-test.git"
  CWA_GITHUB_TEST_REPO_BRANCH: "main"
41+
42+
jobs:
  # Scales every node group named "<cluster>-node-<i>" (i = 1..NODE_GROUP_COUNT)
  # and then checks that the cluster's node count matches the expected total.
  scale-eks-cluster:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: read
    steps:
      # v4 matches the checkout version used by the companion Helm workflow.
      - uses: actions/checkout@v4
        with:
          repository: ${{ env.CWA_GITHUB_TEST_REPO_NAME }}
          ref: ${{ env.CWA_GITHUB_TEST_REPO_BRANCH }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
          # Single source of truth: same value the shell steps read as $AWS_REGION.
          aws-region: ${{ env.AWS_REGION }}
          role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}

      - name: Install kubectl
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'

      - name: Update kubeconfig for EKS cluster
        run: |
          aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION"

      - name: Scale up node groups (Sunday)
        if: github.event.schedule == '0 9 * * 0'
        run: |
          echo "Starting scale UP operation with desired capacity: $DESIRED_CAPACITY_PER_NODEGROUP"

          for i in $(seq 1 "$NODE_GROUP_COUNT"); do
            echo "Scaling node group: $CLUSTER_NAME-node-${i} to $DESIRED_CAPACITY_PER_NODEGROUP"
            aws eks update-nodegroup-config \
              --cluster-name "$CLUSTER_NAME" \
              --nodegroup-name "$CLUSTER_NAME-node-${i}" \
              --region "$AWS_REGION" \
              --scaling-config "desiredSize=$DESIRED_CAPACITY_PER_NODEGROUP"

            echo "Waiting 1 minute before scaling next node group..."
            sleep 60
          done

      - name: Scale down node groups (Monday)
        if: github.event.schedule == '0 21 * * 1'
        run: |
          echo "Starting scale DOWN operation with desired capacity: 0"

          for i in $(seq 1 "$NODE_GROUP_COUNT"); do
            echo "Scaling node group: $CLUSTER_NAME-node-${i} to 0"
            aws eks update-nodegroup-config \
              --cluster-name "$CLUSTER_NAME" \
              --nodegroup-name "$CLUSTER_NAME-node-${i}" \
              --region "$AWS_REGION" \
              --scaling-config desiredSize=0

            echo "Waiting 1 minute before scaling next node group..."
            sleep 60
          done

      - name: Scale node groups (Manual)
        if: github.event_name == 'workflow_dispatch'
        run: |
          echo "Starting manual scaling operation with desired capacity: $DESIRED_CAPACITY_PER_NODEGROUP"

          for i in $(seq 1 "$NODE_GROUP_COUNT"); do
            echo "Scaling node group: $CLUSTER_NAME-node-${i} to $DESIRED_CAPACITY_PER_NODEGROUP"
            aws eks update-nodegroup-config \
              --cluster-name "$CLUSTER_NAME" \
              --nodegroup-name "$CLUSTER_NAME-node-${i}" \
              --region "$AWS_REGION" \
              --scaling-config "desiredSize=$DESIRED_CAPACITY_PER_NODEGROUP"
          done

      - name: Validate total node count
        env:
          # Empty for non-schedule events.
          TRIGGER_SCHEDULE: ${{ github.event.schedule }}
        run: |
          # pipefail is required: without it a failing `kubectl` is masked by
          # `wc -l`, the count reads as 0 and a scale-down run would be
          # reported as validated even though the cluster was never queried.
          set -euo pipefail

          echo "Waiting 20 minutes for scaling operations to complete..."
          sleep 1200

          echo "Validating total number of nodes in the cluster..."
          ACTUAL_NODE_COUNT=$(kubectl get nodes --no-headers | wc -l)

          # Determine expected count based on trigger type
          if [ "$TRIGGER_SCHEDULE" = "0 21 * * 1" ]; then
            # Scheduled scale-down targets zero nodes in every group.
            EXPECTED_NODE_COUNT=0
          else
            EXPECTED_NODE_COUNT=$((NODE_GROUP_COUNT * DESIRED_CAPACITY_PER_NODEGROUP))
          fi

          echo "Expected total nodes: $EXPECTED_NODE_COUNT"
          echo "Actual total nodes: $ACTUAL_NODE_COUNT"

          if [ "$ACTUAL_NODE_COUNT" -eq "$EXPECTED_NODE_COUNT" ]; then
            echo "Validation successful! Node count matches expected value."
          else
            echo "Validation failed. Expected $EXPECTED_NODE_COUNT nodes but found $ACTUAL_NODE_COUNT nodes."
            exit 1
          fi

.github/workflows/test-artifacts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1361,4 +1361,4 @@ jobs:
13611361
else
13621362
cd terraform/eks/addon/gpu
13631363
fi
1364-
terraform destroy -auto-approve
1364+
terraform destroy -auto-approve

0 commit comments

Comments
 (0)