-
Notifications
You must be signed in to change notification settings - Fork 56
Improve the reliability of the system bring-up scripts #1168
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: release-4.19
Are you sure you want to change the base?
Changes from all commits
3a45aef
9b3c177
c1b4155
c634d53
22320bf
060329b
6255e67
b3d28fd
e658a37
efab23c
a35018c
f66ea10
7864591
d086d67
4cfee5d
0298bbf
fcae494
df91481
fd191fe
975315d
af25437
5500127
4408e0a
69ae19a
eddf3e8
a271d9c
7116ea0
28535ef
5636dbf
2855386
fe06a1f
5e27e7b
9c159a4
9b2e043
9f8b565
31b5633
b1ee39f
9c56363
3191110
a4d8e22
664f723
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
anjannath marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
#!/bin/bash | ||
|
||
set -o pipefail | ||
set -o errexit | ||
set -o nounset | ||
set -o errtrace | ||
set -x | ||
|
||
# set -x is safe, the secrets are passed via stdin | ||
|
||
AWS_CLI_IMG=docker.io/amazon/aws-cli | ||
MIN_CHAR_COUNT=8 # minimum number of chars for the secret to be | ||
# assumed valid | ||
|
||
umask 0077 # 0600 file permission for secrets | ||
install -d -m 0700 /opt/crc # ensure that the target directory exists | ||
|
||
PULL_SECRETS_KEY=${1:-} | ||
KUBEADM_PASS_KEY=${2:-} | ||
DEVELOPER_PASS_KEY=${3:-} | ||
|
||
if [[ -z "$PULL_SECRETS_KEY" || -z "$KUBEADM_PASS_KEY" || -z "$DEVELOPER_PASS_KEY" ]]; then | ||
echo "ERROR: expected to receive 3 parameters: PULL_SECRETS_KEY KUBEADM_PASS_KEY DEVELOPER_PASS_KEY" | ||
exit 1 | ||
fi | ||
|
||
DELAY=5 | ||
TOTAL_PERIOD=$(( 3*60 )) | ||
ATTEMPTS=$(( TOTAL_PERIOD / DELAY)) | ||
#######################################
# Run a command repeatedly until it succeeds or the attempt budget is spent.
# Globals:   ATTEMPTS (read) - maximum number of tries
#            DELAY    (read) - seconds to wait between two tries
# Arguments: the command to run, followed by its own arguments
# Outputs:   progress messages on stdout
# Returns:   0 as soon as the command succeeds, 1 if it never succeeded
#######################################
function retry_compact() {
    local attempt

    for (( attempt = 1; attempt <= ATTEMPTS; attempt++ )); do
        # The command is evaluated as an `if` condition, so a failure does
        # not trip errexit; it only triggers another iteration.
        if "$@"; then
            echo "'$*' succeeded after $attempt attempts "
            return 0
        fi
        echo "'$*' still failing after $attempt/$ATTEMPTS attempts ..."

        # Waiting after the very last attempt would only delay the failure.
        if (( attempt < ATTEMPTS )); then
            sleep "$DELAY"
        fi
    done

    # The budget is exhausted: the command never succeeded.
    echo "'$*' didn't succeed after $ATTEMPTS attempt ..."
    return 1
}
|
||
# Remove the intermediate files this script may have left behind: the cached
# AWS region and the partially written secret files.
cleanup() {
    local -a leftovers=(
        /tmp/aws-region
        /opt/crc/pull-secret.tmp
        /opt/crc/pass_kubeadmin.tmp
        /opt/crc/pass_developer.tmp
    )

    # -f: files that do not exist are not an error
    rm -f "${leftovers[@]}"
    echo "Temp files cleanup complete."
}
|
||
# Cleanup happens automatically via trap on error or at script end | ||
trap cleanup ERR EXIT | ||
|
||
SECONDS=0 | ||
podman pull --quiet "$AWS_CLI_IMG" | ||
echo "Took $SECONDS seconds to pull the $AWS_CLI_IMG" | ||
|
||
#######################################
# Probe the EC2 instance metadata service (IMDS) and cache the AWS region.
# Requests a session token first, then uses it to read the placement region.
# Outputs:   progress on stdout, errors on stderr;
#            writes the region (one line) to /tmp/aws-region on success
# Returns:   0 when the region was fetched and stored, 1 otherwise
#######################################
check_imds_available_and_get_region() {
    local -a imds_token_command=(
        curl
        --connect-timeout 1
        -X PUT
        "http://169.254.169.254/latest/api/token"
        -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"
        -Ssf
    )
    local token
    local region

    if ! token=$("${imds_token_command[@]}"); then
        echo "Couldn't fetch the token..." >&2
        return 1
    fi

    # Then, use the token to get the region.
    # This function is meant to be run as an `if` condition (through
    # retry_compact), where errexit is inactive: every failure has to be
    # checked explicitly, otherwise an empty region would be reported as
    # a success.
    echo "Fetching the AWS region ..."
    if ! region=$(curl -Ssf --connect-timeout 1 \
        -H "X-aws-ec2-metadata-token: $token" \
        http://169.254.169.254/latest/meta-data/placement/region)
    then
        echo "Couldn't fetch the AWS region..." >&2
        return 1
    fi

    if [[ -z "$region" ]]; then
        echo "The IMDS service returned an empty AWS region..." >&2
        return 1
    fi

    # Only write the file once a valid value is known; printf adds the EOL.
    printf '%s\n' "$region" > /tmp/aws-region
    echo "AWS region: $region"
}
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
( | ||
set +x # disable the xtrace as the token would be leaked | ||
echo "Waiting for the AWS IMDS service to be available ..." | ||
SECONDS=0 | ||
retry_compact check_imds_available_and_get_region | ||
echo "Took $SECONDS for the IMDS service to become available." | ||
) | ||
|
||
#######################################
# Fetch one secret from the AWS SSM parameter store and store it in a file.
# Globals:   AWS_CLI_IMG    (read) - container image providing the AWS CLI
#            MIN_CHAR_COUNT (read) - minimum length of a valid secret
# Arguments: $1 - short name of the secret (used in logs/container name)
#            $2 - SSM parameter key to fetch
#            $3 - destination file
# Outputs:   errors on stderr; the secret itself is only ever written to
#            "$3.tmp" and then "$3", never to the terminal
# Returns:   0 when the secret was stored, 1 otherwise
#######################################
save_secret() {
    local name=$1
    local key=$2
    local dest=$3
    local char_count

    # --log-driver=none avoids that the journal captures the stdout
    # logs of podman and leaks the passwords in the journal ...
    if ! podman run \
        --name "cloud-init-fetch-$name" \
        --env AWS_REGION="$(< /tmp/aws-region)" \
        --log-driver=none \
        --rm \
        "$AWS_CLI_IMG" \
        ssm get-parameter \
        --name "$key" \
        --with-decryption \
        --query "Parameter.Value" \
        --output text \
        > "${dest}.tmp"
    then
        rm -f -- "${dest}.tmp"
        echo "ERROR: failed to get the '$name' secret ... (fetched from $key)" >&2
        return 1
    fi

    # Count the characters of the secret itself: the end-of-line characters
    # are dropped first so that they do not inflate the count. The content
    # goes through a pipe, never through a shell variable or a command-line
    # argument, so xtrace cannot leak it.
    char_count=$(tr -d '\r\n' < "${dest}.tmp" | wc -c)
    if (( char_count < MIN_CHAR_COUNT )); then
        echo "ERROR: the content of the '$name' secret is too short ... (fetched from $key)" >&2
        rm -f -- "${dest}.tmp"
        return 1
    fi

    mv -- "${dest}.tmp" "${dest}" # atomic creation of the file

    return 0
}
|
||
# execution will abort if 'retry_compact' fails. | ||
retry_compact save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret | ||
retry_compact save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin | ||
retry_compact save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer | ||
|
||
exit 0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
[Unit] | ||
Description=Ensure that tap0 network configuration is disabled when not necessary | ||
Before=NetworkManager.service | ||
[email protected] | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
After=local-fs.target | ||
RequiresMountsFor=/etc/NetworkManager/system-connections | ||
|
||
[Service] | ||
Type=oneshot | ||
ExecStart=/usr/local/bin/crc-conditionally-disable-tap.sh | ||
|
||
[Install] | ||
WantedBy=NetworkManager.service | ||
[email protected] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,8 @@ set -o nounset | |
set -o errtrace | ||
set -x | ||
|
||
export KUBECONFIG=/opt/kubeconfig | ||
MAXIMUM_LOGIN_RETRY=10 | ||
RETRY_DELAY=5 | ||
|
||
if [ ! -f /opt/crc/pass_kubeadmin ]; then | ||
echo "kubeadmin password file not found" | ||
|
@@ -15,25 +16,49 @@ fi | |
|
||
rm -rf /tmp/.crc-cluster-ready | ||
|
||
SECONDS=0 | ||
if ! oc adm wait-for-stable-cluster --minimum-stable-period=1m --timeout=10m; then | ||
exit 1 | ||
fi | ||
|
||
echo "Cluster took $SECONDS seconds to stabilize." | ||
|
||
echo "Logging into OpenShift with kubeadmin user to update $KUBECONFIG" | ||
COUNTER=1 | ||
MAXIMUM_LOGIN_RETRY=10 | ||
echo "Logging into OpenShift with kubeadmin user to update the KUBECONFIG" | ||
|
||
try_login() { | ||
( # use a `(set +x)` subshell to avoid leaking the password | ||
set +x | ||
set +e # don't abort on error in this subshell | ||
oc login --insecure-skip-tls-verify=true \ | ||
-u kubeadmin \ | ||
-p "$(cat /opt/crc/pass_kubeadmin)" \ | ||
https://api.crc.testing:6443 > /dev/null 2>&1 | ||
kpouget marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
local success="$?" | ||
if [[ "$success" == 0 ]]; then | ||
echo "Login succeeded" | ||
else | ||
echo "Login did not complete ..." | ||
fi | ||
|
||
# use a `(set +x)` subshell to avoid leaking the password | ||
until (set +x ; oc login --insecure-skip-tls-verify=true -u kubeadmin -p "$(cat /opt/crc/pass_kubeadmin)" https://api.crc.testing:6443 > /dev/null 2>&1); do | ||
if [ "$COUNTER" -ge "$MAXIMUM_LOGIN_RETRY" ]; then | ||
echo "Unable to login to the cluster..., authentication failed." | ||
return "$success" | ||
} | ||
Comment on lines
+28
to
+45
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pass Without the Apply this diff: - oc login --insecure-skip-tls-verify=true \
+ oc login --kubeconfig "$KUBECONFIG" --insecure-skip-tls-verify=true \
-u kubeadmin \
-p "$(cat /opt/crc/pass_kubeadmin)" \
https://api.crc.testing:6443 > /dev/null 2>&1 Additionally, add a guard at the script start to ensure `KUBECONFIG` is set: # After line 10, before the password file check
: "${KUBECONFIG:?KUBECONFIG must be set}" 🤖 Prompt for AI Agents
|
||
|
||
for ((counter=1; counter<=MAXIMUM_LOGIN_RETRY; counter++)); do | ||
echo "Login attempt $counter/$MAXIMUM_LOGIN_RETRY…" | ||
if try_login; then | ||
break | ||
fi | ||
if (( counter == MAXIMUM_LOGIN_RETRY )); then | ||
echo "Unable to login to the cluster after $counter attempts; authentication failed." | ||
exit 1 | ||
fi | ||
echo "Logging into OpenShift with updated credentials try $COUNTER, hang on...." | ||
sleep 5 | ||
((COUNTER++)) | ||
sleep "$RETRY_DELAY" | ||
done | ||
|
||
# need to set a marker to let `crc` know the cluster is ready | ||
touch /tmp/.crc-cluster-ready | ||
|
||
echo "All done after $SECONDS seconds " | ||
|
||
exit 0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/bash | ||
|
||
set -o pipefail | ||
set -o errexit | ||
set -o nounset | ||
set -o errtrace | ||
set -x | ||
|
||
# Ask crc-needs-tap.sh whether CRC needs the TAP interface.
# Its exit status is interpreted as follows:
#    0 -> the TAP device is needed
#   77 -> the TAP device is not needed
#   anything else -> the check itself failed
# `|| needs_tap_rc=$?` captures the status without tripping errexit.
needs_tap_rc=0
/usr/local/bin/crc-needs-tap.sh || needs_tap_rc=$?

case "$needs_tap_rc" in
    0)
        # Nothing to do here if CRC needs the TAP interface
        echo "TAP device is required, doing nothing."
        exit 0
        ;;
    77)
        echo "TAP device not required, running disable script..."
        ;;
    *)
        # Do not tear down the network configuration on the basis of a
        # failed check: report the error and leave tap0 untouched.
        echo "ERROR: crc-needs-tap.sh failed with exit code $needs_tap_rc, leaving tap0 untouched." >&2
        exit "$needs_tap_rc"
        ;;
esac

exec /usr/local/bin/crc-disable-tap.sh
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
[Unit] | ||
Description=crc custom target | ||
Requires=kubelet-dependencies.target | ||
After=kubelet-dependencies.target | ||
Description=CRC custom target | ||
Requires=crc-wait-apiserver-up.service | ||
Requires=crc-cluster-status.service | ||
After=crc-wait-apiserver-up.service crc-cluster-status.service |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash | ||
|
||
set -o pipefail | ||
set -o errexit | ||
set -o nounset | ||
set -o errtrace | ||
set -x | ||
|
||
echo "Disabling the tap0 network configuration ..." | ||
|
||
rm -f /etc/NetworkManager/system-connections/tap0.nmconnection | ||
systemctl disable --now [email protected] || true | ||
|
||
exit 0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/bin/bash | ||
|
||
set -o pipefail | ||
set -o errexit | ||
set -o nounset | ||
set -o errtrace | ||
set -x | ||
kpouget marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" | ||
|
||
EXIT_NEED_TAP=0 | ||
EXIT_DONT_NEED_TAP=77 | ||
EXIT_ERROR=1 | ||
|
||
if /usr/local/bin/crc-self-sufficient-env.sh; then | ||
echo "Running a self-sufficient bundle. Don't need tap0" | ||
exit "$EXIT_DONT_NEED_TAP" | ||
fi | ||
|
||
if /usr/local/bin/crc-user-mode-networking.sh system; then | ||
echo "Running with CRC and system-mode networking. Don't need tap0. (Fairly rare case.)" | ||
exit "$EXIT_DONT_NEED_TAP" | ||
fi | ||
|
||
# running with CRC (not a self-sufficient bundle) | ||
# running with user-mode networking | ||
# --> vfkit doesn't need tap0 | ||
# --> other platforms do need it | ||
|
||
# Identify the virtualization technology. `|| true` keeps a non-zero exit
# status of systemd-detect-virt from aborting the script under errexit.
virt="$(systemd-detect-virt || true)"

# vfkit guests do not need tap0.
if [[ "${virt}" == "apple" ]]; then
    echo "Running with vfkit ($virt) virtualization. Don't need tap0."
    exit "$EXIT_DONT_NEED_TAP"
fi

# Neither does a bare-metal host.
if [[ "${virt}" == "none" ]]; then
    echo "Bare metal detected. Don't need tap0."
    exit "$EXIT_DONT_NEED_TAP"
fi

# Every other value (including an empty one) means that tap0 is needed.
echo "Running with '$virt' virtualization. Need tap0."
exit "$EXIT_NEED_TAP"
This file was deleted.
Uh oh!
There was an error while loading. Please reload this page.