diff --git a/createdisk-library.sh b/createdisk-library.sh index 05b08405b..50ff1d3f9 100755 --- a/createdisk-library.sh +++ b/createdisk-library.sh @@ -410,7 +410,9 @@ function copy_systemd_units() { ${SSH} core@${VM_IP} -- 'mkdir -p /home/core/systemd-units && mkdir -p /home/core/systemd-scripts' ${SCP} systemd/crc-*.service core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.target core@${VM_IP}:/home/core/systemd-units/ + ${SCP} -r systemd/*.d core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.sh core@${VM_IP}:/home/core/systemd-scripts/ + ${SCP} systemd/crc-*.py core@${VM_IP}:/home/core/systemd-scripts/ case "${BUNDLE_TYPE}" in "snc"|"okd") @@ -419,7 +421,7 @@ function copy_systemd_units() { ;; esac - ${SSH} core@${VM_IP} -- 'sudo cp /home/core/systemd-units/* /etc/systemd/system/ && sudo cp /home/core/systemd-scripts/* /usr/local/bin/' + ${SSH} core@${VM_IP} -- 'sudo cp -r /home/core/systemd-units/* /etc/systemd/system/ && sudo cp /home/core/systemd-scripts/* /usr/local/bin/' ${SSH} core@${VM_IP} -- 'ls /home/core/systemd-scripts/ | xargs -t -I % sudo chmod +x /usr/local/bin/%' ${SSH} core@${VM_IP} -- 'sudo restorecon -rv /usr/local/bin' diff --git a/createdisk.sh b/createdisk.sh index 84cfd0fa9..9ce573438 100755 --- a/createdisk.sh +++ b/createdisk.sh @@ -52,7 +52,7 @@ wait_for_ssh ${VM_NAME} ${VM_IP} if [ ${BUNDLE_TYPE} != "microshift" ]; then # Disable kubelet service ${SSH} core@${VM_IP} -- sudo systemctl disable kubelet - + # Stop the kubelet service so it will not reprovision the pods ${SSH} core@${VM_IP} -- sudo systemctl stop kubelet fi @@ -109,11 +109,15 @@ ${SSH} core@${VM_IP} 'sudo bash -x -s' <&2 + return 1 + fi + + # Then, use the token to get the region + echo "Fetching the AWS region ..." + curl -Ssf -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region > /tmp/aws-region + echo >> /tmp/aws-region # add EOL at EOF, for consistency + echo "AWS region: $(< /tmp/aws-region)" +} + +( + set +x # disable the xtrace as the token would be leaked + echo "Waiting for the AWS IMDS service to be available ..." + SECONDS=0 + retry_compact check_imds_available_and_get_region + echo "Took $SECONDS for the IMDS service to become available." +) + +save_secret() { + name=$1 + key=$2 + dest=$3 + + # --log-driver=none avoids that the journal captures the stdout + # logs of podman and leaks the passwords in the journal ... + if ! podman run \ + --name "cloud-init-fetch-$name" \ + --env AWS_REGION="$(< /tmp/aws-region)" \ + --log-driver=none \ + --rm \ + "$AWS_CLI_IMG" \ + ssm get-parameter \ + --name "$key" \ + --with-decryption \ + --query "Parameter.Value" \ + --output text \ + > "${dest}.tmp" + then + rm -f "${dest}.tmp" + echo "ERROR: failed to get the '$name' secret ... (fetched from $key)" + return 1 + fi + char_count=$(wc -c < "${dest}.tmp") + if (( char_count < MIN_CHAR_COUNT )); then + echo "ERROR: the content of the '$name' secret is too short ... (fetched from $key)" + rm -f "${dest}.tmp" + return 1 + fi + + mv "${dest}.tmp" "${dest}" # atomic creation of the file + + return 0 +} + +# execution will abort if 'retry_compact' fails. 
+retry_compact save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret +retry_compact save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin +retry_compact save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer + +exit 0 diff --git a/systemd/crc-check-tap.service b/systemd/crc-check-tap.service new file mode 100644 index 000000000..473e77a29 --- /dev/null +++ b/systemd/crc-check-tap.service @@ -0,0 +1,14 @@ +[Unit] +Description=Ensure that tap0 network configuration is disabled when not necessary +Before=NetworkManager.service +Before=gv-user-network@tap0.service +After=local-fs.target +RequiresMountsFor=/etc/NetworkManager/system-connections + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/crc-conditionally-disable-tap.sh + +[Install] +WantedBy=NetworkManager.service +WantedBy=gv-user-network@tap0.service diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 92d73dffe..fd7f70b19 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -4,6 +4,8 @@ After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service After=ocp-clusterid.service +After=ocp-wait-apiservices-available.service +After=crc-wait-node-ready.service StartLimitIntervalSec=450 StartLimitBurst=10 @@ -11,7 +13,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-cluster-status.sh RemainAfterExit=true diff --git a/systemd/crc-cluster-status.sh b/systemd/crc-cluster-status.sh index a62586438..9b25dece3 100644 --- a/systemd/crc-cluster-status.sh +++ b/systemd/crc-cluster-status.sh @@ -6,7 +6,8 @@ set -o nounset set -o errtrace set -x -export KUBECONFIG=/opt/kubeconfig +MAXIMUM_LOGIN_RETRY=10 +RETRY_DELAY=5 if [ ! -f /opt/crc/pass_kubeadmin ]; then echo "kubeadmin password file not found" @@ -15,25 +16,49 @@ fi rm -rf /tmp/.crc-cluster-ready +SECONDS=0 if ! oc adm wait-for-stable-cluster --minimum-stable-period=1m --timeout=10m; then exit 1 fi +echo "Cluster took $SECONDS seconds to stabilize." -echo "Logging into OpenShift with kubeadmin user to update $KUBECONFIG" -COUNTER=1 -MAXIMUM_LOGIN_RETRY=10 +echo "Logging into OpenShift with kubeadmin user to update the KUBECONFIG" + +try_login() { + ( # use a `(set +x)` subshell to avoid leaking the password + set +x + set +e # don't abort on error in this subshell + oc login --insecure-skip-tls-verify=true \ + -u kubeadmin \ + -p "$(cat /opt/crc/pass_kubeadmin)" \ + https://api.crc.testing:6443 > /dev/null 2>&1 + ) + local success="$?" + if [[ "$success" == 0 ]]; then + echo "Login succeeded" + else + echo "Login did not complete ..." + fi -# use a `(set +x)` subshell to avoid leaking the password -until (set +x ; oc login --insecure-skip-tls-verify=true -u kubeadmin -p "$(cat /opt/crc/pass_kubeadmin)" https://api.crc.testing:6443 > /dev/null 2>&1); do - if [ "$COUNTER" -ge "$MAXIMUM_LOGIN_RETRY" ]; then - echo "Unable to login to the cluster..., authentication failed." + return "$success" +} + +for ((counter=1; counter<=MAXIMUM_LOGIN_RETRY; counter++)); do + echo "Login attempt $counter/$MAXIMUM_LOGIN_RETRY…" + if try_login; then + break + fi + if (( counter == MAXIMUM_LOGIN_RETRY )); then + echo "Unable to login to the cluster after $counter attempts; authentication failed." 
exit 1 fi - echo "Logging into OpenShift with updated credentials try $COUNTER, hang on...." - sleep 5 - ((COUNTER++)) + sleep "$RETRY_DELAY" done # need to set a marker to let `crc` know the cluster is ready touch /tmp/.crc-cluster-ready + +echo "All done after $SECONDS seconds " + +exit 0 diff --git a/systemd/crc-conditionally-disable-tap.sh b/systemd/crc-conditionally-disable-tap.sh new file mode 100644 index 000000000..f54689a91 --- /dev/null +++ b/systemd/crc-conditionally-disable-tap.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace +set -x + +# Nothing to do here if CRC needs the TAP interface +if /usr/local/bin/crc-needs-tap.sh; then + echo "TAP device is required, doing nothing." + exit 0 +fi + +echo "TAP device not required, running disable script..." + +exec /usr/local/bin/crc-disable-tap.sh diff --git a/systemd/crc-custom.target b/systemd/crc-custom.target index 206d482fa..8fa380f48 100644 --- a/systemd/crc-custom.target +++ b/systemd/crc-custom.target @@ -1,4 +1,5 @@ [Unit] -Description=crc custom target -Requires=kubelet-dependencies.target -After=kubelet-dependencies.target +Description=CRC custom target +Requires=crc-wait-apiserver-up.service +Requires=crc-cluster-status.service +After=crc-wait-apiserver-up.service crc-cluster-status.service diff --git a/systemd/crc-disable-tap.sh b/systemd/crc-disable-tap.sh new file mode 100644 index 000000000..a0492a1cf --- /dev/null +++ b/systemd/crc-disable-tap.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace +set -x + +echo "Disabling the tap0 network configuration ..." + +rm -f /etc/NetworkManager/system-connections/tap0.nmconnection +systemctl disable --now gv-user-network@tap0.service || true + +exit 0 diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index 42d45a93d..a53f2b320 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -8,7 +8,6 @@ StartLimitIntervalSec=30 [Service] Type=oneshot Restart=on-failure -EnvironmentFile=-/etc/sysconfig/crc-env ExecStartPre=/bin/systemctl start ovs-configuration.service ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-dnsmasq.sh diff --git a/systemd/crc-needs-tap.sh b/systemd/crc-needs-tap.sh new file mode 100644 index 000000000..ebe1b2c11 --- /dev/null +++ b/systemd/crc-needs-tap.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace +set -x + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +EXIT_NEED_TAP=0 +EXIT_DONT_NEED_TAP=77 +EXIT_ERROR=1 + +virt="$(systemd-detect-virt || true)" + +case "${virt}" in + apple) + echo "Running with vfkit ($virt) virtualization. Don't need tap0." + exit "$EXIT_DONT_NEED_TAP" + ;; + none) + echo "Bare metal detected. Don't need tap0." + exit "$EXIT_DONT_NEED_TAP" + ;; + *) + echo "Running with '$virt' virtualization. Need tap0." 
+ exit "$EXIT_NEED_TAP" + ;; +esac diff --git a/systemd/crc-no-tap.service b/systemd/crc-no-tap.service deleted file mode 100644 index fe215e2c4..000000000 --- a/systemd/crc-no-tap.service +++ /dev/null @@ -1,13 +0,0 @@ -[Unit] -Description=Ensure that tap0 network configuration is absent on Apple Virtualization -Before=NetworkManager.service -After=local-fs.target -RequiresMountsFor=/etc/NetworkManager/system-connections - -[Service] -Type=oneshot -EnvironmentFile=-/etc/sysconfig/crc-env -ExecStart=/usr/local/bin/crc-no-tap.sh - -[Install] -WantedBy=NetworkManager.service diff --git a/systemd/crc-no-tap.sh b/systemd/crc-no-tap.sh deleted file mode 100644 index 1f0410221..000000000 --- a/systemd/crc-no-tap.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Return true if running under Apple Virtualization or CRC_SELF_SUFFICIENT is set, otherwise false - -if systemd-detect-virt | grep -q '^apple$' || [ -n "$CRC_SELF_SUFFICIENT" ]; then - rm -f /etc/NetworkManager/system-connections/tap0.nmconnection - systemctl disable --now gv-user-network@tap0.service -fi - -exit 0 diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 4c88531b7..4523549f0 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -1,17 +1,19 @@ [Unit] Description=CRC Unit for adding pull secret to cluster After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 -StartLimitBurst=10 +StartLimitBurst=40 ConditionPathExists=!/opt/crc/%n.done [Service] Type=oneshot Restart=on-failure -RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +RestartSec=10 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/crc-pullsecret.sh +ExecStartPre=/usr/bin/test -f /opt/crc/pull-secret +ExecStart=/usr/local/bin/crc-pullsecret.sh /opt/crc/pull-secret ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index 895a5480a..0b636a67b 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -7,26 +7,48 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" -wait_for_resource secret +PULL_SECRETS_FILE="${1:-}" -set +x # disable the logging to avoid leaking the pull secrets +wait_for_resource_or_die secret -# check if existing pull-secret is valid if not add the one from /opt/crc/pull-secret -existingPsB64=$(oc get secret pull-secret -n openshift-config -o jsonpath="{['data']['\.dockerconfigjson']}") -existingPs=$(echo "${existingPsB64}" | base64 -d) +# The pull secret data is piped through stdin and not exposed in command arguments, +# so `set -x` is safe to keep # check if the .auths field is there -if echo "${existingPs}" | jq -e 'has("auths")' >/dev/null 2>&1; then - echo "Cluster already has the pull secrets, nothing to do" +if oc get secret pull-secret \ + -n openshift-config \ + -o jsonpath="{['data']['\.dockerconfigjson']}" \ + | base64 -d \ + | jq -e 'has("auths")' >/dev/null 2>&1 +then + echo "Cluster already has some pull secrets, nothing to do." exit 0 fi -echo "Cluster doesn't have the pull secrets. Setting them from /opt/crc/pull-secret ..." -pullSecretB64=$(base64 -w0 < /opt/crc/pull-secret) +echo "Cluster doesn't have the pull secrets. Setting them from $PULL_SECRETS_FILE ..." + +# enforced by systemd +if [[ ! -r "$PULL_SECRETS_FILE" ]]; +then + echo "ERROR: $PULL_SECRETS_FILE is missing or unreadable" 1>&2 + exit 1 +fi + +if ! 
jq -e 'has("auths")' < "$PULL_SECRETS_FILE" >/dev/null; +then + echo "ERROR: pull-secrets file doesn't have the required '.auths' field" + exit 1 +fi + # Create the JSON patch in memory and pipe it to the oc command -printf '{"data":{".dockerconfigjson": "%s"}}' "${pullSecretB64}" | \ - oc patch secret pull-secret -n openshift-config --type merge --patch-file=/dev/stdin +base64 -w0 < "$PULL_SECRETS_FILE" | \ + jq -R '{"data": {".dockerconfigjson": .}}' | \ + oc patch secret pull-secret \ + -n openshift-config \ + --type merge \ + --patch-file=/dev/stdin + +echo "All done" exit 0 diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index fe56fde2a..e73f71100 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -8,7 +8,8 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig +ExecCondition=/usr/local/bin/crc-user-mode-networking.sh ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 7aa2c3316..ee15b968d 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -1,16 +1,20 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x -if [[ ${CRC_NETWORK_MODE_USER} -eq 0 ]]; then - echo -n "network-mode 'system' detected: skipping routes-controller pod deployment" - exit 0 -fi +ROUTE_CONTROLLER=/opt/crc/routes-controller.yaml source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG=/opt/kubeconfig -wait_for_resource pods +wait_for_resource_or_die pods +wait_for_resource_or_die deployments -oc apply -f /opt/crc/routes-controller.yaml +oc apply -f "$ROUTE_CONTROLLER" +echo "All done." + +exit 0 diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index 32dde4294..4bb4cc1d8 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,8 +1,55 @@ #!/bin/bash -set -euo pipefail -# Optional: load env if unit forgot EnvironmentFile -[ -r /etc/sysconfig/crc-env ] && . /etc/sysconfig/crc-env -if [ "${CRC_SELF_SUFFICIENT:-}" = "1" ] || [ "${CRC_CLOUD:-}" = "1" ]; then - exit 0 + +set -o pipefail +set -o nounset +set -o errtrace + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +if [[ "${CRC_SELF_SUFFICIENT:-}" ]]; then + echo "Found CRC_SELF_SUFFICIENT=$CRC_SELF_SUFFICIENT" + + if [[ ! "${CRC_SELF_SUFFICIENT}" =~ ^[01]$ ]]; then + echo "ERROR: CRC_SELF_SUFFICIENT should be 0 or 1 ..." >&2 + exit 1 + fi + + if [[ "$CRC_SELF_SUFFICIENT" == 1 ]]; then + exit 0 + else + exit 1 + fi fi -exit 1 \ No newline at end of file + +TEST_TIMEOUT=120 +VSOCK_COMM_PORT=1024 + +set +o errexit +# set -o errexit disabled to capture the test return code +timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PORT" +returncode=$? 
+set -o errexit + +case "$returncode" in + 19) # ENODEV + echo "vsock device doesn't exist, not running self-sufficient bundle" >&2 + exit 1 + ;; + 124) + echo "ERROR: vsock/${VSOCK_COMM_PORT} test timed out after $TEST_TIMEOUT seconds :/" >&2 + exit 124 + ;; + 1) + echo "vsock/${VSOCK_COMM_PORT} not working, running with a self-sufficient bundle" >&2 + exit 0 + ;; + 0) + echo "vsock/${VSOCK_COMM_PORT} works, not running with a self-sufficient bundle" >&2 + exit 1 + ;; + *) + echo "ERROR: unexpected return code from the vsock test: $returncode" >&2 + exit "$returncode" +esac + +# cannot be reached diff --git a/systemd/crc-systemd-common.sh b/systemd/crc-systemd-common.sh index bd68169ed..583ad22ad 100644 --- a/systemd/crc-systemd-common.sh +++ b/systemd/crc-systemd-common.sh @@ -1,15 +1,52 @@ # $1 is the resource to check # $2 is an optional maximum retry count; default 20 -function wait_for_resource() { - local retry=0 +function wait_for_resource_or_die() { + local resource=${1:-} local max_retry=${2:-20} local wait_sec=${3:-5} - until oc get "$1" > /dev/null 2>&1 - do - [[ "$retry" -ge "$max_retry" ]] && exit 1 - sleep $wait_sec - ((retry++)) + + local xtrace_was_disabled=0 + # Check if xtrace is currently DISABLED. If so, set a flag. + [[ $- == *x* ]] || xtrace_was_disabled=1 + set +x # disable xtrace to reduce the verbosity of this function + + if [[ -z "$resource" ]]; then + echo "ERROR: expected a K8s resource as first parameter ..." + echo "ERROR: wait_for_resource_or_die RESOURCE [max_retry=20] [wait_sec=5]" + exit 1 # this is wait_for_resource_or_die, so die ... + fi + + local start_time + start_time=$(date +%s) + + # Loop from 1 up to max_retry + for (( retry=1; retry<=max_retry; retry++ )); do + # Try the command. If it succeeds, exit the loop. + if oc get $resource > /dev/null 2>&1; then + local end_time + end_time=$(date +%s) + + local duration=$((end_time - start_time)) + echo "Resource '$resource' found after $retry tries ($duration seconds)." + + if (( ! xtrace_was_disabled )); then + set -x # reenable xtrace + fi + + return 0 + fi + + # If it's the last attempt, log a failure message before exiting + if (( retry == max_retry )); then + echo "Error: Timed out waiting for resource '$resource' after ${max_retry} attempts x ${wait_sec} seconds." >&2 + exit 1 # this is wait_for_resource_or_die, so die ... + fi + + # Wait before the next attempt + echo "Attempt ${retry}/${max_retry} didn't succeed." + echo "Waiting $wait_sec seconds for '$resource'." + sleep "$wait_sec" done - return 0 + # unreachable } diff --git a/systemd/crc-test-vsock.py b/systemd/crc-test-vsock.py new file mode 100644 index 000000000..f8ae0a6b3 --- /dev/null +++ b/systemd/crc-test-vsock.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +import socket +import sys +import time +import fcntl, struct +import os +import errno +import pathlib + +VSOCK_DEV = pathlib.Path("/dev/vsock") +HOST_CID = 2 # VMADDR_CID_HOST + +def main(): + if len(sys.argv) != 2: + print("ERROR: expected a vsock port number as first argument.") + raise SystemExit(errno.EINVAL) + + port = int(sys.argv[1]) + tries = 5 + while not VSOCK_DEV.exists(): + tries -= 1 + + if not tries: + print(f"ERROR: {VSOCK_DEV} didn't appear ...") + return errno.ENODEV + print(f"Waiting for {VSOCK_DEV} to appear ... 
({tries} tries left)") + time.sleep(1) + + print(f"Looking up the CID in {VSOCK_DEV}...") + with open(VSOCK_DEV, 'rb') as f: + r = fcntl.ioctl(f, socket.IOCTL_VM_SOCKETS_GET_LOCAL_CID, struct.pack('I', 0)) + cid = struct.unpack('I', r)[0] + print(f'Our vsock CID is {cid}.') + + s = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) + + try: + s.connect((HOST_CID, port)) + except OSError as e: + + if e.errno in (errno.ENODEV, errno.ECONNREFUSED, errno.EHOSTUNREACH, errno.ETIMEDOUT, errno.ECONNRESET): + print(f"No remote host on vsock://{HOST_CID}:{port} ({e.strerror})") + s.close() + return 1 + + print(f"Unexpected error connecting vsock://{HOST_CID}:{port}: {e}") + s.close() + return 1 + + msg = b"hello" + s.sendall(msg) + + s.sendall(b"\n") + + s.close() + print(f"A remote host is listening on vsock://{HOST_CID}:{port}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/systemd/crc-user-mode-networking.sh b/systemd/crc-user-mode-networking.sh new file mode 100644 index 000000000..c60b548f5 --- /dev/null +++ b/systemd/crc-user-mode-networking.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +EXIT_ERROR=77 + +target="${1:-}" +if [[ "$target" == user || -z "$target" ]]; then + # searching for user mode, return 0 if user + EXIT_USER_MODE=0 + EXIT_NOT_USER_MODE=1 +elif [[ "$target" == system ]]; then + # searching for system mode, return 0 if system + EXIT_NOT_USER_MODE=0 + EXIT_USER_MODE=1 +else + echo "ERROR: invalid target '$target'. Expected 'user' (default) or 'system'." >&2 + exit "$EXIT_ERROR" +fi + + +if /usr/local/bin/crc-self-sufficient-env.sh; then + echo "Running a self-sufficient bundle. Not user-mode networking." + if [[ "${CRC_NETWORK_MODE_USER:-}" ]]; then + echo "WARNING: Ignoring CRC_NETWORK_MODE_USER='$CRC_NETWORK_MODE_USER' in the self-sufficient bundle." + fi + + exit "$EXIT_NOT_USER_MODE" +fi + +# no value --> error +if [[ -z "${CRC_NETWORK_MODE_USER:-}" ]]; then + echo "ERROR: CRC_NETWORK_MODE_USER not set. Assuming user networking." >&2 + exit "$EXIT_USER_MODE" +fi + +# value not in [0, 1] --> error +if [[ ! "${CRC_NETWORK_MODE_USER}" =~ ^[01]$ ]]; then + echo "ERROR: unknown network mode: CRC_NETWORK_MODE_USER=${CRC_NETWORK_MODE_USER} (expected 0 or 1)" >&2 + exit "$EXIT_ERROR" +fi + +# value == 0 --> not user-mode +if (( CRC_NETWORK_MODE_USER == 0 )); then + echo "network-mode 'system' detected" + exit "$EXIT_NOT_USER_MODE" +fi + +# value == 1 --> user-mode +if (( CRC_NETWORK_MODE_USER == 1 )); then + echo "network-mode 'user' detected" + exit "$EXIT_USER_MODE" +fi + +# anything else --> error (can't be reached) +echo "ERROR: unknown network mode: CRC_NETWORK_MODE_USER=$CRC_NETWORK_MODE_USER." >&2 +echo "Assuming user networking." >&2 +echo "SHOULD NOT BE REACHED."
>&2 + +exit "$EXIT_ERROR" diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 7cf21e000..78ee273c9 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -7,7 +7,7 @@ Before=ocp-delete-mco-leases.service [Service] Type=oneshot Restart=on-failure -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-wait-apiserver-up.sh diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 28299a5d4..25bfe8b2e 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -1,9 +1,20 @@ #!/bin/bash -set -x +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG=/opt/kubeconfig +SECONDS=0 + +echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time -wait_for_resource node 4 60 +wait_for_resource_or_die node 60 5 + +echo "node resource available, APIServer is ready after $SECONDS seconds." + +echo "All done" + +exit 0 diff --git a/systemd/crc-wait-node-ready.service b/systemd/crc-wait-node-ready.service new file mode 100644 index 000000000..facefe55c --- /dev/null +++ b/systemd/crc-wait-node-ready.service @@ -0,0 +1,18 @@ +[Unit] +Description=CRC Unit waiting till k8s node is ready +Requires=kubelet.service +After=kubelet.service +After=crc-wait-apiserver-up.service +StartLimitIntervalSec=450 +StartLimitBurst=10 + +[Service] +Type=oneshot +Restart=on-failure +RestartSec=10 +Environment=KUBECONFIG=/opt/kubeconfig +ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStart=/usr/local/bin/crc-wait-node-ready.sh + +[Install] +WantedBy=crc-custom.target diff --git a/systemd/crc-wait-node-ready.sh b/systemd/crc-wait-node-ready.sh new file mode 100644 index 000000000..dd3d59d65 --- /dev/null +++ b/systemd/crc-wait-node-ready.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /usr/local/bin/crc-systemd-common.sh + +SECONDS=0 +MAX_RETRY=150 +WAIT_SEC=2 +NODE_NAME=node/crc +# Loop from 1 up to max_retry +for retry in $(seq 1 "$MAX_RETRY"); do + node_status=$(oc get "$NODE_NAME" --no-headers | awk '{print $2}' || true) + node_status=${node_status:-""} + + # Check if the node status is "Ready" + if [[ $node_status == "Ready" ]]; then + echo "CRC node is ready after $SECONDS seconds." + exit 0 + fi + + echo "CRC node is not ready. Status: $node_status" + + # If it's the last attempt, log a failure message before exiting + if (( retry == MAX_RETRY )); then + echo "ERROR: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 + exit 1 + fi + + # Wait before the next attempt + echo "Waiting $WAIT_SEC seconds for crc node to be ready ... 
(Attempt ${retry}/${MAX_RETRY})" + sleep "$WAIT_SEC" +done + +# cannot be reached + +exit 1 diff --git a/systemd/dnsmasq.sh.template b/systemd/dnsmasq.sh.template index f0168fd94..7942fc961 100644 --- a/systemd/dnsmasq.sh.template +++ b/systemd/dnsmasq.sh.template @@ -1,12 +1,33 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x -if [[ ${CRC_NETWORK_MODE_USER} -eq 1 ]]; then +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + + +if (( ${CRC_NETWORK_MODE_USER:-0} == 1 )); then echo -n "network-mode 'user' detected: skipping dnsmasq configuration" exit 0 fi +# The value of APPS_DOMAIN is set by the +# createdisk-library.sh::copy_systemd_units script during the template +# instantiation. So in the final system, the test below should be a +# tautology (i.e., always true if correctly set up) + +# disable nounset to properly reach the error block (cannot use ${var:-} +# here because of the envsubst instantiating the template) +set +o nounset +if [[ -z "${APPS_DOMAIN}" ]]; then + echo "ERROR: APPS_DOMAIN must be defined to use this script" + exit 1 +fi +set -o nounset + hostName=$(hostname) hostIp=$(hostname --all-ip-addresses | awk '{print $1}') diff --git a/systemd/kubelet.service.d/wants-crc-custom.conf b/systemd/kubelet.service.d/wants-crc-custom.conf new file mode 100644 index 000000000..be4b777c2 --- /dev/null +++ b/systemd/kubelet.service.d/wants-crc-custom.conf @@ -0,0 +1,3 @@ +[Unit] +Wants=crc-custom.target +Before=crc-custom.target diff --git a/systemd/ocp-cluster-ca.service b/systemd/ocp-cluster-ca.service index 374383fca..c36cafbcd 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit setting custom cluster ca After=crc-wait-apiserver-up.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done @@ -9,7 +10,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-cluster-ca.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-cluster-ca.sh b/systemd/ocp-cluster-ca.sh index fc82e5ced..01e6f2e12 100644 --- a/systemd/ocp-cluster-ca.sh +++ b/systemd/ocp-cluster-ca.sh @@ -4,20 +4,27 @@ # https://access.redhat.com/solutions/5286371 # https://access.redhat.com/solutions/6054981 +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" -wait_for_resource configmap +wait_for_resource_or_die configmap -external_ip_path=/opt/crc/eip +CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip # may or may not be there. See below ... -if oc get configmap client-ca-custom -n openshift-config; then +if oc get configmap client-ca-custom -n openshift-config 2>/dev/null; then echo "API Server Client CA already rotated..." exit 0 fi +echo "API Server Client CA not rotated. Doing it now ..."
+ # generate CA CA_FILE_PATH="/tmp/custom-ca.crt" CA_KEY_FILE_PATH="/tmp/custom-ca.key" @@ -28,52 +35,78 @@ CA_SUBJ="/OU=openshift/CN=admin-kubeconfig-signer-custom" CLIENT_SUBJ="/O=system:masters/CN=system:admin" VALIDITY=365 +cleanup() { + rm -f "$CA_FILE_PATH" "$CA_KEY_FILE_PATH" \ + "$CLIENT_CA_FILE_PATH" "$CLIENT_CA_KEY_FILE_PATH" "$CLIENT_CSR_FILE_PATH" + echo "Temp files cleanup complete." +} + +# keep cleanup bound to EXIT; no need to clear ERR early +trap cleanup ERR EXIT + # generate the CA private key -openssl genrsa -out ${CA_KEY_FILE_PATH} 4096 +openssl genrsa -out "$CA_KEY_FILE_PATH" 4096 # Create the CA certificate -openssl req -x509 -new -nodes -key ${CA_KEY_FILE_PATH} -sha256 -days $VALIDITY -out ${CA_FILE_PATH} -subj "${CA_SUBJ}" +openssl req -x509 -new -nodes -key "$CA_KEY_FILE_PATH" -sha256 -days "$VALIDITY" -out "$CA_FILE_PATH" -subj "$CA_SUBJ" # create CSR -openssl req -new -newkey rsa:4096 -nodes -keyout ${CLIENT_CA_KEY_FILE_PATH} -out ${CLIENT_CSR_FILE_PATH} -subj "${CLIENT_SUBJ}" +openssl req -new -newkey rsa:4096 -nodes -keyout "$CLIENT_CA_KEY_FILE_PATH" -out "$CLIENT_CSR_FILE_PATH" -subj "$CLIENT_SUBJ" # sign the CSR with above CA -openssl x509 -extfile <(printf "extendedKeyUsage = clientAuth") -req -in ${CLIENT_CSR_FILE_PATH} -CA ${CA_FILE_PATH} \ - -CAkey ${CA_KEY_FILE_PATH} -CAcreateserial -out ${CLIENT_CA_FILE_PATH} -days $VALIDITY -sha256 - -oc create configmap client-ca-custom -n openshift-config --from-file=ca-bundle.crt=${CA_FILE_PATH} -oc patch apiserver cluster --type=merge -p '{"spec": {"clientCA": {"name": "client-ca-custom"}}}' +openssl x509 -extfile <(printf "extendedKeyUsage = clientAuth") -req -in "$CLIENT_CSR_FILE_PATH" -CA "$CA_FILE_PATH" \ + -CAkey "$CA_KEY_FILE_PATH" -CAcreateserial -out "$CLIENT_CA_FILE_PATH" -days "$VALIDITY" -sha256 + +oc create configmap client-ca-custom \ + -n openshift-config \ + --from-file=ca-bundle.crt="$CA_FILE_PATH" \ + --dry-run=client -o yaml \ + | oc apply -f - + +jq -n ' +{ + "spec": { + "clientCA": { + "name": "client-ca-custom" + } + } +}' | oc patch apiserver cluster --type=merge --patch-file=/dev/stdin cluster_name=$(oc config view -o jsonpath='{.clusters[0].name}') -apiserver_url=$(oc config view -o jsonpath='{.clusters[0].cluster.server}') -if [ -f "${external_ip_path}" ]; then - apiserver_url=https://api.$(cat "${external_ip_path}").nip.io:6443 +if [[ -r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + external_ip=$(tr -d '\r\n' < "$CRC_EXTERNAL_IP_FILE_PATH") + apiserver_url=https://api.${external_ip}.nip.io:6443 + echo "INFO: CRC external IP file found. Using apiserver_url='$apiserver_url'." +else + apiserver_url=$(oc config view -o jsonpath='{.clusters[0].cluster.server}') + echo "INFO: CRC external IP file does not exist ($CRC_EXTERNAL_IP_FILE_PATH). Using apiserver_url='$apiserver_url'." 
fi -updated_kubeconfig_path=/opt/crc/kubeconfig -rm -rf "${updated_kubeconfig_path}" +export KUBECONFIG=/opt/crc/kubeconfig +rm -rf "$KUBECONFIG" -oc config set-credentials system:admin --client-certificate=${CLIENT_CA_FILE_PATH} --client-key=${CLIENT_CA_KEY_FILE_PATH} \ - --embed-certs --kubeconfig="${updated_kubeconfig_path}" -oc config set-context system:admin --cluster="${cluster_name}" --namespace=default --user=system:admin --kubeconfig="${updated_kubeconfig_path}" -oc config set-cluster "${cluster_name}" --server="${apiserver_url}" --insecure-skip-tls-verify=true --kubeconfig="${updated_kubeconfig_path}" -oc config use-context system:admin --kubeconfig="${updated_kubeconfig_path}" +oc config set-credentials system:admin \ + --client-certificate="$CLIENT_CA_FILE_PATH" \ + --client-key="$CLIENT_CA_KEY_FILE_PATH" \ + --embed-certs -COUNTER=0 -until oc get co --kubeconfig="${updated_kubeconfig_path}"; -do - if [ $COUNTER == 90 ]; then - echo "Unable to access API server using new client certitificate..." - exit 1 - fi - echo "Acess API server with new client cert, try $COUNTER, hang on...." - sleep 2 - ((COUNTER++)) -done +oc config set-context system:admin --cluster="$cluster_name" --namespace=default --user=system:admin +oc config set-cluster "$cluster_name" --server="$apiserver_url" --insecure-skip-tls-verify=true +oc config use-context system:admin +wait_for_resource_or_die clusteroperators 90 2 -oc create configmap admin-kubeconfig-client-ca -n openshift-config --from-file=ca-bundle.crt=${CA_FILE_PATH} \ - --dry-run=client -o yaml | oc replace -f - +oc create configmap admin-kubeconfig-client-ca \ + -n openshift-config \ + --from-file=ca-bundle.crt="$CA_FILE_PATH" \ + --dry-run=client -oyaml \ + | oc apply -f- # copy the new kubeconfig to /opt/kubeconfig -rm -rf /opt/kubeconfig +rm -f /opt/kubeconfig cp /opt/crc/kubeconfig /opt/kubeconfig -chmod 0666 /opt/kubeconfig +chmod 0666 /opt/kubeconfig # keep the file readable by everyone in the system, this is safe + +# cleanup will apply here + +echo "All done" + +exit 0 diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 19479bb8c..d9909f29c 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -8,7 +8,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-clusterid.sh diff --git a/systemd/ocp-clusterid.sh b/systemd/ocp-clusterid.sh index 686deaa56..3beee5eba 100644 --- a/systemd/ocp-clusterid.sh +++ b/systemd/ocp-clusterid.sh @@ -1,11 +1,19 @@ #!/bin/bash -set -x +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" + +wait_for_resource_or_die clusterversion + uuid=$(uuidgen) -wait_for_resource clusterversion +jq -n --arg id "${uuid}" '{spec: {clusterID: $id}}' \ + | oc patch clusterversion version --type merge --patch-file=/dev/stdin + +echo "All done" -oc patch clusterversion version -p "{\"spec\":{\"clusterID\":\"${uuid}\"}}" --type merge +exit 0 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 6ec401c64..db19d0fa2 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit setting nip.io domain for cluster After=crc-wait-apiserver-up.service +After=ocp-wait-apiservices-available.service 
StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done @@ -9,9 +10,10 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-custom-domain.sh +ExecStartPre=/usr/bin/test -f /opt/crc/eip +ExecStart=/usr/local/bin/ocp-custom-domain.sh /opt/crc/eip ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 47c563ffe..023df73b5 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -1,49 +1,121 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" -if [ ! -f /opt/crc/eip ]; then - echo "external ip not found" +CRC_EXTERNAL_IP_FILE_PATH="${1:-}" + +if [[ -z "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + echo "ERROR: expected to receive the external IP file as first argument ..." >&2 + exit 1 +fi + +# enforced by systemd +if [[ ! -r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + echo "ERROR: CRC external ip file not found ($CRC_EXTERNAL_IP_FILE_PATH)" >&2 + exit 1 +fi + +EIP=$(tr -d '\r\n' < "$CRC_EXTERNAL_IP_FILE_PATH") + +if [[ -z "$EIP" ]]; then + echo "ERROR: External IP file is empty: $CRC_EXTERNAL_IP_FILE_PATH" >&2 + exit 1 +fi + +# Basic IPv4 sanity check; adjust if IPv6 is expected +if [[ ! "$EIP" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then + echo "ERROR: Invalid IPv4 address read from $CRC_EXTERNAL_IP_FILE_PATH: '$EIP'" >&2 exit 1 fi -EIP=$(cat /opt/crc/eip) +wait_for_resource_or_die secret -STEPS_SLEEP_TIME=30 +TMP_KEY_FILE=$(mktemp /tmp/nip.key.XXXXX) +TMP_CRT_FILE=$(mktemp /tmp/nip.crt.XXXXX) -wait_for_resource secret +cleanup() { + rm -f "$TMP_KEY_FILE" "$TMP_CRT_FILE" + echo "Temp files cleanup complete." 
+} + +# Cleanup happens automatically via trap on error or at script end +trap cleanup ERR EXIT # create cert and add as secret -openssl req -newkey rsa:2048 -new -nodes -x509 -days 3650 -keyout /tmp/nip.key -out /tmp/nip.crt -subj "/CN=$EIP.nip.io" -addext "subjectAltName=DNS:apps.$EIP.nip.io,DNS:*.apps.$EIP.nip.io,DNS:api.$EIP.nip.io" -oc delete secret nip-secret -n openshift-config || true -oc create secret tls nip-secret --cert=/tmp/nip.crt --key=/tmp/nip.key -n openshift-config -sleep $STEPS_SLEEP_TIME +openssl req -newkey rsa:2048 -new \ + -nodes -x509 -days 3650 \ + -keyout "$TMP_KEY_FILE" -out "$TMP_CRT_FILE" \ + -subj "/CN=$EIP.nip.io" \ + -addext "subjectAltName=DNS:apps.$EIP.nip.io,DNS:*.apps.$EIP.nip.io,DNS:api.$EIP.nip.io" + +oc delete secret nip-secret -n openshift-config --ignore-not-found +oc create secret tls nip-secret \ + --cert="$TMP_CRT_FILE" \ + --key="$TMP_KEY_FILE" \ + -n openshift-config # patch ingress - cat < /tmp/ingress-patch.yaml -spec: - appsDomain: apps.$EIP.nip.io - componentRoutes: - - hostname: console-openshift-console.apps.$EIP.nip.io - name: console - namespace: openshift-console - servingCertKeyPairSecret: - name: nip-secret - - hostname: oauth-openshift.apps.$EIP.nip.io - name: oauth-openshift - namespace: openshift-authentication - servingCertKeyPairSecret: - name: nip-secret -EOF -oc patch ingresses.config.openshift.io cluster --type=merge --patch-file=/tmp/ingress-patch.yaml +wait_for_resource_or_die ingresses.config.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "appsDomain": "apps.\($eip).nip.io", + "componentRoutes": [ + { + "hostname": "console-openshift-console.apps.\($eip).nip.io", + "name": "console", + "namespace": "openshift-console", + "servingCertKeyPairSecret": { + "name": "nip-secret" + } + }, + { + "hostname": "oauth-openshift.apps.\($eip).nip.io", + "name": "oauth-openshift", + "namespace": "openshift-authentication", + "servingCertKeyPairSecret": { + "name": "nip-secret" + } + } + ] + } +}' | oc patch ingresses.config.openshift.io cluster --type=merge --patch-file=/dev/stdin # patch API server to use new CA secret -oc patch apiserver cluster --type=merge -p '{"spec":{"servingCerts": {"namedCertificates":[{"names":["api.'$EIP'.nip.io"],"servingCertificate": {"name": "nip-secret"}}]}}}' +wait_for_resource_or_die apiserver.config.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "servingCerts": { + "namedCertificates": [ + { + "names": [ + "api.\($eip).nip.io" + ], + "servingCertificate": { + "name": "nip-secret" + } + } + ] + } + } +}' | oc patch apiserver cluster --type=merge --patch-file=/dev/stdin # patch image registry route -oc patch -p '{"spec": {"host": "default-route-openshift-image-registry.'$EIP'.nip.io"}}' route default-route -n openshift-image-registry --type=merge +wait_for_resource_or_die route.route.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "host": "default-route-openshift-image-registry.\($eip).nip.io" + } +}' | oc patch route default-route -n openshift-image-registry --type=merge --patch-file=/dev/stdin + +echo "All done" -#wait_cluster_become_healthy "authentication|console|etcd|ingress|openshift-apiserver" +exit 0 diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service deleted file mode 100644 index ff92d99cd..000000000 --- a/systemd/ocp-growfs.service +++ /dev/null @@ -1,12 +0,0 @@ -[Unit] -Description=CRC Unit to grow the root filesystem -Requires=crc-custom.target - -[Service] -Type=oneshot -EnvironmentFile=-/etc/sysconfig/crc-env 
-ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-growfs.sh - -[Install] -WantedBy=multi-user.target diff --git a/systemd/ocp-growfs.sh b/systemd/ocp-growfs.sh deleted file mode 100644 index c637a7c08..000000000 --- a/systemd/ocp-growfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -x - -root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) -/usr/bin/growpart "${root_partition%?}" "${root_partition#/dev/???}" - -rootFS="/sysroot" -mount -o remount,rw "${rootFS}" -xfs_growfs "${rootFS}" -#mount -o remount,ro "${rootFS}" diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 85aaa170e..94ea9c203 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit patching the MachineConfig to add new ssh key After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=10 @@ -8,9 +9,10 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-mco-sshkey.sh +ExecStartPre=/usr/bin/test -f /opt/crc/id_rsa.pub +ExecStart=/usr/local/bin/ocp-mco-sshkey.sh /opt/crc/id_rsa.pub RemainAfterExit=true [Install] diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 0f1d441bd..31bb5bae8 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -1,22 +1,53 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" -pub_key_path="/opt/crc/id_rsa.pub" +CRC_PUB_KEY_PATH="${1:-}" -if [ ! -f "${pub_key_path}" ]; then - echo "No pubkey file found" +if [[ -z "$CRC_PUB_KEY_PATH" ]]; then + echo "ERROR: expected to receive the path to the pub key file as first argument." exit 1 fi -echo "Updating the public key resource for machine config operator" -pub_key=$(tr -d '\n\r' < ${pub_key_path}) -wait_for_resource machineconfig -if ! oc patch machineconfig 99-master-ssh -p "{\"spec\": {\"config\": {\"passwd\": {\"users\": [{\"name\": \"core\", \"sshAuthorizedKeys\": [\"${pub_key}\"]}]}}}}" --type merge; -then - echo "failed to update public key to machine config operator" +# enforced by systemd +if [[ ! -r "$CRC_PUB_KEY_PATH" ]]; then + echo "ERROR: CRC pubkey file does not exist ($CRC_PUB_KEY_PATH)" exit 1 fi + +wait_for_resource_or_die machineconfig/99-master-ssh + +echo "Updating the public key resource for machine config operator" + +# Use --rawfile to read the key file directly into a jq variable named 'pub_key'. +# The key's content is never exposed as a command-line argument. +# We use jq's rtrimstr function to remove any trailing newlines from the file. 
+ +jq -n --rawfile pub_key "$CRC_PUB_KEY_PATH" ' +{ + "spec": { + "config": { + "passwd": { + "users": [ + { + "name": "core", + "sshAuthorizedKeys": [ + # Trim trailing newlines and carriage returns from the slurped file content + $pub_key | rtrimstr("\n") | rtrimstr("\r") + ] + } + ] + } + } + } +}' | oc patch machineconfig 99-master-ssh --type merge --patch-file=/dev/stdin + +echo "All done" + +exit 0 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 57937762f..30919f51c 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -2,6 +2,7 @@ Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done @@ -10,10 +11,11 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStartPre=/usr/bin/sleep 5 -ExecStart=/usr/local/bin/ocp-userpasswords.sh +ExecStartPre=/usr/bin/test -f /opt/crc/pass_developer +ExecStartPre=/usr/bin/test -f /opt/crc/pass_kubeadmin +ExecStart=/usr/local/bin/ocp-userpasswords.sh /opt/crc/pass_kubeadmin /opt/crc/pass_developer ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index f2a6d2a02..f3e508430 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -7,43 +7,59 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" + +CRC_PASS_KUBEADMIN_PATH=${1:-} +CRC_PASS_DEVELOPER_PATH=${2:-} + +if [[ -z "$CRC_PASS_KUBEADMIN_PATH" || -z "$CRC_PASS_DEVELOPER_PATH" ]]; then + echo "ERROR: expected to receive the kubeadmin password file as 1st arg and the dev password file as 2nd arg. Got '$CRC_PASS_KUBEADMIN_PATH' and '$CRC_PASS_DEVELOPER_PATH'" + exit 1 +fi + +CRC_HTPASSWD_IMAGE=registry.access.redhat.com/ubi10/httpd-24 function gen_htpasswd() { if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then - echo "gen_htpasswd needs two arguments: username password" 1>&2 + echo "gen_htpasswd needs two arguments: username password" >&2 return 1 fi - podman run --rm docker.io/xmartlabs/htpasswd "$1" "$2" + # --log-driver=none avoids that the journal captures the stdout + # logs of podman and leaks the passwords in the journal ... + podman run --log-driver=none --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } -wait_for_resource secret - -if [ ! -f /opt/crc/pass_developer ]; then - echo "developer password does not exist" +# enforced by systemd +if [[ ! -r "$CRC_PASS_DEVELOPER_PATH" ]]; then + echo "ERROR: CRC developer password does not exist ($CRC_PASS_DEVELOPER_PATH)" exit 1 fi -if [ ! -f /opt/crc/pass_kubeadmin ]; then - echo "kubeadmin password does not exist" +# enforced by systemd +if [[ ! -r "$CRC_PASS_KUBEADMIN_PATH" ]]; then + echo "ERROR: CRC kubeadmin password does not exist ($CRC_PASS_KUBEADMIN_PATH)" exit 1 fi -echo "generating the kubeadmin and developer passwords ..." +echo "Pulling $CRC_HTPASSWD_IMAGE ..." 
+podman pull --quiet "$CRC_HTPASSWD_IMAGE" -set +x # /!\ disable the logging to avoid leaking the passwords +wait_for_resource_or_die secret -dev_pass=$(gen_htpasswd developer "$(cat /opt/crc/pass_developer)") -adm_pass=$(gen_htpasswd kubeadmin "$(cat /opt/crc/pass_kubeadmin)") +echo "Generating the kubeadmin and developer passwords ..." +set +x # disable the logging to avoid leaking the passwords + +dev_pass=$(gen_htpasswd developer "$(cat "$CRC_PASS_DEVELOPER_PATH")") +adm_pass=$(gen_htpasswd kubeadmin "$(cat "$CRC_PASS_KUBEADMIN_PATH")") echo "creating the password secret ..." -# use bash <() to use a temporary fd file -# use sed to remove the empty lines +# use bash process substitution "<()" as a temporary fd-backed file (safer for handling secrets) oc create secret generic htpass-secret \ --from-file=htpasswd=<(printf '%s\n%s\n' "$dev_pass" "$adm_pass") \ -n openshift-config \ --dry-run=client -oyaml \ | oc apply -f- -echo "all done" +echo "All done" + +exit 0 diff --git a/systemd/ocp-wait-apiservices-available.service b/systemd/ocp-wait-apiservices-available.service new file mode 100644 index 000000000..a82cde3a7 --- /dev/null +++ b/systemd/ocp-wait-apiservices-available.service @@ -0,0 +1,23 @@ +[Unit] +Description=Wait for all Kubernetes APIServices to be Available + +# This service needs network to talk to the k8s API server +Wants=network-online.target +After=network-online.target +After=crc-wait-apiserver-up.service +After=crc-wait-node-ready.service +Requires=crc-wait-node-ready.service +StartLimitIntervalSec=450 +StartLimitBurst=10 + +[Service] +Type=oneshot +Restart=on-failure +RestartSec=20 +ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStart=/usr/local/bin/ocp-wait-apiservices-available.sh + +Environment=KUBECONFIG=/opt/kubeconfig + +[Install] +WantedBy=crc-custom.target diff --git a/systemd/ocp-wait-apiservices-available.sh b/systemd/ocp-wait-apiservices-available.sh new file mode 100644 index 000000000..1bb89e0a1 --- /dev/null +++ b/systemd/ocp-wait-apiservices-available.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +echo "➡️ Waiting for all APIServices to become available..." + +SECONDS=0 +MAX_RETRY=60 +WAIT_SEC=5 + +for retry in $(seq 1 "$MAX_RETRY"); do + # Query all APIServices as JSON, then count how many do not report + # an 'Available' condition with status "True". + APISERVICE_DATA=$(oc get apiservices -o json 2>/dev/null || true) + if [[ -z "$APISERVICE_DATA" ]]; then + UNAVAILABLE_COUNT=999 + echo "⚠️ Couldn't get the list of apiservices ..." + else + UNAVAILABLE_COUNT=$(jq -r ' + [ .items[] + | select(((.status.conditions // []) + | any(.type=="Available" and .status=="True")) | not) + ] | length + ' <<<"$APISERVICE_DATA") + UNAVAILABLE_COUNT=${UNAVAILABLE_COUNT:-0} + fi + + if [ "$UNAVAILABLE_COUNT" -eq 0 ]; then + echo "✅ All APIServices are now available after $SECONDS seconds." + break + fi + + echo + echo "⏳ Still waiting for $UNAVAILABLE_COUNT APIService(s) to become available. Retrying in $WAIT_SEC seconds." + echo "--------------------------------------------------------------------------------" + echo "Unavailable services and their messages:" + + # Get all apiservices as JSON and pipe to jq for filtering and formatting. + # The '-r' flag outputs raw strings instead of JSON-quoted strings. + if ! oc get apiservices -o json | jq -r ' + .items[] | + . as $item | + ( + $item.status.conditions[]?
| + select(.type == "Available" and .status == "False") + ) | + " - \($item.metadata.name): \(.reason) - \(.message)" + ' + then + echo "⚠️ Unable to list unavailable APIServices details (will retry)" >&2 + fi + + echo "--------------------------------------------------------------------------------" + + # If it's the last attempt, log a failure message before exiting + if (( retry == MAX_RETRY )); then + echo "ERROR: Timed out waiting for the api-services to get ready, after $MAX_RETRY attempts x $WAIT_SEC seconds = $SECONDS seconds." >&2 + exit 1 + fi + + sleep "$WAIT_SEC" +done + +echo "🎉 Done." + +exit 0 diff --git a/systemd/ovs-configuration.service.d/mute-console.conf b/systemd/ovs-configuration.service.d/mute-console.conf new file mode 100644 index 000000000..749ed6d5d --- /dev/null +++ b/systemd/ovs-configuration.service.d/mute-console.conf @@ -0,0 +1,3 @@ +[Service] +StandardOutput=append:/var/log/ovs-configure.log +StandardError=append:/var/log/ovs-configure.log diff --git a/tools.sh b/tools.sh index 3b3d98a56..478d9ad0e 100755 --- a/tools.sh +++ b/tools.sh @@ -221,8 +221,10 @@ function generate_htpasswd_file { local pass_file=$2 ( set +x # use a subshell to avoid leaking the password - local random_password=$(cat $1/auth/kubeadmin-password) - ${HTPASSWD} -c -B -i "${pass_file}" developer <<<"developer" - ${HTPASSWD} -B -i "${pass_file}" kubeadmin <<<"${random_password}" + + local random_password + random_password=$(cat "$auth_file_dir/auth/kubeadmin-password") + "${HTPASSWD}" -c -B -i "${pass_file}" developer <<< "developer" # use -c to create the file + "${HTPASSWD}" -B -i "${pass_file}" kubeadmin <<< "${random_password}" # append to the existing password file ) }