From 3a45aef736849d7afaa54251f6c9c645e6a03f30 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 15:49:48 +0200 Subject: [PATCH 01/44] createdisk: remove trailing spaces --- createdisk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/createdisk.sh b/createdisk.sh index 84cfd0fa9..c09cb70c1 100755 --- a/createdisk.sh +++ b/createdisk.sh @@ -52,7 +52,7 @@ wait_for_ssh ${VM_NAME} ${VM_IP} if [ ${BUNDLE_TYPE} != "microshift" ]; then # Disable kubelet service ${SSH} core@${VM_IP} -- sudo systemctl disable kubelet - + # Stop the kubelet service so it will not reprovision the pods ${SSH} core@${VM_IP} -- sudo systemctl stop kubelet fi From 9b3c17771995a0ffe8f0f9d3860fc6a55495919b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:00:32 +0200 Subject: [PATCH 02/44] systemd/*.service: Ensure that crc-env exists before starting This commit enforces that the CRC services that require the CRC configuration file (`/etc/sysconfig/crc-env`) don't start before the file has been populated. Most of the services actually have this line: ``` ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ``` which requires `/etc/sysconfig/crc-env`, so the synchronization point ``` After=crc-env-file-exists.service ``` is present in most of the services. This will go away when the self-sufficient bundle becomes the default path. 
--- systemd/crc-cluster-status.service | 1 + systemd/crc-custom.target | 1 + systemd/crc-dnsmasq.service | 1 + systemd/crc-env-file-exists.service | 20 ++++++++++++++++++++ systemd/crc-no-tap.service | 1 + systemd/crc-pullsecret.service | 1 + systemd/crc-routes-controller.service | 1 + systemd/crc-wait-apiserver-up.service | 1 + systemd/ocp-clusterid.service | 1 + systemd/ocp-custom-domain.service | 1 + systemd/ocp-growfs.service | 1 + systemd/ocp-mco-sshkey.service | 1 + systemd/ocp-userpasswords.service | 1 + 13 files changed, 32 insertions(+) create mode 100644 systemd/crc-env-file-exists.service diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 92d73dffe..a30379fb3 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit checking if cluster is ready +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service diff --git a/systemd/crc-custom.target b/systemd/crc-custom.target index 206d482fa..cffb4d5b9 100644 --- a/systemd/crc-custom.target +++ b/systemd/crc-custom.target @@ -1,4 +1,5 @@ [Unit] Description=crc custom target Requires=kubelet-dependencies.target +Requires=crc-env-file-exists.service After=kubelet-dependencies.target diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index 42d45a93d..17346235b 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit for configuring dnsmasq Wants=ovs-configuration.service +After=crc-env-file-exists.service After=ovs-configuration.service Before=kubelet-dependencies.target StartLimitIntervalSec=30 diff --git a/systemd/crc-env-file-exists.service b/systemd/crc-env-file-exists.service new file mode 100644 index 000000000..35a9fb344 --- /dev/null +++ 
b/systemd/crc-env-file-exists.service @@ -0,0 +1,20 @@ +[Unit] +Description=Wait for /etc/sysconfig/crc-env file to be populated + +[Service] +# This service runs a command once and then exits. +Type=oneshot + +# This is the magic part. It keeps the service in an 'active' state +# after the command exits, so other services can see it succeeded. +RemainAfterExit=yes + +# This is the command that waits for the file. +# It checks every second if the file does not exist ('! -f'). +# Once the file is found, the loop exits, the command succeeds, and the service is 'active'. +ExecStart=/bin/sh -c 'while [ ! -f /etc/sysconfig/crc-env ]; do sleep 1; done' +TimeoutStartSec=300 + +[Install] +# Ensure this service is started during the normal boot process. +WantedBy=crc-custom.target diff --git a/systemd/crc-no-tap.service b/systemd/crc-no-tap.service index fe215e2c4..318467d54 100644 --- a/systemd/crc-no-tap.service +++ b/systemd/crc-no-tap.service @@ -2,6 +2,7 @@ Description=Ensure that tap0 network configuration is absent on Apple Virtualization Before=NetworkManager.service After=local-fs.target +After=crc-env-file-exists.service RequiresMountsFor=/etc/NetworkManager/system-connections [Service] diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 4c88531b7..79c86d53f 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit for adding pull secret to cluster +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index fe56fde2a..4ac3408a0 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit starting routes controller +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git 
a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 7cf21e000..477692345 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -2,6 +2,7 @@ Description=CRC Unit waiting till k8s API server is up Requires=kubelet.service After=kubelet.service +After=crc-env-file-exists.service Before=ocp-delete-mco-leases.service [Service] diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 19479bb8c..9c4ec761e 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit setting random cluster ID +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 6ec401c64..273ec7950 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit setting nip.io domain for cluster +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service index ff92d99cd..307bdbece 100644 --- a/systemd/ocp-growfs.service +++ b/systemd/ocp-growfs.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit to grow the root filesystem Requires=crc-custom.target +After=crc-env-file-exists.service [Service] Type=oneshot diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 85aaa170e..81e0fc1c2 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit patching the MachineConfig to add new ssh key +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 57937762f..e5e30af1a 100644 --- 
a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 From c1b4155bbab0c3a39073e9f8331391c246bd354a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:00:32 +0200 Subject: [PATCH 03/44] systemd/*.service: don't use EnvironmentFile when not needed The `EnvironmentFile` makes sense when the `unit.service` file relies on these environment variables, which isn't the case for CRC services. Instead, the relevant scripts should call `source /etc/sysconfig/crc-env` to be self-sufficient. This will be done in a follow up commit. --- systemd/crc-cluster-status.service | 1 - systemd/crc-dnsmasq.service | 1 - systemd/crc-no-tap.service | 1 - systemd/crc-pullsecret.service | 1 - systemd/crc-routes-controller.service | 1 - systemd/crc-wait-apiserver-up.service | 1 - systemd/ocp-cluster-ca.service | 1 - systemd/ocp-clusterid.service | 1 - systemd/ocp-custom-domain.service | 1 - systemd/ocp-growfs.service | 1 - systemd/ocp-mco-sshkey.service | 1 - systemd/ocp-userpasswords.service | 1 - 12 files changed, 12 deletions(-) diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index a30379fb3..74c8af9a0 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -12,7 +12,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-cluster-status.sh RemainAfterExit=true diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index 17346235b..a01107210 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -9,7 +9,6 @@ StartLimitIntervalSec=30 [Service] Type=oneshot 
Restart=on-failure -EnvironmentFile=-/etc/sysconfig/crc-env ExecStartPre=/bin/systemctl start ovs-configuration.service ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-dnsmasq.sh diff --git a/systemd/crc-no-tap.service b/systemd/crc-no-tap.service index 318467d54..3f7c64e59 100644 --- a/systemd/crc-no-tap.service +++ b/systemd/crc-no-tap.service @@ -7,7 +7,6 @@ RequiresMountsFor=/etc/NetworkManager/system-connections [Service] Type=oneshot -EnvironmentFile=-/etc/sysconfig/crc-env ExecStart=/usr/local/bin/crc-no-tap.sh [Install] diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 79c86d53f..a76b36584 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -10,7 +10,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-pullsecret.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index 4ac3408a0..a1f3c53c4 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -9,7 +9,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 477692345..2a6061917 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -8,7 +8,6 @@ Before=ocp-delete-mco-leases.service [Service] Type=oneshot Restart=on-failure -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-wait-apiserver-up.sh diff --git a/systemd/ocp-cluster-ca.service 
b/systemd/ocp-cluster-ca.service index 374383fca..8f5a8d2ef 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -9,7 +9,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-cluster-ca.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 9c4ec761e..f901f8160 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -9,7 +9,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-clusterid.sh diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 273ec7950..93d644ce0 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -10,7 +10,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-custom-domain.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service index 307bdbece..0790e4dd7 100644 --- a/systemd/ocp-growfs.service +++ b/systemd/ocp-growfs.service @@ -5,7 +5,6 @@ After=crc-env-file-exists.service [Service] Type=oneshot -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-growfs.sh diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 81e0fc1c2..42b4b5587 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -9,7 +9,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env 
ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-mco-sshkey.sh RemainAfterExit=true diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index e5e30af1a..b47b9fcf1 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -11,7 +11,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStartPre=/usr/bin/sleep 5 ExecStart=/usr/local/bin/ocp-userpasswords.sh From c634d53d64620fa40b9950794bbd6ffa4fd9bc1d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:20:47 +0200 Subject: [PATCH 04/44] systemd: Improve the handling of the tap networking This commit clarifies the enablement/disablement of the TAP networking. * `systemd/crc-needs-tap.sh` this script tells if the TAP networking should be enabled or not. The choice is currently done by checking the CRC configuration file. * `systemd/crc-conditionally-disable-tap.sh` this script checks if the TAP networking should be disabled or not. I had to use a script to make the choice, as using the SystemD directives would have failed the service. * `systemd/crc-disable-tap.sh` this script disables the TAP networking, by deactivating the GV proxy and the `tap0` network configuration. 
* `crc-self-sufficient-env.sh` this script tells if VM is running a self-sufficient bundle * `crc-user-mode-networking.sh` this script checks if the user-mode networking should be enabled --- createdisk.sh | 4 ++ ...c-no-tap.service => crc-check-tap.service} | 6 +- systemd/crc-conditionally-disable-tap.sh | 17 +++++ systemd/crc-disable-tap.sh | 14 ++++ systemd/crc-needs-tap.sh | 44 +++++++++++++ systemd/crc-no-tap.sh | 10 --- systemd/crc-self-sufficient-env.sh | 18 +++-- systemd/crc-user-mode-networking.sh | 65 +++++++++++++++++++ 8 files changed, 161 insertions(+), 17 deletions(-) rename systemd/{crc-no-tap.service => crc-check-tap.service} (50%) create mode 100644 systemd/crc-conditionally-disable-tap.sh create mode 100644 systemd/crc-disable-tap.sh create mode 100644 systemd/crc-needs-tap.sh delete mode 100644 systemd/crc-no-tap.sh create mode 100644 systemd/crc-user-mode-networking.sh diff --git a/createdisk.sh b/createdisk.sh index c09cb70c1..9ce573438 100755 --- a/createdisk.sh +++ b/createdisk.sh @@ -109,11 +109,15 @@ ${SSH} core@${VM_IP} 'sudo bash -x -s' < vfkit doesn't need tap0 +# --> other platforms do need it + +virt="$(systemd-detect-virt || true)" + +if [[ -z "$virt" ]]; then + echo "ERROR: systemd couldn't detect the virtualization :/" >&2 + exit "$EXIT_ERROR" +fi + +if [[ "${virt}" == apple ]] ; then + echo "Running with vfkit ($virt) virtualization. Don't need tap0." + exit "$EXIT_DONT_NEED_TAP" +fi + +echo "Running with '$virt' virtualization. Need tap0." 
+ +exit "$EXIT_NEED_TAP" diff --git a/systemd/crc-no-tap.sh b/systemd/crc-no-tap.sh deleted file mode 100644 index 1f0410221..000000000 --- a/systemd/crc-no-tap.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Return true if running under Apple Virtualization or CRC_SELF_SUFFICIENT is set, otherwise false - -if systemd-detect-virt | grep -q '^apple$' || [ -n "$CRC_SELF_SUFFICIENT" ]; then - rm -f /etc/NetworkManager/system-connections/tap0.nmconnection - systemctl disable --now gv-user-network@tap0.service -fi - -exit 0 diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index 32dde4294..a19f7c7d4 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,8 +1,16 @@ #!/bin/bash -set -euo pipefail -# Optional: load env if unit forgot EnvironmentFile -[ -r /etc/sysconfig/crc-env ] && . /etc/sysconfig/crc-env -if [ "${CRC_SELF_SUFFICIENT:-}" = "1" ] || [ "${CRC_CLOUD:-}" = "1" ]; then + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +if (( ${CRC_SELF_SUFFICIENT:-0} == 1 )); then + echo "Running with a self-sufficient bundle" exit 0 +else + echo "Not running in a self-sufficient bundle" + exit 1 fi -exit 1 \ No newline at end of file diff --git a/systemd/crc-user-mode-networking.sh b/systemd/crc-user-mode-networking.sh new file mode 100644 index 000000000..109603e5e --- /dev/null +++ b/systemd/crc-user-mode-networking.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +EXIT_ERROR=77 + +target="${1:-}" +if [[ "$target" == user || -z "$target" ]]; then + # searching for user mode, return 0 if user + EXIT_USER_MODE=0 + EXIT_NOT_USER_MODE=1 +elif [[ "$target" == system ]]; then + # searching for system mode, return 0 if system + EXIT_NOT_USER_MODE=0 + EXIT_USER_MODE=1 +else + echo "ERROR: 
invalid target '$target'. Should be 'user' (default) or 'system'. Got '$target'." >&2 + exit "$EXIT_ERROR" +fi + + +if /usr/local/bin/crc-self-sufficient-env.sh; then + echo "Running a self-sufficient bundle. Not user-mode networking." + if [[ "${CRC_NETWORK_MODE_USER:-}" ]]; then + echo "WARNING: Ignoring CRC_NETWORK_MODE_USER='$CRC_NETWORK_MODE_USER' in the self-sufficient bundle." + fi + + exit "$EXIT_NOT_USER_MODE" +fi + +# no value --> error +if [[ -z "${CRC_NETWORK_MODE_USER:-}" ]]; then + echo "ERROR: CRC_NETWORK_MODE_USER not set. Assuming user networking." >&2 + exit "$EXIT_ERROR" +fi + +# value not in [0, 1] --> error +if [[ ! "${CRC_NETWORK_MODE_USER}" =~ ^[01]$ ]]; then + echo "ERROR: unknown network mode: CRC_NETWORK_MODE_USER=${CRC_NETWORK_MODE_USER} (expected 0 or 1)" >&2 + exit "$EXIT_ERROR" +fi + +# value == 0 --> not user-node +if (( CRC_NETWORK_MODE_USER == 0 )); then + echo "network-mode 'system' detected" + exit "$EXIT_NOT_USER_MODE" +fi + +# value == 1 --> user-mode +if (( CRC_NETWORK_MODE_USER == 1 )); then + echo "network-mode 'user' detected" + exit "$EXIT_USER_MODE" +fi + +# anything else --> error (can't be reached) +echo "ERROR: unknown network mode: CRC_NETWORK_MODE_USER=$CRC_NETWORK_MODE_USER." >&2 +echo "Assuming user networking." >&2 +echo "SHOULD NOT BE REACHED." 
>&2 + +exit "$EXIT_ERROR" From 22320bf36b3453492e9549479218f8c5f7d1d04b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:23:18 +0200 Subject: [PATCH 05/44] tools.sh: improve the bash syntax of the `generate_htpasswd_file` function Minor improvements of the `generate_htpasswd_file` function: - don't use `local var=$(command)` as this avoids the `set -e` safety net (if `command` fails, the failure is ignored by Bash) - consistent use of `$auth_file_dir` instead of `$1` - better comments to explain why the two `htpasswd` calls don't have the same arguments --- tools.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools.sh b/tools.sh index 3b3d98a56..478d9ad0e 100755 --- a/tools.sh +++ b/tools.sh @@ -221,8 +221,10 @@ function generate_htpasswd_file { local pass_file=$2 ( set +x # use a subshell to avoid leaking the password - local random_password=$(cat $1/auth/kubeadmin-password) - ${HTPASSWD} -c -B -i "${pass_file}" developer <<<"developer" - ${HTPASSWD} -B -i "${pass_file}" kubeadmin <<<"${random_password}" + + local random_password + random_password=$(cat "$auth_file_dir/auth/kubeadmin-password") + "${HTPASSWD}" -c -B -i "${pass_file}" developer <<< "developer" # use -c to create the file + "${HTPASSWD}" -B -i "${pass_file}" kubeadmin <<< "${random_password}" # append to the existing password file ) } From 060329bdd0b63679b41ed62d1987e74db18fb5a3 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:27:02 +0200 Subject: [PATCH 06/44] crc-systemd-common.sh: improve the bash syntax, rename into wait_for_resource_or_die * Rename the function into `wait_for_resource_or_die` to make it clear that the function exits if the wait fails * Disable `set -x` during the wait, to reduce the journal verbosity * Check that the `$resource` argument isn't missing * Use of Bash arithmetic syntax to make the code more readable * Explicit of `for (())` and `(( retry == max_retry ))` checks to easily read the execution flow 
* Clear error messages --- systemd/crc-systemd-common.sh | 50 +++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/systemd/crc-systemd-common.sh b/systemd/crc-systemd-common.sh index bd68169ed..c384eb39d 100644 --- a/systemd/crc-systemd-common.sh +++ b/systemd/crc-systemd-common.sh @@ -1,15 +1,49 @@ # $1 is the resource to check # $2 is an optional maximum retry count; default 20 -function wait_for_resource() { - local retry=0 +function wait_for_resource_or_die() { + local resource=${1:-} local max_retry=${2:-20} local wait_sec=${3:-5} - until oc get "$1" > /dev/null 2>&1 - do - [[ "$retry" -ge "$max_retry" ]] && exit 1 - sleep $wait_sec - ((retry++)) + + local xtrace_was_disabled=0 + # Check if xtrace is currently DISABLED. If so, set a flag. + [[ $- == *x* ]] || xtrace_was_disabled=1 + set +x # disable xtrace to reduce the verbosity of this function + + if [[ -z "$resource" ]]; then + echo "ERROR: expected a K8s resource as first parameter ..." + echo "ERROR: wait_for_resource_or_die RESOURCE [max_retry=20] [wait_sec=5]" + exit 1 # this is wait_for_resource_or_die, so die ... + fi + + # Loop from 1 up to max_retry + for (( retry=1; retry<=max_retry; retry++ )); do + # Try the command. If it succeeds, exit the loop. + if oc get $resource > /dev/null 2>&1; then + local end_time + end_time=$(date +%s) + + local duration=$((end_time - start_time)) + echo "Resource '$resource' found after $retry tries ($duration seconds)." + + if (( ! xtrace_was_disabled )); then + set -x # reenable xtrace + fi + + return 0 + fi + + # If it's the last attempt, log a failure message before exiting + if (( retry == max_retry )); then + echo "Error: Timed out waiting for resource '$resource' after ${max_retry} attempts x ${wait_sec} seconds." >&2 + exit 1 # this is wait_for_resource_or_die, so die ... + fi + + # Wait before the next attempt + echo "Attempt ${retry}/${max_retry} didn't succeed." 
+ echo "Waiting $wait_sec seconds for '$resource'." + sleep "$wait_sec" done - return 0 + # unreachable } From 6255e673687f91a36cd46244caf41ae8112d3d46 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:26:16 +0200 Subject: [PATCH 07/44] crc-cluster-status: improve the bash syntax * Clarification of the login test/retry logic. * Simple time tracking for a quick glance assessment of the wait duration * Better logging and constant definition --- systemd/crc-cluster-status.sh | 46 +++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/systemd/crc-cluster-status.sh b/systemd/crc-cluster-status.sh index a62586438..2529779ad 100644 --- a/systemd/crc-cluster-status.sh +++ b/systemd/crc-cluster-status.sh @@ -7,6 +7,8 @@ set -o errtrace set -x export KUBECONFIG=/opt/kubeconfig +MAXIMUM_LOGIN_RETRY=10 +RETRY_DELAY=5 if [ ! -f /opt/crc/pass_kubeadmin ]; then echo "kubeadmin password file not found" @@ -15,25 +17,49 @@ fi rm -rf /tmp/.crc-cluster-ready +SECONDS=0 if ! oc adm wait-for-stable-cluster --minimum-stable-period=1m --timeout=10m; then exit 1 fi +echo "Cluster took $SECONDS seconds to stabilize." -echo "Logging into OpenShift with kubeadmin user to update $KUBECONFIG" -COUNTER=1 -MAXIMUM_LOGIN_RETRY=10 +echo "Logging into OpenShift with kubeadmin user to update the KUBECONFIG" + +try_login() { + ( # use a `(set +x)` subshell to avoid leaking the password + set +x + set +e # don't abort on error in this subshell + oc login --insecure-skip-tls-verify=true \ + -u kubeadmin \ + -p "$(cat /opt/crc/pass_kubeadmin)" \ + https://api.crc.testing:6443 > /dev/null 2>&1 + ) + local success="$?" + if [[ "$success" == 0 ]]; then + echo "Login succeeded" + else + echo "Login did not complete ..." 
+ fi -# use a `(set +x)` subshell to avoid leaking the password -until (set +x ; oc login --insecure-skip-tls-verify=true -u kubeadmin -p "$(cat /opt/crc/pass_kubeadmin)" https://api.crc.testing:6443 > /dev/null 2>&1); do - if [ "$COUNTER" -ge "$MAXIMUM_LOGIN_RETRY" ]; then - echo "Unable to login to the cluster..., authentication failed." + return "$success" +} + +for ((counter=1; counter<=MAXIMUM_LOGIN_RETRY; counter++)); do + echo "Login attempt $counter/$MAXIMUM_LOGIN_RETRY…" + if try_login; then + break + fi + if (( counter == MAXIMUM_LOGIN_RETRY )); then + echo "Unable to login to the cluster after $counter attempts; authentication failed." exit 1 fi - echo "Logging into OpenShift with updated credentials try $COUNTER, hang on...." - sleep 5 - ((COUNTER++)) + sleep "$RETRY_DELAY" done # need to set a marker to let `crc` know the cluster is ready touch /tmp/.crc-cluster-ready + +echo "All done after $SECONDS seconds " + +exit 0 From b3d28fdf865fdcf8f15c6754856b1a7c1ab7e3b4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:28:28 +0200 Subject: [PATCH 08/44] crc-pullsecret.sh: syntax and reliability improvements * More resilient checks of the pull secrets file * More secure handling of the pull secrets: * don't pass them via the command-line, but via stdin * use `jq` to enforce that the secrets are properly inserted in the patch JSON object --- systemd/crc-pullsecret.sh | 44 +++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index 895a5480a..d1665357b 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -9,24 +9,46 @@ set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -wait_for_resource secret +PULL_SECRETS_FILE="/opt/crc/pull-secret" -set +x # disable the logging to avoid leaking the pull secrets +wait_for_resource_or_die secret -# check if existing pull-secret is valid if not add 
the one from /opt/crc/pull-secret -existingPsB64=$(oc get secret pull-secret -n openshift-config -o jsonpath="{['data']['\.dockerconfigjson']}") -existingPs=$(echo "${existingPsB64}" | base64 -d) +# The pull secret data is piped through stdin and not exposed in command arguments, +# so `set -x` is safe to keep # check if the .auths field is there -if echo "${existingPs}" | jq -e 'has("auths")' >/dev/null 2>&1; then - echo "Cluster already has the pull secrets, nothing to do" +if oc get secret pull-secret \ + -n openshift-config \ + -o jsonpath="{['data']['\.dockerconfigjson']}" \ + | base64 -d \ + | jq -e 'has("auths")' >/dev/null 2>&1 +then + echo "Cluster already has some pull secrets, nothing to do." exit 0 fi -echo "Cluster doesn't have the pull secrets. Setting them from /opt/crc/pull-secret ..." -pullSecretB64=$(base64 -w0 < /opt/crc/pull-secret) +echo "Cluster doesn't have the pull secrets. Setting them from $PULL_SECRETS_FILE ..." + +if [[ ! -r "$PULL_SECRETS_FILE" ]]; +then + echo "ERROR: $PULL_SECRETS_FILE is missing or unreadable" 1>&2 + exit 1 +fi + +if ! 
jq -e 'has("auths")' < "$PULL_SECRETS_FILE" >/dev/null; +then + echo "ERROR: pull-secrets file doesn't have the required '.auths' field" + exit 1 +fi + # Create the JSON patch in memory and pipe it to the oc command -printf '{"data":{".dockerconfigjson": "%s"}}' "${pullSecretB64}" | \ - oc patch secret pull-secret -n openshift-config --type merge --patch-file=/dev/stdin +base64 -w0 < "$PULL_SECRETS_FILE" | \ + jq -R '{"data": {".dockerconfigjson": .}}' | \ + oc patch secret pull-secret \ + -n openshift-config \ + --type merge \ + --patch-file=/dev/stdin + +echo "All done" exit 0 From e658a377f9afa4624043daf3dd509a35a017b848 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:30:03 +0200 Subject: [PATCH 09/44] crc-routes-controller.service: add a condition on user-mode networking Add a SystemD primitive to enforce that the `crc-routes-controller` is only deployed when user-mode networking has been enabled --- systemd/crc-routes-controller.service | 1 + systemd/crc-routes-controller.sh | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index a1f3c53c4..65f6e1973 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -9,6 +9,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +ExecCondition=/usr/local/bin/crc-user-mode-networking.sh ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 7aa2c3316..3fc3fb919 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -2,10 +2,6 @@ set -x -if [[ ${CRC_NETWORK_MODE_USER} -eq 0 ]]; then - echo -n "network-mode 'system' detected: skipping routes-controller pod deployment" - exit 0 -fi source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG=/opt/kubeconfig From 
efab23ccb3c008de5fb9b3b1d6fad81d6a5381fb Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:31:21 +0200 Subject: [PATCH 10/44] crc-routes-controller.sh: minor syntax improvements * Make the script more resilient by failing on any error. * Better use of script constants * Switch to `wait_for_resource_or_die` --- systemd/crc-routes-controller.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 3fc3fb919..64d3f6f42 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -1,12 +1,22 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x +ROUTE_CONTROLLER=/opt/crc/routes-controller.yaml source /usr/local/bin/crc-systemd-common.sh + export KUBECONFIG=/opt/kubeconfig -wait_for_resource pods +wait_for_resource_or_die pods +wait_for_resource_or_die deployments + +oc apply -f "$ROUTE_CONTROLLER" -oc apply -f /opt/crc/routes-controller.yaml +echo "All done." 
+exit 0 From a35018c0a3062b47ab17a9078ba247225c3084a9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:32:25 +0200 Subject: [PATCH 11/44] crc-wait-apiserver-up.sh: minor syntax improvements * Make the script more resilient by failing on any error * Make more verbose * Use `wait_for_resource_or_die` * Switch the retry-delay from 4 tries, 60s delay to 60 tries, 4 seconds delay * this makes the script detect earlier when the APIServer becomes available --- systemd/crc-wait-apiserver-up.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 28299a5d4..e18494a01 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -1,9 +1,19 @@ #!/bin/bash -set -x +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG=/opt/kubeconfig +echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time -wait_for_resource node 4 60 +wait_for_resource_or_die node 4 60 + +echo "node resource available, APIServer is ready." 
+ +echo "All done" + +exit 0 From f66ea10964a230a1b472a682df0dd99e637e4990 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:32:50 +0200 Subject: [PATCH 12/44] dnsmasq.sh.template: minor syntax improvements * Make more resilient by failing with any error * Use bash arithmetic syntax, more readable * Describe that `APPS_DOMAIN` is a template variable and not an environment variable --- systemd/dnsmasq.sh.template | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/systemd/dnsmasq.sh.template b/systemd/dnsmasq.sh.template index f0168fd94..7942fc961 100644 --- a/systemd/dnsmasq.sh.template +++ b/systemd/dnsmasq.sh.template @@ -1,12 +1,33 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x -if [[ ${CRC_NETWORK_MODE_USER} -eq 1 ]]; then +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + + +if (( ${CRC_NETWORK_MODE_USER:-0} == 1 )); then echo -n "network-mode 'user' detected: skipping dnsmasq configuration" exit 0 fi +# The value of APPS_DOMAIN is set by the +# createdisk-library.sh::copy_systemd_units script during the template +# instantiation. 
So in the end system, the test below should be a +# tautologie (ie, always true if correctly set up) + +# disable this to properly reach the error block (cannot use ${var:-} +# here because of the envsubst instantiating the template) +set +o nounset +if [[ -z "${APPS_DOMAIN}" ]]; then + echo "ERROR: APPS_DOMAIN must be defined to use this script" + exit 1 +fi +set -o nounset + hostName=$(hostname) hostIp=$(hostname --all-ip-addresses | awk '{print $1}') From 7864591a2b2d3f98e62d97ddd8f8884739824a8d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:33:48 +0200 Subject: [PATCH 13/44] ocp-cluster-ca.sh: syntax and reliability improvements * Make more resilient by failing on any error * Better use of script constants * Introduce a cleanup mechanism to remove the temporary cert files * Make more resilient by properly isolating variables (`"$VARIABLE"`) * More readable syntax by removing unnecessary `${VARIABLE}` brackets * Make more resilient by using `oc create ... --dry-run | oc apply -f -` * Make more readable by using `jq` to generate the patch JSON * Make more readable by splitting the long commands over multiple lines * Reuse the existing `wait_for_resource_or_die` --- systemd/ocp-cluster-ca.sh | 106 +++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 36 deletions(-) diff --git a/systemd/ocp-cluster-ca.sh b/systemd/ocp-cluster-ca.sh index fc82e5ced..19a1ba57d 100644 --- a/systemd/ocp-cluster-ca.sh +++ b/systemd/ocp-cluster-ca.sh @@ -4,20 +4,28 @@ # https://access.redhat.com/solutions/5286371 # https://access.redhat.com/solutions/6054981 +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -wait_for_resource configmap +wait_for_resource_or_die configmap -external_ip_path=/opt/crc/eip +CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip # may or may not be there. 
See below ... -if oc get configmap client-ca-custom -n openshift-config; then +if oc get configmap client-ca-custom -n openshift-config 2>/dev/null; then echo "API Server Client CA already rotated..." exit 0 fi +echo "API Server Client CA not rotated. Doing it now ..." + # generate CA CA_FILE_PATH="/tmp/custom-ca.crt" CA_KEY_FILE_PATH="/tmp/custom-ca.key" @@ -28,52 +36,78 @@ CA_SUBJ="/OU=openshift/CN=admin-kubeconfig-signer-custom" CLIENT_SUBJ="/O=system:masters/CN=system:admin" VALIDITY=365 +cleanup() { + rm -f "$CA_FILE_PATH" "$CA_KEY_FILE_PATH" \ + "$CLIENT_CA_FILE_PATH" "$CLIENT_CA_KEY_FILE_PATH" "$CLIENT_CSR_FILE_PATH" + echo "Temp files cleanup complete." +} + +# keep cleanup bound to EXIT; no need to clear ERR early +trap cleanup ERR EXIT + # generate the CA private key -openssl genrsa -out ${CA_KEY_FILE_PATH} 4096 +openssl genrsa -out "$CA_KEY_FILE_PATH" 4096 # Create the CA certificate -openssl req -x509 -new -nodes -key ${CA_KEY_FILE_PATH} -sha256 -days $VALIDITY -out ${CA_FILE_PATH} -subj "${CA_SUBJ}" +openssl req -x509 -new -nodes -key "$CA_KEY_FILE_PATH" -sha256 -days "$VALIDITY" -out "$CA_FILE_PATH" -subj "$CA_SUBJ" # create CSR -openssl req -new -newkey rsa:4096 -nodes -keyout ${CLIENT_CA_KEY_FILE_PATH} -out ${CLIENT_CSR_FILE_PATH} -subj "${CLIENT_SUBJ}" +openssl req -new -newkey rsa:4096 -nodes -keyout "$CLIENT_CA_KEY_FILE_PATH" -out "$CLIENT_CSR_FILE_PATH" -subj "$CLIENT_SUBJ" # sign the CSR with above CA -openssl x509 -extfile <(printf "extendedKeyUsage = clientAuth") -req -in ${CLIENT_CSR_FILE_PATH} -CA ${CA_FILE_PATH} \ - -CAkey ${CA_KEY_FILE_PATH} -CAcreateserial -out ${CLIENT_CA_FILE_PATH} -days $VALIDITY -sha256 - -oc create configmap client-ca-custom -n openshift-config --from-file=ca-bundle.crt=${CA_FILE_PATH} -oc patch apiserver cluster --type=merge -p '{"spec": {"clientCA": {"name": "client-ca-custom"}}}' +openssl x509 -extfile <(printf "extendedKeyUsage = clientAuth") -req -in "$CLIENT_CSR_FILE_PATH" -CA "$CA_FILE_PATH" \ + -CAkey 
"$CA_KEY_FILE_PATH" -CAcreateserial -out "$CLIENT_CA_FILE_PATH" -days "$VALIDITY" -sha256 + +oc create configmap client-ca-custom \ + -n openshift-config \ + --from-file=ca-bundle.crt="$CA_FILE_PATH" \ + --dry-run=client -o yaml \ + | oc apply -f - + +jq -n ' +{ + "spec": { + "clientCA": { + "name": "client-ca-custom" + } + } +}' | oc patch apiserver cluster --type=merge --patch-file=/dev/stdin cluster_name=$(oc config view -o jsonpath='{.clusters[0].name}') -apiserver_url=$(oc config view -o jsonpath='{.clusters[0].cluster.server}') -if [ -f "${external_ip_path}" ]; then - apiserver_url=https://api.$(cat "${external_ip_path}").nip.io:6443 +if [[ -r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + external_ip=$(tr -d '\r\n' < "$CRC_EXTERNAL_IP_FILE_PATH") + apiserver_url=https://api.${external_ip}.nip.io:6443 + echo "INFO: CRC external IP file found. Using apiserver_url='$apiserver_url'." +else + apiserver_url=$(oc config view -o jsonpath='{.clusters[0].cluster.server}') + echo "INFO: CRC external IP file does not exist ($CRC_EXTERNAL_IP_FILE_PATH). Using apiserver_url='$apiserver_url'." 
fi -updated_kubeconfig_path=/opt/crc/kubeconfig -rm -rf "${updated_kubeconfig_path}" +export KUBECONFIG=/opt/crc/kubeconfig +rm -rf "$KUBECONFIG" -oc config set-credentials system:admin --client-certificate=${CLIENT_CA_FILE_PATH} --client-key=${CLIENT_CA_KEY_FILE_PATH} \ - --embed-certs --kubeconfig="${updated_kubeconfig_path}" -oc config set-context system:admin --cluster="${cluster_name}" --namespace=default --user=system:admin --kubeconfig="${updated_kubeconfig_path}" -oc config set-cluster "${cluster_name}" --server="${apiserver_url}" --insecure-skip-tls-verify=true --kubeconfig="${updated_kubeconfig_path}" -oc config use-context system:admin --kubeconfig="${updated_kubeconfig_path}" +oc config set-credentials system:admin \ + --client-certificate="$CLIENT_CA_FILE_PATH" \ + --client-key="$CLIENT_CA_KEY_FILE_PATH" \ + --embed-certs -COUNTER=0 -until oc get co --kubeconfig="${updated_kubeconfig_path}"; -do - if [ $COUNTER == 90 ]; then - echo "Unable to access API server using new client certitificate..." - exit 1 - fi - echo "Acess API server with new client cert, try $COUNTER, hang on...." 
- sleep 2 - ((COUNTER++)) -done +oc config set-context system:admin --cluster="$cluster_name" --namespace=default --user=system:admin +oc config set-cluster "$cluster_name" --server="$apiserver_url" --insecure-skip-tls-verify=true +oc config use-context system:admin +wait_for_resource_or_die clusteroperators 90 2 -oc create configmap admin-kubeconfig-client-ca -n openshift-config --from-file=ca-bundle.crt=${CA_FILE_PATH} \ - --dry-run=client -o yaml | oc replace -f - +oc create configmap admin-kubeconfig-client-ca \ + -n openshift-config \ + --from-file=ca-bundle.crt="$CA_FILE_PATH" \ + --dry-run=client -oyaml \ + | oc apply -f- # copy the new kubeconfig to /opt/kubeconfig -rm -rf /opt/kubeconfig +rm -f /opt/kubeconfig cp /opt/crc/kubeconfig /opt/kubeconfig -chmod 0666 /opt/kubeconfig +chmod 0666 /opt/kubeconfig # keep the file readable by everyone in the system, this is safe + +# cleanup will apply here + +echo "All done" + +exit 0 From d086d67fe20af40c4f51cb720779186e04c2fcee Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:39:08 +0200 Subject: [PATCH 14/44] ocp-clusterid.sh: minor syntax improvements * Make more resilient by failing on any error * Generate the patch file with JQ (more readable, less error-prone) --- systemd/ocp-clusterid.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/systemd/ocp-clusterid.sh b/systemd/ocp-clusterid.sh index 686deaa56..e144f0983 100644 --- a/systemd/ocp-clusterid.sh +++ b/systemd/ocp-clusterid.sh @@ -1,11 +1,20 @@ #!/bin/bash -set -x +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" + +wait_for_resource_or_die clusterversion + uuid=$(uuidgen) -wait_for_resource clusterversion +jq -n --arg id "${uuid}" '{spec: {clusterID: $id}}' \ + | oc patch clusterversion version --type merge --patch-file=/dev/stdin + +echo "All done" -oc patch clusterversion version -p 
"{\"spec\":{\"clusterID\":\"${uuid}\"}}" --type merge +exit 0 From 4cfee5d461a10ccc0f2cffb3cf9d6c77579d9b39 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:44:32 +0200 Subject: [PATCH 15/44] ocp-custom-domain.sh: syntax and reliability improvements * Make more resilient by failing on any error * Stronger verifications on the external-ip file * Make more readable by splitting long lines * Better isolation and cleanup of the temporary cert files * Use of `jq` to set the JSON arguments --- systemd/ocp-custom-domain.sh | 123 +++++++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 47c563ffe..68d8795ee 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -1,49 +1,116 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -if [ ! -f /opt/crc/eip ]; then - echo "external ip not found" +CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip + +if [[ ! -r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + echo "ERROR: CRC external ip file not found ($CRC_EXTERNAL_IP_FILE_PATH)" >&2 exit 1 fi -EIP=$(cat /opt/crc/eip) +EIP=$(tr -d '\r\n' < "$CRC_EXTERNAL_IP_FILE_PATH") + +if [[ -z "$EIP" ]]; then + echo "ERROR: External IP file is empty: $CRC_EXTERNAL_IP_FILE_PATH" >&2 + exit 1 +fi -STEPS_SLEEP_TIME=30 +# Basic IPv4 sanity check; adjust if IPv6 is expected +if [[ ! "$EIP" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then + echo "ERROR: Invalid IPv4 address read from $CRC_EXTERNAL_IP_FILE_PATH: '$EIP'" >&2 + exit 1 +fi -wait_for_resource secret +wait_for_resource_or_die secret + +TMP_KEY_FILE=$(mktemp /tmp/nip.key.XXXXX) +TMP_CRT_FILE=$(mktemp /tmp/nip.crt.XXXXX) + +cleanup() { + rm -f "$TMP_KEY_FILE" "$TMP_CRT_FILE" + echo "Temp files cleanup complete." 
+} + +# Cleanup happens automatically via trap on error or at script end +trap cleanup ERR EXIT # create cert and add as secret -openssl req -newkey rsa:2048 -new -nodes -x509 -days 3650 -keyout /tmp/nip.key -out /tmp/nip.crt -subj "/CN=$EIP.nip.io" -addext "subjectAltName=DNS:apps.$EIP.nip.io,DNS:*.apps.$EIP.nip.io,DNS:api.$EIP.nip.io" -oc delete secret nip-secret -n openshift-config || true -oc create secret tls nip-secret --cert=/tmp/nip.crt --key=/tmp/nip.key -n openshift-config -sleep $STEPS_SLEEP_TIME +openssl req -newkey rsa:2048 -new \ + -nodes -x509 -days 3650 \ + -keyout "$TMP_KEY_FILE" -out "$TMP_CRT_FILE" \ + -subj "/CN=$EIP.nip.io" \ + -addext "subjectAltName=DNS:apps.$EIP.nip.io,DNS:*.apps.$EIP.nip.io,DNS:api.$EIP.nip.io" + +oc delete secret nip-secret -n openshift-config --ignore-not-found +oc create secret tls nip-secret \ + --cert="$TMP_CRT_FILE" \ + --key="$TMP_KEY_FILE" \ + -n openshift-config # patch ingress - cat < /tmp/ingress-patch.yaml -spec: - appsDomain: apps.$EIP.nip.io - componentRoutes: - - hostname: console-openshift-console.apps.$EIP.nip.io - name: console - namespace: openshift-console - servingCertKeyPairSecret: - name: nip-secret - - hostname: oauth-openshift.apps.$EIP.nip.io - name: oauth-openshift - namespace: openshift-authentication - servingCertKeyPairSecret: - name: nip-secret -EOF -oc patch ingresses.config.openshift.io cluster --type=merge --patch-file=/tmp/ingress-patch.yaml +wait_for_resource_or_die ingresses.config.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "appsDomain": "apps.\($eip).nip.io", + "componentRoutes": [ + { + "hostname": "console-openshift-console.apps.\($eip).nip.io", + "name": "console", + "namespace": "openshift-console", + "servingCertKeyPairSecret": { + "name": "nip-secret" + } + }, + { + "hostname": "oauth-openshift.apps.\($eip).nip.io", + "name": "oauth-openshift", + "namespace": "openshift-authentication", + "servingCertKeyPairSecret": { + "name": "nip-secret" + } + } + ] + } +}' | oc 
patch ingresses.config.openshift.io cluster --type=merge --patch-file=/dev/stdin # patch API server to use new CA secret -oc patch apiserver cluster --type=merge -p '{"spec":{"servingCerts": {"namedCertificates":[{"names":["api.'$EIP'.nip.io"],"servingCertificate": {"name": "nip-secret"}}]}}}' +wait_for_resource_or_die apiserver.config.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "servingCerts": { + "namedCertificates": [ + { + "names": [ + "api.\($eip).nip.io" + ], + "servingCertificate": { + "name": "nip-secret" + } + } + ] + } + } +}' | oc patch apiserver cluster --type=merge --patch-file=/dev/stdin # patch image registry route -oc patch -p '{"spec": {"host": "default-route-openshift-image-registry.'$EIP'.nip.io"}}' route default-route -n openshift-image-registry --type=merge +wait_for_resource_or_die route.route.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "host": "default-route-openshift-image-registry.\($eip).nip.io" + } +}' | oc patch route default-route -n openshift-image-registry --type=merge --patch-file=/dev/stdin + +echo "All done" -#wait_cluster_become_healthy "authentication|console|etcd|ingress|openshift-apiserver" +exit 0 From 0298bbfb43e4ec93a0820be56ef5c4f06977b52a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:44:48 +0200 Subject: [PATCH 16/44] ocp-growfs.sh: syntax improvements (will be removed in a follow up commit) --- systemd/ocp-growfs.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/systemd/ocp-growfs.sh b/systemd/ocp-growfs.sh index c637a7c08..b74ae8457 100644 --- a/systemd/ocp-growfs.sh +++ b/systemd/ocp-growfs.sh @@ -1,5 +1,9 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) @@ -8,4 +12,9 @@ root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) rootFS="/sysroot" mount -o remount,rw "${rootFS}" xfs_growfs "${rootFS}" + #mount -o remount,ro "${rootFS}" + +echo "All done" + +exit 0 From 
fcae494041e0c9fb19c6ae4e69c55231d7adcb5d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:47:20 +0200 Subject: [PATCH 17/44] ocp-mco-sshkey.sh: syntax improvements * Make more resilient by failing on any error * Improve the logging and argument validation * Use `jq` to guarantee that the patch file is valid JSON (will be updated in a follow up commit to avoid passing the pub key in the CLI) --- systemd/ocp-mco-sshkey.sh | 40 +++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 0f1d441bd..b412769e7 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -1,22 +1,42 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -pub_key_path="/opt/crc/id_rsa.pub" +CRC_PUB_KEY_PATH="/opt/crc/id_rsa.pub" -if [ ! -f "${pub_key_path}" ]; then - echo "No pubkey file found" +if [[ ! -r "$CRC_PUB_KEY_PATH" ]]; then + echo "ERROR: CRC pubkey file does not exist ($CRC_PUB_KEY_PATH)" exit 1 fi +wait_for_resource_or_die machineconfig/99-master-ssh + echo "Updating the public key resource for machine config operator" -pub_key=$(tr -d '\n\r' < ${pub_key_path}) -wait_for_resource machineconfig -if ! 
oc patch machineconfig 99-master-ssh -p "{\"spec\": {\"config\": {\"passwd\": {\"users\": [{\"name\": \"core\", \"sshAuthorizedKeys\": [\"${pub_key}\"]}]}}}}" --type merge; -then - echo "failed to update public key to machine config operator" - exit 1 -fi +pub_key=$(cat "$CRC_PUB_KEY_PATH" | tr -d '\n\r') + +jq -n --arg key "${pub_key}" ' +{ + "spec": { + "config": { + "passwd": { + "users": [ + { + "name": "core", + "sshAuthorizedKeys": [ $key ] + } + ] + } + } + } +}' | oc patch machineconfig 99-master-ssh --type merge --patch-file=/dev/stdin + +echo "All done" + +exit 0 From df914814bc006a347d7f3f3b419758cfdd45adeb Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:48:47 +0200 Subject: [PATCH 18/44] ocp-userpasswords.sh: syntax improvements * Better use of script constants * Better validation of the arguments * Better logging File will be further updated to prevent leaking passwords in the journal logs. --- systemd/ocp-userpasswords.sh | 37 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index f2a6d2a02..88446df84 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -9,41 +9,48 @@ set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" +CRC_PASS_DEVELOPER_PATH=/opt/crc/pass_developer +CRC_PASS_KUBEADMIN_PATH=/opt/crc/pass_kubeadmin +CRC_HTPASSWD_IMAGE=registry.access.redhat.com/ubi10/httpd-24 + function gen_htpasswd() { if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then - echo "gen_htpasswd needs two arguments: username password" 1>&2 + echo "gen_htpasswd needs two arguments: username password" >&2 return 1 fi - podman run --rm docker.io/xmartlabs/htpasswd "$1" "$2" + podman run --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } -wait_for_resource secret - -if [ ! -f /opt/crc/pass_developer ]; then - echo "developer password does not exist" +if [[ ! 
-r "$CRC_PASS_DEVELOPER_PATH" ]]; then + echo "ERROR: CRC developer password does not exist ($CRC_PASS_DEVELOPER_PATH)" exit 1 fi -if [ ! -f /opt/crc/pass_kubeadmin ]; then - echo "kubeadmin password does not exist" +if [[ ! -r "$CRC_PASS_KUBEADMIN_PATH" ]]; then + echo "ERROR: CRC kubeadmin password does not exist ($CRC_PASS_KUBEADMIN_PATH)" exit 1 fi -echo "generating the kubeadmin and developer passwords ..." +echo "Pulling $CRC_HTPASSWD_IMAGE ..." +podman pull --quiet "$CRC_HTPASSWD_IMAGE" -set +x # /!\ disable the logging to avoid leaking the passwords +wait_for_resource_or_die secret -dev_pass=$(gen_htpasswd developer "$(cat /opt/crc/pass_developer)") -adm_pass=$(gen_htpasswd kubeadmin "$(cat /opt/crc/pass_kubeadmin)") +echo "Generating the kubeadmin and developer passwords ..." +set +x # disable the logging to avoid leaking the passwords + +dev_pass=$(gen_htpasswd developer "$(cat "$CRC_PASS_DEVELOPER_PATH")") +adm_pass=$(gen_htpasswd kubeadmin "$(cat "$CRC_PASS_KUBEADMIN_PATH")") echo "creating the password secret ..." -# use bash <() to use a temporary fd file -# use sed to remove the empty lines +# use bash "<()" to use a temporary fd file (safer to handle secrets) oc create secret generic htpass-secret \ --from-file=htpasswd=<(printf '%s\n%s\n' "$dev_pass" "$adm_pass") \ -n openshift-config \ --dry-run=client -oyaml \ | oc apply -f- -echo "all done" +echo "All done" + +exit 0 From fd191fe425ced7031149c1308e776a76b97626f1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 09:41:21 +0200 Subject: [PATCH 19/44] ocp-userpasswords.service: remove unnecessary sleep No need to sleep `5s` here, the SystemD dependencies should enforce the correct ordering. 
--- systemd/ocp-userpasswords.service | 1 - 1 file changed, 1 deletion(-) diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index b47b9fcf1..04be04957 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -12,7 +12,6 @@ Type=oneshot Restart=on-failure RestartSec=40 ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStartPre=/usr/bin/sleep 5 ExecStart=/usr/local/bin/ocp-userpasswords.sh ExecStartPost=-touch /opt/crc/%n.done From 975315d9f90e493ee292c2bd46b5840f22331ccd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 14:26:46 +0200 Subject: [PATCH 20/44] Define the KUBECONFIG in the systemd service Define the `KUBECONFIG` in the Systemd unit file, so that the CRC scripts don't have to care about it. ``` Environment=KUBECONFIG=/opt/kubeconfig ``` Gives a better separation of concerns. --- systemd/crc-cluster-status.service | 1 + systemd/crc-cluster-status.sh | 1 - systemd/crc-pullsecret.service | 1 + systemd/crc-pullsecret.sh | 1 - systemd/crc-routes-controller.service | 1 + systemd/crc-routes-controller.sh | 2 -- systemd/crc-wait-apiserver-up.service | 1 + systemd/crc-wait-apiserver-up.sh | 1 - systemd/ocp-cluster-ca.service | 1 + systemd/ocp-cluster-ca.sh | 1 - systemd/ocp-clusterid.service | 1 + systemd/ocp-clusterid.sh | 1 - systemd/ocp-custom-domain.service | 1 + systemd/ocp-custom-domain.sh | 1 - systemd/ocp-mco-sshkey.service | 1 + systemd/ocp-mco-sshkey.sh | 1 - systemd/ocp-userpasswords.service | 1 + systemd/ocp-userpasswords.sh | 1 - 18 files changed, 9 insertions(+), 10 deletions(-) diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 74c8af9a0..950acdb90 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -12,6 +12,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh 
ExecStart=/usr/local/bin/crc-cluster-status.sh RemainAfterExit=true diff --git a/systemd/crc-cluster-status.sh b/systemd/crc-cluster-status.sh index 2529779ad..9b25dece3 100644 --- a/systemd/crc-cluster-status.sh +++ b/systemd/crc-cluster-status.sh @@ -6,7 +6,6 @@ set -o nounset set -o errtrace set -x -export KUBECONFIG=/opt/kubeconfig MAXIMUM_LOGIN_RETRY=10 RETRY_DELAY=5 diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index a76b36584..0baece570 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -10,6 +10,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-pullsecret.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index d1665357b..74f62ac3f 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" PULL_SECRETS_FILE="/opt/crc/pull-secret" diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index 65f6e1973..869b4ab95 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -9,6 +9,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-user-mode-networking.sh ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 64d3f6f42..ee15b968d 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -10,8 +10,6 @@ ROUTE_CONTROLLER=/opt/crc/routes-controller.yaml source /usr/local/bin/crc-systemd-common.sh -export 
KUBECONFIG=/opt/kubeconfig - wait_for_resource_or_die pods wait_for_resource_or_die deployments diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 2a6061917..9ab3e168c 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -8,6 +8,7 @@ Before=ocp-delete-mco-leases.service [Service] Type=oneshot Restart=on-failure +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-wait-apiserver-up.sh diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index e18494a01..614c13dbb 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -6,7 +6,6 @@ set -o nounset set -o errtrace source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG=/opt/kubeconfig echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time diff --git a/systemd/ocp-cluster-ca.service b/systemd/ocp-cluster-ca.service index 8f5a8d2ef..832f64093 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -9,6 +9,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-cluster-ca.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-cluster-ca.sh b/systemd/ocp-cluster-ca.sh index 19a1ba57d..01e6f2e12 100644 --- a/systemd/ocp-cluster-ca.sh +++ b/systemd/ocp-cluster-ca.sh @@ -13,7 +13,6 @@ set -x source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" wait_for_resource_or_die configmap diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index f901f8160..2e6ad70f3 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -9,6 +9,7 
@@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-clusterid.sh diff --git a/systemd/ocp-clusterid.sh b/systemd/ocp-clusterid.sh index e144f0983..3beee5eba 100644 --- a/systemd/ocp-clusterid.sh +++ b/systemd/ocp-clusterid.sh @@ -6,7 +6,6 @@ set -o nounset set -o errtrace source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" wait_for_resource_or_die clusterversion diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 93d644ce0..d48b8f4c1 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -10,6 +10,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-custom-domain.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 68d8795ee..6b706a3dc 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 42b4b5587..f2d66c21b 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -9,6 +9,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-mco-sshkey.sh RemainAfterExit=true diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index b412769e7..00a90ed64 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source 
/usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" CRC_PUB_KEY_PATH="/opt/crc/id_rsa.pub" diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 04be04957..5d3d61ad6 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -11,6 +11,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-userpasswords.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index 88446df84..2adeed8be 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" CRC_PASS_DEVELOPER_PATH=/opt/crc/pass_developer CRC_PASS_KUBEADMIN_PATH=/opt/crc/pass_kubeadmin From af25437779681d062aa5c66d44855163f0d4e548 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 14:39:39 +0200 Subject: [PATCH 21/44] systemd: add a synchronization on ocp-wait-apiservices-available Introduce the `ocp-wait-apiservices-available.sh` script, which waits for the `apiservices` to be all available. The APIServices are made up of two groups: - the K8s APIs, which are always available (pods, secrets, configmaps,...) - the OCP APIs, which need OCP Operators and Pods to be ready (routes, projects, ...) This script waits for the second group to finish its initialization. 
--- systemd/crc-check-tap.service | 1 + systemd/crc-cluster-status.service | 1 + systemd/ocp-cluster-ca.service | 1 + systemd/ocp-custom-domain.service | 1 + .../ocp-wait-apiservices-available.service | 21 ++++++ systemd/ocp-wait-apiservices-available.sh | 69 +++++++++++++++++++ 6 files changed, 94 insertions(+) create mode 100644 systemd/ocp-wait-apiservices-available.service create mode 100644 systemd/ocp-wait-apiservices-available.sh diff --git a/systemd/crc-check-tap.service b/systemd/crc-check-tap.service index 38ce68057..46c5e3a24 100644 --- a/systemd/crc-check-tap.service +++ b/systemd/crc-check-tap.service @@ -5,6 +5,7 @@ Before=gv-user-network@tap0.service After=local-fs.target After=crc-env-file-exists.service RequiresMountsFor=/etc/NetworkManager/system-connections +Requires=crc-env-file-exists.service [Service] Type=oneshot diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 950acdb90..565b04e85 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -5,6 +5,7 @@ After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service After=ocp-clusterid.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-cluster-ca.service b/systemd/ocp-cluster-ca.service index 832f64093..c36cafbcd 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit setting custom cluster ca After=crc-wait-apiserver-up.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index d48b8f4c1..ab2cc2f72 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -2,6 +2,7 @@ 
Description=CRC Unit setting nip.io domain for cluster After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done diff --git a/systemd/ocp-wait-apiservices-available.service b/systemd/ocp-wait-apiservices-available.service new file mode 100644 index 000000000..38627eed1 --- /dev/null +++ b/systemd/ocp-wait-apiservices-available.service @@ -0,0 +1,21 @@ +[Unit] +Description=Wait for all Kubernetes APIServices to be Available + +# This service needs network to talk to the k8s API server +Wants=network-online.target +After=network-online.target +After=crc-wait-apiserver-up.service +StartLimitIntervalSec=450 +StartLimitBurst=10 + +[Service] +Type=oneshot +Restart=on-failure +RestartSec=20 +ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStart=/usr/local/bin/ocp-wait-apiservices-available.sh + +Environment=KUBECONFIG=/opt/kubeconfig + +[Install] +WantedBy=crc-custom.target diff --git a/systemd/ocp-wait-apiservices-available.sh b/systemd/ocp-wait-apiservices-available.sh new file mode 100644 index 000000000..1bb89e0a1 --- /dev/null +++ b/systemd/ocp-wait-apiservices-available.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +echo "➡️ Waiting for all APIServices to become available..." + +SECONDS=0 +MAX_RETRY=60 +WAIT_SEC=5 + +for retry in $(seq 1 "$MAX_RETRY"); do + # This command gets the 'status' of the 'Available' condition for every apiservice. + # It produces a list of "True" and/or "False" strings. We then count how many are "False". + APISERVICE_DATA=$(oc get apiservices -o json 2>/dev/null || true) + if [[ -z "$APISERVICE_DATA" ]]; then + UNAVAILABLE_COUNT=999 + echo "⚠️ Couldn't get the list of apiservices ..." 
+ else + UNAVAILABLE_COUNT=$(jq -r ' + [ .items[] + | select(((.status.conditions // []) + | any(.type=="Available" and .status=="True")) | not) + ] | length + ' <<<"$APISERVICE_DATA") + UNAVAILABLE_COUNT=${UNAVAILABLE_COUNT:-0} + fi + + if [ "$UNAVAILABLE_COUNT" -eq 0 ]; then + echo "✅ All APIServices are now available after $SECONDS seconds." + break + fi + + echo + echo "⏳ Still waiting for $UNAVAILABLE_COUNT APIService(s) to become available. Retrying in $WAIT_SEC seconds." + echo "--------------------------------------------------------------------------------" + echo "Unavailable services and their messages:" + + # Get all apiservices as JSON and pipe to jq for filtering and formatting. + # The '-r' flag outputs raw strings instead of JSON-quoted strings. + if ! oc get apiservices -o json | jq -r ' + .items[] | + . as $item | + ( + $item.status.conditions[]? | + select(.type == "Available" and .status == "False") + ) | + " - \($item.metadata.name): \(.reason) - \(.message)" + ' + then + echo "⚠️ Unable to list unavailable APIServices details (will retry)" >&2 + fi + + echo "--------------------------------------------------------------------------------" + + # If it's the last attempt, log a failure message before exiting + if (( retry == MAX_RETRY )); then + echo "ERROR: Timed out waiting for the api-services to get ready, after $MAX_RETRY attempts x $WAIT_SEC seconds = $SECONDS seconds." >&2 + exit 1 + fi + + sleep "$WAIT_SEC" +done + +echo "🎉 Done." + +exit 0 From 550012728518bebfd08c124802a69d2907819a4e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 16:17:58 +0200 Subject: [PATCH 22/44] crc-wait-apiserver-up.sh: try more often The `crc-wait-apiserver-up` wait for the K8s APIServer to be up and running. This commit makes the patch try 60 times with 5s delay, instead of 5 times with 60s delay. The script becomes more reactive to the APIServer activation. 
--- systemd/crc-wait-apiserver-up.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 614c13dbb..209592f48 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -9,7 +9,7 @@ source /usr/local/bin/crc-systemd-common.sh echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time -wait_for_resource_or_die node 4 60 +wait_for_resource_or_die node 60 5 echo "node resource available, APIServer is ready." From 4408e0a7f6ad2247ecc972594dc3a2ae7adfd8a4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 3 Oct 2025 10:25:26 +0200 Subject: [PATCH 23/44] systemd: add synchronization on crc-wait-node-ready This commit adds a synchronization point on the `ready` status of the CRC node. Before the node is ready, services can interact with the K8s APIServer, but user (and OCP) services won't start their deployment before the CRC node is ready. This synchronization point prevents other services (like the `ocp-wait-apiservices`) from waiting in vain while their targets have not yet started their own deployment.
--- systemd/crc-cluster-status.service | 1 + systemd/crc-wait-node-ready.service | 19 +++++++++ systemd/crc-wait-node-ready.sh | 39 +++++++++++++++++++ .../ocp-wait-apiservices-available.service | 2 + 4 files changed, 61 insertions(+) create mode 100644 systemd/crc-wait-node-ready.service create mode 100644 systemd/crc-wait-node-ready.sh diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 565b04e85..4bd12abd2 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -6,6 +6,7 @@ After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service After=ocp-clusterid.service After=ocp-wait-apiservices-available.service +After=crc-wait-node-ready.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/crc-wait-node-ready.service b/systemd/crc-wait-node-ready.service new file mode 100644 index 000000000..6daf0472d --- /dev/null +++ b/systemd/crc-wait-node-ready.service @@ -0,0 +1,19 @@ +[Unit] +Description=CRC Unit waiting till k8s node is ready +Requires=kubelet.service +After=kubelet.service +After=crc-env-file-exists.service +After=crc-wait-apiserver-up.service +StartLimitIntervalSec=450 +StartLimitBurst=10 + +[Service] +Type=oneshot +Restart=on-failure +RestartSec=10 +Environment=KUBECONFIG=/opt/kubeconfig +ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStart=/usr/local/bin/crc-wait-node-ready.sh + +[Install] +WantedBy=crc-custom.target diff --git a/systemd/crc-wait-node-ready.sh b/systemd/crc-wait-node-ready.sh new file mode 100644 index 000000000..0e3d43380 --- /dev/null +++ b/systemd/crc-wait-node-ready.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /usr/local/bin/crc-systemd-common.sh + +MAX_RETRY=150 +WAIT_SEC=2 +NODE_NAME=node/crc +# Loop from 1 up to max_retry +for retry in $(seq 1 "$MAX_RETRY"); do + node_status=$(oc get "$NODE_NAME" --no-headers | 
awk '{print $2}' || true) + node_status=${node_status:-""} + + # Check if the node status is "Ready" + if [[ $node_status == "Ready" ]]; then + echo "CRC node is ready." + exit 0 + fi + + echo "CRC node is not ready. Status: $node_status" + + # If it's the last attempt, log a failure message before exiting + if (( retry == MAX_RETRY )); then + echo "Error: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 + exit 1 + fi + + # Wait before the next attempt + echo "Waiting for crc node to be ready ... (Attempt ${retry}/${MAX_RETRY})" + sleep "$WAIT_SEC" +done + +# cannot be reached + +exit 1 diff --git a/systemd/ocp-wait-apiservices-available.service b/systemd/ocp-wait-apiservices-available.service index 38627eed1..a82cde3a7 100644 --- a/systemd/ocp-wait-apiservices-available.service +++ b/systemd/ocp-wait-apiservices-available.service @@ -5,6 +5,8 @@ Description=Wait for all Kubernetes APIServices to be Available Wants=network-online.target After=network-online.target After=crc-wait-apiserver-up.service +After=crc-wait-node-ready.service +Requires=crc-wait-node-ready.service StartLimitIntervalSec=450 StartLimitBurst=10 From 69ae19a1208d085c58e5aecca1460d1e920d94e4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 10:58:23 +0200 Subject: [PATCH 24/44] ocp-growfs: remove This script has been broken for a while, will be handled by cloud-init. --- docs/self-sufficient-bundle.md | 1 - systemd/ocp-growfs.service | 12 ------------ systemd/ocp-growfs.sh | 20 -------------------- 3 files changed, 33 deletions(-) delete mode 100644 systemd/ocp-growfs.service delete mode 100644 systemd/ocp-growfs.sh diff --git a/docs/self-sufficient-bundle.md b/docs/self-sufficient-bundle.md index e7016b157..3cb8a221d 100644 --- a/docs/self-sufficient-bundle.md +++ b/docs/self-sufficient-bundle.md @@ -15,7 +15,6 @@ services to do their work. 
| `ocp-cluster-ca.service` | ocp | /opt/crc/custom-ca.crt | none | | `ocp-clusterid.service` | ocp | none | none | | `ocp-custom-domain.service` | ocp | none | none | -| `ocp-growfs.service` | ocp | none | none | | `ocp-userpasswords.service` | ocp | /opt/crc/pass_{kubeadmin, developer} | none | In addition to the above services we have `ocp-cluster-ca.path`, `crc-pullsecret.path` and `ocp-userpasswords.path` that monitors the filesystem paths diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service deleted file mode 100644 index 0790e4dd7..000000000 --- a/systemd/ocp-growfs.service +++ /dev/null @@ -1,12 +0,0 @@ -[Unit] -Description=CRC Unit to grow the root filesystem -Requires=crc-custom.target -After=crc-env-file-exists.service - -[Service] -Type=oneshot -ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-growfs.sh - -[Install] -WantedBy=multi-user.target diff --git a/systemd/ocp-growfs.sh b/systemd/ocp-growfs.sh deleted file mode 100644 index b74ae8457..000000000 --- a/systemd/ocp-growfs.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -o errexit -set -o nounset -set -o errtrace -set -x - -root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) -/usr/bin/growpart "${root_partition%?}" "${root_partition#/dev/???}" - -rootFS="/sysroot" -mount -o remount,rw "${rootFS}" -xfs_growfs "${rootFS}" - -#mount -o remount,ro "${rootFS}" - -echo "All done" - -exit 0 From eddf3e8cd97dd8ab9d34ba3ff48178d9ae7a4d08 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 16:20:48 +0200 Subject: [PATCH 25/44] crc-aws-fetch-secrets.sh: new script for mapt to fetch the secrets from AWS-CLI This script offloads `mapt` and other AWS deployers from the task of fetching the secrets from AWS IMDS service. 
This script should be included in the `cloud-init` user-data configuration file, with this kind of invocation: ``` /usr/local/bin/crc-aws-fetch-secrets.sh \ "{{ .SSMPullSecretName }}" \ "{{ .SSMKubeAdminPasswordName }}" \ "{{ .SSMDeveloperPasswordName }}" ``` where the parameters specify the names of the three secrets in the AWS SSM Parameter Store. --- systemd/crc-aws-fetch-secrets.sh | 126 +++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 systemd/crc-aws-fetch-secrets.sh diff --git a/systemd/crc-aws-fetch-secrets.sh b/systemd/crc-aws-fetch-secrets.sh new file mode 100644 index 000000000..fdc3dd947 --- /dev/null +++ b/systemd/crc-aws-fetch-secrets.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace +set -x + +# set -x is safe, the secrets are passed via stdin + +AWS_CLI_IMG=docker.io/amazon/aws-cli +MIN_CHAR_COUNT=8 # minimum number of chars for the secret to be + # assumed valid + +umask 0077 # 0600 file permission for secrets +install -d -m 0700 /opt/crc # ensure that the target directory exists + +PULL_SECRETS_KEY=${1:-} +KUBEADM_PASS_KEY=${2:-} +DEVELOPER_PASS_KEY=${3:-} + +if [[ -z "$PULL_SECRETS_KEY" || -z "$KUBEADM_PASS_KEY" || -z "$DEVELOPER_PASS_KEY" ]]; then + echo "ERROR: expected to receive 3 parameters: PULL_SECRETS_KEY KUBEADM_PASS_KEY DEVELOPER_PASS_KEY" + exit 1 +fi + +SECONDS=0 +podman pull --quiet "$AWS_CLI_IMG" +echo "Took $SECONDS seconds to pull the $AWS_CLI_IMG" + +wait_imds_available_and_get_region() { + total_timeout_minutes=5 + retry_interval_seconds=5 + + IMDS_TOKEN_COMMAND=( + curl + --connect-timeout 1 + -X PUT + "http://169.254.169.254/latest/api/token" + -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" + -Ssf + ) + success=false + deadline=$(( $(date +%s) + (total_timeout_minutes * 60) )) + while [[ $(date +%s) -lt $deadline ]]; do + # By placing the command in an 'if' condition, we can test its exit code + # without triggering 'set -e'.
The output is still captured. + if TOKEN=$("${IMDS_TOKEN_COMMAND[@]}"); then + # This block only runs if the curl command succeeds (exit code 0) + success=true + echo "Successfully fetched token." >&2 + break # Exit the loop on success + fi + + # This block runs if the curl command fails + echo "Failed to connect. Retrying in $retry_interval_seconds seconds..." >&2 + sleep "$retry_interval_seconds" + done + + if [[ "$success" != "true" ]]; then + echo "ERROR: Could not fetch token after $total_timeout_minutes minutes." >&2 + return 1 + fi + + # Then, use the token to get the region + echo "Fetching the AWS region ..." + curl -Ssf -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region > /tmp/aws-region + echo >> /tmp/aws-region # add EOL at EOF, for consistency + echo "AWS region: $(< /tmp/aws-region)" +} + +( + set +x # disable the xtrace as the token would be leaked + echo "Waiting for the AWS IMDS service to be available ..." + SECONDS=0 + wait_imds_available_and_get_region + echo "Took $SECONDS for the IMDS service to become available." +) + +missing_secrets=0 + +save_secret() { + name=$1 + key=$2 + dest=$3 + + # --log-driver=none avoids that the journal captures the stdout + # logs of podman and leaks the passwords in the journal ... + if ! podman run \ + --name "cloud-init-fetch-$name" \ + --env AWS_REGION="$(< /tmp/aws-region)" \ + --rm \ + "$AWS_CLI_IMG" \ + ssm get-parameter \ + --name "$key" \ + --with-decryption \ + --query "Parameter.Value" \ + --output text \ + > "${dest}.tmp" + then + rm -f "${dest}.tmp" + echo "ERROR: failed to get the '$name' secret ... (fetched from $key)" + ((missing_secrets += 1)) + return + fi + char_count=$(wc -c < "${dest}.tmp") + if (( char_count < MIN_CHAR_COUNT )); then + echo "ERROR: the content of the '$name' secret is too short ... 
(fetched from $key)" + rm -f "${dest}.tmp" + ((missing_secrets += 1)) + return + fi + + mv "${dest}.tmp" "${dest}" # atomic creation of the file +} + +save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret +save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin +save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer + +if (( missing_secrets != 0 )); then + echo "ERROR: failed to fetch $missing_secrets secrets ..." + exit 1 +fi + +exit 0 From a271d9c21b8bda5ef07714f8ec08ed8fe7580fb4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 17:06:34 +0200 Subject: [PATCH 26/44] systemd: let systemd enforce that the mandatory secrets files exist This commit moves the definition of the secret file locations from the scripts to the SystemD unit. This way, SystemD can enforce that the files exist before launching the relevant services. --- systemd/crc-pullsecret.service | 3 ++- systemd/crc-pullsecret.sh | 3 ++- systemd/ocp-custom-domain.service | 3 ++- systemd/ocp-custom-domain.sh | 8 +++++++- systemd/ocp-userpasswords.service | 4 +++- systemd/ocp-userpasswords.sh | 12 ++++++++++-- 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 0baece570..8781a0bf1 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -12,7 +12,8 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/crc-pullsecret.sh +ExecStartPre=/usr/bin/test -f /opt/crc/pull-secret +ExecStart=/usr/local/bin/crc-pullsecret.sh /opt/crc/pull-secret ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index 74f62ac3f..0b636a67b 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -8,7 +8,7 @@ set -x source /usr/local/bin/crc-systemd-common.sh 
-PULL_SECRETS_FILE="/opt/crc/pull-secret" +PULL_SECRETS_FILE="${1:-}" wait_for_resource_or_die secret @@ -28,6 +28,7 @@ fi echo "Cluster doesn't have the pull secrets. Setting them from $PULL_SECRETS_FILE ..." +# enforced by systemd if [[ ! -r "$PULL_SECRETS_FILE" ]]; then echo "ERROR: $PULL_SECRETS_FILE is missing or unreadable" 1>&2 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index ab2cc2f72..5fb41e38a 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -6,6 +6,7 @@ After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done +AssertPathExists=/opt/crc/eip [Service] Type=oneshot @@ -13,7 +14,7 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-custom-domain.sh +ExecStart=/usr/local/bin/ocp-custom-domain.sh /opt/crc/eip ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 6b706a3dc..023df73b5 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -8,8 +8,14 @@ set -x source /usr/local/bin/crc-systemd-common.sh -CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip +CRC_EXTERNAL_IP_FILE_PATH="${1:-}" +if [[ -z "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + echo "ERROR: expected to receive the external IP file as first argument ..." >&2 + exit 1 +fi + +# enforced by systemd if [[ ! 
-r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then echo "ERROR: CRC external ip file not found ($CRC_EXTERNAL_IP_FILE_PATH)" >&2 exit 1 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 5d3d61ad6..e16bf311b 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -13,7 +13,9 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-userpasswords.sh +ExecStartPre=/usr/bin/test -f /opt/crc/pass_developer +ExecStartPre=/usr/bin/test -f /opt/crc/pass_kubeadmin +ExecStart=/usr/local/bin/ocp-userpasswords.sh /opt/crc/pass_kubeadmin /opt/crc/pass_developer ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index 2adeed8be..3a80cd853 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -8,8 +8,14 @@ set -x source /usr/local/bin/crc-systemd-common.sh -CRC_PASS_DEVELOPER_PATH=/opt/crc/pass_developer -CRC_PASS_KUBEADMIN_PATH=/opt/crc/pass_kubeadmin +CRC_PASS_KUBEADMIN_PATH=${1:-} +CRC_PASS_DEVELOPER_PATH=${2:-} + +if [[ -z "$CRC_PASS_KUBEADMIN_PATH" || -z "$CRC_PASS_DEVELOPER_PATH" ]]; then + echo "ERROR: expected to receive the kubeadmin password file as 1st arg and the dev password file as 2nd arg. Got '$CRC_PASS_KUBEADMIN_PATH' and '$CRC_PASS_DEVELOPER_PATH'" + exit 1 +fi + CRC_HTPASSWD_IMAGE=registry.access.redhat.com/ubi10/httpd-24 function gen_htpasswd() { @@ -21,11 +27,13 @@ function gen_htpasswd() { podman run --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } +# enforced by systemd if [[ ! -r "$CRC_PASS_DEVELOPER_PATH" ]]; then echo "ERROR: CRC developer password does not exist ($CRC_PASS_DEVELOPER_PATH)" exit 1 fi +# enforced by systemd if [[ ! 
-r "$CRC_PASS_KUBEADMIN_PATH" ]]; then echo "ERROR: CRC kubeadmin password does not exist ($CRC_PASS_KUBEADMIN_PATH)" exit 1 From 7116ea0d7cdeb50c7a84c3ba108ccb3d46d3b4d4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 21:41:21 +0200 Subject: [PATCH 27/44] systemd: prevent podman from leaking passwords in the journal A review of the systemd journal logs of the different services highlighted that the SystemD journal captures information about the Podman containers via a Podman-internal logging mechanism. This commit disables the logging mechanism to the containers handling secrets. --- systemd/crc-aws-fetch-secrets.sh | 1 + systemd/ocp-userpasswords.sh | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/systemd/crc-aws-fetch-secrets.sh b/systemd/crc-aws-fetch-secrets.sh index fdc3dd947..8d0174d43 100644 --- a/systemd/crc-aws-fetch-secrets.sh +++ b/systemd/crc-aws-fetch-secrets.sh @@ -89,6 +89,7 @@ save_secret() { if ! podman run \ --name "cloud-init-fetch-$name" \ --env AWS_REGION="$(< /tmp/aws-region)" \ + --log-driver=none \ --rm \ "$AWS_CLI_IMG" \ ssm get-parameter \ diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index 3a80cd853..f3e508430 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -24,7 +24,9 @@ function gen_htpasswd() { return 1 fi - podman run --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" + # --log-driver=none avoids that the journal captures the stdout + # logs of podman and leaks the passwords in the journal ... 
+ podman run --log-driver=none --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } # enforced by systemd From 28535efeb126683bc38d30e0f205f98eabfc248d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 21:41:51 +0200 Subject: [PATCH 28/44] systemd: log the wait durations To ease the quick glance review of the CRC boot timing, this commit adds a simple timing measurement, based on Bash's `SECONDS` special variable (which automatically tracks the time elapsed since its `SECONDS=0` reset). For more precise time tracking, refer to the journal timestamps of the services. --- systemd/crc-systemd-common.sh | 3 +++ systemd/crc-wait-apiserver-up.sh | 4 +++- systemd/crc-wait-node-ready.sh | 7 ++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/systemd/crc-systemd-common.sh b/systemd/crc-systemd-common.sh index c384eb39d..583ad22ad 100644 --- a/systemd/crc-systemd-common.sh +++ b/systemd/crc-systemd-common.sh @@ -16,6 +16,9 @@ function wait_for_resource_or_die() { exit 1 # this is wait_for_resource_or_die, so die ... fi + local start_time + start_time=$(date +%s) + # Loop from 1 up to max_retry for (( retry=1; retry<=max_retry; retry++ )); do # Try the command. If it succeeds, exit the loop. diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 209592f48..25bfe8b2e 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -7,11 +7,13 @@ set -o errtrace source /usr/local/bin/crc-systemd-common.sh +SECONDS=0 + echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time wait_for_resource_or_die node 60 5 -echo "node resource available, APIServer is ready." +echo "node resource available, APIServer is ready after $SECONDS seconds."
echo "All done" diff --git a/systemd/crc-wait-node-ready.sh b/systemd/crc-wait-node-ready.sh index 0e3d43380..dd3d59d65 100644 --- a/systemd/crc-wait-node-ready.sh +++ b/systemd/crc-wait-node-ready.sh @@ -7,6 +7,7 @@ set -o errtrace source /usr/local/bin/crc-systemd-common.sh +SECONDS=0 MAX_RETRY=150 WAIT_SEC=2 NODE_NAME=node/crc @@ -17,7 +18,7 @@ for retry in $(seq 1 "$MAX_RETRY"); do # Check if the node status is "Ready" if [[ $node_status == "Ready" ]]; then - echo "CRC node is ready." + echo "CRC node is ready after $SECONDS seconds." exit 0 fi @@ -25,12 +26,12 @@ for retry in $(seq 1 "$MAX_RETRY"); do # If it's the last attempt, log a failure message before exiting if (( retry == MAX_RETRY )); then - echo "Error: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 + echo "ERROR: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 exit 1 fi # Wait before the next attempt - echo "Waiting for crc node to be ready ... (Attempt ${retry}/${MAX_RETRY})" + echo "Waiting $WAIT_SEC seconds for crc node to be ready ... (Attempt ${retry}/${MAX_RETRY})" sleep "$WAIT_SEC" done From 5636dbf946f498f274a282a945d20aa7659cffcd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 12:03:42 +0200 Subject: [PATCH 29/44] createdisk-library: add the ability to upload `unit-name.service.d` config-override directories SystemD allows overriding the definition of services by writing new properties in the `unit-name.service.d/override.conf` files. This commit allows the CRC image creation script to properly upload these files and directories to the VM image.
--- createdisk-library.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/createdisk-library.sh b/createdisk-library.sh index 05b08405b..b55e3a419 100755 --- a/createdisk-library.sh +++ b/createdisk-library.sh @@ -410,6 +410,7 @@ function copy_systemd_units() { ${SSH} core@${VM_IP} -- 'mkdir -p /home/core/systemd-units && mkdir -p /home/core/systemd-scripts' ${SCP} systemd/crc-*.service core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.target core@${VM_IP}:/home/core/systemd-units/ + ${SCP} -r systemd/*.d core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.sh core@${VM_IP}:/home/core/systemd-scripts/ case "${BUNDLE_TYPE}" in @@ -419,7 +420,7 @@ function copy_systemd_units() { ;; esac - ${SSH} core@${VM_IP} -- 'sudo cp /home/core/systemd-units/* /etc/systemd/system/ && sudo cp /home/core/systemd-scripts/* /usr/local/bin/' + ${SSH} core@${VM_IP} -- 'sudo cp -r /home/core/systemd-units/* /etc/systemd/system/ && sudo cp /home/core/systemd-scripts/* /usr/local/bin/' ${SSH} core@${VM_IP} -- 'ls /home/core/systemd-scripts/ | xargs -t -I % sudo chmod +x /usr/local/bin/%' ${SSH} core@${VM_IP} -- 'sudo restorecon -rv /usr/local/bin' From 2855386c1bc47613843d947907dc86d17bd0cc40 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 12:04:02 +0200 Subject: [PATCH 30/44] ovs-configuration.service.d/mute-console.conf: mute the journal logs This override prevents the `ovs-configuration.service` from logging its xtrace execution into the journal and the console. This service is very verbose, and makes the console impossible to follow in real-time. Instead, its output is logged in a `/var/log` file. 
--- systemd/ovs-configuration.service.d/mute-console.conf | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 systemd/ovs-configuration.service.d/mute-console.conf diff --git a/systemd/ovs-configuration.service.d/mute-console.conf b/systemd/ovs-configuration.service.d/mute-console.conf new file mode 100644 index 000000000..749ed6d5d --- /dev/null +++ b/systemd/ovs-configuration.service.d/mute-console.conf @@ -0,0 +1,3 @@ +[Service] +StandardOutput=append:/var/log/ovs-configure.log +StandardError=append:/var/log/ovs-configure.log From fe06a1fafa4f0115776f97e16640f6f1661cde25 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 14:55:19 +0200 Subject: [PATCH 31/44] crc-pullsecret.service: retry more often This commit reduces the Restart duration of the service. The SystemD dependencies should already avoid any failure of the script. --- systemd/crc-pullsecret.service | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 8781a0bf1..bc2203a58 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -3,13 +3,13 @@ Description=CRC Unit for adding pull secret to cluster After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 -StartLimitBurst=10 +StartLimitBurst=40 ConditionPathExists=!/opt/crc/%n.done [Service] Type=oneshot Restart=on-failure -RestartSec=40 +RestartSec=10 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStartPre=/usr/bin/test -f /opt/crc/pull-secret From 5e27e7bff04e6263bd2fae9d755d9d64a337ed9e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 15:28:52 +0200 Subject: [PATCH 32/44] crc-pullsecret.service: only run after cloud-final.service Add a dependency on the `cloud-final.service`, to be sure that the pull-secrets have been pulled when the service starts. 
--- systemd/crc-pullsecret.service | 1 + 1 file changed, 1 insertion(+) diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index bc2203a58..da313824e 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -2,6 +2,7 @@ Description=CRC Unit for adding pull secret to cluster After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=40 ConditionPathExists=!/opt/crc/%n.done From 9c159a4c9b95214f5159e1b9511bd995a34135ae Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 15:29:02 +0200 Subject: [PATCH 33/44] ocp-userpasswords.service: only run after cloud-final.service Add a dependency on the `cloud-final.service` to be sure that the CRC passwords have been fetched before starting. --- systemd/ocp-userpasswords.service | 1 + 1 file changed, 1 insertion(+) diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index e16bf311b..9eda0cc50 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -3,6 +3,7 @@ Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done From 9b2e043dba9b293affbdc4ab45525a3fd73c37e6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 16:18:52 +0200 Subject: [PATCH 34/44] crc-custom.target: reformulate the dependencies Reformulate the dependencies of the `crc-custom.target` to avoid startup deadlocks. Load `crc-custom.target` as a dependency of the `kubelet.service`. 
--- systemd/crc-custom.target | 8 ++++---- systemd/kubelet.service.d/wants-crc-custom.conf | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 systemd/kubelet.service.d/wants-crc-custom.conf diff --git a/systemd/crc-custom.target b/systemd/crc-custom.target index cffb4d5b9..8fa380f48 100644 --- a/systemd/crc-custom.target +++ b/systemd/crc-custom.target @@ -1,5 +1,5 @@ [Unit] -Description=crc custom target -Requires=kubelet-dependencies.target -Requires=crc-env-file-exists.service -After=kubelet-dependencies.target +Description=CRC custom target +Requires=crc-wait-apiserver-up.service +Requires=crc-cluster-status.service +After=crc-wait-apiserver-up.service crc-cluster-status.service diff --git a/systemd/kubelet.service.d/wants-crc-custom.conf b/systemd/kubelet.service.d/wants-crc-custom.conf new file mode 100644 index 000000000..be4b777c2 --- /dev/null +++ b/systemd/kubelet.service.d/wants-crc-custom.conf @@ -0,0 +1,3 @@ +[Unit] +Wants=crc-custom.target +Before=crc-custom.target From 9f8b5655edbe51cd08795890b4cf5fc8cde814b3 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 16:43:53 +0200 Subject: [PATCH 35/44] ocp-mco-sshkey.service: ensure that the pubkey is there before starting Ensure that the pub key has been fetched before starting the service. 
--- systemd/ocp-mco-sshkey.service | 4 +++- systemd/ocp-mco-sshkey.sh | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index f2d66c21b..5ac8f4c6e 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -2,6 +2,7 @@ Description=CRC Unit patching the MachineConfig to add new ssh key After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=10 @@ -11,7 +12,8 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-mco-sshkey.sh +ExecStartPre=/usr/bin/test -f /opt/crc/id_rsa.pub +ExecStart=/usr/local/bin/ocp-mco-sshkey.sh /opt/crc/id_rsa.pub RemainAfterExit=true [Install] diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 00a90ed64..49b9c5c6b 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -8,8 +8,14 @@ set -x source /usr/local/bin/crc-systemd-common.sh -CRC_PUB_KEY_PATH="/opt/crc/id_rsa.pub" +CRC_PUB_KEY_PATH="${1:-}" +if [[ -z "$CRC_PUB_KEY_PATH" ]]; then + echo "ERROR: expected to receive the path to the pub key file as first argument." + exit 1 +fi + +# enforced by systemd if [[ ! -r "$CRC_PUB_KEY_PATH" ]]; then echo "ERROR: CRC pubkey file does not exist ($CRC_PUB_KEY_PATH)" exit 1 From 31b5633ec6bef2016e434cccd68bb1f00862d8db Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 16:43:53 +0200 Subject: [PATCH 36/44] ocp-mco-sshkey.sh: don't expose the pub key to the journal Better use of `jq` to ensure that the public key isn't exposed in the journal logs. Exposing a public key isn't a security leak, but better avoid disclosing it as a good practice. 
--- systemd/ocp-mco-sshkey.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 49b9c5c6b..31bb5bae8 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -24,9 +24,12 @@ fi wait_for_resource_or_die machineconfig/99-master-ssh echo "Updating the public key resource for machine config operator" -pub_key=$(cat "$CRC_PUB_KEY_PATH" | tr -d '\n\r') -jq -n --arg key "${pub_key}" ' +# Use --rawfile to read the key file directly into a jq variable named 'pub_key'. +# The key's content is never exposed as a command-line argument. +# We use jq's rtrimstr function to remove any trailing newlines from the file. + +jq -n --rawfile pub_key "$CRC_PUB_KEY_PATH" ' { "spec": { "config": { @@ -34,7 +37,10 @@ jq -n --arg key "${pub_key}" ' "users": [ { "name": "core", - "sshAuthorizedKeys": [ $key ] + "sshAuthorizedKeys": [ + # Trim trailing newlines and carriage returns from the slurped file content + $pub_key | rtrimstr("\n") | rtrimstr("\r") + ] } ] } From b1ee39f7726d6fb265a00cf123c4fce44c4bcad5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 14 Oct 2025 21:48:26 +0200 Subject: [PATCH 37/44] ocp-custom-domain.service: don't use AssertPathExists `AssertPathExists` is checked before the condition is tested. Use a `ExecStartPre` directive instead. 
``` ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ``` --- systemd/ocp-custom-domain.service | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 5fb41e38a..b879410c6 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -6,7 +6,6 @@ After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done -AssertPathExists=/opt/crc/eip [Service] Type=oneshot @@ -14,6 +13,7 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStartPre=/usr/bin/test -f /opt/crc/eip ExecStart=/usr/local/bin/ocp-custom-domain.sh /opt/crc/eip ExecStartPost=-touch /opt/crc/%n.done From 9c56363e5f2e06118b0575a467d00a5d59e323f7 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 15 Oct 2025 09:37:23 +0200 Subject: [PATCH 38/44] systemd: remove the dependency on crc-env-file-exists.service --- systemd/crc-check-tap.service | 2 -- systemd/crc-cluster-status.service | 1 - systemd/crc-dnsmasq.service | 1 - systemd/crc-env-file-exists.service | 20 -------------------- systemd/crc-pullsecret.service | 1 - systemd/crc-routes-controller.service | 1 - systemd/crc-wait-apiserver-up.service | 1 - systemd/crc-wait-node-ready.service | 1 - systemd/ocp-clusterid.service | 1 - systemd/ocp-custom-domain.service | 1 - systemd/ocp-mco-sshkey.service | 1 - systemd/ocp-userpasswords.service | 1 - 12 files changed, 32 deletions(-) delete mode 100644 systemd/crc-env-file-exists.service diff --git a/systemd/crc-check-tap.service b/systemd/crc-check-tap.service index 46c5e3a24..473e77a29 100644 --- a/systemd/crc-check-tap.service +++ b/systemd/crc-check-tap.service @@ -3,9 +3,7 @@ Description=Ensure that tap0 network configuration is disabled when not necessar Before=NetworkManager.service Before=gv-user-network@tap0.service 
After=local-fs.target -After=crc-env-file-exists.service RequiresMountsFor=/etc/NetworkManager/system-connections -Requires=crc-env-file-exists.service [Service] Type=oneshot diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 4bd12abd2..fd7f70b19 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit checking if cluster is ready -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index a01107210..a53f2b320 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -1,7 +1,6 @@ [Unit] Description=CRC Unit for configuring dnsmasq Wants=ovs-configuration.service -After=crc-env-file-exists.service After=ovs-configuration.service Before=kubelet-dependencies.target StartLimitIntervalSec=30 diff --git a/systemd/crc-env-file-exists.service b/systemd/crc-env-file-exists.service deleted file mode 100644 index 35a9fb344..000000000 --- a/systemd/crc-env-file-exists.service +++ /dev/null @@ -1,20 +0,0 @@ -[Unit] -Description=Wait for /etc/sysconfig/crc-env file to be populated - -[Service] -# This service runs a command once and then exits. -Type=oneshot - -# This is the magic part. It keeps the service in an 'active' state -# after the command exits, so other services can see it succeeded. -RemainAfterExit=yes - -# This is the command that waits for the file. -# It checks every second if the file does not exist ('! -f'). -# Once the file is found, the loop exits, the command succeeds, and the service is 'active'. -ExecStart=/bin/sh -c 'while [ ! -f /etc/sysconfig/crc-env ]; do sleep 1; done' -TimeoutStartSec=300 - -[Install] -# Ensure this service is started during the normal boot process. 
-WantedBy=crc-custom.target diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index da313824e..4523549f0 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit for adding pull secret to cluster -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=cloud-final.service StartLimitIntervalSec=450 diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index 869b4ab95..e73f71100 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit starting routes controller -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 9ab3e168c..78ee273c9 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -2,7 +2,6 @@ Description=CRC Unit waiting till k8s API server is up Requires=kubelet.service After=kubelet.service -After=crc-env-file-exists.service Before=ocp-delete-mco-leases.service [Service] diff --git a/systemd/crc-wait-node-ready.service b/systemd/crc-wait-node-ready.service index 6daf0472d..facefe55c 100644 --- a/systemd/crc-wait-node-ready.service +++ b/systemd/crc-wait-node-ready.service @@ -2,7 +2,6 @@ Description=CRC Unit waiting till k8s node is ready Requires=kubelet.service After=kubelet.service -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 2e6ad70f3..d9909f29c 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit setting random cluster ID -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service 
StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index b879410c6..db19d0fa2 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit setting nip.io domain for cluster -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 5ac8f4c6e..94ea9c203 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit patching the MachineConfig to add new ssh key -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=cloud-final.service StartLimitIntervalSec=450 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 9eda0cc50..30919f51c 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -1,7 +1,6 @@ [Unit] Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=cloud-final.service StartLimitIntervalSec=450 From 3191110c4750a0a2b6be283facaec379a1da4d36 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 15 Oct 2025 09:59:56 +0200 Subject: [PATCH 39/44] systemd: update the self-sufficient/user-mode tests to avoid relying on the env file --- createdisk-library.sh | 1 + systemd/crc-self-sufficient-env.sh | 34 +++++++++++----- systemd/crc-test-vsock.py | 63 +++++++++++++++++++++++++++++ systemd/crc-user-mode-networking.sh | 2 +- 4 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 systemd/crc-test-vsock.py diff --git a/createdisk-library.sh b/createdisk-library.sh index b55e3a419..50ff1d3f9 100755 --- a/createdisk-library.sh +++ b/createdisk-library.sh @@ -412,6 +412,7 @@ 
function copy_systemd_units() { ${SCP} systemd/crc-*.target core@${VM_IP}:/home/core/systemd-units/ ${SCP} -r systemd/*.d core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.sh core@${VM_IP}:/home/core/systemd-scripts/ + ${SCP} systemd/crc-*.py core@${VM_IP}:/home/core/systemd-scripts/ case "${BUNDLE_TYPE}" in "snc"|"okd") diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index a19f7c7d4..4aa61ad4b 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,16 +1,32 @@ #!/bin/bash +# set -o errexit disabled to capture the test return code set -o pipefail -set -o errexit set -o nounset set -o errtrace -source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" +TEST_TIMEOUT=120 +VSOCK_COMM_PORT=1024 -if (( ${CRC_SELF_SUFFICIENT:-0} == 1 )); then - echo "Running with a self-sufficient bundle" - exit 0 -else - echo "Not running in a self-sufficient bundle" - exit 1 -fi +timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PORT" +returncode=$? 
+ +case "$returncode" in + 124) + echo "ERROR: vsock/${VSOCK_COMM_PORT} test timed out after $TEST_TIMEOUT seconds :/" >&2 + exit 124 + ;; + 1) + echo "vsock/${VSOCK_COMM_PORT} not working, running with a self-sufficient bundle" >&2 + exit 0 + ;; + 0) + echo "vsock/${VSOCK_COMM_PORT} works, not running with a self-sufficient bundle" >&2 + exit 1 + ;; + *) + echo "ERROR: unexpected return code from the vsock test: $returncode" >&2 + exit "$returncode" +esac + +# cannot be reached diff --git a/systemd/crc-test-vsock.py b/systemd/crc-test-vsock.py new file mode 100644 index 000000000..fb93a07ee --- /dev/null +++ b/systemd/crc-test-vsock.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +import socket +import sys +import time +import fcntl, struct +import os +import errno +import pathlib + +VSOCK_DEV = pathlib.Path("/dev/vsock") +HOST_CID = 2 # VMADDR_CID_HOST + +def main(): + if len(sys.argv) != 2: + print("ERROR: expected a vsock port number as first argument.") + raise SystemExit(errno.EINVAL) + + port = int(sys.argv[1]) + tries = 60 + while not VSOCK_DEV.exists(): + tries -= 1 + + if not tries: + print(f"ERROR: {VSOCK_DEV} didn't appear ...") + return errno.ENODEV + print(f"Waiting for {VSOCK_DEV} to appear ... 
({tries} tries left)") + time.sleep(1) + + print(f"Looking up the CID in {VSOCK_DEV}...") + with open(VSOCK_DEV, 'rb') as f: + r = fcntl.ioctl(f, socket.IOCTL_VM_SOCKETS_GET_LOCAL_CID, ' ') + cid = struct.unpack('I', r)[0] + print(f'Our vsock CID is {cid}.') + + s = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) + + try: + s.connect((HOST_CID, port)) + except OSError as e: + + if e.errno in (errno.ENODEV, errno.ECONNREFUSED, errno.EHOSTUNREACH, errno.ETIMEDOUT, errno.ECONNRESET): + print(f"No remote host on vsock://{HOST_CID}:{port} ({e.strerror})") + s.close() + return 1 + + print(f"Unexpected error connecting vsock://{HOST_CID}:{port}: {e}") + s.close() + return 1 + + msg = b"hello" + s.sendall(msg) + + s.sendall(b"\n") + + s.close() + print(f"A remote host is listening on vsock://{HOST_CID}:{port}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/systemd/crc-user-mode-networking.sh b/systemd/crc-user-mode-networking.sh index 109603e5e..c60b548f5 100644 --- a/systemd/crc-user-mode-networking.sh +++ b/systemd/crc-user-mode-networking.sh @@ -36,7 +36,7 @@ fi # no value --> error if [[ -z "${CRC_NETWORK_MODE_USER:-}" ]]; then echo "ERROR: CRC_NETWORK_MODE_USER not set. Assuming user networking." 
>&2 - exit "$EXIT_ERROR" + exit "$EXIT_USER_MODE" fi # value not in [0, 1] --> error From a4d8e22e09cada167c172d06fd212e0665825e07 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 15 Oct 2025 12:31:13 +0200 Subject: [PATCH 40/44] crc-aws-fetch-secrets: try multiple times to get the secrets from the IMDS --- systemd/crc-aws-fetch-secrets.sh | 75 ++++++++++++++++---------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/systemd/crc-aws-fetch-secrets.sh b/systemd/crc-aws-fetch-secrets.sh index 8d0174d43..e174398fe 100644 --- a/systemd/crc-aws-fetch-secrets.sh +++ b/systemd/crc-aws-fetch-secrets.sh @@ -24,14 +24,37 @@ if [[ -z "$PULL_SECRETS_KEY" || -z "$KUBEADM_PASS_KEY" || -z "$DEVELOPER_PASS_KE exit 1 fi +DELAY=5 +TOTAL_PERIOD=$(( 3*60 )) +ATTEMPTS=$(( TOTAL_PERIOD / DELAY)) +function retry_compact() { + for i in $(seq 1 $ATTEMPTS); do + # If the command succeeds (returns 0), exit the function with success. + if "$@"; then + echo "'$*' succeeded after $i attempts " + return 0 + fi + echo "'$*' still failing after $i/$ATTEMPTS attempts ..." + sleep "$DELAY" + done + echo "'$*' didn't succeed after $i attempt ..." + # If the loop finishes, the command never succeeded. + return 1 +} + +cleanup() { + rm -f /tmp/aws-region /opt/crc/pull-secret.tmp /opt/crc/pass_kubeadmin.tmp /opt/crc/pass_developer.tmp + echo "Temp files cleanup complete." 
+} + +# Cleanup happens automatically via trap on error or at script end +trap cleanup ERR EXIT + SECONDS=0 podman pull --quiet "$AWS_CLI_IMG" echo "Took $SECONDS seconds to pull the $AWS_CLI_IMG" -wait_imds_available_and_get_region() { - total_timeout_minutes=5 - retry_interval_seconds=5 - +check_imds_available_and_get_region() { IMDS_TOKEN_COMMAND=( curl --connect-timeout 1 @@ -40,25 +63,9 @@ wait_imds_available_and_get_region() { -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -Ssf ) - success=false - deadline=$(( $(date +%s) + (total_timeout_minutes * 60) )) - while [[ $(date +%s) -lt $deadline ]]; do - # By placing the command in an 'if' condition, we can test its exit code - # without triggering 'set -e'. The output is still captured. - if TOKEN=$("${IMDS_TOKEN_COMMAND[@]}"); then - # This block only runs if the curl command succeeds (exit code 0) - success=true - echo "Successfully fetched token." >&2 - break # Exit the loop on success - fi - # This block runs if the curl command fails - echo "Failed to connect. Retrying in $retry_interval_seconds seconds..." >&2 - sleep "$retry_interval_seconds" - done - - if [[ "$success" != "true" ]]; then - echo "ERROR: Could not fetch token after $total_timeout_minutes minutes." >&2 + if ! TOKEN=$("${IMDS_TOKEN_COMMAND[@]}"); then + echo "Couldn't fetch the token..." >&2 return 1 fi @@ -73,12 +80,10 @@ wait_imds_available_and_get_region() { set +x # disable the xtrace as the token would be leaked echo "Waiting for the AWS IMDS service to be available ..." SECONDS=0 - wait_imds_available_and_get_region + retry_compact check_imds_available_and_get_region echo "Took $SECONDS for the IMDS service to become available." ) -missing_secrets=0 - save_secret() { name=$1 key=$2 @@ -101,27 +106,23 @@ save_secret() { then rm -f "${dest}.tmp" echo "ERROR: failed to get the '$name' secret ... 
(fetched from $key)" - ((missing_secrets += 1)) - return + return 1 fi char_count=$(wc -c < "${dest}.tmp") if (( char_count < MIN_CHAR_COUNT )); then echo "ERROR: the content of the '$name' secret is too short ... (fetched from $key)" rm -f "${dest}.tmp" - ((missing_secrets += 1)) - return + return 1 fi mv "${dest}.tmp" "${dest}" # atomic creation of the file -} -save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret -save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin -save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer + return 0 +} -if (( missing_secrets != 0 )); then - echo "ERROR: failed to fetch $missing_secrets secrets ..." - exit 1 -fi +# execution will abort if 'retry_compact' fails. +retry_compact save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret +retry_compact save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin +retry_compact save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer exit 0 From 664f723db7866763ff7a718d84016be2f414a648 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 16 Oct 2025 10:45:09 +0200 Subject: [PATCH 41/44] systemd: crc-needs-tap.sh: skip self-sufficient/user-mode networking test Not working before the network is established --- systemd/crc-needs-tap.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/systemd/crc-needs-tap.sh b/systemd/crc-needs-tap.sh index f12796bde..d05db66b0 100644 --- a/systemd/crc-needs-tap.sh +++ b/systemd/crc-needs-tap.sh @@ -29,16 +29,17 @@ fi virt="$(systemd-detect-virt || true)" -if [[ -z "$virt" ]]; then - echo "ERROR: systemd couldn't detect the virtualization :/" >&2 - exit "$EXIT_ERROR" -fi - -if [[ "${virt}" == apple ]] ; then +case "${virt}" in + apple) echo "Running with vfkit ($virt) virtualization. Don't need tap0." exit "$EXIT_DONT_NEED_TAP" -fi - -echo "Running with '$virt' virtualization. Need tap0." 
- -exit "$EXIT_NEED_TAP" + ;; + none) + echo "Bare metal detected. Don't need tap0." + exit "$EXIT_DONT_NEED_TAP" + ;; + *) + echo "Running with '$virt' virtualization. Need tap0." + exit "$EXIT_NEED_TAP" + ;; +esac From 400ce15466e39716c5a46b0314d7272383a202f9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 17 Oct 2025 11:40:40 +0200 Subject: [PATCH 42/44] crc-self-sufficient-env: better handling of missing /dev/vsock --- systemd/crc-self-sufficient-env.sh | 4 ++++ systemd/crc-test-vsock.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index 4aa61ad4b..d783a56ca 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -12,6 +12,10 @@ timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PO returncode=$? case "$returncode" in + 19) # ENODEV + echo "vsock device doesn't exist, not running self-sufficient bundle" >&2 + exit 1 + ;; 124) echo "ERROR: vsock/${VSOCK_COMM_PORT} test timed out after $TEST_TIMEOUT seconds :/" >&2 exit 124 diff --git a/systemd/crc-test-vsock.py b/systemd/crc-test-vsock.py index fb93a07ee..f8ae0a6b3 100644 --- a/systemd/crc-test-vsock.py +++ b/systemd/crc-test-vsock.py @@ -17,7 +17,7 @@ def main(): raise SystemExit(errno.EINVAL) port = int(sys.argv[1]) - tries = 60 + tries = 5 while not VSOCK_DEV.exists(): tries -= 1 From 2359478f766875306d7607495373f878c9360632 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 17 Oct 2025 12:14:44 +0200 Subject: [PATCH 43/44] systemd: crc-needs-tap: only use the virt type to detect if TAP is needed --- systemd/crc-needs-tap.sh | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/systemd/crc-needs-tap.sh b/systemd/crc-needs-tap.sh index d05db66b0..ebe1b2c11 100644 --- a/systemd/crc-needs-tap.sh +++ b/systemd/crc-needs-tap.sh @@ -12,21 +12,6 @@ EXIT_NEED_TAP=0 EXIT_DONT_NEED_TAP=77 EXIT_ERROR=1 -if 
/usr/local/bin/crc-self-sufficient-env.sh; then - echo "Running a self-sufficient bundle. Don't need tap0" - exit "$EXIT_DONT_NEED_TAP" -fi - -if /usr/local/bin/crc-user-mode-networking.sh system; then - echo "Running with CRC and system-mode networking. Don't need tap0. (Fairly rare case.)" - exit "$EXIT_DONT_NEED_TAP" -fi - -# running with CRC (not a self-sufficient bundle) -# running with user-mode networking -# --> vfkit doesn't need tap0 -# --> other platforms do need it - virt="$(systemd-detect-virt || true)" case "${virt}" in From 52636079cbb1abd91b6e448b368360722e287444 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 17 Oct 2025 12:15:15 +0200 Subject: [PATCH 44/44] systemd: crc-self-sufficient-env: prefer CRC_SELF_SUFFICIENT if available --- systemd/crc-self-sufficient-env.sh | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index d783a56ca..4bb4cc1d8 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,15 +1,34 @@ #!/bin/bash -# set -o errexit disabled to capture the test return code set -o pipefail set -o nounset set -o errtrace +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +if [[ "${CRC_SELF_SUFFICIENT:-}" ]]; then + echo "Found CRC_SELF_SUFFICIENT=$CRC_SELF_SUFFICIENT" + + if [[ ! "${CRC_SELF_SUFFICIENT}" =~ ^[01]$ ]]; then + echo "ERROR: CRC_SELF_SUFFICIENT should be 0 or 1 ..." >&2 + exit 1 + fi + + if [[ "$CRC_SELF_SUFFICIENT" == 1 ]]; then + exit 0 + else + exit 1 + fi +fi + TEST_TIMEOUT=120 VSOCK_COMM_PORT=1024 +set +o errexit +# set -o errexit disabled to capture the test return code timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PORT" returncode=$? +set -o errexit case "$returncode" in 19) # ENODEV