From 6c0f14a203f05a1649173b6d03d9c3a84008ad48 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Wed, 1 Apr 2026 15:47:25 -0700 Subject: [PATCH 01/15] Add container cleanup service on stop Create a systemd service to send SIGTERM to running containerd tasks during shutdown/service stop to allow for graceful termination. --- launcher/image/container-cleanup.service | 15 +++++++++++++++ launcher/image/container-cleanup.sh | 20 ++++++++++++++++++++ launcher/image/preload.sh | 7 +++++++ 3 files changed, 42 insertions(+) create mode 100644 launcher/image/container-cleanup.service create mode 100755 launcher/image/container-cleanup.sh diff --git a/launcher/image/container-cleanup.service b/launcher/image/container-cleanup.service new file mode 100644 index 000000000..9c892a4db --- /dev/null +++ b/launcher/image/container-cleanup.service @@ -0,0 +1,15 @@ +[Unit] +Description=Container Cleanup on Service Stop +Wants=containerd.service container-runner.service +After=containerd.service container-runner.service + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/bin/sh -c "echo 'Container cleanup service started'" +ExecStop=/usr/share/oem/confidential_space/container-cleanup.sh +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target diff --git a/launcher/image/container-cleanup.sh b/launcher/image/container-cleanup.sh new file mode 100755 index 000000000..44d39c322 --- /dev/null +++ b/launcher/image/container-cleanup.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SHUTDOWN_TIMEOUT_SEC=15 + +# Send SIGTERM to all running workloads so that they can shutdown gracefully. +for ns in $(ctr ns ls -q); do + tasks=$(ctr -n "$ns" task ls -q) + + if [ -n "$tasks" ]; then + # Send SIGTERM and move on. No waiting, no killing, no deleting. + # A workload may decide to ignore or not handle SIGTERM. + ctr -n "$ns" tasks kill --signal SIGTERM $tasks >/dev/null 2>&1 + echo "SIGTERM sent to $tasks; allowing $SHUTDOWN_TIMEOUT_SEC seconds for graceful shutdown." 
+ + # Allow $SHUTDOWN_TIMEOUT_SEC seconds for the workload to shutdown. + sleep $SHUTDOWN_TIMEOUT_SEC + + echo "Container cleanup exits; unended tasks will be killed." + fi +done diff --git a/launcher/image/preload.sh b/launcher/image/preload.sh index 83cde3823..1c0547209 100644 --- a/launcher/image/preload.sh +++ b/launcher/image/preload.sh @@ -19,6 +19,11 @@ setup_launcher_systemd_unit() { cp exit_script.sh "${CS_PATH}/exit_script.sh" } +setup_container_cleanup_service() { + cp container-cleanup.service "${CS_PATH}/container-cleanup.service" + cp container-cleanup.sh "${CS_PATH}/container-cleanup.sh" +} + append_cmdline() { local arg="$1" if [[ ! -d /mnt/disks/efi ]]; then @@ -113,6 +118,8 @@ main() { # Install container launcher. copy_launcher setup_launcher_systemd_unit + # Install container cleanup service. + setup_container_cleanup_service # Minimum required COS version for 'e': cos-dev-105-17222-0-0. # Minimum required COS version for 'm': cos-dev-113-18203-0-0. append_cmdline "cos.protected_stateful_partition=m" From 4dea08f3fd483763e7fe0a661f6465dd0c10e61e Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Thu, 2 Apr 2026 11:17:06 -0700 Subject: [PATCH 02/15] Adding comments on service files --- launcher/image/container-cleanup.service | 1 + launcher/image/container-runner.service | 1 + 2 files changed, 2 insertions(+) diff --git a/launcher/image/container-cleanup.service b/launcher/image/container-cleanup.service index 9c892a4db..c81646d3a 100644 --- a/launcher/image/container-cleanup.service +++ b/launcher/image/container-cleanup.service @@ -1,3 +1,4 @@ +# Container cleanup service. [Unit] Description=Container Cleanup on Service Stop Wants=containerd.service container-runner.service diff --git a/launcher/image/container-runner.service b/launcher/image/container-runner.service index 3f0d7a575..dcd39a4da 100644 --- a/launcher/image/container-runner.service +++ b/launcher/image/container-runner.service @@ -1,3 +1,4 @@ +# Container launcher service. 
[Unit] Description=Confidential Space Launcher Wants=network-online.target gcr-online.target containerd.service From 25a5e8b0e9bdb1ed5aaed96f1ff19c02b5a5c392 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Thu, 2 Apr 2026 11:38:16 -0700 Subject: [PATCH 03/15] Copy the cleanup service unit file and start the service. Also delete the comments added in the previous commit --- launcher/image/container-cleanup.service | 1 - launcher/image/container-runner.service | 1 - launcher/image/entrypoint.sh | 5 +++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/launcher/image/container-cleanup.service b/launcher/image/container-cleanup.service index c81646d3a..9c892a4db 100644 --- a/launcher/image/container-cleanup.service +++ b/launcher/image/container-cleanup.service @@ -1,4 +1,3 @@ -# Container cleanup service. [Unit] Description=Container Cleanup on Service Stop Wants=containerd.service container-runner.service diff --git a/launcher/image/container-runner.service b/launcher/image/container-runner.service index dcd39a4da..3f0d7a575 100644 --- a/launcher/image/container-runner.service +++ b/launcher/image/container-runner.service @@ -1,4 +1,3 @@ -# Container launcher service. [Unit] Description=Confidential Space Launcher Wants=network-online.target gcr-online.target containerd.service diff --git a/launcher/image/entrypoint.sh b/launcher/image/entrypoint.sh index 8462b9279..e0c0f8d10 100644 --- a/launcher/image/entrypoint.sh +++ b/launcher/image/entrypoint.sh @@ -3,6 +3,7 @@ main() { # Copy service files. cp /usr/share/oem/confidential_space/container-runner.service /etc/systemd/system/container-runner.service + cp /usr/share/oem/confidential_space/container-cleanup.service /etc/systemd/system/container-cleanup.service # Override default fluent-bit config. cp /usr/share/oem/confidential_space/fluent-bit-cs.conf /etc/fluent-bit/fluent-bit.conf @@ -15,8 +16,8 @@ main() { # Override default kernel-monitor.json for node-problem-detector. 
cp /usr/share/oem/confidential_space/kernel-monitor-cs.json /etc/node_problem_detector/kernel-monitor.json systemctl daemon-reload - systemctl enable container-runner.service - systemctl start container-runner.service + systemctl enable container-runner.service container-cleanup.service + systemctl start container-runner.service container-cleanup.service systemctl start fluent-bit.service } From 9e6b1a1597160dda217421bf67188aabbb090ad8 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Thu, 2 Apr 2026 13:27:32 -0700 Subject: [PATCH 04/15] Poll containers' status instead of blindly waiting for 15 seconds. When all workloads handle SIGTERM before the timeout, the system can stop earlier. --- launcher/image/container-cleanup.sh | 33 ++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/launcher/image/container-cleanup.sh b/launcher/image/container-cleanup.sh index 44d39c322..78b6ab5df 100755 --- a/launcher/image/container-cleanup.sh +++ b/launcher/image/container-cleanup.sh @@ -10,11 +10,34 @@ for ns in $(ctr ns ls -q); do # Send SIGTERM and move on. No waiting, no killing, no deleting. # A workload may decide to ignore or not handle SIGTERM. ctr -n "$ns" tasks kill --signal SIGTERM $tasks >/dev/null 2>&1 - echo "SIGTERM sent to $tasks; allowing $SHUTDOWN_TIMEOUT_SEC seconds for graceful shutdown." - - # Allow $SHUTDOWN_TIMEOUT_SEC seconds for the workload to shutdown. - sleep $SHUTDOWN_TIMEOUT_SEC + echo "SIGTERM sent to $tasks in namespace $ns." + fi +done + +echo "Waiting up to $SHUTDOWN_TIMEOUT_SEC seconds for workloads to shutdown..." + +start_time=$(date +%s) +while true; do + all_empty=true + for ns in $(ctr ns ls -q); do + tasks=$(ctr -n "$ns" task ls -q) + if [ -n "$tasks" ]; then + all_empty=false + break + fi + done - echo "Container cleanup exits; unended tasks will be killed." + if [ "$all_empty" = true ]; then + echo "All workloads have shutdown gracefully." 
+ break fi + + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if [ $elapsed -ge $SHUTDOWN_TIMEOUT_SEC ]; then + echo "Timeout reached; unended tasks will be killed." + break + fi + + sleep 1 done From 0ab13bfa3b634e1f04823e0ecf2f26e32bacc018 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Fri, 3 Apr 2026 16:36:25 -0700 Subject: [PATCH 05/15] Add test for workload cleanup service Add a new integration test to verify that a workload receives SIGTERM for a graceful shutdown. --- launcher/cloudbuild.yaml | 14 ++++ .../test/scripts/test_workload_cleanup.sh | 37 +++++++++ .../test/test_workloadcleanup_cloudbuild.yaml | 79 +++++++++++++++++++ .../workloadcleanup/monitor/Dockerfile | 11 +++ .../workloadcleanup/workload/Dockerfile | 11 +++ .../workloadcleanup/workload/main.go | 78 ++++++++++++++++++ 6 files changed, 230 insertions(+) create mode 100644 launcher/image/test/scripts/test_workload_cleanup.sh create mode 100644 launcher/image/test/test_workloadcleanup_cloudbuild.yaml create mode 100644 launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile create mode 100644 launcher/image/testworkloads/workloadcleanup/workload/Dockerfile create mode 100644 launcher/image/testworkloads/workloadcleanup/workload/main.go diff --git a/launcher/cloudbuild.yaml b/launcher/cloudbuild.yaml index c2bc0aec6..47df7bf2f 100644 --- a/launcher/cloudbuild.yaml +++ b/launcher/cloudbuild.yaml @@ -453,6 +453,20 @@ steps: --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_NAME},_IMAGE_PROJECT=${PROJECT_ID} exit +- name: 'gcr.io/cloud-builders/gcloud' + id: WorkloadCleanupTests + waitFor: ['HardenedImageBuild'] + env: + - 'OUTPUT_IMAGE_NAME=${_OUTPUT_IMAGE_PREFIX}-hardened-${_OUTPUT_IMAGE_SUFFIX}' + - 'PROJECT_ID=$PROJECT_ID' + script: | + #!/usr/bin/env bash + cd launcher/image/test + echo "running workload cleanup tests on ${OUTPUT_IMAGE_NAME}" + gcloud builds submit --config=test_workloadcleanup_cloudbuild.yaml --region us-west1 \ + --substitutions 
_IMAGE_NAME=${OUTPUT_IMAGE_NAME},_IMAGE_PROJECT=${PROJECT_ID} + exit + - name: 'gcr.io/cloud-builders/gcloud' id: ExportHardenedImage waitFor: ['HardenedImageTests'] diff --git a/launcher/image/test/scripts/test_workload_cleanup.sh b/launcher/image/test/scripts/test_workload_cleanup.sh new file mode 100644 index 000000000..8e3de9270 --- /dev/null +++ b/launcher/image/test/scripts/test_workload_cleanup.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -euo pipefail + +MONITOR_VM=$1 +WORKLOAD_VM=$2 +ZONE=$3 + +timeout_seconds=300 +start_time=$(date +%s) + +check_timeout() { + local current_time=$(date +%s) + local elapsed=$((current_time - start_time)) + local remaining=$((timeout_seconds - elapsed)) + if [ $remaining -le 0 ]; then + echo "failed: $1" > /workspace/status.txt + exit 0 + fi + echo $remaining +} + +echo "Polling for heartbeat..." +remaining=$(check_timeout "Timeout before heartbeat poll") +timeout $remaining \ + gcloud compute instances tail-serial-port-output $MONITOR_VM --zone $ZONE | \ + grep -q 'Workload heartbeat' || \ + { echo "failed: Heartbeat not found within timeout" > /workspace/status.txt; exit 0; } + +echo "Stopping workload VM..." +gcloud compute instances stop $WORKLOAD_VM --zone $ZONE + +echo "Polling for graceful exit..." +remaining=$(check_timeout "Timeout before graceful exit poll") +timeout $remaining \ + gcloud compute instances tail-serial-port-output $MONITOR_VM --zone $ZONE | \ + grep -q 'Workload exiting gracefully' || \ + { echo "failed: Graceful exit message not found within timeout" > /workspace/status.txt; exit 0; } diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml new file mode 100644 index 000000000..7a445ae25 --- /dev/null +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -0,0 +1,79 @@ +# Test container cleanup behavior. +# 1. Create Monitor VM (listening on UDP). +# 2. Capture Monitor IP. +# 3. 
Create Workload VM (sending heartbeats to Monitor). +# 4. Verify heartbeat. +# 5. Stop Workload VM. +# 6. Verify graceful exit message in Monitor logs. +substitutions: + '_IMAGE_NAME': '' + '_IMAGE_PROJECT': '' + '_CLEANUP': 'true' + '_IMAGE_REPO': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images' + '_MONITOR_IMAGE': '${_IMAGE_REPO}/workloadcleanup/monitor:latest' + '_WORKLOAD_IMAGE': '${_IMAGE_REPO}/workloadcleanup/workload:latest' + '_ZONE': 'us-west1-a' + +steps: +- name: 'gcr.io/cloud-builders/gcloud' + id: CreateMonitorVM + entrypoint: 'bash' + env: + - 'BUILD_ID=$BUILD_ID' + args: ['create_vm.sh','-i', '${_IMAGE_NAME}', + '-p', '${_IMAGE_PROJECT}', + '-m', 'tee-image-reference=${_MONITOR_IMAGE},tee-container-log-redirect=true', + '-n', 'cs-monitor-${BUILD_ID}', + '-z', '${_ZONE}', + ] + +- name: 'gcr.io/cloud-builders/gcloud' + id: CaptureIP + script: | + #!/usr/bin/env bash + gcloud compute instances describe cs-monitor-${BUILD_ID} \ + --zone ${_ZONE} \ + --format="get(networkInterfaces[0].networkIP)" > /workspace/monitor_ip.txt + +- name: 'gcr.io/cloud-builders/gcloud' + id: CreateWorkloadVM + env: + - 'BUILD_ID=$BUILD_ID' + script: | + #!/usr/bin/env bash + MONITOR_IP=$(cat /workspace/monitor_ip.txt) + bash create_vm.sh -i ${_IMAGE_NAME} \ + -p ${_IMAGE_PROJECT} \ + -m "tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-cmd=--server-addr=${MONITOR_IP}:2020" \ + -n cs-workload-${BUILD_ID} \ + -z ${_ZONE} + +- name: 'gcr.io/cloud-builders/gcloud' + id: TestSequence + entrypoint: 'bash' + args: ['scripts/test_workload_cleanup.sh', + 'cs-monitor-${BUILD_ID}', + 'cs-workload-${BUILD_ID}', + '${_ZONE}', + ] + +- name: 'gcr.io/cloud-builders/gcloud' + id: CleanUp + env: + - 'CLEANUP=$_CLEANUP' + script: | + #!/usr/bin/env bash + bash cleanup.sh cs-monitor-${BUILD_ID} ${_ZONE} + bash cleanup.sh cs-workload-${BUILD_ID} ${_ZONE} + +- name: 'gcr.io/cloud-builders/gcloud' + id: CheckFailure + entrypoint: 'bash' + env: 
+ - 'BUILD_ID=$BUILD_ID' + args: ['check_failure.sh'] + +options: + dynamic_substitutions: true + pool: + name: 'projects/confidential-space-images-dev/locations/us-west1/workerPools/cs-image-build-vpc' diff --git a/launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile b/launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile new file mode 100644 index 000000000..b0462efb3 --- /dev/null +++ b/launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile @@ -0,0 +1,11 @@ +# From current directory: +# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/workloadcleanup/monitor:latest + +FROM alpine + +RUN apk add --no-cache sudo netcat-openbsd + +LABEL "tee.launch_policy.log_redirect"="always" + +# Listen on UDP port 2020 and pipe output to serial port ttyS0 +ENTRYPOINT ["sh", "-c", "nc -ulp 2020 | sudo tee /dev/ttyS0"] diff --git a/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile b/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile new file mode 100644 index 000000000..03b3d0e05 --- /dev/null +++ b/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile @@ -0,0 +1,11 @@ +# From current directory: +# GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o main . 
+# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/workloadcleanup/workload:latest + +FROM alpine + +COPY main / + +LABEL "tee.launch_policy.log_redirect"="always" + +ENTRYPOINT ["/main"] diff --git a/launcher/image/testworkloads/workloadcleanup/workload/main.go b/launcher/image/testworkloads/workloadcleanup/workload/main.go new file mode 100644 index 000000000..7a80da2b8 --- /dev/null +++ b/launcher/image/testworkloads/workloadcleanup/workload/main.go @@ -0,0 +1,78 @@ +package main + +import ( + "flag" + "io" + "log" + "net" + "os" + "os/signal" + "syscall" + "time" +) + +var serverAddr = flag.String("server-addr", "", "UDP log server address in IP:port format (required)") + +func main() { + flag.Parse() + + // Create a buffered channel to receive signals early. + sigs := make(chan os.Signal, 1) + done := make(chan bool, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + + // Validate server address (IP:PORT). + if *serverAddr == "" { + log.Fatal("-server-addr flag is required") + } + host, _, err := net.SplitHostPort(*serverAddr) + if err != nil { + log.Fatalf("invalid -server-addr format: %v", err) + } + if net.ParseIP(host) == nil { + log.Fatal("-server-addr must contain a valid IP address") + } + + // Send log messages to a server if it exists. + conn, err := net.Dial("udp", *serverAddr) + if err != nil { + log.Printf("Could not connect to log server %s: %v. Logging to stderr.", *serverAddr, err) + } else { + // Use io.MultiWriter to send logs to the server AND print them to standard error + log.SetOutput(io.MultiWriter(os.Stderr, conn)) + defer conn.Close() + log.Println("Connected to log server.") + } + + + // Start a goroutine to wait for the signal and handle the shutdown logic. + go func() { + sig := <-sigs // Block until a signal is received. 
+ log.Printf("Workload received signal: %v\n", sig) + + // Perform cleanup operations here (e.g., close database connections, + // stop servers, flush logs, etc.). + log.Println("Workload performing cleanup for 10 seconds. Check the next message about the graceful exit") + time.Sleep(10 * time.Second) + + done <- true // Signal that cleanup is complete. + }() + + // Block the main goroutine until the 'done' channel receives a value. + log.Println("Workload awaits signal.") + + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + cnt := 0 + for { + select { + case <-ticker.C: + log.Printf("Workload heartbeat (%d)\n", cnt) + cnt++ + case <-done: + log.Println("Workload exiting gracefully.") + return + } + } +} From ef2e2bb5710ee6eedc2d42bd43fac7b14b619822 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Fri, 3 Apr 2026 16:57:48 -0700 Subject: [PATCH 06/15] Fix Go format and use /bin/echo directly --- launcher/image/container-cleanup.service | 2 +- .../workloadcleanup/workload/main.go | 119 +++++++++--------- 2 files changed, 60 insertions(+), 61 deletions(-) diff --git a/launcher/image/container-cleanup.service b/launcher/image/container-cleanup.service index 9c892a4db..cf8a661e0 100644 --- a/launcher/image/container-cleanup.service +++ b/launcher/image/container-cleanup.service @@ -6,7 +6,7 @@ After=containerd.service container-runner.service [Service] Type=oneshot RemainAfterExit=yes -ExecStart=/bin/sh -c "echo 'Container cleanup service started'" +ExecStart=/bin/echo "Container cleanup service started" ExecStop=/usr/share/oem/confidential_space/container-cleanup.sh StandardOutput=journal StandardError=journal diff --git a/launcher/image/testworkloads/workloadcleanup/workload/main.go b/launcher/image/testworkloads/workloadcleanup/workload/main.go index 7a80da2b8..a8b58f2a2 100644 --- a/launcher/image/testworkloads/workloadcleanup/workload/main.go +++ b/launcher/image/testworkloads/workloadcleanup/workload/main.go @@ -1,78 +1,77 @@ package main import ( - 
"flag" - "io" - "log" - "net" - "os" - "os/signal" - "syscall" - "time" + "flag" + "io" + "log" + "net" + "os" + "os/signal" + "syscall" + "time" ) var serverAddr = flag.String("server-addr", "", "UDP log server address in IP:port format (required)") func main() { - flag.Parse() + flag.Parse() - // Create a buffered channel to receive signals early. - sigs := make(chan os.Signal, 1) - done := make(chan bool, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + // Create a buffered channel to receive signals early. + sigs := make(chan os.Signal, 1) + done := make(chan bool, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - // Validate server address (IP:PORT). - if *serverAddr == "" { - log.Fatal("-server-addr flag is required") - } - host, _, err := net.SplitHostPort(*serverAddr) - if err != nil { - log.Fatalf("invalid -server-addr format: %v", err) - } - if net.ParseIP(host) == nil { - log.Fatal("-server-addr must contain a valid IP address") - } + // Validate server address (IP:PORT). + if *serverAddr == "" { + log.Fatal("-server-addr flag is required") + } + host, _, err := net.SplitHostPort(*serverAddr) + if err != nil { + log.Fatalf("invalid -server-addr format: %v", err) + } + if net.ParseIP(host) == nil { + log.Fatal("-server-addr must contain a valid IP address") + } - // Send log messages to a server if it exists. - conn, err := net.Dial("udp", *serverAddr) - if err != nil { - log.Printf("Could not connect to log server %s: %v. Logging to stderr.", *serverAddr, err) - } else { - // Use io.MultiWriter to send logs to the server AND print them to standard error - log.SetOutput(io.MultiWriter(os.Stderr, conn)) - defer conn.Close() - log.Println("Connected to log server.") - } + // Send log messages to a server if it exists. + conn, err := net.Dial("udp", *serverAddr) + if err != nil { + log.Printf("Could not connect to log server %s: %v. 
Logging to stderr.", *serverAddr, err) + } else { + // Use io.MultiWriter to send logs to the server AND print them to standard error + log.SetOutput(io.MultiWriter(os.Stderr, conn)) + defer conn.Close() + log.Println("Connected to log server.") + } + // Start a goroutine to wait for the signal and handle the shutdown logic. + go func() { + sig := <-sigs // Block until a signal is received. + log.Printf("Workload received signal: %v\n", sig) - // Start a goroutine to wait for the signal and handle the shutdown logic. - go func() { - sig := <-sigs // Block until a signal is received. - log.Printf("Workload received signal: %v\n", sig) + // Perform cleanup operations here (e.g., close database connections, + // stop servers, flush logs, etc.). + log.Println("Workload performing cleanup for 10 seconds. Check the next message about the graceful exit") + time.Sleep(10 * time.Second) - // Perform cleanup operations here (e.g., close database connections, - // stop servers, flush logs, etc.). - log.Println("Workload performing cleanup for 10 seconds. Check the next message about the graceful exit") - time.Sleep(10 * time.Second) + done <- true // Signal that cleanup is complete. + }() - done <- true // Signal that cleanup is complete. - }() + // Block the main goroutine until the 'done' channel receives a value. + log.Println("Workload awaits signal.") - // Block the main goroutine until the 'done' channel receives a value. 
- log.Println("Workload awaits signal.") + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() - ticker := time.NewTicker(1 * time.Second) - defer ticker.Stop() - - cnt := 0 - for { - select { - case <-ticker.C: - log.Printf("Workload heartbeat (%d)\n", cnt) - cnt++ - case <-done: - log.Println("Workload exiting gracefully.") - return - } - } + cnt := 0 + for { + select { + case <-ticker.C: + log.Printf("Workload heartbeat (%d)\n", cnt) + cnt++ + case <-done: + log.Println("Workload exiting gracefully.") + return + } + } } From c76ff8c2c28dbd7793380df0ff2773b8743d9822 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Fri, 3 Apr 2026 17:24:23 -0700 Subject: [PATCH 07/15] Add the package comment --- launcher/image/testworkloads/workloadcleanup/workload/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/launcher/image/testworkloads/workloadcleanup/workload/main.go b/launcher/image/testworkloads/workloadcleanup/workload/main.go index a8b58f2a2..767a5bcf9 100644 --- a/launcher/image/testworkloads/workloadcleanup/workload/main.go +++ b/launcher/image/testworkloads/workloadcleanup/workload/main.go @@ -1,3 +1,4 @@ +// Package main provides a test workload for cleanup. package main import ( From f2ab7710ddb45eced78cb6562601025434617710 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Sat, 4 Apr 2026 08:12:34 -0700 Subject: [PATCH 08/15] Fix Cloud Build substitution resolution in the workload cleanup test Map substitutions to environment variables in `script` blocks to prevent them from evaluating to empty strings and causing exit code 2 errors in `gcloud` commands. 
--- .../test/test_workloadcleanup_cloudbuild.yaml | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index 7a445ae25..4ca3f1d7e 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -29,24 +29,31 @@ steps: - name: 'gcr.io/cloud-builders/gcloud' id: CaptureIP + env: + - 'BUILD_ID=$BUILD_ID' + - 'ZONE=$_ZONE' script: | #!/usr/bin/env bash - gcloud compute instances describe cs-monitor-${BUILD_ID} \ - --zone ${_ZONE} \ + gcloud compute instances describe cs-monitor-$BUILD_ID \ + --zone $ZONE \ --format="get(networkInterfaces[0].networkIP)" > /workspace/monitor_ip.txt - name: 'gcr.io/cloud-builders/gcloud' id: CreateWorkloadVM env: - 'BUILD_ID=$BUILD_ID' + - 'IMAGE_NAME=$_IMAGE_NAME' + - 'IMAGE_PROJECT=$_IMAGE_PROJECT' + - 'WORKLOAD_IMAGE=$_WORKLOAD_IMAGE' + - 'ZONE=$_ZONE' script: | #!/usr/bin/env bash MONITOR_IP=$(cat /workspace/monitor_ip.txt) - bash create_vm.sh -i ${_IMAGE_NAME} \ - -p ${_IMAGE_PROJECT} \ - -m "tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-cmd=--server-addr=${MONITOR_IP}:2020" \ - -n cs-workload-${BUILD_ID} \ - -z ${_ZONE} + bash create_vm.sh -i $IMAGE_NAME \ + -p $IMAGE_PROJECT \ + -m "tee-image-reference=$WORKLOAD_IMAGE,tee-container-log-redirect=true,tee-cmd=--server-addr=${MONITOR_IP}:2020" \ + -n cs-workload-$BUILD_ID \ + -z $ZONE - name: 'gcr.io/cloud-builders/gcloud' id: TestSequence @@ -61,10 +68,12 @@ steps: id: CleanUp env: - 'CLEANUP=$_CLEANUP' + - 'BUILD_ID=$BUILD_ID' + - 'ZONE=$_ZONE' script: | #!/usr/bin/env bash - bash cleanup.sh cs-monitor-${BUILD_ID} ${_ZONE} - bash cleanup.sh cs-workload-${BUILD_ID} ${_ZONE} + bash cleanup.sh cs-monitor-$BUILD_ID $ZONE + bash cleanup.sh cs-workload-$BUILD_ID $ZONE - name: 'gcr.io/cloud-builders/gcloud' id: CheckFailure From 
283b49d65a827276f2915ef994c29c24e4150889 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Sat, 4 Apr 2026 11:19:06 -0700 Subject: [PATCH 09/15] Use explicit VM prefixes for cleanup test instances Also, increase the timeout to 10 minutes --- .../image/test/scripts/test_workload_cleanup.sh | 2 +- .../test/test_workloadcleanup_cloudbuild.yaml | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/launcher/image/test/scripts/test_workload_cleanup.sh b/launcher/image/test/scripts/test_workload_cleanup.sh index 8e3de9270..e189a5748 100644 --- a/launcher/image/test/scripts/test_workload_cleanup.sh +++ b/launcher/image/test/scripts/test_workload_cleanup.sh @@ -5,7 +5,7 @@ MONITOR_VM=$1 WORKLOAD_VM=$2 ZONE=$3 -timeout_seconds=300 +timeout_seconds=600 start_time=$(date +%s) check_timeout() { diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index 4ca3f1d7e..68620aed0 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -23,7 +23,7 @@ steps: args: ['create_vm.sh','-i', '${_IMAGE_NAME}', '-p', '${_IMAGE_PROJECT}', '-m', 'tee-image-reference=${_MONITOR_IMAGE},tee-container-log-redirect=true', - '-n', 'cs-monitor-${BUILD_ID}', + '-n', 'cleanup-monitor-${BUILD_ID}', '-z', '${_ZONE}', ] @@ -34,7 +34,7 @@ steps: - 'ZONE=$_ZONE' script: | #!/usr/bin/env bash - gcloud compute instances describe cs-monitor-$BUILD_ID \ + gcloud compute instances describe cleanup-monitor-$BUILD_ID \ --zone $ZONE \ --format="get(networkInterfaces[0].networkIP)" > /workspace/monitor_ip.txt @@ -52,15 +52,15 @@ steps: bash create_vm.sh -i $IMAGE_NAME \ -p $IMAGE_PROJECT \ -m "tee-image-reference=$WORKLOAD_IMAGE,tee-container-log-redirect=true,tee-cmd=--server-addr=${MONITOR_IP}:2020" \ - -n cs-workload-$BUILD_ID \ + -n cleanup-test-$BUILD_ID \ -z $ZONE - name: 'gcr.io/cloud-builders/gcloud' id: TestSequence entrypoint: 
'bash' args: ['scripts/test_workload_cleanup.sh', - 'cs-monitor-${BUILD_ID}', - 'cs-workload-${BUILD_ID}', + 'cleanup-monitor-${BUILD_ID}', + 'cleanup-test-${BUILD_ID}', '${_ZONE}', ] @@ -72,8 +72,8 @@ steps: - 'ZONE=$_ZONE' script: | #!/usr/bin/env bash - bash cleanup.sh cs-monitor-$BUILD_ID $ZONE - bash cleanup.sh cs-workload-$BUILD_ID $ZONE + bash cleanup.sh cleanup-monitor-$BUILD_ID $ZONE + bash cleanup.sh cleanup-test-$BUILD_ID $ZONE - name: 'gcr.io/cloud-builders/gcloud' id: CheckFailure From 53541a28cb6c6c1d42aeb74e09b7fb86fa18f325 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Mon, 6 Apr 2026 10:23:36 -0700 Subject: [PATCH 10/15] Use standard GCE VM for monitor in cleanup test Running a UDP listener inside a Confidential Space (CS) VM is complex because the hardened environment restricts direct access to host devices (like `/dev/ttyS0` used to dump logs) and prevents standard troubleshooting of container networking. To simplify the test, this commit switches the monitor to a standard GCE VM. The VM dynamically selects the latest x86_64 Debian image and the smallest available machine type meeting the minimum requirements of 2 vCPUs and 1 GB of memory (equivalent to `e2-micro`). 
--- .../test/test_workloadcleanup_cloudbuild.yaml | 44 ++++++++++++++++--- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index 68620aed0..0dbb4fd12 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -17,15 +17,45 @@ substitutions: steps: - name: 'gcr.io/cloud-builders/gcloud' id: CreateMonitorVM - entrypoint: 'bash' env: - 'BUILD_ID=$BUILD_ID' - args: ['create_vm.sh','-i', '${_IMAGE_NAME}', - '-p', '${_IMAGE_PROJECT}', - '-m', 'tee-image-reference=${_MONITOR_IMAGE},tee-container-log-redirect=true', - '-n', 'cleanup-monitor-${BUILD_ID}', - '-z', '${_ZONE}', - ] + - 'ZONE=$_ZONE' + script: | + #!/usr/bin/env bash + LATEST_DEBIAN=$(gcloud compute images list \ + --project=debian-cloud \ + --no-standard-images \ + --filter="architecture=X86_64" \ + --format="value(family)" | sort -V | uniq | tail -n 1) + + if [ -z "$LATEST_DEBIAN" ]; then + echo "Failed to find a Debian image family. Defaulting to debian-12." + LATEST_DEBIAN="debian-12" + else + echo "Found latest Debian family: $LATEST_DEBIAN" + fi + + MACHINE_TYPE=$(gcloud compute machine-types list \ + --zones=$ZONE \ + --filter="architecture=X86_64 AND guestCpus>=2 AND memoryMb>=1024" \ + --format="value(name, memoryMb)" | sort -k2 -n | head -n 1 | awk '{print $1}') + + if [ -z "$MACHINE_TYPE" ]; then + echo "Failed to find a matching machine type. Defaulting to e2-micro." 
+ MACHINE_TYPE="e2-micro" + else + echo "Found smallest x86 machine type: $MACHINE_TYPE" + fi + + gcloud compute instances create cleanup-monitor-$BUILD_ID \ + --zone $ZONE \ + --machine-type=$MACHINE_TYPE \ + --image-family=$LATEST_DEBIAN \ + --image-project=debian-cloud \ + --metadata=startup-script="#!/bin/bash + apt-get update + apt-get install -y netcat-openbsd + nc -l -u -p 2020 > /dev/ttyS0" - name: 'gcr.io/cloud-builders/gcloud' id: CaptureIP From b5afe2fc77ef89b9ed3662d9ce01058758ba4f33 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Mon, 6 Apr 2026 10:48:39 -0700 Subject: [PATCH 11/15] Clean up unused _MONITOR_IMAGE substitution in cleanup test --- launcher/image/test/test_workloadcleanup_cloudbuild.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index 0dbb4fd12..a15a37431 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -10,7 +10,6 @@ substitutions: '_IMAGE_PROJECT': '' '_CLEANUP': 'true' '_IMAGE_REPO': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images' - '_MONITOR_IMAGE': '${_IMAGE_REPO}/workloadcleanup/monitor:latest' '_WORKLOAD_IMAGE': '${_IMAGE_REPO}/workloadcleanup/workload:latest' '_ZONE': 'us-west1-a' From d38fd68585b318db61be20ac479e748ae86e4bef Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Mon, 6 Apr 2026 11:26:42 -0700 Subject: [PATCH 12/15] Use the explicit machine-type, `e2-micro`. Dynamic lookup failed because some machine families (like `e4`) are not available within the project due to quota limits. 
--- .../test/test_workloadcleanup_cloudbuild.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index a15a37431..4ceb8db0d 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -34,21 +34,9 @@ steps: echo "Found latest Debian family: $LATEST_DEBIAN" fi - MACHINE_TYPE=$(gcloud compute machine-types list \ - --zones=$ZONE \ - --filter="architecture=X86_64 AND guestCpus>=2 AND memoryMb>=1024" \ - --format="value(name, memoryMb)" | sort -k2 -n | head -n 1 | awk '{print $1}') - - if [ -z "$MACHINE_TYPE" ]; then - echo "Failed to find a matching machine type. Defaulting to e2-micro." - MACHINE_TYPE="e2-micro" - else - echo "Found smallest x86 machine type: $MACHINE_TYPE" - fi - gcloud compute instances create cleanup-monitor-$BUILD_ID \ --zone $ZONE \ - --machine-type=$MACHINE_TYPE \ + --machine-type=e2-micro \ --image-family=$LATEST_DEBIAN \ --image-project=debian-cloud \ --metadata=startup-script="#!/bin/bash From b38d25e6a69b4635163a344c93ce137baf52e80a Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Mon, 6 Apr 2026 12:09:03 -0700 Subject: [PATCH 13/15] Fix tee-cmd format in metadata Pass `tee-cmd` as an array to fix the parsing error. 
--- launcher/image/test/test_workloadcleanup_cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index 4ceb8db0d..776832b18 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -68,7 +68,7 @@ steps: MONITOR_IP=$(cat /workspace/monitor_ip.txt) bash create_vm.sh -i $IMAGE_NAME \ -p $IMAGE_PROJECT \ - -m "tee-image-reference=$WORKLOAD_IMAGE,tee-container-log-redirect=true,tee-cmd=--server-addr=${MONITOR_IP}:2020" \ + -m "tee-image-reference=$WORKLOAD_IMAGE,tee-container-log-redirect=true,tee-cmd=[\"--server-addr=${MONITOR_IP}:2020\"]" \ -n cleanup-test-$BUILD_ID \ -z $ZONE From 05d5f317c7f4b43c139fa671e8a2ac00d251c19b Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Mon, 6 Apr 2026 13:55:00 -0700 Subject: [PATCH 14/15] Allow cmd override and redirect cleanup service logs to ttyS0 - Added `allow_cmd_override` label to the workload Dockerfile to permit `tee-cmd` usage. - Redirected the `container-cleanup.service` output to `/dev/ttyS0` for serial console visibility. 
--- launcher/image/container-cleanup.service | 5 +++-- .../image/testworkloads/workloadcleanup/workload/Dockerfile | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/launcher/image/container-cleanup.service b/launcher/image/container-cleanup.service index cf8a661e0..05b58f773 100644 --- a/launcher/image/container-cleanup.service +++ b/launcher/image/container-cleanup.service @@ -8,8 +8,9 @@ Type=oneshot RemainAfterExit=yes ExecStart=/bin/echo "Container cleanup service started" ExecStop=/usr/share/oem/confidential_space/container-cleanup.sh -StandardOutput=journal -StandardError=journal +TTYPath=/dev/ttyS0 +StandardOutput=tty +StandardError=tty [Install] WantedBy=multi-user.target diff --git a/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile b/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile index 03b3d0e05..aec68edb0 100644 --- a/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile +++ b/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile @@ -7,5 +7,6 @@ FROM alpine COPY main / LABEL "tee.launch_policy.log_redirect"="always" +LABEL "tee.launch_policy.allow_cmd_override"="true" ENTRYPOINT ["/main"] From df36b2c662830138b0917a4538d1003a929104d2 Mon Sep 17 00:00:00 2001 From: Min Hong Yun Date: Fri, 10 Apr 2026 09:02:39 -0700 Subject: [PATCH 15/15] Use a power button listener service to watch /dev/input/eventX to sense power button press As the hardened image lacks `systemd-logind`, the new service, power button listener, takes the responsibility of watching /dev/input/eventX which was previously done by `logind`. When it detects a power button press, or VM stop, it triggers systemd to stop services, including the listener itself. Then, it uses the service's ExecStop script to send SIGTERM to all containers. 
--- launcher/cloudbuild.yaml | 4 + launcher/image/container-cleanup.service | 16 -- launcher/image/container-cleanup.sh | 3 +- launcher/image/entrypoint.sh | 6 +- launcher/image/power-button-listener.service | 16 ++ launcher/image/preload.sh | 9 +- .../test/scripts/test_workload_cleanup.sh | 30 ++- .../test/test_workloadcleanup_cloudbuild.yaml | 4 +- .../workloadcleanup/{workload => }/Dockerfile | 3 +- .../workloadcleanup/{workload => }/main.go | 2 +- .../workloadcleanup/monitor/Dockerfile | 11 - launcher/power-button-listener/main.go | 230 ++++++++++++++++++ 12 files changed, 285 insertions(+), 49 deletions(-) delete mode 100644 launcher/image/container-cleanup.service create mode 100644 launcher/image/power-button-listener.service rename launcher/image/testworkloads/workloadcleanup/{workload => }/Dockerfile (67%) rename launcher/image/testworkloads/workloadcleanup/{workload => }/main.go (97%) delete mode 100644 launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile create mode 100644 launcher/power-button-listener/main.go diff --git a/launcher/cloudbuild.yaml b/launcher/cloudbuild.yaml index 47df7bf2f..05b627f39 100644 --- a/launcher/cloudbuild.yaml +++ b/launcher/cloudbuild.yaml @@ -72,6 +72,10 @@ steps: export CGO_LDFLAGS="-L/workspace/keymanager/target/release" go build -o ../image/launcher -ldflags="-extldflags=-Wl,-z,lazy -X 'main.BuildCommit=${SHORT_SHA}'" + # Build the power button listener + cd /workspace/launcher/power-button-listener + go build -o /workspace/launcher/image/power-button-listener + - name: 'gcr.io/cloud-builders/gcloud' id: DownloadExpBinary entrypoint: 'gcloud' diff --git a/launcher/image/container-cleanup.service b/launcher/image/container-cleanup.service deleted file mode 100644 index 05b58f773..000000000 --- a/launcher/image/container-cleanup.service +++ /dev/null @@ -1,16 +0,0 @@ -[Unit] -Description=Container Cleanup on Service Stop -Wants=containerd.service container-runner.service -After=containerd.service 
container-runner.service - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/bin/echo "Container cleanup service started" -ExecStop=/usr/share/oem/confidential_space/container-cleanup.sh -TTYPath=/dev/ttyS0 -StandardOutput=tty -StandardError=tty - -[Install] -WantedBy=multi-user.target diff --git a/launcher/image/container-cleanup.sh b/launcher/image/container-cleanup.sh index 78b6ab5df..0ce6082ba 100755 --- a/launcher/image/container-cleanup.sh +++ b/launcher/image/container-cleanup.sh @@ -1,11 +1,12 @@ #!/bin/bash +echo "Container cleanup script started" SHUTDOWN_TIMEOUT_SEC=15 # Send SIGTERM to all running workloads so that they can shutdown gracefully. for ns in $(ctr ns ls -q); do tasks=$(ctr -n "$ns" task ls -q) - + if [ -n "$tasks" ]; then # Send SIGTERM and move on. No waiting, no killing, no deleting. # A workload may decide to ignore or not handle SIGTERM. diff --git a/launcher/image/entrypoint.sh b/launcher/image/entrypoint.sh index e0c0f8d10..1f0f48b38 100644 --- a/launcher/image/entrypoint.sh +++ b/launcher/image/entrypoint.sh @@ -3,7 +3,7 @@ main() { # Copy service files. cp /usr/share/oem/confidential_space/container-runner.service /etc/systemd/system/container-runner.service - cp /usr/share/oem/confidential_space/container-cleanup.service /etc/systemd/system/container-cleanup.service + cp /usr/share/oem/confidential_space/power-button-listener.service /etc/systemd/system/power-button-listener.service # Override default fluent-bit config. cp /usr/share/oem/confidential_space/fluent-bit-cs.conf /etc/fluent-bit/fluent-bit.conf @@ -16,8 +16,8 @@ main() { # Override default kernel-monitor.json for node-problem-detector. 
cp /usr/share/oem/confidential_space/kernel-monitor-cs.json /etc/node_problem_detector/kernel-monitor.json systemctl daemon-reload - systemctl enable container-runner.service container-cleanup.service - systemctl start container-runner.service container-cleanup.service + systemctl enable container-runner.service power-button-listener.service + systemctl start container-runner.service power-button-listener.service systemctl start fluent-bit.service } diff --git a/launcher/image/power-button-listener.service b/launcher/image/power-button-listener.service new file mode 100644 index 000000000..968123707 --- /dev/null +++ b/launcher/image/power-button-listener.service @@ -0,0 +1,16 @@ +[Unit] +Description=Power Button Listener +After=containerd.service container-runner.service + +[Service] +Type=simple +ExecStart=/usr/share/oem/confidential_space/power-button-listener +ExecStop=/usr/share/oem/confidential_space/container-cleanup.sh +Restart=on-failure +RestartSec=5 +TTYPath=/dev/ttyS0 +StandardOutput=tty +StandardError=tty + +[Install] +WantedBy=basic.target diff --git a/launcher/image/preload.sh b/launcher/image/preload.sh index 1c0547209..d32557f0a 100644 --- a/launcher/image/preload.sh +++ b/launcher/image/preload.sh @@ -19,8 +19,9 @@ setup_launcher_systemd_unit() { cp exit_script.sh "${CS_PATH}/exit_script.sh" } -setup_container_cleanup_service() { - cp container-cleanup.service "${CS_PATH}/container-cleanup.service" +copy_power_button_listener() { + cp power-button-listener "${CS_PATH}/power-button-listener" + cp power-button-listener.service "${CS_PATH}/power-button-listener.service" cp container-cleanup.sh "${CS_PATH}/container-cleanup.sh" } @@ -118,8 +119,8 @@ main() { # Install container launcher. copy_launcher setup_launcher_systemd_unit - # Install container cleanup service. - setup_container_cleanup_service + # Install power button listener. + copy_power_button_listener # Minimum required COS version for 'e': cos-dev-105-17222-0-0. 
# Minimum required COS version for 'm': cos-dev-113-18203-0-0. append_cmdline "cos.protected_stateful_partition=m" diff --git a/launcher/image/test/scripts/test_workload_cleanup.sh b/launcher/image/test/scripts/test_workload_cleanup.sh index e189a5748..682c17af5 100644 --- a/launcher/image/test/scripts/test_workload_cleanup.sh +++ b/launcher/image/test/scripts/test_workload_cleanup.sh @@ -19,19 +19,31 @@ check_timeout() { echo $remaining } -echo "Polling for heartbeat..." +echo "Starting to tail serial port in background..." +gcloud compute instances tail-serial-port-output $MONITOR_VM --zone $ZONE > /workspace/serial_output.txt & +TAIL_PID=$! + +# Give gcloud a few seconds to establish the connection +sleep 5 + +echo "Polling for heartbeat (Workload: $WORKLOAD_VM, Monitor: $MONITOR_VM)..." remaining=$(check_timeout "Timeout before heartbeat poll") -timeout $remaining \ - gcloud compute instances tail-serial-port-output $MONITOR_VM --zone $ZONE | \ - grep -q 'Workload heartbeat' || \ - { echo "failed: Heartbeat not found within timeout" > /workspace/status.txt; exit 0; } +timeout $remaining bash -c "until grep -q 'Workload heartbeat' /workspace/serial_output.txt; do sleep 1; done" || { + echo "failed: Heartbeat not found within timeout" > /workspace/status.txt + kill $TAIL_PID + exit 0 +} echo "Stopping workload VM..." gcloud compute instances stop $WORKLOAD_VM --zone $ZONE echo "Polling for graceful exit..." 
remaining=$(check_timeout "Timeout before graceful exit poll") -timeout $remaining \ - gcloud compute instances tail-serial-port-output $MONITOR_VM --zone $ZONE | \ - grep -q 'Workload exiting gracefully' || \ - { echo "failed: Graceful exit message not found within timeout" > /workspace/status.txt; exit 0; } +timeout $remaining bash -c "until grep -q 'Workload exiting gracefully' /workspace/serial_output.txt; do sleep 1; done" || { + echo "failed: Graceful exit message not found within timeout" > /workspace/status.txt + kill $TAIL_PID + exit 0 +} + +# Success! Clean up the background process +kill $TAIL_PID diff --git a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml index 776832b18..722c0f765 100644 --- a/launcher/image/test/test_workloadcleanup_cloudbuild.yaml +++ b/launcher/image/test/test_workloadcleanup_cloudbuild.yaml @@ -10,7 +10,7 @@ substitutions: '_IMAGE_PROJECT': '' '_CLEANUP': 'true' '_IMAGE_REPO': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images' - '_WORKLOAD_IMAGE': '${_IMAGE_REPO}/workloadcleanup/workload:latest' + '_WORKLOAD_IMAGE': '${_IMAGE_REPO}/workloadcleanup:latest' '_ZONE': 'us-west1-a' steps: @@ -68,7 +68,7 @@ steps: MONITOR_IP=$(cat /workspace/monitor_ip.txt) bash create_vm.sh -i $IMAGE_NAME \ -p $IMAGE_PROJECT \ - -m "tee-image-reference=$WORKLOAD_IMAGE,tee-container-log-redirect=true,tee-cmd=[\"--server-addr=${MONITOR_IP}:2020\"]" \ + -m "tee-image-reference=$WORKLOAD_IMAGE,tee-cmd=[\"--server-addr=${MONITOR_IP}:2020\"]" \ -n cleanup-test-$BUILD_ID \ -z $ZONE diff --git a/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile b/launcher/image/testworkloads/workloadcleanup/Dockerfile similarity index 67% rename from launcher/image/testworkloads/workloadcleanup/workload/Dockerfile rename to launcher/image/testworkloads/workloadcleanup/Dockerfile index aec68edb0..dd0c188dd 100644 --- 
a/launcher/image/testworkloads/workloadcleanup/workload/Dockerfile +++ b/launcher/image/testworkloads/workloadcleanup/Dockerfile @@ -1,12 +1,11 @@ # From current directory: # GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o main . -# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/workloadcleanup/workload:latest +# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/workloadcleanup:latest FROM alpine COPY main / -LABEL "tee.launch_policy.log_redirect"="always" LABEL "tee.launch_policy.allow_cmd_override"="true" ENTRYPOINT ["/main"] diff --git a/launcher/image/testworkloads/workloadcleanup/workload/main.go b/launcher/image/testworkloads/workloadcleanup/main.go similarity index 97% rename from launcher/image/testworkloads/workloadcleanup/workload/main.go rename to launcher/image/testworkloads/workloadcleanup/main.go index 767a5bcf9..34c2189f7 100644 --- a/launcher/image/testworkloads/workloadcleanup/workload/main.go +++ b/launcher/image/testworkloads/workloadcleanup/main.go @@ -1,4 +1,4 @@ -// Package main provides a test workload for cleanup. +// A test workload for testing workload cleanup. 
package main import ( diff --git a/launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile b/launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile deleted file mode 100644 index b0462efb3..000000000 --- a/launcher/image/testworkloads/workloadcleanup/monitor/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -# From current directory: -# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/workloadcleanup/monitor:latest - -FROM alpine - -RUN apk add --no-cache sudo netcat-openbsd - -LABEL "tee.launch_policy.log_redirect"="always" - -# Listen on UDP port 2020 and pipe output to serial port ttyS0 -ENTRYPOINT ["sh", "-c", "nc -ulp 2020 | sudo tee /dev/ttyS0"] diff --git a/launcher/power-button-listener/main.go b/launcher/power-button-listener/main.go new file mode 100644 index 000000000..b8bf75acd --- /dev/null +++ b/launcher/power-button-listener/main.go @@ -0,0 +1,230 @@ +// Power button listener listens for power button events and triggers systemd poweroff. +package main + +import ( + "bufio" + "encoding/binary" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" +) + +const ( + udevDir = "/run/udev/data" + udevInputDevicePattern = "c13:*" + powerSwitchTag = "power-switch" + + eventMinorOffset = 64 + eventFilePrefix = "/dev/input/event" + + procDevicesPath = "/proc/bus/input/devices" + + // The following constants are defined in the Linux kernel's include/uapi/linux/input-event-codes.h. + evKey = 1 + keyPower = 116 + keyPower2 = 356 // 0x164 +) + +// findPowerButton attempts to find the /dev/input/eventX file for the power button. +// It prioritizes searching udev data files for the 'power-switch' tag, and falls back to /proc/bus/input/devices. +func findPowerButton() (string, error) { + + // 1. Search files named c13:* (c = char device, 13 = input subsystem major number) in /run/udev/data/ + path, err := searchUdevFiles(udevDir, udevInputDevicePattern) + if err == nil { + return path, nil + } + + // 2.
If not found, search all files in /run/udev/data/ + path, err = searchUdevFiles(udevDir, "*") + if err == nil { + return path, nil + } + + // 3. If not found, look at /proc/bus/input/devices + return searchProcDevices() +} + +// searchUdevFiles searches files in the udev directory matching the pattern for the power-switch tag. +func searchUdevFiles(dir, pattern string) (string, error) { + files, err := filepath.Glob(filepath.Join(dir, pattern)) + if err != nil { + return "", fmt.Errorf("globing udev files with %s failed: %w", pattern, err) + } + + for _, file := range files { + found, err := fileContainsTag(file, powerSwitchTag) + if err != nil { + continue + } + if found { + // Extract the minor number from filename (e.g., "c13:65" -> "65") + parts := strings.Split(filepath.Base(file), ":") + if len(parts) == 2 { + minor, err := strconv.Atoi(parts[1]) + if err == nil { + // Minor numbers for /dev/input/eventX start at 64 in Linux, so `minor - eventMinorOffset` gives us the event number. + return fmt.Sprintf("%s%d", eventFilePrefix, minor-eventMinorOffset), nil + } + } + } + } + return "", fmt.Errorf("not found in udev with pattern %s", pattern) +} + +// fileContainsTag checks if a udev data file contains the power-switch tag. +func fileContainsTag(filePath, tag string) (bool, error) { + file, err := os.Open(filePath) + if err != nil { + return false, err + } + defer func() { + if err := file.Close(); err != nil { + fmt.Println("Warning: failed to close file:", err) + } + }() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + // G: indicates tags in udev db v0, Q: indicates current tags in db v1. + // Checking both ensures compatibility across different udev versions. + if strings.HasPrefix(line, "G:") || strings.HasPrefix(line, "Q:") { + if strings.Contains(line, tag) { + return true, nil + } + } + } + return false, scanner.Err() +} + +// searchProcDevices falls back to parsing /proc/bus/input/devices. 
+// We are looking for a block like the following in the file: +// I: Bus=0019 Vendor=0000 Product=0001 Version=0000 +// N: Name="Power Button" +// P: Phys=LNXPWRBN/button/input0 +// S: Sysfs=/devices/LNXSYSTM:00/LNXPWRBN:00/input/input2 +// U: Uniq= +// H: Handlers=kbd event1 +// B: PROP=0 +// B: EV=3 +// B: KEY=10000000000000 0 +func searchProcDevices() (string, error) { + file, err := os.Open(procDevicesPath) + if err != nil { + return "", fmt.Errorf("opening %s failed: %w", procDevicesPath, err) + } + defer func() { + if err := file.Close(); err != nil { + fmt.Println("Warning: failed to close file:", err) + } + }() + + scanner := bufio.NewScanner(file) + isPBBlock := false + + for scanner.Scan() { + line := scanner.Text() + + if strings.Contains(line, `Name="Power Button"`) { + isPBBlock = true + } + + if isPBBlock && strings.Contains(line, "Handlers=") { + fields := strings.Fields(line) + for _, field := range fields { + if strings.HasPrefix(field, "event") { + return eventFilePrefix + strings.TrimPrefix(field, "event"), nil + } + } + } + + if line == "" { // End of block + isPBBlock = false + } + } + + return "", fmt.Errorf("power button not found in %s", procDevicesPath) +} + +func run() error { + fmt.Println("Starting Power Button Listener...") + + path, err := findPowerButton() + if err != nil { + return fmt.Errorf("finding power button failed: %w", err) + } + + fmt.Printf("Found Power Button at: %s\n", path) + fmt.Println("Listening for shutdown signal...") + + // Open the event file + file, err := os.Open(path) + if err != nil { + return fmt.Errorf("opening event file failed: %w", err) + } + defer func() { + if err := file.Close(); err != nil { + fmt.Printf("Warning: failed to close file: %v\n", err) + } + }() + + if err := waitForPowerButtonEvent(file); err != nil { + return fmt.Errorf("waiting for power button event failed: %w", err) + } + + fmt.Println("Power button pressed! 
Triggering systemd poweroff...") + + // Trigger systemd poweroff + output, err := exec.Command("systemctl", "poweroff").CombinedOutput() + if err != nil { + return fmt.Errorf("running systemctl poweroff failed: %w, output: %s", err, string(output)) + } + + return nil +} + +// waitForPowerButtonEvent blocks until a power button press event is detected. +func waitForPowerButtonEvent(file *os.File) error { + buf := make([]byte, 24) + for { + n, err := file.Read(buf) + if err != nil { + return fmt.Errorf("reading event file failed: %w", err) + } + + var evType, evCode uint16 + var evValue int32 + + switch n { + case 16: + // 32-bit system layout + evType = binary.LittleEndian.Uint16(buf[8:10]) + evCode = binary.LittleEndian.Uint16(buf[10:12]) + evValue = int32(binary.LittleEndian.Uint32(buf[12:16])) + case 24: + // 64-bit system layout + evType = binary.LittleEndian.Uint16(buf[16:18]) + evCode = binary.LittleEndian.Uint16(buf[18:20]) + evValue = int32(binary.LittleEndian.Uint32(buf[20:24])) + default: + // Ignore partial or unknown event sizes + continue + } + + // Value > 0 handles both Press (1) and Repeat (2). + if evType == evKey && (evCode == keyPower || evCode == keyPower2) && evValue > 0 { + return nil + } + } +} + +func main() { + if err := run(); err != nil { + fmt.Printf("Error: %v\n", err) + os.Exit(1) + } +}