Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions launcher/cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ steps:
export CGO_LDFLAGS="-L/workspace/keymanager/target/release"
go build -o ../image/launcher -ldflags="-extldflags=-Wl,-z,lazy -X 'main.BuildCommit=${SHORT_SHA}'"

# Build the power button listener
cd /workspace/launcher/power-button-listener
go build -o /workspace/launcher/image/power-button-listener

- name: 'gcr.io/cloud-builders/gcloud'
id: DownloadExpBinary
entrypoint: 'gcloud'
Expand Down Expand Up @@ -453,6 +457,20 @@ steps:
--substitutions _IMAGE_NAME=${OUTPUT_IMAGE_NAME},_IMAGE_PROJECT=${PROJECT_ID}
exit

# Run the workload-cleanup integration test (test_workloadcleanup_cloudbuild.yaml)
# as a nested build against the freshly built hardened image.
- name: 'gcr.io/cloud-builders/gcloud'
  id: WorkloadCleanupTests
  waitFor: ['HardenedImageBuild']
  env:
  - 'OUTPUT_IMAGE_NAME=${_OUTPUT_IMAGE_PREFIX}-hardened-${_OUTPUT_IMAGE_SUFFIX}'
  - 'PROJECT_ID=$PROJECT_ID'
  script: |
    #!/usr/bin/env bash
    cd launcher/image/test
    echo "running workload cleanup tests on ${OUTPUT_IMAGE_NAME}"
    gcloud builds submit --config=test_workloadcleanup_cloudbuild.yaml --region us-west1 \
      --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_NAME},_IMAGE_PROJECT=${PROJECT_ID}
    exit

- name: 'gcr.io/cloud-builders/gcloud'
id: ExportHardenedImage
waitFor: ['HardenedImageTests']
Expand Down
44 changes: 44 additions & 0 deletions launcher/image/container-cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Gracefully shut down containerd workloads: send SIGTERM to every running
# task, then wait up to SHUTDOWN_TIMEOUT_SEC for all tasks to exit on their
# own. Tasks still running after the timeout are left for whoever invoked
# this script (e.g. systemd unit teardown) to kill forcibly.
echo "Container cleanup script started"

SHUTDOWN_TIMEOUT_SEC=15

# Send SIGTERM to all running workloads so that they can shutdown gracefully.
for ns in $(ctr ns ls -q); do
  tasks=$(ctr -n "$ns" task ls -q)

  if [ -n "$tasks" ]; then
    # Send SIGTERM and move on. No waiting, no killing, no deleting.
    # A workload may decide to ignore or not handle SIGTERM.
    # 'ctr tasks kill' takes a single task ID, so signal each task
    # individually: passing the whole list in one invocation fails for the
    # extra arguments, and the failure is swallowed by the redirection below.
    for task in $tasks; do
      ctr -n "$ns" tasks kill --signal SIGTERM "$task" >/dev/null 2>&1
    done
    echo "SIGTERM sent to $tasks in namespace $ns."
  fi
done

echo "Waiting up to $SHUTDOWN_TIMEOUT_SEC seconds for workloads to shutdown..."

start_time=$(date +%s)
while true; do
  # Re-list tasks in every namespace; we are done once none remain.
  all_empty=true
  for ns in $(ctr ns ls -q); do
    tasks=$(ctr -n "$ns" task ls -q)
    if [ -n "$tasks" ]; then
      all_empty=false
      break
    fi
  done

  if [ "$all_empty" = true ]; then
    echo "All workloads have shutdown gracefully."
    break
  fi

  current_time=$(date +%s)
  elapsed=$((current_time - start_time))
  if [ "$elapsed" -ge "$SHUTDOWN_TIMEOUT_SEC" ]; then
    echo "Timeout reached; unended tasks will be killed."
    break
  fi

  sleep 1
done
5 changes: 3 additions & 2 deletions launcher/image/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
main() {
# Copy service files.
cp /usr/share/oem/confidential_space/container-runner.service /etc/systemd/system/container-runner.service
cp /usr/share/oem/confidential_space/power-button-listener.service /etc/systemd/system/power-button-listener.service
# Override default fluent-bit config.
cp /usr/share/oem/confidential_space/fluent-bit-cs.conf /etc/fluent-bit/fluent-bit.conf

Expand All @@ -15,8 +16,8 @@ main() {
# Override default kernel-monitor.json for node-problem-detector.
cp /usr/share/oem/confidential_space/kernel-monitor-cs.json /etc/node_problem_detector/kernel-monitor.json
systemctl daemon-reload
systemctl enable container-runner.service
systemctl start container-runner.service
systemctl enable container-runner.service power-button-listener.service
systemctl start container-runner.service power-button-listener.service
systemctl start fluent-bit.service
}

Expand Down
16 changes: 16 additions & 0 deletions launcher/image/power-button-listener.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[Unit]
Description=Power Button Listener
# Start only after containerd and the workload runner are up; the cleanup
# script run on stop uses `ctr` to reach tasks managed by containerd.
After=containerd.service container-runner.service

[Service]
Type=simple
# Binary and cleanup script are installed under the OEM confidential_space
# directory by preload.sh (copy_power_button_listener).
ExecStart=/usr/share/oem/confidential_space/power-button-listener
# On unit stop, give workloads a chance to shut down gracefully:
# container-cleanup.sh SIGTERMs all containerd tasks and waits up to 15s.
ExecStop=/usr/share/oem/confidential_space/container-cleanup.sh
Restart=on-failure
RestartSec=5
# Mirror the listener's output to the serial console for debugging.
TTYPath=/dev/ttyS0
StandardOutput=tty
StandardError=tty

[Install]
# NOTE(review): basic.target is an unusual WantedBy for a service
# (multi-user.target is typical) — confirm the early hook-in is intentional.
WantedBy=basic.target
8 changes: 8 additions & 0 deletions launcher/image/preload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ setup_launcher_systemd_unit() {
cp exit_script.sh "${CS_PATH}/exit_script.sh"
}

# Install the power-button listener binary, its systemd unit, and the
# container cleanup script into the confidential space OEM directory.
copy_power_button_listener() {
  local artifact
  for artifact in power-button-listener power-button-listener.service container-cleanup.sh; do
    cp "$artifact" "${CS_PATH}/${artifact}"
  done
}

append_cmdline() {
local arg="$1"
if [[ ! -d /mnt/disks/efi ]]; then
Expand Down Expand Up @@ -113,6 +119,8 @@ main() {
# Install container launcher.
copy_launcher
setup_launcher_systemd_unit
# Install power button listener.
copy_power_button_listener
# Minimum required COS version for 'e': cos-dev-105-17222-0-0.
# Minimum required COS version for 'm': cos-dev-113-18203-0-0.
append_cmdline "cos.protected_stateful_partition=m"
Expand Down
49 changes: 49 additions & 0 deletions launcher/image/test/scripts/test_workload_cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Integration-test driver: verifies that a Confidential Space workload shuts
# down gracefully when its VM is stopped.
#   $1 - monitor VM name (relays workload UDP logs to its serial console)
#   $2 - workload VM name (runs the test workload)
#   $3 - GCE zone of both VMs
# On any failure, writes "failed: <reason>" to /workspace/status.txt and
# exits 0 so later build steps (cleanup, check_failure.sh) still run.
set -euo pipefail

MONITOR_VM=$1
WORKLOAD_VM=$2
ZONE=$3

timeout_seconds=600
start_time=$(date +%s)

# Print the number of seconds left before the overall deadline.
# Never negative: prints 0 once the deadline has passed.
remaining_time() {
  local now elapsed left
  now=$(date +%s)
  elapsed=$((now - start_time))
  left=$((timeout_seconds - elapsed))
  if [ "$left" -gt 0 ]; then
    echo "$left"
  else
    echo 0
  fi
}

# Record a failure reason, stop the background tail, and exit 0.
# (The previous check_timeout helper called `exit` inside a $(...) command
# substitution, which only exits the subshell — the script kept running.)
fail() {
  echo "failed: $1" > /workspace/status.txt
  kill "$TAIL_PID" 2>/dev/null || true
  exit 0
}

echo "Starting to tail serial port in background..."
gcloud compute instances tail-serial-port-output "$MONITOR_VM" --zone "$ZONE" > /workspace/serial_output.txt &
TAIL_PID=$!

# Give gcloud a few seconds to establish the connection.
sleep 5

echo "Polling for heartbeat (Workload: $WORKLOAD_VM, Monitor: $MONITOR_VM)..."
remaining=$(remaining_time)
[ "$remaining" -gt 0 ] || fail "Timeout before heartbeat poll"
timeout "$remaining" bash -c "until grep -q 'Workload heartbeat' /workspace/serial_output.txt; do sleep 1; done" \
  || fail "Heartbeat not found within timeout"

echo "Stopping workload VM..."
gcloud compute instances stop "$WORKLOAD_VM" --zone "$ZONE"

echo "Polling for graceful exit..."
remaining=$(remaining_time)
[ "$remaining" -gt 0 ] || fail "Timeout before graceful exit poll"
timeout "$remaining" bash -c "until grep -q 'Workload exiting gracefully' /workspace/serial_output.txt; do sleep 1; done" \
  || fail "Graceful exit message not found within timeout"

# Success! Clean up the background process. '|| true' keeps set -e from
# failing the step if the tail process already exited on its own.
kill "$TAIL_PID" 2>/dev/null || true
105 changes: 105 additions & 0 deletions launcher/image/test/test_workloadcleanup_cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Test container cleanup behavior.
# 1. Create Monitor VM (listening on UDP).
# 2. Capture Monitor IP.
# 3. Create Workload VM (sending heartbeats to Monitor).
# 4. Verify heartbeat.
# 5. Stop Workload VM.
# 6. Verify graceful exit message in Monitor logs.
substitutions:
  # Confidential Space image under test; supplied by the parent build.
  '_IMAGE_NAME': ''
  '_IMAGE_PROJECT': ''
  '_CLEANUP': 'true'
  '_IMAGE_REPO': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images'
  # Test workload; built from launcher/image/testworkloads/workloadcleanup.
  '_WORKLOAD_IMAGE': '${_IMAGE_REPO}/workloadcleanup:latest'
  '_ZONE': 'us-west1-a'

steps:
# Plain Debian VM whose startup script relays everything received on UDP
# port 2020 to its serial console, where the test sequence can read it.
- name: 'gcr.io/cloud-builders/gcloud'
  id: CreateMonitorVM
  env:
  - 'BUILD_ID=$BUILD_ID'
  - 'ZONE=$_ZONE'
  script: |
    #!/usr/bin/env bash
    # Pick the newest x86_64 Debian image family by version-sorting names.
    LATEST_DEBIAN=$(gcloud compute images list \
      --project=debian-cloud \
      --no-standard-images \
      --filter="architecture=X86_64" \
      --format="value(family)" | sort -V | uniq | tail -n 1)

    if [ -z "$LATEST_DEBIAN" ]; then
      echo "Failed to find a Debian image family. Defaulting to debian-12."
      LATEST_DEBIAN="debian-12"
    else
      echo "Found latest Debian family: $LATEST_DEBIAN"
    fi

    gcloud compute instances create cleanup-monitor-$BUILD_ID \
      --zone $ZONE \
      --machine-type=e2-micro \
      --image-family=$LATEST_DEBIAN \
      --image-project=debian-cloud \
      --metadata=startup-script="#!/bin/bash
      apt-get update
      apt-get install -y netcat-openbsd
      nc -l -u -p 2020 > /dev/ttyS0"

# Record the monitor's internal IP so the workload knows where to send logs.
- name: 'gcr.io/cloud-builders/gcloud'
  id: CaptureIP
  env:
  - 'BUILD_ID=$BUILD_ID'
  - 'ZONE=$_ZONE'
  script: |
    #!/usr/bin/env bash
    gcloud compute instances describe cleanup-monitor-$BUILD_ID \
      --zone $ZONE \
      --format="get(networkInterfaces[0].networkIP)" > /workspace/monitor_ip.txt

# Confidential Space VM running the test workload, pointed at the monitor
# via tee-cmd (allowed by the workload image's allow_cmd_override label).
- name: 'gcr.io/cloud-builders/gcloud'
  id: CreateWorkloadVM
  env:
  - 'BUILD_ID=$BUILD_ID'
  - 'IMAGE_NAME=$_IMAGE_NAME'
  - 'IMAGE_PROJECT=$_IMAGE_PROJECT'
  - 'WORKLOAD_IMAGE=$_WORKLOAD_IMAGE'
  - 'ZONE=$_ZONE'
  script: |
    #!/usr/bin/env bash
    MONITOR_IP=$(cat /workspace/monitor_ip.txt)
    bash create_vm.sh -i $IMAGE_NAME \
      -p $IMAGE_PROJECT \
      -m "tee-image-reference=$WORKLOAD_IMAGE,tee-cmd=[\"--server-addr=${MONITOR_IP}:2020\"]" \
      -n cleanup-test-$BUILD_ID \
      -z $ZONE

# Heartbeat / stop / graceful-exit sequence; records failures in status.txt.
- name: 'gcr.io/cloud-builders/gcloud'
  id: TestSequence
  entrypoint: 'bash'
  args: ['scripts/test_workload_cleanup.sh',
         'cleanup-monitor-${BUILD_ID}',
         'cleanup-test-${BUILD_ID}',
         '${_ZONE}',
  ]

# Delete both VMs. NOTE(review): CLEANUP is exported but cleanup.sh's use of
# it is not visible here — presumably it gates deletion; verify.
- name: 'gcr.io/cloud-builders/gcloud'
  id: CleanUp
  env:
  - 'CLEANUP=$_CLEANUP'
  - 'BUILD_ID=$BUILD_ID'
  - 'ZONE=$_ZONE'
  script: |
    #!/usr/bin/env bash
    bash cleanup.sh cleanup-monitor-$BUILD_ID $ZONE
    bash cleanup.sh cleanup-test-$BUILD_ID $ZONE

# Fail the build if an earlier step recorded a failure in status.txt.
- name: 'gcr.io/cloud-builders/gcloud'
  id: CheckFailure
  entrypoint: 'bash'
  env:
  - 'BUILD_ID=$BUILD_ID'
  args: ['check_failure.sh']

options:
  dynamic_substitutions: true
  pool:
    name: 'projects/confidential-space-images-dev/locations/us-west1/workerPools/cs-image-build-vpc'
11 changes: 11 additions & 0 deletions launcher/image/testworkloads/workloadcleanup/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Test workload image for the workload-cleanup integration test.
# From current directory:
# GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o main .
# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/workloadcleanup:latest

FROM alpine

# The binary is built with CGO_ENABLED=0 (see above), so it is statically
# linked and runs on alpine without glibc.
COPY main /

# Launch policy: allow the launcher's tee-cmd metadata to override CMD —
# the test passes --server-addr this way (see test_workloadcleanup_cloudbuild.yaml).
LABEL "tee.launch_policy.allow_cmd_override"="true"

ENTRYPOINT ["/main"]
78 changes: 78 additions & 0 deletions launcher/image/testworkloads/workloadcleanup/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// A test workload for testing workload cleanup.
package main

import (
"flag"
"io"
"log"
"net"
"os"
"os/signal"
"syscall"
"time"
)

var serverAddr = flag.String("server-addr", "", "UDP log server address in IP:port format (required)")

// main runs the test workload: it validates -server-addr, mirrors its log
// output to the UDP log server (plus stderr), emits a heartbeat log line
// every second, and on SIGINT/SIGTERM simulates 10 seconds of cleanup work
// before exiting with a graceful-exit log line.
func main() {
	flag.Parse()

	// Register for signals before anything else so one delivered during
	// startup is not dropped (hence the buffered channel).
	sigCh := make(chan os.Signal, 1)
	cleanupDone := make(chan bool, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)

	// The required -server-addr flag must be a valid IP:PORT pair.
	if *serverAddr == "" {
		log.Fatal("-server-addr flag is required")
	}
	ipStr, _, err := net.SplitHostPort(*serverAddr)
	if err != nil {
		log.Fatalf("invalid -server-addr format: %v", err)
	}
	if net.ParseIP(ipStr) == nil {
		log.Fatal("-server-addr must contain a valid IP address")
	}

	// Best-effort UDP connection to the log server; on failure, keep the
	// default stderr-only logging.
	conn, dialErr := net.Dial("udp", *serverAddr)
	if dialErr != nil {
		log.Printf("Could not connect to log server %s: %v. Logging to stderr.", *serverAddr, dialErr)
	} else {
		defer conn.Close()
		// Every log line goes to both standard error and the server.
		log.SetOutput(io.MultiWriter(os.Stderr, conn))
		log.Println("Connected to log server.")
	}

	// Shutdown handler: on the first signal, pretend to clean up for 10
	// seconds, then notify the main loop that it may exit.
	go func() {
		received := <-sigCh
		log.Printf("Workload received signal: %v\n", received)

		// Perform cleanup operations here (e.g., close database connections,
		// stop servers, flush logs, etc.).
		log.Println("Workload performing cleanup for 10 seconds. Check the next message about the graceful exit")
		time.Sleep(10 * time.Second)

		cleanupDone <- true
	}()

	log.Println("Workload awaits signal.")

	// Heartbeat once per second until cleanup completes.
	heartbeat := time.NewTicker(1 * time.Second)
	defer heartbeat.Stop()

	for beats := 0; ; {
		select {
		case <-heartbeat.C:
			log.Printf("Workload heartbeat (%d)\n", beats)
			beats++
		case <-cleanupDone:
			log.Println("Workload exiting gracefully.")
			return
		}
	}
}
Loading
Loading