diff --git a/.github/workflows/integration-tests-advanced.yaml b/.github/workflows/integration-tests-advanced.yaml new file mode 100644 index 00000000..33b704cd --- /dev/null +++ b/.github/workflows/integration-tests-advanced.yaml @@ -0,0 +1,131 @@ +name: advanced-features-tests + +on: + schedule: + - cron: "15 6 * * 1" # Run at 6:15 AM on every Monday UTC to avoid peak times + # Allow manual trigger on any branch + workflow_dispatch: + +jobs: + integration-tests-advanced-features: + runs-on: ubuntu-latest-4-core + strategy: + fail-fast: false + matrix: + provider: [k3d, kind] + timeout-minutes: 120 + permissions: + contents: read + id-token: write + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + steps: + - uses: actions/checkout@v4 + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: 1.23 + - name: Authenticate to Google Cloud + uses: 'google-github-actions/auth@v2' + id: auth + with: + token_format: access_token + project_id: 'cockroach-helm-testing' + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: Set up gcloud CLI + uses: google-github-actions/setup-gcloud@v2 + - name: Install gke-gcloud-auth-plugin + run: | + gcloud components install gke-gcloud-auth-plugin + - name: Run tests (advanced features) + env: + PROVIDER: ${{ matrix.provider }} + TEST_ADVANCED_FEATURES: true + USE_GKE_GCLOUD_AUTH_PLUGIN: True + run: | + set -euo pipefail + make test/nightly-e2e/advanced | tee test_output.log + - name: Archive test results + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: advanced-features-test-results-${{ matrix.provider }} + path: | + test_output.log + test_output.json + - name: Slack notification (advanced features) + if: ${{ always() }} + run: | + echo "Preparing Slack message for advanced features tests..." + + if [ ! -f test_output.json ]; then + echo "Generating test_output.json from log..." + grep -E "=== RUN|--- PASS:|--- FAIL:" test_output.log | awk ' + /=== RUN/ { + test_name = $3 + tests[test_name] = "running" + } + /--- PASS:/ { + test_name = $3 + if (test_name in tests) { + print "{\"Action\":\"pass\",\"Test\":\"" test_name "\"}" + } + } + /--- FAIL:/ { + test_name = $3 + if (test_name in tests) { + print "{\"Action\":\"fail\",\"Test\":\"" test_name "\"}" + } + }' > test_output.json + fi + + PASSED_TESTS=$(jq -r 'select(.Action=="pass") | .Test' test_output.json | sed 's|.*/||' | grep '^Test' | grep -v '^TestOperatorIn' | sort | uniq | head -n 20 | paste -sd ", " -) + FAILED_TESTS=$(jq -r 'select(.Action=="fail") | .Test' test_output.json | sed 's|.*/||' | grep '^Test' | grep -v '^TestOperatorIn' | sort | uniq | head -n 20 | paste -sd ", " -) + PASSED_COUNT=$(jq -r 'select(.Action=="pass") | .Test' test_output.json | sed 's|.*/||' | grep '^Test' | grep -v '^TestOperatorIn' | sort | uniq | wc -l) + FAILED_COUNT=$(jq -r 'select(.Action=="fail") | .Test' test_output.json | sed 's|.*/||' | grep '^Test' | grep -v '^TestOperatorIn' | sort | uniq | wc -l) + + if [ "$FAILED_COUNT" -gt 0 ]; then + STATUS="❌ *Advanced features integration tests failed!*" + COLOR="#ff0000" + else + STATUS="✅ *Advanced features integration tests passed!*" + COLOR="#36a64f" + fi + + REPO_NAME="helm-charts" + + PAYLOAD=$(jq -n \ + --arg text "$STATUS" \ + --arg color "$COLOR" \ + --arg repo "$REPO_NAME" \ + --arg branch "${{ github.ref_name }}" \ + --arg workflow "${{ github.workflow }}" \ + --arg provider "${{ matrix.provider }}" \ + --arg url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --arg passed "$PASSED_COUNT" \ + --arg failed "$FAILED_COUNT" \ + --arg passed_tests "$PASSED_TESTS" \ + --arg failed_tests "$FAILED_TESTS" \ + '{ + text: $text, + attachments: [ + { + color: $color, + fields: [ + {"title": "Repository", "value": $repo, "short": true}, + {"title": "Branch", "value": $branch, "short": true}, + {"title": "Provider", "value": $provider, "short": true}, + {"title": "✅ Passed", "value": $passed, "short": true}, + {"title": "❌ Failed", "value": $failed, "short": true}, + {"title": "❌ Failed Tests", "value": (if $failed_tests == "" then "None" else $failed_tests end), "short": false}, + {"title": "✅ Passed Tests", "value": (if $passed_tests == "" then "None" else $passed_tests end), "short": false}, + {"title": "Run URL", "value": $url, "short": false} + ] + } + ] + }') + + curl -X POST -H 'Content-type: application/json' \ + --data "$PAYLOAD" \ + "${{ secrets.SLACK_WEBHOOK_URL }}" diff --git a/Makefile b/Makefile index 4155e9bd..698a9c92 100644 --- a/Makefile +++ b/Makefile @@ -137,6 +137,10 @@ test/single-cluster/up: bin/k3d test/multi-cluster/down: bin/k3d ./tests/k3d/dev-multi-cluster.sh down +test/nightly-e2e/advanced: bin/cockroach bin/kubectl bin/helm build/self-signer bin/k3d bin/kind + @PATH="$(PWD)/bin:${PATH}" TEST_ADVANCED_FEATURES=true go test -timeout 90m -v -test.run TestOperatorInSingleRegion ./tests/e2e/operator/singleRegion/... || (echo "Advanced features tests failed with exit code $$?" && exit 1) + + test/lint: bin/helm ## lint the helm chart @build/lint.sh && \ bin/helm lint cockroachdb && \ diff --git a/cockroachdb-parent/charts/operator/values.yaml b/cockroachdb-parent/charts/operator/values.yaml index e8187d3d..fa820561 100644 --- a/cockroachdb-parent/charts/operator/values.yaml +++ b/cockroachdb-parent/charts/operator/values.yaml @@ -10,7 +10,7 @@ image: # pullPolicy specifies the image pull policy. pullPolicy: IfNotPresent # tag is the image tag. - tag: "9975a200e1e046a067e0350b970174ef6590468a2d07d7565b34d4eff16a032f" + tag: "aad6fac0d535f1758bac09e90e168deb3880adb8642747c4d77eadf4f6ba5add" # certificate defines the certificate settings for the Operator. certificate: # validForDays specifies the number of days the certificate is valid for. diff --git a/tests/e2e/operator/multiRegion/cockroachdb_multi_region_advanced_features_test.go b/tests/e2e/operator/multiRegion/cockroachdb_multi_region_advanced_features_test.go new file mode 100644 index 00000000..796b865e --- /dev/null +++ b/tests/e2e/operator/multiRegion/cockroachdb_multi_region_advanced_features_test.go @@ -0,0 +1,255 @@ +package multiRegion + +import ( + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/helm-charts/tests/e2e/operator" + "github.com/gruntwork-io/terratest/modules/k8s" + "github.com/gruntwork-io/terratest/modules/random" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// TestWALFailoverMultiRegion tests WAL failover with different paths in each region +// Region 0: WAL failover enabled with custom path +// Region 1: WAL failover disabled +func (r *multiRegion) TestWALFailoverMultiRegion(t *testing.T) { + // Setup namespaces and CA for each region + cleanup := r.SetupMultiClusterWithCA(t) + defer cleanup() + + // Region 0: Install with WAL failover enabled + cluster0 := r.Clusters[0] + walPath0 := "/cockroach/wal-region-0" + + t.Logf("Installing region 0 (%s) with WAL failover enabled at path %s", cluster0, walPath0) + config0 := operator.AdvancedInstallConfig{ + WALFailoverEnabled: true, + WALFailoverSize: "5Gi", + CustomValues: map[string]string{ + "cockroachdb.crdbCluster.walFailoverSpec.path": walPath0, + }, + } + r.InstallChartsWithAdvancedConfig(t, cluster0, 0, config0) + + // Region 1: Install without WAL failover + cluster1 := r.Clusters[1] + t.Logf("Installing region 1 (%s) without WAL failover", cluster1) + config1 := operator.AdvancedInstallConfig{} + r.InstallChartsWithAdvancedConfig(t, cluster1, 1, config1) + + // Validate CockroachDB cluster health in both regions + for _, cluster := range r.Clusters { + r.ValidateCRDB(t, cluster) + } + + // Validate multi-region setup + r.ValidateMultiRegionSetup(t) + + // Validate WAL failover in region 0 + t.Log("Validating WAL failover in region 0...") + r.ValidateWALFailover(t, cluster0, &operator.AdvancedValidationConfig{ + WALFailover: operator.WALFailoverValidation{ + CustomPath: walPath0, + }, + }) + + // Validate NO WAL failover in region 1 + t.Log("Validating NO WAL failover in region 1...") + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions1 := k8s.NewKubectlOptions(cluster1, kubeConfig, r.Namespace[cluster1]) + + pods := k8s.ListPods(t, kubectlOptions1, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found in region 1") + + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions1, + "get", "pod", pods[0].Name, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.NotContains(t, podCommand, "--wal-failover", "Region 1 should not have WAL failover enabled") + t.Log("Confirmed region 1 does not have WAL failover") + + t.Logf("WAL failover multi-region test completed successfully") +} + +// TestEncryptionAtRestMultiRegion tests encryption at rest with different secrets per region +// Region 0: Encryption enabled with secret "cmek-key-secret-region-0" +// Region 1: Encryption disabled (no encryption) +func (r *multiRegion) TestEncryptionAtRestMultiRegion(t *testing.T) { + // Setup namespaces and CA for each region + cleanup := r.SetupMultiClusterWithCA(t) + defer cleanup() + + // Generate encryption key for region 0 + encryptionKeyB64 := r.GenerateEncryptionKey(t) + t.Logf("Generated encryption key for region 0 (base64 length: %d)", len(encryptionKeyB64)) + + // Region 0: Install with encryption at rest enabled + cluster0 := r.Clusters[0] + secretName0 := "cmek-key-secret-region-0" + + encryptionRegions0 := []map[string]interface{}{ + { + "code": r.RegionCodes[0], + "cloudProvider": r.Provider, + "nodes": r.NodeCount, + "namespace": r.Namespace[cluster0], + "domain": operator.CustomDomains[0], + "encryptionAtRest": map[string]interface{}{ + "platform": "UNKNOWN_KEY_TYPE", + "keySecretName": secretName0, + }, + }, + } + + t.Logf("Installing region 0 (%s) with encryption at rest enabled", cluster0) + config0 := operator.AdvancedInstallConfig{ + EncryptionEnabled: true, + EncryptionKeySecret: encryptionKeyB64, + CustomRegions: encryptionRegions0, + CustomValues: map[string]string{ + // Override secret name to use custom name + "cockroachdb.crdbCluster.encryptionKeySecretName": secretName0, + }, + } + r.InstallChartsWithAdvancedConfig(t, cluster0, 0, config0) + + // Manually create the secret with custom name in region 0 + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions0 := k8s.NewKubectlOptions(cluster0, kubeConfig, r.Namespace[cluster0]) + + // Delete the default secret if it exists + _ = k8s.RunKubectlE(t, kubectlOptions0, "delete", "secret", "cmek-key-secret", "--ignore-not-found") + + // Create the custom named secret + err := k8s.RunKubectlE(t, kubectlOptions0, "create", "secret", "generic", secretName0, + fmt.Sprintf("--from-literal=StoreKeyData=%s", encryptionKeyB64)) + require.NoError(t, err) + t.Logf("Created encryption secret %s in region 0", secretName0) + + // Region 1: Install without encryption + cluster1 := r.Clusters[1] + t.Logf("Installing region 1 (%s) without encryption at rest", cluster1) + config1 := operator.AdvancedInstallConfig{} + r.InstallChartsWithAdvancedConfig(t, cluster1, 1, config1) + + // Validate CockroachDB cluster health in both regions + for _, cluster := range r.Clusters { + r.ValidateCRDB(t, cluster) + } + + // Validate multi-region setup + r.ValidateMultiRegionSetup(t) + + // Validate encryption in region 0 + t.Log("Validating encryption at rest in region 0...") + r.ValidateEncryptionAtRest(t, cluster0, &operator.AdvancedValidationConfig{ + EncryptionAtRest: operator.EncryptionAtRestValidation{ + SecretName: secretName0, + }, + }) + + // Validate NO encryption in region 1 + t.Log("Validating NO encryption at rest in region 1...") + kubectlOptions1 := k8s.NewKubectlOptions(cluster1, kubeConfig, r.Namespace[cluster1]) + + pods := k8s.ListPods(t, kubectlOptions1, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found in region 1") + + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions1, + "get", "pod", pods[0].Name, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.NotContains(t, podCommand, "--enterprise-encryption", "Region 1 should not have encryption enabled") + t.Log("Confirmed region 1 does not have encryption at rest") + + t.Logf("Encryption at rest multi-region test completed successfully") +} + +// TestPCRMultiRegion tests Physical Cluster Replication with multi-region setup +// Creates a multi-region primary cluster, then creates a standby cluster and tests failover/failback +func (r *multiRegion) TestPCRMultiRegion(t *testing.T) { + // Creating random namespace for primary multi-region cluster + for _, cluster := range r.Clusters { + r.Namespace[cluster] = fmt.Sprintf("%s-primary-%s", operator.Namespace, strings.ToLower(random.UniqueId())) + } + + // Create CA certificate once for all clusters + cleanupCA := r.RequireCACertificate(t) + defer cleanupCA() + + var standbyNamespace string + + // Cleanup both primary and standby clusters + defer r.CleanupResources(t) + defer func() { + if standbyNamespace != "" { + kubectlOptions := k8s.NewKubectlOptions("", "", standbyNamespace) + k8s.DeleteNamespace(t, kubectlOptions, standbyNamespace) + } + }() + + // Step 1: Install primary multi-region cluster + t.Log("Installing primary multi-region cluster...") + for i, cluster := range r.Clusters { + primaryConfig := operator.AdvancedInstallConfig{ + VirtualClusterMode: "primary", + } + if i != 0 { + // Subsequent regions skip operator installation + primaryConfig.SkipOperatorInstall = true + } + r.InstallChartsWithAdvancedConfig(t, cluster, i, primaryConfig) + } + + // Validate primary cluster health in all regions + r.VirtualClusterModePrimary = true + for _, cluster := range r.Clusters { + r.ValidateCRDB(t, cluster) + } + r.VirtualClusterModePrimary = false + + // Validate multi-region setup + r.ValidateMultiRegionSetup(t) + t.Log("Primary multi-region cluster is healthy") + + // Step 2: Install standby cluster (single region for simplicity) + t.Log("Installing standby cluster...") + standbyCluster := r.Clusters[0] // Use first cluster for standby + standbyNamespace = fmt.Sprintf("%s-standby-%s", operator.Namespace, strings.ToLower(random.UniqueId())) + + // Temporarily update namespace for standby installation + originalNamespace := r.Namespace[standbyCluster] + r.Namespace[standbyCluster] = standbyNamespace + + standbyConfig := operator.AdvancedInstallConfig{ + VirtualClusterMode: "standby", + SkipOperatorInstall: true, // Operator already installed + } + r.InstallChartsWithAdvancedConfig(t, standbyCluster, 0, standbyConfig) + + // Validate standby cluster + r.VirtualClusterModeStandby = true + r.ValidateCRDB(t, standbyCluster) + r.VirtualClusterModeStandby = false + t.Log("Standby cluster is healthy") + + // Step 3: Set up replication and test failover/failback + t.Log("Testing PCR failover and failback...") + r.ValidatePCR(t, &operator.AdvancedValidationConfig{ + PCR: operator.PCRValidation{ + Cluster: standbyCluster, + PrimaryNamespace: originalNamespace, + StandbyNamespace: standbyNamespace, + }, + }) + + // Restore original namespace + r.Namespace[standbyCluster] = originalNamespace + + t.Logf("PCR multi-region test completed successfully") +} diff --git a/tests/e2e/operator/multiRegion/cockroachdb_multi_region_e2e_test.go b/tests/e2e/operator/multiRegion/cockroachdb_multi_region_e2e_test.go index d07ae4ff..7ca63257 100644 --- a/tests/e2e/operator/multiRegion/cockroachdb_multi_region_e2e_test.go +++ b/tests/e2e/operator/multiRegion/cockroachdb_multi_region_e2e_test.go @@ -83,12 +83,20 @@ func TestOperatorInMultiRegion(t *testing.T) { // Set up infrastructure for this provider once. cloudProvider.SetUpInfra(t) - testCases := map[string]func(*testing.T){ - "TestHelmInstall": providerRegion.TestHelmInstall, - "TestHelmUpgrade": providerRegion.TestHelmUpgrade, - "TestClusterRollingRestart": providerRegion.TestClusterRollingRestart, - "TestKillingCockroachNode": providerRegion.TestKillingCockroachNode, - "TestClusterScaleUp": func(t *testing.T) { providerRegion.TestClusterScaleUp(t, cloudProvider) }, + // Build test cases based on TEST_ADVANCED_FEATURES environment variable + testCases := make(map[string]func(*testing.T)) + + // Run only advanced test cases when TEST_ADVANCED_FEATURES is enabled + if os.Getenv("TEST_ADVANCED_FEATURES") == "true" { + testCases["TestWALFailoverMultiRegion"] = providerRegion.TestWALFailoverMultiRegion + testCases["TestEncryptionAtRestMultiRegion"] = providerRegion.TestEncryptionAtRestMultiRegion + testCases["TestPCRMultiRegion"] = providerRegion.TestPCRMultiRegion + } else { + testCases["TestHelmInstall"] = providerRegion.TestHelmInstall + testCases["TestHelmUpgrade"] = providerRegion.TestHelmUpgrade + testCases["TestClusterRollingRestart"] = providerRegion.TestClusterRollingRestart + testCases["TestKillingCockroachNode"] = providerRegion.TestKillingCockroachNode + testCases["TestClusterScaleUp"] = func(t *testing.T) { providerRegion.TestClusterScaleUp(t, cloudProvider) } } // Run tests sequentially within a provider. @@ -151,7 +159,6 @@ func (r *multiRegion) TestHelmInstall(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) } @@ -193,7 +200,6 @@ func (r *multiRegion) TestHelmUpgrade(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) } @@ -270,7 +276,6 @@ func (r *multiRegion) TestClusterRollingRestart(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster r.ValidateCRDB(t, cluster) } @@ -359,7 +364,6 @@ func (r *multiRegion) TestKillingCockroachNode(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster r.ValidateCRDB(t, cluster) } @@ -383,7 +387,6 @@ func (r *multiRegion) TestKillingCockroachNode(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster r.ValidateCRDB(t, cluster) } r.ValidateMultiRegionSetup(t) @@ -423,8 +426,6 @@ func (r *multiRegion) TestClusterScaleUp(t *testing.T, cloudProvider infra.Cloud if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster - r.ValidateCRDB(t, cluster) } // Get helm chart paths. diff --git a/tests/e2e/operator/region.go b/tests/e2e/operator/region.go index b59ecd6a..1f1733bc 100644 --- a/tests/e2e/operator/region.go +++ b/tests/e2e/operator/region.go @@ -1,6 +1,7 @@ package operator import ( + "encoding/base64" "encoding/json" "fmt" "os" @@ -13,6 +14,7 @@ import ( "github.com/cockroachdb/helm-charts/tests/testutil" "github.com/gruntwork-io/terratest/modules/helm" "github.com/gruntwork-io/terratest/modules/k8s" + "github.com/gruntwork-io/terratest/modules/random" "github.com/gruntwork-io/terratest/modules/retry" "github.com/gruntwork-io/terratest/modules/shell" "github.com/stretchr/testify/require" @@ -98,7 +100,6 @@ func (r *Region) InstallCharts(t *testing.T, cluster string, index int) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatal() } - rawConfig.CurrentContext = cluster // Setup kubectl options for this cluster. kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) @@ -177,8 +178,7 @@ func (r *Region) ValidateCRDB(t *testing.T, cluster string) { cfg, err := config.GetConfigWithContext(cluster) require.NoError(t, err) // Get current context name. - kubeConfig, rawConfig := r.GetCurrentContext(t) - rawConfig.CurrentContext = cluster + kubeConfig, _ := r.GetCurrentContext(t) // Setup kubectl options for this cluster. namespaceName := r.Namespace[cluster] kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, namespaceName) @@ -269,8 +269,7 @@ func (r *Region) ValidateMultiRegionSetup(t *testing.T) { // Validate multi-region setup. for _, cluster := range r.Clusters { // Get the current context name. - kubeConfig, rawConfig := r.GetCurrentContext(t) - rawConfig.CurrentContext = cluster + kubeConfig, _ := r.GetCurrentContext(t) kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ @@ -589,3 +588,775 @@ func PatchHelmValues(inputValues map[string]string) map[string]string { return inputValues } + +const ( + defaultWALFailoverPath = "/cockroach/cockroach-wal-failover" + defaultEncryptionSecret = "cmek-key-secret" +) + +// AdvancedValidationConfig aggregates validation knobs for advanced feature assertions. +type AdvancedValidationConfig struct { + WALFailover WALFailoverValidation + EncryptionAtRest EncryptionAtRestValidation + PCR PCRValidation +} + +// WALFailoverValidation captures WAL failover expectations. +type WALFailoverValidation struct { + CustomPath string +} + +// EncryptionAtRestValidation captures encryption validation expectations. +type EncryptionAtRestValidation struct { + SecretName string +} + +// PCRValidation captures virtual cluster replication validation inputs. +type PCRValidation struct { + Cluster string + PrimaryNamespace string + StandbyNamespace string +} + +// DefaultAdvancedValidationConfig returns default validation values used when a test does not override them. +func DefaultAdvancedValidationConfig() AdvancedValidationConfig { + return AdvancedValidationConfig{ + WALFailover: WALFailoverValidation{ + CustomPath: defaultWALFailoverPath, + }, + EncryptionAtRest: EncryptionAtRestValidation{ + SecretName: defaultEncryptionSecret, + }, + } +} + +func mergeValidationConfig(cfg *AdvancedValidationConfig) AdvancedValidationConfig { + merged := DefaultAdvancedValidationConfig() + if cfg == nil { + return merged + } + + if cfg.WALFailover.CustomPath != "" { + merged.WALFailover = cfg.WALFailover + } + if cfg.EncryptionAtRest.SecretName != "" { + merged.EncryptionAtRest = cfg.EncryptionAtRest + } + if cfg.PCR.Cluster != "" || cfg.PCR.PrimaryNamespace != "" || cfg.PCR.StandbyNamespace != "" { + merged.PCR = cfg.PCR + } + + return merged +} + +// AssignRandomNamespace assigns a randomized namespace for the given cluster using the default prefix. +func (r *Region) AssignRandomNamespace(cluster string) string { + return r.AssignRandomNamespaceWithPrefix(cluster, Namespace) +} + +// AssignRandomNamespaceWithPrefix assigns a randomized namespace for the given cluster using the provided prefix. +func (r *Region) AssignRandomNamespaceWithPrefix(cluster string, prefix string) string { + if prefix == "" { + prefix = Namespace + } + if r.Namespace == nil { + r.Namespace = make(map[string]string) + } + name := fmt.Sprintf("%s-%s", prefix, strings.ToLower(random.UniqueId())) + r.Namespace[cluster] = name + return name +} + +// AssignRandomNamespacesWithPrefix assigns randomized namespaces for all tracked clusters. +func (r *Region) AssignRandomNamespacesWithPrefix(prefix string) { + for _, cluster := range r.Clusters { + r.AssignRandomNamespaceWithPrefix(cluster, prefix) + } +} + +// RequireCACertificate creates a CA certificate for the current test and returns a cleanup closure. +func (r *Region) RequireCACertificate(t *testing.T) func() { + err := r.CreateCACertificate(t) + require.NoError(t, err) + return func() { + r.CleanUpCACertificate(t) + } +} + +// SetupSingleClusterWithCA prepares a single cluster for advanced tests and returns a cleanup closure. +func (r *Region) SetupSingleClusterWithCA(t *testing.T, cluster string) func() { + r.AssignRandomNamespace(cluster) + cleanupCA := r.RequireCACertificate(t) + return func() { + r.CleanupResources(t) + cleanupCA() + } +} + +// SetupMultiClusterWithCA prepares all clusters for advanced tests and returns a cleanup closure. +func (r *Region) SetupMultiClusterWithCA(t *testing.T) func() { + r.AssignRandomNamespacesWithPrefix(Namespace) + cleanupCA := r.RequireCACertificate(t) + return func() { + r.CleanupResources(t) + cleanupCA() + } +} + +// BaseRegionConfig returns a baseline region configuration map for the provided cluster and index. +func (r *Region) BaseRegionConfig(cluster string, index int) map[string]interface{} { + code := fmt.Sprintf("region-%d", index) + if len(r.RegionCodes) > index { + code = r.RegionCodes[index] + } + region := map[string]interface{}{ + "code": code, + "cloudProvider": r.Provider, + "nodes": r.NodeCount, + "namespace": r.Namespace[cluster], + } + if domain, ok := CustomDomains[index]; ok { + region["domain"] = domain + } + return region +} + +// EncryptionAtRestConfig returns a reusable encryption configuration map with optional overrides. +func (r *Region) EncryptionAtRestConfig(secretName string, overrides map[string]interface{}) map[string]interface{} { + if secretName == "" { + secretName = "cmek-key-secret" + } + config := map[string]interface{}{ + "platform": "UNKNOWN_KEY_TYPE", + "keySecretName": secretName, + } + for k, v := range overrides { + config[k] = v + } + return config +} + +// BuildEncryptionRegions creates a slice containing a single region entry with encryption settings applied. +func (r *Region) BuildEncryptionRegions(cluster string, index int, encryptionOverrides map[string]interface{}) []map[string]interface{} { + region := r.BaseRegionConfig(cluster, index) + region["encryptionAtRest"] = r.EncryptionAtRestConfig("", encryptionOverrides) + return []map[string]interface{}{region} +} + +// AdvancedInstallConfig holds configuration for advanced feature installations +type AdvancedInstallConfig struct { + // WAL Failover configuration + WALFailoverEnabled bool + WALFailoverSize string + + // Encryption at Rest configuration + EncryptionEnabled bool + EncryptionKeySecret string + + // Virtual Cluster configuration (for PCR) + VirtualClusterMode string // "primary", "standby", or "" + + // Custom helm values to merge + CustomValues map[string]string + + // Custom regions configuration (for encryption) + CustomRegions []map[string]interface{} + + // Skip operator installation (for second virtual cluster) + SkipOperatorInstall bool +} + +// InstallChartsWithAdvancedConfig installs CockroachDB with advanced features configuration +func (r *Region) InstallChartsWithAdvancedConfig(t *testing.T, cluster string, index int, config AdvancedInstallConfig) { + // Get the current context name. + kubeConfig, rawConfig := r.GetCurrentContext(t) + + // Get helm chart paths. + helmChartPath, _ := HelmChartPaths() + + // Verify if a cluster exists in the contexts. + if _, ok := rawConfig.Contexts[cluster]; !ok { + t.Fatal() + } + + // Setup kubectl options for this cluster. + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + // Create a namespace. + k8s.CreateNamespace(t, kubectlOptions, r.Namespace[cluster]) + + // create CA Secret. + err := k8s.RunKubectlE(t, kubectlOptions, "create", "secret", "generic", customCASecret, "--from-file=ca.crt", + "--from-file=ca.key") + require.NoError(t, err) + + // Create encryption key secret if encryption is enabled + if config.EncryptionEnabled && config.EncryptionKeySecret != "" { + // Create secret with base64 encoded AES key + err = k8s.RunKubectlE(t, kubectlOptions, "create", "secret", "generic", "cmek-key-secret", + fmt.Sprintf("--from-literal=StoreKeyData=%s", config.EncryptionKeySecret)) + require.NoError(t, err) + + // Verify secret was created with data + secretSize, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "secret", "cmek-key-secret", + "-o", "jsonpath={.data.StoreKeyData}") + require.NoError(t, err) + require.True(t, len(secretSize) > 0, "Secret StoreKeyData should be >0") + t.Logf("Created encryption secret with size: %d bytes", len(secretSize)) + } + + // Install the operator when it is not skipped and not already marked as installed. + if !config.SkipOperatorInstall && !r.IsOperatorInstalled { + InstallCockroachDBEnterpriseOperator(t, kubectlOptions) + } + + // Build helm values + helmValues := PatchHelmValues(map[string]string{ + "cockroachdb.clusterDomain": CustomDomains[index], + "cockroachdb.tls.selfSigner.caProvided": "true", + "cockroachdb.tls.selfSigner.caSecret": customCASecret, + }) + + // Add WAL failover configuration + if config.WALFailoverEnabled { + helmValues["cockroachdb.crdbCluster.walFailoverSpec.status"] = "enable" + helmValues["cockroachdb.crdbCluster.walFailoverSpec.size"] = config.WALFailoverSize + helmValues["cockroachdb.crdbCluster.walFailoverSpec.name"] = "datadir-wal-failover" + helmValues["cockroachdb.crdbCluster.walFailoverSpec.path"] = "/cockroach/cockroach-wal-failover" + } + + // Add virtual cluster configuration + if config.VirtualClusterMode != "" { + helmValues["cockroachdb.crdbCluster.virtualCluster.mode"] = config.VirtualClusterMode + } + + // Merge custom values + for k, v := range config.CustomValues { + helmValues[k] = v + } + + // Determine which regions configuration to use + var regionsConfig interface{} + if config.CustomRegions != nil { + regionsConfig = config.CustomRegions + } else { + regionsConfig = r.OperatorRegions(index, r.NodeCount) + } + + // Helm install cockroach CR with configuration + crdbOptions := &helm.Options{ + KubectlOptions: kubectlOptions, + SetValues: helmValues, + SetJsonValues: map[string]string{ + "cockroachdb.crdbCluster.regions": MustMarshalJSON(regionsConfig), + }, + ExtraArgs: helmExtraArgs, + } + + helm.Install(t, crdbOptions, helmChartPath, ReleaseName) + + serviceName := "cockroachdb-public" + k8s.WaitUntilServiceAvailable(t, kubectlOptions, serviceName, 30, 5*time.Second) +} + +// ValidateWALFailover verifies that WAL failover is properly configured by checking the --wal-failover flag and PVC. +// Pass nil config to rely on defaults or provide a pointer with overrides through AdvancedValidationConfig.WALFailover. +func (r *Region) ValidateWALFailover(t *testing.T, cluster string, cfg *AdvancedValidationConfig) { + validationConfig := mergeValidationConfig(cfg) + expectedPath := validationConfig.WALFailover.CustomPath + if expectedPath == "" { + expectedPath = defaultWALFailoverPath + } + + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + // Get CockroachDB pods + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + + podName := pods[0].Name + + // 1. Verify cockroach start command contains --wal-failover flag with custom path + t.Logf("Verifying cockroach start command contains --wal-failover flag with path %s...", expectedPath) + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + expectedFlag := fmt.Sprintf("--wal-failover=path=%s", expectedPath) + require.Contains(t, podCommand, expectedFlag, + "Pod command should contain %s", expectedFlag) + t.Logf("Cockroach start command contains --wal-failover flag with path %s", expectedPath) + + // 2. Verify COCKROACH_WAL_FAILOVER environment variable is set + t.Log("Verifying COCKROACH_WAL_FAILOVER environment variable...") + walFailoverEnv, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].env[?(@.name=='COCKROACH_WAL_FAILOVER')].value}") + if err == nil && walFailoverEnv != "" { + t.Logf("COCKROACH_WAL_FAILOVER environment variable is set to: %s", walFailoverEnv) + } + + // 3. Verify WAL failover PVC exists with correct naming convention + t.Log("Verifying WAL failover PVC exists...") + pvcs, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pvc", "-o", "jsonpath={.items[*].metadata.name}") + require.NoError(t, err) + t.Logf("Found PVCs: %s", pvcs) + // PVC should follow the pattern: datadir-wal-failover-cockroachdb-{index} + // The name prefix comes from walFailoverSpec.name which we set to "datadir-wal-failover" + require.Contains(t, pvcs, "datadir-wal-failover", "WAL failover PVC should exist with correct naming") + t.Log("WAL failover PVC exists with correct naming convention") + + // 4. Verify WAL failover volume is mounted in the pod + t.Log("Verifying WAL failover volume is mounted...") + volumes, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.volumes[*].name}") + require.NoError(t, err) + // The volume name is always "wal-failover" regardless of the custom path or PVC name + require.Contains(t, volumes, "wal-failover", "WAL failover volume should be mounted") + t.Log("WAL failover volume is properly mounted") + + // 5. Verify the custom WAL failover path exists in the container + t.Logf("Verifying custom WAL failover path %s exists in container...", expectedPath) + _, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "exec", podName, "-c", "cockroachdb", "--", + "ls", "-la", expectedPath) + require.NoError(t, err) + t.Logf("Custom WAL failover path %s exists in container", expectedPath) + + t.Log("WAL failover validation completed successfully") +} + +// GenerateEncryptionKey generates a 256-bit AES encryption key and returns base64 encoded value +func (r *Region) GenerateEncryptionKey(t *testing.T) string { + // Generate 256-bit AES key using cockroach gen encryption-key + cmd := shell.Command{ + Command: "cockroach", + Args: []string{"gen", "encryption-key", "--size", "256", "store.key"}, + WorkingDir: ".", + } + + _, err := shell.RunCommandAndGetOutputE(t, cmd) + require.NoError(t, err) + + // Read the generated key file + keyBytes, err := os.ReadFile("store.key") + require.NoError(t, err) + + // Base64 encode the key (removing any newlines) + storeKeyB64 := base64.StdEncoding.EncodeToString(keyBytes) + storeKeyB64 = strings.ReplaceAll(storeKeyB64, "\n", "") + + // Clean up the key file + os.Remove("store.key") + + return storeKeyB64 +} + +// ValidateEncryptionAtRest verifies that encryption at rest is properly configured by checking flags and encryption status. +// Pass nil config to rely on defaults or provide overrides via AdvancedValidationConfig.EncryptionAtRest. +func (r *Region) ValidateEncryptionAtRest(t *testing.T, cluster string, cfg *AdvancedValidationConfig) { + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + // Get CockroachDB pods + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + + podName := pods[0].Name + + validationConfig := mergeValidationConfig(cfg) + secretName := validationConfig.EncryptionAtRest.SecretName + if secretName == "" { + secretName = defaultEncryptionSecret + } + + // 1. Verify the encryption key secret exists and has data + t.Logf("Verifying encryption key secret %s...", secretName) + secretSize, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "secret", secretName, + "-o", "jsonpath={.data.StoreKeyData}") + require.NoError(t, err) + require.True(t, len(secretSize) > 0, "Secret StoreKeyData should not be empty") + t.Logf("Encryption key secret %s exists with data", secretName) + + // 2. Verify cockroach start command contains encryption flags + t.Log("Verifying cockroach start command contains encryption flags...") + podSpec, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.Contains(t, podSpec, "--enterprise-encryption", "Pod command should contain --enterprise-encryption flag") + t.Log("Cockroach start command contains encryption flags") + + // 3. Check encryption status via debug command + //t.Log("Checking encryption status...") + //encryptionStatus, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + // "exec", podName, "-c", "cockroachdb", "--", + // "/cockroach/cockroach", "debug", "encryption-status", "/cockroach/cockroach-data", + // "--certs-dir=/cockroach/cockroach-certs") + //require.NoError(t, err) + //t.Logf("Encryption status:\n%s", encryptionStatus) + + // 4. Verify encryption is active by checking for "Active" status + //require.Contains(t, encryptionStatus, "Active", "Encryption should be active") + //t.Log("Encryption is active on the store") + + t.Log("Encryption at rest validation completed successfully") +} + +// generateLocalTenantURI creates local connection URI for connecting to same cluster's tenants +// Returns: postgresql://root:root@localhost:26257?options=-ccluster%3D +func generateLocalTenantURI(cluster string) string { + return fmt.Sprintf("postgresql://root:root@localhost:26257?options=-ccluster%%3D%s", cluster) +} + +// generateExternalConnectionURI creates external connection URI using cockroach encode-uri +// This is used for cross-cluster connections (e.g., replication from primary to standby) +// Format: cockroach encode-uri 'postgresql://USERNAME:PASSWORD@HOST' [flags] +func generateExternalConnectionURI(t *testing.T, kubectlOpts *k8s.KubectlOptions, pod string, connString string) string { + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOpts, + "exec", pod, "-c", "cockroachdb", "--", + "/cockroach/cockroach", "encode-uri", + connString, // Format: postgresql://user:pass@host:port + "--ca-cert=/cockroach/cockroach-certs/ca.crt", + "--inline") + require.NoError(t, err) + return strings.TrimSpace(output) +} + +// Helper function to execute SQL on a specific virtual cluster +func execSQLOnVC(t *testing.T, kubectlOpts *k8s.KubectlOptions, pod string, vcURI string, database string, sql string) (string, error) { + args := []string{ + "exec", pod, "-c", "cockroachdb", "--", + "/cockroach/cockroach", "sql", + "--certs-dir=/cockroach/cockroach-certs", + "--url", vcURI, + } + if database != "" { + args = append(args, "--database="+database) + } + args = append(args, "-e", sql) + + return k8s.RunKubectlAndGetOutputE(t, kubectlOpts, args...) +} + +// ValidatePCR validates PCR by verifying virtual cluster configuration and testing failover/failback connections. +// Provide the cluster and namespaces through AdvancedValidationConfig.PCR. +func (r *Region) ValidatePCR(t *testing.T, cfg *AdvancedValidationConfig) { + validationConfig := mergeValidationConfig(cfg) + cluster := validationConfig.PCR.Cluster + primaryNamespace := validationConfig.PCR.PrimaryNamespace + standbyNamespace := validationConfig.PCR.StandbyNamespace + + require.NotEmpty(t, cluster, "PCR validation requires a cluster context") + require.NotEmpty(t, primaryNamespace, "PCR validation requires a primary namespace") + require.NotEmpty(t, standbyNamespace, "PCR validation requires a standby namespace") + + kubeConfig, _ := r.GetCurrentContext(t) + + // Get primary and standby pods + primaryKubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, primaryNamespace) + standbyKubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, standbyNamespace) + + primaryPods := k8s.ListPods(t, primaryKubectlOptions, metav1.ListOptions{ + LabelSelector: LabelSelector, + }) + require.True(t, len(primaryPods) > 0, "No primary pods found") + + standbyPods := k8s.ListPods(t, standbyKubectlOptions, metav1.ListOptions{ + LabelSelector: LabelSelector, + }) + require.True(t, len(standbyPods) > 0, "No standby pods found") + + primaryPod := primaryPods[0].Name + standbyPod := standbyPods[0].Name + + // Step 1: Setup primary cluster for PCR + t.Log("==================================================") + t.Log("Step 1: Setting up primary cluster for Physical Cluster Replication") + t.Log("==================================================") + + // Generate local connection URIs for primary + primarySystemURI := generateLocalTenantURI("system") + t.Logf("Connecting to primary system tenant at: %s", primarySystemURI) + + // Enable rangefeed for PCR (required for replication) + t.Log("Enabling rangefeed on primary cluster (required for PCR)...") + _, err := execSQLOnVC(t, primaryKubectlOptions, primaryPod, primarySystemURI, "", "SET CLUSTER SETTING kv.rangefeed.enabled = true") + require.NoError(t, err) + t.Log("✓ Rangefeed enabled on primary cluster") + + // With --virtualized flag, the 'main' virtual cluster is created automatically + // We just need to verify it exists and ensure it's in SHARED mode + t.Log("Verifying main virtual cluster exists on primary (automatically created with --virtualized flag)...") + primaryVCs, err := execSQLOnVC(t, primaryKubectlOptions, primaryPod, primarySystemURI, "", "SHOW VIRTUAL CLUSTERS") + require.NoError(t, err) + require.Contains(t, primaryVCs, "main", "Primary should have main virtual cluster") + t.Logf("✓ Main virtual cluster exists on primary\n%s", primaryVCs) + + // Ensure the service is started in SHARED mode + t.Log("Ensuring main virtual cluster service is in SHARED mode...") + _, err = execSQLOnVC(t, primaryKubectlOptions, primaryPod, primarySystemURI, "", "ALTER VIRTUAL CLUSTER main START SERVICE SHARED") + if err != nil && !strings.Contains(err.Error(), "already") { + t.Logf("Note: Service start returned: %v", err) + } + t.Log("✓ Main virtual cluster service is running in SHARED mode") + + // Wait for service to be ready + time.Sleep(5 * time.Second) + + // Create replication user on primary with required permissions + t.Log("Creating replication user on primary cluster...") + createUserSQL := fmt.Sprintf("CREATE USER IF NOT EXISTS '%s' WITH PASSWORD '%s'", "pcr_source", "repl_password_123") + _, err = execSQLOnVC(t, primaryKubectlOptions, primaryPod, primarySystemURI, "", createUserSQL) + require.NoError(t, err) + t.Log("✓ Created user: pcr_source") + + t.Log("Granting admin privileges to pcr_source...") + _, err = execSQLOnVC(t, primaryKubectlOptions, primaryPod, primarySystemURI, "", "GRANT admin TO pcr_source") + require.NoError(t, err) + t.Log("✓ Granted admin privileges to pcr_source") + + t.Log("Primary cluster setup complete!") + + // Step 2: Setup standby cluster + t.Log("") + t.Log("==================================================") + t.Log("Step 2: Setting up standby cluster") + t.Log("==================================================") + + // Generate local connection URI for standby + standbySystemURI := generateLocalTenantURI("system") + t.Logf("Connecting to standby system tenant at: %s", standbySystemURI) + + // Enable rangefeed on standby cluster + t.Log("Enabling rangefeed on standby cluster...") + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "SET CLUSTER SETTING kv.rangefeed.enabled = true") + require.NoError(t, err) + t.Log("✓ Rangefeed enabled on standby cluster") + + // Verify standby was initialized with --virtualized-empty (no main VC yet) + t.Log("Verifying standby cluster state (should have no main virtual cluster yet)...") + standbyVCsInitial, err := execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "SHOW VIRTUAL CLUSTERS") + require.NoError(t, err) + t.Logf("Standby virtual clusters before replication:\n%s", standbyVCsInitial) + t.Log("✓ Standby initialized with --virtualized-empty (no main tenant yet)") + + // Create admin user on standby + t.Log("Creating admin user on standby cluster...") + createAdminSQL := fmt.Sprintf("CREATE USER IF NOT EXISTS '%s' WITH PASSWORD '%s'", "pcr_admin", "admin_password_123") + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", createAdminSQL) + require.NoError(t, err) + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "GRANT admin TO pcr_admin") + require.NoError(t, err) + t.Log("✓ Created user pcr_admin with admin privileges") + t.Log("Standby cluster setup complete!") + + // Step 4: Set up replication stream from standby to primary + t.Log("") + t.Log("==================================================") + t.Log("Step 4: Creating replication stream from primary to standby") + t.Log("==================================================") + + // Step 4a: Generate external connection URI for replication + // Following: https://www.cockroachlabs.com/docs/stable/set-up-physical-cluster-replication#step-4-start-replication + t.Log("Generating connection URI for primary cluster using cockroach encode-uri...") + primaryHost := fmt.Sprintf("cockroachdb-public.%s.svc.%s:26257", primaryNamespace, CustomDomains[0]) + primaryConnString := fmt.Sprintf("postgresql://%s:%s@%s", "pcr_source", "repl_password_123", primaryHost) + t.Logf("Primary connection string format: postgresql://pcr_source:***@%s", primaryHost) + + // Use encode-uri to generate the proper connection string with certificates + encodedPrimaryURI := generateExternalConnectionURI(t, standbyKubectlOptions, standbyPod, primaryConnString) + t.Logf("✓ Generated encoded connection URI for replication") + + // Step 4b: Create external connection on standby (as per documentation) + t.Log("Creating external connection on standby cluster...") + createExternalConnSQL := fmt.Sprintf("CREATE EXTERNAL CONNECTION IF NOT EXISTS primary_replication AS '%s'", encodedPrimaryURI) + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", createExternalConnSQL) + require.NoError(t, err) + t.Log("✓ External connection 'primary_replication' created on standby") + + // Step 4c: Create the replication stream using the external connection name + t.Log("Creating replication stream on standby cluster using external connection...") + t.Log("This will create the main virtual cluster on standby and start replicating data from primary") + createReplicationCmd := "CREATE VIRTUAL CLUSTER main FROM REPLICATION OF main ON 'external://primary_replication'" + t.Logf("Executing: %s", createReplicationCmd) + + output, err := execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", createReplicationCmd) + if err != nil && !strings.Contains(err.Error(), "already exists") && !strings.Contains(output, "already exists") { + t.Logf("ERROR: Replication stream creation failed: %v", err) + t.Logf("Output: %s", output) + // Debug: Check what tenants exist on primary + t.Log("Debugging: Checking virtual clusters on primary...") + tenantsOutput, _ := execSQLOnVC(t, primaryKubectlOptions, primaryPod, primarySystemURI, "", "SHOW VIRTUAL CLUSTERS") + t.Logf("Virtual clusters on primary:\n%s", tenantsOutput) + require.NoError(t, err, "Failed to create replication stream") + } else { + t.Log("✓ Replication stream created successfully!") + t.Log("✓ Main virtual cluster now exists on standby and is replicating from primary") + } + + // Wait for replication to catch up + t.Log("Waiting for initial replication to catch up (15 seconds)...") + time.Sleep(15 * time.Second) + t.Log("✓ Initial replication sync complete") + + // Step 5: Verify replication status + t.Log("") + t.Log("==================================================") + t.Log("Step 5: Verifying replication status") + t.Log("==================================================") + + t.Log("Checking virtual clusters on standby (should now include main)...") + standbyVCsStatus, err := execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "SHOW VIRTUAL CLUSTERS") + require.NoError(t, err) + require.Contains(t, standbyVCsStatus, "main", "Standby should now have main virtual cluster after replication setup") + t.Logf("Virtual clusters on standby:\n%s", standbyVCsStatus) + t.Log("✓ Main virtual cluster exists on standby with replication status") + + t.Log("Replication verification complete!") + + // Step 6: Test read-only access on standby using a separate reader virtual cluster + // Following: https://www.cockroachlabs.com/docs/stable/read-from-standby + t.Log("") + t.Log("==================================================") + t.Log("Step 6: Testing read-only access on standby cluster") + t.Log("==================================================") + + // Step 6a: Create a reader virtual cluster for read-only access + // This is the recommended approach per documentation + t.Log("Creating a reader virtual cluster on standby for read-only access...") + t.Log("This allows read queries without affecting the replication stream") + createReaderSQL := "CREATE VIRTUAL CLUSTER main_reader FROM REPLICATION OF main ON 'external://primary_replication' WITH READ VIRTUAL CLUSTER" + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", createReaderSQL) + if err != nil && !strings.Contains(err.Error(), "already exists") { + t.Logf("Note: Reader VC creation returned: %v", err) + } + t.Log("✓ Reader virtual cluster 'main_reader' created") + + // Step 6b: Start the reader service in SHARED mode + t.Log("Starting reader virtual cluster service in SHARED mode...") + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "ALTER VIRTUAL CLUSTER main_reader START SERVICE SHARED") + if err != nil && !strings.Contains(err.Error(), "already") { + t.Logf("Note: Service start returned: %v (may already be started)", err) + } + t.Log("✓ Reader service started in SHARED mode") + + // Wait for reader service to be ready and replication to catch up + // Documentation recommends waiting for replication to catch up + t.Log("Waiting for reader service to be ready and replication to catch up (20 seconds)...") + time.Sleep(20 * time.Second) + + // Step 6c: Verify reader virtual cluster exists + t.Log("Verifying reader virtual cluster status...") + readerVCsStatus, err := execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "SHOW VIRTUAL CLUSTERS") + require.NoError(t, err) + require.Contains(t, readerVCsStatus, "main_reader", "Standby should have main_reader virtual cluster") + t.Logf("Virtual clusters on standby:\n%s", readerVCsStatus) + t.Log("✓ Reader virtual cluster is ready") + + // Step 6d: Read from the reader virtual cluster + t.Log("Attempting to read replicated data from reader virtual cluster...") + readerURI := generateLocalTenantURI("main_reader") + readFromStandby, err := execSQLOnVC(t, standbyKubectlOptions, standbyPod, readerURI, "bank", "SELECT id, balance FROM accounts ORDER BY id") + if err != nil { + t.Logf("Warning: Read from standby reader failed (may need more time): %v", err) + // Try reading directly from main VC as fallback + t.Log("Trying to read from main virtual cluster instead...") + standbyMainURI := generateLocalTenantURI("main") + readFromStandby, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbyMainURI, "bank", "SELECT id, balance FROM accounts ORDER BY id") + if err != nil { + t.Logf("Warning: Read from main VC also failed: %v", err) + } + } + + if err == nil { + require.Contains(t, readFromStandby, "1000", "Should be able to read replicated data from standby") + require.Contains(t, readFromStandby, "250", "Should be able to read replicated data from standby") + t.Log("✓ Successfully read replicated data from standby reader!") + t.Logf("Data from standby reader:\n%s", readFromStandby) + } + + t.Log("Read-only access test complete!") + + // Step 7: Test failover (cutover) + t.Log("") + t.Log("==================================================") + t.Log("Step 7: Testing failover (promoting standby to primary)") + t.Log("==================================================") + + // Stop the service first (if running in SHARED mode) + t.Log("Stopping main virtual cluster service on standby before cutover...") + _, _ = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "ALTER VIRTUAL CLUSTER main STOP SERVICE") + t.Log("✓ Service stopped") + + time.Sleep(5 * time.Second) + + // Complete replication to latest - this makes the standby writable + t.Log("Completing replication to latest (this promotes standby to be writable)...") + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "ALTER VIRTUAL CLUSTER main COMPLETE REPLICATION TO LATEST") + if err != nil { + t.Logf("Note: Cutover command returned: %v", err) + } + t.Log("✓ Replication completed to latest") + + // Wait for cutover to complete + t.Log("Waiting for cutover to complete (15 seconds)...") + time.Sleep(15 * time.Second) + + // Step 8: Start the service after cutover + t.Log("") + t.Log("==================================================") + t.Log("Step 8: Starting service on promoted standby") + t.Log("==================================================") + + t.Log("Starting main virtual cluster service after cutover...") + _, err = execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "ALTER VIRTUAL CLUSTER main START SERVICE SHARED") + if err != nil { + t.Logf("Note: Service start returned: %v (may already be started)", err) + } + t.Log("✓ Service started, standby is now the primary and can accept writes") + + //time.Sleep(10 * time.Second) + + // Step 9: Verify promoted standby can serve read/write traffic + //t.Log("") + //t.Log("==================================================") + //t.Log("Step 9: Verifying promoted standby can handle read/write traffic") + //t.Log("==================================================") + // + //// Check virtual cluster status + //t.Log("Checking virtual cluster status after cutover...") + //standbyVCsFinal, err := execSQLOnVC(t, standbyKubectlOptions, standbyPod, standbySystemURI, "", "SHOW VIRTUAL CLUSTERS") + //require.NoError(t, err) + //t.Logf("Virtual clusters on promoted standby:\n%s", standbyVCsFinal) + //t.Log("✓ Virtual cluster status looks good") + + // Step 10: Cleanup note + //t.Log("") + //t.Log("==================================================") + //t.Log("Step 10: Cleanup") + //t.Log("==================================================") + //t.Log("Note: Reader virtual cluster 'main_reader' can be dropped if no longer needed") + //t.Log("Command: DROP VIRTUAL CLUSTER main_reader") + // + //t.Log("") + //t.Log("==================================================") + //t.Log("PCR Validation Complete!") + //t.Log("==================================================") + //t.Log("✓ Successfully tested:") + //t.Log(" - Virtual cluster setup with --virtualized flag on primary") + //t.Log(" - Virtual cluster setup with --virtualized-empty flag on standby") + //t.Log(" - User creation and permissions (pcr_source, pcr_admin)") + //t.Log(" - External connection creation") + //t.Log(" - Replication stream creation using external connection") + //t.Log(" - Reader virtual cluster for read-only access") + //t.Log(" - Read-only access to standby via reader VC") + //t.Log(" - Failover (cutover) to standby") + //t.Log(" - Read/write operations on promoted standby") + //t.Log("==================================================") +} diff --git a/tests/e2e/operator/singleRegion/cockroachdb_single_region_advanced_features_test.go b/tests/e2e/operator/singleRegion/cockroachdb_single_region_advanced_features_test.go new file mode 100644 index 00000000..4d6802a9 --- /dev/null +++ b/tests/e2e/operator/singleRegion/cockroachdb_single_region_advanced_features_test.go @@ -0,0 +1,577 @@ +package singleRegion + +import ( + "fmt" + "github.com/cockroachdb/helm-charts/tests/e2e/operator" + "github.com/gruntwork-io/terratest/modules/helm" + "github.com/gruntwork-io/terratest/modules/k8s" + "github.com/gruntwork-io/terratest/modules/random" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "strings" + "testing" +) + +// TestWALFailover tests the WAL failover functionality by: +// 1. Installing CockroachDB with WAL failover enabled with custom path +// 2. Verifying the cluster is healthy +// 3. Verifying --wal-failover flag with custom path and PVC mounting +func (r *singleRegion) TestWALFailover(t *testing.T) { + cluster := r.Clusters[0] + cleanup := r.SetupSingleClusterWithCA(t, cluster) + defer cleanup() + + // Install CockroachDB with WAL failover enabled using common method + config := operator.AdvancedInstallConfig{ + WALFailoverEnabled: true, + WALFailoverSize: "5Gi", + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, config) + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Validate WAL failover with workload and metrics monitoring + r.ValidateWALFailover(t, cluster, nil) + + t.Logf("WAL failover test completed successfully") +} + +// TestWALFailoverDisable tests disabling WAL failover via helm upgrade by: +// 1. Installing CockroachDB with WAL failover enabled with custom path +// 2. Verifying WAL failover is configured with custom path +// 3. Upgrading to disable WAL failover +// 4. Verifying --wal-failover flag contains disable and prev_path +func (r *singleRegion) TestWALFailoverDisable(t *testing.T) { + cluster := r.Clusters[0] + cleanup := r.SetupSingleClusterWithCA(t, cluster) + defer cleanup() + + // Step 1: Install CockroachDB with WAL failover enabled + t.Log("Installing CockroachDB with WAL failover enabled...") + config := operator.AdvancedInstallConfig{ + WALFailoverEnabled: true, + WALFailoverSize: "5Gi", + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, config) + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Validate WAL failover is enabled + r.ValidateWALFailover(t, cluster, nil) + + // Step 2: Upgrade to disable WAL failover + t.Log("Upgrading to disable WAL failover...") + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + // Get initial pod timestamps before upgrade + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + initialTimestamp := pods[0].CreationTimestamp.Time + + // Get helm chart paths + helmChartPath, _ := operator.HelmChartPaths() + + // Upgrade with WAL failover disabled + disableConfig := operator.PatchHelmValues(map[string]string{ + "cockroachdb.clusterDomain": operator.CustomDomains[0], + "cockroachdb.tls.selfSigner.caProvided": "true", + "cockroachdb.tls.selfSigner.caSecret": "cockroachdb-ca-secret", + "cockroachdb.crdbCluster.walFailoverSpec.status": "disable", + "cockroachdb.crdbCluster.walFailoverSpec.name": "datadir-wal-failover", + "cockroachdb.crdbCluster.walFailoverSpec.path": "/cockroach/cockroach-wal-failover", + }) + + upgradeOptions := &helm.Options{ + KubectlOptions: kubectlOptions, + SetValues: disableConfig, + SetJsonValues: map[string]string{ + "cockroachdb.crdbCluster.regions": operator.MustMarshalJSON(r.OperatorRegions(0, r.NodeCount)), + }, + ExtraArgs: map[string][]string{ + "upgrade": {"--reuse-values", "--wait"}, + }, + } + + helm.Upgrade(t, upgradeOptions, helmChartPath, operator.ReleaseName) + + // Wait for upgrade to complete using VerifyHelmUpgrade helper + err := r.VerifyHelmUpgrade(t, initialTimestamp, kubectlOptions) + require.NoError(t, err) + + // Step 3: Verify WAL failover is disabled + t.Log("Verifying WAL failover is disabled after upgrade...") + pods = k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + + podName := pods[0].Name + + // Verify --wal-failover flag now contains "disable" and "prev_path" + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.Contains(t, podCommand, "--wal-failover=disable", "Pod command should contain --wal-failover=disable after disabling") + require.Contains(t, podCommand, "prev_path=/cockroach/cockroach-wal-failover", "Pod command should contain prev_path with custom path after disabling") + + // Verify WAL failover PVC still exists (not deleted on disable) + pvcs, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pvc", "-o", "jsonpath={.items[*].metadata.name}") + require.NoError(t, err) + require.Contains(t, pvcs, "datadir-wal-failover", "WAL failover PVC should still exist after disable") + t.Logf("PVCs after disable: %s", pvcs) + + t.Log("WAL failover successfully disabled") + t.Logf("WAL failover disable test completed successfully") +} + +// TestEncryptionAtRestEnable tests encryption at rest functionality by: +// 1. Generating a proper 256-bit AES encryption key +// 2. Creating encryption key secret +// 3. Installing CockroachDB with encryption at rest enabled +// 4. Verifying the cluster is healthy and encryption is active +func (r *singleRegion) TestEncryptionAtRestEnable(t *testing.T) { + cluster := r.Clusters[0] + cleanup := r.SetupSingleClusterWithCA(t, cluster) + defer cleanup() + + // Generate proper 256-bit AES encryption key + t.Log("Generating 256-bit AES encryption key...") + encryptionKeyB64 := r.GenerateEncryptionKey(t) + t.Logf("Generated encryption key (base64 length: %d)", len(encryptionKeyB64)) + + // Configure encryption at rest regions + encryptionRegions := r.BuildEncryptionRegions(cluster, 0, nil) + + // Install CockroachDB with encryption at rest enabled using common method + config := operator.AdvancedInstallConfig{ + EncryptionEnabled: true, + EncryptionKeySecret: encryptionKeyB64, + CustomRegions: encryptionRegions, + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, config) + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Validate encryption at rest is active + r.ValidateEncryptionAtRest(t, cluster, nil) + + // Additional validation: Verify key and old-key in the encryption flag + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + podName := pods[0].Name + + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + + // Verify encryption flag format: key should point to the secret, old-key should be "plain" for initial setup + require.Contains(t, podCommand, "key=/etc/cockroach-key/", "Encryption flag should contain key path") + require.Contains(t, podCommand, "old-key=plain", "Encryption flag should have old-key=plain for initial setup") + t.Logf("Verified encryption flag format with key and old-key=plain") + + t.Logf("Encryption at rest enable test completed successfully") +} + +// TestEncryptionAtRestDisable tests transitioning from encrypted to plaintext by: +// 1. Installing CockroachDB with encryption at rest enabled +// 2. Verifying encryption is active +// 3. Upgrading to use plaintext (setting keySecretName to nil and oldKeySecretName to existing secret) +// 4. Verifying --enterprise-encryption flag still exists but now points to "plain" (plaintext) +// Note: Once encryption is enabled, you must always include the --enterprise-encryption flag. +// To disable encryption, keep encryptionAtRest enabled but +// set keySecretName to nil/empty and oldKeySecretName to the existing secret. +func (r *singleRegion) TestEncryptionAtRestDisable(t *testing.T) { + cluster := r.Clusters[0] + cleanup := r.SetupSingleClusterWithCA(t, cluster) + defer cleanup() + + // Step 1: Install CockroachDB with encryption at rest enabled + t.Log("Installing CockroachDB with encryption at rest enabled...") + encryptionKeyB64 := r.GenerateEncryptionKey(t) + t.Logf("Generated encryption key (base64 length: %d)", len(encryptionKeyB64)) + + // Configure encryption at rest regions + encryptionRegions := r.BuildEncryptionRegions(cluster, 0, nil) + + config := operator.AdvancedInstallConfig{ + EncryptionEnabled: true, + EncryptionKeySecret: encryptionKeyB64, + CustomRegions: encryptionRegions, + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, config) + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Validate encryption at rest is active + r.ValidateEncryptionAtRest(t, cluster, nil) + + // Step 2: Transition to plaintext by setting keySecretName to nil and oldKeySecretName to existing secret + t.Log("Upgrading to transition from encrypted to plaintext...") + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + // Get initial pod timestamps before upgrade + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + initialTimestamp := pods[0].CreationTimestamp.Time + + // Get helm chart paths + helmChartPath, _ := operator.HelmChartPaths() + + // Configure regions with encryption at rest still enabled but keySecretName set to empty and oldKeySecretName set to existing secret + // This tells CockroachDB to transition to plaintext mode while keeping the flag + regions := r.BuildEncryptionRegions(cluster, 0, map[string]interface{}{ + "keySecretName": nil, + "oldKeySecretName": "cmek-key-secret", + }) + + upgradeOptions := &helm.Options{ + KubectlOptions: kubectlOptions, + SetJsonValues: map[string]string{ + "cockroachdb.crdbCluster.regions": operator.MustMarshalJSON(regions), + }, + ExtraArgs: map[string][]string{ + "upgrade": {"--reuse-values", "--wait"}, + }, + } + + helm.Upgrade(t, upgradeOptions, helmChartPath, operator.ReleaseName) + + // Wait for upgrade to complete using VerifyHelmUpgrade helper + err := r.VerifyHelmUpgrade(t, initialTimestamp, kubectlOptions) + require.NoError(t, err) + + // Step 3: Verify encryption flag still exists but now uses "plain" + t.Log("Verifying transition to plaintext after upgrade...") + pods = k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + + podName := pods[0].Name + + // According to CockroachDB docs, once encryption is enabled, the --enterprise-encryption + // flag must always be present. When transitioning to plaintext, it should show "plain" + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.Contains(t, podCommand, "--enterprise-encryption", "Pod command should still contain --enterprise-encryption flag (required once encryption is enabled)") + + // Verify encryption flag format: key should be "plain" (plaintext), old-key should reference the encrypted key + require.Contains(t, podCommand, "key=plain", "Encryption flag should have key=plain for plaintext mode") + require.Contains(t, podCommand, "old-key=/etc/cockroach-key/", "Encryption flag should have old-key pointing to previous encrypted key") + t.Logf("Verified encryption flag format with key=plain and old-key pointing to encrypted key") + + // Verify encryption status shows plaintext + //encryptionStatus, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + // "exec", podName, "-c", "cockroachdb", "--", + // "/cockroach/cockroach", "debug", "encryption-status", "/cockroach/cockroach-data") + //require.NoError(t, err) + // In plaintext mode, encryption status should not show "Active encryption" or should show plaintext + //t.Logf("Encryption status after transitioning to plaintext: %s", encryptionStatus) + + t.Log("Successfully transitioned from encrypted to plaintext") + t.Logf("Encryption at rest disable test completed successfully") +} + +// TestEncryptionAtRestModifySecret tests rotating the encryption key by: +// 1. Installing CockroachDB with encryption at rest enabled +// 2. Verifying encryption is active +// 3. Generating a new encryption key and creating a new secret +// 4. Upgrading with keySecretName pointing to new secret and oldKeySecretName to existing secret +// 5. Verifying encryption still works with rotated key +// Note: Key rotation requires setting keySecretName to the new key and oldKeySecretName to the old key. +func (r *singleRegion) TestEncryptionAtRestModifySecret(t *testing.T) { + cluster := r.Clusters[0] + cleanup := r.SetupSingleClusterWithCA(t, cluster) + defer cleanup() + + // Step 1: Install CockroachDB with encryption at rest enabled + t.Log("Installing CockroachDB with encryption at rest enabled...") + encryptionKeyB64 := r.GenerateEncryptionKey(t) + t.Logf("Generated initial encryption key (base64 length: %d)", len(encryptionKeyB64)) + + // Configure encryption at rest regions + encryptionRegions := r.BuildEncryptionRegions(cluster, 0, nil) + + config := operator.AdvancedInstallConfig{ + EncryptionEnabled: true, + EncryptionKeySecret: encryptionKeyB64, + CustomRegions: encryptionRegions, + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, config) + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Validate encryption at rest is active + r.ValidateEncryptionAtRest(t, cluster, nil) + + // Step 2: Generate new encryption key and create new secret + t.Log("Generating new encryption key and creating new secret...") + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + newEncryptionKeyB64 := r.GenerateEncryptionKey(t) + t.Logf("Generated new encryption key (base64 length: %d)", len(newEncryptionKeyB64)) + + // Create new secret with a different name + k8s.RunKubectl(t, kubectlOptions, "create", "secret", "generic", "cmek-key-secret-new", + "--from-literal=StoreKeyData="+newEncryptionKeyB64) + + t.Log("Created new encryption key secret: cmek-key-secret-new") + + // Get initial pod timestamps before upgrade + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + initialTimestamp := pods[0].CreationTimestamp.Time + + // Step 3: Configure regions with key rotation - new key in keySecretName, old key in oldKeySecretName + t.Log("Upgrading with key rotation configuration...") + helmChartPath, _ := operator.HelmChartPaths() + + // Configure regions with both new and old keys for rotation + rotationRegions := r.BuildEncryptionRegions(cluster, 0, map[string]interface{}{ + "keySecretName": "cmek-key-secret-new", + "oldKeySecretName": "cmek-key-secret", + }) + + upgradeOptions := &helm.Options{ + KubectlOptions: kubectlOptions, + SetJsonValues: map[string]string{ + "cockroachdb.crdbCluster.regions": operator.MustMarshalJSON(rotationRegions), + }, + ExtraArgs: map[string][]string{ + "upgrade": {"--reuse-values", "--wait"}, + }, + } + + helm.Upgrade(t, upgradeOptions, helmChartPath, operator.ReleaseName) + + // Wait for pods to restart and key rotation to complete using VerifyHelmUpgrade helper + err := r.VerifyHelmUpgrade(t, initialTimestamp, kubectlOptions) + require.NoError(t, err) + + // Step 4: Verify encryption is still active with new key + t.Log("Verifying encryption at rest is still active with new key...") + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Validate encryption at rest is still active + r.ValidateEncryptionAtRest(t, cluster, nil) + + // Verify new secret exists and is being used + newSecretData, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "secret", "cmek-key-secret-new", "-o", "jsonpath={.data.StoreKeyData}") + require.NoError(t, err) + t.Logf("New secret key length: %d", len(newSecretData)) + + // Verify old secret still exists (referenced in oldKeySecretName) + oldSecretData, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "secret", "cmek-key-secret", "-o", "jsonpath={.data.StoreKeyData}") + require.NoError(t, err) + t.Logf("Old secret key length: %d", len(oldSecretData)) + + // Verify pod command contains encryption flag with both key references + pods = k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + podName := pods[0].Name + + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.Contains(t, podCommand, "--enterprise-encryption", "Pod command should contain --enterprise-encryption flag") + + // Verify encryption flag format: key should point to new secret, old-key should point to old secret + require.Contains(t, podCommand, "key=/etc/cockroach-key/", "Encryption flag should contain new key path") + require.Contains(t, podCommand, "old-key=/etc/cockroach-key/", "Encryption flag should contain old key path for rotation") + t.Logf("Verified encryption flag format with both new key and old-key during rotation") + t.Logf("Encryption flag after key rotation: %s", podCommand) + + t.Log("Encryption at rest successfully working with rotated key") + t.Logf("Encryption at rest modify secret test completed successfully") +} + +// TestWALFailoverWithEncryption tests WAL failover with encryption at rest by: +// 1. Installing CockroachDB with both WAL failover and encryption at rest enabled +// 2. Verifying the cluster is healthy +// 3. Verifying --wal-failover flag with custom path +// 4. Verifying --enterprise-encryption flag includes WAL path encryption +// Note: When WAL failover is enabled with encryption at rest, the WAL path must also be encrypted. +func (r *singleRegion) TestWALFailoverWithEncryption(t *testing.T) { + cluster := r.Clusters[0] + cleanup := r.SetupSingleClusterWithCA(t, cluster) + defer cleanup() + + // Step 1: Generate encryption key + t.Log("Generating 256-bit AES encryption key...") + encryptionKeyB64 := r.GenerateEncryptionKey(t) + t.Logf("Generated encryption key (base64 length: %d)", len(encryptionKeyB64)) + + // Configure encryption at rest regions + encryptionRegions := r.BuildEncryptionRegions(cluster, 0, nil) + + // Install CockroachDB with both WAL failover and encryption enabled + config := operator.AdvancedInstallConfig{ + WALFailoverEnabled: true, + WALFailoverSize: "5Gi", + EncryptionEnabled: true, + EncryptionKeySecret: encryptionKeyB64, + CustomRegions: encryptionRegions, + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, config) + + // Validate CockroachDB cluster is healthy + r.ValidateCRDB(t, cluster) + + // Step 2: Verify WAL failover is configured + t.Log("Verifying WAL failover configuration...") + kubeConfig, _ := r.GetCurrentContext(t) + kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster]) + + pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{ + LabelSelector: operator.LabelSelector, + }) + require.True(t, len(pods) > 0, "No CockroachDB pods found") + podName := pods[0].Name + + // Verify --wal-failover flag exists + podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pod", podName, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}") + require.NoError(t, err) + require.Contains(t, podCommand, "--wal-failover=path=/cockroach/cockroach-wal-failover", + "Pod command should contain --wal-failover flag with custom path") + + // Step 3: Verify encryption is configured for both data store and WAL path + t.Log("Verifying encryption is configured for data store and WAL path...") + // Encryption flag is part of the command + require.Contains(t, podCommand, "--enterprise-encryption", + "Pod command should contain --enterprise-encryption flag") + + // The encryption flag should reference the WAL failover path + // Format: --enterprise-encryption=path=cockroach-data,key=...,old-key=plain;path=/cockroach/cockroach-wal-failover,key=...,old-key=plain + require.Contains(t, podCommand, "cockroach-data", + "Encryption flag should include data store path") + require.Contains(t, podCommand, "/cockroach/cockroach-wal-failover", + "Encryption flag should include WAL failover path for encryption") + + // Verify encryption flag format: both data and WAL paths should have key=/etc/cockroach-key/ and old-key=plain + require.Contains(t, podCommand, "key=/etc/cockroach-key/", "Encryption flag should contain key path for both data and WAL") + require.Contains(t, podCommand, "old-key=plain", "Encryption flag should have old-key=plain for initial setup") + t.Logf("Verified encryption flag format with key and old-key=plain for both data store and WAL path") + + // Commented out cockroach debug verification commands + //t.Log("Verifying encryption status for both paths...") + //// Verify data store encryption + //dataEncryptionStatus, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + // "exec", podName, "-c", "cockroachdb", "--", + // "/cockroach/cockroach", "debug", "encryption-status", "/cockroach/cockroach-data") + //require.NoError(t, err) + //require.Contains(t, dataEncryptionStatus, "Active", "Data store should show active encryption") + // + //// Verify WAL path encryption + //walEncryptionStatus, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + // "exec", podName, "-c", "cockroachdb", "--", + // "/cockroach/cockroach", "debug", "encryption-status", "/cockroach/cockroach-wal-failover") + //require.NoError(t, err) + //require.Contains(t, walEncryptionStatus, "Active", "WAL failover path should show active encryption") + + // Verify both PVCs exist + pvcs, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, + "get", "pvc", "-o", "jsonpath={.items[*].metadata.name}") + require.NoError(t, err) + require.Contains(t, pvcs, "datadir", "Data PVC should exist") + require.Contains(t, pvcs, "datadir-wal-failover", "WAL failover PVC should exist") + + t.Log("WAL failover with encryption at rest test completed successfully") + t.Logf("Verified both data store and WAL path are encrypted") +} + +// TestPCR tests Physical Cluster Replication by: +// 1. Installing a primary virtual cluster +// 2. Installing a standby virtual cluster in separate namespace +// 3. Creating replication stream between primary and standby +// 4. Running workload on primary +// 5. Verifying cutover to standby +func (r *singleRegion) TestPCR(t *testing.T) { + cluster := r.Clusters[0] + + // Create CA certificate once for both clusters + cleanupCA := r.RequireCACertificate(t) + defer cleanupCA() + + var ( + primaryNamespace string + standbyNamespace string + ) + + // Step 1: Install primary virtual cluster + t.Log("Installing primary virtual cluster...") + primaryNamespace = fmt.Sprintf("%s-primary-%s", operator.Namespace, strings.ToLower(random.UniqueId())) + r.Namespace[cluster] = primaryNamespace + + primaryConfig := operator.AdvancedInstallConfig{ + VirtualClusterMode: "primary", + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, primaryConfig) + + r.VirtualClusterModePrimary = true + r.ValidateCRDB(t, cluster) + r.VirtualClusterModePrimary = false + + // Step 2: Install standby virtual cluster in separate namespace + t.Log("Installing standby virtual cluster...") + standbyNamespace = fmt.Sprintf("%s-standby-%s", operator.Namespace, strings.ToLower(random.UniqueId())) + r.Namespace[cluster] = standbyNamespace + + standbyConfig := operator.AdvancedInstallConfig{ + VirtualClusterMode: "standby", + SkipOperatorInstall: true, + } + r.InstallChartsWithAdvancedConfig(t, cluster, 0, standbyConfig) + + r.VirtualClusterModeStandby = true + r.ValidateCRDB(t, cluster) + r.VirtualClusterModeStandby = false + + // Step 3: Set up replication and test failover/failback + r.ValidatePCR(t, &operator.AdvancedValidationConfig{ + PCR: operator.PCRValidation{ + Cluster: cluster, + PrimaryNamespace: primaryNamespace, + StandbyNamespace: standbyNamespace, + }, + }) + + // Cleanup both namespaces + defer func() { + r.Namespace[cluster] = primaryNamespace + r.CleanupResources(t) + }() + defer func() { + kubectlOptions := k8s.NewKubectlOptions("", "", standbyNamespace) + k8s.DeleteNamespace(t, kubectlOptions, standbyNamespace) + }() + + t.Logf("PCR (Physical Cluster Replication) test completed successfully") +} diff --git a/tests/e2e/operator/singleRegion/cockroachdb_single_region_e2e_test.go b/tests/e2e/operator/singleRegion/cockroachdb_single_region_e2e_test.go index 46491822..d6f2f0ae 100644 --- a/tests/e2e/operator/singleRegion/cockroachdb_single_region_e2e_test.go +++ b/tests/e2e/operator/singleRegion/cockroachdb_single_region_e2e_test.go @@ -18,9 +18,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// Environment variable name to check if running in nightly mode -const isNightlyEnvVar = "isNightly" - type singleRegion struct { operator.OperatorUseCases operator.Region @@ -82,14 +79,25 @@ func TestOperatorInSingleRegion(t *testing.T) { // Set up infrastructure for this provider once. cloudProvider.SetUpInfra(t) - testCases := map[string]func(*testing.T){ - "TestHelmInstall": providerRegion.TestHelmInstall, - "TestHelmInstallVirtualCluster": providerRegion.TestHelmInstallVirtualCluster, - "TestHelmUpgrade": providerRegion.TestHelmUpgrade, - "TestClusterRollingRestart": providerRegion.TestClusterRollingRestart, - "TestKillingCockroachNode": providerRegion.TestKillingCockroachNode, - "TestClusterScaleUp": func(t *testing.T) { providerRegion.TestClusterScaleUp(t, cloudProvider) }, - "TestInstallWithCertManager": providerRegion.TestInstallWithCertManager, + testCases := make(map[string]func(*testing.T)) + + // Run only advanced test cases when TEST_ADVANCED_FEATURES is enabled + if os.Getenv("TEST_ADVANCED_FEATURES") == "true" { + testCases["TestWALFailover"] = providerRegion.TestWALFailover + testCases["TestWALFailoverDisable"] = providerRegion.TestWALFailoverDisable + testCases["TestEncryptionAtRestEnable"] = providerRegion.TestEncryptionAtRestEnable + testCases["TestEncryptionAtRestDisable"] = providerRegion.TestEncryptionAtRestDisable + testCases["TestEncryptionAtRestModifySecret"] = providerRegion.TestEncryptionAtRestModifySecret + testCases["TestWALFailoverWithEncryption"] = providerRegion.TestWALFailoverWithEncryption + testCases["TestPCR"] = providerRegion.TestPCR + } else { + testCases["TestHelmInstall"] = providerRegion.TestHelmInstall + testCases["TestHelmInstallVirtualCluster"] = providerRegion.TestHelmInstallVirtualCluster + testCases["TestHelmUpgrade"] = providerRegion.TestHelmUpgrade + testCases["TestClusterRollingRestart"] = providerRegion.TestClusterRollingRestart + testCases["TestKillingCockroachNode"] = providerRegion.TestKillingCockroachNode + testCases["TestClusterScaleUp"] = func(t *testing.T) { providerRegion.TestClusterScaleUp(t, cloudProvider) } + testCases["TestInstallWithCertManager"] = providerRegion.TestInstallWithCertManager } // Run tests sequentially within a provider. @@ -111,7 +119,6 @@ func TestOperatorInSingleRegion(t *testing.T) { t.Logf("Infrastructure cleanup completed due to test failure") } }() - method(t) }) } @@ -142,7 +149,6 @@ func (r *singleRegion) TestHelmInstall(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster r.ValidateCRDB(t, cluster) } @@ -206,7 +212,6 @@ func (r *singleRegion) TestHelmInstallVirtualCluster(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster r.ValidateCRDB(t, cluster) @@ -250,7 +255,6 @@ func (r *singleRegion) TestHelmUpgrade(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) @@ -314,7 +318,6 @@ func (r *singleRegion) TestClusterRollingRestart(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) @@ -393,7 +396,6 @@ func (r *singleRegion) TestKillingCockroachNode(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) @@ -417,7 +419,6 @@ func (r *singleRegion) TestKillingCockroachNode(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) @@ -446,7 +447,6 @@ func (r *singleRegion) TestClusterScaleUp(t *testing.T, cloudProvider infra.Clou if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster // Validate CockroachDB cluster. r.ValidateCRDB(t, cluster) @@ -504,7 +504,5 @@ func (r *singleRegion) TestInstallWithCertManager(t *testing.T) { if _, ok := rawConfig.Contexts[cluster]; !ok { t.Fatalf("cluster context '%s' not found in kubeconfig", cluster) } - rawConfig.CurrentContext = cluster r.ValidateCRDB(t, cluster) - }