From 233203defbcf894c84bbdf3fe1fb77e00380834c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 7 Jul 2025 15:13:38 -0700 Subject: [PATCH] Add schema changes for quiescing --- nexus/db-model/src/db_metadata.rs | 76 ++- nexus/db-model/src/schema_versions.rs | 10 +- .../src/db/datastore/db_metadata.rs | 560 ++++++++++++++++-- nexus/db-queries/src/db/datastore/mod.rs | 52 +- nexus/db-schema/src/schema.rs | 2 + nexus/src/bin/schema-updater.rs | 8 +- schema/crdb/README.adoc | 30 + schema/crdb/db-metadata-quiesce/up01.sql | 3 + schema/crdb/db-metadata-quiesce/up02.sql | 3 + schema/crdb/dbinit.sql | 35 +- 10 files changed, 711 insertions(+), 68 deletions(-) create mode 100644 schema/crdb/db-metadata-quiesce/up01.sql create mode 100644 schema/crdb/db-metadata-quiesce/up02.sql diff --git a/nexus/db-model/src/db_metadata.rs b/nexus/db-model/src/db_metadata.rs index de7e2862eb..b28742d87c 100644 --- a/nexus/db-model/src/db_metadata.rs +++ b/nexus/db-model/src/db_metadata.rs @@ -7,12 +7,16 @@ use chrono::{DateTime, Utc}; use nexus_db_schema::schema::db_metadata; use serde::{Deserialize, Serialize}; -/// Internal database metadata +/// These fields of "db_metadata" have been stable since the initial +/// release of the database. +/// +/// For backwards-compatibility purposes, they can be loaded separately +/// from DbMetadata. #[derive( Queryable, Insertable, Debug, Clone, Selectable, Serialize, Deserialize, )] #[diesel(table_name = db_metadata)] -pub struct DbMetadata { +pub struct DbMetadataBase { singleton: bool, time_created: DateTime, time_modified: DateTime, @@ -20,16 +24,78 @@ pub struct DbMetadata { target_version: Option, } +impl DbMetadataBase { + pub fn version(&self) -> &SemverVersion { + &self.version + } +} + +/// Internal database metadata +#[derive( + Queryable, Insertable, Debug, Clone, Selectable, Serialize, Deserialize, +)] +#[diesel(table_name = db_metadata)] +pub struct DbMetadata { + #[diesel(embed)] + base: DbMetadataBase, + quiesce_started: bool, + quiesce_completed: bool, +} + impl DbMetadata { + pub fn from_base(base: DbMetadataBase, quiesced: bool) -> Self { + Self { base, quiesce_started: quiesced, quiesce_completed: quiesced } + } + pub fn time_created(&self) -> &DateTime { - &self.time_created + &self.base.time_created } pub fn time_modified(&self) -> &DateTime { - &self.time_modified + &self.base.time_modified } pub fn version(&self) -> &SemverVersion { - &self.version + &self.base.version + } + + pub fn target_version(&self) -> Option<&SemverVersion> { + self.base.target_version.as_ref() + } + + pub fn quiesce_started(&self) -> bool { + self.quiesce_started + } + + pub fn quiesce_completed(&self) -> bool { + self.quiesce_completed + } +} + +#[derive(AsChangeset)] +#[diesel(table_name = db_metadata)] +pub struct DbMetadataUpdate { + time_modified: DateTime, + version: String, + #[diesel(treat_none_as_null = true)] + target_version: Option, + quiesce_started: Option, + quiesce_completed: Option, +} + +impl DbMetadataUpdate { + pub fn update_to_version(version: semver::Version) -> Self { + Self { + time_modified: Utc::now(), + version: version.to_string(), + target_version: None, + quiesce_started: None, + quiesce_completed: None, + } + } + + pub fn clear_quiesce(&mut self) { + self.quiesce_started = Some(false); + self.quiesce_completed = Some(false); } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 1507aeab46..751227431c 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(173, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(174, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(174, "db-metadata-quiesce"), KnownVersion::new(173, "inv-internal-dns"), KnownVersion::new(172, "add-zones-with-mupdate-override"), KnownVersion::new(171, "inv-clear-mupdate-override"), @@ -217,6 +218,13 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { /// The earliest supported schema version. pub const EARLIEST_SUPPORTED_VERSION: Version = Version::new(1, 0, 0); +/// The version of the schema where "quisce" fields were added +/// to db_metadata. +/// +/// omicron.public.db_metadata is read a part of performing schema changes, +/// so this version is treated specially for backwards compatibiliy. +pub const QUIESCE_VERSION: Version = Version::new(174, 0, 0); + /// Describes one version of the database schema #[derive(Debug, Clone)] struct KnownVersion { diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index dbc1de5857..86710c2ab6 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -5,13 +5,16 @@ //! [`DataStore`] methods on Database Metadata. use super::DataStore; -use anyhow::{Context, bail, ensure}; +use anyhow::{Context, anyhow}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_model::AllSchemaVersions; +use nexus_db_model::DbMetadata; +use nexus_db_model::DbMetadataBase; +use nexus_db_model::DbMetadataUpdate; use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; @@ -99,37 +102,115 @@ fn skippable_version( return false; } +/// Reasons why [DataStore::ensure_schema] might fail +#[derive(thiserror::Error, Debug)] +pub enum EnsureSchemaError { + /// This Nexus is too old to understand the schema + /// + /// This may happen when an old Nexus is booting / rebooting during + /// a handoff to a newer Nexus. + #[error("Nexus is too old to use this schema; handoff: {handoff}")] + NexusTooOld { + /// Should this Nexus still try to participate in handoff to a newer + /// Nexus? + /// + /// If "no", then we should terminate. + handoff: bool, + }, + + /// This Nexus could upgrade, but is waiting for quiesce to complete + #[error("Nexus is waiting for older Nexus versions to quiesce")] + WaitingForQuiesce, + + /// The schema does not match, and we aren't considering changing it + #[error( + "DB schema {found} < Nexus schema {desired}, but schema update is disabled" + )] + SchemaUpdateDisabled { found: Version, desired: Version }, + + /// During schema upgrade, we couldn't find the necessary upgrade steps + #[error("Schema update from {found} -> {desired} missing step: {missing}")] + SchemaUpdateStepMissing { + found: Version, + desired: Version, + missing: Version, + }, + + /// Any other reason + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +/// Describes whether or not [DataStore::ensure_schema] will proceed +/// with an update. +pub enum UpdateConfiguration<'a> { + /// The update will not be triggered + Disabled, + + /// If the schema update is possible, it will trigger + Enabled { + /// Known schema versions, and ways to upgrade between them + all_versions: &'a AllSchemaVersions, + + /// If "true", ignore the quiesce status in the database + /// + /// This can be used to ignore the work of ongoing Nexuses which may be + /// shutting down, and should only be set to "true" cautiously (e.g., + /// this is used by the schema-updater binary, which can be used by + /// operators). + /// + /// For most cases, this should be set to "false". + ignore_quiesce: bool, + }, +} + impl DataStore { - // Ensures that the database schema matches "desired_version". - // - // - Updating the schema makes the database incompatible with older - // versions of Nexus, which are not running "desired_version". - // - This is a one-way operation that cannot be undone. - // - The caller is responsible for ensuring that the new version is valid, - // and that all running Nexus instances can understand the new schema - // version. - // - // TODO: This function assumes that all concurrently executing Nexus - // instances on the rack are operating on the same version of software. - // If that assumption is broken, nothing would stop a "new deployment" - // from making a change that invalidates the queries used by an "old - // deployment". + /// Ensures that the database schema matches "desired_version". + /// + /// The `config` argument can be used to upgrade the schema to the desired + /// version (if "Enabled") or can be used as a read-only check to validate + /// the schema (if "Disabled"). + /// + /// - Updating the schema makes the database incompatible with older + /// versions of Nexus, which are not running "desired_version". + /// - This is a one-way operation that cannot be undone. + /// - The caller is responsible for ensuring that the new version is valid, + /// and that all running Nexus instances can understand the new schema + /// version. pub async fn ensure_schema( &self, log: &Logger, desired_version: Version, - all_versions: Option<&AllSchemaVersions>, - ) -> Result<(), anyhow::Error> { - let (found_version, found_target_version) = self - .database_schema_version() + config: UpdateConfiguration<'_>, + ) -> Result<(), EnsureSchemaError> { + let db_metadata = self + .database_metadata() .await .context("Cannot read database schema version")?; + let found_version: Version = db_metadata.version().clone().into(); + let found_target_version: Option = + db_metadata.target_version().map(|v| v.clone().into()); + let quiesce_started = db_metadata.quiesce_started(); + let quiesce_completed = db_metadata.quiesce_completed(); + let log = log.new(o!( "found_version" => found_version.to_string(), "desired_version" => desired_version.to_string(), + "quiesce_started" => quiesce_started, + "quiesce_completed" => quiesce_completed, )); + // This case is effectively: Quiesce has already completed, AND the + // schema migration has already been marked as completed. + if found_version > desired_version { + warn!( + log, + "Found schema version is newer than desired schema version"; + ); + return Err(EnsureSchemaError::NexusTooOld { handoff: false }); + } + // NOTE: We could run with a less tight restriction. // // If we respect the meaning of the semver version, it should be @@ -140,31 +221,50 @@ impl DataStore { // not exactly match the schema version, we refuse to continue without // modification. if found_version == desired_version { + if quiesce_completed { + warn!( + log, + "Schema version is up-to-date, but quiescing has completed" + ); + return Err(EnsureSchemaError::NexusTooOld { handoff: false }); + } + if quiesce_started { + warn!( + log, + "Schema version is up-to-date, but quiescing is in-progress" + ); + return Err(EnsureSchemaError::NexusTooOld { handoff: true }); + } info!(log, "Database schema version is up to date"); return Ok(()); } - if found_version > desired_version { - error!( - log, - "Found schema version is newer than desired schema version"; - ); - bail!( - "Found schema version ({}) is newer than desired schema \ - version ({})", - found_version, - desired_version, - ) - } + // At this point, we know the "found_version < desired_version". - let Some(all_versions) = all_versions else { - error!( + let (all_versions, ignore_quiesce) = match config { + UpdateConfiguration::Disabled => { + error!( + log, + "Database schema version is out of date, but automatic \ + update is disabled", + ); + return Err(EnsureSchemaError::SchemaUpdateDisabled { + found: found_version.clone(), + desired: desired_version.clone(), + }); + } + UpdateConfiguration::Enabled { all_versions, ignore_quiesce } => { + (all_versions, ignore_quiesce) + } + }; + + if !ignore_quiesce && !quiesce_completed { + warn!( log, - "Database schema version is out of date, but automatic update \ - is disabled", + "Schema version is older than desired; waiting for quiesce to complete"; ); - bail!("Schema is out of date but automatic update is disabled"); - }; + return Err(EnsureSchemaError::WaitingForQuiesce); + } // If we're here, we know the following: // @@ -172,16 +272,22 @@ impl DataStore { // didn't when we read it moments ago). // - We should attempt to automatically upgrade the schema. info!(log, "Database schema is out of date. Attempting upgrade."); - ensure!( - all_versions.contains_version(&found_version), - "Found schema version {found_version} was not found", - ); + if !all_versions.contains_version(&found_version) { + return Err(EnsureSchemaError::SchemaUpdateStepMissing { + found: found_version.clone(), + desired: desired_version.clone(), + missing: found_version.clone(), + }); + } // TODO: Test this? - ensure!( - all_versions.contains_version(&desired_version), - "Desired version {desired_version} was not found", - ); + if !all_versions.contains_version(&desired_version) { + return Err(EnsureSchemaError::SchemaUpdateStepMissing { + found: found_version.clone(), + desired: desired_version.clone(), + missing: desired_version.clone(), + }); + } let target_versions: Vec<&SchemaVersion> = all_versions .versions_range(( @@ -204,7 +310,10 @@ impl DataStore { // For the rationale here, see: StepSemverVersion::new. if target_version.semver().pre != semver::Prerelease::EMPTY { - bail!("Cannot upgrade to version which includes pre-release"); + return Err(anyhow!( + "Cannot upgrade to version which includes pre-release" + ) + .into()); } // Iterate over each individual file that comprises a schema change. @@ -261,11 +370,24 @@ impl DataStore { // Now that the schema change has completed, set the following: // - db_metadata.version = new version // - db_metadata.target_version = NULL + // - db_metadata.quiesce_started = false + // - db_metadata.quiesce_completed = false let last_step_version = last_step_version .ok_or_else(|| anyhow::anyhow!("Missing final step version"))?; - self.finalize_schema_update(¤t_version, &last_step_version) - .await - .context("Failed to finalize schema update")?; + + // We will keep "quiesced" set to true up until the last update. + // + // This means that if we crash and reboot partway through an update, + // we'll be able to resume it - Nexuses are still quiesced, and + // should not need to re-negotiate. + let clear_quiesced = *target_version.semver() == desired_version; + self.finalize_schema_update( + ¤t_version, + &last_step_version, + clear_quiesced, + ) + .await + .context("Failed to finalize schema update")?; info!( log, @@ -340,6 +462,54 @@ impl DataStore { Ok(()) } + pub async fn database_metadata(&self) -> Result { + use nexus_db_schema::schema::db_metadata::dsl; + + // The "quiesced" fields do not exist on all deployed schemas, + // so we read "DbMetadataBase" before trying to read the rest of the + // columns. + // + // If we don't have these fields yet, read the more stable portions of + // db_metadata, and treat the system as "already quiesced". This is + // necessary for some of our backwards-compatibility testing. + // + // TODO: Once we've sufficiently updated in-field devices, we should + // be able to read "DbMetadata" entirely, and avoid reading + // "DbMetadataBase" altogether. + let base = dsl::db_metadata + .filter(dsl::singleton.eq(true)) + .select(DbMetadataBase::as_select()) + .get_result_async(&*self.pool_connection_unauthorized().await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + if base.version().0 < nexus_db_model::QUIESCE_VERSION { + // Why are we treating this case as "quiesced = true"? + // + // - If Nexus wants to be running on a schema older than + // QUIESCE_VERSION, it's older than this code. As a part of our + // operator-driven update process, these Nexuses will be stopped + // manually by an operator during MUPdate. + // - If Nexus wants to be running on a schema newer than or equal to + // QUIESCE_VERSION, but it sees an older schema in the DB, then it + // is trying to perform an update (as in our tests) or waiting + // for an operator to run the schema-updater. In either of these + // cases, Nexus is assuming no older versions of Nexus are running. + // + // Note that once we've upgraded past QUIESCE_VERSION, this + // conditional becomes irrelevant, and we can rely on a more + // sophisticated model of "old Nexus" -> "new Nexus" handoff. + let quiesced = true; + return Ok(DbMetadata::from_base(base, quiesced)); + } + + dsl::db_metadata + .filter(dsl::singleton.eq(true)) + .select(DbMetadata::as_select()) + .get_result_async(&*self.pool_connection_unauthorized().await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + pub async fn database_schema_version( &self, ) -> Result<(Version, Option), Error> { @@ -456,27 +626,36 @@ impl DataStore { // Completes a schema migration, upgrading to the new version. // + // This also allows clearing the "quiesce started/completed" state, as + // Nexuses on the of the *new* version are not quiescing. + // // - from_version: What we expect "version" must be to proceed // - last_step: What we expect "target_version" must be to proceed. async fn finalize_schema_update( &self, from_version: &Version, last_step: &StepSemverVersion, + clear_quiesced: bool, ) -> Result<(), Error> { use nexus_db_schema::schema::db_metadata::dsl; let to_version = last_step.without_prerelease(); + let mut updates = + DbMetadataUpdate::update_to_version(to_version.clone()); + + // We only try to clear the "quiesced" fields (setting them to false) + // if we think the schema knows what those fields are. + if clear_quiesced && to_version >= nexus_db_model::QUIESCE_VERSION { + updates.clear_quiesce(); + } + let rows_updated = diesel::update( dsl::db_metadata .filter(dsl::singleton.eq(true)) .filter(dsl::version.eq(from_version.to_string())) .filter(dsl::target_version.eq(last_step.version.to_string())), ) - .set(( - dsl::time_modified.eq(Utc::now()), - dsl::version.eq(to_version.to_string()), - dsl::target_version.eq(None as Option), - )) + .set(updates) .execute_async(&*self.pool_connection_unauthorized().await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -509,7 +688,11 @@ mod test { let datastore = db.datastore(); datastore - .ensure_schema(&logctx.log, SCHEMA_VERSION, None) + .ensure_schema( + &logctx.log, + SCHEMA_VERSION, + UpdateConfiguration::Disabled, + ) .await .expect("Failed to ensure schema"); @@ -609,6 +792,13 @@ mod test { [&v0, &v1, &v2].into_iter(), ) .expect("failed to load schema"); + + // Mark quiescing as completed to allow the migration to proceed + let quiesce_started = true; + let quiesce_completed = true; + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + let _ = futures::future::join_all((0..10).map(|_| { let all_versions = all_versions.clone(); let log = log.clone(); @@ -650,6 +840,7 @@ mod test { .collect::, _>>() .expect("Failed to create datastore"); + datastore.terminate().await; db.terminate().await; logctx.cleanup_successful(); } @@ -743,8 +934,18 @@ mod test { // Manually construct the datastore to avoid the backoff timeout. // We want to trigger errors, but have no need to wait. let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let quiesce_started = true; + let quiesce_completed = true; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; while let Err(e) = datastore - .ensure_schema(&log, SCHEMA_VERSION, Some(&all_versions)) + .ensure_schema( + &log, + SCHEMA_VERSION, + UpdateConfiguration::Enabled { + all_versions: &all_versions, + ignore_quiesce: false, + }, + ) .await { warn!(log, "Failed to ensure schema"; "err" => %e); @@ -771,4 +972,251 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + async fn set_quisece( + datastore: &DataStore, + started: bool, + completed: bool, + ) { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + diesel::dsl::sql::(&format!( + "UPDATE omicron.public.db_metadata \ + SET \ + quiesce_started = {started}, \ + quiesce_completed = {completed} \ + WHERE singleton = TRUE" + )) + .execute_async(&*conn) + .await + .expect("Failed to update metadata"); + } + + #[tokio::test] + async fn ensure_schema_with_old_nexus() { + let logctx = dev::test_setup_log("ensure_schema_with_old_nexus"); + let log = &logctx.log; + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + + // The contents don't really matter here - just make the schema appear + // "old". + let mut old_schema = SCHEMA_VERSION; + old_schema.major -= 1; + + // Case: Version matches the DB, and quiesce has started. + { + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let quiesce_started = true; + let quiesce_completed = false; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + + let Err(err) = datastore + .ensure_schema( + &log, + SCHEMA_VERSION, + UpdateConfiguration::Disabled, + ) + .await + else { + panic!("Ensuring schema mid-quiesce should fail"); + }; + assert!( + matches!(err, EnsureSchemaError::NexusTooOld { handoff: true },), + "Unexpected error: {err:?} (expected: Nexus too old, with handoff)", + ); + } + + // Case: Version matches the DB, and quiesce has completed. + { + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let quiesce_started = true; + let quiesce_completed = true; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + + let Err(err) = datastore + .ensure_schema( + &log, + SCHEMA_VERSION, + UpdateConfiguration::Disabled, + ) + .await + else { + panic!("Ensuring schema post-quiesce should fail"); + }; + assert!( + matches!( + err, + EnsureSchemaError::NexusTooOld { handoff: false }, + ), + "Unexpected error: {err:?} (expected: Nexus too old, with no handoff)", + ); + } + + // Case: The schema we request is older than what's in the database. + // + // This is what would happen if an old Nexus booted after a schema + // migration already completed. + { + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let quiesce_started = false; + let quiesce_completed = false; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + + let Err(err) = datastore + .ensure_schema(&log, old_schema, UpdateConfiguration::Disabled) + .await + else { + panic!("Ensuring old schema should fail"); + }; + assert!( + matches!( + err, + EnsureSchemaError::NexusTooOld { handoff: false }, + ), + "Unexpected error: {err:?} (expected: Nexus too old, with no handoff)", + ); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn ensure_schema_waits_for_quiesce() { + let logctx = dev::test_setup_log("ensure_schema_waits_for_quiesce"); + let log = &logctx.log; + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + + let old_schema = SCHEMA_VERSION; + let mut new_schema = SCHEMA_VERSION; + new_schema.major += 1; + + let config_dir = Utf8TempDir::new().unwrap(); + add_upgrade(config_dir.path(), old_schema.clone(), "SELECT true;") + .await; + add_upgrade(config_dir.path(), new_schema.clone(), "SELECT true;") + .await; + let all_versions = AllSchemaVersions::load_specific_legacy_versions( + config_dir.path(), + [&old_schema, &new_schema].into_iter(), + ) + .expect("failed to load schema"); + + // Load the new version, before quiescing started + // + // This is the case where a new Nexus has been started, but old + // Nexuses have not yet started quiescing. + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let quiesce_started = false; + let quiesce_completed = false; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + + let Err(err) = datastore + .ensure_schema( + &log, + new_schema.clone(), + UpdateConfiguration::Enabled { + all_versions: &all_versions, + ignore_quiesce: false, + }, + ) + .await + else { + panic!("Ensuring schema pre-quiesce should fail"); + }; + assert!( + matches!(err, EnsureSchemaError::WaitingForQuiesce), + "Unexpected error: {err:?} (expected: Wait for Quiesce)", + ); + + // Load the new version, but while quiesce is in-progress + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let quiesce_started = true; + let quiesce_completed = false; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + + let Err(err) = datastore + .ensure_schema( + &log, + new_schema.clone(), + UpdateConfiguration::Enabled { + all_versions: &all_versions, + ignore_quiesce: false, + }, + ) + .await + else { + panic!("Ensuring schema mid-quiesce should fail"); + }; + assert!( + matches!(err, EnsureSchemaError::WaitingForQuiesce), + "Unexpected error: {err:?} (expected: Wait for Quiesce)", + ); + + // Let's suppose that later on, quiesce completes. + // + // This should let us proceed with the update. + let quiesce_started = true; + let quiesce_completed = true; + set_quisece(&datastore, quiesce_started, quiesce_completed).await; + datastore + .ensure_schema( + &log, + new_schema.clone(), + UpdateConfiguration::Enabled { + all_versions: &all_versions, + ignore_quiesce: false, + }, + ) + .await + .expect("Ensuring schema should have succeeded"); + + let db_metadata = datastore + .database_metadata() + .await + .expect("Should be able to read database metadata post-update"); + + // Since the update succeeded, we should be able to see: + // - We're using the new version + // - There is no "target version" + // - Quiescing state is cleared (set to "false") + assert_eq!( + semver::Version::from(db_metadata.version().clone()), + new_schema + ); + assert_eq!(db_metadata.target_version(), None); + assert_eq!(db_metadata.quiesce_started(), false); + assert_eq!(db_metadata.quiesce_completed(), false); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn ensure_schema_without_upgrade() { + let logctx = dev::test_setup_log("ensure_schema_without_upgrade"); + let log = &logctx.log; + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + + let mut new_schema = SCHEMA_VERSION; + new_schema.major += 1; + + // Load the new version, but don't enabled update. + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let Err(err) = datastore + .ensure_schema(&log, new_schema, UpdateConfiguration::Disabled) + .await + else { + panic!("Ensuring schema without enabling update should fail"); + }; + assert!( + matches!(err, EnsureSchemaError::SchemaUpdateDisabled { .. },), + "Unexpected error: {err:?} (expected: Schema Update Disabled)", + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 55f0e58222..f068efafcf 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -119,6 +119,7 @@ pub mod webhook_delivery; mod zpool; pub use address_lot::AddressLotCreateResult; +pub use db_metadata::UpdateConfiguration; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; pub use instance::{ @@ -240,20 +241,65 @@ impl DataStore { || async { if let Some(try_for) = try_for { if std::time::Instant::now() > start + try_for { - return Err(BackoffError::permanent(())); + return Err(BackoffError::permanent( + format!("Tried to update schema for {} ms; terminating", try_for.as_millis()) + )); } } + let config = if let Some(all_versions) = config { + crate::db::datastore::UpdateConfiguration::Enabled { + all_versions, + ignore_quiesce: false, + } + } else { + crate::db::datastore::UpdateConfiguration::Disabled + }; + match datastore .ensure_schema(&log, EXPECTED_VERSION, config) .await { Ok(()) => return Ok(()), + Err(db_metadata::EnsureSchemaError::NexusTooOld { handoff }) => { + if handoff { + // We are running an out-of-date schema, but still + // need to participate in handoff. + // + // TODO(https://github.com/oxidecomputer/omicron/issues/8501): + // This should be implemented by contacting other + // Nexus instances, verifying that quiescing has + // completed, and setting "quiesce_complete" once + // that has completed. + // + // NOTE: We shouldn't hit this condition until any + // deployed Nexuses start setting the "quiesce" + // booleans of omicron.public.db_metadata to true. + error!(log, "Schema handoff from old -> new Nexus is not yet implemented"); + return Err(BackoffError::permanent(format!( + "Nexus @ schema {EXPECTED_VERSION} needs to \ + handoff, but not it is not implemented" + ))); + } else { + // We are running an out-of-date schema, and no + // longer need to participate in handoff (e.g., + // maybe we participated in handoff, rebooted, and + // the schema has progressed since then). + error!( + log, + "Our database schema appears too old"; + "version" => ?EXPECTED_VERSION + ); + return Err(BackoffError::permanent(format!( + "Schema {EXPECTED_VERSION:?} is too old" + ))); + } + } Err(e) => { - warn!(log, "Failed to ensure schema version"; "error" => #%e); + warn!(log, "Failed to ensure schema version"; "error" => ?e); + return Err(BackoffError::transient(e.to_string())); } }; - return Err(BackoffError::transient(())); }, |_, _| {}, ) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 8ef398e44d..949ca94f12 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2333,6 +2333,8 @@ table! { time_modified -> Timestamptz, version -> Text, target_version -> Nullable, + quiesce_started -> Bool, + quiesce_completed -> Bool, } } diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index ead8554f73..b6fff84d1d 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -108,8 +108,14 @@ async fn main_impl() -> anyhow::Result<()> { } Cmd::Upgrade { version } => { println!("Upgrading to {version}"); + + let config = db::datastore::UpdateConfiguration::Enabled { + all_versions: &all_versions, + ignore_quiesce: true, + }; + datastore - .ensure_schema(&log, version.clone(), Some(&all_versions)) + .ensure_schema(&log, version.clone(), config) .await .map_err(|e| anyhow!(e))?; println!("Upgrade to {version} complete"); diff --git a/schema/crdb/README.adoc b/schema/crdb/README.adoc index a9c437bc9a..4b58c53053 100644 --- a/schema/crdb/README.adoc +++ b/schema/crdb/README.adoc @@ -52,6 +52,36 @@ fall-back pathway for performing upgrades. See RFD 319 for more discussion of the online upgrade plans. +== Self-Service Schema Upgrade + +Nexus is currently performing schema migrations using an <> +mechanism. As a part of this process, there will be a period where we expect +downtime, and where we must prevent access to the database for the duration of +the upgrade. + +Historically, these updates have taken the following form: + +* Nexus boots, and reads the `version` field of `omicron.public.db_metadata`. +* If this version matches what Nexus was compiled against, it can use the database. +* If this version does not match what Nexus expects, it waits for the schema to be + updated, and periodically re-checks this value. +* An operator (human) would need to manually execute the `schema-updater` binary + (link:../nexus/src/bin/schema-updater.rs[Source]) to upgrade the on-disk schema. + +As we've discussed in RFD 527, we're migrating to a more nuanced model for the +purposes of self-service upgrade. In this model: + +* Upgrading will involve running multiple versions of Nexus concurrently. +* Only one version of Nexus will have unconstrained access to the database at a time. +* During upgrade from version N to N+1, Nexus zones running at version N will + **quiesce** their access to the database. Once this quiescing process is complete, + handoff to the Nexuses at version N+1 can proceed. +* Once handoff occurs, the Nexus at version N+1 can automatically apply schema updates, + just as operators had manually been performing with the `schema-updater` tool. +* Afterwards, the Nexuses operating at N+1 can resume normal operation. +* This process can repeat indefinitely, when a Nexus with version N+2 appears and forces + the Nexuses at version N+1 to quiesce. + == How to change the schema Assumptions: diff --git a/schema/crdb/db-metadata-quiesce/up01.sql b/schema/crdb/db-metadata-quiesce/up01.sql new file mode 100644 index 0000000000..340dbdda01 --- /dev/null +++ b/schema/crdb/db-metadata-quiesce/up01.sql @@ -0,0 +1,3 @@ +ALTER TABLE omicron.public.db_metadata + ADD COLUMN IF NOT EXISTS quiesce_started BOOL NOT NULL DEFAULT FALSE, + ADD COLUMN IF NOT EXISTS quiesce_completed BOOL NOT NULL DEFAULT FALSE; diff --git a/schema/crdb/db-metadata-quiesce/up02.sql b/schema/crdb/db-metadata-quiesce/up02.sql new file mode 100644 index 0000000000..d73d9d7d9c --- /dev/null +++ b/schema/crdb/db-metadata-quiesce/up02.sql @@ -0,0 +1,3 @@ +ALTER TABLE omicron.public.db_metadata + ALTER COLUMN quiesce_started DROP DEFAULT, + ALTER COLUMN quiesce_completed DROP DEFAULT; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 3da3de08a8..968a95b65e 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5556,6 +5556,35 @@ CREATE TABLE IF NOT EXISTS omicron.public.db_metadata ( -- (Optional) Semver representation of the DB version to which we're upgrading target_version STRING(64), + -- During the nexus boot process, it must check this table to understand the + -- state of the Schema. + -- + -- There are a handful of states to consider: + -- + -- - "db.version = version in binary" + "quiesce_started = true": An + -- upgrade is in-progress, and the underlying database should not be used. + -- This Nexus should coordinate with other Nexus instances at this version + -- (via internal DNS) to set "quiesce_completed" to true. + -- - "db.version = version in binary" + "quiesce_completed = true" OR + -- "db.version > version in binary" + -- This Nexus should avoid accessing the database, but has no need to coordinate + -- with other Nexuses. + -- - "db.version < version in binary": + -- This Nexus should upgrade to the new schema, but only once "quiesce_completed" + -- is set to true. + -- Once the upgrade is complete, "quiesce_started" and "quiesce_completed" + -- should both be set to false. Note that multiple Nexuses may be attempting this + -- schema upgrade operation concurrently. + + + -- Identifies that a schema migration has started. + -- Nexuses with "db.version = version in binary" should not access the database any longer. + quiesce_started BOOL NOT NULL, + + -- Identifies that a schema migration is ready to hand-off from Old Nexus versions + -- to newer Nexus versions. + quiesce_completed BOOL NOT NULL, + CHECK (singleton = true) ); @@ -6340,9 +6369,11 @@ INSERT INTO omicron.public.db_metadata ( time_created, time_modified, version, - target_version + target_version, + quiesce_started, + quiesce_completed ) VALUES - (TRUE, NOW(), NOW(), '173.0.0', NULL) + (TRUE, NOW(), NOW(), '174.0.0', NULL, FALSE, FALSE) ON CONFLICT DO NOTHING; COMMIT;