Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6dcdd51
update quiesce states to reflect RFD 588
davepacheco Aug 20, 2025
330c721
self-review + regenerate API spec
davepacheco Aug 20, 2025
b273427
tests need to wait for sagas to be enabled
davepacheco Aug 20, 2025
845d371
need to activate blueprint loader after inserting initial blueprint
davepacheco Aug 21, 2025
78ee24f
add the "second" Nexus to the test suite blueprint; fix omdb tests
davepacheco Sep 2, 2025
8af16b6
review feedback
davepacheco Aug 22, 2025
48483d0
fix tests on GNU/Linux
davepacheco Aug 23, 2025
cdfafb0
fix end to end dns test
davepacheco Sep 2, 2025
f0a31b8
fix omdb test
davepacheco Aug 23, 2025
d2b1f68
add test that Nexus quiesces when reading a blueprint saying so
davepacheco Aug 25, 2025
3ce8d59
fixup conflict
davepacheco Sep 2, 2025
6c84ded
pull in BlueprintBuilder.set_nexus_generation()
davepacheco Sep 2, 2025
93baf41
quiesce needs to keep track of blueprint ids
davepacheco Aug 27, 2025
6d5e952
add test
davepacheco Aug 27, 2025
1ecea95
is_fully_drained() can be more private
davepacheco Aug 27, 2025
a1c52ba
update omdb
davepacheco Aug 27, 2025
791886a
omdb output tweaks
davepacheco Aug 27, 2025
6fa9d9d
review feedback
davepacheco Aug 28, 2025
1317dd7
review feedback
davepacheco Sep 2, 2025
f04c429
review feedback
davepacheco Sep 2, 2025
405943b
Merge branch 'dap/handoff-quiesce-1' into dap/handoff-quiesce-2
davepacheco Sep 3, 2025
b569f09
Merge commit '9eade0677ea09aeda4e807d15d86f3a3e6622976' into dap/hand…
davepacheco Sep 3, 2025
2d61323
Merge commit 'b11266905429b319220414f08af1cce902e30c48' into dap/hand…
davepacheco Sep 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 74 additions & 10 deletions dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ use chrono::TimeDelta;
use chrono::Utc;
use clap::Args;
use clap::Subcommand;
use nexus_client::types::PendingRecovery;
use nexus_client::types::QuiesceState;
use nexus_client::types::QuiesceStatus;
use nexus_client::types::SagaQuiesceStatus;
use std::time::Duration;

#[derive(Debug, Args)]
Expand All @@ -31,9 +34,9 @@ pub enum QuiesceCommands {

#[derive(Debug, Args)]
pub struct QuiesceShowArgs {
/// Show details about held database connections
/// Show stack traces for held database connections
#[clap(short, long, default_value_t = false)]
verbose: bool,
stacks: bool,
}

pub async fn cmd_nexus_quiesce(
Expand All @@ -60,7 +63,10 @@ async fn quiesce_show(
.await
.context("fetching quiesce state")?
.into_inner();
match quiesce.state {

let QuiesceStatus { db_claims, sagas, state } = quiesce;

match state {
QuiesceState::Undetermined => {
println!("has not yet determined if it is quiescing");
}
Expand Down Expand Up @@ -145,25 +151,83 @@ async fn quiesce_show(
}
}

println!("sagas running: {}", quiesce.sagas_pending.len());
for saga in &quiesce.sagas_pending {
let SagaQuiesceStatus {
sagas_pending,
drained_blueprint_id,
first_recovery_complete,
new_sagas_allowed,
reassignment_blueprint_id,
reassignment_generation,
reassignment_pending,
recovered_blueprint_id,
recovered_reassignment_generation,
recovery_pending,
} = sagas;

println!("saga quiesce:");
println!(" new sagas: {:?}", new_sagas_allowed);
println!(
" drained as of blueprint: {}",
drained_blueprint_id
.map(|s| s.to_string())
.as_deref()
.unwrap_or("none")
);
println!(
" blueprint for last completed recovery pass: {}",
recovered_blueprint_id
.map(|s| s.to_string())
.as_deref()
.unwrap_or("none")
);
println!(
" blueprint for last reassignment pass: {}",
reassignment_blueprint_id
.map(|s| s.to_string())
.as_deref()
.unwrap_or("none")
);
println!(
" reassignment generation: {} (pass running: {})",
reassignment_generation,
if reassignment_pending { "yes" } else { "no" }
);
println!(" recovered generation: {}", recovered_reassignment_generation);
println!(
" recovered at least once successfully: {}",
if first_recovery_complete { "yes" } else { "no" },
);
print!(" recovery pending: ");
if let Some(PendingRecovery { generation, blueprint_id }) = recovery_pending
{
println!(
"yes (generation {}, blueprint id {})",
generation,
blueprint_id.map(|s| s.to_string()).as_deref().unwrap_or("none")
);
} else {
println!("no");
}

println!(" sagas running: {}", sagas_pending.len());
for saga in &sagas_pending {
println!(
" saga {} pending since {} ({})",
" saga {} pending since {} ({})",
saga.saga_id,
humantime::format_rfc3339_millis(saga.time_pending.into()),
saga.saga_name
);
}

println!("database connections held: {}", quiesce.db_claims.len());
for claim in &quiesce.db_claims {
println!("database connections held: {}", db_claims.len());
for claim in &db_claims {
println!(
" claim {} held since {} ({} ago)",
claim.id,
claim.held_since,
format_time_delta(Utc::now() - claim.held_since),
);
if args.verbose {
if args.stacks {
println!(" acquired by:");
println!("{}", textwrap::indent(&claim.debug, " "));
}
Expand All @@ -177,7 +241,7 @@ async fn quiesce_start(
_token: DestructiveOperationToken,
) -> Result<(), anyhow::Error> {
client.quiesce_start().await.context("quiescing Nexus")?;
quiesce_show(client, &QuiesceShowArgs { verbose: false }).await
quiesce_show(client, &QuiesceShowArgs { stacks: false }).await
}

fn format_duration_ms(duration: Duration) -> String {
Expand Down
21 changes: 10 additions & 11 deletions nexus/reconfigurator/execution/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use nexus_types::deployment::execution::{
StepHandle, StepResult, UpdateEngine,
};
use nexus_types::quiesce::SagaQuiesceHandle;
use nexus_types::quiesce::SagaReassignmentDone;
use omicron_uuid_kinds::OmicronZoneUuid;
use slog::info;
use slog_error_chain::InlineErrorChain;
Expand Down Expand Up @@ -662,18 +663,16 @@ fn register_reassign_sagas_step<'a>(
match reassigned {
Ok(needs_saga_recovery) => (
StepSuccess::new(needs_saga_recovery).build(),
needs_saga_recovery,
SagaReassignmentDone::ReassignedAllAsOf(
blueprint.id,
needs_saga_recovery,
),
),
Err(error) => (
StepWarning::new(false, error.to_string())
.build(),
SagaReassignmentDone::Indeterminate,
),
Err(error) => {
// It's possible that we failed after having
// re-assigned sagas in the database.
let maybe_reassigned = true;
(
StepWarning::new(false, error.to_string())
.build(),
maybe_reassigned,
)
}
}
})
.await)
Expand Down
12 changes: 7 additions & 5 deletions nexus/src/app/quiesce.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ impl super::Nexus {
) -> LookupResult<QuiesceStatus> {
opctx.authorize(authz::Action::Read, &authz::QUIESCE_STATE).await?;
let state = self.quiesce.state();
let sagas_pending = self.quiesce.sagas().sagas_pending();
let sagas = self.quiesce.sagas().status();
let db_claims = self.datastore().claims_held();
Ok(QuiesceStatus { state, sagas_pending, db_claims })
Ok(QuiesceStatus { state, sagas, db_claims })
}
}

Expand Down Expand Up @@ -283,7 +283,7 @@ mod test {
assert!(duration_total >= duration_draining_db);
assert!(duration_total >= duration_recording_quiesce);
assert!(duration_total <= (after - before).to_std().unwrap());
assert!(status.sagas_pending.is_empty());
assert!(status.sagas.sagas_pending.is_empty());
assert!(status.db_claims.is_empty());
}

Expand Down Expand Up @@ -357,7 +357,9 @@ mod test {
quiesce_status.state,
QuiesceState::DrainingSagas { .. }
);
assert!(quiesce_status.sagas_pending.contains_key(&demo_saga.saga_id));
assert!(
quiesce_status.sagas.sagas_pending.contains_key(&demo_saga.saga_id)
);
// We should see at least one held database claim from the one we took
// above.
assert!(!quiesce_status.db_claims.is_empty());
Expand Down Expand Up @@ -421,7 +423,7 @@ mod test {
if !matches!(rv.state, QuiesceState::DrainingDb { .. }) {
return Err(CondCheckError::<NexusClientError>::NotYet);
}
assert!(rv.sagas_pending.is_empty());
assert!(rv.sagas.sagas_pending.is_empty());
// The database claim we took is still held.
assert!(!rv.db_claims.is_empty());
Ok(())
Expand Down
9 changes: 3 additions & 6 deletions nexus/types/src/internal_api/views.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::deployment::TargetReleaseDescription;
use crate::inventory::BaseboardId;
use crate::inventory::CabooseWhich;
use crate::inventory::Collection;
use crate::quiesce::SagaQuiesceStatus;
use chrono::DateTime;
use chrono::SecondsFormat;
use chrono::Utc;
Expand Down Expand Up @@ -978,12 +979,8 @@ pub struct QuiesceStatus {
/// what stage of quiescing is Nexus at
pub state: QuiesceState,

/// what sagas are currently running or known needing to be recovered
///
/// This should only be non-empty when state is `Running` or
/// `WaitingForSagas`. Entries here prevent transitioning from
/// `WaitingForSagas` to `WaitingForDb`.
pub sagas_pending: IdOrdMap<PendingSagaInfo>,
/// information about saga quiescing
pub sagas: SagaQuiesceStatus,

/// what database claims are currently held (by any part of Nexus)
///
Expand Down
Loading
Loading