Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6dcdd51
update quiesce states to reflect RFD 588
davepacheco Aug 20, 2025
330c721
self-review + regenerate API spec
davepacheco Aug 20, 2025
b273427
tests need to wait for sagas to be enabled
davepacheco Aug 20, 2025
845d371
need to activate blueprint loader after inserting initial blueprint
davepacheco Aug 21, 2025
78ee24f
add the "second" Nexus to the test suite blueprint; fix omdb tests
davepacheco Sep 2, 2025
8af16b6
review feedback
davepacheco Aug 22, 2025
48483d0
fix tests on GNU/Linux
davepacheco Aug 23, 2025
cdfafb0
fix end to end dns test
davepacheco Sep 2, 2025
f0a31b8
fix omdb test
davepacheco Aug 23, 2025
d2b1f68
add test that Nexus quiesces when reading a blueprint saying so
davepacheco Aug 25, 2025
3ce8d59
fixup conflict
davepacheco Sep 2, 2025
6c84ded
pull in BlueprintBuilder.set_nexus_generation()
davepacheco Sep 2, 2025
93baf41
quiesce needs to keep track of blueprint ids
davepacheco Aug 27, 2025
6d5e952
add test
davepacheco Aug 27, 2025
1ecea95
is_fully_drained() can be more private
davepacheco Aug 27, 2025
a1c52ba
update omdb
davepacheco Aug 27, 2025
791886a
omdb output tweaks
davepacheco Aug 27, 2025
6fa9d9d
review feedback
davepacheco Aug 28, 2025
1317dd7
review feedback
davepacheco Sep 2, 2025
f04c429
review feedback
davepacheco Sep 2, 2025
405943b
Merge branch 'dap/handoff-quiesce-1' into dap/handoff-quiesce-2
davepacheco Sep 3, 2025
b569f09
Merge commit '9eade0677ea09aeda4e807d15d86f3a3e6622976' into dap/hand…
davepacheco Sep 3, 2025
2d61323
Merge commit 'b11266905429b319220414f08af1cce902e30c48' into dap/hand…
davepacheco Sep 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions common/src/address.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,9 @@ impl std::fmt::Display for IpVersion {
///
/// The first address in the range is guaranteed to be no greater than the last
/// address.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize)]
#[derive(
Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Ord, PartialOrd,
)]
#[serde(untagged)]
pub enum IpRange {
V4(Ipv4Range),
Expand Down Expand Up @@ -548,7 +550,16 @@ impl From<Ipv6Range> for IpRange {
///
/// The first address must be less than or equal to the last address.
#[derive(
Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema,
Clone,
Copy,
Debug,
PartialEq,
Eq,
Deserialize,
Serialize,
JsonSchema,
PartialOrd,
Ord,
)]
#[serde(try_from = "AnyIpv4Range")]
pub struct Ipv4Range {
Expand Down Expand Up @@ -612,7 +623,16 @@ impl TryFrom<AnyIpv4Range> for Ipv4Range {
///
/// The first address must be less than or equal to the last address.
#[derive(
Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema,
PartialOrd,
Ord,
Clone,
Copy,
Debug,
PartialEq,
Eq,
Deserialize,
Serialize,
JsonSchema,
)]
#[serde(try_from = "AnyIpv6Range")]
pub struct Ipv6Range {
Expand Down
12 changes: 9 additions & 3 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5016,7 +5016,7 @@ async fn cmd_db_dns_diff(
// Load the added and removed items.
use nexus_db_schema::schema::dns_name::dsl;

let added = dsl::dns_name
let mut added = dsl::dns_name
.filter(dsl::dns_zone_id.eq(zone.id))
.filter(dsl::version_added.eq(version.version))
.limit(i64::from(u32::from(limit)))
Expand All @@ -5026,7 +5026,7 @@ async fn cmd_db_dns_diff(
.context("loading added names")?;
check_limit(&added, limit, || "loading added names");

let removed = dsl::dns_name
let mut removed = dsl::dns_name
.filter(dsl::dns_zone_id.eq(zone.id))
.filter(dsl::version_removed.eq(version.version))
.limit(i64::from(u32::from(limit)))
Expand All @@ -5042,6 +5042,11 @@ async fn cmd_db_dns_diff(
);
println!("");

// This is kind of stupid-expensive, but there aren't a lot of records
// here and it's helpful for this output to be stable.
added.sort_by_cached_key(|k| format!("{} {:?}", k.name, k.records()));
removed.sort_by_cached_key(|k| format!("{} {:?}", k.name, k.records()));

for a in added {
print_name("+", &a.name, a.records().context("parsing records"));
}
Expand Down Expand Up @@ -5097,7 +5102,8 @@ async fn cmd_db_dns_names(
}
});

for (name, records) in names {
for (name, mut records) in names {
records.sort();
print_name("", &name, Ok(records));
}
}
Expand Down
128 changes: 110 additions & 18 deletions dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ use chrono::TimeDelta;
use chrono::Utc;
use clap::Args;
use clap::Subcommand;
use nexus_client::types::PendingRecovery;
use nexus_client::types::QuiesceState;
use nexus_client::types::QuiesceStatus;
use nexus_client::types::SagaQuiesceStatus;
use std::time::Duration;

#[derive(Debug, Args)]
Expand All @@ -31,9 +34,9 @@ pub enum QuiesceCommands {

#[derive(Debug, Args)]
pub struct QuiesceShowArgs {
/// Show details about held database connections
/// Show stack traces for held database connections
#[clap(short, long, default_value_t = false)]
verbose: bool,
stacks: bool,
}

pub async fn cmd_nexus_quiesce(
Expand All @@ -60,21 +63,27 @@ async fn quiesce_show(
.await
.context("fetching quiesce state")?
.into_inner();
match quiesce.state {

let QuiesceStatus { db_claims, sagas, state } = quiesce;

match state {
QuiesceState::Undetermined => {
println!("has not yet determined if it is quiescing");
}
QuiesceState::Running => {
println!("running normally (not quiesced, not quiescing)");
}
QuiesceState::WaitingForSagas { time_requested } => {
QuiesceState::DrainingSagas { time_requested } => {
println!(
"quiescing since {} ({} ago)",
humantime::format_rfc3339_millis(time_requested.into()),
format_time_delta(now - time_requested),
);
println!("details: waiting for running sagas to finish");
}
QuiesceState::WaitingForDb {
QuiesceState::DrainingDb {
time_requested,
duration_waiting_for_sagas,
duration_draining_sagas,
..
} => {
println!(
Expand All @@ -87,13 +96,34 @@ async fn quiesce_show(
);
println!(
" previously: waiting for sagas took {}",
format_duration_ms(duration_waiting_for_sagas.into()),
format_duration_ms(duration_draining_sagas.into()),
);
}
QuiesceState::RecordingQuiesce {
time_requested,
duration_draining_sagas,
duration_draining_db,
..
} => {
println!(
"quiescing since {} ({} ago)",
humantime::format_rfc3339_millis(time_requested.into()),
format_time_delta(now - time_requested),
);
println!(
" waiting for sagas took {}",
format_duration_ms(duration_draining_sagas.into()),
);
println!(
" waiting for db quiesce took {}",
format_duration_ms(duration_draining_db.into()),
);
}
QuiesceState::Quiesced {
time_quiesced,
duration_waiting_for_sagas,
duration_waiting_for_db,
duration_draining_sagas,
duration_draining_db,
duration_recording_quiesce,
duration_total,
..
} => {
Expand All @@ -104,11 +134,15 @@ async fn quiesce_show(
);
println!(
" waiting for sagas took {}",
format_duration_ms(duration_waiting_for_sagas.into()),
format_duration_ms(duration_draining_sagas.into()),
);
println!(
" waiting for db quiesce took {}",
format_duration_ms(duration_waiting_for_db.into()),
format_duration_ms(duration_draining_db.into()),
);
println!(
" recording quiesce took {}",
format_duration_ms(duration_recording_quiesce.into()),
);
println!(
" total quiesce time: {}",
Expand All @@ -117,25 +151,83 @@ async fn quiesce_show(
}
}

println!("sagas running: {}", quiesce.sagas_pending.len());
for saga in &quiesce.sagas_pending {
let SagaQuiesceStatus {
sagas_pending,
drained_blueprint_id,
first_recovery_complete,
new_sagas_allowed,
reassignment_blueprint_id,
reassignment_generation,
reassignment_pending,
recovered_blueprint_id,
recovered_reassignment_generation,
recovery_pending,
} = sagas;

println!("saga quiesce:");
println!(" new sagas: {:?}", new_sagas_allowed);
println!(
" drained as of blueprint: {}",
drained_blueprint_id
.map(|s| s.to_string())
.as_deref()
.unwrap_or("none")
);
println!(
" blueprint for last completed recovery pass: {}",
recovered_blueprint_id
.map(|s| s.to_string())
.as_deref()
.unwrap_or("none")
);
println!(
" blueprint for last reassignment pass: {}",
reassignment_blueprint_id
.map(|s| s.to_string())
.as_deref()
.unwrap_or("none")
);
println!(
" reassignment generation: {} (pass running: {})",
reassignment_generation,
if reassignment_pending { "yes" } else { "no" }
);
println!(" recovered generation: {}", recovered_reassignment_generation);
println!(
" recovered at least once successfully: {}",
if first_recovery_complete { "yes" } else { "no" },
);
print!(" recovery pending: ");
if let Some(PendingRecovery { generation, blueprint_id }) = recovery_pending
{
println!(
"yes (generation {}, blueprint id {})",
generation,
blueprint_id.map(|s| s.to_string()).as_deref().unwrap_or("none")
);
} else {
println!("no");
}

println!(" sagas running: {}", sagas_pending.len());
for saga in &sagas_pending {
println!(
" saga {} pending since {} ({})",
" saga {} pending since {} ({})",
saga.saga_id,
humantime::format_rfc3339_millis(saga.time_pending.into()),
saga.saga_name
);
}

println!("database connections held: {}", quiesce.db_claims.len());
for claim in &quiesce.db_claims {
println!("database connections held: {}", db_claims.len());
for claim in &db_claims {
println!(
" claim {} held since {} ({} ago)",
claim.id,
claim.held_since,
format_time_delta(Utc::now() - claim.held_since),
);
if args.verbose {
if args.stacks {
println!(" acquired by:");
println!("{}", textwrap::indent(&claim.debug, " "));
}
Expand All @@ -149,7 +241,7 @@ async fn quiesce_start(
_token: DestructiveOperationToken,
) -> Result<(), anyhow::Error> {
client.quiesce_start().await.context("quiescing Nexus")?;
quiesce_show(client, &QuiesceShowArgs { verbose: false }).await
quiesce_show(client, &QuiesceShowArgs { stacks: false }).await
}

fn format_duration_ms(duration: Duration) -> String {
Expand Down
Loading
Loading