Skip to content

Commit 5094764

Browse files
committed
Do reconciliation if all downstairs are in live-repair
1 parent f50ae68 commit 5094764

File tree

4 files changed

+251
-59
lines changed

4 files changed

+251
-59
lines changed

upstairs/src/client.rs

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -446,8 +446,20 @@ impl DownstairsClient {
446446
self.client_id
447447
);
448448
};
449-
assert_eq!(state.discriminant(), NegotiationState::WaitQuorum);
450-
assert_eq!(mode, &ConnectionMode::New);
449+
// There are two cases where reconciliation is allowed: either from a
450+
// new connection, or if all three Downstairs need live-repair
451+
// simultaneously.
452+
match (state.discriminant(), &mode) {
453+
(NegotiationState::WaitQuorum, ConnectionMode::New) => {
454+
// This is fine.
455+
}
456+
(NegotiationState::LiveRepairReady, ConnectionMode::Faulted) => {
457+
// This is also fine, but we need to tweak our connection mode
458+
// because we're no longer doing live-repair.
459+
*mode = ConnectionMode::New;
460+
}
461+
s => panic!("invalid (state, mode) tuple: ({s:?}"),
462+
}
451463
*state = NegotiationStateData::Reconcile;
452464
}
453465

@@ -894,6 +906,20 @@ impl DownstairsClient {
894906
},
895907
) => true,
896908

909+
// Special case: LiveRepairReady is allowed to jump sideways into
910+
// reconciliation if all three downstairs require live-repair
911+
// (because otherwise we have no-one to repair from)
912+
(
913+
DsStateData::Connecting {
914+
state: NegotiationStateData::LiveRepairReady(..),
915+
mode: ConnectionMode::Faulted,
916+
},
917+
DsStateData::Connecting {
918+
state: NegotiationStateData::Reconcile,
919+
mode: ConnectionMode::New,
920+
},
921+
) => true,
922+
897923
// Check normal negotiation path
898924
(
899925
DsStateData::Connecting {
@@ -926,7 +952,7 @@ impl DownstairsClient {
926952
ConnectionMode::Offline
927953
) | (NegotiationStateData::Reconcile, ConnectionMode::New)
928954
| (
929-
NegotiationStateData::LiveRepairReady,
955+
NegotiationStateData::LiveRepairReady(..),
930956
ConnectionMode::Faulted | ConnectionMode::Replaced
931957
)
932958
)
@@ -938,7 +964,7 @@ impl DownstairsClient {
938964
matches!(
939965
(state, mode),
940966
(
941-
NegotiationStateData::LiveRepairReady,
967+
NegotiationStateData::LiveRepairReady(..),
942968
ConnectionMode::Faulted | ConnectionMode::Replaced
943969
)
944970
)
@@ -1212,7 +1238,7 @@ impl DownstairsClient {
12121238
let DsStateData::Connecting { state, .. } = &self.state else {
12131239
return;
12141240
};
1215-
if matches!(state, NegotiationStateData::LiveRepairReady) {
1241+
if matches!(state, NegotiationStateData::LiveRepairReady(..)) {
12161242
assert!(self.cfg.read_only);
12171243

12181244
// TODO: could we do this transition early, by automatically
@@ -1230,7 +1256,7 @@ impl DownstairsClient {
12301256
let DsStateData::Connecting { state, .. } = &self.state else {
12311257
panic!("invalid state");
12321258
};
1233-
assert!(matches!(state, NegotiationStateData::LiveRepairReady));
1259+
assert!(matches!(state, NegotiationStateData::LiveRepairReady(..)));
12341260
self.checked_state_transition(up_state, DsStateData::LiveRepair);
12351261
}
12361262

@@ -1574,7 +1600,7 @@ impl DownstairsClient {
15741600
}
15751601

15761602
ConnectionMode::Faulted | ConnectionMode::Replaced => {
1577-
*state = NegotiationStateData::LiveRepairReady;
1603+
*state = NegotiationStateData::LiveRepairReady(dsr);
15781604
NegotiationResult::LiveRepair
15791605
}
15801606
ConnectionMode::Offline => {
@@ -1756,10 +1782,10 @@ impl DownstairsClient {
17561782
/// │ │ New │ Faulted / Replaced
17571783
/// │ ┌──────▼───┐ ┌────▼──────────┐
17581784
/// │ │WaitQuorum│ │LiveRepairReady│
1759-
/// │ └────┬─────┘ └───┬──────────┘
1760-
/// │ │
1761-
/// │ ┌────▼────┐
1762-
/// │ │Reconcile
1785+
/// │ └────┬─────┘ └───┬──────────┘
1786+
/// │ │
1787+
/// │ ┌────▼────┐
1788+
/// │ │Reconcile◄───────┘
17631789
/// │ └────┬────┘ │
17641790
/// │ │ │
17651791
/// │ ┌───▼──┐ │
@@ -1803,7 +1829,9 @@ pub enum NegotiationStateData {
18031829
Reconcile,
18041830

18051831
/// Waiting for live-repair to begin
1806-
LiveRepairReady,
1832+
// This state includes [`RegionMetadata`], because if all three Downstairs
1833+
// end up in `LiveRepairReady`, we have to perform reconciliation instead.
1834+
LiveRepairReady(RegionMetadata),
18071835
}
18081836

18091837
impl NegotiationStateData {
@@ -1842,7 +1870,7 @@ impl NegotiationStateData {
18421870
ConnectionMode::New
18431871
) | (
18441872
NegotiationStateData::GetExtentVersions,
1845-
NegotiationStateData::LiveRepairReady,
1873+
NegotiationStateData::LiveRepairReady(..),
18461874
ConnectionMode::Faulted | ConnectionMode::Replaced,
18471875
)
18481876
)

upstairs/src/downstairs.rs

Lines changed: 88 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ use crate::{
2525
DownstairsIO, DownstairsMend, DsState, DsStateData, ExtentFix,
2626
ExtentRepairIDs, IOState, IOStateCount, IOop, ImpactedBlocks, JobId,
2727
Message, NegotiationState, RawReadResponse, RawWrite, ReconcileIO,
28-
ReconciliationId, RegionDefinition, ReplaceResult, SnapshotDetails,
29-
WorkSummary,
28+
ReconciliationId, RegionDefinition, RegionMetadata, ReplaceResult,
29+
SnapshotDetails, WorkSummary,
3030
};
3131
use crucible_common::{BlockIndex, ExtentId, NegotiationError};
3232
use crucible_protocol::WriteHeader;
@@ -165,6 +165,20 @@ pub(crate) enum LiveRepairState {
165165
},
166166
}
167167

168+
#[derive(Copy, Clone, Debug, PartialEq)]
169+
pub(crate) enum LiveRepairStart {
170+
/// Live-repair has started
171+
Started,
172+
/// Live-repair is already running
173+
AlreadyRunning,
174+
/// No downstairs is in `LiveRepairReady`
175+
NotNeeded,
176+
/// All three downstairs need live-repair (oh no)
177+
AllNeedRepair,
178+
/// There is no source downstairs available
179+
NoSource,
180+
}
181+
168182
impl LiveRepairState {
169183
fn dummy() -> Self {
170184
LiveRepairState::Noop {
@@ -874,7 +888,16 @@ impl Downstairs {
874888
/// Returns `true` if repair is needed, `false` otherwise
875889
pub(crate) fn collate(&mut self) -> Result<bool, NegotiationError> {
876890
let r = self.check_region_metadata()?;
877-
Ok(self.start_reconciliation(r))
891+
Ok(self.start_reconciliation(r, |data| {
892+
let DsStateData::Connecting {
893+
state: NegotiationStateData::WaitQuorum(r),
894+
..
895+
} = data
896+
else {
897+
panic!("client is not in WaitQuorum");
898+
};
899+
r
900+
}))
878901
}
879902

880903
/// Checks that region metadata is valid
@@ -963,9 +986,46 @@ impl Downstairs {
963986
Ok(CollateData { max_flush, max_gen })
964987
}
965988

989+
/// Begins reconciliation from all downstairs in `LiveRepairReady`
990+
///
991+
/// # Panics
992+
/// If any of the downstairs is not in `LiveRepairReady`
993+
#[must_use]
994+
pub(crate) fn reconcile_from_live_repair_ready(&mut self) -> bool {
995+
let mut max_flush = 0;
996+
let mut max_gen = 0;
997+
for client in self.clients.iter_mut() {
998+
let DsStateData::Connecting {
999+
state: NegotiationStateData::LiveRepairReady(data),
1000+
..
1001+
} = client.state_data()
1002+
else {
1003+
panic!("got invalid client state");
1004+
};
1005+
for m in data.iter() {
1006+
max_flush = max_flush.max(m.flush + 1);
1007+
max_gen = max_gen.max(m.gen + 1);
1008+
}
1009+
}
1010+
self.start_reconciliation(CollateData { max_gen, max_flush }, |data| {
1011+
let DsStateData::Connecting {
1012+
state: NegotiationStateData::LiveRepairReady(r),
1013+
..
1014+
} = data
1015+
else {
1016+
panic!("client is not in LiveRepairReady");
1017+
};
1018+
r
1019+
})
1020+
}
1021+
9661022
/// Begins reconciliation, using the given collation data
9671023
#[must_use]
968-
fn start_reconciliation(&mut self, data: CollateData) -> bool {
1024+
fn start_reconciliation<G: Fn(&DsStateData) -> &RegionMetadata>(
1025+
&mut self,
1026+
data: CollateData,
1027+
getter: G,
1028+
) -> bool {
9691029
let CollateData { max_flush, max_gen } = data;
9701030

9711031
/*
@@ -978,7 +1038,7 @@ impl Downstairs {
9781038
* Determine what extents don't match and what to do
9791039
* about that
9801040
*/
981-
if let Some(reconcile_list) = self.mismatch_list() {
1041+
if let Some(reconcile_list) = self.mismatch_list(getter) {
9821042
for c in self.clients.iter_mut() {
9831043
c.begin_reconcile();
9841044
}
@@ -1026,10 +1086,14 @@ impl Downstairs {
10261086
///
10271087
/// This function is idempotent; it returns without doing anything if
10281088
/// live-repair either can't be started or is already running.
1029-
pub(crate) fn check_live_repair_start(&mut self, up_state: &UpstairsState) {
1089+
#[must_use]
1090+
pub(crate) fn check_live_repair_start(
1091+
&mut self,
1092+
up_state: &UpstairsState,
1093+
) -> LiveRepairStart {
10301094
// If we're already doing live-repair, then we can't start live-repair
10311095
if self.live_repair_in_progress() {
1032-
return;
1096+
return LiveRepairStart::AlreadyRunning;
10331097
}
10341098

10351099
// Begin setting up live-repair state
@@ -1054,16 +1118,15 @@ impl Downstairs {
10541118

10551119
// Can't start live-repair if no one is LiveRepairReady
10561120
if repair_downstairs.is_empty() {
1057-
return;
1121+
return LiveRepairStart::NotNeeded;
1122+
} else if repair_downstairs.len() == 3 {
1123+
warn!(self.log, "All three downstairs require repair");
1124+
return LiveRepairStart::AllNeedRepair;
10581125
}
10591126

10601127
// Can't start live-repair if we don't have a source downstairs
10611128
let Some(source_downstairs) = source_downstairs else {
1062-
warn!(self.log, "No source, no Live Repair possible");
1063-
if repair_downstairs.len() == 3 {
1064-
warn!(self.log, "All three downstairs require repair");
1065-
}
1066-
return;
1129+
return LiveRepairStart::NoSource;
10671130
};
10681131

10691132
// Move the upstairs that were LiveRepairReady to LiveRepair
@@ -1105,6 +1168,8 @@ impl Downstairs {
11051168

11061169
let repair = self.repair.as_ref().unwrap();
11071170
self.notify_live_repair_start(repair);
1171+
1172+
LiveRepairStart::Started
11081173
}
11091174

11101175
/// Checks whether live-repair can continue
@@ -2008,18 +2073,14 @@ impl Downstairs {
20082073
}
20092074

20102075
/// Compares region metadata from all three clients and builds a mend list
2011-
fn mismatch_list(&self) -> Option<DownstairsMend> {
2076+
fn mismatch_list<G: Fn(&DsStateData) -> &RegionMetadata>(
2077+
&self,
2078+
getter: G,
2079+
) -> Option<DownstairsMend> {
20122080
let log = self.log.new(o!("" => "mend".to_string()));
20132081
let mut meta = ClientMap::new();
20142082
for i in ClientId::iter() {
2015-
let DsStateData::Connecting {
2016-
state: NegotiationStateData::WaitQuorum(r),
2017-
..
2018-
} = self.clients[i].state_data()
2019-
else {
2020-
panic!("client {i} is not in WaitQuorum");
2021-
};
2022-
meta.insert(i, r);
2083+
meta.insert(i, getter(self.clients[i].state_data()));
20232084
}
20242085
DownstairsMend::new(&meta, log)
20252086
}
@@ -4006,7 +4067,8 @@ struct DownstairsBackpressureConfig {
40064067
pub(crate) mod test {
40074068
use super::{
40084069
ClientFaultReason, ClientNegotiationFailed, ClientStopReason,
4009-
ConnectionMode, Downstairs, DsState, NegotiationStateData, PendingJob,
4070+
ConnectionMode, Downstairs, DsState, LiveRepairStart,
4071+
NegotiationStateData, PendingJob,
40104072
};
40114073
use crate::{
40124074
downstairs::{LiveRepairData, LiveRepairState, ReconcileData},
@@ -4081,7 +4143,7 @@ pub(crate) mod test {
40814143
NegotiationStateData::WaitForPromote,
40824144
NegotiationStateData::WaitForRegionInfo,
40834145
NegotiationStateData::GetExtentVersions,
4084-
NegotiationStateData::LiveRepairReady,
4146+
NegotiationStateData::LiveRepairReady(Default::default()),
40854147
] {
40864148
ds.clients[to_repair].checked_state_transition(
40874149
&UpstairsState::Active,
@@ -9613,7 +9675,8 @@ pub(crate) mod test {
96139675

96149676
// Start the repair normally. This enqueues the close & reopen jobs, and
96159677
// reserves Job IDs for the repair/noop
9616-
ds.check_live_repair_start(&UpstairsState::Active);
9678+
let r = ds.check_live_repair_start(&UpstairsState::Active);
9679+
assert_eq!(r, LiveRepairStart::Started);
96179680
assert!(ds.live_repair_in_progress());
96189681

96199682
// Submit a write.

upstairs/src/live_repair.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1113,7 +1113,7 @@ pub mod repair_test {
11131113
// Fault and start live-repair for client 1
11141114
to_live_repair_ready(&mut up, ClientId::new(1));
11151115

1116-
up.check_live_repair_start();
1116+
up.ensure_downstairs_consistency();
11171117
assert!(up.downstairs.live_repair_in_progress());
11181118
assert_eq!(up.downstairs.last_repair_extent(), Some(ExtentId(0)));
11191119

0 commit comments

Comments
 (0)