Skip to content

Commit 5d0f8a0

Browse files
Ensure custody backfill sync couples all responses before importing (#8339)
Custody backfill sync has a bug when we request columns from more than one peer per batch. The fix here ensures we wait for all requests to be completed before performing verification and importing the responses. I've also added an endpoint `lighthouse/custody/backfill` that resets a nodes earliest available data column to the current epoch so that custody backfill can be triggered. This endpoint is needed to rescue any nodes that may have missing columns due to the custody backfill sync bug without requiring a full re-sync. Co-Authored-By: Eitan Seri- Levi <[email protected]> Co-Authored-By: Eitan Seri-Levi <[email protected]> Co-Authored-By: Jimmy Chen <[email protected]> Co-Authored-By: Michael Sproul <[email protected]>
1 parent 4908687 commit 5d0f8a0

File tree

11 files changed

+230
-24
lines changed

11 files changed

+230
-24
lines changed

beacon_node/beacon_chain/src/custody_context.rs

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,7 @@ impl ValidatorRegistrations {
120120
let effective_epoch =
121121
(current_slot + effective_delay_slots).epoch(E::slots_per_epoch()) + 1;
122122
self.epoch_validator_custody_requirements
123-
.entry(effective_epoch)
124-
.and_modify(|old_custody| *old_custody = validator_custody_requirement)
125-
.or_insert(validator_custody_requirement);
123+
.insert(effective_epoch, validator_custody_requirement);
126124
Some((effective_epoch, validator_custody_requirement))
127125
} else {
128126
None
@@ -154,11 +152,25 @@ impl ValidatorRegistrations {
154152
});
155153

156154
self.epoch_validator_custody_requirements
157-
.entry(effective_epoch)
158-
.and_modify(|old_custody| *old_custody = latest_validator_custody)
159-
.or_insert(latest_validator_custody);
155+
.insert(effective_epoch, latest_validator_custody);
160156
}
161157
}
158+
159+
/// Updates the `epoch -> cgc` map by pruning records before `effective_epoch`
160+
/// while setting the `cgc` at `effective_epoch` to the latest validator custody requirement.
161+
///
162+
/// This is used to restart custody backfill sync at `effective_epoch`
163+
pub fn reset_validator_custody_requirements(&mut self, effective_epoch: Epoch) {
164+
if let Some(latest_validator_custody_requirements) =
165+
self.latest_validator_custody_requirement()
166+
{
167+
self.epoch_validator_custody_requirements
168+
.retain(|&epoch, _| epoch >= effective_epoch);
169+
170+
self.epoch_validator_custody_requirements
171+
.insert(effective_epoch, latest_validator_custody_requirements);
172+
};
173+
}
162174
}
163175

164176
/// Given the `validator_custody_units`, return the custody requirement based on
@@ -535,6 +547,14 @@ impl<E: EthSpec> CustodyContext<E> {
535547
.write()
536548
.backfill_validator_custody_requirements(effective_epoch, expected_cgc);
537549
}
550+
551+
/// The node is attempting to restart custody backfill. Update the internal records so that
552+
/// custody backfill can start backfilling at `effective_epoch`.
553+
pub fn reset_validator_custody_requirements(&self, effective_epoch: Epoch) {
554+
self.validator_registrations
555+
.write()
556+
.reset_validator_custody_requirements(effective_epoch);
557+
}
538558
}
539559

540560
/// Indicates that the custody group count (CGC) has increased.
@@ -1491,4 +1511,53 @@ mod tests {
14911511
);
14921512
}
14931513
}
1514+
1515+
#[test]
1516+
fn reset_validator_custody_requirements() {
1517+
let spec = E::default_spec();
1518+
let minimum_cgc = 4u64;
1519+
let initial_cgc = 8u64;
1520+
let mid_cgc = 16u64;
1521+
let final_cgc = 32u64;
1522+
1523+
// Setup: Node restart after multiple validator registrations causing CGC increases
1524+
let head_epoch = Epoch::new(20);
1525+
let epoch_and_cgc_tuples = vec![
1526+
(Epoch::new(0), initial_cgc),
1527+
(Epoch::new(10), mid_cgc),
1528+
(head_epoch, final_cgc),
1529+
];
1530+
let custody_context = setup_custody_context(&spec, head_epoch, epoch_and_cgc_tuples);
1531+
1532+
// Backfill from epoch 20 to 9
1533+
complete_backfill_for_epochs(&custody_context, Epoch::new(20), Epoch::new(9), final_cgc);
1534+
1535+
// Reset validator custody requirements to the latest cgc requirements at `head_epoch` up to the boundary epoch
1536+
custody_context.reset_validator_custody_requirements(head_epoch);
1537+
1538+
// Verify epochs 0 - 19 return the minimum cgc requirement because of the validator custody requirement reset
1539+
for epoch in 0..=19 {
1540+
assert_eq!(
1541+
custody_context.custody_group_count_at_epoch(Epoch::new(epoch), &spec),
1542+
minimum_cgc,
1543+
);
1544+
}
1545+
1546+
// Verify epoch 20 returns a CGC of 32
1547+
assert_eq!(
1548+
custody_context.custody_group_count_at_epoch(head_epoch, &spec),
1549+
final_cgc
1550+
);
1551+
1552+
// Rerun Backfill to epoch 20
1553+
complete_backfill_for_epochs(&custody_context, Epoch::new(20), Epoch::new(0), final_cgc);
1554+
1555+
// Verify epochs 0 - 20 return the final cgc requirements
1556+
for epoch in 0..=20 {
1557+
assert_eq!(
1558+
custody_context.custody_group_count_at_epoch(Epoch::new(epoch), &spec),
1559+
final_cgc,
1560+
);
1561+
}
1562+
}
14941563
}

beacon_node/beacon_chain/src/historical_data_columns.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
8989
.get_data_column(&block_root, &data_column.index)?
9090
.is_some()
9191
{
92-
debug!(
93-
block_root = ?block_root,
94-
column_index = data_column.index,
95-
"Skipping data column import as identical data column exists"
96-
);
9792
continue;
9893
}
9994
if block_root != data_column.block_root() {

beacon_node/http_api/src/lib.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4604,6 +4604,37 @@ pub fn serve<T: BeaconChainTypes>(
46044604
},
46054605
);
46064606

4607+
// POST lighthouse/custody/backfill
4608+
let post_lighthouse_custody_backfill = warp::path("lighthouse")
4609+
.and(warp::path("custody"))
4610+
.and(warp::path("backfill"))
4611+
.and(warp::path::end())
4612+
.and(task_spawner_filter.clone())
4613+
.and(chain_filter.clone())
4614+
.then(
4615+
|task_spawner: TaskSpawner<T::EthSpec>, chain: Arc<BeaconChain<T>>| {
4616+
task_spawner.blocking_json_task(Priority::P1, move || {
4617+
// Calling this endpoint will trigger custody backfill once `effective_epoch`
4618+
// is finalized.
4619+
let effective_epoch = chain
4620+
.canonical_head
4621+
.cached_head()
4622+
.head_slot()
4623+
.epoch(T::EthSpec::slots_per_epoch())
4624+
+ 1;
4625+
let custody_context = chain.data_availability_checker.custody_context();
4626+
// Reset validator custody requirements to `effective_epoch` with the latest
4627+
// cgc requirements.
4628+
custody_context.reset_validator_custody_requirements(effective_epoch);
4629+
// Update `DataColumnCustodyInfo` to reflect the custody change.
4630+
chain.update_data_column_custody_info(Some(
4631+
effective_epoch.start_slot(T::EthSpec::slots_per_epoch()),
4632+
));
4633+
Ok(())
4634+
})
4635+
},
4636+
);
4637+
46074638
// GET lighthouse/analysis/block_rewards
46084639
let get_lighthouse_block_rewards = warp::path("lighthouse")
46094640
.and(warp::path("analysis"))
@@ -4963,6 +4994,7 @@ pub fn serve<T: BeaconChainTypes>(
49634994
.uor(post_lighthouse_compaction)
49644995
.uor(post_lighthouse_add_peer)
49654996
.uor(post_lighthouse_remove_peer)
4997+
.uor(post_lighthouse_custody_backfill)
49664998
.recover(warp_utils::reject::handle_rejection),
49674999
),
49685000
)

beacon_node/http_api/src/test_utils.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use crate::{Config, Context};
22
use beacon_chain::{
33
BeaconChain, BeaconChainTypes,
4+
custody_context::NodeCustodyType,
45
test_utils::{BeaconChainHarness, BoxedMutator, Builder, EphemeralHarnessType},
56
};
67
use beacon_processor::{
@@ -67,6 +68,20 @@ impl<E: EthSpec> InteractiveTester<E> {
6768
None,
6869
Config::default(),
6970
true,
71+
NodeCustodyType::Fullnode,
72+
)
73+
.await
74+
}
75+
76+
pub async fn new_supernode(spec: Option<ChainSpec>, validator_count: usize) -> Self {
77+
Self::new_with_initializer_and_mutator(
78+
spec,
79+
validator_count,
80+
None,
81+
None,
82+
Config::default(),
83+
true,
84+
NodeCustodyType::Supernode,
7085
)
7186
.await
7287
}
@@ -78,6 +93,7 @@ impl<E: EthSpec> InteractiveTester<E> {
7893
mutator: Option<Mutator<E>>,
7994
config: Config,
8095
use_mock_builder: bool,
96+
node_custody_type: NodeCustodyType,
8197
) -> Self {
8298
let mut harness_builder = BeaconChainHarness::builder(E::default())
8399
.spec_or_default(spec.map(Arc::new))
@@ -93,6 +109,8 @@ impl<E: EthSpec> InteractiveTester<E> {
93109
.fresh_ephemeral_store()
94110
};
95111

112+
harness_builder = harness_builder.node_custody_type(node_custody_type);
113+
96114
// Add a mutator for the beacon chain builder which will be called in
97115
// `HarnessBuilder::build`.
98116
if let Some(mutator) = mutator {

beacon_node/http_api/tests/broadcast_validation_tests.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use beacon_chain::custody_context::NodeCustodyType;
12
use beacon_chain::test_utils::test_spec;
23
use beacon_chain::{
34
GossipVerifiedBlock, IntoGossipVerifiedBlock, WhenSlotSkipped,
@@ -1956,6 +1957,7 @@ pub async fn duplicate_block_status_code() {
19561957
..Config::default()
19571958
},
19581959
true,
1960+
NodeCustodyType::Fullnode,
19591961
)
19601962
.await;
19611963

beacon_node/http_api/tests/fork_tests.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
//! Tests for API behaviour across fork boundaries.
2+
use beacon_chain::custody_context::NodeCustodyType;
23
use beacon_chain::{
34
StateSkipConfig,
45
test_utils::{DEFAULT_ETH1_BLOCK_HASH, HARNESS_GENESIS_TIME, RelativeSyncCommittee},
@@ -426,6 +427,7 @@ async fn bls_to_execution_changes_update_all_around_capella_fork() {
426427
None,
427428
Default::default(),
428429
true,
430+
NodeCustodyType::Fullnode,
429431
)
430432
.await;
431433
let harness = &tester.harness;

beacon_node/http_api/tests/interactive_tests.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
//! Generic tests that make use of the (newer) `InteractiveApiTester`
2+
use beacon_chain::custody_context::NodeCustodyType;
23
use beacon_chain::{
34
ChainConfig,
45
chain_config::{DisallowedReOrgOffsets, ReOrgThreshold},
@@ -76,6 +77,7 @@ async fn state_by_root_pruned_from_fork_choice() {
7677
None,
7778
Default::default(),
7879
false,
80+
NodeCustodyType::Fullnode,
7981
)
8082
.await;
8183

@@ -433,6 +435,7 @@ pub async fn proposer_boost_re_org_test(
433435
})),
434436
Default::default(),
435437
false,
438+
NodeCustodyType::Fullnode,
436439
)
437440
.await;
438441
let harness = &tester.harness;
@@ -1049,6 +1052,68 @@ async fn proposer_duties_with_gossip_tolerance() {
10491052
);
10501053
}
10511054

1055+
// Test that a request to `lighthouse/custody/backfill` succeeds by verifying that `CustodyContext` and `DataColumnCustodyInfo`
1056+
// have been updated with the correct values.
1057+
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
1058+
async fn lighthouse_restart_custody_backfill() {
1059+
let spec = test_spec::<E>();
1060+
1061+
// Skip pre-Fulu.
1062+
if !spec.is_fulu_scheduled() {
1063+
return;
1064+
}
1065+
1066+
let validator_count = 24;
1067+
1068+
let tester = InteractiveTester::<E>::new_supernode(Some(spec), validator_count).await;
1069+
let harness = &tester.harness;
1070+
let spec = &harness.spec;
1071+
let client = &tester.client;
1072+
let min_cgc = spec.custody_requirement;
1073+
let max_cgc = spec.number_of_custody_groups;
1074+
1075+
let num_blocks = 2 * E::slots_per_epoch();
1076+
1077+
let custody_context = harness.chain.data_availability_checker.custody_context();
1078+
1079+
harness.advance_slot();
1080+
harness
1081+
.extend_chain_with_sync(
1082+
num_blocks as usize,
1083+
BlockStrategy::OnCanonicalHead,
1084+
AttestationStrategy::AllValidators,
1085+
SyncCommitteeStrategy::NoValidators,
1086+
LightClientStrategy::Disabled,
1087+
)
1088+
.await;
1089+
1090+
let cgc_at_head = custody_context.custody_group_count_at_head(spec);
1091+
let earliest_data_column_epoch = harness.chain.earliest_custodied_data_column_epoch();
1092+
1093+
assert_eq!(cgc_at_head, max_cgc);
1094+
assert_eq!(earliest_data_column_epoch, None);
1095+
1096+
custody_context
1097+
.update_and_backfill_custody_count_at_epoch(harness.chain.epoch().unwrap(), cgc_at_head);
1098+
client.post_lighthouse_custody_backfill().await.unwrap();
1099+
1100+
let cgc_at_head = custody_context.custody_group_count_at_head(spec);
1101+
let cgc_at_previous_epoch =
1102+
custody_context.custody_group_count_at_epoch(harness.chain.epoch().unwrap() - 1, spec);
1103+
let earliest_data_column_epoch = harness.chain.earliest_custodied_data_column_epoch();
1104+
1105+
// `DataColumnCustodyInfo` should have been updated to the head epoch
1106+
assert_eq!(
1107+
earliest_data_column_epoch,
1108+
Some(harness.chain.epoch().unwrap() + 1)
1109+
);
1110+
// Cgc requirements should have stayed the same at head
1111+
assert_eq!(cgc_at_head, max_cgc);
1112+
// Cgc requirements at the previous epoch should be `min_cgc`
1113+
// This allows for custody backfill to re-fetch columns for this epoch.
1114+
assert_eq!(cgc_at_previous_epoch, min_cgc);
1115+
}
1116+
10521117
// Test that a request for next epoch proposer duties succeeds when the current slot clock is within
10531118
// gossip clock disparity (500ms) of the new epoch.
10541119
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]

beacon_node/network/src/sync/custody_backfill_sync/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -382,11 +382,9 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
382382
return None;
383383
};
384384

385-
let mut missing_columns = HashSet::new();
386-
387385
// Skip all batches (Epochs) that don't have missing columns.
388386
for epoch in Epoch::range_inclusive_rev(self.to_be_downloaded, column_da_boundary) {
389-
missing_columns = self.beacon_chain.get_missing_columns_for_epoch(epoch);
387+
let missing_columns = self.beacon_chain.get_missing_columns_for_epoch(epoch);
390388

391389
if !missing_columns.is_empty() {
392390
self.to_be_downloaded = epoch;
@@ -445,6 +443,7 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
445443
self.include_next_batch()
446444
}
447445
Entry::Vacant(entry) => {
446+
let missing_columns = self.beacon_chain.get_missing_columns_for_epoch(batch_id);
448447
entry.insert(BatchInfo::new(
449448
&batch_id,
450449
CUSTODY_BACKFILL_EPOCHS_PER_BATCH,

beacon_node/network/src/sync/range_data_column_batch_request.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,17 @@ impl<T: BeaconChainTypes> RangeDataColumnBatchRequest<T> {
7070
HashMap::new();
7171
let mut column_to_peer_id: HashMap<u64, PeerId> = HashMap::new();
7272

73-
for column in self
74-
.requests
75-
.values()
76-
.filter_map(|req| req.to_finished())
77-
.flatten()
78-
{
79-
received_columns_for_slot
80-
.entry(column.slot())
81-
.or_default()
82-
.push(column.clone());
73+
for req in self.requests.values() {
74+
let Some(columns) = req.to_finished() else {
75+
return None;
76+
};
77+
78+
for column in columns {
79+
received_columns_for_slot
80+
.entry(column.slot())
81+
.or_default()
82+
.push(column.clone());
83+
}
8384
}
8485

8586
// Note: this assumes that only 1 peer is responsible for a column

book/src/api_lighthouse.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,16 @@ indicating that all states with slots `>= 0` are available, i.e., full state his
447447
on the specific meanings of these fields see the docs on [Checkpoint
448448
Sync](./advanced_checkpoint_sync.md#how-to-run-an-archived-node).
449449

450+
## `/lighthouse/custody/backfill`
451+
452+
Starts a custody backfill sync from the next epoch with the node's latest custody requirements. The sync won't begin immediately, it waits until the next epoch is finalized before triggering.
453+
454+
This endpoint should only be used to fix nodes that may have partial custody columns due to a prior backfill bug (present in v8.0.0-rc.2). Use with caution as it re-downloads all historic custody data columns and may consume significant bandwidth.
455+
456+
```bash
457+
curl -X POST "http://localhost:5052/lighthouse/custody/backfill"
458+
```
459+
450460
## `/lighthouse/merge_readiness`
451461

452462
Returns the current difficulty and terminal total difficulty of the network. Before [The Merge](https://ethereum.org/en/roadmap/merge/) on 15<sup>th</sup> September 2022, you will see that the current difficulty is less than the terminal total difficulty, An example is shown below:

0 commit comments

Comments
 (0)