diff --git a/Cargo.lock b/Cargo.lock
index ba6a4587b6f..0e559182438 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -980,6 +980,7 @@ dependencies = [
  "metrics",
  "num_cpus",
  "parking_lot 0.12.3",
+ "rayon",
  "serde",
  "slot_clock",
  "strum",
diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs
index 5e7aa7d4f87..35432632cc2 100644
--- a/beacon_node/beacon_chain/src/builder.rs
+++ b/beacon_node/beacon_chain/src/builder.rs
@@ -899,6 +899,7 @@ where
         let genesis_time = head_snapshot.beacon_state.genesis_time();
         let canonical_head = CanonicalHead::new(fork_choice, Arc::new(head_snapshot));
         let shuffling_cache_size = self.chain_config.shuffling_cache_size;
+        let complete_blob_backfill = self.chain_config.complete_blob_backfill;
 
         // Calculate the weak subjectivity point in which to backfill blocks to.
         let genesis_backfill_slot = if self.chain_config.genesis_backfill {
@@ -1013,6 +1014,7 @@ where
             genesis_backfill_slot,
             data_availability_checker: Arc::new(
                 DataAvailabilityChecker::new(
+                    complete_blob_backfill,
                     slot_clock,
                     self.kzg.clone(),
                     store,
diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs
index d6be96afe94..a7defa9fa2a 100644
--- a/beacon_node/beacon_chain/src/chain_config.rs
+++ b/beacon_node/beacon_chain/src/chain_config.rs
@@ -86,6 +86,8 @@ pub struct ChainConfig {
     /// If using a weak-subjectivity sync, whether we should download blocks all the way back to
     /// genesis.
     pub genesis_backfill: bool,
+    /// EXPERIMENTAL: backfill blobs and data columns beyond the data availability window.
+    pub complete_blob_backfill: bool,
     /// Whether to send payload attributes every slot, regardless of connected proposers.
     ///
     /// This is useful for block builders and testing.
@@ -144,6 +146,7 @@ impl Default for ChainConfig {
             optimistic_finalized_sync: true,
             shuffling_cache_size: crate::shuffling_cache::DEFAULT_CACHE_SIZE,
             genesis_backfill: false,
+            complete_blob_backfill: false,
             always_prepare_payload: false,
             epochs_per_migration: crate::migrate::DEFAULT_EPOCHS_PER_MIGRATION,
             enable_light_client_server: true,
diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs
index 307dc0e227a..88cd8f3aab4 100644
--- a/beacon_node/beacon_chain/src/data_availability_checker.rs
+++ b/beacon_node/beacon_chain/src/data_availability_checker.rs
@@ -78,6 +78,7 @@ pub const STATE_LRU_CAPACITY: usize = STATE_LRU_CAPACITY_NON_ZERO.get();
 /// proposer. Having a capacity > 1 is an optimization to prevent sync lookup from having to
 /// re-fetch data during moments of unstable network conditions.
 pub struct DataAvailabilityChecker<T: BeaconChainTypes> {
+    complete_blob_backfill: bool,
     availability_cache: Arc<DataAvailabilityCheckerInner<T>>,
     slot_clock: T::SlotClock,
     kzg: Arc<Kzg>,
@@ -116,6 +117,7 @@ impl<E: EthSpec> Debug for Availability<E> {
 
 impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
     pub fn new(
+        complete_blob_backfill: bool,
         slot_clock: T::SlotClock,
         kzg: Arc<Kzg>,
         store: BeaconStore<T>,
@@ -129,6 +131,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
             spec.clone(),
         )?;
         Ok(Self {
+            complete_blob_backfill,
             availability_cache: Arc::new(inner),
             slot_clock,
             kzg,
@@ -518,9 +521,15 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
     /// The epoch at which we require a data availability check in block processing.
     /// `None` if the `Deneb` fork is disabled.
     pub fn data_availability_boundary(&self) -> Option<Epoch> {
-        let current_epoch = self.slot_clock.now()?.epoch(T::EthSpec::slots_per_epoch());
-        self.spec
-            .min_epoch_data_availability_boundary(current_epoch)
+        let fork_epoch = self.spec.deneb_fork_epoch?;
+
+        if self.complete_blob_backfill {
+            Some(fork_epoch)
+        } else {
+            let current_epoch = self.slot_clock.now()?.epoch(T::EthSpec::slots_per_epoch());
+            self.spec
+                .min_epoch_data_availability_boundary(current_epoch)
+        }
     }
 
     /// Returns true if the given epoch lies within the da boundary and false otherwise.
@@ -1076,7 +1085,15 @@ mod test {
         let kzg = get_kzg(&spec);
         let store = Arc::new(HotColdDB::open_ephemeral(<_>::default(), spec.clone()).unwrap());
         let custody_context = Arc::new(CustodyContext::new(false));
-        DataAvailabilityChecker::new(slot_clock, kzg, store, custody_context, spec)
-            .expect("should initialise data availability checker")
+        let complete_blob_backfill = false;
+        DataAvailabilityChecker::new(
+            complete_blob_backfill,
+            slot_clock,
+            kzg,
+            store,
+            custody_context,
+            spec,
+        )
+        .expect("should initialise data availability checker")
     }
 }
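The effect of the new flag is easiest to see in isolation. The sketch below is not the Lighthouse implementation: `Epoch` is a bare `u64`, and the retention constant mirrors the spec's `MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS` (4096 on mainnet). With `complete_blob_backfill` enabled the boundary is pinned at the Deneb fork epoch, so every blob since the fork is treated as required; otherwise the boundary trails the head by the retention window, clamped to the fork epoch.

// Standalone sketch of the boundary rule, under the assumptions above.
type Epoch = u64;
const MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS: Epoch = 4096;

fn data_availability_boundary(
    complete_blob_backfill: bool,
    deneb_fork_epoch: Option<Epoch>,
    current_epoch: Epoch,
) -> Option<Epoch> {
    let fork_epoch = deneb_fork_epoch?;
    if complete_blob_backfill {
        // Pin the boundary at the fork: all blobs since Deneb must be backfilled.
        Some(fork_epoch)
    } else {
        // Default rule: trail the head by the retention window, clamped to the fork.
        Some(fork_epoch.max(current_epoch.saturating_sub(MIN_EPOCHS_FOR_BLOB_SIDECARS_REQUESTS)))
    }
}

fn main() {
    // With the flag set, the boundary stays at the fork (269568 on mainnet).
    assert_eq!(data_availability_boundary(true, Some(269_568), 400_000), Some(269_568));
    // Without it, only the most recent 4096 epochs of blobs are required.
    assert_eq!(data_availability_boundary(false, Some(269_568), 400_000), Some(395_904));
}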
diff --git a/beacon_node/beacon_processor/Cargo.toml b/beacon_node/beacon_processor/Cargo.toml
index afd4660c9a3..262badf7f97 100644
--- a/beacon_node/beacon_processor/Cargo.toml
+++ b/beacon_node/beacon_processor/Cargo.toml
@@ -12,6 +12,7 @@ logging = { workspace = true }
 metrics = { workspace = true }
 num_cpus = { workspace = true }
 parking_lot = { workspace = true }
+rayon = { workspace = true }
 serde = { workspace = true }
 slot_clock = { workspace = true }
 strum = { workspace = true }
diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs
index 84723fb6a09..64aeb4ceaf2 100644
--- a/beacon_node/beacon_processor/src/lib.rs
+++ b/beacon_node/beacon_processor/src/lib.rs
@@ -38,6 +38,7 @@
 //! checks the queues to see if there are more parcels of work that can be spawned in a new worker
 //! task.
 
+use crate::rayon_manager::RayonManager;
 use crate::work_reprocessing_queue::{
     QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, ReprocessQueueMessage,
 };
@@ -47,6 +48,7 @@ use lighthouse_network::{MessageId, NetworkGlobals, PeerId};
 use logging::TimeLatch;
 use logging::crit;
 use parking_lot::Mutex;
+use rayon::ThreadPool;
 pub use scheduler::work_reprocessing_queue;
 use serde::{Deserialize, Serialize};
 use slot_clock::SlotClock;
@@ -74,6 +76,7 @@ use work_reprocessing_queue::{
 };
 
 mod metrics;
+pub mod rayon_manager;
 pub mod scheduler;
 
 /// The maximum size of the channel for work events to the `BeaconProcessor`.
@@ -603,7 +606,7 @@ pub enum Work<E: EthSpec> {
         process_fn: BlockingFn,
     },
     ChainSegment(AsyncFn),
-    ChainSegmentBackfill(AsyncFn),
+    ChainSegmentBackfill(BlockingFn),
     Status(BlockingFn),
     BlocksByRangeRequest(AsyncFn),
     BlocksByRootsRequest(AsyncFn),
@@ -807,6 +810,7 @@ pub struct BeaconProcessor<E: EthSpec> {
     pub network_globals: Arc<NetworkGlobals<E>>,
     pub executor: TaskExecutor,
     pub current_workers: usize,
+    pub rayon_manager: RayonManager,
     pub config: BeaconProcessorConfig,
 }
 
@@ -1603,7 +1607,17 @@ impl<E: EthSpec> BeaconProcessor<E> {
             Work::BlocksByRangeRequest(work) | Work::BlocksByRootsRequest(work) => {
                 task_spawner.spawn_async(work)
             }
-            Work::ChainSegmentBackfill(process_fn) => task_spawner.spawn_async(process_fn),
+            Work::ChainSegmentBackfill(process_fn) => {
+                if self.config.enable_backfill_rate_limiting {
+                    task_spawner.spawn_blocking_with_rayon(
+                        self.rayon_manager.low_priority_threadpool.clone(),
+                        process_fn,
+                    )
+                } else {
+                    // Use the global rayon thread pool if backfill rate limiting is disabled.
+                    task_spawner.spawn_blocking(process_fn)
+                }
+            }
             Work::ApiRequestP0(process_fn) | Work::ApiRequestP1(process_fn) => match process_fn {
                 BlockingOrAsync::Blocking(process_fn) => task_spawner.spawn_blocking(process_fn),
                 BlockingOrAsync::Async(process_fn) => task_spawner.spawn_async(process_fn),
@@ -1665,6 +1679,22 @@ impl TaskSpawner {
             WORKER_TASK_NAME,
         )
     }
+
+    /// Spawns a blocking task on a rayon thread pool, dropping the `SendOnDrop` after task completion.
+    fn spawn_blocking_with_rayon<F>(self, thread_pool: Arc<ThreadPool>, task: F)
+    where
+        F: FnOnce() + Send + 'static,
+    {
+        self.executor.spawn_blocking(
+            move || {
+                thread_pool.install(|| {
+                    task();
+                });
+                drop(self.send_idle_on_drop)
+            },
+            WORKER_TASK_NAME,
+        )
+    }
 }
 
 /// This struct will send a message on `self.tx` when it is dropped. An error will be logged
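`spawn_blocking_with_rayon` leans on rayon's `ThreadPool::install`, which runs a closure inside a specific pool so that any nested rayon parallelism (here, the compute-heavy parts of backfill batch processing) is scheduled on that pool's threads rather than on the global pool. A minimal sketch of that primitive, independent of the Lighthouse types:

use rayon::ThreadPoolBuilder;
use rayon::prelude::*;

fn main() {
    // Build a deliberately small pool, as the low-priority backfill pool does.
    let pool = ThreadPoolBuilder::new().num_threads(2).build().unwrap();

    // `install` blocks until the closure returns; the nested parallel iterator
    // runs on the two threads of `pool`, not on rayon's global pool.
    let sum: u64 = pool.install(|| (0..1_000u64).into_par_iter().sum());
    assert_eq!(sum, 499_500);
}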
diff --git a/beacon_node/beacon_processor/src/rayon_manager.rs b/beacon_node/beacon_processor/src/rayon_manager.rs
new file mode 100644
index 00000000000..99fe32d5cc4
--- /dev/null
+++ b/beacon_node/beacon_processor/src/rayon_manager.rs
@@ -0,0 +1,27 @@
+use rayon::{ThreadPool, ThreadPoolBuilder};
+use std::sync::Arc;
+
+const DEFAULT_LOW_PRIORITY_DIVISOR: usize = 4;
+const MINIMUM_LOW_PRIORITY_THREAD_COUNT: usize = 1;
+
+pub struct RayonManager {
+    /// Smaller rayon thread pool for lower-priority, compute-intensive tasks.
+    /// By default ~25% of CPUs, with a minimum of 1 thread.
+    pub low_priority_threadpool: Arc<ThreadPool>,
+}
+
+impl Default for RayonManager {
+    fn default() -> Self {
+        let low_prio_threads =
+            (num_cpus::get() / DEFAULT_LOW_PRIORITY_DIVISOR).max(MINIMUM_LOW_PRIORITY_THREAD_COUNT);
+        let low_priority_threadpool = Arc::new(
+            ThreadPoolBuilder::new()
+                .num_threads(low_prio_threads)
+                .build()
+                .expect("failed to build low-priority rayon pool"),
+        );
+        Self {
+            low_priority_threadpool,
+        }
+    }
+}
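A hypothetical usage sketch (the closure body is a placeholder, and it assumes the `beacon_processor` crate is available as a dependency):

use beacon_processor::rayon_manager::RayonManager;

fn main() {
    let manager = RayonManager::default();
    // Hand the pool to a worker by cloning the Arc; `install` confines any
    // nested rayon parallelism to the small low-priority pool.
    let pool = manager.low_priority_threadpool.clone();
    pool.install(|| {
        // Placeholder for compute-heavy backfill work, e.g. batch verification.
    });
}

Capping the pool at roughly a quarter of the cores keeps long-running backfill verification from starving the global rayon pool used by higher-priority work.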
diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs
index 9565e57589d..8c33cf58693 100644
--- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs
+++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs
@@ -37,7 +37,9 @@ const TASK_NAME: &str = "beacon_processor_reprocess_queue";
 const GOSSIP_BLOCKS: &str = "gossip_blocks";
 const RPC_BLOCKS: &str = "rpc_blocks";
 const ATTESTATIONS: &str = "attestations";
+const ATTESTATIONS_PER_ROOT: &str = "attestations_per_root";
 const LIGHT_CLIENT_UPDATES: &str = "lc_updates";
+const LIGHT_CLIENT_UPDATES_PER_PARENT_ROOT: &str = "lc_updates_per_parent_root";
 
 /// Queue blocks for re-processing with an `ADDITIONAL_QUEUED_BLOCK_DELAY` after the slot starts.
 /// This is to account for any slight drift in the system clock.
@@ -171,7 +173,7 @@ pub struct IgnoredRpcBlock {
 }
 
 /// A backfill batch work that has been queued for processing later.
-pub struct QueuedBackfillBatch(pub AsyncFn);
+pub struct QueuedBackfillBatch(pub BlockingFn);
 
 pub struct QueuedColumnReconstruction {
     pub block_root: Hash256,
@@ -829,10 +831,19 @@
                     );
                 }
 
-                if let Some(queued_atts) = self.awaiting_attestations_per_root.get_mut(&root)
-                    && let Some(index) = queued_atts.iter().position(|&id| id == queued_id)
+                if let Entry::Occupied(mut queued_atts) =
+                    self.awaiting_attestations_per_root.entry(root)
+                    && let Some(index) =
+                        queued_atts.get().iter().position(|&id| id == queued_id)
                 {
-                    queued_atts.swap_remove(index);
+                    let queued_atts_mut = queued_atts.get_mut();
+                    queued_atts_mut.swap_remove(index);
+
+                    // If the vec is empty after this attestation's removal, we need to delete
+                    // the entry to prevent bloating the hashmap indefinitely.
+                    if queued_atts_mut.is_empty() {
+                        queued_atts.remove_entry();
+                    }
                 }
             }
         }
@@ -853,13 +864,19 @@
                     error!("Failed to send scheduled light client optimistic update");
                 }
 
-                if let Some(queued_lc_updates) = self
-                    .awaiting_lc_updates_per_parent_root
-                    .get_mut(&parent_root)
-                    && let Some(index) =
-                        queued_lc_updates.iter().position(|&id| id == queued_id)
+                if let Entry::Occupied(mut queued_lc_updates) =
+                    self.awaiting_lc_updates_per_parent_root.entry(parent_root)
+                    && let Some(index) = queued_lc_updates
+                        .get()
+                        .iter()
+                        .position(|&id| id == queued_id)
                 {
-                    queued_lc_updates.swap_remove(index);
+                    let queued_lc_updates_mut = queued_lc_updates.get_mut();
+                    queued_lc_updates_mut.swap_remove(index);
+
+                    if queued_lc_updates_mut.is_empty() {
+                        queued_lc_updates.remove_entry();
+                    }
                 }
             }
         }
@@ -929,11 +946,21 @@
             &[ATTESTATIONS],
             self.attestations_delay_queue.len() as i64,
         );
+        metrics::set_gauge_vec(
+            &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL,
+            &[ATTESTATIONS_PER_ROOT],
+            self.awaiting_attestations_per_root.len() as i64,
+        );
         metrics::set_gauge_vec(
             &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL,
             &[LIGHT_CLIENT_UPDATES],
             self.lc_updates_delay_queue.len() as i64,
         );
+        metrics::set_gauge_vec(
+            &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL,
+            &[LIGHT_CLIENT_UPDATES_PER_PARENT_ROOT],
+            self.awaiting_lc_updates_per_parent_root.len() as i64,
+        );
     }
 
     fn recompute_next_backfill_batch_event(&mut self) {
@@ -979,6 +1006,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::BeaconProcessorConfig;
     use logging::create_test_tracing_subscriber;
     use slot_clock::{ManualSlotClock, TestingSlotClock};
     use std::ops::Add;
@@ -1056,7 +1084,7 @@ mod tests {
         // Now queue a backfill sync batch.
         work_reprocessing_tx
             .try_send(ReprocessQueueMessage::BackfillSync(QueuedBackfillBatch(
-                Box::pin(async {}),
+                Box::new(|| {}),
             )))
             .unwrap();
         tokio::task::yield_now().await;
@@ -1101,4 +1129,97 @@ mod tests {
             Duration::from_secs(slot_duration),
         )
     }
+
+    fn test_queue() -> ReprocessQueue<TestingSlotClock> {
+        create_test_tracing_subscriber();
+
+        let config = BeaconProcessorConfig::default();
+        let (ready_work_tx, _) = mpsc::channel::<ReadyWork>(config.max_scheduled_work_queue_len);
+        let (_, reprocess_work_rx) =
+            mpsc::channel::<ReprocessQueueMessage>(config.max_scheduled_work_queue_len);
+        let slot_clock = Arc::new(testing_slot_clock(12));
+
+        ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock)
+    }
+
+    // This is a regression test for a memory leak in `awaiting_attestations_per_root`.
+    // See: https://github.com/sigp/lighthouse/pull/8065
+    #[tokio::test]
+    async fn prune_awaiting_attestations_per_root() {
+        create_test_tracing_subscriber();
+
+        let mut queue = test_queue();
+
+        // Pause time so it only advances manually.
+        tokio::time::pause();
+
+        let beacon_block_root = Hash256::repeat_byte(0xaf);
+
+        // Insert an attestation.
+        let att = ReprocessQueueMessage::UnknownBlockUnaggregate(QueuedUnaggregate {
+            beacon_block_root,
+            process_fn: Box::new(|| {}),
+        });
+
+        // Process the event to enter it into the delay queue.
+        queue.handle_message(InboundEvent::Msg(att));
+
+        // Check that it is queued.
+        assert_eq!(queue.awaiting_attestations_per_root.len(), 1);
+        assert!(
+            queue
+                .awaiting_attestations_per_root
+                .contains_key(&beacon_block_root)
+        );
+
+        // Advance time to expire the attestation.
+        advance_time(&queue.slot_clock, 2 * QUEUED_ATTESTATION_DELAY).await;
+        let ready_msg = queue.next().await.unwrap();
+        assert!(matches!(ready_msg, InboundEvent::ReadyAttestation(_)));
+        queue.handle_message(ready_msg);
+
+        // The entry for the block root should be gone.
+        assert!(queue.awaiting_attestations_per_root.is_empty());
+    }
+
+    // This is a regression test for a memory leak in `awaiting_lc_updates_per_parent_root`.
+    // See: https://github.com/sigp/lighthouse/pull/8065
+    #[tokio::test]
+    async fn prune_awaiting_lc_updates_per_parent_root() {
+        create_test_tracing_subscriber();
+
+        let mut queue = test_queue();
+
+        // Pause time so it only advances manually.
+        tokio::time::pause();
+
+        let parent_root = Hash256::repeat_byte(0xaf);
+
+        // Insert a light client optimistic update.
+        let msg =
+            ReprocessQueueMessage::UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate {
+                parent_root,
+                process_fn: Box::new(|| {}),
+            });
+
+        // Process the event to enter it into the delay queue.
+        queue.handle_message(InboundEvent::Msg(msg));
+
+        // Check that it is queued.
+        assert_eq!(queue.awaiting_lc_updates_per_parent_root.len(), 1);
+        assert!(
+            queue
+                .awaiting_lc_updates_per_parent_root
+                .contains_key(&parent_root)
+        );
+
+        // Advance time to expire the update.
+        advance_time(&queue.slot_clock, 2 * QUEUED_LIGHT_CLIENT_UPDATE_DELAY).await;
+        let ready_msg = queue.next().await.unwrap();
+        assert!(matches!(ready_msg, InboundEvent::ReadyLightClientUpdate(_)));
+        queue.handle_message(ready_msg);
+
+        // The entry for the parent root should be gone.
+        assert!(queue.awaiting_lc_updates_per_parent_root.is_empty());
+    }
 }
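The leak fix and both regression tests hinge on one pattern: going through the map's `Entry` API so that the `Vec` edit and the removal of an emptied entry happen via a single occupied-entry handle. A standalone sketch of the pattern, with plain `u64` keys and IDs in place of the queue's types (it uses let-chains, as the patch itself does):

use std::collections::HashMap;
use std::collections::hash_map::Entry;

/// Remove `id` from the vec stored under `key`, deleting the whole entry when
/// the vec empties so the map cannot grow without bound.
fn remove_queued_id(map: &mut HashMap<u64, Vec<u64>>, key: u64, id: u64) {
    if let Entry::Occupied(mut queued) = map.entry(key)
        && let Some(index) = queued.get().iter().position(|&x| x == id)
    {
        queued.get_mut().swap_remove(index);
        if queued.get().is_empty() {
            queued.remove_entry();
        }
    }
}

fn main() {
    let mut map = HashMap::from([(1u64, vec![7u64])]);
    remove_queued_id(&mut map, 1, 7);
    assert!(map.is_empty());
}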
diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs
index d984d5fedce..87cdcc45ef7 100644
--- a/beacon_node/client/src/builder.rs
+++ b/beacon_node/client/src/builder.rs
@@ -17,6 +17,7 @@ use beacon_chain::{
     store::{HotColdDB, ItemStore, StoreConfig},
 };
 use beacon_chain::{Kzg, LightClientProducerEvent};
+use beacon_processor::rayon_manager::RayonManager;
 use beacon_processor::{BeaconProcessor, BeaconProcessorChannels};
 use beacon_processor::{BeaconProcessorConfig, BeaconProcessorQueueLengths};
 use environment::RuntimeContext;
@@ -680,6 +681,7 @@ where
             executor: beacon_processor_context.executor.clone(),
             current_workers: 0,
             config: beacon_processor_config,
+            rayon_manager: RayonManager::default(),
         }
         .spawn_manager(
             beacon_processor_channels.beacon_processor_rx,
diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs
index fe9e0dff704..7be8960e691 100644
--- a/beacon_node/http_api/src/test_utils.rs
+++ b/beacon_node/http_api/src/test_utils.rs
@@ -5,6 +5,7 @@ use beacon_chain::{
 };
 use beacon_processor::{
     BeaconProcessor, BeaconProcessorChannels, BeaconProcessorConfig, BeaconProcessorQueueLengths,
+    rayon_manager::RayonManager,
 };
 use directory::DEFAULT_ROOT_DIR;
 use eth2::{BeaconNodeHttpClient, Timeouts};
@@ -247,6 +248,7 @@ pub async fn create_api_server_with_config(
         executor: test_runtime.task_executor.clone(),
         current_workers: 0,
         config: beacon_processor_config,
+        rayon_manager: RayonManager::default(),
     }
     .spawn_manager(
         beacon_processor_rx,
diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs
index 60fda12cc20..18a9874252a 100644
--- a/beacon_node/lighthouse_tracing/src/lib.rs
+++ b/beacon_node/lighthouse_tracing/src/lib.rs
@@ -26,6 +26,7 @@ pub const SPAN_PROCESS_RPC_BLOCK: &str = "process_rpc_block";
 pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs";
 pub const SPAN_PROCESS_RPC_CUSTODY_COLUMNS: &str = "process_rpc_custody_columns";
 pub const SPAN_PROCESS_CHAIN_SEGMENT: &str = "process_chain_segment";
+pub const SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL: &str = "process_chain_segment_backfill";
 
 /// Fork choice root spans
 pub const SPAN_RECOMPUTE_HEAD: &str = "recompute_head_at_slot";
@@ -61,6 +62,7 @@ pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[
     SPAN_PROCESS_RPC_BLOBS,
     SPAN_PROCESS_RPC_CUSTODY_COLUMNS,
     SPAN_PROCESS_CHAIN_SEGMENT,
+    SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL,
     SPAN_HANDLE_BLOCKS_BY_RANGE_REQUEST,
     SPAN_HANDLE_BLOBS_BY_RANGE_REQUEST,
     SPAN_HANDLE_DATA_COLUMNS_BY_RANGE_REQUEST,
diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs
index 691c06f2687..85ccde1d591 100644
--- a/beacon_node/network/src/network_beacon_processor/mod.rs
+++ b/beacon_node/network/src/network_beacon_processor/mod.rs
@@ -6,9 +6,7 @@ use beacon_chain::data_column_verification::{GossipDataColumnError, observe_gossip_data_column};
 use beacon_chain::fetch_blobs::{
     EngineGetBlobsOutput, FetchEngineBlobError, fetch_and_process_engine_blobs,
 };
-use beacon_chain::{
-    AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, NotifyExecutionLayer,
-};
+use beacon_chain::{AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError};
 use beacon_processor::{
     BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, Work,
     WorkEvent as BeaconWorkEvent,
@@ -500,33 +498,23 @@
         process_id: ChainSegmentProcessId,
         blocks: Vec<RpcBlock<T::EthSpec>>,
     ) -> Result<(), Error<T::EthSpec>> {
-        let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. });
         debug!(blocks = blocks.len(), id = ?process_id, "Batch sending for process");
 
         let processor = self.clone();
-        let process_fn = async move {
-            let notify_execution_layer = if processor
-                .network_globals
-                .sync_state
-                .read()
-                .is_syncing_finalized()
-            {
-                NotifyExecutionLayer::No
-            } else {
-                NotifyExecutionLayer::Yes
-            };
-            processor
-                .process_chain_segment(process_id, blocks, notify_execution_layer)
-                .await;
-        };
-        let process_fn = Box::pin(process_fn);
 
         // Back-sync batches are dispatched with a different `Work` variant so
         // they can be rate-limited.
-        let work = if is_backfill {
-            Work::ChainSegmentBackfill(process_fn)
-        } else {
-            Work::ChainSegment(process_fn)
+        let work = match process_id {
+            ChainSegmentProcessId::RangeBatchId(_, _) => {
+                let process_fn = async move {
+                    processor.process_chain_segment(process_id, blocks).await;
+                };
+                Work::ChainSegment(Box::pin(process_fn))
+            }
+            ChainSegmentProcessId::BackSyncBatchId(_) => {
+                let process_fn =
+                    move || processor.process_chain_segment_backfill(process_id, blocks);
+                Work::ChainSegmentBackfill(Box::new(process_fn))
+            }
         };
 
         self.try_send(BeaconWorkEvent {
diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs
index edeed7e98cf..b61a6e25c50 100644
--- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs
+++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs
@@ -19,9 +19,10 @@ use beacon_processor::{
 use beacon_processor::{Work, WorkEvent};
 use lighthouse_network::PeerAction;
 use lighthouse_tracing::{
-    SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_BLOCK,
-    SPAN_PROCESS_RPC_CUSTODY_COLUMNS,
+    SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS,
+    SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_CUSTODY_COLUMNS,
 };
+use logging::crit;
 use std::sync::Arc;
 use std::time::Duration;
 use store::KzgCommitment;
@@ -434,27 +435,42 @@
         parent = None,
         level = "debug",
         skip_all,
-        fields(sync_type = ?sync_type, downloaded_blocks = downloaded_blocks.len())
+        fields(process_id = ?process_id, downloaded_blocks = downloaded_blocks.len())
     )]
     pub async fn process_chain_segment(
         &self,
-        sync_type: ChainSegmentProcessId,
+        process_id: ChainSegmentProcessId,
         downloaded_blocks: Vec<RpcBlock<T::EthSpec>>,
-        notify_execution_layer: NotifyExecutionLayer,
     ) {
-        let result = match sync_type {
-            // this a request from the range sync
-            ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => {
-                let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
-                let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
-                let sent_blocks = downloaded_blocks.len();
-
-                match self
-                    .process_blocks(downloaded_blocks.iter(), notify_execution_layer)
-                    .await
-                {
-                    (imported_blocks, Ok(_)) => {
-                        debug!(
+        let ChainSegmentProcessId::RangeBatchId(chain_id, epoch) = process_id else {
+            // This is a backfill batch, which `send_chain_segment` routes to
+            // `process_chain_segment_backfill`; reaching here is a bug.
+            crit!(
+                error = "process_chain_segment called on a variant other than RangeBatchId",
+                "Please notify the devs"
+            );
+            return;
+        };
+
+        let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
+        let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
+        let sent_blocks = downloaded_blocks.len();
+        let notify_execution_layer = if self
+            .network_globals
+            .sync_state
+            .read()
+            .is_syncing_finalized()
+        {
+            NotifyExecutionLayer::No
+        } else {
+            NotifyExecutionLayer::Yes
+        };
+
+        let result = match self
+            .process_blocks(downloaded_blocks.iter(), notify_execution_layer)
+            .await
+        {
+            (imported_blocks, Ok(_)) => {
+                debug!(
                 batch_epoch = %epoch,
                 first_block_slot = start_slot,
                 chain = chain_id,
                 last_block_slot = end_slot,
                 processed_blocks = sent_blocks,
                 service= "sync",
                 "Batch processed");
-                        BatchProcessResult::Success {
-                            sent_blocks,
-                            imported_blocks,
-                        }
-                    }
-                    (imported_blocks, Err(e)) => {
-                        debug!(
+                BatchProcessResult::Success {
+                    sent_blocks,
+                    imported_blocks,
+                }
+            }
+            (imported_blocks, Err(e)) => {
+                debug!(
                 batch_epoch = %epoch,
                 first_block_slot = start_slot,
                 chain = chain_id,
@@ -477,33 +493,61 @@
                 error = %e.message,
                 service = "sync",
                 "Batch processing failed");
-                        match e.peer_action {
-                            Some(penalty) => BatchProcessResult::FaultyFailure {
-                                imported_blocks,
-                                penalty,
-                            },
-                            None => BatchProcessResult::NonFaultyFailure,
-                        }
-                    }
+                match e.peer_action {
+                    Some(penalty) => BatchProcessResult::FaultyFailure {
+                        imported_blocks,
+                        penalty,
+                    },
+                    None => BatchProcessResult::NonFaultyFailure,
                 }
             }
-            // this a request from the Backfill sync
-            ChainSegmentProcessId::BackSyncBatchId(epoch) => {
-                let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
-                let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
-                let sent_blocks = downloaded_blocks.len();
-                let n_blobs = downloaded_blocks
-                    .iter()
-                    .map(|wrapped| wrapped.n_blobs())
-                    .sum::<usize>();
-                let n_data_columns = downloaded_blocks
-                    .iter()
-                    .map(|wrapped| wrapped.n_data_columns())
-                    .sum::<usize>();
-
-                match self.process_backfill_blocks(downloaded_blocks) {
-                    (imported_blocks, Ok(_)) => {
-                        debug!(
+        };
+
+        self.send_sync_message(SyncMessage::BatchProcessed {
+            sync_type: process_id,
+            result,
+        });
+    }
+
+    /// Attempt to import the chain segment (`blocks`) to the beacon chain, informing the sync
+    /// thread if more blocks are needed to process it.
+    #[instrument(
+        name = SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL,
+        parent = None,
+        level = "debug",
+        skip_all,
+        fields(downloaded_blocks = downloaded_blocks.len())
+    )]
+    pub fn process_chain_segment_backfill(
+        &self,
+        process_id: ChainSegmentProcessId,
+        downloaded_blocks: Vec<RpcBlock<T::EthSpec>>,
+    ) {
+        let ChainSegmentProcessId::BackSyncBatchId(epoch) = process_id else {
+            // This is a range sync batch, which `send_chain_segment` routes to
+            // `process_chain_segment`; reaching here is a bug.
+            crit!(
+                error =
+                    "process_chain_segment_backfill called on a variant other than BackSyncBatchId",
+                "Please notify the devs"
+            );
+            return;
+        };
+
+        let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
+        let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
+        let sent_blocks = downloaded_blocks.len();
+        let n_blobs = downloaded_blocks
+            .iter()
+            .map(|wrapped| wrapped.n_blobs())
+            .sum::<usize>();
+        let n_data_columns = downloaded_blocks
+            .iter()
+            .map(|wrapped| wrapped.n_data_columns())
+            .sum::<usize>();
+
+        let result = match self.process_backfill_blocks(downloaded_blocks) {
+            (imported_blocks, Ok(_)) => {
+                debug!(
                 batch_epoch = %epoch,
                 first_block_slot = start_slot,
                 keep_execution_payload = !self.chain.store.get_config().prune_payloads,
@@ -513,34 +557,35 @@
                 last_block_slot = end_slot,
                 processed_blocks = sent_blocks,
                 processed_blobs = n_blobs,
                 processed_data_columns = n_data_columns,
                 service= "sync",
                 "Backfill batch processed");
-                        BatchProcessResult::Success {
-                            sent_blocks,
-                            imported_blocks,
-                        }
-                    }
-                    (_, Err(e)) => {
-                        debug!(
-                            batch_epoch = %epoch,
-                            first_block_slot = start_slot,
-                            last_block_slot = end_slot,
-                            processed_blobs = n_blobs,
-                            error = %e.message,
-                            service = "sync",
-                            "Backfill batch processing failed"
-                        );
-                        match e.peer_action {
-                            Some(penalty) => BatchProcessResult::FaultyFailure {
-                                imported_blocks: 0,
-                                penalty,
-                            },
-                            None => BatchProcessResult::NonFaultyFailure,
-                        }
-                    }
+                BatchProcessResult::Success {
+                    sent_blocks,
+                    imported_blocks,
+                }
+            }
+            (_, Err(e)) => {
+                debug!(
+                    batch_epoch = %epoch,
+                    first_block_slot = start_slot,
+                    last_block_slot = end_slot,
+                    processed_blobs = n_blobs,
+                    error = %e.message,
+                    service = "sync",
+                    "Backfill batch processing failed"
+                );
+                match e.peer_action {
+                    Some(penalty) => BatchProcessResult::FaultyFailure {
+                        imported_blocks: 0,
+                        penalty,
+                    },
+                    None => BatchProcessResult::NonFaultyFailure,
                 }
             }
         };
 
-        self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result });
+        self.send_sync_message(SyncMessage::BatchProcessed {
+            sync_type: process_id,
+            result,
+        });
     }
 
     /// Helper function to process blocks batches which only consumes the chain and blocks to process.
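Both handlers now open with the same guard: a `let`-`else` that destructures the expected `ChainSegmentProcessId` variant and bails out loudly on anything else. A simplified sketch of that contract (local enum and plain logging; not the real types):

enum ChainSegmentProcessId {
    RangeBatchId(u64, u64),
    BackSyncBatchId(u64),
}

fn process_chain_segment(process_id: ChainSegmentProcessId) {
    // let-else enforces the contract: only range batches belong here.
    let ChainSegmentProcessId::RangeBatchId(chain_id, epoch) = process_id else {
        eprintln!("bug: backfill batch routed to the range sync handler");
        return;
    };
    println!("processing range batch for chain {chain_id} at epoch {epoch}");
}

fn main() {
    process_chain_segment(ChainSegmentProcessId::RangeBatchId(1, 42));
    process_chain_segment(ChainSegmentProcessId::BackSyncBatchId(7)); // hits the guard
}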
diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs
index d3a93d48637..99410bc5e51 100644
--- a/beacon_node/network/src/network_beacon_processor/tests.rs
+++ b/beacon_node/network/src/network_beacon_processor/tests.rs
@@ -17,6 +17,7 @@ use beacon_chain::test_utils::{
     test_spec,
 };
 use beacon_chain::{BeaconChain, WhenSlotSkipped};
+use beacon_processor::rayon_manager::RayonManager;
 use beacon_processor::{work_reprocessing_queue::*, *};
 use gossipsub::MessageAcceptance;
 use itertools::Itertools;
@@ -266,6 +267,7 @@ impl TestRig {
             executor,
             current_workers: 0,
             config: beacon_processor_config,
+            rayon_manager: RayonManager::default(),
         }
         .spawn_manager(
             beacon_processor_rx,
@@ -458,10 +460,10 @@ impl TestRig {
             .unwrap();
     }
 
-    pub fn enqueue_backfill_batch(&self) {
+    pub fn enqueue_backfill_batch(&self, epoch: Epoch) {
         self.network_beacon_processor
             .send_chain_segment(
-                ChainSegmentProcessId::BackSyncBatchId(Epoch::default()),
+                ChainSegmentProcessId::BackSyncBatchId(epoch),
                 Vec::default(),
             )
             .unwrap();
@@ -606,7 +608,7 @@ impl TestRig {
     }
 
     pub async fn assert_event_journal(&mut self, expected: &[&str]) {
-        self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT)
+        self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT, false, false)
             .await
     }
 
@@ -623,6 +625,8 @@ impl TestRig {
                 .chain(std::iter::once(NOTHING_TO_DO))
                 .collect::<Vec<_>>(),
             timeout,
+            false,
+            false,
         )
         .await
     }
@@ -666,11 +670,21 @@ impl TestRig {
         &mut self,
         expected: &[&str],
         timeout: Duration,
+        ignore_worker_freed: bool,
+        ignore_nothing_to_do: bool,
     ) {
         let mut events = Vec::with_capacity(expected.len());
 
         let drain_future = async {
             while let Some(event) = self.work_journal_rx.recv().await {
+                if event == WORKER_FREED && ignore_worker_freed {
+                    continue;
+                }
+
+                if event == NOTHING_TO_DO && ignore_nothing_to_do {
+                    continue;
+                }
+
                 events.push(event);
 
                 // Break as soon as we collect the desired number of events.
@@ -1384,6 +1398,8 @@ async fn requeue_unknown_block_gossip_attestation_without_import() {
             NOTHING_TO_DO,
         ],
         Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY,
+        false,
+        false,
     )
     .await;
 
@@ -1424,6 +1440,8 @@ async fn requeue_unknown_block_gossip_aggregated_attestation_without_import() {
             NOTHING_TO_DO,
         ],
        Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY,
+        false,
+        false,
     )
     .await;
 
@@ -1558,8 +1576,8 @@ async fn test_backfill_sync_processing() {
     // (not straight forward to manipulate `TestingSlotClock` due to cloning of `SlotClock` in code)
     // and makes the test very slow, hence timing calculation is unit tested separately in
     // `work_reprocessing_queue`.
-    for _ in 0..1 {
-        rig.enqueue_backfill_batch();
+    for i in 0..1 {
+        rig.enqueue_backfill_batch(Epoch::new(i));
         // ensure queued batch is not processed until later
         rig.assert_no_events_for(Duration::from_millis(100)).await;
         // A new batch should be processed within a slot.
@@ -1570,6 +1588,8 @@ async fn test_backfill_sync_processing() {
                 NOTHING_TO_DO,
             ],
             rig.chain.slot_clock.slot_duration(),
+            false,
+            false,
         )
         .await;
     }
 }
@@ -1590,8 +1610,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() {
     )
     .await;
 
-    for _ in 0..3 {
-        rig.enqueue_backfill_batch();
+    for i in 0..3 {
+        rig.enqueue_backfill_batch(Epoch::new(i));
     }
 
     // ensure all batches are processed
@@ -1602,6 +1622,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() {
             WorkType::ChainSegmentBackfill.into(),
         ],
         Duration::from_millis(100),
+        true,
+        true,
     )
     .await;
 }
diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs
index 386eb721a04..9a981c65812 100644
--- a/beacon_node/src/cli.rs
+++ b/beacon_node/src/cli.rs
@@ -401,6 +401,16 @@ pub fn cli_app() -> Command {
                 .help_heading(FLAG_HEADER)
                 .display_order(0)
         )
+        .arg(
+            Arg::new("complete-blob-backfill")
+                .long("complete-blob-backfill")
+                .help("Download all blobs back to the Deneb fork epoch. This will likely result in \
+                       the node banning most of its peers.")
+                .action(ArgAction::SetTrue)
+                .help_heading(FLAG_HEADER)
+                .display_order(0)
+                .hide(true)
+        )
         .arg(
             Arg::new("enable-private-discovery")
                 .long("enable-private-discovery")
diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs
index 1b5f25b3175..3681556d11e 100644
--- a/beacon_node/src/config.rs
+++ b/beacon_node/src/config.rs
@@ -825,6 +825,14 @@ pub fn get_config(
         client_config.chain.genesis_backfill = true;
     }
 
+    client_config.chain.complete_blob_backfill = cli_args.get_flag("complete-blob-backfill");
+
+    // Ensure `prune_blobs` is false whenever complete-blob-backfill is set. This overrides any
+    // setting of `--prune-blobs true` applied earlier in flag parsing.
+    if client_config.chain.complete_blob_backfill {
+        client_config.store.prune_blobs = false;
+    }
+
     // Backfill sync rate-limiting
     client_config.beacon_processor.enable_backfill_rate_limiting =
         !cli_args.get_flag("disable-backfill-rate-limiting");
diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs
index 1fd3cc1b792..0660073bbc5 100644
--- a/lighthouse/tests/beacon_node.rs
+++ b/lighthouse/tests/beacon_node.rs
@@ -392,6 +392,37 @@ fn genesis_backfill_with_historic_flag() {
         .with_config(|config| assert!(config.chain.genesis_backfill));
 }
 
+#[test]
+fn complete_blob_backfill_default() {
+    CommandLineTest::new()
+        .run_with_zero_port()
+        .with_config(|config| assert!(!config.chain.complete_blob_backfill));
+}
+
+#[test]
+fn complete_blob_backfill_flag() {
+    CommandLineTest::new()
+        .flag("complete-blob-backfill", None)
+        .run_with_zero_port()
+        .with_config(|config| {
+            assert!(config.chain.complete_blob_backfill);
+            assert!(!config.store.prune_blobs);
+        });
+}
+
+// Even if `--prune-blobs true` is provided, `--complete-blob-backfill` should override it to false.
+#[test]
+fn complete_blob_backfill_and_prune_blobs_true() {
+    CommandLineTest::new()
+        .flag("complete-blob-backfill", None)
+        .flag("prune-blobs", Some("true"))
+        .run_with_zero_port()
+        .with_config(|config| {
+            assert!(config.chain.complete_blob_backfill);
+            assert!(!config.store.prune_blobs);
+        });
+}
+
 // Tests for Eth1 flags.
 // DEPRECATED but should not crash
 #[test]
diff --git a/scripts/print_release_diffs.py b/scripts/print_release_diffs.py
new file mode 100644
index 00000000000..d910b1be5bd
--- /dev/null
+++ b/scripts/print_release_diffs.py
@@ -0,0 +1,72 @@
+"""
+Summarise pull requests between two Lighthouse releases.
+
+Usage:
+    export GITHUB_TOKEN=your_token
+    python -m pip install requests==2.32.4
+    python print_release_diffs.py --base v7.0.1 --head release-v7.1.0
+
+Shows commit SHA, PR number, 'backwards-incompat' label status, and PR title.
+"""
+
+import requests
+import re
+import argparse
+import os
+
+GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
+if not GITHUB_TOKEN:
+    raise SystemExit("Error: Please set the GITHUB_TOKEN environment variable.")
+
+parser = argparse.ArgumentParser(description="Summarise PRs between two Lighthouse versions.")
+parser.add_argument("--base", required=True, help="Base tag or branch (older release)")
+parser.add_argument("--head", required=True, help="Head tag or branch (newer release)")
+args = parser.parse_args()
+
+BASE = args.base
+HEAD = args.head
+OWNER = 'sigp'
+REPO = 'lighthouse'
+
+HEADERS = {
+    'Authorization': f'token {GITHUB_TOKEN}',
+    'Accept': 'application/vnd.github+json'
+}
+
+def get_commits_between(base, head):
+    url = f'https://api.github.com/repos/{OWNER}/{REPO}/compare/{base}...{head}'
+    response = requests.get(url, headers=HEADERS)
+    response.raise_for_status()
+    return response.json()['commits']
+
+def has_backwards_incompat_label(pr_number):
+    url = f'https://api.github.com/repos/{OWNER}/{REPO}/issues/{pr_number}'
+    response = requests.get(url, headers=HEADERS)
+    if response.status_code != 200:
+        raise Exception(f"Failed to fetch PR #{pr_number}")
+    labels = response.json().get('labels', [])
+    return any(label['name'] == 'backwards-incompat' for label in labels)
+
+def main():
+    commits = get_commits_between(BASE, HEAD)
+    print("  # Commit SHA   PR Number   Has backwards-incompat Label   PR Title")
+    print("--- ------------ ----------- ------------------------------ --------------------------------------------")
+
+    for i, commit in enumerate(commits, 1):
+        sha = commit['sha'][:12]
+        message = commit['commit']['message']
+        pr_match = re.search(r"\(#(\d+)\)", message)
+
+        if not pr_match:
+            print(f"{i:<3} {sha} {'-':<11} {'-':<30} [NO PR MATCH]: {message.splitlines()[0]}")
+            continue
+
+        pr_number = int(pr_match.group(1))
+        try:
+            has_label = has_backwards_incompat_label(pr_number)
+            print(f"{i:<3} {sha} {pr_number:<11} {str(has_label):<30} {message.splitlines()[0]}")
+        except Exception as e:
+            print(f"{i:<3} {sha} {pr_number:<11} {'ERROR':<30} [ERROR FETCHING PR]: {e}")
+
+if __name__ == '__main__':
+    main()