-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Introduce ValidatorManager to track all requests and validators' scores #4752
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: testnet_conway
Are you sure you want to change the base?
Changes from all commits
ebc7930
28d17f0
6d40c58
20dc5ed
ba7691d
2a1976b
37b4a40
2cee3fb
1d1288b
e196913
9cc3f6b
89d4d9d
8b5590e
dc71720
b7946a6
5c552d1
6e4676f
7ec8053
06ca6ce
da08874
79f8d38
84bc85c
4210ace
cc74a7f
cdd68cd
de72645
d953d59
bf5f0a5
bab37b7
89401b1
aa3a5cb
1cb477f
4e87f0f
a3ecee2
2a491e6
c80e0e8
dd385ac
de5ddef
eecb470
9fdb79a
6e09717
f0a3362
d092a22
2c1d752
fc19004
148dc02
aa0d481
e20f8af
b871b6c
963f8ad
f8efd60
2a8d3fe
d07c3c6
5e6c51d
ba357d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -186,6 +186,58 @@ pub struct ClientContextOptions { | |||||||||||||||||||||
/// Maximum number of tasks that can be joined concurrently in the client. | ||||||||||||||||||||||
#[arg(long, default_value = "100")] | ||||||||||||||||||||||
pub max_joined_tasks: usize, | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Maximum concurrent requests per validator node | ||||||||||||||||||||||
#[arg( | ||||||||||||||||||||||
long, | ||||||||||||||||||||||
default_value_t = linera_core::client::validator_manager::MAX_IN_FLIGHT_REQUESTS, | ||||||||||||||||||||||
env = "LINERA_VALIDATOR_MANAGER_MAX_IN_FLIGHT_REQUESTS" | ||||||||||||||||||||||
)] | ||||||||||||||||||||||
pub max_in_flight_requests: usize, | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Maximum expected latency in milliseconds for score normalization | ||||||||||||||||||||||
#[arg( | ||||||||||||||||||||||
long, | ||||||||||||||||||||||
default_value_t = linera_core::client::validator_manager::MAX_ACCEPTED_LATENCY_MS, | ||||||||||||||||||||||
env = "LINERA_VALIDATOR_MANAGER_MAX_ACCEPTED_LATENCY_MS" | ||||||||||||||||||||||
)] | ||||||||||||||||||||||
pub max_accepted_latency_ms: f64, | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Time-to-live for cached responses in seconds | ||||||||||||||||||||||
#[arg( | ||||||||||||||||||||||
long, | ||||||||||||||||||||||
default_value_t = linera_core::client::validator_manager::CACHE_TTL_SEC, | ||||||||||||||||||||||
env = "LINERA_VALIDATOR_MANAGER_CACHE_TTL_SEC" | ||||||||||||||||||||||
)] | ||||||||||||||||||||||
pub cache_ttl_sec: u64, | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Maximum number of entries in the cache | ||||||||||||||||||||||
#[arg( | ||||||||||||||||||||||
long, | ||||||||||||||||||||||
default_value_t = linera_core::client::validator_manager::CACHE_MAX_SIZE, | ||||||||||||||||||||||
env = "LINERA_VALIDATOR_MANAGER_CACHE_MAX_SIZE" | ||||||||||||||||||||||
)] | ||||||||||||||||||||||
pub cache_max_size: usize, | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Maximum latency for an in-flight request before we stop deduplicating it (in milliseconds) | ||||||||||||||||||||||
#[arg( | ||||||||||||||||||||||
long, | ||||||||||||||||||||||
default_value_t = linera_core::client::validator_manager::MAX_REQUEST_TTL_MS, | ||||||||||||||||||||||
env = "LINERA_VALIDATOR_MANAGER_MAX_REQUEST_TTL_MS" | ||||||||||||||||||||||
)] | ||||||||||||||||||||||
pub max_request_ttl_ms: u64, | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Smoothing factor for Exponential Moving Averages (0 < alpha < 1). | ||||||||||||||||||||||
/// Higher values give more weight to recent observations. | ||||||||||||||||||||||
/// Typical values are between 0.01 and 0.5. | ||||||||||||||||||||||
/// A value of 0.1 means that 10% of the new observation is considered | ||||||||||||||||||||||
/// and 90% of the previous average is retained. | ||||||||||||||||||||||
Comment on lines
+230
to
+234
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||
#[arg( | ||||||||||||||||||||||
long, | ||||||||||||||||||||||
default_value_t = linera_core::client::validator_manager::ALPHA_SMOOTHING_FACTOR, | ||||||||||||||||||||||
env = "LINERA_VALIDATOR_MANAGER_ALPHA" | ||||||||||||||||||||||
)] | ||||||||||||||||||||||
pub alpha: f64, | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
impl ClientContextOptions { | ||||||||||||||||||||||
|
@@ -218,6 +270,20 @@ impl ClientContextOptions { | |||||||||||||||||||||
report_interval_secs: self.timing_interval, | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
/// Creates [`ValidatorManagerConfig`] with the corresponding values. | ||||||||||||||||||||||
pub(crate) fn to_validator_manager_config( | ||||||||||||||||||||||
&self, | ||||||||||||||||||||||
) -> linera_core::client::ValidatorManagerConfig { | ||||||||||||||||||||||
linera_core::client::ValidatorManagerConfig { | ||||||||||||||||||||||
max_in_flight_requests: self.max_in_flight_requests, | ||||||||||||||||||||||
max_accepted_latency_ms: self.max_accepted_latency_ms, | ||||||||||||||||||||||
cache_ttl_sec: self.cache_ttl_sec, | ||||||||||||||||||||||
cache_max_size: self.cache_max_size, | ||||||||||||||||||||||
max_request_ttl_ms: self.max_request_ttl_ms, | ||||||||||||||||||||||
alpha: self.alpha, | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
#[derive(Debug, Clone, clap::Args)] | ||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,9 @@ mod chain_client_state; | |
#[cfg(test)] | ||
#[path = "../unit_tests/client_tests.rs"] | ||
mod client_tests; | ||
pub mod validator_manager; | ||
|
||
pub use validator_manager::{ScoringWeights, ValidatorManager, ValidatorManagerConfig}; | ||
mod received_log; | ||
mod validator_trackers; | ||
|
||
|
@@ -149,6 +152,8 @@ pub struct Client<Env: Environment> { | |
/// Local node to manage the execution state and the local storage of the chains that we are | ||
/// tracking. | ||
local_node: LocalNodeClient<Env::Storage>, | ||
/// Manages the requests sent to validator nodes. | ||
validator_manager: ValidatorManager<Env>, | ||
/// The admin chain ID. | ||
admin_id: ChainId, | ||
/// Chains that should be tracked by the client. | ||
|
@@ -175,6 +180,7 @@ impl<Env: Environment> Client<Env> { | |
chain_worker_ttl: Duration, | ||
sender_chain_worker_ttl: Duration, | ||
options: ChainClientOptions, | ||
validator_manager_config: validator_manager::ValidatorManagerConfig, | ||
) -> Self { | ||
let tracked_chains = Arc::new(RwLock::new(tracked_chains.into_iter().collect())); | ||
let state = WorkerState::new_for_client( | ||
|
@@ -188,10 +194,12 @@ impl<Env: Environment> Client<Env> { | |
.with_chain_worker_ttl(chain_worker_ttl) | ||
.with_sender_chain_worker_ttl(sender_chain_worker_ttl); | ||
let local_node = LocalNodeClient::new(state); | ||
let validator_manager = ValidatorManager::new(vec![], validator_manager_config); | ||
|
||
Self { | ||
environment, | ||
local_node, | ||
validator_manager, | ||
chains: papaya::HashMap::new(), | ||
admin_id, | ||
tracked_chains, | ||
|
@@ -347,8 +355,10 @@ impl<Env: Environment> Client<Env> { | |
.checked_sub(u64::from(next_height)) | ||
.ok_or(ArithmeticError::Overflow)? | ||
.min(self.options.certificate_download_batch_size); | ||
let certificates = remote_node | ||
.query_certificates_from(chain_id, next_height, limit) | ||
|
||
let certificates = self | ||
.validator_manager | ||
.download_certificates(remote_node, chain_id, next_height, limit) | ||
.await?; | ||
let Some(info) = self.process_certificates(remote_node, certificates).await? else { | ||
break; | ||
|
@@ -362,17 +372,20 @@ impl<Env: Environment> Client<Env> { | |
|
||
async fn download_blobs( | ||
&self, | ||
remote_node: &RemoteNode<impl ValidatorNode>, | ||
blob_ids: impl IntoIterator<Item = BlobId>, | ||
remote_nodes: &[RemoteNode<Env::ValidatorNode>], | ||
blob_ids: &[BlobId], | ||
) -> Result<(), ChainClientError> { | ||
self.local_node | ||
.store_blobs( | ||
&futures::stream::iter(blob_ids.into_iter().map(|blob_id| async move { | ||
remote_node.try_download_blob(blob_id).await.unwrap() | ||
})) | ||
.buffer_unordered(self.options.max_joined_tasks) | ||
.collect::<Vec<_>>() | ||
.await, | ||
&self | ||
.validator_manager | ||
.download_blobs(remote_nodes, blob_ids, self.options.blob_download_timeout) | ||
.await? | ||
.ok_or_else(|| { | ||
ChainClientError::RemoteNodeError(NodeError::BlobsNotFound( | ||
blob_ids.to_vec(), | ||
)) | ||
})?, | ||
Comment on lines
+380
to
+388
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd put that in a local variable before we call `store_blobs`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, but that was already here – I just moved it around. |
||
) | ||
.await | ||
.map_err(Into::into) | ||
|
@@ -383,7 +396,7 @@ impl<Env: Environment> Client<Env> { | |
#[instrument(level = "trace", skip_all)] | ||
async fn process_certificates( | ||
&self, | ||
remote_node: &RemoteNode<impl ValidatorNode>, | ||
remote_node: &RemoteNode<Env::ValidatorNode>, | ||
certificates: Vec<ConfirmedBlockCertificate>, | ||
) -> Result<Option<Box<ChainInfo>>, ChainClientError> { | ||
let mut info = None; | ||
|
@@ -398,7 +411,8 @@ impl<Env: Environment> Client<Env> { | |
.await | ||
{ | ||
Err(LocalNodeError::BlobsNotFound(blob_ids)) => { | ||
self.download_blobs(remote_node, blob_ids).await?; | ||
self.download_blobs(&[remote_node.clone()], &blob_ids) | ||
.await?; | ||
} | ||
x => { | ||
x?; | ||
|
@@ -409,7 +423,8 @@ impl<Env: Environment> Client<Env> { | |
info = Some( | ||
match self.handle_certificate(certificate.clone()).await { | ||
Err(LocalNodeError::BlobsNotFound(blob_ids)) => { | ||
self.download_blobs(remote_node, blob_ids).await?; | ||
self.download_blobs(&[remote_node.clone()], &blob_ids) | ||
.await?; | ||
self.handle_certificate(certificate).await? | ||
} | ||
x => x?, | ||
|
@@ -663,7 +678,6 @@ impl<Env: Environment> Client<Env> { | |
) -> Result<(), ChainClientError> { | ||
let certificate = Box::new(certificate); | ||
let block = certificate.block(); | ||
|
||
// Recover history from the network. | ||
self.download_certificates(block.header.chain_id, block.header.height) | ||
.await?; | ||
|
@@ -672,14 +686,9 @@ impl<Env: Environment> Client<Env> { | |
if let Err(err) = self.process_certificate(certificate.clone()).await { | ||
match &err { | ||
LocalNodeError::BlobsNotFound(blob_ids) => { | ||
let blobs = RemoteNode::download_blobs( | ||
blob_ids, | ||
&self.validator_nodes().await?, | ||
self.options.blob_download_timeout, | ||
) | ||
.await | ||
.ok_or(err)?; | ||
self.local_node.store_blobs(&blobs).await?; | ||
self.download_blobs(&self.validator_nodes().await?, blob_ids) | ||
.await | ||
.map_err(|_| err)?; | ||
self.process_certificate(certificate).await?; | ||
} | ||
_ => { | ||
|
@@ -716,14 +725,7 @@ impl<Env: Environment> Client<Env> { | |
if let Err(err) = self.handle_certificate(certificate.clone()).await { | ||
match &err { | ||
LocalNodeError::BlobsNotFound(blob_ids) => { | ||
let blobs = RemoteNode::download_blobs( | ||
blob_ids, | ||
&nodes, | ||
self.options.blob_download_timeout, | ||
) | ||
.await | ||
.ok_or(err)?; | ||
self.local_node.store_blobs(&blobs).await?; | ||
self.download_blobs(&nodes, blob_ids).await?; | ||
self.handle_certificate(certificate.clone()).await?; | ||
} | ||
_ => { | ||
|
@@ -777,8 +779,13 @@ impl<Env: Environment> Client<Env> { | |
// anything from the validator - let the function try the other validators | ||
return Err(()); | ||
} | ||
let certificates = remote_node | ||
.download_certificates_by_heights(sender_chain_id, remote_heights) | ||
let certificates = self | ||
.validator_manager | ||
.download_certificates_by_heights( | ||
&remote_node, | ||
sender_chain_id, | ||
remote_heights, | ||
) | ||
.await | ||
.map_err(|_| ())?; | ||
let mut certificates_with_check_results = vec![]; | ||
|
@@ -934,8 +941,13 @@ impl<Env: Environment> Client<Env> { | |
// Stop if we've reached the height we've already processed. | ||
while current_height >= next_outbox_height { | ||
// Download the certificate for this height. | ||
let downloaded = remote_node | ||
.download_certificates_by_heights(sender_chain_id, vec![current_height]) | ||
let downloaded = self | ||
.validator_manager | ||
.download_certificates_by_heights( | ||
remote_node, | ||
sender_chain_id, | ||
vec![current_height], | ||
) | ||
.await?; | ||
let Some(certificate) = downloaded.into_iter().next() else { | ||
return Err(ChainClientError::CannotDownloadMissingSenderBlock { | ||
|
@@ -1119,9 +1131,9 @@ impl<Env: Environment> Client<Env> { | |
if !required_blob_ids.is_empty() { | ||
let mut blobs = Vec::new(); | ||
for blob_id in required_blob_ids { | ||
let blob_content = match remote_node | ||
.node | ||
.download_pending_blob(chain_id, blob_id) | ||
let blob_content = match self | ||
.validator_manager | ||
.download_pending_blob(remote_node, chain_id, blob_id) | ||
.await | ||
{ | ||
Ok(content) => content, | ||
|
@@ -1217,9 +1229,9 @@ impl<Env: Environment> Client<Env> { | |
Err(LocalNodeError::BlobsNotFound(blob_ids)) => { | ||
let mut blobs = Vec::new(); | ||
for blob_id in blob_ids { | ||
let blob_content = remote_node | ||
.node | ||
.download_pending_blob(chain_id, blob_id) | ||
let blob_content = self | ||
.validator_manager | ||
.download_pending_blob(remote_node, chain_id, blob_id) | ||
.await?; | ||
blobs.push(Blob::new(blob_content)); | ||
} | ||
|
@@ -1248,7 +1260,10 @@ impl<Env: Environment> Client<Env> { | |
communicate_concurrently( | ||
remote_nodes, | ||
async move |remote_node| { | ||
afck marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let certificate = remote_node.download_certificate_for_blob(blob_id).await?; | ||
let certificate = self | ||
.validator_manager | ||
.download_certificate_for_blob(&remote_node, blob_id) | ||
.await?; | ||
self.receive_sender_certificate( | ||
certificate, | ||
ReceiveCertificateMode::NeedsCheck, | ||
|
@@ -4123,7 +4138,7 @@ impl<Env: Environment> ChainClient<Env> { | |
} | ||
|
||
/// Performs `f` in parallel on multiple nodes, starting with a quadratically increasing delay on | ||
/// each subsequent node. Returns error `err` is all of the nodes fail. | ||
/// each subsequent node. Returns error `err` if all of the nodes fail. | ||
async fn communicate_concurrently<'a, A, E1, E2, F, G, R, V>( | ||
nodes: &[RemoteNode<A>], | ||
f: F, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we make that milliseconds, too? We use ms for almost all timing options.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought that we'd never need anything less than 1 second but I agree with you we should be consistent. Will change.