From fd313d50389a9cfd316d34a6c91746e217beaf79 Mon Sep 17 00:00:00 2001 From: Bohdan Khorolets Date: Fri, 25 Jul 2025 11:12:00 +0300 Subject: [PATCH 01/13] refactor(database,state-indexer): Introduce compact schema for state_changes that is more efficient to read from --- Cargo.lock | 2 + database/Cargo.toml | 7 + database/src/base/state_indexer.rs | 84 +- ...2509_create_state_changes_compact.down.sql | 4 + ...132509_create_state_changes_compact.up.sql | 75 ++ database/src/postgres/state_indexer.rs | 1020 ++++++++++++++--- logic-state-indexer/src/lib.rs | 19 +- 7 files changed, 1052 insertions(+), 159 deletions(-) create mode 100644 database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.down.sql create mode 100644 database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql diff --git a/Cargo.lock b/Cargo.lock index b1131d36..7608e529 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2191,6 +2191,8 @@ dependencies = [ "scylla", "serde_json", "sqlx", + "tokio", + "tracing", ] [[package]] diff --git a/database/Cargo.toml b/database/Cargo.toml index 6bc9eeab..7681960c 100644 --- a/database/Cargo.toml +++ b/database/Cargo.toml @@ -28,6 +28,13 @@ sqlx = { version = "0.8.2", features = [ num-bigint = "0.3.3" num-traits = "0.2.19" scylla = { version = "0.15.1", features = ["ssl", "full-serialization"] } +tokio = { version = "1.36.0", features = [ + "sync", + "time", + "macros", + "rt-multi-thread", +] } +tracing = "0.1.34" configuration.workspace = true readnode-primitives.workspace = true diff --git a/database/src/base/state_indexer.rs b/database/src/base/state_indexer.rs index 73048fc7..85ab5b37 100644 --- a/database/src/base/state_indexer.rs +++ b/database/src/base/state_indexer.rs @@ -59,7 +59,26 @@ pub trait StateIndexerDbManager { shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, + ) -> anyhow::Result<()> { + 
self.update_state_changes_data(shard_id, state_changes.clone(), block_height) + .await?; + self.insert_state_changes_data(shard_id, state_changes, block_height) + .await?; + Ok(()) + } + + async fn insert_state_changes_data( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()>; + + async fn update_state_changes_data( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, ) -> anyhow::Result<()>; async fn save_state_changes_access_key( @@ -67,7 +86,26 @@ pub trait StateIndexerDbManager { shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, + ) -> anyhow::Result<()> { + self.update_state_changes_access_key(shard_id, state_changes.clone(), block_height) + .await?; + self.insert_state_changes_access_key(shard_id, state_changes, block_height) + .await?; + Ok(()) + } + + async fn insert_state_changes_access_key( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()>; + + async fn update_state_changes_access_key( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, ) -> anyhow::Result<()>; async fn save_state_changes_contract( @@ -75,7 +113,26 @@ pub trait StateIndexerDbManager { shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, + ) -> anyhow::Result<()> { + self.update_state_changes_contract(shard_id, state_changes.clone(), block_height) + .await?; + self.insert_state_changes_contract(shard_id, state_changes, block_height) + .await?; + Ok(()) + } + + async fn insert_state_changes_contract( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()>; + + async fn update_state_changes_contract( + &self, + shard_id: 
near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, ) -> anyhow::Result<()>; async fn save_state_changes_account( @@ -83,6 +140,25 @@ pub trait StateIndexerDbManager { shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, + ) -> anyhow::Result<()> { + self.update_state_changes_account(shard_id, state_changes.clone(), block_height) + .await?; + self.insert_state_changes_account(shard_id, state_changes, block_height) + .await?; + Ok(()) + } + + async fn insert_state_changes_account( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()>; + + async fn update_state_changes_account( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, ) -> anyhow::Result<()>; } diff --git a/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.down.sql b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.down.sql new file mode 100644 index 00000000..097c6490 --- /dev/null +++ b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.down.sql @@ -0,0 +1,4 @@ +DROP TABLE IF EXISTS state_changes_data_compact CASCADE; +DROP TABLE IF EXISTS state_changes_access_key_compact CASCADE; +DROP TABLE IF EXISTS state_changes_contract_compact CASCADE; +DROP TABLE IF EXISTS state_changes_account_compact CASCADE; diff --git a/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql new file mode 100644 index 00000000..20903f13 --- /dev/null +++ b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql @@ -0,0 +1,75 @@ +CREATE TABLE IF NOT EXISTS state_changes_data_compact ( + account_id text NOT NULL, + data_key text NOT NULL, + 
data_value bytea NOT NULL, + block_height_from numeric(20,0) NOT NULL, + block_height_to numeric(20,0) NULL, + PRIMARY KEY (account_id, data_key, block_height_from) +) PARTITION BY HASH (account_id); + +-- Create state_changes_data partitions +DO $$ +DECLARE + i INT; +BEGIN + FOR i IN 0..99 LOOP + EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_data_compact_%s PARTITION OF state_changes_data_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + END LOOP; +END $$; + + +CREATE TABLE IF NOT EXISTS state_changes_access_key_compact ( + account_id text NOT NULL, + data_key text NOT NULL, + data_value bytea NOT NULL, + block_height_from numeric(20,0) NOT NULL, + block_height_to numeric(20,0) NULL, + PRIMARY KEY (account_id, data_key, block_height_from) +) PARTITION BY HASH (account_id); + +-- Create state_changes_access_key partitions +DO $$ +DECLARE + i INT; +BEGIN + FOR i IN 0..99 LOOP + EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_access_key_compact_%s PARTITION OF state_changes_access_key_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + END LOOP; +END $$; + + +CREATE TABLE IF NOT EXISTS state_changes_contract_compact ( + account_id text NOT NULL, + data_value bytea NOT NULL, + block_height_from numeric(20,0) NOT NULL, + block_height_to numeric(20,0) NULL, + PRIMARY KEY (account_id, block_height_from) +) PARTITION BY HASH (account_id); + +-- Create state_changes_contract partitions +DO $$ +DECLARE + i INT; +BEGIN + FOR i IN 0..99 LOOP + EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_contract_compact_%s PARTITION OF state_changes_contract_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + END LOOP; +END $$; + +CREATE TABLE IF NOT EXISTS state_changes_account_compact ( + account_id text NOT NULL, + data_value bytea NULL, + block_height_from numeric(20,0) NOT NULL, + block_height_to numeric(20,0) NULL, + PRIMARY KEY (account_id, block_height_from) +) PARTITION BY HASH (account_id); + +-- Create 
state_changes_account partitions +DO $$ +DECLARE + i INT; +BEGIN + FOR i IN 0..99 LOOP + EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_account_compact_%s PARTITION OF state_changes_account_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + END LOOP; +END $$; diff --git a/database/src/postgres/state_indexer.rs b/database/src/postgres/state_indexer.rs index 3f3b11f9..8d4c67fc 100644 --- a/database/src/postgres/state_indexer.rs +++ b/database/src/postgres/state_indexer.rs @@ -1,5 +1,12 @@ +use std::collections::HashMap; +use std::time::Instant; + use bigdecimal::ToPrimitive; -use futures::FutureExt; +use futures::{future::try_join_all, FutureExt}; +use sqlx::Row; + +const PARTITIONS: i32 = 100; +const MAX_CONCURRENT_QUERIES: usize = 16; impl crate::PostgresDBManager { async fn save_chunks_unique( @@ -73,6 +80,38 @@ impl crate::PostgresDBManager { } Ok(()) } + + /// Asks the database for a partition number for every id in `account_ids` and returns + /// them as an `account_id -> partition` map. + /// + /// Fails if the query fails or if a returned row cannot be decoded. + async fn partition_map( + &self, + pool: &sqlx::PgPool, + account_ids: &Vec, + ) -> anyhow::Result> { + let now = std::time::Instant::now(); + // NOTE(review): PostgreSQL routes rows of a HASH-partitioned table with its own + // partition hash, which is presumably not `hashtext(x) % n`, and `mod` of a negative + // hash is negative. Confirm this value really names the partition a row belongs to + // before writing to `..._compact_{partition}` tables directly. + let partition_rows = sqlx::query( + "SELECT account_id, mod(hashtext(account_id), $2)::int AS partition + FROM unnest($1::text[]) AS account_id", + ) + .bind(account_ids) + .bind(PARTITIONS) + .fetch_all(pool) + .await?; + + // Decode fallibly: a row that cannot be decoded becomes an error, not a panic. + let partition_map = partition_rows + .into_iter() + .map(|row| { + let account_id: String = row.try_get("account_id")?; + let partition: i32 = row.try_get("partition")?; + Ok::<_, sqlx::Error>((account_id, partition)) + }) + .collect::<Result<HashMap<String, i32>, sqlx::Error>>()?; + tracing::debug!( + target: "database::postgres::state_indexer", + "Partition map computed in {:?} for {} accounts", + now.elapsed(), + account_ids.len() + ); + Ok(partition_map) + } } #[async_trait::async_trait] impl crate::StateIndexerDbManager for crate::PostgresDBManager { @@ -211,12 +250,11 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { Ok(()) } - async fn save_state_changes_data( + async fn insert_state_changes_data( &self, shard_id:
near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, ) -> anyhow::Result<()> { crate::metrics::SHARD_DATABASE_WRITE_QUERIES .with_label_values(&[ @@ -225,56 +263,242 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { "state_changes_data", ]) .inc(); - let mut query_builder: sqlx::QueryBuilder = sqlx::QueryBuilder::new( - "INSERT INTO state_changes_data (account_id, block_height, block_hash, data_key, data_value) ", - ); - query_builder.push_values(state_changes.iter(), |mut values, state_change| { - match &state_change.value { - near_primitives::views::StateChangeValueView::DataUpdate { + + // Extract relevant data + let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes + .iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::DataUpdate { account_id, key, value, - } => { - let data_key: &[u8] = key.as_ref(); - let data_value: &[u8] = value.as_ref(); - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(hex::encode(data_key).to_string()) - .push_bind(data_value); - } - near_primitives::views::StateChangeValueView::DataDeletion { account_id, key } => { - let data_key: &[u8] = key.as_ref(); - let data_value: Option<&[u8]> = None; - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(hex::encode(data_key).to_string()) - .push_bind(data_value); + } = &change.value + { + let data_key: String = hex::encode(key.as_slice()); + Some(( + account_id.to_string(), + data_key, + value.clone().to_vec(), + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None } - _ => {} + }) + .collect(); + + if inserts.is_empty() { + return Ok(()); + } + + // Get all account_ids to compute partition map + let account_ids: Vec = 
inserts.iter().map(|(id, _, _, _)| id.clone()).collect(); + + let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( + "Database connection for Shard_{} not found", + shard_id + ))?; + + // TODO: Watch for PARTITION_MAP_ELAPSED_TIME metric + // This happens in each method, but we don't have a place to call it once for all operations. + // Right now it seems like a neglectable overhead, but we should consider optimizing this. + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group inserts per partition + let mut inserts_per_partition: HashMap> = HashMap::new(); + for (account_id, data_key, data_value, block_height) in inserts { + if let Some(&partition) = partition_map.get(&account_id) { + inserts_per_partition.entry(partition).or_default().push(( + account_id, + data_key, + data_value, + block_height, + )); + } else { + tracing::warn!("Partition not found for account_id: {}", account_id); } - }); - query_builder.push(" ON CONFLICT DO NOTHING;"); - query_builder - .build() - .execute(self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?) 
- .await?; + } + + // Build and execute inserts in parallel + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in inserts_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let table_name = format!("state_changes_data_compact_{}", partition_id); + + let task = tokio::spawn(async move { + let start = Instant::now(); + let _permit = semaphore.acquire_owned().await.unwrap(); + let mut qb = sqlx::QueryBuilder::new(format!( + "INSERT INTO {} (account_id, data_key, data_value, block_height_from, block_height_to) ", + table_name, + )); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_key, data_value, block_height)| { + row.push_bind(account_id) + .push_bind(data_key) + .push_bind(data_value) + .push_bind(block_height) + .push_bind(None::>); + }, + ); + + qb.push(" ON CONFLICT DO NOTHING"); + + let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + + tracing::debug!( + target: "database::postgres::state_indexer", + "Insert done partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + rows.len() + ); + + result + }); + + tasks.push(task); + } + + // `?` on the join only reports a task that panicked or was cancelled; every task also + // returns the result of its own query, so check those instead of dropping them. + for task_result in try_join_all(tasks).await? { + task_result?; + } Ok(()) } - async fn save_state_changes_access_key( + async fn update_state_changes_data( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + crate::metrics::SHARD_DATABASE_WRITE_QUERIES + .with_label_values(&[ + &shard_id.to_string(), + "save_state_changes_data", + "state_changes_data", + ]) + .inc(); + + let updates: Vec<(String, String, bigdecimal::BigDecimal)> = + state_changes + .iter() + .filter_map(|change| match &change.value { + near_primitives::views::StateChangeValueView::DataUpdate { + account_id, + key, + ..
+ } + | near_primitives::views::StateChangeValueView::DataDeletion { + account_id, + key, + } => { + let data_key: &[u8] = key.as_ref(); + let data_key = hex::encode(data_key).to_string(); + Some(( + account_id.to_string(), + data_key, + bigdecimal::BigDecimal::from(block_height), + )) + } + _ => None, + }) + .collect(); + + let account_ids: Vec = updates + .iter() + .map(|(account_id, _, _)| account_id.clone()) + .collect(); + + let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( + "Database connection for Shard_{} not found", + shard_id + ))?; + + let partition_map = self.partition_map(&pool, &account_ids).await?; + + let mut updates_per_partition: HashMap> = + HashMap::new(); + + for (account_id, data_key, block_height) in updates { + if let Some(&partition) = partition_map.get(&account_id) { + updates_per_partition.entry(partition).or_default().push(( + account_id, + data_key, + block_height, + )); + } else { + tracing::warn!("Partition not found for account_id: {}", account_id); + } + } + + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in updates_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let table_name = format!("state_changes_data_compact_{}", partition_id); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); + let mut qb = sqlx::QueryBuilder::new( + "WITH new_data (account_id, data_key, block_height) AS (", + ); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_key, block_height)| { + row.push_bind(account_id) + .push_bind(data_key) + .push_bind(block_height); + }, + ); + + qb.push(format!( + ") UPDATE {} AS old \ + SET block_height_to = new_data.block_height \ + FROM new_data \ + WHERE old.account_id = new_data.account_id \ + AND old.data_key = new_data.data_key \ + AND old.block_height_from < 
new_data.block_height \ + AND old.block_height_to IS NULL;", + table_name, + )); + + let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update done partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + rows.len() + ); + + result + }); + + tasks.push(task); + } + + // Wait for all partition updates to complete. `?` on the join only reports a task that + // panicked or was cancelled; every task also returns the result of its own query, so + // check those instead of dropping them. + for task_result in try_join_all(tasks).await? { + task_result?; + } + Ok(()) + } + + async fn insert_state_changes_access_key( &self, shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + crate::metrics::SHARD_DATABASE_WRITE_QUERIES .with_label_values(&[ &shard_id.to_string(), @@ -282,62 +506,244 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { "state_changes_access_key", ]) .inc(); - let mut query_builder: sqlx::QueryBuilder = sqlx::QueryBuilder::new( - "INSERT INTO state_changes_access_key (account_id, block_height, block_hash, data_key, data_value) ", - ); - query_builder.push_values(state_changes.iter(), |mut values, state_change| { - match &state_change.value { - near_primitives::views::StateChangeValueView::AccessKeyUpdate { + + // Extract relevant updates + let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes + .iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::AccessKeyUpdate { account_id, public_key, access_key, - } => { - let data_key = - borsh::to_vec(public_key).expect("Failed to borsh serialize public key"); + } = &change.value + { + let data_key = hex::encode( + borsh::to_vec(public_key).expect("Failed to borsh serialize public key"), + ); let data_value = borsh::to_vec(access_key).expect("Failed to borsh serialize access key"); - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) -
.push_bind(block_hash.to_string()) - .push_bind(hex::encode(data_key).to_string()) - .push_bind(data_value); + Some(( + account_id.to_string(), + data_key, + data_value, + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None } - near_primitives::views::StateChangeValueView::AccessKeyDeletion { + }) + .collect(); + + if inserts.is_empty() { + return Ok(()); + } + + // Get all account_ids for partition mapping + let account_ids: Vec = inserts.iter().map(|(id, _, _, _)| id.clone()).collect(); + + let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( + "Database connection for Shard_{} not found", + shard_id + ))?; + + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group inserts per partition + let mut inserts_per_partition: HashMap> = HashMap::new(); + for (account_id, data_key, data_value, block_height) in inserts { + if let Some(&partition) = partition_map.get(&account_id) { + inserts_per_partition.entry(partition).or_default().push(( + account_id, + data_key, + data_value, + block_height, + )); + } + } + + // Insert in parallel per partition + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + for (partition_id, rows) in inserts_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let table_name = format!("state_changes_access_key_compact_{}", partition_id); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); + + let mut qb = sqlx::QueryBuilder::new(format!( + "INSERT INTO {} (account_id, data_key, data_value, block_height_from, block_height_to) ", + table_name + )); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_key, data_value, block_height)| { + row.push_bind(account_id) + .push_bind(data_key) + .push_bind(data_value) + .push_bind(block_height) + .push_bind(None::>); + }, + ); + + qb.push(" ON CONFLICT DO NOTHING"); + 
+ let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + + tracing::debug!( + target: "database::postgres::state_indexer", + "Insert done partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + rows.len() + ); + + result + }); + + tasks.push(task); + } + + // `?` on the join only reports a task that panicked or was cancelled; every task also + // returns the result of its own query, so check those instead of dropping them. + for task_result in try_join_all(tasks).await? { + task_result?; + } + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total insert_state_changes_access_key duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + async fn update_state_changes_access_key( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + + crate::metrics::SHARD_DATABASE_WRITE_QUERIES + .with_label_values(&[ + &shard_id.to_string(), + "save_state_changes_access_key", + "state_changes_access_key", + ]) + .inc(); + + // Collect updates: (account_id, data_key) + let updates: Vec<(String, String)> = state_changes + .iter() + .filter_map(|c| match &c.value { + near_primitives::views::StateChangeValueView::AccessKeyUpdate { account_id, public_key, - } => { - let data_key = - borsh::to_vec(public_key).expect("Failed to borsh serialize public key"); - let data_value: Option<&[u8]> = None; - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(data_value); + ..
} - _ => {} + | near_primitives::views::StateChangeValueView::AccessKeyDeletion { + account_id, + public_key, + } => Some(( + account_id.to_string(), + // Must be the same encoding insert_state_changes_access_key stores in data_key + // (hex of the borsh-serialized key). hex(key_data()) is a different string, so + // the UPDATE below could never match the rows it is supposed to close. + hex::encode( + borsh::to_vec(public_key).expect("Failed to borsh serialize public key"), + ), + )), + _ => None, + }) + .collect(); + + if updates.is_empty() { + return Ok(()); + } + + // Compute partitions + let account_ids: Vec = updates.iter().map(|(id, _)| id.clone()).collect(); + let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( + "Database connection for Shard_{} not found", + shard_id + ))?; + + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group updates per partition + let mut updates_per_partition: HashMap> = HashMap::new(); + for (account_id, data_key) in updates { + if let Some(&partition) = partition_map.get(&account_id) { + updates_per_partition + .entry(partition) + .or_default() + .push((account_id, data_key)); } - }); - query_builder.push(" ON CONFLICT DO NOTHING;"); - query_builder - .build() - .execute(self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?)
- .await?; + } + + // Parallel update execution per partition + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + for (partition_id, rows) in updates_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let block_height_bd = bigdecimal::BigDecimal::from(block_height); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); + let (account_ids, data_keys): (Vec<_>, Vec<_>) = rows.into_iter().unzip(); + + // Use UNNEST for batch update. Only rows opened before this block are closed, so + // re-processing the same block does not close the row that block itself inserted. + let query = format!( + r#" + UPDATE state_changes_access_key_compact_{partition_id} AS t + SET block_height_to = $3 + FROM ( + SELECT unnest($1::text[]) AS account_id, unnest($2::text[]) AS data_key + ) AS u + WHERE t.account_id = u.account_id + AND t.data_key = u.data_key + AND t.block_height_from < $3 + AND t.block_height_to IS NULL; + "#, + partition_id = partition_id + ); + + sqlx::query(&query) + .bind(&account_ids) + .bind(&data_keys) + .bind(&block_height_bd) + .execute(&pool) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update done partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + account_ids.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + // `?` on the join only reports a task that panicked or was cancelled; every task also + // returns the result of its own query, so check those instead of dropping them. + for task_result in try_join_all(tasks).await? { + task_result?; + } + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total update_state_changes_access_key duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + Ok(()) } - async fn save_state_changes_contract( + async fn insert_state_changes_contract( &self, shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, ) -> anyhow::Result<()> { + let overall_start = Instant::now(); crate::metrics::SHARD_DATABASE_WRITE_QUERIES .with_label_values(&[ &shard_id.to_string(), @@ -345,53 +751,221 @@ impl crate::StateIndexerDbManager for
crate::PostgresDBManager { "state_changes_contract", ]) .inc(); - let mut query_builder: sqlx::QueryBuilder = sqlx::QueryBuilder::new( - "INSERT INTO state_changes_contract (account_id, block_height, block_hash, data_value) ", - ); - query_builder.push_values(state_changes.iter(), |mut values, state_change| { - match &state_change.value { - near_primitives::views::StateChangeValueView::ContractCodeUpdate { + + // Extract only ContractCodeUpdate + let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes + .into_iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::ContractCodeUpdate { account_id, code, - } => { - let data_value: &[u8] = code.as_ref(); - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(data_value); + } = change.value + { + Some(( + account_id.to_string(), + code.to_vec(), + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None } - near_primitives::views::StateChangeValueView::ContractCodeDeletion { + }) + .collect(); + + if inserts.is_empty() { + return Ok(()); + } + + // Compute partitions + let account_ids: Vec = inserts.iter().map(|(id, _, _)| id.clone()).collect(); + let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( + "Database connection for Shard_{} not found", + shard_id + ))?; + + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group rows by partition + let mut inserts_per_partition: HashMap> = HashMap::new(); + for (account_id, data_value, block_height) in inserts { + if let Some(&partition) = partition_map.get(&account_id) { + inserts_per_partition.entry(partition).or_default().push(( account_id, - } => { - let data_value: Option<&[u8]> = None; - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(data_value); + data_value, + block_height, + 
)); + } + } + + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in inserts_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire().await.unwrap(); + let start = Instant::now(); + + let table_name = format!("state_changes_contract_compact_{}", partition_id); + let mut qb = sqlx::QueryBuilder::new(format!( + "INSERT INTO {} (account_id, data_value, block_height_from, block_height_to) ", + table_name + )); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_value, block_height)| { + row.push_bind(account_id) + .push_bind(data_value) + .push_bind(block_height) + .push_bind(None::>); + }, + ); + + qb.push(" ON CONFLICT DO NOTHING"); + qb.build().execute(&pool).await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Insert contract partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + rows.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + // `?` on the join only reports a task that panicked or was cancelled; every task also + // returns the result of its own query, so check those instead of dropping them. + for task_result in try_join_all(tasks).await? { + task_result?; + } + tracing::debug!( + target: "database::postgres::state_indexer", + "Total insert_state_changes_contract duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + /// Update contract state changes with partitions under semaphore + async fn update_state_changes_contract( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + crate::metrics::SHARD_DATABASE_WRITE_QUERIES + .with_label_values(&[ + &shard_id.to_string(), + "save_state_changes_contract", + "state_changes_contract", + ]) + .inc(); + + // Collect account_ids for updates + let accounts: Vec = state_changes + .into_iter() + .filter_map(|change| match change.value { +
near_primitives::views::StateChangeValueView::ContractCodeUpdate { + account_id, + .. } - _ => {} + | near_primitives::views::StateChangeValueView::ContractCodeDeletion { + account_id, + } => Some(account_id.to_string()), + _ => None, + }) + .collect(); + + if accounts.is_empty() { + return Ok(()); + } + + let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( + "Database connection for Shard_{} not found", + shard_id + ))?; + + let partition_map = self.partition_map(&pool, &accounts).await?; + + let mut updates_per_partition: HashMap> = HashMap::new(); + for account_id in accounts { + if let Some(&partition) = partition_map.get(&account_id) { + updates_per_partition + .entry(partition) + .or_default() + .push(account_id); + } else { + tracing::warn!("Partition not found for account_id: {}", account_id); } - }); - query_builder.push(" ON CONFLICT DO NOTHING;"); - query_builder - .build() - .execute(self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?) 
- .await?; + } + + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, account_ids) in updates_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let block_height_bd = bigdecimal::BigDecimal::from(block_height); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire().await.unwrap(); + let start = Instant::now(); + + // Only rows opened before this block are closed, so re-processing the same block + // does not close the row that block itself inserted. + let query = format!( + r#" + UPDATE state_changes_contract_compact_{partition_id} AS t + SET block_height_to = $2 + FROM (SELECT unnest($1::text[]) AS account_id) AS u + WHERE t.account_id = u.account_id + AND t.block_height_from < $2 + AND t.block_height_to IS NULL; + "# + ); + + sqlx::query(&query) + .bind(&account_ids) + .bind(&block_height_bd) + .execute(&pool) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update contract partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + account_ids.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + // `?` on the join only reports a task that panicked or was cancelled; every task also + // returns the result of its own query, so check those instead of dropping them. + for task_result in try_join_all(tasks).await? { + task_result?; + } + tracing::debug!( + target: "database::postgres::state_indexer", + "Total update_state_changes_contract duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + Ok(()) } - async fn save_state_changes_account( + async fn insert_state_changes_account( &self, shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, - block_hash: near_primitives::hash::CryptoHash, ) -> anyhow::Result<()> { + let overall_start = Instant::now(); crate::metrics::SHARD_DATABASE_WRITE_QUERIES .with_label_values(&[ &shard_id.to_string(), @@ -399,43 +973,207 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { "state_changes_account", ]) .inc(); - let mut query_builder: sqlx::QueryBuilder = sqlx::QueryBuilder::new( - "INSERT INTO state_changes_account (account_id, block_height, block_hash, data_value) ", - ); - query_builder.push_values(state_changes.iter(), |mut
values, state_change| { - match &state_change.value { - near_primitives::views::StateChangeValueView::AccountUpdate { + + // Extract account updates + let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes + .into_iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::AccountUpdate { account_id, account, - } => { + } = change.value + { let data_value = - borsh::to_vec(&near_primitives::account::Account::from(account)) + borsh::to_vec(&near_primitives::account::Account::from(&account)) .expect("Failed to borsh serialize account"); - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(data_value); + Some(( + account_id.to_string(), + data_value, + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None } - near_primitives::views::StateChangeValueView::AccountDeletion { account_id } => { - let data_value: Option<&[u8]> = None; - values - .push_bind(account_id.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(block_hash.to_string()) - .push_bind(data_value); + }) + .collect(); + + if inserts.is_empty() { + return Ok(()); + } + + // Compute partitions + let account_ids: Vec = inserts.iter().map(|(id, _, _)| id.clone()).collect(); + let pool = self.shards_pool.get(&shard_id).unwrap(); + + let partition_map = self.partition_map(&pool, &account_ids).await?; + + let mut inserts_per_partition: HashMap> = HashMap::new(); + for (account_id, data_value, block_height) in inserts { + if let Some(&partition) = partition_map.get(&account_id) { + inserts_per_partition.entry(partition).or_default().push(( + account_id, + data_value, + block_height, + )); + } + } + + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in inserts_per_partition { + let pool = pool.clone(); + let semaphore = 
semaphore.clone(); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire().await.unwrap(); + let start = Instant::now(); + + let table_name = format!("state_changes_account_compact_{}", partition_id); + let mut qb = sqlx::QueryBuilder::new(format!( + "INSERT INTO {} (account_id, data_value, block_height_from, block_height_to) ", + table_name + )); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_value, block_height)| { + row.push_bind(account_id) + .push_bind(data_value) + .push_bind(block_height) + .push_bind(None::>); + }, + ); + + qb.push(" ON CONFLICT DO NOTHING"); + qb.build().execute(&pool).await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Insert account partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + rows.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + try_join_all(tasks).await?; + tracing::debug!( + target: "database::postgres::state_indexer", + "Total insert_state_changes_account duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + /// Update Account state changes using partitions + concurrency limit + async fn update_state_changes_account( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + crate::metrics::SHARD_DATABASE_WRITE_QUERIES + .with_label_values(&[ + &shard_id.to_string(), + "save_state_changes_account", + "state_changes_account", + ]) + .inc(); + + // Collect accounts for update + let accounts: Vec = state_changes + .into_iter() + .filter_map(|change| match change.value { + near_primitives::views::StateChangeValueView::AccountUpdate { + account_id, .. 
+ } + | near_primitives::views::StateChangeValueView::AccountDeletion { account_id } => { + Some(account_id.to_string()) } - _ => {} + _ => None, + }) + .collect(); + + if accounts.is_empty() { + return Ok(()); + } + + // Compute partitions + let pool = self.shards_pool.get(&shard_id).unwrap(); + + let partition_map = self.partition_map(&pool, &accounts).await?; + + let mut updates_per_partition: HashMap> = HashMap::new(); + for account_id in accounts { + if let Some(&partition) = partition_map.get(&account_id) { + updates_per_partition + .entry(partition) + .or_default() + .push(account_id); + } else { + tracing::warn!("Partition not found for account_id: {}", account_id); } - }); - query_builder.push(" ON CONFLICT DO NOTHING;"); - query_builder - .build() - .execute(self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?) - .await?; + } + + let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, account_ids) in updates_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let block_height_bd = bigdecimal::BigDecimal::from(block_height); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire().await.unwrap(); + let start = Instant::now(); + + let query = format!( + r#" + UPDATE state_changes_account_compact_{partition_id} AS t + SET block_height_to = $2 + FROM (SELECT unnest($1::text[]) AS account_id) AS u + WHERE t.account_id = u.account_id + AND t.block_height_to IS NULL; + "# + ); + + sqlx::query(&query) + .bind(&account_ids) + .bind(&block_height_bd) + .execute(&pool) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update account partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + account_ids.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + try_join_all(tasks).await?; + 
tracing::debug!( + target: "database::postgres::state_indexer", + "Total update_state_changes_account duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + Ok(()) } } diff --git a/logic-state-indexer/src/lib.rs b/logic-state-indexer/src/lib.rs index e41c3a04..2e630e06 100644 --- a/logic-state-indexer/src/lib.rs +++ b/logic-state-indexer/src/lib.rs @@ -50,7 +50,6 @@ impl StateChangesToStore { &self, db_manager: &(impl database::StateIndexerDbManager + Sync + Send + 'static), block_height: u64, - block_hash: CryptoHash, ) -> anyhow::Result<()> { if !self.data.is_empty() { let futures = self.data.iter().map(|(shard_id, state_changes)| { @@ -58,7 +57,6 @@ impl StateChangesToStore { *shard_id, state_changes.values().cloned().collect(), block_height, - block_hash, ) }); futures::future::join_all(futures) @@ -75,7 +73,6 @@ impl StateChangesToStore { &self, db_manager: &(impl database::StateIndexerDbManager + Sync + Send + 'static), block_height: u64, - block_hash: CryptoHash, ) -> anyhow::Result<()> { if !self.access_key.is_empty() { let futures = self.access_key.iter().map(|(shard_id, state_changes)| { @@ -83,7 +80,6 @@ impl StateChangesToStore { *shard_id, state_changes.values().cloned().collect(), block_height, - block_hash, ) }); futures::future::join_all(futures) @@ -100,7 +96,6 @@ impl StateChangesToStore { &self, db_manager: &(impl database::StateIndexerDbManager + Sync + Send + 'static), block_height: u64, - block_hash: CryptoHash, ) -> anyhow::Result<()> { if !self.contract.is_empty() { let futures = self.contract.iter().map(|(shard_id, state_changes)| { @@ -108,7 +103,6 @@ impl StateChangesToStore { *shard_id, state_changes.values().cloned().collect(), block_height, - block_hash, ) }); futures::future::join_all(futures) @@ -125,7 +119,6 @@ impl StateChangesToStore { &self, db_manager: &(impl database::StateIndexerDbManager + Sync + Send + 'static), block_height: u64, - block_hash: CryptoHash, ) -> anyhow::Result<()> { if 
!self.account.is_empty() { let futures = self.account.iter().map(|(shard_id, state_changes)| { @@ -133,7 +126,6 @@ impl StateChangesToStore { *shard_id, state_changes.values().cloned().collect(), block_height, - block_hash, ) }); futures::future::join_all(futures) @@ -148,12 +140,11 @@ impl StateChangesToStore { &self, db_manager: &(impl database::StateIndexerDbManager + Sync + Send + 'static), block_height: u64, - block_hash: CryptoHash, ) -> anyhow::Result<()> { - let save_data_future = self.save_data(db_manager, block_height, block_hash); - let save_access_key_future = self.save_access_key(db_manager, block_height, block_hash); - let save_contract_future = self.save_contract(db_manager, block_height, block_hash); - let save_account_future = self.save_account(db_manager, block_height, block_hash); + let save_data_future = self.save_data(db_manager, block_height); + let save_access_key_future = self.save_access_key(db_manager, block_height); + let save_contract_future = self.save_contract(db_manager, block_height); + let save_account_future = self.save_account(db_manager, block_height); futures::future::join_all([ save_data_future.boxed(), @@ -432,6 +423,6 @@ async fn handle_state_changes( } state_changes_to_store - .save_state_changes(db_manager, block_height, block_hash) + .save_state_changes(db_manager, block_height) .await } From b0be71c031994a6fd9a10cb6600beaee6f5b45ff Mon Sep 17 00:00:00 2001 From: Bohdan Khorolets Date: Fri, 25 Jul 2025 12:30:13 +0300 Subject: [PATCH 02/13] chore(database, logic-state-indexer): Split database/postgres/state_indexer into files. 
Remove redundant block_hash from handle_state_changes method in logic-state-indexer --- database/src/postgres/state_indexer.rs | 1179 ----------------- .../src/postgres/state_indexer/helpers.rs | 538 ++++++++ database/src/postgres/state_indexer/mod.rs | 714 ++++++++++ logic-state-indexer/src/lib.rs | 2 - 4 files changed, 1252 insertions(+), 1181 deletions(-) delete mode 100644 database/src/postgres/state_indexer.rs create mode 100644 database/src/postgres/state_indexer/helpers.rs create mode 100644 database/src/postgres/state_indexer/mod.rs diff --git a/database/src/postgres/state_indexer.rs b/database/src/postgres/state_indexer.rs deleted file mode 100644 index 8d4c67fc..00000000 --- a/database/src/postgres/state_indexer.rs +++ /dev/null @@ -1,1179 +0,0 @@ -use std::collections::HashMap; -use std::time::Instant; - -use bigdecimal::ToPrimitive; -use futures::{future::try_join_all, FutureExt}; -use sqlx::Row; - -const PARTITIONS: i32 = 100; -const MAX_CONCURRENT_QUERIES: usize = 16; - -impl crate::PostgresDBManager { - async fn save_chunks_unique( - &self, - block_height: u64, - chunks: Vec<( - crate::primitives::ChunkHash, - crate::primitives::ShardId, - crate::primitives::HeightIncluded, - )>, - ) -> anyhow::Result<()> { - let unique_chunks = chunks - .iter() - .filter(|(_chunk_hash, _shard_id, height_included)| height_included == &block_height) - .collect::>(); - - if !unique_chunks.is_empty() { - crate::metrics::META_DATABASE_WRITE_QUERIES - .with_label_values(&["save_chunks", "chunks"]) - .inc(); - let mut query_builder: sqlx::QueryBuilder = - sqlx::QueryBuilder::new("INSERT INTO chunks (chunk_hash, block_height, shard_id) "); - - query_builder.push_values( - unique_chunks.iter(), - |mut values, (chunk_hash, shard_id, height_included)| { - values - .push_bind(chunk_hash.to_string()) - .push_bind(bigdecimal::BigDecimal::from(*height_included)) - .push_bind(bigdecimal::BigDecimal::from(*shard_id)); - }, - ); - query_builder.push(" ON CONFLICT DO NOTHING;"); - 
query_builder.build().execute(&self.meta_db_pool).await?; - } - Ok(()) - } - - async fn save_chunks_duplicate( - &self, - block_height: u64, - chunks: Vec<( - crate::primitives::ChunkHash, - crate::primitives::ShardId, - crate::primitives::HeightIncluded, - )>, - ) -> anyhow::Result<()> { - let chunks_duplicate = chunks - .iter() - .filter(|(_chunk_hash, _shard_id, height_included)| height_included != &block_height) - .collect::>(); - if !chunks_duplicate.is_empty() { - crate::metrics::META_DATABASE_WRITE_QUERIES - .with_label_values(&["save_chunks", "chunks_duplicate"]) - .inc(); - let mut query_builder: sqlx::QueryBuilder = - sqlx::QueryBuilder::new("INSERT INTO chunks_duplicate (chunk_hash, block_height, shard_id, included_in_block_height) "); - - query_builder.push_values( - chunks.iter(), - |mut values, (chunk_hash, shard_id, height_included)| { - values - .push_bind(chunk_hash.to_string()) - .push_bind(bigdecimal::BigDecimal::from(block_height)) - .push_bind(bigdecimal::BigDecimal::from(*shard_id)) - .push_bind(bigdecimal::BigDecimal::from(*height_included)); - }, - ); - query_builder.push(" ON CONFLICT DO NOTHING;"); - query_builder.build().execute(&self.meta_db_pool).await?; - } - Ok(()) - } - - async fn partition_map( - &self, - pool: &sqlx::PgPool, - account_ids: &Vec, - ) -> anyhow::Result> { - let now = std::time::Instant::now(); - let partition_rows = sqlx::query( - "SELECT account_id, mod(hashtext(account_id), $2)::int AS partition - FROM unnest($1::text[]) AS account_id", - ) - .bind(account_ids) - .bind(PARTITIONS) - .fetch_all(pool) - .await?; - - let partition_map: HashMap = partition_rows - .into_iter() - .map(|row| { - let account_id: String = row.try_get("account_id").unwrap(); - let partition: i32 = row.try_get("partition").unwrap(); - (account_id, partition) - }) - .collect(); - tracing::debug!( - target: "database::postgres::state_indexer", - "Partition map computed in {:?} for {} accounts", - now.elapsed(), - account_ids.len() - ); - 
Ok(partition_map) - } -} -#[async_trait::async_trait] -impl crate::StateIndexerDbManager for crate::PostgresDBManager { - async fn save_block( - &self, - block_height: u64, - block_hash: near_primitives::hash::CryptoHash, - ) -> anyhow::Result<()> { - crate::metrics::META_DATABASE_WRITE_QUERIES - .with_label_values(&["save_block", "blocks"]) - .inc(); - sqlx::query( - " - INSERT INTO blocks (block_height, block_hash) - VALUES ($1, $2) ON CONFLICT DO NOTHING; - ", - ) - .bind(bigdecimal::BigDecimal::from(block_height)) - .bind(block_hash.to_string()) - .execute(&self.meta_db_pool) - .await?; - Ok(()) - } - - async fn save_chunks( - &self, - block_height: u64, - chunks: Vec<( - crate::primitives::ChunkHash, - crate::primitives::ShardId, - crate::primitives::HeightIncluded, - )>, - ) -> anyhow::Result<()> { - let save_chunks_unique_future = self.save_chunks_unique(block_height, chunks.clone()); - let save_chunks_duplicate_future = self.save_chunks_duplicate(block_height, chunks); - - futures::future::join_all([ - save_chunks_unique_future.boxed(), - save_chunks_duplicate_future.boxed(), - ]) - .await - .into_iter() - .collect::>() - } - - async fn get_block_height_by_hash( - &self, - block_hash: near_primitives::hash::CryptoHash, - method_name: &str, - ) -> anyhow::Result { - crate::metrics::META_DATABASE_READ_QUERIES - .with_label_values(&[method_name, "blocks"]) - .inc(); - let (block_height,): (bigdecimal::BigDecimal,) = sqlx::query_as( - " - SELECT block_height - FROM blocks - WHERE block_hash = $1 - LIMIT 1; - ", - ) - .bind(block_hash.to_string()) - .fetch_one(&self.meta_db_pool) - .await?; - block_height - .to_u64() - .ok_or_else(|| anyhow::anyhow!("Failed to parse `block_height` to u64")) - } - - async fn update_meta(&self, indexer_id: &str, block_height: u64) -> anyhow::Result<()> { - crate::metrics::META_DATABASE_WRITE_QUERIES - .with_label_values(&["update_meta", "meta"]) - .inc(); - sqlx::query( - " - INSERT INTO meta (indexer_id, 
last_processed_block_height) - VALUES ($1, $2) - ON CONFLICT (indexer_id) - DO UPDATE SET last_processed_block_height = $2; - ", - ) - .bind(indexer_id) - .bind(bigdecimal::BigDecimal::from(block_height)) - .execute(&self.meta_db_pool) - .await?; - Ok(()) - } - - async fn get_last_processed_block_height(&self, indexer_id: &str) -> anyhow::Result { - crate::metrics::META_DATABASE_READ_QUERIES - .with_label_values(&["get_last_processed_block_height", "meta"]) - .inc(); - let (last_processed_block_height,): (bigdecimal::BigDecimal,) = sqlx::query_as( - " - SELECT last_processed_block_height - FROM meta - WHERE indexer_id = $1 - LIMIT 1; - ", - ) - .bind(indexer_id) - .fetch_one(&self.meta_db_pool) - .await?; - last_processed_block_height - .to_u64() - .ok_or_else(|| anyhow::anyhow!("Failed to parse `last_processed_block_height` to u64")) - } - - async fn save_validators( - &self, - epoch_id: near_primitives::hash::CryptoHash, - epoch_height: u64, - epoch_start_height: u64, - validators_info: &near_primitives::views::EpochValidatorInfo, - epoch_end_block_hash: near_primitives::hash::CryptoHash, - ) -> anyhow::Result<()> { - crate::metrics::META_DATABASE_WRITE_QUERIES - .with_label_values(&["add_validators", "validators"]) - .inc(); - let epoch_end_block_height = self - .get_block_height_by_hash(epoch_end_block_hash, "add_validators") - .await?; - sqlx::query( - " - INSERT INTO validators (epoch_id, epoch_height, epoch_start_height, epoch_end_height, validators_info) - VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING; - " - ) - .bind(epoch_id.to_string()) - .bind(bigdecimal::BigDecimal::from(epoch_height)) - .bind(bigdecimal::BigDecimal::from(epoch_start_height)) - .bind(bigdecimal::BigDecimal::from(epoch_end_block_height)) - .bind(&serde_json::to_value(validators_info)?) 
- .execute(&self.meta_db_pool) - .await?; - Ok(()) - } - - async fn insert_state_changes_data( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_data", - "state_changes_data", - ]) - .inc(); - - // Extract relevant data - let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes - .iter() - .filter_map(|change| { - if let near_primitives::views::StateChangeValueView::DataUpdate { - account_id, - key, - value, - } = &change.value - { - let data_key: String = hex::encode(key.as_slice()); - Some(( - account_id.to_string(), - data_key, - value.clone().to_vec(), - bigdecimal::BigDecimal::from(block_height), - )) - } else { - None - } - }) - .collect(); - - if inserts.is_empty() { - return Ok(()); - } - - // Get all account_ids to compute partition map - let account_ids: Vec = inserts.iter().map(|(id, _, _, _)| id.clone()).collect(); - - let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?; - - // TODO: Watch for PARTITION_MAP_ELAPSED_TIME metric - // This happens in each method, but we don't have a place to call it once for all operations. - // Right now it seems like a neglectable overhead, but we should consider optimizing this. 
- let partition_map = self.partition_map(&pool, &account_ids).await?; - - // Group inserts per partition - let mut inserts_per_partition: HashMap> = HashMap::new(); - for (account_id, data_key, data_value, block_height) in inserts { - if let Some(&partition) = partition_map.get(&account_id) { - inserts_per_partition.entry(partition).or_default().push(( - account_id, - data_key, - data_value, - block_height, - )); - } else { - tracing::warn!("Partition not found for account_id: {}", account_id); - } - } - - // Build and execute inserts in parallel - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - - for (partition_id, rows) in inserts_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let table_name = format!("state_changes_data_compact_{}", partition_id); - - let task = tokio::spawn(async move { - let start = Instant::now(); - let _permit = semaphore.acquire_owned().await.unwrap(); - let mut qb = sqlx::QueryBuilder::new(format!( - "INSERT INTO {} (account_id, data_key, data_value, block_height_from, block_height_to) ", - table_name, - )); - - qb.push_values( - rows.iter(), - |mut row, (account_id, data_key, data_value, block_height)| { - row.push_bind(account_id) - .push_bind(data_key) - .push_bind(data_value) - .push_bind(block_height) - .push_bind(None::>); - }, - ); - - qb.push(" ON CONFLICT DO NOTHING"); - - let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); - - tracing::debug!( - target: "database::postgres::state_indexer", - "Insert done partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - rows.len() - ); - - result - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - Ok(()) - } - - async fn update_state_changes_data( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - 
.with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_data", - "state_changes_data", - ]) - .inc(); - - let updates: Vec<(String, String, bigdecimal::BigDecimal)> = - state_changes - .iter() - .filter_map(|change| match &change.value { - near_primitives::views::StateChangeValueView::DataUpdate { - account_id, - key, - .. - } - | near_primitives::views::StateChangeValueView::DataDeletion { - account_id, - key, - } => { - let data_key: &[u8] = key.as_ref(); - let data_key = hex::encode(data_key).to_string(); - Some(( - account_id.to_string(), - data_key, - bigdecimal::BigDecimal::from(block_height), - )) - } - _ => None, - }) - .collect(); - - let account_ids: Vec = updates - .iter() - .map(|(account_id, _, _)| account_id.clone()) - .collect(); - - let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?; - - let partition_map = self.partition_map(&pool, &account_ids).await?; - - let mut updates_per_partition: HashMap> = - HashMap::new(); - - for (account_id, data_key, block_height) in updates { - if let Some(&partition) = partition_map.get(&account_id) { - updates_per_partition.entry(partition).or_default().push(( - account_id, - data_key, - block_height, - )); - } else { - tracing::warn!("Partition not found for account_id: {}", account_id); - } - } - - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - - for (partition_id, rows) in updates_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let table_name = format!("state_changes_data_compact_{}", partition_id); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire_owned().await.unwrap(); - let start = Instant::now(); - let mut qb = sqlx::QueryBuilder::new( - "WITH new_data (account_id, data_key, block_height) AS (", - ); - - qb.push_values( - rows.iter(), - |mut row, (account_id, data_key, block_height)| { 
- row.push_bind(account_id) - .push_bind(data_key) - .push_bind(block_height); - }, - ); - - qb.push(format!( - ") UPDATE {} AS old \ - SET block_height_to = new_data.block_height \ - FROM new_data \ - WHERE old.account_id = new_data.account_id \ - AND old.data_key = new_data.data_key \ - AND old.block_height_from < new_data.block_height \ - AND old.block_height_to IS NULL;", - table_name, - )); - - let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); - - tracing::debug!( - target: "database::postgres::state_indexer", - "Update done partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - rows.len() - ); - - result - }); - - tasks.push(task); - } - - // Wait for all partition updates to complete - try_join_all(tasks).await?; - Ok(()) - } - - async fn insert_state_changes_access_key( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - let overall_start = Instant::now(); - - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_access_key", - "state_changes_access_key", - ]) - .inc(); - - // Extract relevant updates - let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes - .iter() - .filter_map(|change| { - if let near_primitives::views::StateChangeValueView::AccessKeyUpdate { - account_id, - public_key, - access_key, - } = &change.value - { - let data_key = hex::encode( - borsh::to_vec(public_key).expect("Failed to borsh serialize public key"), - ); - let data_value = - borsh::to_vec(access_key).expect("Failed to borsh serialize access key"); - Some(( - account_id.to_string(), - data_key, - data_value, - bigdecimal::BigDecimal::from(block_height), - )) - } else { - None - } - }) - .collect(); - - if inserts.is_empty() { - return Ok(()); - } - - // Get all account_ids for partition mapping - let account_ids: Vec = inserts.iter().map(|(id, _, _, _)| 
id.clone()).collect(); - - let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?; - - let partition_map = self.partition_map(&pool, &account_ids).await?; - - // Group inserts per partition - let mut inserts_per_partition: HashMap> = HashMap::new(); - for (account_id, data_key, data_value, block_height) in inserts { - if let Some(&partition) = partition_map.get(&account_id) { - inserts_per_partition.entry(partition).or_default().push(( - account_id, - data_key, - data_value, - block_height, - )); - } - } - - // Insert in parallel per partition - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - for (partition_id, rows) in inserts_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let table_name = format!("state_changes_access_key_compact_{}", partition_id); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire_owned().await.unwrap(); - let start = Instant::now(); - - let mut qb = sqlx::QueryBuilder::new(format!( - "INSERT INTO {} (account_id, data_key, data_value, block_height_from, block_height_to) ", - table_name - )); - - qb.push_values( - rows.iter(), - |mut row, (account_id, data_key, data_value, block_height)| { - row.push_bind(account_id) - .push_bind(data_key) - .push_bind(data_value) - .push_bind(block_height) - .push_bind(None::>); - }, - ); - - qb.push(" ON CONFLICT DO NOTHING"); - - let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); - - tracing::debug!( - target: "database::postgres::state_indexer", - "Insert done partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - rows.len() - ); - - result - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Total insert_state_changes_access_key duration shard={} elapsed={:?}", - shard_id, - 
overall_start.elapsed() - ); - - Ok(()) - } - - async fn update_state_changes_access_key( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - let overall_start = Instant::now(); - - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_access_key", - "state_changes_access_key", - ]) - .inc(); - - // Collect updates: (account_id, data_key) - let updates: Vec<(String, String)> = state_changes - .iter() - .filter_map(|c| match &c.value { - near_primitives::views::StateChangeValueView::AccessKeyUpdate { - account_id, - public_key, - .. - } - | near_primitives::views::StateChangeValueView::AccessKeyDeletion { - account_id, - public_key, - } => Some((account_id.to_string(), hex::encode(public_key.key_data()))), - _ => None, - }) - .collect(); - - if updates.is_empty() { - return Ok(()); - } - - // Compute partitions - let account_ids: Vec = updates.iter().map(|(id, _)| id.clone()).collect(); - let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?; - - let partition_map = self.partition_map(&pool, &account_ids).await?; - - // Group updates per partition - let mut updates_per_partition: HashMap> = HashMap::new(); - for (account_id, data_key) in updates { - if let Some(&partition) = partition_map.get(&account_id) { - updates_per_partition - .entry(partition) - .or_default() - .push((account_id, data_key)); - } - } - - // Parallel update execution per partition - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - for (partition_id, rows) in updates_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let block_height_bd = bigdecimal::BigDecimal::from(block_height); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire_owned().await.unwrap(); - 
let start = Instant::now(); - let (account_ids, data_keys): (Vec<_>, Vec<_>) = rows.into_iter().unzip(); - - // Use UNNEST for batch update - let query = format!( - r#" - UPDATE state_changes_access_key_compact_{partition_id} AS t - SET block_height_to = $3 - FROM ( - SELECT unnest($1::text[]) AS account_id, unnest($2::text[]) AS data_key - ) AS u - WHERE t.account_id = u.account_id - AND t.data_key = u.data_key - AND t.block_height_to IS NULL; - "#, - partition_id = partition_id - ); - - sqlx::query(&query) - .bind(&account_ids) - .bind(&data_keys) - .bind(&block_height_bd) - .execute(&pool) - .await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Update done partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - account_ids.len() - ); - - Ok::<(), anyhow::Error>(()) - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Total update_state_changes_access_key duration shard={} elapsed={:?}", - shard_id, - overall_start.elapsed() - ); - - Ok(()) - } - - async fn insert_state_changes_contract( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - let overall_start = Instant::now(); - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_contract", - "state_changes_contract", - ]) - .inc(); - - // Extract only ContractCodeUpdate - let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes - .into_iter() - .filter_map(|change| { - if let near_primitives::views::StateChangeValueView::ContractCodeUpdate { - account_id, - code, - } = change.value - { - Some(( - account_id.to_string(), - code.to_vec(), - bigdecimal::BigDecimal::from(block_height), - )) - } else { - None - } - }) - .collect(); - - if inserts.is_empty() { - return Ok(()); - } - - // Compute partitions - let account_ids: Vec = 
inserts.iter().map(|(id, _, _)| id.clone()).collect(); - let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?; - - let partition_map = self.partition_map(&pool, &account_ids).await?; - - // Group rows by partition - let mut inserts_per_partition: HashMap> = HashMap::new(); - for (account_id, data_value, block_height) in inserts { - if let Some(&partition) = partition_map.get(&account_id) { - inserts_per_partition.entry(partition).or_default().push(( - account_id, - data_value, - block_height, - )); - } - } - - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - - for (partition_id, rows) in inserts_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire().await.unwrap(); - let start = Instant::now(); - - let table_name = format!("state_changes_contract_compact_{}", partition_id); - let mut qb = sqlx::QueryBuilder::new(format!( - "INSERT INTO {} (account_id, data_value, block_height_from, block_height_to) ", - table_name - )); - - qb.push_values( - rows.iter(), - |mut row, (account_id, data_value, block_height)| { - row.push_bind(account_id) - .push_bind(data_value) - .push_bind(block_height) - .push_bind(None::>); - }, - ); - - qb.push(" ON CONFLICT DO NOTHING"); - qb.build().execute(&pool).await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Insert contract partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - rows.len() - ); - - Ok::<(), anyhow::Error>(()) - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - tracing::debug!( - target: "database::postgres::state_indexer", - "Total insert_state_changes_contract duration shard={} elapsed={:?}", - shard_id, - overall_start.elapsed() - ); - - Ok(()) - } - - /// Update contract state changes with partitions under 
semaphore - async fn update_state_changes_contract( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - let overall_start = Instant::now(); - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_contract", - "state_changes_contract", - ]) - .inc(); - - // Collect account_ids for updates - let accounts: Vec = state_changes - .into_iter() - .filter_map(|change| match change.value { - near_primitives::views::StateChangeValueView::ContractCodeUpdate { - account_id, - .. - } - | near_primitives::views::StateChangeValueView::ContractCodeDeletion { - account_id, - } => Some(account_id.to_string()), - _ => None, - }) - .collect(); - - if accounts.is_empty() { - return Ok(()); - } - - let pool = self.shards_pool.get(&shard_id).ok_or(anyhow::anyhow!( - "Database connection for Shard_{} not found", - shard_id - ))?; - - let partition_map = self.partition_map(&pool, &accounts).await?; - - let mut updates_per_partition: HashMap> = HashMap::new(); - for account_id in accounts { - if let Some(&partition) = partition_map.get(&account_id) { - updates_per_partition - .entry(partition) - .or_default() - .push(account_id); - } else { - tracing::warn!("Partition not found for account_id: {}", account_id); - } - } - - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - - for (partition_id, account_ids) in updates_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let block_height_bd = bigdecimal::BigDecimal::from(block_height); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire().await.unwrap(); - let start = Instant::now(); - - let query = format!( - r#" - UPDATE state_changes_contract_compact_{partition_id} AS t - SET block_height_to = $2 - FROM (SELECT unnest($1::text[]) AS account_id) AS u - WHERE t.account_id = u.account_id 
- AND t.block_height_to IS NULL; - "# - ); - - sqlx::query(&query) - .bind(&account_ids) - .bind(&block_height_bd) - .execute(&pool) - .await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Update contract partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - account_ids.len() - ); - - Ok::<(), anyhow::Error>(()) - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - tracing::debug!( - target: "database::postgres::state_indexer", - "Total update_state_changes_contract duration shard={} elapsed={:?}", - shard_id, - overall_start.elapsed() - ); - - Ok(()) - } - - async fn insert_state_changes_account( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - let overall_start = Instant::now(); - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_account", - "state_changes_account", - ]) - .inc(); - - // Extract account updates - let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes - .into_iter() - .filter_map(|change| { - if let near_primitives::views::StateChangeValueView::AccountUpdate { - account_id, - account, - } = change.value - { - let data_value = - borsh::to_vec(&near_primitives::account::Account::from(&account)) - .expect("Failed to borsh serialize account"); - Some(( - account_id.to_string(), - data_value, - bigdecimal::BigDecimal::from(block_height), - )) - } else { - None - } - }) - .collect(); - - if inserts.is_empty() { - return Ok(()); - } - - // Compute partitions - let account_ids: Vec = inserts.iter().map(|(id, _, _)| id.clone()).collect(); - let pool = self.shards_pool.get(&shard_id).unwrap(); - - let partition_map = self.partition_map(&pool, &account_ids).await?; - - let mut inserts_per_partition: HashMap> = HashMap::new(); - for (account_id, data_value, block_height) in inserts { - if let Some(&partition) = partition_map.get(&account_id) { - 
inserts_per_partition.entry(partition).or_default().push(( - account_id, - data_value, - block_height, - )); - } - } - - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - - for (partition_id, rows) in inserts_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire().await.unwrap(); - let start = Instant::now(); - - let table_name = format!("state_changes_account_compact_{}", partition_id); - let mut qb = sqlx::QueryBuilder::new(format!( - "INSERT INTO {} (account_id, data_value, block_height_from, block_height_to) ", - table_name - )); - - qb.push_values( - rows.iter(), - |mut row, (account_id, data_value, block_height)| { - row.push_bind(account_id) - .push_bind(data_value) - .push_bind(block_height) - .push_bind(None::>); - }, - ); - - qb.push(" ON CONFLICT DO NOTHING"); - qb.build().execute(&pool).await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Insert account partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - rows.len() - ); - - Ok::<(), anyhow::Error>(()) - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - tracing::debug!( - target: "database::postgres::state_indexer", - "Total insert_state_changes_account duration shard={} elapsed={:?}", - shard_id, - overall_start.elapsed() - ); - - Ok(()) - } - - /// Update Account state changes using partitions + concurrency limit - async fn update_state_changes_account( - &self, - shard_id: near_primitives::types::ShardId, - state_changes: Vec, - block_height: u64, - ) -> anyhow::Result<()> { - let overall_start = Instant::now(); - crate::metrics::SHARD_DATABASE_WRITE_QUERIES - .with_label_values(&[ - &shard_id.to_string(), - "save_state_changes_account", - "state_changes_account", - ]) - .inc(); - - // Collect accounts for update - let accounts: Vec = state_changes - .into_iter() - 
.filter_map(|change| match change.value { - near_primitives::views::StateChangeValueView::AccountUpdate { - account_id, .. - } - | near_primitives::views::StateChangeValueView::AccountDeletion { account_id } => { - Some(account_id.to_string()) - } - _ => None, - }) - .collect(); - - if accounts.is_empty() { - return Ok(()); - } - - // Compute partitions - let pool = self.shards_pool.get(&shard_id).unwrap(); - - let partition_map = self.partition_map(&pool, &accounts).await?; - - let mut updates_per_partition: HashMap> = HashMap::new(); - for account_id in accounts { - if let Some(&partition) = partition_map.get(&account_id) { - updates_per_partition - .entry(partition) - .or_default() - .push(account_id); - } else { - tracing::warn!("Partition not found for account_id: {}", account_id); - } - } - - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - - for (partition_id, account_ids) in updates_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let block_height_bd = bigdecimal::BigDecimal::from(block_height); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire().await.unwrap(); - let start = Instant::now(); - - let query = format!( - r#" - UPDATE state_changes_account_compact_{partition_id} AS t - SET block_height_to = $2 - FROM (SELECT unnest($1::text[]) AS account_id) AS u - WHERE t.account_id = u.account_id - AND t.block_height_to IS NULL; - "# - ); - - sqlx::query(&query) - .bind(&account_ids) - .bind(&block_height_bd) - .execute(&pool) - .await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Update account partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - account_ids.len() - ); - - Ok::<(), anyhow::Error>(()) - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - tracing::debug!( - target: "database::postgres::state_indexer", - "Total update_state_changes_account duration shard={} 
elapsed={:?}", - shard_id, - overall_start.elapsed() - ); - - Ok(()) - } -} diff --git a/database/src/postgres/state_indexer/helpers.rs b/database/src/postgres/state_indexer/helpers.rs new file mode 100644 index 00000000..2fa817e7 --- /dev/null +++ b/database/src/postgres/state_indexer/helpers.rs @@ -0,0 +1,538 @@ +use std::collections::HashMap; +use std::time::Instant; + +use futures::future::try_join_all; +use sqlx::Row; + +/// PostgreSQL State Indexer Implementation +/// +/// ARCHITECTURAL OVERVIEW: +/// This module implements efficient batch processing for NEAR Protocol state changes +/// across partitioned PostgreSQL tables. The design focuses on: +/// +/// 1. **Partitioned Tables**: All state tables are horizontally partitioned by account_id +/// using PostgreSQL's hashtext() function for consistent distribution +/// +/// 2. **Batch Processing**: Operations are grouped by partition and executed in parallel +/// to maximize throughput while respecting connection pool limits +/// +/// 3. **Helper Function Strategy**: Four specialized helpers handle different data patterns: +/// - execute_partitioned_account_update: Account-only updates (UNNEST pattern) +/// - execute_partitioned_key_update: Composite key updates (CTE pattern) +/// - execute_partitioned_standard_insert: 4-column inserts with data_key +/// - execute_partitioned_keyless_insert: 3-column inserts without data_key +/// +/// 4. **SQL Pattern Selection**: Different update patterns optimize for different scenarios: +/// - UNNEST: Efficient for uniform operations (same block_height, simple matching) +/// - CTE: Required for variable data operations (different block_heights, complex conditions) +/// +/// 5. 
**Concurrency Control**: Semaphores limit parallel operations to prevent database +/// connection pool exhaustion while maximizing throughput +/// +/// SPECIAL CASES: +/// - update_state_changes_access_key: Uses in-place UNNEST pattern instead of CTE helper +/// due to uniform block_height and simpler matching requirements (see method comments) +/// +impl crate::PostgresDBManager { + /// Helper function for partitioned update operations using account-only updates + /// + /// This helper is used for tables that only have account_id as the primary key component + /// (like account and contract tables), where we need to update block_height_to for all + /// rows matching specific account_ids. + /// + /// SQL Pattern: Uses UNNEST with a single array for account_ids + /// ```sql + /// UPDATE table_partition AS t + /// SET block_height_to = $2 + /// FROM (SELECT unnest($1::text[]) AS account_id) AS u + /// WHERE t.account_id = u.account_id AND t.block_height_to IS NULL AND t.block_height_from < $2; + /// ``` + /// + /// Used by: update_state_changes_account, update_state_changes_contract + pub(crate) async fn execute_partitioned_account_update( + &self, + shard_id: near_primitives::types::ShardId, + table_prefix: String, + operation_name: String, + account_ids: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + if account_ids.is_empty() { + return Ok(()); + } + + // Get database connection pool for this shard + let pool = self.get_shard_pool(shard_id)?; + + // Compute partition assignments for all account_ids using PostgreSQL's hashtext() function + // This ensures consistent partition distribution matching the table partitioning scheme + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group account_ids by their target partition for batch processing + // This reduces the number of database queries by updating entire partitions at once + let mut accounts_per_partition: HashMap> = HashMap::new(); + for account_id in account_ids { + if let Some(&partition) =
partition_map.get(&account_id) { + accounts_per_partition + .entry(partition) + .or_default() + .push(account_id); + } else { + tracing::warn!("Partition not found for account_id: {}", account_id); + } + } + + // Execute updates in parallel across partitions with concurrency control + // Each partition can be updated independently, improving throughput + let semaphore = + std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, partition_accounts) in accounts_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let operation_name = operation_name.clone(); + let table_prefix = table_prefix.clone(); + let block_height_bd = bigdecimal::BigDecimal::from(block_height); + + let task = tokio::spawn(async move { + // Acquire semaphore permit to limit concurrent database operations + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); + + // Build UPDATE query using UNNEST to batch-process multiple account_ids + // UNNEST converts the array into rows to join on; `block_height_from < $2` stops a replayed block from closing the row it inserted itself + let query = format!( + r#" + UPDATE {table_prefix}_{partition_id} AS t + SET block_height_to = $2 + FROM (SELECT unnest($1::text[]) AS account_id) AS u + WHERE t.account_id = u.account_id + AND t.block_height_to IS NULL AND t.block_height_from < $2; + "#, + table_prefix = table_prefix, + partition_id = partition_id + ); + + sqlx::query(&query) + .bind(&partition_accounts) + .bind(&block_height_bd) + .execute(&pool) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update done operation={} partition={} elapsed={:?} rows={}", + operation_name, + partition_id, + start.elapsed(), + partition_accounts.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + // `?` only surfaces JoinError; also propagate each partition task's own Err instead of dropping it + try_join_all(tasks).await?.into_iter().try_for_each(|outcome| outcome) + } + + /// Helper function for partitioned updates with composite key (account_id + data_key) + /// + /// This helper is used for tables
that have composite primary keys with both account_id + /// and data_key components (like state_changes_data table), where we need precise + /// row-level updates based on both key components. + /// + /// SQL Pattern: Uses CTE (Common Table Expression) with VALUES for structured data + /// ```sql + /// WITH new_data (account_id, data_key, block_height) AS ( + /// VALUES ('acc1', 'key1', 100), ('acc2', 'key2', 100), ... + /// ) + /// UPDATE table AS old + /// SET block_height_to = new_data.block_height + /// FROM new_data + /// WHERE old.account_id = new_data.account_id + /// AND old.data_key = new_data.data_key + /// AND old.block_height_from < new_data.block_height + /// AND old.block_height_to IS NULL; + /// ``` + /// + /// The CTE approach is necessary here because we need to match on multiple columns + /// with different block_height values per row, which UNNEST cannot handle efficiently. + /// + /// Used by: update_state_changes_data + pub(crate) async fn execute_partitioned_key_update( + &self, + shard_id: near_primitives::types::ShardId, + table_prefix: String, + operation_name: String, + updates: Vec<(String, String, bigdecimal::BigDecimal)>, // (account_id, data_key, block_height) + ) -> anyhow::Result<()> { + if updates.is_empty() { + return Ok(()); + } + + let pool = self.get_shard_pool(shard_id)?; + // Extract account_ids for partition mapping (data_key distribution is handled by account_id partitioning) + let account_ids: Vec = updates.iter().map(|(id, _, _)| id.clone()).collect(); + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group updates by partition, preserving the complete tuple for CTE processing + let mut updates_per_partition: HashMap> = + HashMap::new(); + for (account_id, data_key, block_height) in updates { + if let Some(&partition) = partition_map.get(&account_id) { + updates_per_partition.entry(partition).or_default().push(( + account_id, + data_key, + block_height, + )); + } else { +
tracing::warn!("Partition not found for account_id: {}", account_id); + } + } + + let semaphore = + std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in updates_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let table_name = format!("{}_{}", table_prefix, partition_id); + let operation_name = operation_name.clone(); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); + + // Build CTE-based UPDATE query using sqlx QueryBuilder for type safety + // CTE allows us to provide structured data (account_id, data_key, block_height) + // and join it efficiently with the target table for precise updates + let mut qb = sqlx::QueryBuilder::new( + "WITH new_data (account_id, data_key, block_height) AS (", + ); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_key, block_height)| { + row.push_bind(account_id) + .push_bind(data_key) + .push_bind(block_height); + }, + ); + + // Complete the CTE and add the UPDATE clause with all necessary conditions + // The four AND conditions ensure data integrity and proper versioning: + // 1. account_id match - partition-level key + // 2. data_key match - row-level key + // 3. block_height comparison - prevents updating newer data with older data + // 4.
NULL check - only update active records (not already closed) + qb.push(format!( + ") UPDATE {} AS old \ + SET block_height_to = new_data.block_height \ + FROM new_data \ + WHERE old.account_id = new_data.account_id \ + AND old.data_key = new_data.data_key \ + AND old.block_height_from < new_data.block_height \ + AND old.block_height_to IS NULL;", + table_name, + )); + + let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update done operation={} partition={} elapsed={:?} rows={}", + operation_name, + partition_id, + start.elapsed(), + rows.len() + ); + + result + }); + + tasks.push(task); + } + + // `?` only surfaces JoinError; also propagate each partition task's own Err instead of dropping it + try_join_all(tasks).await?.into_iter().try_for_each(|outcome| outcome.map(drop)) + } + + /// Helper function for standard 4-column inserts with data_key + /// + /// This helper handles inserts into tables with composite keys that include both + /// account_id and data_key (like state_changes_data and state_changes_access_key). + /// + /// Table Schema: (account_id, data_key, data_value, block_height_from, block_height_to) + /// - account_id: partition key for data distribution + /// - data_key: secondary key component (hex-encoded for data, borsh-serialized for access keys) + /// - data_value: the actual state data (raw bytes) + /// - block_height_from: when this version became active + /// - block_height_to: when this version was superseded (NULL for current) + /// + /// Used by: insert_state_changes_data, insert_state_changes_access_key + pub(crate) async fn execute_partitioned_standard_insert( + &self, + shard_id: near_primitives::types::ShardId, + table_prefix: String, + operation_name: String, + inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)>, // (account_id, data_key, data_value, block_height) + ) -> anyhow::Result<()> { + if inserts.is_empty() { + return Ok(()); + } + + let pool = self.get_shard_pool(shard_id)?; + let account_ids: Vec = inserts.iter().map(|(id, _, _, _)| id.clone()).collect(); + let
partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group inserts by partition for efficient batch processing + let mut inserts_per_partition: HashMap< + i32, + Vec<(String, String, Vec, bigdecimal::BigDecimal)>, + > = HashMap::new(); + for (account_id, data_key, data_value, block_height) in inserts { + if let Some(&partition) = partition_map.get(&account_id) { + inserts_per_partition.entry(partition).or_default().push(( + account_id, + data_key, + data_value, + block_height, + )); + } else { + tracing::warn!("Partition not found for account_id: {}", account_id); + } + } + + let semaphore = + std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in inserts_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let table_name = format!("{}_{}", table_prefix, partition_id); + let operation_name = operation_name.clone(); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); // after the permit, so elapsed excludes semaphore wait (matches the update helpers) + + // Build batch INSERT using sqlx QueryBuilder for type safety and performance + let mut qb = sqlx::QueryBuilder::new(format!( + "INSERT INTO {} (account_id, data_key, data_value, block_height_from, block_height_to) ", + table_name, + )); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_key, data_value, block_height)| { + row.push_bind(account_id) + .push_bind(data_key) + .push_bind(data_value) + .push_bind(block_height) + .push_bind(None::>); // block_height_to starts as NULL (active) + }, + ); + + // Use ON CONFLICT DO NOTHING to handle duplicate inserts gracefully + // This is important for idempotency during indexer restarts or replays + qb.push(" ON CONFLICT DO NOTHING"); + + let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + + tracing::debug!( + target: "database::postgres::state_indexer", + "Insert done operation={} partition={} elapsed={:?}
rows={}", + operation_name, + partition_id, + start.elapsed(), + rows.len() + ); + + result + }); + + tasks.push(task); + } + + // `?` only surfaces JoinError; also propagate each partition task's own Err instead of dropping it + try_join_all(tasks).await?.into_iter().try_for_each(|outcome| outcome.map(drop)) + } + + /// Helper function for keyless 3-column inserts without data_key + /// + /// This helper handles inserts into tables that only use account_id as the key + /// (like state_changes_account and state_changes_contract), where the data_value + /// represents the entire state of the account or contract. + /// + /// Table Schema: (account_id, data_value, block_height_from, block_height_to) + /// - account_id: primary partition key + /// - data_value: complete state data (borsh-serialized account or contract code) + /// - block_height_from: when this version became active + /// - block_height_to: when this version was superseded (NULL for current) + /// + /// The absence of data_key means each account/contract has exactly one active + /// record at any given block height, representing its complete state. + /// + /// Used by: insert_state_changes_account, insert_state_changes_contract + pub(crate) async fn execute_partitioned_keyless_insert( + &self, + shard_id: near_primitives::types::ShardId, + table_prefix: String, + operation_name: String, + inserts: Vec<(String, Vec, bigdecimal::BigDecimal)>, // (account_id, data_value, block_height) + ) -> anyhow::Result<()> { + if inserts.is_empty() { + return Ok(()); + } + + let pool = self.get_shard_pool(shard_id)?; + let account_ids: Vec = inserts.iter().map(|(id, _, _)| id.clone()).collect(); + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group inserts per partition + let mut inserts_per_partition: HashMap< + i32, + Vec<(String, Vec, bigdecimal::BigDecimal)>, + > = HashMap::new(); + for (account_id, data_value, block_height) in inserts { + if let Some(&partition) = partition_map.get(&account_id) { + inserts_per_partition.entry(partition).or_default().push(( + account_id, + data_value, + block_height, + )); + } else { +
tracing::warn!("Partition not found for account_id: {}", account_id); + } + } + + let semaphore = + std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + + for (partition_id, rows) in inserts_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let table_name = format!("{}_{}", table_prefix, partition_id); + let operation_name = operation_name.clone(); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); // after the permit, so elapsed excludes semaphore wait (matches the update helpers) + // Build batch INSERT for keyless tables (no data_key column) + let mut qb = sqlx::QueryBuilder::new(format!( + "INSERT INTO {} (account_id, data_value, block_height_from, block_height_to) ", + table_name, + )); + + qb.push_values( + rows.iter(), + |mut row, (account_id, data_value, block_height)| { + row.push_bind(account_id) + .push_bind(data_value) + .push_bind(block_height) + .push_bind(None::>); // block_height_to starts as NULL + }, + ); + + qb.push(" ON CONFLICT DO NOTHING"); + + let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + + tracing::debug!( + target: "database::postgres::state_indexer", + "Insert done operation={} partition={} elapsed={:?} rows={}", + operation_name, + partition_id, + start.elapsed(), + rows.len() + ); + + result + }); + + tasks.push(task); + } + + // `?` only surfaces JoinError; also propagate each partition task's own Err instead of dropping it + try_join_all(tasks).await?.into_iter().try_for_each(|outcome| outcome.map(drop)) + } + + /// Helper function to record metrics consistently across all database operations + /// + /// Increments the SHARD_DATABASE_WRITE_QUERIES metric with standardized labels + /// for monitoring and observability of database write patterns.
+ pub(crate) fn record_shard_write_metric( + &self, + shard_id: near_primitives::types::ShardId, + operation: &str, + table: &str, + ) { + crate::metrics::SHARD_DATABASE_WRITE_QUERIES + .with_label_values(&[&shard_id.to_string(), operation, table]) // label order: shard id, operation, table + .inc(); + } + + /// Helper function to get database pool with consistent error handling + /// + /// Centralizes the pool retrieval logic and provides meaningful error messages + /// when a shard's database connection is not available. + pub(crate) fn get_shard_pool( + &self, + shard_id: near_primitives::types::ShardId, + ) -> anyhow::Result<&sqlx::PgPool> { + self.shards_pool + .get(&shard_id) + .ok_or_else(|| anyhow::anyhow!("Database connection for Shard_{} not found", shard_id)) // unknown shard id becomes an Err for the caller to propagate + } + + /// Compute partition assignments for account_ids using PostgreSQL's hashtext() function + /// + /// This function is critical for maintaining consistency with the database partitioning scheme. + /// It uses the same hash function (hashtext) and modulo operation that PostgreSQL uses + /// for automatic partition routing, ensuring our manual partition targeting matches + /// the database's internal partition selection. + /// NOTE(review): hashtext() returns a signed int4 and mod() keeps the dividend's sign, so the value can be negative - confirm the migration names/routes partitions with this exact expression. + /// The computation is done in PostgreSQL rather than Rust to guarantee identical + /// hash results regardless of client-side hash implementations or endianness differences.
+ pub(crate) async fn partition_map( + &self, + pool: &sqlx::PgPool, + account_ids: &Vec, + ) -> anyhow::Result> { + let now = std::time::Instant::now(); + + // Execute partition calculation in PostgreSQL to ensure consistency + // This MUST use the same hash function and modulo as the partitioned table definitions + let partition_rows = sqlx::query( + "SELECT account_id, mod(hashtext(account_id), $2)::int AS partition + FROM unnest($1::text[]) AS account_id", + ) + .bind(account_ids) + .bind(super::PARTITIONS) + .fetch_all(pool) + .await?; + + let partition_map: HashMap = partition_rows + .into_iter() + .map(|row| { + let account_id: String = row.try_get("account_id")?; + let partition: i32 = row.try_get("partition")?; + Ok((account_id, partition)) + }) + .collect::<Result<_, sqlx::Error>>()?; // a column decode failure is returned as Err instead of panicking the indexer + tracing::debug!( + target: "database::postgres::state_indexer", + "Partition map computed in {:?} for {} accounts", + now.elapsed(), + account_ids.len() + ); + Ok(partition_map) + } +} diff --git a/database/src/postgres/state_indexer/mod.rs b/database/src/postgres/state_indexer/mod.rs new file mode 100644 index 00000000..80bdab01 --- /dev/null +++ b/database/src/postgres/state_indexer/mod.rs @@ -0,0 +1,714 @@ +use std::collections::HashMap; +use std::time::Instant; + +use bigdecimal::ToPrimitive; +use futures::{future::try_join_all, FutureExt}; + +mod helpers; + +// Database configuration constants +pub(crate) const PARTITIONS: i32 = 100; // Number of partitions for sharded tables (consistent with hashtext() mod) +pub(crate) const MAX_CONCURRENT_QUERIES: usize = 16; // Limit parallel database operations to prevent connection pool exhaustion + +impl crate::PostgresDBManager { + async fn save_chunks_unique( + &self, + block_height: u64, + chunks: Vec<( + crate::primitives::ChunkHash, + crate::primitives::ShardId, + crate::primitives::HeightIncluded, + )>, + ) -> anyhow::Result<()> { + let unique_chunks = chunks + .iter() + .filter(|(_chunk_hash, _shard_id, height_included)| height_included
== &block_height) + .collect::>(); + + if !unique_chunks.is_empty() { + crate::metrics::META_DATABASE_WRITE_QUERIES + .with_label_values(&["save_chunks", "chunks"]) + .inc(); + let mut query_builder: sqlx::QueryBuilder = + sqlx::QueryBuilder::new("INSERT INTO chunks (chunk_hash, block_height, shard_id) "); + + query_builder.push_values( + unique_chunks.iter(), + |mut values, (chunk_hash, shard_id, height_included)| { + values + .push_bind(chunk_hash.to_string()) + .push_bind(bigdecimal::BigDecimal::from(*height_included)) + .push_bind(bigdecimal::BigDecimal::from(*shard_id)); + }, + ); + query_builder.push(" ON CONFLICT DO NOTHING;"); + query_builder.build().execute(&self.meta_db_pool).await?; + } + Ok(()) + } + + async fn save_chunks_duplicate( + &self, + block_height: u64, + chunks: Vec<( + crate::primitives::ChunkHash, + crate::primitives::ShardId, + crate::primitives::HeightIncluded, + )>, + ) -> anyhow::Result<()> { + let chunks_duplicate = chunks + .iter() + .filter(|(_chunk_hash, _shard_id, height_included)| height_included != &block_height) + .collect::>(); + if !chunks_duplicate.is_empty() { + crate::metrics::META_DATABASE_WRITE_QUERIES + .with_label_values(&["save_chunks", "chunks_duplicate"]) + .inc(); + let mut query_builder: sqlx::QueryBuilder = + sqlx::QueryBuilder::new("INSERT INTO chunks_duplicate (chunk_hash, block_height, shard_id, included_in_block_height) "); + + query_builder.push_values( + chunks.iter(), + |mut values, (chunk_hash, shard_id, height_included)| { + values + .push_bind(chunk_hash.to_string()) + .push_bind(bigdecimal::BigDecimal::from(block_height)) + .push_bind(bigdecimal::BigDecimal::from(*shard_id)) + .push_bind(bigdecimal::BigDecimal::from(*height_included)); + }, + ); + query_builder.push(" ON CONFLICT DO NOTHING;"); + query_builder.build().execute(&self.meta_db_pool).await?; + } + Ok(()) + } +} +#[async_trait::async_trait] +impl crate::StateIndexerDbManager for crate::PostgresDBManager { + async fn save_block( + &self, + 
block_height: u64, + block_hash: near_primitives::hash::CryptoHash, + ) -> anyhow::Result<()> { + crate::metrics::META_DATABASE_WRITE_QUERIES + .with_label_values(&["save_block", "blocks"]) + .inc(); + sqlx::query( + " + INSERT INTO blocks (block_height, block_hash) + VALUES ($1, $2) ON CONFLICT DO NOTHING; + ", + ) + .bind(bigdecimal::BigDecimal::from(block_height)) + .bind(block_hash.to_string()) + .execute(&self.meta_db_pool) + .await?; + Ok(()) + } + + async fn save_chunks( + &self, + block_height: u64, + chunks: Vec<( + crate::primitives::ChunkHash, + crate::primitives::ShardId, + crate::primitives::HeightIncluded, + )>, + ) -> anyhow::Result<()> { + let save_chunks_unique_future = self.save_chunks_unique(block_height, chunks.clone()); + let save_chunks_duplicate_future = self.save_chunks_duplicate(block_height, chunks); + + futures::future::join_all([ + save_chunks_unique_future.boxed(), + save_chunks_duplicate_future.boxed(), + ]) + .await + .into_iter() + .collect::>() + } + + async fn get_block_height_by_hash( + &self, + block_hash: near_primitives::hash::CryptoHash, + method_name: &str, + ) -> anyhow::Result { + crate::metrics::META_DATABASE_READ_QUERIES + .with_label_values(&[method_name, "blocks"]) + .inc(); + let (block_height,): (bigdecimal::BigDecimal,) = sqlx::query_as( + " + SELECT block_height + FROM blocks + WHERE block_hash = $1 + LIMIT 1; + ", + ) + .bind(block_hash.to_string()) + .fetch_one(&self.meta_db_pool) + .await?; + block_height + .to_u64() + .ok_or_else(|| anyhow::anyhow!("Failed to parse `block_height` to u64")) + } + + async fn update_meta(&self, indexer_id: &str, block_height: u64) -> anyhow::Result<()> { + crate::metrics::META_DATABASE_WRITE_QUERIES + .with_label_values(&["update_meta", "meta"]) + .inc(); + sqlx::query( + " + INSERT INTO meta (indexer_id, last_processed_block_height) + VALUES ($1, $2) + ON CONFLICT (indexer_id) + DO UPDATE SET last_processed_block_height = $2; + ", + ) + .bind(indexer_id) + 
.bind(bigdecimal::BigDecimal::from(block_height)) + .execute(&self.meta_db_pool) + .await?; + Ok(()) + } + + async fn get_last_processed_block_height(&self, indexer_id: &str) -> anyhow::Result { + crate::metrics::META_DATABASE_READ_QUERIES + .with_label_values(&["get_last_processed_block_height", "meta"]) + .inc(); + let (last_processed_block_height,): (bigdecimal::BigDecimal,) = sqlx::query_as( + " + SELECT last_processed_block_height + FROM meta + WHERE indexer_id = $1 + LIMIT 1; + ", + ) + .bind(indexer_id) + .fetch_one(&self.meta_db_pool) + .await?; + last_processed_block_height + .to_u64() + .ok_or_else(|| anyhow::anyhow!("Failed to parse `last_processed_block_height` to u64")) + } + + async fn save_validators( + &self, + epoch_id: near_primitives::hash::CryptoHash, + epoch_height: u64, + epoch_start_height: u64, + validators_info: &near_primitives::views::EpochValidatorInfo, + epoch_end_block_hash: near_primitives::hash::CryptoHash, + ) -> anyhow::Result<()> { + crate::metrics::META_DATABASE_WRITE_QUERIES + .with_label_values(&["add_validators", "validators"]) + .inc(); + let epoch_end_block_height = self + .get_block_height_by_hash(epoch_end_block_hash, "add_validators") + .await?; + sqlx::query( + " + INSERT INTO validators (epoch_id, epoch_height, epoch_start_height, epoch_end_height, validators_info) + VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING; + " + ) + .bind(epoch_id.to_string()) + .bind(bigdecimal::BigDecimal::from(epoch_height)) + .bind(bigdecimal::BigDecimal::from(epoch_start_height)) + .bind(bigdecimal::BigDecimal::from(epoch_end_block_height)) + .bind(&serde_json::to_value(validators_info)?) 
+ .execute(&self.meta_db_pool) + .await?; + Ok(()) + } + + async fn insert_state_changes_data( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + self.record_shard_write_metric(shard_id, "save_state_changes_data", "state_changes_data"); + + // Extract relevant data + let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes + .iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::DataUpdate { + account_id, + key, + value, + } = &change.value + { + let data_key: String = hex::encode(key.as_slice()); + Some(( + account_id.to_string(), + data_key, + value.clone().to_vec(), + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None + } + }) + .collect(); + + // Use the standard insert helper + self.execute_partitioned_standard_insert( + shard_id, + "state_changes_data_compact".to_string(), + "insert_state_changes_data".to_string(), + inserts, + ) + .await + } + + async fn update_state_changes_data( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + self.record_shard_write_metric(shard_id, "save_state_changes_data", "state_changes_data"); + + let updates: Vec<(String, String, bigdecimal::BigDecimal)> = + state_changes + .iter() + .filter_map(|change| match &change.value { + near_primitives::views::StateChangeValueView::DataUpdate { + account_id, + key, + .. 
+ } + | near_primitives::views::StateChangeValueView::DataDeletion { + account_id, + key, + } => { + let data_key: &[u8] = key.as_ref(); + let data_key = hex::encode(data_key).to_string(); + Some(( + account_id.to_string(), + data_key, + bigdecimal::BigDecimal::from(block_height), + )) + } + _ => None, + }) + .collect(); + + // Use the key-based update helper that preserves all complex AND conditions + // This uses CTE pattern because data updates can have varying block heights + // and need the full 4-condition WHERE clause for data integrity + self.execute_partitioned_key_update( + shard_id, + "state_changes_data_compact".to_string(), + "update_state_changes_data".to_string(), + updates, + ) + .await + } + + async fn insert_state_changes_access_key( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + self.record_shard_write_metric( + shard_id, + "save_state_changes_access_key", + "state_changes_access_key", + ); + + // Extract relevant updates + let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes + .iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::AccessKeyUpdate { + account_id, + public_key, + access_key, + } = &change.value + { + let data_key = hex::encode( + borsh::to_vec(public_key).expect("Failed to borsh serialize public key"), + ); + let data_value = + borsh::to_vec(access_key).expect("Failed to borsh serialize access key"); + Some(( + account_id.to_string(), + data_key, + data_value, + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None + } + }) + .collect(); + + // Use the standard insert helper + self.execute_partitioned_standard_insert( + shard_id, + "state_changes_access_key_compact".to_string(), + "insert_access_key".to_string(), + inserts, + ) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total 
insert_state_changes_access_key duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + /// Update access key state changes - SPECIAL CASE: In-place implementation + /// + /// This method is NOT using execute_partitioned_key_update helper, and here's why: + /// + /// TECHNICAL REASONING: + /// 1. **Different SQL Pattern**: Access key updates use UNNEST with separate arrays + /// instead of CTE (Common Table Expression) used by data updates + /// 2. **Uniform Block Height**: All updates in a batch share the same block_height, + /// unlike data updates where each row might have different block heights + /// 3. **Simpler Matching**: Only needs (account_id, data_key) pairs for matching, + /// doesn't need the complex 4-condition WHERE clause of data updates + /// + /// SQL PATTERN COMPARISON: + /// - Data updates (CTE): WITH new_data(...) UPDATE table SET... FROM new_data WHERE... + /// - Access key updates: UPDATE table SET... FROM (SELECT unnest(...)) WHERE... + /// + /// PERFORMANCE CONSIDERATIONS: + /// - UNNEST with arrays is more efficient for simple key-pair matching + /// - CTE is better for complex multi-column operations with varying data + /// - The uniform block_height allows using a single parameter ($3) instead of per-row values + /// + /// This is a legitimate architectural decision, not an oversight in refactoring. 
+ async fn update_state_changes_access_key( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + self.record_shard_write_metric( + shard_id, + "save_state_changes_access_key", + "state_changes_access_key", + ); + + // Collect updates: (account_id, data_key) pairs for access key modifications + // Note: We only need the key pairs since all updates share the same block_height + let updates: Vec<(String, String)> = state_changes + .iter() + .filter_map(|c| match &c.value { + near_primitives::views::StateChangeValueView::AccessKeyUpdate { + account_id, + public_key, + .. + } + | near_primitives::views::StateChangeValueView::AccessKeyDeletion { + account_id, + public_key, + } => Some((account_id.to_string(), hex::encode(public_key.key_data()))), + _ => None, + }) + .collect(); + + if updates.is_empty() { + return Ok(()); + } + + // IMPLEMENTATION NOTE: Custom partitioning logic (not using execute_partitioned_key_update) + // This is intentional because access key updates have different requirements: + // - Uniform block_height for all updates (allows single parameter binding) + // - Simple (account_id, data_key) matching (UNNEST more efficient than CTE) + // - No need for block_height comparison per row (all updates are from same block) + let pool = self.get_shard_pool(shard_id)?; + let account_ids: Vec = updates.iter().map(|(id, _)| id.clone()).collect(); + let partition_map = self.partition_map(&pool, &account_ids).await?; + + // Group updates by partition but keep them as simple (account_id, data_key) pairs + let mut updates_per_partition: HashMap> = HashMap::new(); + for (account_id, data_key) in updates { + if let Some(&partition) = partition_map.get(&account_id) { + updates_per_partition + .entry(partition) + .or_default() + .push((account_id, data_key)); + } + } + + // Parallel update execution per partition using the UNNEST pattern + let semaphore = 
std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); + let mut tasks = Vec::new(); + for (partition_id, rows) in updates_per_partition { + let pool = pool.clone(); + let semaphore = semaphore.clone(); + let block_height_bd = bigdecimal::BigDecimal::from(block_height); + + let task = tokio::spawn(async move { + let _permit = semaphore.acquire_owned().await.unwrap(); + let start = Instant::now(); + + // Separate account_ids and data_keys into parallel arrays for UNNEST + // This is the key difference from CTE approach used in data updates + let (account_ids, data_keys): (Vec<_>, Vec<_>) = rows.into_iter().unzip(); + + // UNNEST PATTERN: Convert arrays to rows and JOIN for batch updates + // This is more efficient than CTE when all updates share the same block_height + // and only need simple key-pair matching without per-row data variations + let query = format!( + r#" + UPDATE state_changes_access_key_compact_{partition_id} AS t + SET block_height_to = $3 + FROM ( + SELECT unnest($1::text[]) AS account_id, unnest($2::text[]) AS data_key + ) AS u + WHERE t.account_id = u.account_id + AND t.data_key = u.data_key + AND t.block_height_to IS NULL; + "#, + partition_id = partition_id + ); + + sqlx::query(&query) + .bind(&account_ids) + .bind(&data_keys) + .bind(&block_height_bd) + .execute(&pool) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Update done partition={} elapsed={:?} rows={}", + partition_id, + start.elapsed(), + account_ids.len() + ); + + Ok::<(), anyhow::Error>(()) + }); + + tasks.push(task); + } + + try_join_all(tasks).await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total update_state_changes_access_key duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + async fn insert_state_changes_contract( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let 
overall_start = Instant::now(); + self.record_shard_write_metric( + shard_id, + "save_state_changes_contract", + "state_changes_contract", + ); + + // Extract only ContractCodeUpdate + let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes + .into_iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::ContractCodeUpdate { + account_id, + code, + } = change.value + { + Some(( + account_id.to_string(), + code.to_vec(), + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None + } + }) + .collect(); + + // Use the keyless insert helper (3-column) + self.execute_partitioned_keyless_insert( + shard_id, + "state_changes_contract_compact".to_string(), + "insert_contract".to_string(), + inserts, + ) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total insert_state_changes_contract duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + /// Update contract state changes with partitions under semaphore + async fn update_state_changes_contract( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + self.record_shard_write_metric( + shard_id, + "save_state_changes_contract", + "state_changes_contract", + ); + + // Collect account_ids for updates + let accounts: Vec = state_changes + .into_iter() + .filter_map(|change| match change.value { + near_primitives::views::StateChangeValueView::ContractCodeUpdate { + account_id, + .. 
+ } + | near_primitives::views::StateChangeValueView::ContractCodeDeletion { + account_id, + } => Some(account_id.to_string()), + _ => None, + }) + .collect(); + + // Use the account-only update helper + self.execute_partitioned_account_update( + shard_id, + "state_changes_contract_compact".to_string(), + "update_contract".to_string(), + accounts, + block_height, + ) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total update_state_changes_contract duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + async fn insert_state_changes_account( + &self, + shard_id: near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + self.record_shard_write_metric( + shard_id, + "save_state_changes_account", + "state_changes_account", + ); + + // Extract account updates + let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes + .into_iter() + .filter_map(|change| { + if let near_primitives::views::StateChangeValueView::AccountUpdate { + account_id, + account, + } = change.value + { + let data_value = + borsh::to_vec(&near_primitives::account::Account::from(&account)) + .expect("Failed to borsh serialize account"); + Some(( + account_id.to_string(), + data_value, + bigdecimal::BigDecimal::from(block_height), + )) + } else { + None + } + }) + .collect(); + + // Use the keyless insert helper (3-column) + self.execute_partitioned_keyless_insert( + shard_id, + "state_changes_account_compact".to_string(), + "insert_account".to_string(), + inserts, + ) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total insert_state_changes_account duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } + + /// Update Account state changes using partitions + concurrency limit + async fn update_state_changes_account( + &self, + shard_id: 
near_primitives::types::ShardId, + state_changes: Vec, + block_height: u64, + ) -> anyhow::Result<()> { + let overall_start = Instant::now(); + self.record_shard_write_metric( + shard_id, + "save_state_changes_account", + "state_changes_account", + ); + + // Collect accounts for update + let accounts: Vec = state_changes + .into_iter() + .filter_map(|change| match change.value { + near_primitives::views::StateChangeValueView::AccountUpdate { + account_id, .. + } + | near_primitives::views::StateChangeValueView::AccountDeletion { account_id } => { + Some(account_id.to_string()) + } + _ => None, + }) + .collect(); + + // Use the account-only update helper + self.execute_partitioned_account_update( + shard_id, + "state_changes_account_compact".to_string(), + "update_account".to_string(), + accounts, + block_height, + ) + .await?; + + tracing::debug!( + target: "database::postgres::state_indexer", + "Total update_state_changes_account duration shard={} elapsed={:?}", + shard_id, + overall_start.elapsed() + ); + + Ok(()) + } +} diff --git a/logic-state-indexer/src/lib.rs b/logic-state-indexer/src/lib.rs index 2e630e06..684f6e4a 100644 --- a/logic-state-indexer/src/lib.rs +++ b/logic-state-indexer/src/lib.rs @@ -235,7 +235,6 @@ pub async fn handle_streamer_message( &streamer_message, db_manager, block_height, - block_hash, &indexer_config, shard_layout, ) @@ -330,7 +329,6 @@ async fn handle_state_changes( streamer_message: &near_indexer_primitives::StreamerMessage, db_manager: &(impl database::StateIndexerDbManager + Sync + Send + 'static), block_height: u64, - block_hash: CryptoHash, indexer_config: &(impl configuration::RightsizingConfig + std::fmt::Debug), shard_layout: &near_primitives::shard_layout::ShardLayout, ) -> anyhow::Result<()> { From fec57e3645ae750473a0640fd61158d7315ec218 Mon Sep 17 00:00:00 2001 From: Bohdan Khorolets Date: Fri, 25 Jul 2025 15:20:06 +0300 Subject: [PATCH 03/13] chore(database, state-indexer): Add additional metrics to monitor how long 
state indexer writes take time and how many partitions touched --- database/src/metrics.rs | 28 ++++++ .../src/postgres/state_indexer/helpers.rs | 94 ++++++++++++++++++- database/src/postgres/state_indexer/mod.rs | 37 +++++++- 3 files changed, 153 insertions(+), 6 deletions(-) diff --git a/database/src/metrics.rs b/database/src/metrics.rs index 7ba79cb9..8d52a731 100644 --- a/database/src/metrics.rs +++ b/database/src/metrics.rs @@ -38,4 +38,32 @@ lazy_static! { &["method_name", "table_name"] ) .unwrap(); + + pub(crate) static ref SHARD_DATABASE_WRITE_ELAPSED_TIME: IntCounterVec = register_int_counter_vec( + "shard_database_write_elapsed_time", + "Total elapsed time of shard database write query by shard_id, operation_name, time_elapsed", + &["shard_id", "operation_name", "time_elapsed"] + ) + .unwrap(); + + pub(crate) static ref AFFECTED_ACCOUNTS_COUNT: IntCounterVec = register_int_counter_vec( + "affected_accounts_count", + "Total number of affected accounts by shard_id, operation_name and accounts_number", + &["shard_id", "operation_name", "accounts_number"] + ) + .unwrap(); + + pub(crate) static ref PARTITIONS_TOUCHED_COUNT: IntCounterVec = register_int_counter_vec( + "partitions_touched_count", + "Total number of partitions touched by shard_id, operation_name and partitions_number", + &["shard_id", "operation_name", "partitions_number"] + ) + .unwrap(); + + pub(crate) static ref PARTITION_MAP_TIME_ELAPSED: IntCounterVec = register_int_counter_vec( + "partition_map_time_elapsed", + "Total elapsed time of partition map by shard_id and time_elapsed", + &["shard_id", "time_elapsed"] + ) + .unwrap(); } diff --git a/database/src/postgres/state_indexer/helpers.rs b/database/src/postgres/state_indexer/helpers.rs index 2fa817e7..b4a5db06 100644 --- a/database/src/postgres/state_indexer/helpers.rs +++ b/database/src/postgres/state_indexer/helpers.rs @@ -66,7 +66,7 @@ impl crate::PostgresDBManager { // Compute partition assignments for all account_ids using PostgreSQL's 
hashtext() function // This ensures consistent partition distribution matching the table partitioning scheme - let partition_map = self.partition_map(&pool, &account_ids).await?; + let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group account_ids by their target partition for batch processing // This reduces the number of database queries by updating entire partitions at once @@ -81,6 +81,13 @@ impl crate::PostgresDBManager { tracing::warn!("Partition not found for account_id: {}", account_id); } } + crate::metrics::PARTITIONS_TOUCHED_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &accounts_per_partition.len().to_string(), + ]) + .inc(); // Execute updates in parallel across partitions with concurrency control // Each partition can be updated independently, improving throughput @@ -120,6 +127,13 @@ impl crate::PostgresDBManager { .execute(&pool) .await?; + crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &start.elapsed().as_millis().to_string(), + ]) + .inc(); tracing::debug!( target: "database::postgres::state_indexer", "Update done operation={} partition={} elapsed={:?} rows={}", @@ -177,7 +191,14 @@ impl crate::PostgresDBManager { let pool = self.get_shard_pool(shard_id)?; // Extract account_ids for partition mapping (data_key distribution is handled by account_id partitioning) let account_ids: Vec = updates.iter().map(|(id, _, _)| id.clone()).collect(); - let partition_map = self.partition_map(&pool, &account_ids).await?; + crate::metrics::AFFECTED_ACCOUNTS_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &account_ids.len().to_string(), + ]) + .inc(); + let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group updates by partition, preserving the complete tuple for CTE processing let mut updates_per_partition: HashMap> = @@ -193,6 +214,13 @@ impl crate::PostgresDBManager { 
tracing::warn!("Partition not found for account_id: {}", account_id); } } + crate::metrics::PARTITIONS_TOUCHED_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &updates_per_partition.len().to_string(), + ]) + .inc(); let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); @@ -242,7 +270,13 @@ impl crate::PostgresDBManager { )); let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); - + crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &start.elapsed().as_millis().to_string(), + ]) + .inc(); tracing::debug!( target: "database::postgres::state_indexer", "Update done operation={} partition={} elapsed={:?} rows={}", @@ -288,7 +322,15 @@ impl crate::PostgresDBManager { let pool = self.get_shard_pool(shard_id)?; let account_ids: Vec = inserts.iter().map(|(id, _, _, _)| id.clone()).collect(); - let partition_map = self.partition_map(&pool, &account_ids).await?; + crate::metrics::AFFECTED_ACCOUNTS_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &account_ids.len().to_string(), + ]) + .inc(); + + let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group inserts by partition for efficient batch processing let mut inserts_per_partition: HashMap< @@ -307,6 +349,13 @@ impl crate::PostgresDBManager { tracing::warn!("Partition not found for account_id: {}", account_id); } } + crate::metrics::PARTITIONS_TOUCHED_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &inserts_per_partition.len().to_string(), + ]) + .inc(); let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); @@ -345,6 +394,13 @@ impl crate::PostgresDBManager { let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME + .with_label_values(&[ + &shard_id.to_string(), + 
&operation_name, + &start.elapsed().as_millis().to_string(), + ]) + .inc(); tracing::debug!( target: "database::postgres::state_indexer", "Insert done operation={} partition={} elapsed={:?} rows={}", @@ -393,7 +449,14 @@ impl crate::PostgresDBManager { let pool = self.get_shard_pool(shard_id)?; let account_ids: Vec = inserts.iter().map(|(id, _, _)| id.clone()).collect(); - let partition_map = self.partition_map(&pool, &account_ids).await?; + crate::metrics::AFFECTED_ACCOUNTS_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &account_ids.len().to_string(), + ]) + .inc(); + let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group inserts per partition let mut inserts_per_partition: HashMap< @@ -411,6 +474,13 @@ impl crate::PostgresDBManager { tracing::warn!("Partition not found for account_id: {}", account_id); } } + crate::metrics::PARTITIONS_TOUCHED_COUNT + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &inserts_per_partition.len().to_string(), + ]) + .inc(); let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(super::MAX_CONCURRENT_QUERIES)); @@ -445,6 +515,13 @@ impl crate::PostgresDBManager { let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); + crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME + .with_label_values(&[ + &shard_id.to_string(), + &operation_name, + &start.elapsed().as_millis().to_string(), + ]) + .inc(); tracing::debug!( target: "database::postgres::state_indexer", "Insert done operation={} partition={} elapsed={:?} rows={}", @@ -503,6 +580,7 @@ impl crate::PostgresDBManager { /// hash results regardless of client-side hash implementations or endianness differences. 
pub(crate) async fn partition_map( &self, + shard_id: &near_primitives::types::ShardId, pool: &sqlx::PgPool, account_ids: &Vec, ) -> anyhow::Result> { @@ -527,6 +605,12 @@ impl crate::PostgresDBManager { (account_id, partition) }) .collect(); + crate::metrics::PARTITION_MAP_TIME_ELAPSED + .with_label_values(&[ + &shard_id.to_string(), + &now.elapsed().as_millis().to_string(), + ]) + .inc(); tracing::debug!( target: "database::postgres::state_indexer", "Partition map computed in {:?} for {} accounts", diff --git a/database/src/postgres/state_indexer/mod.rs b/database/src/postgres/state_indexer/mod.rs index 80bdab01..860198bd 100644 --- a/database/src/postgres/state_indexer/mod.rs +++ b/database/src/postgres/state_indexer/mod.rs @@ -429,7 +429,14 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { // - No need for block_height comparison per row (all updates are from same block) let pool = self.get_shard_pool(shard_id)?; let account_ids: Vec = updates.iter().map(|(id, _)| id.clone()).collect(); - let partition_map = self.partition_map(&pool, &account_ids).await?; + crate::metrics::AFFECTED_ACCOUNTS_COUNT + .with_label_values(&[ + &shard_id.to_string(), + "update_state_changes_access_key", + &account_ids.len().to_string(), + ]) + .inc(); + let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group updates by partition but keep them as simple (account_id, data_key) pairs let mut updates_per_partition: HashMap> = HashMap::new(); @@ -441,6 +448,13 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { .push((account_id, data_key)); } } + crate::metrics::PARTITIONS_TOUCHED_COUNT + .with_label_values(&[ + &shard_id.to_string(), + "update_state_changes_access_key", + &updates_per_partition.len().to_string(), + ]) + .inc(); // Parallel update execution per partition using the UNNEST pattern let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); @@ -482,6 +496,13 @@ impl 
crate::StateIndexerDbManager for crate::PostgresDBManager { .execute(&pool) .await?; + crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME + .with_label_values(&[ + &shard_id.to_string(), + "update_state_changes_access_key", + &start.elapsed().as_millis().to_string(), + ]) + .inc(); tracing::debug!( target: "database::postgres::state_indexer", "Update done partition={} elapsed={:?} rows={}", @@ -588,6 +609,13 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { _ => None, }) .collect(); + crate::metrics::AFFECTED_ACCOUNTS_COUNT + .with_label_values(&[ + &shard_id.to_string(), + "update_state_changes_contract", + &accounts.len().to_string(), + ]) + .inc(); // Use the account-only update helper self.execute_partitioned_account_update( @@ -691,6 +719,13 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { _ => None, }) .collect(); + crate::metrics::AFFECTED_ACCOUNTS_COUNT + .with_label_values(&[ + &shard_id.to_string(), + "update_state_changes_account", + &accounts.len().to_string(), + ]) + .inc(); // Use the account-only update helper self.execute_partitioned_account_update( From bc7f3c7cbe13032e86e6f68f0f9ad08e2fe26377 Mon Sep 17 00:00:00 2001 From: Bohdan Khorolets Date: Mon, 28 Jul 2025 12:07:04 +0300 Subject: [PATCH 04/13] refactor(database, state-indexer): Add postgresql function to match partition number. 
Switch CTE to unnest for updates --- ...reate_get_text_partition_function.down.sql | 1 + ..._create_get_text_partition_function.up.sql | 16 ++ database/src/postgres/rpc_server.rs | 253 ++++++------------ .../src/postgres/state_indexer/helpers.rs | 109 ++++---- database/src/postgres/state_indexer/mod.rs | 147 ++-------- readnode-primitives/src/lib.rs | 49 +--- 6 files changed, 178 insertions(+), 397 deletions(-) create mode 100644 database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql create mode 100644 database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.up.sql diff --git a/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql b/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql new file mode 100644 index 00000000..7de51251 --- /dev/null +++ b/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS get_text_partition(TEXT, INT); \ No newline at end of file diff --git a/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.up.sql b/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.up.sql new file mode 100644 index 00000000..2a0790b5 --- /dev/null +++ b/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.up.sql @@ -0,0 +1,16 @@ +CREATE OR REPLACE FUNCTION get_text_partition(value TEXT, partitions INT) +RETURNS INT AS $$ +DECLARE + hash_val numeric; +BEGIN + hash_val := ( + ( + hashtextextended(value, 8816678312871386365)::numeric + + 5305509591434766563 + + 18446744073709551616 + ) % 18446744073709551616 + ); + + RETURN (hash_val % partitions)::int; +END; +$$ LANGUAGE plpgsql IMMUTABLE; diff --git a/database/src/postgres/rpc_server.rs b/database/src/postgres/rpc_server.rs index 
4e67dde4..01614f9d 100644 --- a/database/src/postgres/rpc_server.rs +++ b/database/src/postgres/rpc_server.rs @@ -76,36 +76,14 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { }; let mut stream = sqlx::query_as::<_, (String, Vec)>( " - WITH latest_blocks AS ( - SELECT - data_key, - MAX(block_height) AS max_block_height - FROM - state_changes_data - WHERE - account_id = $1 - AND block_height <= $2 - GROUP BY - data_key - ) - SELECT - sc.data_key, - sc.data_value - FROM - state_changes_data sc - INNER JOIN latest_blocks lb - ON - sc.data_key = lb.data_key - AND sc.block_height = lb.max_block_height - WHERE - sc.account_id = $1 - AND sc.data_value IS NOT NULL - AND ( - $3 IS NULL OR - sc.data_key > $3 - ) + SELECT data_key, data_value + FROM state_changes_data_compact + WHERE account_id = $1 + AND block_height_from <= $2 + AND (block_height_to IS NULL OR block_height_to > $2) + AND ($3 IS NULL OR data_key > $3) ORDER BY - sc.data_key ASC + data_key ASC LIMIT $4; ", ) @@ -153,31 +131,12 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { let mut items = std::collections::HashMap::new(); let mut stream = sqlx::query_as::<_, (String, Vec)>( " - WITH latest_blocks AS ( - SELECT - data_key, - MAX(block_height) AS max_block_height - FROM - state_changes_data - WHERE - account_id = $1 - AND data_key LIKE $2 - AND block_height <= $3 - GROUP BY - data_key - ) - SELECT - sc.data_key, - sc.data_value - FROM - state_changes_data sc - INNER JOIN latest_blocks lb - ON - sc.data_key = lb.data_key - AND sc.block_height = lb.max_block_height - WHERE - sc.account_id = $1 - AND sc.data_value IS NOT NULL; + SELECT data_key, data_value + FROM state_changes_data_compact + WHERE account_id = $1 + AND data_key LIKE $2 + AND block_height_from <= $3 + AND (block_height_to IS NULL OR block_height_to > $3); ", ) .bind(account_id.to_string()) @@ -210,30 +169,11 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { let mut items = std::collections::HashMap::new(); 
let mut stream = sqlx::query_as::<_, (String, Vec)>( " - WITH latest_blocks AS ( - SELECT - data_key, - MAX(block_height) AS max_block_height - FROM - state_changes_data - WHERE - account_id = $1 - AND block_height <= $2 - GROUP BY - data_key - ) - SELECT - sc.data_key, - sc.data_value - FROM - state_changes_data sc - INNER JOIN latest_blocks lb - ON - sc.data_key = lb.data_key - AND sc.block_height = lb.max_block_height - WHERE - sc.account_id = $1 - AND sc.data_value IS NOT NULL; + SELECT data_key, data_value + FROM state_changes_data_compact + WHERE account_id = $1 + AND block_height_from <= $2 + AND (block_height_to IS NULL OR block_height_to > $2); ", ) .bind(account_id.to_string()) @@ -266,12 +206,13 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { .inc(); let (data_value,): (Vec,) = sqlx::query_as( " - SELECT data_value - FROM state_changes_data + SELECT data_key, data_value + FROM state_changes_data_compact WHERE account_id = $1 - AND data_key = $2 - AND block_height <= $3 - ORDER BY block_height DESC + AND data_key = $2 + AND block_height_from <= $3 + AND (block_height_to IS NULL OR block_height_to > $3) + ORDER BY block_height_from DESC LIMIT 1; ", ) @@ -290,6 +231,12 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { method_name: &str, ) -> anyhow::Result> { let shard_id_pool = self.get_shard_connection(account_id).await?; + tracing::debug!( + "Getting account data for {} at block height {} from shard {}", + account_id, + request_block_height, + shard_id_pool.shard_id + ); crate::metrics::SHARD_DATABASE_READ_QUERIES .with_label_values(&[ &shard_id_pool.shard_id.to_string(), @@ -297,27 +244,28 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { "state_changes_account", ]) .inc(); - let (block_height, block_hash, data_value): (bigdecimal::BigDecimal, String, Vec) = - sqlx::query_as( - " - SELECT block_height, block_hash, data_value - FROM state_changes_account + let result: (Vec, bigdecimal::BigDecimal) = sqlx::query_as( + 
" + SELECT data_value, block_height_from + FROM state_changes_account_compact WHERE account_id = $1 - AND block_height <= $2 - ORDER BY block_height DESC + AND block_height_from <= $2 + AND (block_height_to IS NULL OR block_height_to > $2) + ORDER BY block_height_from DESC LIMIT 1; ", - ) - .bind(account_id.to_string()) - .bind(bigdecimal::BigDecimal::from(request_block_height)) - .fetch_one(shard_id_pool.pool) - .await?; - let block = readnode_primitives::BlockRecord::try_from((block_hash, block_height))?; - readnode_primitives::QueryData::::try_from(( - data_value, - block.height, - block.hash, - )) + ) + .bind(account_id.to_string()) + .bind(bigdecimal::BigDecimal::from(request_block_height)) + .fetch_one(shard_id_pool.pool) + .await?; + tracing::debug!( + "Fetched account data for {} at block height {}: {:?}", + account_id, + request_block_height, + result + ); + readnode_primitives::QueryData::::try_from(result) } async fn get_contract_code( @@ -334,27 +282,23 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { "state_changes_contract", ]) .inc(); - let (block_height, block_hash, contract_code): (bigdecimal::BigDecimal, String, Vec) = - sqlx::query_as( - " - SELECT block_height, block_hash, data_value - FROM state_changes_contract + let result: (Vec, bigdecimal::BigDecimal) = sqlx::query_as( + " + SELECT data_value, block_height_from + FROM state_changes_contract_compact WHERE account_id = $1 - AND block_height <= $2 - ORDER BY block_height DESC + AND block_height_from <= $2 + AND (block_height_to IS NULL OR block_height_to > $2) + ORDER BY block_height_from DESC LIMIT 1; ", - ) - .bind(account_id.to_string()) - .bind(bigdecimal::BigDecimal::from(request_block_height)) - .fetch_one(shard_id_pool.pool) - .await?; - let block = readnode_primitives::BlockRecord::try_from((block_hash, block_height))?; - Ok(readnode_primitives::QueryData { - data: contract_code, - block_height: block.height, - block_hash: block.hash, - }) + ) + 
.bind(account_id.to_string()) + .bind(bigdecimal::BigDecimal::from(request_block_height)) + .fetch_one(shard_id_pool.pool) + .await?; + + readnode_primitives::QueryData::>::try_from(result) } async fn get_access_key( @@ -373,29 +317,24 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ]) .inc(); let key_data = borsh::to_vec(&public_key)?; - let (block_height, block_hash, data_value): (bigdecimal::BigDecimal, String, Vec) = - sqlx::query_as( - " - SELECT block_height, block_hash, data_value - FROM state_changes_access_key + let result: (Vec, bigdecimal::BigDecimal) = sqlx::query_as( + " + SELECT data_value, block_height_from + FROM state_changes_access_key_compact WHERE account_id = $1 - AND data_key = $2 - AND block_height <= $3 - ORDER BY block_height DESC + AND data_key = $2 + AND block_height_from <= $3 + AND (block_height_to IS NULL OR block_height_to > $3) + ORDER BY block_height_from DESC LIMIT 1; ", - ) - .bind(account_id.to_string()) - .bind(hex::encode(&key_data).to_string()) - .bind(bigdecimal::BigDecimal::from(request_block_height)) - .fetch_one(shard_id_pool.pool) - .await?; - let block = readnode_primitives::BlockRecord::try_from((block_hash, block_height))?; - readnode_primitives::QueryData::::try_from(( - data_value, - block.height, - block.hash, - )) + ) + .bind(account_id.to_string()) + .bind(hex::encode(&key_data).to_string()) + .bind(bigdecimal::BigDecimal::from(request_block_height)) + .fetch_one(shard_id_pool.pool) + .await?; + readnode_primitives::QueryData::::try_from(result) } async fn get_account_access_keys( @@ -413,42 +352,20 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ]) .inc(); let mut access_keys = vec![]; - let mut stream = sqlx::query_as::<_, (String, Vec, bigdecimal::BigDecimal)>( + let mut stream = sqlx::query_as::<_, (String, Vec)>( " - WITH latest_blocks AS ( - SELECT - data_key, - account_id, - MAX(block_height) as max_block_height - FROM - state_changes_access_key - WHERE - account_id = $1 - AND 
block_height <= $2 - GROUP BY - data_key, - account_id - ) - SELECT - sc.data_key, - sc.data_value, - sc.block_height - FROM - state_changes_access_key sc - INNER JOIN latest_blocks lb - ON - sc.data_key = lb.data_key - AND sc.block_height = lb.max_block_height - AND sc.account_id = lb.account_id - WHERE - sc.data_value IS NOT NULL; + SELECT data_key, data_value + FROM state_changes_access_key_compact + WHERE account_id = $1 + AND block_height_from <= $2 + AND (block_height_to IS NULL OR block_height_to > $2); ", ) .bind(account_id.to_string()) .bind(bigdecimal::BigDecimal::from(block_height)) .fetch(shard_id_pool.pool); while let Some(row) = stream.next().await { - let (public_key_hex, access_key, _): (String, Vec, _) = row?; + let (public_key_hex, access_key): (String, Vec) = row?; let access_key_view = near_primitives::views::AccessKeyInfoView { public_key: borsh::from_slice::(&hex::decode( public_key_hex, diff --git a/database/src/postgres/state_indexer/helpers.rs b/database/src/postgres/state_indexer/helpers.rs index b4a5db06..219748d7 100644 --- a/database/src/postgres/state_indexer/helpers.rs +++ b/database/src/postgres/state_indexer/helpers.rs @@ -155,28 +155,11 @@ impl crate::PostgresDBManager { /// Helper function for partitioned updates with composite key (account_id + data_key) /// - /// This helper is used for tables that have composite primary keys with both account_id - /// and data_key components (like state_changes_data table), where we need precise - /// row-level updates based on both key components. + /// This helper now uses `UNNEST()` instead of CTE for performance: + /// - `UNNEST()` is much faster for large batches compared to VALUES in CTE. + /// - Allows PostgreSQL to stream and avoid heavy planning overhead. /// - /// SQL Pattern: Uses CTE (Common Table Expression) with VALUES for structured data - /// ```sql - /// WITH new_data (account_id, data_key, block_height) AS ( - /// VALUES ('acc1', 'key1', 100), ('acc2', 'key2', 100), ... 
- /// ) - /// UPDATE table AS old - /// SET block_height_to = new_data.block_height - /// FROM new_data - /// WHERE old.account_id = new_data.account_id - /// AND old.data_key = new_data.data_key - /// AND old.block_height_from < new_data.block_height - /// AND old.block_height_to IS NULL; - /// ``` - /// - /// The CTE approach is necessary here because we need to match on multiple columns - /// with different block_height values per row, which UNNEST cannot handle efficiently. - /// - /// Used by: update_state_changes_data + /// Uses: `update_state_changes_data` pub(crate) async fn execute_partitioned_key_update( &self, shard_id: near_primitives::types::ShardId, @@ -189,8 +172,9 @@ impl crate::PostgresDBManager { } let pool = self.get_shard_pool(shard_id)?; - // Extract account_ids for partition mapping (data_key distribution is handled by account_id partitioning) let account_ids: Vec = updates.iter().map(|(id, _, _)| id.clone()).collect(); + + // Metrics for accounts affected crate::metrics::AFFECTED_ACCOUNTS_COUNT .with_label_values(&[ &shard_id.to_string(), @@ -198,9 +182,10 @@ impl crate::PostgresDBManager { &account_ids.len().to_string(), ]) .inc(); + let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; - // Group updates by partition, preserving the complete tuple for CTE processing + // Group updates per partition let mut updates_per_partition: HashMap> = HashMap::new(); for (account_id, data_key, block_height) in updates { @@ -210,10 +195,10 @@ impl crate::PostgresDBManager { data_key, block_height, )); - } else { - tracing::warn!("Partition not found for account_id: {}", account_id); } } + + // Track partitions touched crate::metrics::PARTITIONS_TOUCHED_COUNT .with_label_values(&[ &shard_id.to_string(), @@ -236,40 +221,34 @@ impl crate::PostgresDBManager { let _permit = semaphore.acquire_owned().await.unwrap(); let start = Instant::now(); - // Build CTE-based UPDATE query using sqlx QueryBuilder for type safety - // CTE allows us 
to provide structured data (account_id, data_key, block_height) - // and join it efficiently with the target table for precise updates - let mut qb = sqlx::QueryBuilder::new( - "WITH new_data (account_id, data_key, block_height) AS (", - ); + // Use unzip_n_vec to split tuples into separate Vecs for UNNEST + let (account_ids, data_keys, block_heights): (Vec<_>, Vec<_>, Vec<_>) = + unzip_n_vec(rows); - qb.push_values( - rows.iter(), - |mut row, (account_id, data_key, block_height)| { - row.push_bind(account_id) - .push_bind(data_key) - .push_bind(block_height); - }, + let query = format!( + r#" + UPDATE {table_name} AS old + SET block_height_to = new_data.block_height + FROM ( + SELECT UNNEST($1::text[]) AS account_id, + UNNEST($2::text[]) AS data_key, + UNNEST($3::numeric[]) AS block_height + ) AS new_data + WHERE old.account_id = new_data.account_id + AND old.data_key = new_data.data_key + AND old.block_height_from < new_data.block_height + AND old.block_height_to IS NULL; + "#, ); - // Complete the CTE and add the UPDATE clause with all necessary conditions - // The four AND conditions ensure data integrity and proper versioning: - // 1. account_id match - partition-level key - // 2. data_key match - row-level key - // 3. block_height comparison - prevents updating newer data with older data - // 4. 
NULL check - only update active records (not already closed) - qb.push(format!( - ") UPDATE {} AS old \ - SET block_height_to = new_data.block_height \ - FROM new_data \ - WHERE old.account_id = new_data.account_id \ - AND old.data_key = new_data.data_key \ - AND old.block_height_from < new_data.block_height \ - AND old.block_height_to IS NULL;", - table_name, - )); + let result = sqlx::query(&query) + .bind(&account_ids) + .bind(&data_keys) + .bind(&block_heights) + .execute(&pool) + .await + .map_err(anyhow::Error::from); - let result = qb.build().execute(&pool).await.map_err(anyhow::Error::from); crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME .with_label_values(&[ &shard_id.to_string(), @@ -277,13 +256,14 @@ impl crate::PostgresDBManager { &start.elapsed().as_millis().to_string(), ]) .inc(); + tracing::debug!( target: "database::postgres::state_indexer", "Update done operation={} partition={} elapsed={:?} rows={}", operation_name, partition_id, start.elapsed(), - rows.len() + account_ids.len() ); result @@ -589,8 +569,8 @@ impl crate::PostgresDBManager { // Execute partition calculation in PostgreSQL to ensure consistency // This MUST use the same hash function and modulo as the partitioned table definitions let partition_rows = sqlx::query( - "SELECT account_id, mod(hashtext(account_id), $2)::int AS partition - FROM unnest($1::text[]) AS account_id", + "SELECT account_id, get_text_partition(account_id, $2) AS partition + FROM unnest($1::text[]) AS account_id", ) .bind(account_ids) .bind(super::PARTITIONS) @@ -620,3 +600,16 @@ impl crate::PostgresDBManager { Ok(partition_map) } } + +/// Utility to unzip Vec of 3-tuples into 3 separate Vecs +fn unzip_n_vec(input: Vec<(T1, T2, T3)>) -> (Vec, Vec, Vec) { + let mut v1 = Vec::with_capacity(input.len()); + let mut v2 = Vec::with_capacity(input.len()); + let mut v3 = Vec::with_capacity(input.len()); + for (a, b, c) in input { + v1.push(a); + v2.push(b); + v3.push(c); + } + (v1, v2, v3) +} diff --git 
a/database/src/postgres/state_indexer/mod.rs b/database/src/postgres/state_indexer/mod.rs index 860198bd..4f3d587f 100644 --- a/database/src/postgres/state_indexer/mod.rs +++ b/database/src/postgres/state_indexer/mod.rs @@ -365,44 +365,22 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { Ok(()) } - /// Update access key state changes - SPECIAL CASE: In-place implementation - /// - /// This method is NOT using execute_partitioned_key_update helper, and here's why: - /// - /// TECHNICAL REASONING: - /// 1. **Different SQL Pattern**: Access key updates use UNNEST with separate arrays - /// instead of CTE (Common Table Expression) used by data updates - /// 2. **Uniform Block Height**: All updates in a batch share the same block_height, - /// unlike data updates where each row might have different block heights - /// 3. **Simpler Matching**: Only needs (account_id, data_key) pairs for matching, - /// doesn't need the complex 4-condition WHERE clause of data updates - /// - /// SQL PATTERN COMPARISON: - /// - Data updates (CTE): WITH new_data(...) UPDATE table SET... FROM new_data WHERE... - /// - Access key updates: UPDATE table SET... FROM (SELECT unnest(...)) WHERE... - /// - /// PERFORMANCE CONSIDERATIONS: - /// - UNNEST with arrays is more efficient for simple key-pair matching - /// - CTE is better for complex multi-column operations with varying data - /// - The uniform block_height allows using a single parameter ($3) instead of per-row values - /// - /// This is a legitimate architectural decision, not an oversight in refactoring. 
+ // TODO: provide docstring async fn update_state_changes_access_key( &self, shard_id: near_primitives::types::ShardId, state_changes: Vec, block_height: u64, ) -> anyhow::Result<()> { - let overall_start = Instant::now(); self.record_shard_write_metric( shard_id, "save_state_changes_access_key", "state_changes_access_key", ); - // Collect updates: (account_id, data_key) pairs for access key modifications - // Note: We only need the key pairs since all updates share the same block_height - let updates: Vec<(String, String)> = state_changes + // Collect updates as triples (account_id, data_key, block_height) + let block_height_bd = bigdecimal::BigDecimal::from(block_height); + let updates: Vec<(String, String, bigdecimal::BigDecimal)> = state_changes .iter() .filter_map(|c| match &c.value { near_primitives::views::StateChangeValueView::AccessKeyUpdate { @@ -413,7 +391,11 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { | near_primitives::views::StateChangeValueView::AccessKeyDeletion { account_id, public_key, - } => Some((account_id.to_string(), hex::encode(public_key.key_data()))), + } => Some(( + account_id.to_string(), + hex::encode(public_key.key_data()), + block_height_bd.clone(), // same height for all rows + )), _ => None, }) .collect(); @@ -422,111 +404,14 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { return Ok(()); } - // IMPLEMENTATION NOTE: Custom partitioning logic (not using execute_partitioned_key_update) - // This is intentional because access key updates have different requirements: - // - Uniform block_height for all updates (allows single parameter binding) - // - Simple (account_id, data_key) matching (UNNEST more efficient than CTE) - // - No need for block_height comparison per row (all updates are from same block) - let pool = self.get_shard_pool(shard_id)?; - let account_ids: Vec = updates.iter().map(|(id, _)| id.clone()).collect(); - crate::metrics::AFFECTED_ACCOUNTS_COUNT - .with_label_values(&[ - 
&shard_id.to_string(), - "update_state_changes_access_key", - &account_ids.len().to_string(), - ]) - .inc(); - let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; - - // Group updates by partition but keep them as simple (account_id, data_key) pairs - let mut updates_per_partition: HashMap> = HashMap::new(); - for (account_id, data_key) in updates { - if let Some(&partition) = partition_map.get(&account_id) { - updates_per_partition - .entry(partition) - .or_default() - .push((account_id, data_key)); - } - } - crate::metrics::PARTITIONS_TOUCHED_COUNT - .with_label_values(&[ - &shard_id.to_string(), - "update_state_changes_access_key", - &updates_per_partition.len().to_string(), - ]) - .inc(); - - // Parallel update execution per partition using the UNNEST pattern - let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(MAX_CONCURRENT_QUERIES)); - let mut tasks = Vec::new(); - for (partition_id, rows) in updates_per_partition { - let pool = pool.clone(); - let semaphore = semaphore.clone(); - let block_height_bd = bigdecimal::BigDecimal::from(block_height); - - let task = tokio::spawn(async move { - let _permit = semaphore.acquire_owned().await.unwrap(); - let start = Instant::now(); - - // Separate account_ids and data_keys into parallel arrays for UNNEST - // This is the key difference from CTE approach used in data updates - let (account_ids, data_keys): (Vec<_>, Vec<_>) = rows.into_iter().unzip(); - - // UNNEST PATTERN: Convert arrays to rows and JOIN for batch updates - // This is more efficient than CTE when all updates share the same block_height - // and only need simple key-pair matching without per-row data variations - let query = format!( - r#" - UPDATE state_changes_access_key_compact_{partition_id} AS t - SET block_height_to = $3 - FROM ( - SELECT unnest($1::text[]) AS account_id, unnest($2::text[]) AS data_key - ) AS u - WHERE t.account_id = u.account_id - AND t.data_key = u.data_key - AND t.block_height_to IS NULL; 
- "#, - partition_id = partition_id - ); - - sqlx::query(&query) - .bind(&account_ids) - .bind(&data_keys) - .bind(&block_height_bd) - .execute(&pool) - .await?; - - crate::metrics::SHARD_DATABASE_WRITE_ELAPSED_TIME - .with_label_values(&[ - &shard_id.to_string(), - "update_state_changes_access_key", - &start.elapsed().as_millis().to_string(), - ]) - .inc(); - tracing::debug!( - target: "database::postgres::state_indexer", - "Update done partition={} elapsed={:?} rows={}", - partition_id, - start.elapsed(), - account_ids.len() - ); - - Ok::<(), anyhow::Error>(()) - }); - - tasks.push(task); - } - - try_join_all(tasks).await?; - - tracing::debug!( - target: "database::postgres::state_indexer", - "Total update_state_changes_access_key duration shard={} elapsed={:?}", + // Delegate to the same partitioned update helper + self.execute_partitioned_key_update( shard_id, - overall_start.elapsed() - ); - - Ok(()) + "state_changes_access_key_compact".to_string(), + "update_state_changes_access_key".to_string(), + updates, + ) + .await } async fn insert_state_changes_contract( diff --git a/readnode-primitives/src/lib.rs b/readnode-primitives/src/lib.rs index 35c3ce5c..93b803d6 100644 --- a/readnode-primitives/src/lib.rs +++ b/readnode-primitives/src/lib.rs @@ -220,12 +220,11 @@ pub type StateValue = Vec; pub struct BlockHeightShardId(pub u64, pub u64); pub struct QueryData { pub data: T, - // block_height and block_hash we return here represents the moment + // block_height we return here represents the moment // when the data was last updated in the database // We used to return it in the `QueryResponse` but it was replaced with // the logic that corresponds the logic of the `nearcore` RPC API pub block_height: near_indexer_primitives::types::BlockHeight, - pub block_hash: CryptoHash, } #[derive(Debug, Clone)] @@ -300,31 +299,20 @@ where } } -impl - TryFrom<( - Vec, - near_indexer_primitives::types::BlockHeight, - CryptoHash, - )> for QueryData +impl TryFrom<(Vec, B)> for 
QueryData where T: borsh::BorshDeserialize, + B: ToPrimitive, { type Error = anyhow::Error; - fn try_from( - value: ( - Vec, - near_indexer_primitives::types::BlockHeight, - CryptoHash, - ), - ) -> Result { + fn try_from(value: (Vec, B)) -> Result { let data = T::try_from_slice(&value.0)?; - - Ok(Self { - data, - block_height: value.1, - block_hash: value.2, - }) + let block_height = value + .1 + .to_u64() + .ok_or_else(|| anyhow::anyhow!("Failed to parse `block_height` to u64"))?; + Ok(Self { data, block_height }) } } @@ -415,22 +403,3 @@ where }) } } - -impl TryFrom<(String, T)> for BlockRecord -where - T: ToPrimitive, -{ - type Error = anyhow::Error; - - fn try_from(value: (String, T)) -> Result { - let height = value - .1 - .to_u64() - .ok_or_else(|| anyhow::anyhow!("Failed to parse `block_height` to u64"))?; - let hash = CryptoHash::from_str(&value.0).map_err(|err| { - anyhow::anyhow!("Failed to parse `block_hash` to CryptoHash: {}", err) - })?; - - Ok(BlockRecord { height, hash }) - } -} From 98c92469f49d23a64ab56edb1c493d3c05b786f2 Mon Sep 17 00:00:00 2001 From: Bohdan Khorolets Date: Wed, 30 Jul 2025 09:34:52 +0300 Subject: [PATCH 05/13] refactor(database, state-indexer): Replace numberic(20,0) for block_heights to biging (i64) to speed inserts and updates up --- .../src/postgres/state_indexer/helpers.rs | 20 +++++-------- database/src/postgres/state_indexer/mod.rs | 30 ++++++++----------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/database/src/postgres/state_indexer/helpers.rs b/database/src/postgres/state_indexer/helpers.rs index 219748d7..292e1eea 100644 --- a/database/src/postgres/state_indexer/helpers.rs +++ b/database/src/postgres/state_indexer/helpers.rs @@ -165,7 +165,7 @@ impl crate::PostgresDBManager { shard_id: near_primitives::types::ShardId, table_prefix: String, operation_name: String, - updates: Vec<(String, String, bigdecimal::BigDecimal)>, // (account_id, data_key, block_height) + updates: Vec<(String, String, i64)>, // 
(account_id, data_key, block_height) ) -> anyhow::Result<()> { if updates.is_empty() { return Ok(()); @@ -186,8 +186,7 @@ impl crate::PostgresDBManager { let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group updates per partition - let mut updates_per_partition: HashMap> = - HashMap::new(); + let mut updates_per_partition: HashMap> = HashMap::new(); for (account_id, data_key, block_height) in updates { if let Some(&partition) = partition_map.get(&account_id) { updates_per_partition.entry(partition).or_default().push(( @@ -294,7 +293,7 @@ impl crate::PostgresDBManager { shard_id: near_primitives::types::ShardId, table_prefix: String, operation_name: String, - inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)>, // (account_id, data_key, data_value, block_height) + inserts: Vec<(String, String, Vec, i64)>, // (account_id, data_key, data_value, block_height) ) -> anyhow::Result<()> { if inserts.is_empty() { return Ok(()); @@ -313,10 +312,8 @@ impl crate::PostgresDBManager { let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; // Group inserts by partition for efficient batch processing - let mut inserts_per_partition: HashMap< - i32, - Vec<(String, String, Vec, bigdecimal::BigDecimal)>, - > = HashMap::new(); + let mut inserts_per_partition: HashMap, i64)>> = + HashMap::new(); for (account_id, data_key, data_value, block_height) in inserts { if let Some(&partition) = partition_map.get(&account_id) { inserts_per_partition.entry(partition).or_default().push(( @@ -421,7 +418,7 @@ impl crate::PostgresDBManager { shard_id: near_primitives::types::ShardId, table_prefix: String, operation_name: String, - inserts: Vec<(String, Vec, bigdecimal::BigDecimal)>, // (account_id, data_value, block_height) + inserts: Vec<(String, Vec, i64)>, // (account_id, data_value, block_height) ) -> anyhow::Result<()> { if inserts.is_empty() { return Ok(()); @@ -439,10 +436,7 @@ impl crate::PostgresDBManager { let partition_map = 
self.partition_map(&shard_id, &pool, &account_ids).await?; // Group inserts per partition - let mut inserts_per_partition: HashMap< - i32, - Vec<(String, Vec, bigdecimal::BigDecimal)>, - > = HashMap::new(); + let mut inserts_per_partition: HashMap, i64)>> = HashMap::new(); for (account_id, data_value, block_height) in inserts { if let Some(&partition) = partition_map.get(&account_id) { inserts_per_partition.entry(partition).or_default().push(( diff --git a/database/src/postgres/state_indexer/mod.rs b/database/src/postgres/state_indexer/mod.rs index 4f3d587f..47e79b54 100644 --- a/database/src/postgres/state_indexer/mod.rs +++ b/database/src/postgres/state_indexer/mod.rs @@ -229,7 +229,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { self.record_shard_write_metric(shard_id, "save_state_changes_data", "state_changes_data"); // Extract relevant data - let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes + let inserts: Vec<(String, String, Vec, i64)> = state_changes .iter() .filter_map(|change| { if let near_primitives::views::StateChangeValueView::DataUpdate { @@ -243,7 +243,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { account_id.to_string(), data_key, value.clone().to_vec(), - bigdecimal::BigDecimal::from(block_height), + block_height as i64, // Convert to i64 for database compatibility )) } else { None @@ -269,7 +269,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { ) -> anyhow::Result<()> { self.record_shard_write_metric(shard_id, "save_state_changes_data", "state_changes_data"); - let updates: Vec<(String, String, bigdecimal::BigDecimal)> = + let updates: Vec<(String, String, i64)> = state_changes .iter() .filter_map(|change| match &change.value { @@ -287,7 +287,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { Some(( account_id.to_string(), data_key, - bigdecimal::BigDecimal::from(block_height), + block_height as i64, // Convert to i64 for database 
compatibility )) } _ => None, @@ -320,7 +320,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { ); // Extract relevant updates - let inserts: Vec<(String, String, Vec, bigdecimal::BigDecimal)> = state_changes + let inserts: Vec<(String, String, Vec, i64)> = state_changes .iter() .filter_map(|change| { if let near_primitives::views::StateChangeValueView::AccessKeyUpdate { @@ -338,7 +338,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { account_id.to_string(), data_key, data_value, - bigdecimal::BigDecimal::from(block_height), + block_height as i64, // Convert to i64 for database compatibility )) } else { None @@ -379,8 +379,8 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { ); // Collect updates as triples (account_id, data_key, block_height) - let block_height_bd = bigdecimal::BigDecimal::from(block_height); - let updates: Vec<(String, String, bigdecimal::BigDecimal)> = state_changes + let block_height_bd = block_height as i64; // Convert to i64 for database compatibility + let updates: Vec<(String, String, i64)> = state_changes .iter() .filter_map(|c| match &c.value { near_primitives::views::StateChangeValueView::AccessKeyUpdate { @@ -394,7 +394,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { } => Some(( account_id.to_string(), hex::encode(public_key.key_data()), - block_height_bd.clone(), // same height for all rows + block_height_bd, // same height for all rows )), _ => None, }) @@ -428,7 +428,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { ); // Extract only ContractCodeUpdate - let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes + let inserts: Vec<(String, Vec, i64)> = state_changes .into_iter() .filter_map(|change| { if let near_primitives::views::StateChangeValueView::ContractCodeUpdate { @@ -436,11 +436,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { code, } = change.value { - Some(( - account_id.to_string(), - 
code.to_vec(), - bigdecimal::BigDecimal::from(block_height), - )) + Some((account_id.to_string(), code.to_vec(), block_height as i64)) } else { None } @@ -536,7 +532,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { ); // Extract account updates - let inserts: Vec<(String, Vec, bigdecimal::BigDecimal)> = state_changes + let inserts: Vec<(String, Vec, i64)> = state_changes .into_iter() .filter_map(|change| { if let near_primitives::views::StateChangeValueView::AccountUpdate { @@ -550,7 +546,7 @@ impl crate::StateIndexerDbManager for crate::PostgresDBManager { Some(( account_id.to_string(), data_value, - bigdecimal::BigDecimal::from(block_height), + block_height as i64, // Convert to i64 for database compatibility )) } else { None From b34a87dbafe72ca4291e28a79d1de32b5c0db99a Mon Sep 17 00:00:00 2001 From: Bohdan Khorolets Date: Wed, 30 Jul 2025 10:05:55 +0300 Subject: [PATCH 06/13] refactor(database, rpc-server): Update read queries related to states to use i64 instead of BigDecimal --- database/src/postgres/rpc_server.rs | 22 +++++++++---------- .../src/postgres/state_indexer/helpers.rs | 17 +++++++++----- database/src/postgres/state_indexer/mod.rs | 3 +-- readnode-primitives/src/lib.rs | 11 ++++------ 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/database/src/postgres/rpc_server.rs b/database/src/postgres/rpc_server.rs index 01614f9d..fc0b161f 100644 --- a/database/src/postgres/rpc_server.rs +++ b/database/src/postgres/rpc_server.rs @@ -88,7 +88,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ", ) .bind(account_id.to_string()) - .bind(bigdecimal::BigDecimal::from(block_height)) + .bind(block_height as i64) // Convert to i64 for database compatibility .bind(page_state.last_data_key.clone()) .bind(page_state.page_size) .fetch(shard_id_pool.pool); @@ -141,7 +141,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ) .bind(account_id.to_string()) .bind(format!("{}%", hex::encode(prefix))) - 
.bind(bigdecimal::BigDecimal::from(block_height)) + .bind(block_height as i64) // Convert to i64 for database compatibility .fetch(shard_id_pool.pool); while let Some(row) = stream.next().await { let (key, value): (String, Vec) = row?; @@ -177,7 +177,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ", ) .bind(account_id.to_string()) - .bind(bigdecimal::BigDecimal::from(block_height)) + .bind(block_height as i64) // Convert to i64 for database compatibility .fetch(shard_id_pool.pool); while let Some(row) = stream.next().await { let (key, value): (String, Vec) = row?; @@ -218,7 +218,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ) .bind(account_id.to_string()) .bind(hex::encode(&key_data).to_string()) - .bind(bigdecimal::BigDecimal::from(block_height)) + .bind(block_height as i64) // Convert to i64 for database compatibility .fetch_one(shard_id_pool.pool) .await?; Ok((key_data, data_value)) @@ -244,7 +244,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { "state_changes_account", ]) .inc(); - let result: (Vec, bigdecimal::BigDecimal) = sqlx::query_as( + let result: (Vec, i64) = sqlx::query_as( " SELECT data_value, block_height_from FROM state_changes_account_compact @@ -256,7 +256,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ", ) .bind(account_id.to_string()) - .bind(bigdecimal::BigDecimal::from(request_block_height)) + .bind(request_block_height as i64) // Convert to i64 for database compatibility .fetch_one(shard_id_pool.pool) .await?; tracing::debug!( @@ -282,7 +282,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { "state_changes_contract", ]) .inc(); - let result: (Vec, bigdecimal::BigDecimal) = sqlx::query_as( + let result: (Vec, i64) = sqlx::query_as( " SELECT data_value, block_height_from FROM state_changes_contract_compact @@ -294,7 +294,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ", ) .bind(account_id.to_string()) - 
.bind(bigdecimal::BigDecimal::from(request_block_height)) + .bind(request_block_height as i64) // Convert to i64 for database compatibility .fetch_one(shard_id_pool.pool) .await?; @@ -317,7 +317,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ]) .inc(); let key_data = borsh::to_vec(&public_key)?; - let result: (Vec, bigdecimal::BigDecimal) = sqlx::query_as( + let result: (Vec, i64) = sqlx::query_as( " SELECT data_value, block_height_from FROM state_changes_access_key_compact @@ -331,7 +331,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ) .bind(account_id.to_string()) .bind(hex::encode(&key_data).to_string()) - .bind(bigdecimal::BigDecimal::from(request_block_height)) + .bind(request_block_height as i64) // Convert to i64 for database compatibility .fetch_one(shard_id_pool.pool) .await?; readnode_primitives::QueryData::::try_from(result) @@ -362,7 +362,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { ", ) .bind(account_id.to_string()) - .bind(bigdecimal::BigDecimal::from(block_height)) + .bind(block_height as i64) // Convert to i64 for database compatibility .fetch(shard_id_pool.pool); while let Some(row) = stream.next().await { let (public_key_hex, access_key): (String, Vec) = row?; diff --git a/database/src/postgres/state_indexer/helpers.rs b/database/src/postgres/state_indexer/helpers.rs index 292e1eea..dc71a7b7 100644 --- a/database/src/postgres/state_indexer/helpers.rs +++ b/database/src/postgres/state_indexer/helpers.rs @@ -4,6 +4,13 @@ use std::time::Instant; use futures::future::try_join_all; use sqlx::Row; +type StateChangeKeyDataAtBlockHeight = ( + String, // account_id + String, // data_key + Vec, // data_value + i64, // block_height_from +); + /// PostgreSQL State Indexer Implementation /// /// ARCHITECTURAL OVERVIEW: @@ -66,7 +73,7 @@ impl crate::PostgresDBManager { // Compute partition assignments for all account_ids using PostgreSQL's hashtext() function // This ensures consistent partition distribution 
matching the table partitioning scheme - let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; + let partition_map = self.partition_map(&shard_id, pool, &account_ids).await?; // Group account_ids by their target partition for batch processing // This reduces the number of database queries by updating entire partitions at once @@ -183,7 +190,7 @@ impl crate::PostgresDBManager { ]) .inc(); - let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; + let partition_map = self.partition_map(&shard_id, pool, &account_ids).await?; // Group updates per partition let mut updates_per_partition: HashMap> = HashMap::new(); @@ -309,10 +316,10 @@ impl crate::PostgresDBManager { ]) .inc(); - let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; + let partition_map = self.partition_map(&shard_id, pool, &account_ids).await?; // Group inserts by partition for efficient batch processing - let mut inserts_per_partition: HashMap, i64)>> = + let mut inserts_per_partition: HashMap> = HashMap::new(); for (account_id, data_key, data_value, block_height) in inserts { if let Some(&partition) = partition_map.get(&account_id) { @@ -433,7 +440,7 @@ impl crate::PostgresDBManager { &account_ids.len().to_string(), ]) .inc(); - let partition_map = self.partition_map(&shard_id, &pool, &account_ids).await?; + let partition_map = self.partition_map(&shard_id, pool, &account_ids).await?; // Group inserts per partition let mut inserts_per_partition: HashMap, i64)>> = HashMap::new(); diff --git a/database/src/postgres/state_indexer/mod.rs b/database/src/postgres/state_indexer/mod.rs index 47e79b54..21eb095e 100644 --- a/database/src/postgres/state_indexer/mod.rs +++ b/database/src/postgres/state_indexer/mod.rs @@ -1,8 +1,7 @@ -use std::collections::HashMap; use std::time::Instant; use bigdecimal::ToPrimitive; -use futures::{future::try_join_all, FutureExt}; +use futures::FutureExt; mod helpers; diff --git 
a/readnode-primitives/src/lib.rs b/readnode-primitives/src/lib.rs index 93b803d6..481ef49e 100644 --- a/readnode-primitives/src/lib.rs +++ b/readnode-primitives/src/lib.rs @@ -299,19 +299,16 @@ where } } -impl TryFrom<(Vec, B)> for QueryData +impl TryFrom<(Vec, i64)> for QueryData where T: borsh::BorshDeserialize, - B: ToPrimitive, { type Error = anyhow::Error; - fn try_from(value: (Vec, B)) -> Result { + fn try_from(value: (Vec, i64)) -> Result { let data = T::try_from_slice(&value.0)?; - let block_height = value - .1 - .to_u64() - .ok_or_else(|| anyhow::anyhow!("Failed to parse `block_height` to u64"))?; + let block_height = u64::try_from(value.1) + .map_err(|_| anyhow::anyhow!("Failed to cast `block_height` from i64 to u64"))?; Ok(Self { data, block_height }) } } From d07e2b2ecc8c6ee2a9cb5fc5c3f3a96934305c4d Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Wed, 30 Jul 2025 15:25:15 +0300 Subject: [PATCH 07/13] add migration scripts --- database/database_migrations/README.md | 100 ++++++++++++++++++ .../migrate_access_keys.sh | 46 ++++++++ .../database_migrations/migrate_accounts.sh | 44 ++++++++ .../database_migrations/migrate_contracts.sh | 44 ++++++++ .../migrate_state_changes.sh | 46 ++++++++ .../database_migrations/shard_migration.sh | 63 +++++++++++ ...132509_create_state_changes_compact.up.sql | 16 +-- ...reate_get_text_partition_function.down.sql | 2 +- 8 files changed, 352 insertions(+), 9 deletions(-) create mode 100644 database/database_migrations/README.md create mode 100644 database/database_migrations/migrate_access_keys.sh create mode 100644 database/database_migrations/migrate_accounts.sh create mode 100644 database/database_migrations/migrate_contracts.sh create mode 100644 database/database_migrations/migrate_state_changes.sh create mode 100644 database/database_migrations/shard_migration.sh diff --git a/database/database_migrations/README.md b/database/database_migrations/README.md new file mode 100644 index 00000000..4a63b363 --- /dev/null +++ 
b/database/database_migrations/README.md @@ -0,0 +1,100 @@ +> **Note:** If you are starting the project from scratch (with a new, empty database), you do not need to run the migration scripts in this directory. These scripts are only necessary when migrating data from an existing database or upgrading shards. For new deployments, follow the standard database initialization procedures instead. + +Database Migration Scripts + +This directory contains scripts for migrating database shards and related data. + +## Shard Migration Script + +The `shard_migration.sh` script is the main migration orchestrator that runs multiple migration scripts in parallel. + +### Usage + +The script accepts the following command-line arguments: + +- `--db_name`: Database name +- `--db_user`: Database username +- `--db_password`: Database password +- `--host`: Database host +- `--port`: Database port + +### Examples + +#### Basic Example: +```bash +./shard_migration.sh --db_name my_database --db_user postgres --db_password mypassword --host localhost --port 5432 +``` + +#### Local PostgreSQL Database: +```bash +./shard_migration.sh \ + --db_name read_rpc_db \ + --db_user postgres \ + --db_password secretpassword \ + --host localhost \ + --port 5432 +``` + +#### Remote Database: +```bash +./shard_migration.sh \ + --db_name production_db \ + --db_user readrpc_user \ + --db_password prod_password123 \ + --host db.example.com \ + --port 5432 +``` + +#### Using Environment Variables: +```bash +# Set environment variables first +export DB_NAME="my_database" +export DB_USER="postgres" +export PGPASSWORD="mypassword" +export DB_HOST="localhost" +export DB_PORT="5432" + +# Then run the script (it will use the environment variables) +./shard_migration.sh +``` + +### Prerequisites + +1. **Make the script executable:** + ```bash + chmod +x shard_migration.sh + ``` + +2. **Make all migration scripts executable:** + ```bash + chmod +x migrate_*.sh + ``` + +3. 
**Ensure all required migration scripts exist:** + - `migrate_access_keys.sh` + - `migrate_accounts.sh` + - `migrate_contracts.sh` + - `migrate_state_changes.sh` + +### How it Works + +The `shard_migration.sh` script: + +1. Parses command-line arguments and sets environment variables +2. Creates a log file named `migration_${DB_NAME}.log` +3. Runs four migration scripts in parallel using the `&` operator +4. Waits for all migrations to complete using the `wait` command +5. Logs start and completion times + +### Output + +- Migration progress and results are logged to `migration_${DB_NAME}.log` +- Console output shows start and completion timestamps +- Each individual migration script may produce its own output + +### Notes + +- All arguments are required for the script to function properly +- The script runs migrations in parallel to improve performance +- Make sure you have proper database permissions before running the migration +- Review the individual migration scripts to understand what data will be migrated diff --git a/database/database_migrations/migrate_access_keys.sh b/database/database_migrations/migrate_access_keys.sh new file mode 100644 index 00000000..e0312fa4 --- /dev/null +++ b/database/database_migrations/migrate_access_keys.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Function to migrate a single partition +migrate_partition() { + local partition=$1 + # shellcheck disable=SC2155 + local start_time=$(date +"%T") + + echo "[INFO] Starting migration for partition state_changes_access_key_$partition at $start_time" + echo "[INFO] Starting migration for partition state_changes_access_key_$partition at $start_time" >> "$LOG_FILE" + + psql -U "$DB_USER" -d "$DB_NAME" -h "$DB_HOST" -p "$DB_PORT" -c " + WITH ordered_data AS ( + SELECT + account_id, + data_key, + data_value, + block_height AS block_height_from, + LAG(block_height) OVER (PARTITION BY account_id, data_key ORDER BY block_height DESC) AS block_height_to + FROM state_changes_access_key_$partition + ) 
+ INSERT INTO state_changes_access_key_compact_$partition (account_id, data_key, data_value, block_height_from, block_height_to) + SELECT + account_id, + data_key, + data_value, + block_height_from::bigint, + block_height_to::bigint + FROM ordered_data + WHERE data_value IS NOT NULL + ON CONFLICT (account_id, data_key, block_height_from) DO NOTHING; + " 2>&1 | tee -a "$LOG_FILE" + + # shellcheck disable=SC2155 + local end_time=$(date +"%T") + echo "[INFO] Finished migration for partition state_changes_access_key_$partition at $end_time" + echo "[INFO] Finished migration for partition state_changes_access_key_$partition at $end_time" >> "$LOG_FILE" +} + +# Run migrations in parallel for partitions 0 to 99 +for i in $(seq 0 99); do + migrate_partition "$i" & +done + +# Wait for all background jobs to finish +wait diff --git a/database/database_migrations/migrate_accounts.sh b/database/database_migrations/migrate_accounts.sh new file mode 100644 index 00000000..74da3785 --- /dev/null +++ b/database/database_migrations/migrate_accounts.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Function to migrate a single partition +migrate_partition() { + local partition=$1 + # shellcheck disable=SC2155 + local start_time=$(date +"%T") + + echo "[INFO] Starting migration for partition state_changes_account_$partition at $start_time" + echo "[INFO] Starting migration for partition state_changes_account_$partition at $start_time" >> "$LOG_FILE" + + psql -U "$DB_USER" -d "$DB_NAME" -h "$DB_HOST" -p "$DB_PORT" -c " + WITH ordered_data AS ( + SELECT + account_id, + data_value, + block_height AS block_height_from, + LAG(block_height) OVER (PARTITION BY account_id ORDER BY block_height DESC) AS block_height_to + FROM state_changes_account_$partition + ) + INSERT INTO state_changes_account_compact_$partition (account_id, data_value, block_height_from, block_height_to) + SELECT + account_id, + data_value, + block_height_from::bigint, + block_height_to::bigint + FROM ordered_data + WHERE data_value 
IS NOT NULL + ON CONFLICT (account_id, block_height_from) DO NOTHING; + " 2>&1 | tee -a "$LOG_FILE" + + # shellcheck disable=SC2155 + local end_time=$(date +"%T") + echo "[INFO] Finished migration for partition state_changes_account_$partition at $end_time" + echo "[INFO] Finished migration for partition state_changes_account_$partition at $end_time" >> "$LOG_FILE" +} + +# Run migrations in parallel for partitions 0 to 99 +for i in $(seq 0 99); do + migrate_partition "$i" & +done + +# Wait for all background jobs to finish +wait diff --git a/database/database_migrations/migrate_contracts.sh b/database/database_migrations/migrate_contracts.sh new file mode 100644 index 00000000..6bc9a38a --- /dev/null +++ b/database/database_migrations/migrate_contracts.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Function to migrate a single partition +migrate_partition() { + local partition=$1 + # shellcheck disable=SC2155 + local start_time=$(date +"%T") + + echo "[INFO] Starting migration for partition state_changes_contract_$partition at $start_time" + echo "[INFO] Starting migration for partition state_changes_contract_$partition at $start_time" >> "$LOG_FILE" + + psql -U "$DB_USER" -d "$DB_NAME" -h "$DB_HOST" -p "$DB_PORT" -c " + WITH ordered_data AS ( + SELECT + account_id, + data_value, + block_height AS block_height_from, + LAG(block_height) OVER (PARTITION BY account_id ORDER BY block_height DESC) AS block_height_to + FROM state_changes_contract_$partition + ) + INSERT INTO state_changes_contract_compact$partition (account_id, data_value, block_height_from, block_height_to) + SELECT + account_id, + data_value, + block_height_from::bigint, + block_height_to::bigint + FROM ordered_data + WHERE data_value IS NOT NULL + ON CONFLICT (account_id, block_height_from) DO NOTHING; + " 2>&1 | tee -a "$LOG_FILE" + + # shellcheck disable=SC2155 + local end_time=$(date +"%T") + echo "[INFO] Finished migration for partition state_changes_contract_$partition at $end_time" + echo "[INFO] 
Finished migration for partition state_changes_contract_$partition at $end_time" >> "$LOG_FILE" +} + +# Run migrations in parallel for partitions 0 to 99 +for i in $(seq 0 99); do + migrate_partition "$i" & +done + +# Wait for all background jobs to finish +wait \ No newline at end of file diff --git a/database/database_migrations/migrate_state_changes.sh b/database/database_migrations/migrate_state_changes.sh new file mode 100644 index 00000000..0129d143 --- /dev/null +++ b/database/database_migrations/migrate_state_changes.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Function to migrate a single partition +migrate_partition() { + local partition=$1 + # shellcheck disable=SC2155 + local start_time=$(date +"%T") + + echo "[INFO] Starting migration for partition state_changes_data_$partition at $start_time" + echo "[INFO] Starting migration for partition state_changes_data_$partition at $start_time" >> "$LOG_FILE" + + psql -U "$DB_USER" -d "$DB_NAME" -h "$DB_HOST" -p "$DB_PORT" -c " + WITH ordered_data AS ( + SELECT + account_id, + data_key, + data_value, + block_height AS block_height_from, + LAG(block_height) OVER (PARTITION BY account_id, data_key ORDER BY block_height DESC) AS block_height_to + FROM state_changes_data_$partition + ) + INSERT INTO state_changes_data_compact_$partition (account_id, data_key, data_value, block_height_from, block_height_to) + SELECT + account_id, + data_key, + data_value, + block_height_from::bigint, + block_height_to::bigint + FROM ordered_data + WHERE data_value IS NOT NULL + ON CONFLICT (account_id, data_key, block_height_from) DO NOTHING; + " 2>&1 | tee -a "$LOG_FILE" + + # shellcheck disable=SC2155 + local end_time=$(date +"%T") + echo "[INFO] Finished migration for partition state_changes_data_$partition at $end_time" + echo "[INFO] Finished migration for partition state_changes_data_$partition at $end_time" >> "$LOG_FILE" +} + +# Run migrations in parallel for partitions 0 to 99 +for i in $(seq 0 99); do + migrate_partition "$i" & 
+done + +# Wait for all background jobs to finish +wait diff --git a/database/database_migrations/shard_migration.sh b/database/database_migrations/shard_migration.sh new file mode 100644 index 00000000..1ca0a17b --- /dev/null +++ b/database/database_migrations/shard_migration.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Parse arguments or use environment variables +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --db_name) + export DB_NAME="$2" + shift 2 + ;; + --db_user) + export DB_USER="$2" + shift 2 + ;; + --db_password) + export PGPASSWORD="$2" + shift 2 + ;; + --host) + export DB_HOST="$2" + shift 2 + ;; + --port) + export DB_PORT="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Set defaults from environment if not set by args +: "${DB_NAME:=${DB_NAME}}" +: "${DB_USER:=${DB_USER}}" +: "${PGPASSWORD:=${PGPASSWORD}}" +: "${DB_HOST:=${DB_HOST}}" +: "${DB_PORT:=${DB_PORT}}" + +# Check required variables +if [[ -z "$DB_NAME" || -z "$DB_USER" || -z "$PGPASSWORD" || -z "$DB_HOST" || -z "$DB_PORT" ]]; then + echo "All arguments are required: --db_name, --db_user, --db_password, --host, --port (or set corresponding env vars)" + exit 1 +fi + + +# Set log file +export LOG_FILE="migration_${DB_NAME}.log" +# Remove old log file if it exists +rm -f "$LOG_FILE" +touch "$LOG_FILE" + +echo "Starting migration at $(date)" | tee -a "$LOG_FILE" + +./migrate_access_keys.sh & +./migrate_accounts.sh & +./migrate_contracts.sh & +./migrate_state_changes.sh & + +wait + +echo "Migration completed at $(date)" | tee -a "$LOG_FILE" diff --git a/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql index 20903f13..af16ce41 100644 --- a/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql +++ b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql 
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS state_changes_data_compact ( account_id text NOT NULL, data_key text NOT NULL, data_value bytea NOT NULL, - block_height_from numeric(20,0) NOT NULL, - block_height_to numeric(20,0) NULL, + block_height_from bigint NOT NULL, + block_height_to bigint NULL, PRIMARY KEY (account_id, data_key, block_height_from) ) PARTITION BY HASH (account_id); @@ -22,8 +22,8 @@ CREATE TABLE IF NOT EXISTS state_changes_access_key_compact ( account_id text NOT NULL, data_key text NOT NULL, data_value bytea NOT NULL, - block_height_from numeric(20,0) NOT NULL, - block_height_to numeric(20,0) NULL, + block_height_from bigint NOT NULL, + block_height_to bigint NULL, PRIMARY KEY (account_id, data_key, block_height_from) ) PARTITION BY HASH (account_id); @@ -41,8 +41,8 @@ END $$; CREATE TABLE IF NOT EXISTS state_changes_contract_compact ( account_id text NOT NULL, data_value bytea NOT NULL, - block_height_from numeric(20,0) NOT NULL, - block_height_to numeric(20,0) NULL, + block_height_from bigint NOT NULL, + block_height_to bigint NULL, PRIMARY KEY (account_id, block_height_from) ) PARTITION BY HASH (account_id); @@ -59,8 +59,8 @@ END $$; CREATE TABLE IF NOT EXISTS state_changes_account_compact ( account_id text NOT NULL, data_value bytea NULL, - block_height_from numeric(20,0) NOT NULL, - block_height_to numeric(20,0) NULL, + block_height_from bigint NOT NULL, + block_height_to bigint NULL, PRIMARY KEY (account_id, block_height_from) ) PARTITION BY HASH (account_id); diff --git a/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql b/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql index 7de51251..0bb39c5e 100644 --- a/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql +++ b/database/src/postgres/migrations/shard_db/20250218132510_create_get_text_partition_function.down.sql @@ -1 +1 @@ -DROP 
FUNCTION IF EXISTS get_text_partition(TEXT, INT); \ No newline at end of file +DROP FUNCTION IF EXISTS get_text_partition(TEXT, INT); From d5b9f4d18dbaf20a8b7fae8939bff8eec766ee05 Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Wed, 30 Jul 2025 15:59:40 +0300 Subject: [PATCH 08/13] add indexes for new tables --- database/database_migrations/migrate_contracts.sh | 4 ++-- .../20250218132509_create_state_changes_compact.up.sql | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/database/database_migrations/migrate_contracts.sh b/database/database_migrations/migrate_contracts.sh index 6bc9a38a..be04e782 100644 --- a/database/database_migrations/migrate_contracts.sh +++ b/database/database_migrations/migrate_contracts.sh @@ -18,7 +18,7 @@ migrate_partition() { LAG(block_height) OVER (PARTITION BY account_id ORDER BY block_height DESC) AS block_height_to FROM state_changes_contract_$partition ) - INSERT INTO state_changes_contract_compact$partition (account_id, data_value, block_height_from, block_height_to) + INSERT INTO state_changes_contract_compact_$partition (account_id, data_value, block_height_from, block_height_to) SELECT account_id, data_value, @@ -41,4 +41,4 @@ for i in $(seq 0 99); do done # Wait for all background jobs to finish -wait \ No newline at end of file +wait diff --git a/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql index af16ce41..6c19baad 100644 --- a/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql +++ b/database/src/postgres/migrations/shard_db/20250218132509_create_state_changes_compact.up.sql @@ -14,6 +14,8 @@ DECLARE BEGIN FOR i IN 0..99 LOOP EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_data_compact_%s PARTITION OF state_changes_data_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + -- Indexes for 
state_changes_data_compact partitions + EXECUTE format('CREATE INDEX IF NOT EXISTS idx_data_%s_keys ON state_changes_data_compact_%s (account_id, data_key) WHERE block_height_to IS NULL;', i, i); END LOOP; END $$; @@ -34,6 +36,8 @@ DECLARE BEGIN FOR i IN 0..99 LOOP EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_access_key_compact_%s PARTITION OF state_changes_access_key_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + -- Indexes for state_changes_access_key_compact partitions + EXECUTE format('CREATE INDEX IF NOT EXISTS idx_access_key_%s_keys ON state_changes_access_key_compact_%s (account_id, data_key) WHERE block_height_to IS NULL;', i, i); END LOOP; END $$; @@ -53,6 +57,8 @@ DECLARE BEGIN FOR i IN 0..99 LOOP EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_contract_compact_%s PARTITION OF state_changes_contract_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + -- Indexes for state_changes_contract_compact partitions + EXECUTE format('CREATE INDEX IF NOT EXISTS idx_contract_%s_acc ON state_changes_contract_compact_%s (account_id) WHERE block_height_to IS NULL;', i, i); END LOOP; END $$; @@ -71,5 +77,7 @@ DECLARE BEGIN FOR i IN 0..99 LOOP EXECUTE format('CREATE TABLE IF NOT EXISTS state_changes_account_compact_%s PARTITION OF state_changes_account_compact FOR VALUES WITH (MODULUS 100, REMAINDER %s)', i, i); + -- Indexes for state_changes_account_compact partitions + EXECUTE format('CREATE INDEX IF NOT EXISTS idx_account_%s_acc ON state_changes_account_compact_%s (account_id) WHERE block_height_to IS NULL;', i, i); END LOOP; END $$; From 1bcc8c6f129ccce9ae820c4a6b9c49e70334778c Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Thu, 31 Jul 2025 13:34:49 +0300 Subject: [PATCH 09/13] start state indexer from interaption block --- logic-state-indexer/src/configs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logic-state-indexer/src/configs.rs b/logic-state-indexer/src/configs.rs index b90da712..96b81445 
100644 --- a/logic-state-indexer/src/configs.rs +++ b/logic-state-indexer/src/configs.rs @@ -41,7 +41,7 @@ pub async fn get_start_block_height( } StartOptions::FromLatest => final_block_height(near_client).await?, }; - Ok(start_block_height - 100) // Start just a bit earlier to overlap indexed blocks to ensure we don't miss anything in-between + Ok(start_block_height) } pub(crate) async fn final_block_height( From c370570b11669de538c04b9863dc9591620732e0 Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Mon, 11 Aug 2025 12:06:12 +0300 Subject: [PATCH 10/13] paginated state optimization --- rpc-server/src/modules/queries/methods.rs | 2 +- rpc-server/src/modules/state/methods.rs | 4 +- rpc-server/src/modules/state/utils.rs | 73 +++++++++++++++++------ 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/rpc-server/src/modules/queries/methods.rs b/rpc-server/src/modules/queries/methods.rs index 9259c70c..b73edfaa 100644 --- a/rpc-server/src/modules/queries/methods.rs +++ b/rpc-server/src/modules/queries/methods.rs @@ -566,7 +566,7 @@ async fn optimistic_view_state( } #[cfg_attr(feature = "tracing-instrumentation", tracing::instrument(skip(data)))] -async fn database_view_state( +pub async fn database_view_state( data: &Data, block: &near_primitives::views::BlockView, account_id: &near_primitives::types::AccountId, diff --git a/rpc-server/src/modules/state/methods.rs b/rpc-server/src/modules/state/methods.rs index a764ee90..ae815dae 100644 --- a/rpc-server/src/modules/state/methods.rs +++ b/rpc-server/src/modules/state/methods.rs @@ -18,9 +18,9 @@ pub async fn view_state_paginated( fetch_block_from_cache_or_get(&data, &block_reference, "view_state_paginated").await?; let state_values = get_state_from_db_paginated( - &data.db_manager, + &data, &request_data.account_id, - block.header.height, + &block, request_data.next_page_token, ) .await?; diff --git a/rpc-server/src/modules/state/utils.rs b/rpc-server/src/modules/state/utils.rs index 98317c45..aa7a1004 
100644 --- a/rpc-server/src/modules/state/utils.rs +++ b/rpc-server/src/modules/state/utils.rs @@ -1,37 +1,76 @@ -#[cfg_attr( - feature = "tracing-instrumentation", - tracing::instrument(skip(db_manager)) -)] +use crate::{config::ServerContext, modules::queries::methods::database_view_state}; +use actix_web::web::Data; + +#[cfg_attr(feature = "tracing-instrumentation", tracing::instrument(skip(data)))] pub async fn get_state_from_db_paginated( - db_manager: &std::sync::Arc>, + data: &Data, account_id: &near_primitives::types::AccountId, - block_height: near_primitives::types::BlockHeight, + block: &near_primitives::views::BlockView, page_token: database::PageToken, ) -> Result { tracing::debug!( "`get_state_from_db_paginated` call. AccountId {}, block {}, page_token {:?}", account_id, - block_height, + block.header.height, page_token, ); - let (values, next_page_token) = db_manager - .get_state_by_page(account_id, block_height, page_token, "view_state_paginated") + let account = data + .db_manager + .get_account(account_id, block.header.height, "query_view_state") + .await + .map_err( + |_err| near_jsonrpc::primitives::types::query::RpcQueryError::UnknownAccount { + requested_account_id: account_id.clone(), + block_height: block.header.height, + block_hash: block.header.hash, + }, + )?; + + // Calculate the state size excluding the contract code size to check if it's too large to fetch. + // The state size is the storage usage minus the code size. + // more details: nearcore/runtime/runtime/src/state_viewer/mod.rs:150 + let code_len = data + .db_manager + .get_contract_code(account_id, block.header.height, "query_view_state") .await - .map_err(|err| { - near_jsonrpc::primitives::errors::RpcError::new_internal_error( - Some(serde_json::Value::String(err.to_string())), - "Failed to get page state from DB. 
Please try again!".to_string(), + .map(|code| code.data.len() as u64) + .unwrap_or_default(); + + let state_size = account.data.storage_usage().saturating_sub(code_len); + let (values, next_page_token) = if state_size <= 1_000_000 { + let values = database_view_state(data, block, account_id, &[]).await?; + (values, None) + } else { + // If the state size is too large, we try to fetch the state in pages. + // This is a fallback mechanism to avoid fetching too much data at once. + let (raw_values, next_page_token) = data + .db_manager + .get_state_by_page( + account_id, + block.header.height, + database::PageToken::default(), + "view_state_paginated", ) - })?; - Ok(crate::modules::state::PageStateValues { - values: values + .await + .map_err(|err| { + near_jsonrpc::primitives::errors::RpcError::new_internal_error( + Some(serde_json::Value::String(err.to_string())), + "Failed to get page state from DB. Please try again!".to_string(), + ) + })?; + let values = raw_values .into_iter() .map(|(k, v)| near_primitives::views::StateItem { key: k.into(), value: v.into(), }) - .collect(), + .collect(); + (values, next_page_token) + }; + + Ok(crate::modules::state::PageStateValues { + values, next_page_token, }) } From a10ead2c479e4c9b90d0f6d394f01b6dc93558e7 Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Mon, 11 Aug 2025 13:01:32 +0300 Subject: [PATCH 11/13] fix paginated state --- database/src/postgres/rpc_server.rs | 37 ++++++++++++++++++++------- rpc-server/src/modules/state/utils.rs | 4 +-- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/database/src/postgres/rpc_server.rs b/database/src/postgres/rpc_server.rs index fc0b161f..3b1e5806 100644 --- a/database/src/postgres/rpc_server.rs +++ b/database/src/postgres/rpc_server.rs @@ -74,24 +74,43 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { } else { crate::postgres::PageState::new(1000) }; - let mut stream = sqlx::query_as::<_, (String, Vec)>( - " + let mut stream = if let Some(last_data_key) = 
&page_state.last_data_key { + sqlx::query_as::<_, (String, Vec)>( + " SELECT data_key, data_value FROM state_changes_data_compact WHERE account_id = $1 AND block_height_from <= $2 AND (block_height_to IS NULL OR block_height_to > $2) - AND ($3 IS NULL OR data_key > $3) + AND data_key > $3 ORDER BY data_key ASC LIMIT $4; ", - ) - .bind(account_id.to_string()) - .bind(block_height as i64) // Convert to i64 for database compatibility - .bind(page_state.last_data_key.clone()) - .bind(page_state.page_size) - .fetch(shard_id_pool.pool); + ) + .bind(account_id.to_string()) + .bind(block_height as i64) // Convert to i64 for database compatibility + .bind(last_data_key.clone()) + .bind(page_state.page_size) + .fetch(shard_id_pool.pool) + } else { + sqlx::query_as::<_, (String, Vec)>( + " + SELECT data_key, data_value + FROM state_changes_data_compact + WHERE account_id = $1 + AND block_height_from <= $2 + AND (block_height_to IS NULL OR block_height_to > $2) + ORDER BY + data_key ASC + LIMIT $4; + ", + ) + .bind(account_id.to_string()) + .bind(block_height as i64) // Convert to i64 for database compatibility + .bind(page_state.page_size) + .fetch(shard_id_pool.pool) + }; let mut items = std::collections::HashMap::new(); let mut last_data_key = String::new(); while let Some(row) = stream.next().await { diff --git a/rpc-server/src/modules/state/utils.rs b/rpc-server/src/modules/state/utils.rs index aa7a1004..7214c32e 100644 --- a/rpc-server/src/modules/state/utils.rs +++ b/rpc-server/src/modules/state/utils.rs @@ -17,7 +17,7 @@ pub async fn get_state_from_db_paginated( let account = data .db_manager - .get_account(account_id, block.header.height, "query_view_state") + .get_account(account_id, block.header.height, "view_state_paginated") .await .map_err( |_err| near_jsonrpc::primitives::types::query::RpcQueryError::UnknownAccount { @@ -32,7 +32,7 @@ pub async fn get_state_from_db_paginated( // more details: nearcore/runtime/runtime/src/state_viewer/mod.rs:150 let code_len = 
data .db_manager - .get_contract_code(account_id, block.header.height, "query_view_state") + .get_contract_code(account_id, block.header.height, "view_state_paginated") .await .map(|code| code.data.len() as u64) .unwrap_or_default(); From 7890fc996c5b011024fa12392ea5000b2069bec7 Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Mon, 11 Aug 2025 13:10:52 +0300 Subject: [PATCH 12/13] fix query --- database/src/postgres/rpc_server.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database/src/postgres/rpc_server.rs b/database/src/postgres/rpc_server.rs index 3b1e5806..a4d32b98 100644 --- a/database/src/postgres/rpc_server.rs +++ b/database/src/postgres/rpc_server.rs @@ -103,7 +103,7 @@ impl crate::ReaderDbManager for crate::PostgresDBManager { AND (block_height_to IS NULL OR block_height_to > $2) ORDER BY data_key ASC - LIMIT $4; + LIMIT $3; ", ) .bind(account_id.to_string()) From e060710da319a7795a432dbe6de2ee09813c34a4 Mon Sep 17 00:00:00 2001 From: Yurii Koba Date: Mon, 11 Aug 2025 13:26:16 +0300 Subject: [PATCH 13/13] fix page_token --- rpc-server/src/modules/state/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpc-server/src/modules/state/utils.rs b/rpc-server/src/modules/state/utils.rs index 7214c32e..03dfa0f5 100644 --- a/rpc-server/src/modules/state/utils.rs +++ b/rpc-server/src/modules/state/utils.rs @@ -49,7 +49,7 @@ pub async fn get_state_from_db_paginated( .get_state_by_page( account_id, block.header.height, - database::PageToken::default(), + page_token, "view_state_paginated", ) .await