Skip to content

Commit 352132f

Browse files
authored
Warn if cgroup memory limit is misaligned with rocksdb memory limit (#3925)
1 parent c0bfc0f commit 352132f

File tree

2 files changed

+60
-2
lines changed

2 files changed

+60
-2
lines changed

crates/rocksdb/src/db_manager.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::time::Duration;
1616
use parking_lot::RwLock;
1717
use rocksdb::{Cache, WriteBufferManager};
1818
use tokio_util::task::TaskTracker;
19-
use tracing::{debug, info, warn};
19+
use tracing::{debug, error, info, warn};
2020

2121
use restate_core::{ShutdownError, TaskCenter, TaskKind, cancellation_watcher};
2222
use restate_serde_util::ByteCount;
@@ -67,6 +67,9 @@ impl RocksDbManager {
6767
}
6868
metric_definitions::describe_metrics();
6969
let opts = &Configuration::pinned().common;
70+
71+
check_memory_limit(opts);
72+
7073
let cache = Cache::new_lru_cache(opts.rocksdb_total_memory_size.get());
7174
let write_buffer_manager = WriteBufferManager::new_write_buffer_manager_with_cache(
7275
opts.rocksdb_actual_total_memtables_size(),
@@ -421,6 +424,7 @@ impl DbWatchdog {
421424
"[config update] Setting rocksdb total memory limit to {}",
422425
ByteCount::from(new_common_opts.rocksdb_total_memory_size)
423426
);
427+
check_memory_limit(new_common_opts);
424428
self.cache
425429
.set_capacity(new_common_opts.rocksdb_total_memory_size.get());
426430
self.manager
@@ -460,4 +464,30 @@ impl DbWatchdog {
460464
}
461465
}
462466

467+
fn check_memory_limit(opts: &CommonOptions) {
468+
if let Some(process_memory_size) = opts.process_total_memory_size() {
469+
let memory_ratio =
470+
opts.rocksdb_total_memory_size.get() as f64 / process_memory_size.get() as f64;
471+
if memory_ratio < 0.5 {
472+
warn!(
473+
"'rocksdb-total-memory-size' parameter is set to {}, less than half the process memory limit of {}. Roughly 75% of process memory should be given to RocksDB",
474+
ByteCount::from(opts.rocksdb_total_memory_size),
475+
ByteCount::from(process_memory_size),
476+
)
477+
} else if memory_ratio > 1.0 {
478+
error!(
479+
"'rocksdb-total-memory-size' parameter is set to {}, more than the process memory limit of {}. This guarantees an OOM under load; roughly 75% of process memory should be given to RocksDB",
480+
ByteCount::from(opts.rocksdb_total_memory_size),
481+
ByteCount::from(process_memory_size),
482+
)
483+
} else if memory_ratio > 0.9 {
484+
error!(
485+
"'rocksdb-total-memory-size' parameter is set to {}, more than 90% of the process memory limit of {}. This risks an OOM under load; roughly 75% of process memory should be given to RocksDB",
486+
ByteCount::from(opts.rocksdb_total_memory_size),
487+
ByteCount::from(process_memory_size),
488+
)
489+
}
490+
}
491+
}
492+
463493
static_assertions::assert_impl_all!(RocksDbManager: Send, Sync);

crates/types/src/config/common.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,15 @@ pub struct CommonOptions {
352352
#[serde(skip_serializing_if = "Option::is_none")]
353353
pub storage_low_priority_bg_threads: Option<NonZeroUsize>,
354354

355+
/// # Total memory limit for this process
356+
///
357+
/// This is intended to be determined automatically on Linux based on the cgroup limit,
358+
/// and is used to emit warning logs if other memory limits are set too close to it.
359+
#[serde_as(as = "Option<NonZeroByteCount>")]
360+
#[serde(skip_serializing_if = "Option::is_none")]
361+
#[cfg_attr(feature = "schemars", schemars(skip))]
362+
pub process_total_memory_size: Option<NonZeroUsize>,
363+
355364
/// # Total memory limit for rocksdb caches and memtables.
356365
///
357366
/// This includes memory for uncompressed block cache and all memtables by all open databases.
@@ -497,6 +506,24 @@ impl CommonOptions {
497506
self.base_dir.as_ref()
498507
}
499508

509+
#[cfg(target_os = "linux")]
510+
pub fn process_total_memory_size(&self) -> Option<NonZeroUsize> {
511+
self.process_total_memory_size.or_else(|| {
512+
[
513+
"/sys/fs/cgroup/memory.max", // cgroup v2, takes precedence
514+
"/sys/fs/cgroup/memory/memory.limit_in_bytes", // cgroup v1
515+
]
516+
.iter()
517+
.find_map(|path| std::fs::read_to_string(path).ok())
518+
.and_then(|contents| contents.trim().parse().ok())
519+
})
520+
}
521+
522+
/// Returns the total memory limit for this process, if one is known.
///
/// On non-Linux targets there is no automatic detection: the result is exactly the
/// configured `process_total_memory_size` value, or `None` when it is unset.
#[cfg(not(target_os = "linux"))]
pub fn process_total_memory_size(&self) -> Option<NonZeroUsize> {
    self.process_total_memory_size
}
526+
500527
pub fn rocksdb_actual_total_memtables_size(&self) -> usize {
501528
let sanitized = self.rocksdb_total_memtables_ratio.clamp(0.0, 1.0) as f64;
502529
let total_mem = self.rocksdb_total_memory_size.get() as f64;
@@ -597,8 +624,9 @@ impl Default for CommonOptions {
597624
default_thread_pool_size: None,
598625
storage_high_priority_bg_threads: None,
599626
storage_low_priority_bg_threads: None,
600-
rocksdb_total_memtables_ratio: 0.5, // (50% of rocksdb-total-memory-size)
627+
process_total_memory_size: None,
601628
rocksdb_total_memory_size: NonZeroUsize::new(6 * 1024 * 1024 * 1024).unwrap(), // 6GiB
629+
rocksdb_total_memtables_ratio: 0.5, // (50% of rocksdb-total-memory-size)
602630
rocksdb_bg_threads: None,
603631
rocksdb_high_priority_bg_threads: NonZeroU32::new(2).unwrap(),
604632
rocksdb_perf_level: PerfStatsLevel::EnableCount,

0 commit comments

Comments
 (0)