From 5d344a8495818ce35f2fa16f67cdefc77158ec02 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Tue, 18 Nov 2025 06:22:06 +0800 Subject: [PATCH 01/13] feat(metal): reuse buffers and limit RSS growth --- candle-core/src/metal_backend/device.rs | 128 ++++++++++++++++++++++-- candle-core/src/metal_backend/mod.rs | 6 +- 2 files changed, 124 insertions(+), 10 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 0a13bbfcf3..c3ec467452 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -9,7 +9,10 @@ use candle_metal_kernels::{ use objc2_foundation::NSURL; use objc2_metal::{MTLCaptureDescriptor, MTLCaptureDestination, MTLCaptureManager}; use std::path::Path; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, RwLock, +}; use super::MetalError; @@ -26,6 +29,61 @@ impl DeviceId { } } +#[derive(Clone)] +pub(crate) struct AllocationPolicy { + /// Total bytes we can allocate before forcing a sync to reclaim temporaries. + pending_limit_bytes: usize, + /// Maximum bytes to keep cached for reuse. + cache_limit_bytes: usize, +} + +impl Default for AllocationPolicy { + fn default() -> Self { + const DEFAULT_PENDING: usize = 4 * 1024 * 1024 * 1024; // 4 GiB + const MIN_PENDING: usize = 512 * 1024 * 1024; // 512 MiB + const MAX_PENDING: usize = 12 * 1024 * 1024 * 1024; // 12 GiB + + fn parse_env_mebibytes(var: &str) -> Option { + std::env::var(var) + .ok() + .and_then(|value| value.trim().parse::().ok()) + .map(|mb| mb * 1024 * 1024) + } + + fn system_memory_bytes() -> Option { + use libc::c_void; + let mut value: u64 = 0; + let mut len = core::mem::size_of::(); + let ret = unsafe { + libc::sysctlbyname( + b"hw.memsize\0".as_ptr() as *const libc::c_char, + &mut value as *mut u64 as *mut c_void, + &mut len as *mut usize, + std::ptr::null_mut(), + 0, + ) + }; + if ret == 0 { + Some(value as usize) + } else { + None + } + } + + let pending_limit = parse_env_mebibytes("CANDLE_METAL_PENDING_LIMIT_MB") + .or_else(|| system_memory_bytes().map(|mem| (mem / 3).clamp(MIN_PENDING, MAX_PENDING))) + .unwrap_or(DEFAULT_PENDING); + + let cache_limit = parse_env_mebibytes("CANDLE_METAL_CACHE_LIMIT_MB") + .unwrap_or_else(|| std::cmp::max(pending_limit / 2, 64 * 1024 * 1024)); + + crate::metal_backend::device::AllocationPolicy { + pending_limit_bytes: pending_limit, + cache_limit_bytes: cache_limit, + } + } +} + #[derive(Clone)] pub struct MetalDevice { /// Unique identifier, the registryID is not sufficient as it identifies the GPU rather than @@ -57,6 +115,10 @@ pub struct MetalDevice { pub(crate) kernels: Arc, /// Seed for random number generation. pub(crate) seed: Arc>, + /// Bytes allocated since the last synchronization point. + pub(crate) pending_allocation_bytes: Arc, + /// Allocation thresholds and cache budget. + pub(crate) allocation_policy: AllocationPolicy, } // Resource options used for creating buffers. Shared storage mode allows both CPU and GPU to access the buffer. @@ -112,14 +174,46 @@ impl MetalDevice { } fn drop_unused_buffers(&self) -> Result<()> { + self.trim_buffer_cache_to(self.allocation_policy.cache_limit_bytes) + } + + fn trim_buffer_cache_to(&self, limit: usize) -> Result<()> { let mut buffers = self.buffers.write().map_err(MetalError::from)?; - for subbuffers in buffers.values_mut() { - let newbuffers = subbuffers - .iter() - .filter(|s| Arc::strong_count(*s) > 1) - .map(Arc::clone) - .collect(); - *subbuffers = newbuffers; + let mut cached_bytes = 0usize; + for (size, subbuffers) in buffers.iter() { + for buffer in subbuffers.iter() { + if Arc::strong_count(buffer) == 1 { + cached_bytes += *size; + } + } + } + if cached_bytes <= limit { + return Ok(()); + } + + let mut bytes_to_drop = cached_bytes - limit; + let mut empty_keys = Vec::new(); + for (size, subbuffers) in buffers.iter_mut() { + if bytes_to_drop == 0 { + break; + } + subbuffers.retain(|buffer| { + if bytes_to_drop == 0 { + return true; + } + if Arc::strong_count(buffer) == 1 { + bytes_to_drop = bytes_to_drop.saturating_sub(*size); + false + } else { + true + } + }); + if subbuffers.is_empty() { + empty_keys.push(*size); + } + } + for key in empty_keys { + buffers.remove(&key); } Ok(()) } @@ -211,6 +305,8 @@ impl MetalDevice { .map_err(MetalError::from)?; let new_buffer = Arc::new(new_buffer); subbuffers.push(new_buffer.clone()); + drop(buffers); + self.on_new_allocation(size)?; Ok(new_buffer) } @@ -235,6 +331,22 @@ impl MetalDevice { .map_err(|e| MetalError::from(e.to_string()))?; Ok(()) } + + fn on_new_allocation(&self, size: usize) -> Result<()> { + let pending = self + .pending_allocation_bytes + .fetch_add(size, Ordering::AcqRel) + .saturating_add(size); + if pending >= self.allocation_policy.pending_limit_bytes { + // Ensure the GPU processed the backlog so buffers can be reused. + self.wait_until_completed()?; + self.pending_allocation_bytes.store(0, Ordering::Release); + // Drop part of the cache to keep the resident set under control. + let target = self.allocation_policy.cache_limit_bytes / 2; + self.trim_buffer_cache_to(target)?; + } + Ok(()) + } } fn buf_size(size: usize) -> usize { diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index e7a3324a3a..3dec8bf121 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -2,6 +2,7 @@ //! use crate::backend::{BackendDevice, BackendStorage}; use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvTranspose2D}; +use crate::metal_backend::device::AllocationPolicy; use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT}; use crate::{CpuStorage, CpuStorageRef, DType, Layout, Result, Shape}; use candle_metal_kernels::{ @@ -11,8 +12,7 @@ use candle_metal_kernels::{ use objc2_foundation::NSRange; use std::collections::HashMap; use std::ffi::c_void; -use std::sync::{Arc, Mutex, PoisonError, RwLock, TryLockError}; - +use std::sync::{atomic::AtomicUsize, Arc, Mutex, PoisonError, RwLock, TryLockError}; mod device; pub use device::{DeviceId, MetalDevice}; @@ -2099,6 +2099,8 @@ impl BackendDevice for MetalDevice { buffers: Arc::new(RwLock::new(HashMap::new())), kernels, seed, + pending_allocation_bytes: Arc::new(AtomicUsize::new(0)), + allocation_policy: AllocationPolicy::default(), }) } From 5dc33cc4139490026e48094a5331a576ac633bbb Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 01:48:45 +0800 Subject: [PATCH 02/13] perf(metal): improve memory detection with `iogpu.wired_limit_mb` fallback --- candle-core/src/metal_backend/device.rs | 57 +++++++++++++++++++------ 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index c3ec467452..6d0011c255 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -49,27 +49,58 @@ impl Default for AllocationPolicy { .and_then(|value| value.trim().parse::().ok()) .map(|mb| mb * 1024 * 1024) } - - fn system_memory_bytes() -> Option { + fn sysctl_u64(name: &[u8]) -> Option { use libc::c_void; - let mut value: u64 = 0; - let mut len = core::mem::size_of::(); - let ret = unsafe { - libc::sysctlbyname( - b"hw.memsize\0".as_ptr() as *const libc::c_char, + unsafe { + let mut len: usize = 0; + if libc::sysctlbyname( + name.as_ptr() as *const libc::c_char, + std::ptr::null_mut(), + &mut len as *mut usize, + std::ptr::null_mut(), + 0, + ) != 0 + { + return None; + } + if len == 0 || len > core::mem::size_of::() { + return None; + } + let mut value: u64 = 0; + if libc::sysctlbyname( + name.as_ptr() as *const libc::c_char, &mut value as *mut u64 as *mut c_void, &mut len as *mut usize, std::ptr::null_mut(), 0, - ) - }; - if ret == 0 { - Some(value as usize) - } else { - None + ) != 0 + { + return None; + } + Some(value) } } + fn system_memory_bytes() -> Option { + const MEBIBYTE: usize = 1024 * 1024; + if let Some(limit_mb) = sysctl_u64(b"iogpu.wired_limit_mb\0") { + if limit_mb <= usize::MAX as u64 { + let limit_mb = limit_mb as usize; + if let Some(limit_bytes) = limit_mb.checked_mul(MEBIBYTE) { + return Some(limit_bytes); + } + } + } + + sysctl_u64(b"hw.memsize\0").and_then(|bytes| { + if bytes <= usize::MAX as u64 { + Some(bytes as usize) + } else { + None + } + }) + } + let pending_limit = parse_env_mebibytes("CANDLE_METAL_PENDING_LIMIT_MB") .or_else(|| system_memory_bytes().map(|mem| (mem / 3).clamp(MIN_PENDING, MAX_PENDING))) .unwrap_or(DEFAULT_PENDING); From cfe73fda868477b03d3100ad1ff2b8c271d8c2f2 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 02:39:23 +0800 Subject: [PATCH 03/13] perf(metal): improve memory budget calculation for system allocation --- candle-core/src/metal_backend/device.rs | 37 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 6d0011c255..8c2167905e 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -83,22 +83,37 @@ impl Default for AllocationPolicy { fn system_memory_bytes() -> Option { const MEBIBYTE: usize = 1024 * 1024; - if let Some(limit_mb) = sysctl_u64(b"iogpu.wired_limit_mb\0") { - if limit_mb <= usize::MAX as u64 { - let limit_mb = limit_mb as usize; - if let Some(limit_bytes) = limit_mb.checked_mul(MEBIBYTE) { - return Some(limit_bytes); - } + const SYSTEM_RESERVE_FRACTION: usize = 4; // Keep at least 25% for the OS. + const SYSTEM_RESERVE_MIN: usize = 2 * 1024 * 1024 * 1024; // 2 GiB floor. + + let wired_limit_bytes = sysctl_u64(b"iogpu.wired_limit_mb\0").and_then(|limit_mb| { + if limit_mb == 0 || limit_mb > usize::MAX as u64 { + return None; } - } + (limit_mb as usize).checked_mul(MEBIBYTE) + }); - sysctl_u64(b"hw.memsize\0").and_then(|bytes| { - if bytes <= usize::MAX as u64 { - Some(bytes as usize) + let hw_budget = sysctl_u64(b"hw.memsize\0").and_then(|bytes| { + if bytes > usize::MAX as u64 { + return None; + } + let hw = bytes as usize; + // Reserve some memory for the OS / CPU workloads. + let reserve = std::cmp::max(hw / SYSTEM_RESERVE_FRACTION, SYSTEM_RESERVE_MIN); + let available = hw.saturating_sub(reserve); + if available > 0 { + Some(available) } else { None } - }) + }); + + match (hw_budget, wired_limit_bytes) { + (Some(hw), Some(wired)) => Some(std::cmp::min(hw, wired)), + (Some(hw), None) => Some(hw), + (None, Some(wired)) => Some(wired), + (None, None) => None, + } } let pending_limit = parse_env_mebibytes("CANDLE_METAL_PENDING_LIMIT_MB") From e63ea2e0a5bb6223f55409013ebc0027f2fd1d72 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 02:40:09 +0800 Subject: [PATCH 04/13] perf(metal): improve memory budget calculation logic for system allocation --- candle-core/src/metal_backend/device.rs | 39 ++++++++++++------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 8c2167905e..6ae79984c8 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -86,6 +86,20 @@ impl Default for AllocationPolicy { const SYSTEM_RESERVE_FRACTION: usize = 4; // Keep at least 25% for the OS. const SYSTEM_RESERVE_MIN: usize = 2 * 1024 * 1024 * 1024; // 2 GiB floor. + let hw_total = sysctl_u64(b"hw.memsize\0").and_then(|bytes| { + if bytes == 0 || bytes > usize::MAX as u64 { + None + } else { + Some(bytes as usize) + } + })?; + + let reserve = std::cmp::max(hw_total / SYSTEM_RESERVE_FRACTION, SYSTEM_RESERVE_MIN); + let hw_budget = hw_total.saturating_sub(reserve); + if hw_budget == 0 { + return None; + } + let wired_limit_bytes = sysctl_u64(b"iogpu.wired_limit_mb\0").and_then(|limit_mb| { if limit_mb == 0 || limit_mb > usize::MAX as u64 { return None; @@ -93,27 +107,12 @@ impl Default for AllocationPolicy { (limit_mb as usize).checked_mul(MEBIBYTE) }); - let hw_budget = sysctl_u64(b"hw.memsize\0").and_then(|bytes| { - if bytes > usize::MAX as u64 { - return None; - } - let hw = bytes as usize; - // Reserve some memory for the OS / CPU workloads. - let reserve = std::cmp::max(hw / SYSTEM_RESERVE_FRACTION, SYSTEM_RESERVE_MIN); - let available = hw.saturating_sub(reserve); - if available > 0 { - Some(available) - } else { - None - } - }); + let wired_clamped = wired_limit_bytes.map(|limit| std::cmp::min(limit, hw_total)); - match (hw_budget, wired_limit_bytes) { - (Some(hw), Some(wired)) => Some(std::cmp::min(hw, wired)), - (Some(hw), None) => Some(hw), - (None, Some(wired)) => Some(wired), - (None, None) => None, - } + Some(match wired_clamped { + Some(wired) => std::cmp::min(wired, hw_budget), + None => hw_budget, + }) } let pending_limit = parse_env_mebibytes("CANDLE_METAL_PENDING_LIMIT_MB") From 5d5f41de0a00e4b5752195f4906779712dd916e7 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 06:36:02 +0800 Subject: [PATCH 05/13] refactor(metal): use CStr constants for sysctl keys --- candle-core/src/metal_backend/device.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 6ae79984c8..1e84f1ed03 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -8,6 +8,7 @@ use candle_metal_kernels::{ }; use objc2_foundation::NSURL; use objc2_metal::{MTLCaptureDescriptor, MTLCaptureDestination, MTLCaptureManager}; +use std::ffi::CStr; use std::path::Path; use std::sync::{ atomic::{AtomicUsize, Ordering}, @@ -42,6 +43,8 @@ impl Default for AllocationPolicy { const DEFAULT_PENDING: usize = 4 * 1024 * 1024 * 1024; // 4 GiB const MIN_PENDING: usize = 512 * 1024 * 1024; // 512 MiB const MAX_PENDING: usize = 12 * 1024 * 1024 * 1024; // 12 GiB + const HW_MEMSIZE_KEY: &CStr = c"hw.memsize"; + const IOGPU_WIRED_LIMIT_MB_KEY: &CStr = c"iogpu.wired_limit_mb"; fn parse_env_mebibytes(var: &str) -> Option { std::env::var(var) @@ -49,12 +52,12 @@ impl Default for AllocationPolicy { .and_then(|value| value.trim().parse::().ok()) .map(|mb| mb * 1024 * 1024) } - fn sysctl_u64(name: &[u8]) -> Option { + fn sysctl_u64(name: &CStr) -> Option { use libc::c_void; unsafe { let mut len: usize = 0; if libc::sysctlbyname( - name.as_ptr() as *const libc::c_char, + name.as_ptr(), std::ptr::null_mut(), &mut len as *mut usize, std::ptr::null_mut(), @@ -68,7 +71,7 @@ impl Default for AllocationPolicy { } let mut value: u64 = 0; if libc::sysctlbyname( - name.as_ptr() as *const libc::c_char, + name.as_ptr(), &mut value as *mut u64 as *mut c_void, &mut len as *mut usize, std::ptr::null_mut(), @@ -86,7 +89,7 @@ impl Default for AllocationPolicy { const SYSTEM_RESERVE_FRACTION: usize = 4; // Keep at least 25% for the OS. const SYSTEM_RESERVE_MIN: usize = 2 * 1024 * 1024 * 1024; // 2 GiB floor. - let hw_total = sysctl_u64(b"hw.memsize\0").and_then(|bytes| { + let hw_total = sysctl_u64(HW_MEMSIZE_KEY).and_then(|bytes| { if bytes == 0 || bytes > usize::MAX as u64 { None } else { @@ -100,7 +103,7 @@ impl Default for AllocationPolicy { return None; } - let wired_limit_bytes = sysctl_u64(b"iogpu.wired_limit_mb\0").and_then(|limit_mb| { + let wired_limit_bytes = sysctl_u64(IOGPU_WIRED_LIMIT_MB_KEY).and_then(|limit_mb| { if limit_mb == 0 || limit_mb > usize::MAX as u64 { return None; } From e949fd4aacb204240db974dddf7ae87ce41cb2f3 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 06:38:30 +0800 Subject: [PATCH 06/13] refactor(metal): simplified sysctl_u64 to a single sysctlbyname call --- candle-core/src/metal_backend/device.rs | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 1e84f1ed03..e0fcb9959a 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -55,21 +55,8 @@ impl Default for AllocationPolicy { fn sysctl_u64(name: &CStr) -> Option { use libc::c_void; unsafe { - let mut len: usize = 0; - if libc::sysctlbyname( - name.as_ptr(), - std::ptr::null_mut(), - &mut len as *mut usize, - std::ptr::null_mut(), - 0, - ) != 0 - { - return None; - } - if len == 0 || len > core::mem::size_of::() { - return None; - } let mut value: u64 = 0; + let mut len = core::mem::size_of::(); if libc::sysctlbyname( name.as_ptr(), &mut value as *mut u64 as *mut c_void, @@ -80,7 +67,11 @@ impl Default for AllocationPolicy { { return None; } - Some(value) + if len == 0 { + None + } else { + Some(value) + } } } From 42dccad0f9de68106cfe60efbeaaf6e010e86d67 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 06:40:15 +0800 Subject: [PATCH 07/13] refactor(metal): simplified the wired-limit handling --- candle-core/src/metal_backend/device.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index e0fcb9959a..57544336a3 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -101,12 +101,11 @@ impl Default for AllocationPolicy { (limit_mb as usize).checked_mul(MEBIBYTE) }); - let wired_clamped = wired_limit_bytes.map(|limit| std::cmp::min(limit, hw_total)); - - Some(match wired_clamped { - Some(wired) => std::cmp::min(wired, hw_budget), - None => hw_budget, - }) + if let Some(wired) = wired_limit_bytes { + Some(std::cmp::min(wired, hw_budget)) + } else { + Some(hw_budget) + } } let pending_limit = parse_env_mebibytes("CANDLE_METAL_PENDING_LIMIT_MB") From 04354702488071e35b6f045bcfd9c68705c80afd Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 06:42:59 +0800 Subject: [PATCH 08/13] refactor(metal): droip the 32-bit upper-bound check on hw.memsize --- candle-core/src/metal_backend/device.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 57544336a3..8c721af6ce 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -81,7 +81,7 @@ impl Default for AllocationPolicy { const SYSTEM_RESERVE_MIN: usize = 2 * 1024 * 1024 * 1024; // 2 GiB floor. let hw_total = sysctl_u64(HW_MEMSIZE_KEY).and_then(|bytes| { - if bytes == 0 || bytes > usize::MAX as u64 { + if bytes == 0 { None } else { Some(bytes as usize) @@ -95,7 +95,7 @@ impl Default for AllocationPolicy { } let wired_limit_bytes = sysctl_u64(IOGPU_WIRED_LIMIT_MB_KEY).and_then(|limit_mb| { - if limit_mb == 0 || limit_mb > usize::MAX as u64 { + if limit_mb == 0 { return None; } (limit_mb as usize).checked_mul(MEBIBYTE) From 6d71931ad65b0ed0ff2a58863719c6d04e5e1d6e Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 06:45:02 +0800 Subject: [PATCH 09/13] refactor(metal): hardened env parsing to avoid overflow --- candle-core/src/metal_backend/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 8c721af6ce..77ec8bb9fd 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -50,7 +50,7 @@ impl Default for AllocationPolicy { std::env::var(var) .ok() .and_then(|value| value.trim().parse::().ok()) - .map(|mb| mb * 1024 * 1024) + .and_then(|mb| mb.checked_mul(1024 * 1024)) } fn sysctl_u64(name: &CStr) -> Option { use libc::c_void; From 57204f5e1f078457d4d3449fdc3f8577725a4c4e Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 07:07:37 +0800 Subject: [PATCH 10/13] refactor(metal): refine memory limit heuristics and naming --- candle-core/src/metal_backend/device.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 189e49aeaf..fb69b15483 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -32,8 +32,9 @@ impl DeviceId { #[derive(Clone)] pub(crate) struct AllocationPolicy { - /// Total bytes we can allocate before forcing a sync to reclaim temporaries. - pending_limit_bytes: usize, + /// Maximum number of bytes we allow to be newly allocated since the last + /// synchronization point before forcing a sync to reclaim temporaries. + pending_allocation_bytes_limit: usize, /// Maximum bytes to keep cached for reuse. cache_limit_bytes: usize, } @@ -116,7 +117,7 @@ impl Default for AllocationPolicy { .unwrap_or_else(|| std::cmp::max(pending_limit / 2, 64 * 1024 * 1024)); crate::metal_backend::device::AllocationPolicy { - pending_limit_bytes: pending_limit, + pending_allocation_bytes_limit: pending_limit, cache_limit_bytes: cache_limit, } } @@ -153,7 +154,9 @@ pub struct MetalDevice { pub(crate) kernels: Arc, /// Seed for random number generation. pub(crate) seed: Arc>, - /// Bytes allocated since the last synchronization point. + /// Bytes newly allocated since the last GPU synchronization point. This is + /// compared against `allocation_policy.pending_allocation_bytes_limit` to + /// decide when to force a sync and reclaim temporaries. pub(crate) pending_allocation_bytes: Arc, /// Allocation thresholds and cache budget. pub(crate) allocation_policy: AllocationPolicy, @@ -375,7 +378,7 @@ impl MetalDevice { .pending_allocation_bytes .fetch_add(size, Ordering::AcqRel) .saturating_add(size); - if pending >= self.allocation_policy.pending_limit_bytes { + if pending >= self.allocation_policy.pending_allocation_bytes_limit { // Ensure the GPU processed the backlog so buffers can be reused. self.wait_until_completed()?; self.pending_allocation_bytes.store(0, Ordering::Release); From 5c41d759f78df781ba69594e25d37facd098bf53 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 07:10:06 +0800 Subject: [PATCH 11/13] refactor(metal): extract min cache limit constant --- candle-core/src/metal_backend/device.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index fb69b15483..b2b1b69584 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -44,6 +44,7 @@ impl Default for AllocationPolicy { const DEFAULT_PENDING: usize = 4 * 1024 * 1024 * 1024; // 4 GiB const MIN_PENDING: usize = 512 * 1024 * 1024; // 512 MiB const MAX_PENDING: usize = 12 * 1024 * 1024 * 1024; // 12 GiB + const MIN_CACHE_LIMIT: usize = 64 * 1024 * 1024; // 64 MiB const HW_MEMSIZE_KEY: &CStr = c"hw.memsize"; const IOGPU_WIRED_LIMIT_MB_KEY: &CStr = c"iogpu.wired_limit_mb"; @@ -114,7 +115,7 @@ impl Default for AllocationPolicy { .unwrap_or(DEFAULT_PENDING); let cache_limit = parse_env_mebibytes("CANDLE_METAL_CACHE_LIMIT_MB") - .unwrap_or_else(|| std::cmp::max(pending_limit / 2, 64 * 1024 * 1024)); + .unwrap_or_else(|| std::cmp::max(pending_limit / 2, MIN_CACHE_LIMIT)); crate::metal_backend::device::AllocationPolicy { pending_allocation_bytes_limit: pending_limit, From f3b39b90d3014b425db9a6ea0df290d9538c2176 Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Wed, 19 Nov 2025 07:24:55 +0800 Subject: [PATCH 12/13] refactor(metal): avoid removing empty buffer size keys during cache trim --- candle-core/src/metal_backend/device.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index b2b1b69584..fae264e892 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -234,7 +234,6 @@ impl MetalDevice { } let mut bytes_to_drop = cached_bytes - limit; - let mut empty_keys = Vec::new(); for (size, subbuffers) in buffers.iter_mut() { if bytes_to_drop == 0 { break; @@ -250,12 +249,6 @@ impl MetalDevice { true } }); - if subbuffers.is_empty() { - empty_keys.push(*size); - } - } - for key in empty_keys { - buffers.remove(&key); } Ok(()) } From 56aee4480be2090491f04478588afd2536cedf6d Mon Sep 17 00:00:00 2001 From: TimmyOVO Date: Sun, 23 Nov 2025 10:28:53 +0800 Subject: [PATCH 13/13] refactor(metal): handle memory size types consistently --- candle-core/src/metal_backend/device.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index fae264e892..0a59c34ff4 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -54,7 +54,8 @@ impl Default for AllocationPolicy { .and_then(|value| value.trim().parse::().ok()) .and_then(|mb| mb.checked_mul(1024 * 1024)) } - fn sysctl_u64(name: &CStr) -> Option { + + fn sysctl_usize(name: &CStr) -> Option { use libc::c_void; unsafe { let mut value: u64 = 0; @@ -72,7 +73,7 @@ impl Default for AllocationPolicy { if len == 0 { None } else { - Some(value) + Some(value as usize) } } } @@ -82,13 +83,7 @@ impl Default for AllocationPolicy { const SYSTEM_RESERVE_FRACTION: usize = 4; // Keep at least 25% for the OS. const SYSTEM_RESERVE_MIN: usize = 2 * 1024 * 1024 * 1024; // 2 GiB floor. - let hw_total = sysctl_u64(HW_MEMSIZE_KEY).and_then(|bytes| { - if bytes == 0 { - None - } else { - Some(bytes as usize) - } - })?; + let hw_total = sysctl_usize(HW_MEMSIZE_KEY)?; let reserve = std::cmp::max(hw_total / SYSTEM_RESERVE_FRACTION, SYSTEM_RESERVE_MIN); let hw_budget = hw_total.saturating_sub(reserve); @@ -96,11 +91,12 @@ impl Default for AllocationPolicy { return None; } - let wired_limit_bytes = sysctl_u64(IOGPU_WIRED_LIMIT_MB_KEY).and_then(|limit_mb| { + let wired_limit_bytes = sysctl_usize(IOGPU_WIRED_LIMIT_MB_KEY).and_then(|limit_mb| { if limit_mb == 0 { - return None; + None + } else { + limit_mb.checked_mul(MEBIBYTE) } - (limit_mb as usize).checked_mul(MEBIBYTE) }); if let Some(wired) = wired_limit_bytes {