diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md new file mode 100644 index 0000000000..cccc14b947 --- /dev/null +++ b/MultiTierDataMovement.md @@ -0,0 +1,90 @@ +# Background Data Movement + +In order to reduce the number of online evictions and support asynchronous +promotion - we have added two periodic workers to handle eviction and promotion. + +The diagram below shows a simplified version of how the background evictor +thread (green) is integrated to the CacheLib architecture. + +

+ BackgroundEvictor +

+ +## Background Evictors + +The background evictors scan each class to see if there are objects to move the next (lower) +tier using a given strategy. Here we document the parameters for the different +strategies and general parameters. + +- `backgroundEvictorIntervalMilSec`: The interval that this thread runs for - by default +the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also, +the background evictor thread will be woken up everytime there is a failed allocation (from +a request handling thread) and the current percentage of free memory for the +AllocationClass is lower than `lowEvictionAcWatermark`. This may render the interval parameter +not as important when there are many allocations occuring from request handling threads. + +- `evictorThreads`: The number of background evictors to run - each thread is a assigned +a set of AllocationClasses to scan and evict objects from. Currently, each thread gets +an equal number of classes to scan - but as object size distribution may be unequal - future +versions will attempt to balance the classes among threads. The range is 1 to number of AllocationClasses. +The default is 1. + +- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The +default is 40. Lower range is 10 and the upper range is 1000. Too low and we might not +remove objects at a reasonable rate, too high and it might increase contention with user threads. + +- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any +candidates) + +- `maxEvictionPromotionHotness`: Maximum candidates to consider for eviction. This is similar to `maxEvictionBatch` +but it specifies how many candidates will be taken into consideration, not the actual number of items to evict. +This option can be used to configure duration of critical section on LRU lock. + + +### FreeThresholdStrategy (default) + +- `lowEvictionAcWatermark`: Triggers background eviction thread to run +when this percentage of the AllocationClass is free. +The default is `2.0`, to avoid wasting capacity we don't set this above `10.0`. + +- `highEvictionAcWatermark`: Stop the evictions from an AllocationClass when this +percentage of the AllocationClass is free. The default is `5.0`, to avoid wasting capacity we +don't set this above `10`. + + +## Background Promoters + +The background promoters scan each class to see if there are objects to move to a lower +tier using a given strategy. Here we document the parameters for the different +strategies and general parameters. + +- `backgroundPromoterIntervalMilSec`: The interval that this thread runs for - by default +the background promoter threads will wake up every 10 ms to scan the AllocationClasses for +objects to promote. + +- `promoterThreads`: The number of background promoters to run - each thread is a assigned +a set of AllocationClasses to scan and promote objects from. Currently, each thread gets +an equal number of classes to scan - but as object size distribution may be unequal - future +versions will attempt to balance the classes among threads. The range is `1` to number of AllocationClasses. The default is `1`. + +- `maxProtmotionBatch`: The number of objects to promote in a given promotion call. The +default is 40. Lower range is 10 and the upper range is 1000. Too low and we might not +remove objects at a reasonable rate, too high and it might increase contention with user threads. + +- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any +candidates) + +- `numDuplicateElements`: This allows us to promote items that have existing handles (read-only) since +we won't need to modify the data when a user is done with the data. Therefore, for a short time +the data could reside in both tiers until it is evicted from its current tier. The default is to +not allow this (0). Setting the value to 100 will enable duplicate elements in tiers. + +### Background Promotion Strategy (only one currently) + +- `promotionAcWatermark`: Promote items if there is at least this +percent of free AllocationClasses. Promotion thread will attempt to move `maxPromotionBatch` number of objects +to that tier. The objects are chosen from the head of the LRU. The default is `4.0`. +This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`. +- `maxPromotionBatch`: The number of objects to promote in batch during BG promotion. Analogous to +`maxEvictionBatch`. It's value should be lower to decrease contention on hot items. + diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h new file mode 100644 index 0000000000..b77436635f --- /dev/null +++ b/cachelib/allocator/BackgroundMover-inl.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace facebook { +namespace cachelib { + +template +BackgroundMover::BackgroundMover( + Cache& cache, + std::shared_ptr strategy, + MoverDir direction) + : cache_(cache), strategy_(strategy), direction_(direction) { + if (direction_ == MoverDir::Evict) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndEvictItems; + + } else if (direction_ == MoverDir::Promote) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndPromoteItems; + } +} + +template +BackgroundMover::~BackgroundMover() { + stop(std::chrono::seconds(0)); +} + +template +void BackgroundMover::work() { + try { + checkAndRun(); + } catch (const std::exception& ex) { + XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what()); + } +} + +template +void BackgroundMover::setAssignedMemory( + std::vector&& assignedMemory) { + XLOG(INFO, "Class assigned to background worker:"); + for (auto [tid, pid, cid] : assignedMemory) { + XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid); + } + + mutex.lock_combine([this, &assignedMemory] { + this->assignedMemory_ = std::move(assignedMemory); + }); +} + +// Look for classes that exceed the target memory capacity +// and return those for eviction +template +void BackgroundMover::checkAndRun() { + auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; }); + + unsigned int moves = 0; + std::set classes{}; + auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); + + for (size_t i = 0; i < batches.size(); i++) { + const auto [tid, pid, cid] = assignedMemory[i]; + const auto batch = batches[i]; + + classes.insert(cid); + const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats(); + + if (!batch) { + continue; + } + + // try moving BATCH items from the class in order to reach free target + auto moved = moverFunc(cache_, tid, pid, cid, batch); + moves += moved; + moves_per_class_[tid][pid][cid] += moved; + totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize); + } + + numTraversals.inc(); + numMovedItems.add(moves); + totalClasses.add(classes.size()); +} + +template +BackgroundMoverStats BackgroundMover::getStats() const noexcept { + BackgroundMoverStats stats; + stats.numMovedItems = numMovedItems.get(); + stats.runCount = numTraversals.get(); + stats.totalBytesMoved = totalBytesMoved.get(); + stats.totalClasses = totalClasses.get(); + + return stats; +} + +template +std::map>> +BackgroundMover::getClassStats() const noexcept { + return moves_per_class_; +} + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h new file mode 100644 index 0000000000..1246676d6e --- /dev/null +++ b/cachelib/allocator/BackgroundMover.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/CacheStats.h" +#include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/PeriodicWorker.h" + +namespace facebook { +namespace cachelib { + +// wrapper that exposes the private APIs of CacheType that are specifically +// needed for the cache api +template +struct BackgroundMoverAPIWrapper { + static size_t traverseAndEvictItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndEvictItems(tid, pid, cid, batch); + } + + static size_t traverseAndPromoteItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndPromoteItems(tid, pid, cid, batch); + } +}; + +enum class MoverDir { Evict = 0, Promote }; + +// Periodic worker that evicts items from tiers in batches +// The primary aim is to reduce insertion times for new items in the +// cache +template +class BackgroundMover : public PeriodicWorker { + public: + using Cache = CacheT; + // @param cache the cache interface + // @param strategy the stragey class that defines how objects are + // moved, + // (promoted vs. evicted and how much) + BackgroundMover(Cache& cache, + std::shared_ptr strategy, + MoverDir direction_); + + ~BackgroundMover() override; + + BackgroundMoverStats getStats() const noexcept; + std::map>> + getClassStats() const noexcept; + + void setAssignedMemory( + std::vector&& assignedMemory); + + private: + std::map>> + moves_per_class_; + // cache allocator's interface for evicting + using Item = typename Cache::Item; + + Cache& cache_; + std::shared_ptr strategy_; + MoverDir direction_; + + std::function + moverFunc; + + // implements the actual logic of running the background evictor + void work() override final; + void checkAndRun(); + + AtomicCounter numMovedItems{0}; + AtomicCounter numTraversals{0}; + AtomicCounter totalClasses{0}; + AtomicCounter totalBytesMoved{0}; + + std::vector assignedMemory_; + folly::DistributedMutex mutex; +}; +} // namespace cachelib +} // namespace facebook + +#include "cachelib/allocator/BackgroundMover-inl.h" diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h new file mode 100644 index 0000000000..7706a625a5 --- /dev/null +++ b/cachelib/allocator/BackgroundMoverStrategy.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/Cache.h" + + +namespace facebook { +namespace cachelib { + +struct MemoryDescriptorType { + MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : + tid_(tid), pid_(pid), cid_(cid) {} + TierId tid_; + PoolId pid_; + ClassId cid_; +}; + +// Base class for background eviction strategy. +class BackgroundMoverStrategy { + public: + virtual std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) = 0; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index d64fadc932..87643ff006 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -35,6 +35,7 @@ add_library (cachelib_allocator CCacheManager.cpp ContainerTypes.cpp FreeMemStrategy.cpp + FreeThresholdStrategy.cpp HitsPerSlabStrategy.cpp LruTailAgeStrategy.cpp MarginalHitsOptimizeStrategy.cpp diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index a7a97467ab..589614ee3b 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -96,6 +96,12 @@ class CacheBase { // // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 1b494d15bb..7b0f1de992 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -302,6 +302,18 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + config_.backgroundPromoterThreads); + } } template @@ -383,6 +395,22 @@ CacheAllocator::allocate(PoolId poolId, ttlSecs == 0 ? 0 : creationTime + ttlSecs); } +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? + if (tid == 1) return false; + return getAllocationClassStats(tid, pid, cid).approxFreePercent <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + + template typename CacheAllocator::WriteHandle CacheAllocator::allocateInternalTier(TierId tid, @@ -390,7 +418,8 @@ CacheAllocator::allocateInternalTier(TierId tid, typename Item::Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime) { + uint32_t expiryTime, + bool fromBgThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -404,9 +433,13 @@ CacheAllocator::allocateInternalTier(TierId tid, // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - + void* memory = allocator_[tid]->allocate(pid, requiredSize); - // TODO: Today disableEviction means do not evict from memory (DRAM). + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + // TODO: Today isEvictionDisabled means do not evict from memory (DRAM). // Should we support eviction between memory tiers (e.g. from DRAM to PMEM)? if (memory == nullptr && !config_.isEvictionDisabled()) { memory = findEviction(tid, pid, cid); @@ -454,10 +487,11 @@ CacheAllocator::allocateInternal(PoolId pid, typename Item::Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime) { + uint32_t expiryTime, + bool fromBgThread) { auto tid = 0; /* TODO: consult admission policy */ for(TierId tid = 0; tid < getNumTiers(); ++tid) { - auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime); + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread); if (handle) return handle; } return {}; @@ -1639,7 +1673,7 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( template typename CacheAllocator::WriteHandle CacheAllocator::tryEvictToNextMemoryTier( - TierId tid, PoolId pid, Item& item) { + TierId tid, PoolId pid, Item& item, bool fromBgThread) { if(item.isChainedItem()) return {}; // TODO: We do not support ChainedItem yet if(item.isExpired()) return acquire(&item); @@ -1650,7 +1684,8 @@ CacheAllocator::tryEvictToNextMemoryTier( item.getKey(), item.getSize(), item.getCreationTime(), - item.getExpiryTime()); + item.getExpiryTime(), + fromBgThread); if (newItemHdl) { XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); @@ -1663,12 +1698,52 @@ CacheAllocator::tryEvictToNextMemoryTier( template typename CacheAllocator::WriteHandle -CacheAllocator::tryEvictToNextMemoryTier(Item& item) { +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { auto tid = getTierId(item); auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; - return tryEvictToNextMemoryTier(tid, pid, item); + return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); } +template +bool +CacheAllocator::tryPromoteToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + TierId nextTier = tid; + while (nextTier > 0) { // try to evict down to the next memory tiers + auto toPromoteTier = nextTier - 1; + --nextTier; + + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(toPromoteTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + auto predicate = [&](const Item& item){ + return item.getRefCount() == 0 || config_.numDuplicateElements > 0; + }; + if (moveRegularItemWithSync(item, newItemHdl, predicate)) { + return true; + } + } + } + + return false; +} + +template +bool +CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); +} + + template typename CacheAllocator::RemoveRes CacheAllocator::remove(typename Item::Key key) { @@ -2361,6 +2436,16 @@ PoolId CacheAllocator::addPool( setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } @@ -2910,7 +2995,8 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { oldItem.getKey(), oldItem.getSize(), oldItem.getCreationTime(), - oldItem.getExpiryTime()); + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -3043,14 +3129,15 @@ void CacheAllocator::evictForSlabRelease( template typename CacheAllocator::WriteHandle CacheAllocator::evictNormalItem(Item& item, - bool skipIfTokenInvalid) { + bool skipIfTokenInvalid, + bool fromBgThread) { XDCHECK(item.isMoving()); if (item.isOnlyMoving()) { return WriteHandle{}; } - auto evictHandle = tryEvictToNextMemoryTier(item); + auto evictHandle = tryEvictToNextMemoryTier(item, fromBgThread); if(evictHandle) return evictHandle; auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; @@ -3434,6 +3521,8 @@ bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); return success; } @@ -3698,6 +3787,8 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime(); ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false; ret.reaperStats = getReaperStats(); + ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict); + ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote); ret.numActiveHandles = getNumActiveHandles(); ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_; @@ -3847,6 +3938,64 @@ bool CacheAllocator::startNewReaper( return startNewWorker("Reaper", reaper_, interval, reaperThrottleConfig); } +template +auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid) +{ + std::vector asssignedMemory; + // TODO: for now, only evict from tier 0 + auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds()); + for (const auto pid : pools) { + const auto& mpStats = getPoolByTid(pid,tid).getStats(); + for (const auto cid : mpStats.classIds) { + if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) { + asssignedMemory.emplace_back(tid, pid, cid); + } + } + } + return asssignedMemory; +} + +template +bool CacheAllocator::startNewBackgroundEvictor( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + backgroundEvictor_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy, MoverDir::Evict); + result = result && ret; + + if (result) { + backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0)); + } + } + return result; +} + +template +bool CacheAllocator::startNewBackgroundPromoter( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + XDCHECK(getNumTiers() > 1); + backgroundPromoter_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, strategy, MoverDir::Promote); + result = result && ret; + + if (result) { + backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1)); + } + } + return result; +} + template bool CacheAllocator::stopPoolRebalancer( std::chrono::seconds timeout) { @@ -3874,6 +4023,26 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) { return stopWorker("Reaper", reaper_, timeout); } +template +bool CacheAllocator::stopBackgroundEvictor(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundEvictor_.size(); i++) { + auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout); + result = result && ret; + } + return result; +} + +template +bool CacheAllocator::stopBackgroundPromoter(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundPromoter_.size(); i++) { + auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout); + result = result && ret; + } + return result; +} + template bool CacheAllocator::cleanupStrayShmSegments( const std::string& cacheDir, bool posix /*TODO(SHM_FILE): const std::vector& config */) { diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 02557dfe24..802abb0115 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -39,6 +39,7 @@ #include #pragma GCC diagnostic pop +#include "cachelib/allocator/BackgroundMover.h" #include "cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -659,6 +660,11 @@ class CacheAllocator : public CacheBase { // @return the full usable size for this item uint32_t getUsableSize(const Item& item) const; + // gets the allocation class assigned to BG worker + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory // @@ -1004,6 +1010,11 @@ class CacheAllocator : public CacheBase { // @param reaperThrottleConfig throttling config bool startNewReaper(std::chrono::milliseconds interval, util::Throttler::Config reaperThrottleConfig); + + bool startNewBackgroundPromoter(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); + bool startNewBackgroundEvictor(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); // Stop existing workers with a timeout bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{ @@ -1013,6 +1024,8 @@ class CacheAllocator : public CacheBase { 0}); bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0}); bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0}); // Set pool optimization to either true or false // @@ -1048,6 +1061,10 @@ class CacheAllocator : public CacheBase { return allocator_[currentTier()]->getPool(pid); } + const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final { + return allocator_[tid]->getPool(pid); + } + // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds(); @@ -1098,6 +1115,52 @@ class CacheAllocator : public CacheBase { auto stats = reaper_ ? reaper_->getStats() : ReaperStats{}; return stats; } + + // returns the background mover stats + BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const { + + auto stats = BackgroundMoverStats{}; + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) + stats += bg->getStats(); + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) + stats += bg->getStats(); + } + return stats; + + } + + + std::map>> + getBackgroundMoverClassStats(MoverDir direction) const { + std::map>> stats; + + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } + + return stats; + } + // return the LruType of an item typename MMType::LruType getItemLruType(const Item& item) const; @@ -1393,7 +1456,8 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime); + uint32_t expiryTime, + bool fromBgThread = false); // create a new cache allocation on specific memory tier. // For description see allocateInternal. @@ -1404,7 +1468,8 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime); + uint32_t expiryTime, + bool fromBgThread); // Allocate a chained item // @@ -1668,7 +1733,11 @@ class CacheAllocator : public CacheBase { // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + bool tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + bool tryPromoteToNextMemoryTier(Item& item, bool fromBgThread); // Try to move the item down to the next memory tier // @@ -1676,7 +1745,7 @@ class CacheAllocator : public CacheBase { // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(Item& item); + WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread); size_t memoryTierSize(TierId tid) const; @@ -1797,7 +1866,7 @@ class CacheAllocator : public CacheBase { // // @return last handle for corresponding to item on success. empty handle on // failure. caller can retry if needed. - WriteHandle evictNormalItem(Item& item, bool skipIfTokenInvalid = false); + WriteHandle evictNormalItem(Item& item, bool skipIfTokenInvalid = false, bool fromBgThread = false); // Helper function to evict a child item for slab release // As a side effect, the parent item is also evicted @@ -1827,6 +1896,130 @@ class CacheAllocator : public CacheBase { stats().numSkippedSlabReleases.add(slabsSkipped); } + // exposed for the background evictor to iterate through the memory and evict + // in batch. This should improve insertion path for tiered memory config + size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t evictions = 0; + size_t evictionCandidates = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + mmContainer.withEvictionIterator([&tries, &candidates, &batch, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + if (candidate->getRefCount() == 0 && candidate->markMoving()) { + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto toReleaseHandle = + evictNormalItem(*candidate, true /* skipIfTokenInvalid */, true /* from BG thread */); + auto ref = candidate->unmarkMoving(); + + if (toReleaseHandle || ref == 0u) { + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + + evictions++; + } else { + if (candidate->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + } + + if (toReleaseHandle) { + XDCHECK(toReleaseHandle.get() == candidate); + XDCHECK_EQ(1u, toReleaseHandle->getRefCount()); + + // We manually release the item here because we don't want to + // invoke the Item Handle's destructor which will be decrementing + // an already zero refcount, which will throw exception + auto& itemToRelease = *toReleaseHandle.release(); + + // Decrementing the refcount because we want to recycle the item + const auto ref = decRef(itemToRelease); + XDCHECK_EQ(0u, ref); + + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } else if (ref == 0u) { + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + + return evictions; + } + + size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t promotions = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + + mmContainer.withPromotionIterator([&tries, &candidates, &batch, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + + // TODO: only allow it for read-only items? + // or implement mvcc + if (!candidate->isExpired() && candidate->markMoving()) { + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto promoted = tryPromoteToNextMemoryTier(*candidate, true); + auto ref = candidate->unmarkMoving(); + if (promoted) + promotions++; + + if (ref == 0u) { + // stats_.promotionMoveSuccess.inc(); + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + + return promotions; + } + // returns true if nvmcache is enabled and we should write this item to // nvmcache. bool shouldWriteToNvmCache(const Item& item); @@ -2153,6 +2346,10 @@ class CacheAllocator : public CacheBase { // free memory monitor std::unique_ptr memMonitor_; + + // background evictor + std::vector>> backgroundEvictor_; + std::vector>> backgroundPromoter_; // check whether a pool is a slabs pool std::array isCompactCachePool_{}; @@ -2214,6 +2411,7 @@ class CacheAllocator : public CacheBase { // Make this friend to give access to acquire and release friend ReadHandle; friend ReaperAPIWrapper; + friend BackgroundMoverAPIWrapper; friend class CacheAPIWrapperForNvm; friend class FbInternalRuntimeUpdateWrapper; friend class objcache2::ObjectCache; diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index c0a70139ce..4060ca2eeb 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -31,6 +31,7 @@ #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/NvmAdmissionPolicy.h" #include "cachelib/allocator/PoolOptimizeStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/allocator/RebalanceStrategy.h" #include "cachelib/allocator/Util.h" #include "cachelib/common/EventInterface.h" @@ -267,6 +268,16 @@ class CacheAllocatorConfig { std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); // This enables an optimization for Pool rebalancing and resizing. // The rough idea is to ensure only the least useful items are evicted when @@ -354,6 +365,17 @@ class CacheAllocatorConfig { poolOptimizeStrategy != nullptr; } + // @return whether background evictor thread is enabled + bool backgroundEvictorEnabled() const noexcept { + return backgroundEvictorInterval.count() > 0 && + backgroundEvictorStrategy != nullptr; + } + + bool backgroundPromoterEnabled() const noexcept { + return backgroundPromoterInterval.count() > 0 && + backgroundPromoterStrategy != nullptr; + } + // @return whether memory monitor is enabled bool memMonitoringEnabled() const noexcept { return memMonitorConfig.mode != MemoryMonitor::Disabled && @@ -464,6 +486,16 @@ class CacheAllocatorConfig { // The slab release process is considered as being stuck if it does not // make any progress for the below threshold std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)}; + + // rebalance to avoid alloc fialures. + std::shared_ptr backgroundEvictorStrategy; + std::shared_ptr backgroundPromoterStrategy; + // time interval to sleep between runs of the background evictor + std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}}; + std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}}; + + size_t backgroundEvictorThreads{1}; + size_t backgroundPromoterThreads{1}; // time interval to sleep between iterations of pool size optimization, // for regular pools and compact caches @@ -603,6 +635,25 @@ class CacheAllocatorConfig { // If true, we will delay worker start until user explicitly calls // CacheAllocator::startCacheWorkers() bool delayCacheWorkersStart{false}; + + // see MultiTierDataMovement.md + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{1}; + uint64_t minPromotionBatch{1}; + + uint64_t maxEvictionPromotionHotness{60}; + friend CacheT; @@ -951,6 +1002,26 @@ CacheAllocatorConfig& CacheAllocatorConfig::enablePoolRebalancing( return *this; } +template +CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundEvictor( + std::shared_ptr strategy, + std::chrono::milliseconds interval, size_t evictorThreads) { + backgroundEvictorStrategy = strategy; + backgroundEvictorInterval = interval; + backgroundEvictorThreads = evictorThreads; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundPromoter( + std::shared_ptr strategy, + std::chrono::milliseconds interval, size_t promoterThreads) { + backgroundPromoterStrategy = strategy; + backgroundPromoterInterval = interval; + backgroundPromoterThreads = promoterThreads; + return *this; +} + template CacheAllocatorConfig& CacheAllocatorConfig::enablePoolResizing( std::shared_ptr resizeStrategy, diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index edd1d8a4cb..df718ab8c3 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -304,6 +304,27 @@ struct ReaperStats { uint64_t avgTraversalTimeMs{0}; }; +// Mover Stats +struct BackgroundMoverStats { + // the number of items this worker moved by looking at pools/classes stats + uint64_t numMovedItems{0}; + // number of times we went executed the thread //TODO: is this def correct? + uint64_t runCount{0}; + // total number of classes + uint64_t totalClasses{0}; + // eviction size + uint64_t totalBytesMoved{0}; + + BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) { + numMovedItems += rhs.numMovedItems; + runCount += rhs.runCount; + totalClasses += rhs.totalClasses; + totalBytesMoved += rhs.totalBytesMoved; + return *this; + } +}; + + // CacheMetadata type to export struct CacheMetadata { // allocator_version @@ -324,6 +345,11 @@ struct Stats; // Stats that apply globally in cache and // the ones that are aggregated over all pools struct GlobalCacheStats { + // background eviction stats + BackgroundMoverStats evictionStats; + + BackgroundMoverStats promotionStats; + // number of calls to CacheAllocator::find uint64_t numCacheGets{0}; diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp new file mode 100644 index 0000000000..1311d678fb --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/FreeThresholdStrategy.h" + +#include + +namespace facebook { +namespace cachelib { + +FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark, + double highEvictionAcWatermark, + uint64_t maxEvictionBatch, + uint64_t minEvictionBatch) + : lowEvictionAcWatermark(lowEvictionAcWatermark), + highEvictionAcWatermark(highEvictionAcWatermark), + maxEvictionBatch(maxEvictionBatch), + minEvictionBatch(minEvictionBatch) {} + +std::vector FreeThresholdStrategy::calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) { + std::vector batches{}; + for (auto [tid, pid, cid] : acVec) { + auto stats = cache.getAllocationClassStats(tid, pid, cid); + if (stats.approxFreePercent >= highEvictionAcWatermark) { + batches.push_back(0); + } else { + auto toFreeMemPercent = highEvictionAcWatermark - stats.approxFreePercent; + auto toFreeItems = static_cast( + toFreeMemPercent * stats.memorySize / stats.allocSize); + batches.push_back(toFreeItems); + } + } + + if (batches.size() == 0) { + return batches; + } + + auto maxBatch = *std::max_element(batches.begin(), batches.end()); + if (maxBatch == 0) + return batches; + + std::transform( + batches.begin(), batches.end(), batches.begin(), [&](auto numItems) { + if (numItems == 0) { + return 0UL; + } + + auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch; + if (cappedBatchSize < minEvictionBatch) + return minEvictionBatch; + else + return cappedBatchSize; + }); + + return batches; +} + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h new file mode 100644 index 0000000000..94316bfe82 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/Cache.h" + +namespace facebook { +namespace cachelib { + +// Base class for background mover strategy. +class FreeThresholdStrategy : public BackgroundMoverStrategy { + public: + FreeThresholdStrategy(double lowEvictionAcWatermark, + double highEvictionAcWatermark, + uint64_t maxEvictionBatch, + uint64_t minEvictionBatch); + ~FreeThresholdStrategy() {} + + std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVecs); + + private: + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + uint64_t maxEvictionBatch{40}; + uint64_t minEvictionBatch{5}; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/MMLru-inl.h b/cachelib/allocator/MMLru-inl.h index 25751f188b..ab35030d01 100644 --- a/cachelib/allocator/MMLru-inl.h +++ b/cachelib/allocator/MMLru-inl.h @@ -227,6 +227,15 @@ MMLru::Container::withEvictionIterator(F&& fun) { }); } +template T::*HookPtr> +template +void +MMLru::Container::withPromotionIterator(F&& fun) { + lruMutex_->lock_combine([this, &fun]() { + fun(Iterator{LockHolder{}, lru_.begin()}); + }); +} + template T::*HookPtr> void MMLru::Container::ensureNotInsertionPoint(T& node) noexcept { // If we are removing the insertion point node, grow tail before we remove diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index 0ba27db3a4..fed847f1ae 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -337,6 +337,9 @@ class MMLru { template void withEvictionIterator(F&& f); + template + void withPromotionIterator(F&& f); + // get copy of current config Config getConfig() const; diff --git a/cachelib/allocator/MMTinyLFU-inl.h b/cachelib/allocator/MMTinyLFU-inl.h index f4420177e1..09f4ba6dba 100644 --- a/cachelib/allocator/MMTinyLFU-inl.h +++ b/cachelib/allocator/MMTinyLFU-inl.h @@ -228,6 +228,14 @@ MMTinyLFU::Container::withEvictionIterator(F&& fun) { fun(Iterator{LockHolder{}, *this}); } +template T::*HookPtr> +template +void +MMTinyLFU::Container::withPromotionIterator(F&& fun) { + throw std::runtime_error("Not supported"); +} + + template T::*HookPtr> void MMTinyLFU::Container::removeLocked(T& node) noexcept { diff --git a/cachelib/allocator/MMTinyLFU.h b/cachelib/allocator/MMTinyLFU.h index 40886d53af..0f0a245064 100644 --- a/cachelib/allocator/MMTinyLFU.h +++ b/cachelib/allocator/MMTinyLFU.h @@ -495,6 +495,9 @@ class MMTinyLFU { // iterator passed as parameter. template void withEvictionIterator(F&& f); + + template + void withPromotionIterator(F&& f); // for saving the state of the lru // diff --git a/cachelib/allocator/PromotionStrategy.h b/cachelib/allocator/PromotionStrategy.h new file mode 100644 index 0000000000..2347def5f9 --- /dev/null +++ b/cachelib/allocator/PromotionStrategy.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/Cache.h" + +namespace facebook { +namespace cachelib { + +// Base class for background eviction strategy. +class PromotionStrategy : public BackgroundMoverStrategy { + public: + PromotionStrategy(uint64_t promotionAcWatermark, + uint64_t maxPromotionBatch, + uint64_t minPromotionBatch) + : promotionAcWatermark(promotionAcWatermark), + maxPromotionBatch(maxPromotionBatch), + minPromotionBatch(minPromotionBatch) {} + ~PromotionStrategy() {} + + std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) { + std::vector batches{}; + for (auto [tid, pid, cid] : acVec) { + XDCHECK(tid > 0); + auto stats = cache.getAllocationClassStats(tid - 1, pid, cid); + if (stats.approxFreePercent < promotionAcWatermark) + batches.push_back(0); + else { + auto maxPossibleItemsToPromote = static_cast( + (promotionAcWatermark - stats.approxFreePercent) * + stats.memorySize / stats.allocSize); + batches.push_back(maxPossibleItemsToPromote); + } + } + + if (batches.size() == 0) { + return batches; + } + + auto maxBatch = *std::max_element(batches.begin(), batches.end()); + if (maxBatch == 0) + return batches; + + std::transform( + batches.begin(), batches.end(), batches.begin(), [&](auto numItems) { + if (numItems == 0) { + return 0UL; + } + + auto cappedBatchSize = maxPromotionBatch * numItems / maxBatch; + if (cappedBatchSize < minPromotionBatch) + return minPromotionBatch; + else + return cappedBatchSize; + }); + + return batches; + } + + private: + double promotionAcWatermark{4.0}; + uint64_t maxPromotionBatch{40}; + uint64_t minPromotionBatch{5}; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index d378522b22..78604bc765 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -26,6 +26,7 @@ using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; TEST_F(LruAllocatorMemoryTiersTest, MultiTiersFromFileInvalid) { this->testMultiTiersFormFileInvalid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersFromFileValid) { this->testMultiTiersFromFileValid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersNumaBindingsSysVValid) { this->testMultiTiersNumaBindingsSysVValid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersNumaBindingsPosixValid) { this->testMultiTiersNumaBindingsPosixValid(); } diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index 16e1f88728..9501a2a6e0 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -19,6 +19,8 @@ #include "cachelib/allocator/CacheAllocatorConfig.h" #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/TestBase.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" namespace facebook { namespace cachelib { @@ -62,6 +64,61 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT(handle != nullptr); ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } + + void testMultiTiersBackgroundMovers() { + typename AllocatorT::Config config; + config.setCacheSize(10 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.usePosixForShm(); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1), + MemoryTierCacheConfig::fromFile("/tmp/b" + std::to_string(::getpid())) + .setRatio(1) + }); + config.enableBackgroundEvictor(std::make_shared(2, 10, 100, 40), + std::chrono::milliseconds(10),1); + config.enableBackgroundPromoter(std::make_shared(5, 4, 2), + std::chrono::milliseconds(10),1); + + auto allocator = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(allocator != nullptr); + const size_t numBytes = allocator->getCacheMemoryStats().cacheSize; + + auto poolId = allocator->addPool("default", numBytes); + + const unsigned int keyLen = 100; + std::vector sizes = {100}; + this->fillUpPoolUntilEvictions(*allocator, poolId, sizes, keyLen); + + const auto key = this->getRandomNewKey(*allocator, keyLen); + auto handle = util::allocateAccessible(*allocator, poolId, key, sizes[0]); + ASSERT_NE(nullptr, handle); + + const uint8_t cid = allocator->getAllocInfo(handle->getMemory()).classId; + auto stats = allocator->getGlobalCacheStats(); + auto slabStats = allocator->getAllocationClassStats(0,0,cid); + const auto& mpStats = allocator->getPoolByTid(poolId, 0).getStats(); + //cache is 10MB should move about 1MB to reach 10% free + uint32_t approxEvict = (1024*1024)/mpStats.acStats.at(cid).allocSize; + while (stats.evictionStats.numMovedItems < approxEvict*0.95 && slabStats.approxFreePercent >= 9.5) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + stats = allocator->getGlobalCacheStats(); + slabStats = allocator->getAllocationClassStats(0,0,cid); + } + ASSERT_GE(slabStats.approxFreePercent,9.5); + + auto perclassEstats = allocator->getBackgroundMoverClassStats(MoverDir::Evict); + auto perclassPstats = allocator->getBackgroundMoverClassStats(MoverDir::Promote); + + ASSERT_GE(stats.evictionStats.numMovedItems,1); + ASSERT_GE(stats.evictionStats.runCount,1); + ASSERT_GE(stats.promotionStats.numMovedItems,1); + + ASSERT_GE(perclassEstats[0][0][cid], 1); + ASSERT_GE(perclassPstats[1][0][cid], 1); + + } void testMultiTiersValidMixed() { typename AllocatorT::Config config; diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index 89721f3589..420e77885a 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -33,6 +33,8 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { const std::string getCacheName() const override { return cacheName; } bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } + //TODO: support tiers + const MemoryPool& getPoolByTid(PoolId, TierId tid) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } AllocationClassBaseStat getAllocationClassStats(TierId tid, PoolId, diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index 383355c184..a236fe0f75 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -46,6 +46,16 @@ Cache::Cache(const CacheConfig& config, config_.getRebalanceStrategy(), std::chrono::seconds(config_.poolRebalanceIntervalSec)); + allocatorConfig_.enableBackgroundEvictor( + config_.getBackgroundEvictorStrategy(), + std::chrono::milliseconds(config_.backgroundEvictorIntervalMilSec), + config_.evictorThreads); + + allocatorConfig_.enableBackgroundPromoter( + config_.getBackgroundPromoterStrategy(), + std::chrono::milliseconds(config_.backgroundPromoterIntervalMilSec), + config_.promoterThreads); + if (config_.moveOnSlabRelease && movingSync != nullptr) { allocatorConfig_.enableMovingOnSlabRelease( [](Item& oldItem, Item& newItem, Item* parentPtr) { @@ -100,6 +110,12 @@ Cache::Cache(const CacheConfig& config, } }); + allocatorConfig_.maxEvictionBatch = config_.maxEvictionBatch; + allocatorConfig_.maxPromotionBatch = config_.maxPromotionBatch; + allocatorConfig_.minEvictionBatch = config_.minEvictionBatch; + allocatorConfig_.minPromotionBatch = config_.minPromotionBatch; + allocatorConfig_.maxEvictionPromotionHotness = config_.maxEvictionPromotionHotness; + if (config_.enableItemDestructorCheck) { auto removeCB = [&](const typename Allocator::DestructorData& data) { if (!itemRecords_.validate(data)) { @@ -611,6 +627,21 @@ Stats Cache::getStats() const { ret.slabsApproxFreePercentages = cache_->getCacheMemoryStats().slabsApproxFreePercentages; ret.allocationClassStats = allocationClassStats; + + ret.backgndEvicStats.nEvictedItems = + cacheStats.evictionStats.numMovedItems; + ret.backgndEvicStats.nTraversals = + cacheStats.evictionStats.runCount; + ret.backgndEvicStats.nClasses = + cacheStats.evictionStats.totalClasses; + ret.backgndEvicStats.evictionSize = + cacheStats.evictionStats.totalBytesMoved; + + ret.backgndPromoStats.nPromotedItems = + cacheStats.promotionStats.numMovedItems; + ret.backgndPromoStats.nTraversals = + cacheStats.promotionStats.runCount; + ret.numEvictions = aggregate.numEvictions(); ret.numItems = aggregate.numItems(); ret.evictAttempts = cacheStats.evictionAttempts; @@ -663,6 +694,9 @@ Stats Cache::getStats() const { ret.nvmCounters = cache_->getNvmCacheStatsMap(); } + ret.backgroundEvictionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Evict); + ret.backgroundPromotionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Promote); + // nvm stats from navy if (!isRamOnly() && !navyStats.empty()) { auto lookup = [&navyStats](const std::string& key) { diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 5627b93556..7568cae954 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -26,7 +26,33 @@ DECLARE_string(report_memory_usage_stats); namespace facebook { namespace cachelib { namespace cachebench { + +struct BackgroundEvictionStats { + // the number of items this worker evicted by looking at pools/classes stats + uint64_t nEvictedItems{0}; + + // number of times we went executed the thread //TODO: is this def correct? + uint64_t nTraversals{0}; + + // number of classes + uint64_t nClasses{0}; + + // size of evicted items + uint64_t evictionSize{0}; +}; + +struct BackgroundPromotionStats { + // the number of items this worker evicted by looking at pools/classes stats + uint64_t nPromotedItems{0}; + + // number of times we went executed the thread //TODO: is this def correct? + uint64_t nTraversals{0}; +}; + struct Stats { + BackgroundEvictionStats backgndEvicStats; + BackgroundPromotionStats backgndPromoStats; + uint64_t numEvictions{0}; uint64_t numItems{0}; @@ -110,6 +136,9 @@ struct Stats { // cachebench. std::unordered_map nvmCounters; + std::map>> backgroundEvictionClasses; + std::map>> backgroundPromotionClasses; + // errors from the nvm engine. std::unordered_map nvmErrors; @@ -130,6 +159,16 @@ struct Stats { << std::endl; out << folly::sformat("RAM Evictions : {:,}", numEvictions) << std::endl; + auto foreachAC = [&](auto &map, auto cb) { + for (auto &tidStats : map) { + for (auto &pidStat : tidStats.second) { + for (auto &cidStat : pidStat.second) { + cb(tidStats.first, pidStat.first, cidStat.first, cidStat.second); + } + } + } + }; + for (auto pid = 0U; pid < poolUsageFraction.size(); pid++) { out << folly::sformat("Fraction of pool {:,} used : {:.2f}", pid, poolUsageFraction[pid]) @@ -186,6 +225,10 @@ struct Stats { }); } + out << folly::sformat("Tier 0 Background Evicted Items : {:,}", + backgndEvicStats.nEvictedItems) << std::endl; + out << folly::sformat("Tier 0 Background Traversals : {:,}", + backgndEvicStats.nTraversals) << std::endl; if (numCacheGets > 0) { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) @@ -216,6 +259,22 @@ struct Stats { } } + if (!backgroundEvictionClasses.empty() && backgndEvicStats.nEvictedItems > 0 ) { + out << "== Class Background Eviction Counters Map ==" << std::endl; + foreachAC(backgroundEvictionClasses, [&](auto tid, auto pid, auto cid, auto evicted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}", + tid, pid, cid, evicted) << std::endl; + }); + } + + if (!backgroundPromotionClasses.empty() && backgndPromoStats.nPromotedItems > 0) { + out << "== Class Background Promotion Counters Map ==" << std::endl; + foreachAC(backgroundPromotionClasses, [&](auto tid, auto pid, auto cid, auto promoted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}", + tid, pid, cid, promoted) << std::endl; + }); + } + if (numNvmGets > 0 || numNvmDeletes > 0 || numNvmPuts > 0) { const double ramHitRatio = invertPctFn(numCacheGetMiss, numCacheGets); const double nvmHitRatio = invertPctFn(numNvmGetMiss, numNvmGets); diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index 29cd9cb6a3..929913cafe 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -19,6 +19,8 @@ #include "cachelib/allocator/HitsPerSlabStrategy.h" #include "cachelib/allocator/LruTailAgeStrategy.h" #include "cachelib/allocator/RandomStrategy.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" namespace facebook { namespace cachelib { @@ -28,6 +30,9 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, cacheDir); JSONSetVal(configJson, cacheSizeMB); JSONSetVal(configJson, poolRebalanceIntervalSec); + JSONSetVal(configJson, backgroundEvictorIntervalMilSec); + JSONSetVal(configJson, backgroundPromoterIntervalMilSec); + JSONSetVal(configJson, backgroundEvictorStrategy); JSONSetVal(configJson, moveOnSlabRelease); JSONSetVal(configJson, rebalanceStrategy); JSONSetVal(configJson, rebalanceMinSlabs); @@ -92,9 +97,26 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, nvmAdmissionRetentionTimeThreshold); JSONSetVal(configJson, customConfigJson); - + + + JSONSetVal(configJson, lowEvictionAcWatermark); + JSONSetVal(configJson, highEvictionAcWatermark); + JSONSetVal(configJson, minAcAllocationWatermark); + JSONSetVal(configJson, maxAcAllocationWatermark); + JSONSetVal(configJson, numDuplicateElements); + JSONSetVal(configJson, syncPromotion); + JSONSetVal(configJson, evictorThreads); + JSONSetVal(configJson, promoterThreads); + + JSONSetVal(configJson, promotionAcWatermark); JSONSetVal(configJson, persistedCacheDir); JSONSetVal(configJson, usePosixShm); + JSONSetVal(configJson, maxEvictionBatch); + JSONSetVal(configJson, maxPromotionBatch); + JSONSetVal(configJson, minEvictionBatch); + JSONSetVal(configJson, minPromotionBatch); + JSONSetVal(configJson, maxEvictionPromotionHotness); + if (configJson.count("memoryTiers")) { for (auto& it : configJson["memoryTiers"]) { memoryTierConfigs.push_back(MemoryTierConfig(it).getMemoryTierCacheConfig()); @@ -103,7 +125,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { // if you added new fields to the configuration, update the JSONSetVal // to make them available for the json configs and increment the size // below - checkCorrectSize(); + checkCorrectSize(); if (numPools != poolSizes.size()) { throw std::invalid_argument(folly::sformat( @@ -138,10 +160,23 @@ MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, file); JSONSetVal(configJson, ratio); JSONSetVal(configJson, memBindNodes); - checkCorrectSize(); } +std::shared_ptr CacheConfig::getBackgroundEvictorStrategy() const { + if (backgroundEvictorIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(lowEvictionAcWatermark, highEvictionAcWatermark, maxEvictionBatch, minEvictionBatch); +} + +std::shared_ptr CacheConfig::getBackgroundPromoterStrategy() const { + if (backgroundPromoterIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(promotionAcWatermark, maxPromotionBatch, minPromotionBatch); +} + static bool starts_with() {return true;} std::vector MemoryTierConfig::parseNumaNodes() { diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index 7a8c9020b0..c1b18df670 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -20,6 +20,7 @@ #include "cachelib/allocator/CacheAllocator.h" #include "cachelib/allocator/RebalanceStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/cachebench/util/JSONConfig.h" #include "cachelib/common/Ticker.h" #include "cachelib/navy/common/Device.h" @@ -78,7 +79,10 @@ struct CacheConfig : public JSONConfig { uint64_t cacheSizeMB{0}; uint64_t poolRebalanceIntervalSec{0}; + uint64_t backgroundEvictorIntervalMilSec{0}; + uint64_t backgroundPromoterIntervalMilSec{0}; std::string rebalanceStrategy; + std::string backgroundEvictorStrategy; uint64_t rebalanceMinSlabs{1}; double rebalanceDiffRatio{0.25}; bool moveOnSlabRelease{false}; @@ -256,6 +260,27 @@ struct CacheConfig : public JSONConfig { // eviction-age is more than this threshold. 0 means no threshold uint32_t nvmAdmissionRetentionTimeThreshold{0}; + // See BackgroundMovers.md for complete description + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double minAcAllocationWatermark{0.0}; + double maxAcAllocationWatermark{0.0}; + + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{5}; + uint64_t minPromotionBatch{5}; + + uint64_t maxEvictionPromotionHotness{60}; + // // Options below are not to be populated with JSON // @@ -287,6 +312,8 @@ struct CacheConfig : public JSONConfig { CacheConfig() {} std::shared_ptr getRebalanceStrategy() const; + std::shared_ptr getBackgroundEvictorStrategy() const; + std::shared_ptr getBackgroundPromoterStrategy() const; }; } // namespace cachebench } // namespace cachelib