diff --git a/.github/workflows/build-cachelib-centos.yml b/.github/workflows/build-cachelib-centos-long.yml
similarity index 86%
rename from .github/workflows/build-cachelib-centos.yml
rename to .github/workflows/build-cachelib-centos-long.yml
index 3b071a186a..92165f603b 100644
--- a/.github/workflows/build-cachelib-centos.yml
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -1,7 +1,8 @@
name: build-cachelib-centos-latest
on:
schedule:
- - cron: '30 5 * * 1,4'
+ - cron: '0 7 * * *'
+
jobs:
build-cachelib-centos8-latest:
name: "CentOS/latest - Build CacheLib with all dependencies"
@@ -33,3 +34,6 @@ jobs:
uses: actions/checkout@v2
- name: "build CacheLib using build script"
run: ./contrib/build.sh -j -v -T
+ - name: "run tests"
+ timeout-minutes: 60
+ run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
index a2ae44a569..5bc3ad3c70 100644
--- a/.github/workflows/build-cachelib-debian.yml
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -1,7 +1,8 @@
name: build-cachelib-debian-10
on:
schedule:
- - cron: '30 5 * * 2,6'
+ - cron: '30 5 * * 0,3'
+
jobs:
build-cachelib-debian-10:
name: "Debian/Buster - Build CacheLib with all dependencies"
@@ -37,3 +38,6 @@ jobs:
uses: actions/checkout@v2
- name: "build CacheLib using build script"
run: ./contrib/build.sh -j -v -T
+ - name: "run tests"
+ timeout-minutes: 60
+ run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..be28bc233c
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+ push:
+ pull_request:
+
+jobs:
+ build-cachelib-docker:
+ name: "CentOS/latest - Build CacheLib with all dependencies"
+ runs-on: ubuntu-latest
+ env:
+ REPO: cachelib
+ GITHUB_REPO: intel/CacheLib
+ CONTAINER_REG: ghcr.io/pmem/cachelib
+ CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+ CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+ FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+ HOST_WORKDIR: ${{ github.workspace }}
+ WORKDIR: docker
+ IMG_VER: devel
+ strategy:
+ matrix:
+ CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+ steps:
+ - name: "System Information"
+ run: |
+ echo === uname ===
+ uname -a
+ echo === /etc/os-release ===
+ cat /etc/os-release
+ echo === df -hl ===
+ df -hl
+ echo === free -h ===
+ free -h
+ echo === top ===
+ top -b -n1 -1 -Eg || timeout 1 top -b -n1
+ echo === env ===
+ env
+ echo === gcc -v ===
+ gcc -v
+ - name: "checkout sources"
+ uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Pull the image or rebuild and push it
+ run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+ - name: Run the build
+ run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/.github/workflows/build-cachelib.yml b/.github/workflows/build-cachelib.yml
deleted file mode 100644
index 15161c40e0..0000000000
--- a/.github/workflows/build-cachelib.yml
+++ /dev/null
@@ -1,147 +0,0 @@
-# NOTES:
-# 1. While Github-Actions enables cache of dependencies,
-# Facebook's projects (folly,fizz,wangle,fbthrift)
-# are fast-moving targets - so we always checkout the latest version
-# (as opposed to using gitactions cache, which is recommended in the
-# documentation).
-#
-# 2. Using docker containers to build on CentOS and Debian,
-# Specifically CentOS v8.1.1911 as that
-# version is closest to Facebook's internal dev machines.
-#
-# 3. When using docker containers we install 'sudo',
-# as the docker images are typically very minimal and without
-# 'sudo', while the ./contrib/ scripts use sudo.
-#
-# 4. When using the docker containers we install 'git'
-# BEFORE getting the CacheLib source code (with the 'checkout' action).
-# Otherwise, the 'checkout@v2' action script falls back to downloading
-# the git repository files only, without the ".git" directory.
-# We need the ".git" directory to updating the git-submodules
-# (folly/wangle/fizz/fbthrift). See:
-# https://github.com/actions/checkout/issues/126#issuecomment-570288731
-#
-# 5. To reduce less-critical (and yet frequent) rebuilds, the jobs
-# check the author of the commit, and SKIP the build if
-# the author is "svcscm". These commits are automatic updates
-# for the folly/fbthrift git-submodules, and can happen several times a day.
-# While there is a possiblity that updating the git-submodules breaks
-# CacheLib, it is less likely, and will be detected once an actual
-# code change commit triggers a full build.
-# e.g. https://github.com/facebookincubator/CacheLib/commit/9372a82190dd71a6e2bcb668828cfed9d1bd25c1
-#
-# 6. The 'if' condition checking the author name of the commit (see #5 above)
-# uses github actions metadata variable:
-# 'github.event.head_commit.author.name'
-# GitHub have changed in the past the metadata structure and broke
-# such conditions. If you need to debug the metadata values,
-# see the "dummy-show-github-event" job below.
-# E.g. https://github.blog/changelog/2019-10-16-changes-in-github-actions-push-event-payload/
-# As of Jan-2021, the output is:
-# {
-# "author": {
-# "email": "mimi@moo.moo",
-# "name": "mimi"
-# },
-# "committer": {
-# "email": "assafgordon@gmail.com",
-# "name": "Assaf Gordon",
-# "username": "agordon"
-# },
-# "distinct": true,
-# "id": "6c3aab0970f4a07cc2af7658756a6ef9d82f3276",
-# "message": "gitactions: test",
-# "timestamp": "2021-01-26T11:11:57-07:00",
-# "tree_id": "741cd1cb802df84362a51e5d01f28788845d08b7",
-# "url": "https://github.com/agordon/CacheLib/commit/6c3aab0970f4a07cc2af7658756a6ef9d82f3276"
-# }
-#
-# 7. When checking the commit's author name, we use '...author.name',
-# NOT '...author.username' - because the 'svcscm' author does not
-# have a github username (see the 'mimi' example above).
-#
-
-name: build-cachelib
-on: [push]
-jobs:
- dummy-show-github-event:
- name: "Show GitHub Action event.head_commit variable"
- runs-on: ubuntu-latest
- steps:
- - name: "GitHub Variable Content"
- env:
- CONTENT: ${{ toJSON(github.event.head_commit) }}
- run: echo "$CONTENT"
-
-
- build-cachelib-centos8-1-1911:
- if: "!contains(github.event.head_commit.author.name, 'svcscm')"
- name: "CentOS/8.1.1911 - Build CacheLib with all dependencies"
- runs-on: ubuntu-latest
- # Docker container image name
- container: "centos:8.1.1911"
- steps:
- - name: "update packages"
- # stock centos has a problem with CMAKE, fails with:
- # "cmake: symbol lookup error: cmake: undefined symbol: archive_write_add_filter_zstd"
- # updating solves it
- run: dnf update -y
- - name: "install sudo,git"
- run: dnf install -y sudo git cmake gcc
- - name: "System Information"
- run: |
- echo === uname ===
- uname -a
- echo === /etc/os-release ===
- cat /etc/os-release
- echo === df -hl ===
- df -hl
- echo === free -h ===
- free -h
- echo === top ===
- top -b -n1 -1 -Eg || timeout 1 top -b -n1
- echo === env ===
- env
- echo === gcc -v ===
- gcc -v
- - name: "checkout sources"
- uses: actions/checkout@v2
- - name: "Install Prerequisites"
- run: ./contrib/build.sh -S -B
- - name: "Test: update-submodules"
- run: ./contrib/update-submodules.sh
- - name: "Install dependency: zstd"
- run: ./contrib/build-package.sh -j -v -i zstd
- - name: "Install dependency: googleflags"
- run: ./contrib/build-package.sh -j -v -i googleflags
- - name: "Install dependency: googlelog"
- run: ./contrib/build-package.sh -j -v -i googlelog
- - name: "Install dependency: googletest"
- run: ./contrib/build-package.sh -j -v -i googletest
- - name: "Install dependency: sparsemap"
- run: ./contrib/build-package.sh -j -v -i sparsemap
- - name: "Install dependency: fmt"
- run: ./contrib/build-package.sh -j -v -i fmt
- - name: "Install dependency: folly"
- run: ./contrib/build-package.sh -j -v -i folly
- - name: "Install dependency: fizz"
- run: ./contrib/build-package.sh -j -v -i fizz
- - name: "Install dependency: wangle"
- run: ./contrib/build-package.sh -j -v -i wangle
- - name: "Install dependency: fbthrift"
- run: ./contrib/build-package.sh -j -v -i fbthrift
- - name: "build CacheLib"
- # Build cachelib in debug mode (-d) and with all tests (-t)
- run: ./contrib/build-package.sh -j -v -i -d -t cachelib
- - uses: actions/upload-artifact@v2
- if: failure()
- with:
- name: cachelib-cmake-logs
- path: |
- build-cachelib/CMakeFiles/*.log
- build-cachelib/CMakeCache.txt
- build-cachelib/Makefile
- build-cachelib/**/Makefile
- if-no-files-found: warn
- retention-days: 1
-
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 4b4897b610..90c8d739c6 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -1,6 +1,6 @@
# From: https://github.com/marketplace/actions/clang-format-check#multiple-paths
name: clang-format Check
-on: [pull_request]
+on: []
jobs:
formatting-check:
name: Formatting Check
diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..cccc14b947
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,90 @@
+# Background Data Movement
+
+In order to reduce the number of online evictions and support asynchronous
+promotion - we have added two periodic workers to handle eviction and promotion.
+
+The diagram below shows a simplified version of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+
+*(Diagram not reproduced here.)*
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the next (lower)
+tier using a given strategy. Here we document the general parameters and the parameters
+for the different strategies.
+
+- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs - by default
+the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also,
+the background evictor thread will be woken up every time there is a failed allocation (from
+a request handling thread) and the current percentage of free memory for the
+AllocationClass is lower than `lowEvictionAcWatermark`. This may make the interval parameter
+less important when many allocations occur from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run - each thread is assigned
+a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
+an equal number of classes to scan - but as the object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is `1` to the number of
+AllocationClasses. The default is `1`.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is `40`. The lower bound is `10` and the upper bound is `1000`. Too low and we might not
+remove objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
+candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for eviction. This is similar to `maxEvictionBatch`,
+but it specifies how many candidates will be taken into consideration, not the actual number of items to evict.
+This option can be used to configure the duration of the critical section on the LRU lock.
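+
+As a rough illustration, these knobs surface as fields on the allocator config;
+the sketch below is not the verbatim API (only the `backgroundEvictorInterval`,
+`backgroundEvictorStrategy` and `backgroundEvictorThreads` fields appear in this
+patch, and the `FreeThresholdStrategy` constructor arguments are assumptions):
+
+```cpp
+// Sketch only - field names are taken from this patch where possible;
+// the strategy constructor arguments are illustrative assumptions.
+LruAllocator::Config config;
+config.backgroundEvictorInterval = std::chrono::milliseconds{10};
+config.backgroundEvictorThreads = 2;
+config.backgroundEvictorStrategy = std::make_shared<FreeThresholdStrategy>(
+    /* lowEvictionAcWatermark */ 2.0,
+    /* highEvictionAcWatermark */ 5.0,
+    /* maxEvictionBatch */ 40,
+    /* minEvictionBatch */ 5);
+```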
+
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when only this percentage of the AllocationClass is free.
+The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stops evictions from an AllocationClass when this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity we
+don't set this above `10.0`.
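+
+For example, with the defaults (`lowEvictionAcWatermark = 2.0`,
+`highEvictionAcWatermark = 5.0`) the background evictor starts working on an
+AllocationClass once less than 2% of it is free, and keeps evicting until 5%
+of it is free, leaving some headroom for incoming allocations.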
+
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move to an upper
+(faster) tier using a given strategy. Here we document the general parameters and the
+parameters for the different strategies.
+
+- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs - by default
+the background promoter threads will wake up every 10 ms to scan the AllocationClasses for
+objects to promote.
+
+- `promoterThreads`: The number of background promoters to run - each thread is assigned
+a set of AllocationClasses to scan and promote objects from. Currently, each thread gets
+an equal number of classes to scan - but as the object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is `1` to the number of AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The
+default is `40`. The lower bound is `10` and the upper bound is `1000`. Too low and we might not
+promote objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any
+candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing (read-only) handles, since
+we won't need to modify the data while a user is reading it. As a result, for a short time
+the data may reside in both tiers, until the item is evicted from its current tier. The default is to
+not allow this (`0`). Setting the value to `100` will enable duplicate elements in tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items if at least this
+percentage of the AllocationClass is free. The promotion thread will attempt to move up to `maxPromotionBatch` objects
+to that tier. The objects are chosen from the head of the LRU. The default is `4.0`.
+This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark` and `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during background promotion. Analogous to
+`maxEvictionBatch`. Its value should be lower to decrease contention on hot items.
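+
+A matching sketch for the promoter side (again not the verbatim API: only the
+`backgroundPromoterInterval`, `backgroundPromoterStrategy`,
+`backgroundPromoterThreads` and `numDuplicateElements` fields appear in this
+patch; the `PromotionStrategy` class name and its arguments are assumptions):
+
+```cpp
+// Sketch only - the PromotionStrategy name and its arguments are assumptions.
+config.backgroundPromoterInterval = std::chrono::milliseconds{10};
+config.backgroundPromoterThreads = 1;
+config.numDuplicateElements = 0; // no short-lived duplicates across tiers
+config.backgroundPromoterStrategy = std::make_shared<PromotionStrategy>(
+    /* promotionAcWatermark */ 4.0,
+    /* maxPromotionBatch */ 40,
+    /* minPromotionBatch */ 5);
+```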
+
diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h
new file mode 100644
index 0000000000..b77436635f
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundMover<CacheT>::BackgroundMover(
+    Cache& cache,
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    MoverDir direction)
+    : cache_(cache), strategy_(strategy), direction_(direction) {
+  if (direction_ == MoverDir::Evict) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndEvictItems;
+
+  } else if (direction_ == MoverDir::Promote) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndPromoteItems;
+  }
+}
+
+template <typename CacheT>
+BackgroundMover<CacheT>::~BackgroundMover() {
+  stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::work() {
+ try {
+ checkAndRun();
+ } catch (const std::exception& ex) {
+ XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what());
+ }
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::setAssignedMemory(
+    std::vector<MemoryDescriptorType>&& assignedMemory) {
+ XLOG(INFO, "Class assigned to background worker:");
+ for (auto [tid, pid, cid] : assignedMemory) {
+ XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+ }
+
+ mutex.lock_combine([this, &assignedMemory] {
+ this->assignedMemory_ = std::move(assignedMemory);
+ });
+}
+
+// Look for classes that exceed the target memory capacity
+// and return those for eviction
+template <typename CacheT>
+void BackgroundMover<CacheT>::checkAndRun() {
+ auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+ unsigned int moves = 0;
+  std::set<ClassId> classes{};
+ auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+ for (size_t i = 0; i < batches.size(); i++) {
+ const auto [tid, pid, cid] = assignedMemory[i];
+ const auto batch = batches[i];
+
+ classes.insert(cid);
+ const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
+
+ if (!batch) {
+ continue;
+ }
+
+ // try moving BATCH items from the class in order to reach free target
+ auto moved = moverFunc(cache_, tid, pid, cid, batch);
+ moves += moved;
+ moves_per_class_[tid][pid][cid] += moved;
+ totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize);
+ }
+
+ numTraversals.inc();
+ numMovedItems.add(moves);
+ totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept {
+ BackgroundMoverStats stats;
+ stats.numMovedItems = numMovedItems.get();
+ stats.runCount = numTraversals.get();
+ stats.totalBytesMoved = totalBytesMoved.get();
+ stats.totalClasses = totalClasses.get();
+
+ return stats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundMover<CacheT>::getClassStats() const noexcept {
+ return moves_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h
new file mode 100644
index 0000000000..1246676d6e
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the cache api
+template <typename C>
+struct BackgroundMoverAPIWrapper {
+ static size_t traverseAndEvictItems(C& cache,
+ unsigned int tid,
+ unsigned int pid,
+ unsigned int cid,
+ size_t batch) {
+ return cache.traverseAndEvictItems(tid, pid, cid, batch);
+ }
+
+ static size_t traverseAndPromoteItems(C& cache,
+ unsigned int tid,
+ unsigned int pid,
+ unsigned int cid,
+ size_t batch) {
+ return cache.traverseAndPromoteItems(tid, pid, cid, batch);
+ }
+};
+
+enum class MoverDir { Evict = 0, Promote };
+
+// Periodic worker that evicts items from tiers in batches
+// The primary aim is to reduce insertion times for new items in the
+// cache
+template <typename CacheT>
+class BackgroundMover : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache      the cache interface
+  // @param strategy   the strategy class that defines how objects
+  //                   are moved (promoted vs. evicted, and how many)
+  BackgroundMover(Cache& cache,
+                  std::shared_ptr<BackgroundMoverStrategy> strategy,
+                  MoverDir direction_);
+
+ ~BackgroundMover() override;
+
+  BackgroundMoverStats getStats() const noexcept;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+  getClassStats() const noexcept;
+
+  void setAssignedMemory(
+      std::vector<MemoryDescriptorType>&& assignedMemory);
+
+ private:
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+      moves_per_class_;
+ // cache allocator's interface for evicting
+ using Item = typename Cache::Item;
+
+ Cache& cache_;
+  std::shared_ptr<BackgroundMoverStrategy> strategy_;
+ MoverDir direction_;
+
+  std::function<size_t(
+      Cache&, unsigned int, unsigned int, unsigned int, size_t)>
+      moverFunc;
+
+ // implements the actual logic of running the background evictor
+ void work() override final;
+ void checkAndRun();
+
+ AtomicCounter numMovedItems{0};
+ AtomicCounter numTraversals{0};
+ AtomicCounter totalClasses{0};
+ AtomicCounter totalBytesMoved{0};
+
+  std::vector<MemoryDescriptorType> assignedMemory_;
+ folly::DistributedMutex mutex;
+};
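+
+// Example wiring (illustrative sketch, not part of this patch): the cache
+// owner creates one mover per background thread, hands it a slice of the
+// (tier, pool, class) space and runs it as a periodic worker:
+//
+//   auto mover = std::make_unique<BackgroundMover<Cache>>(
+//       cache, strategy, MoverDir::Evict);
+//   mover->setAssignedMemory(std::move(assignedMemory));
+//   mover->start(std::chrono::milliseconds(10), "bg_evictor_0");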
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundMover-inl.h"
diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h
new file mode 100644
index 0000000000..7706a625a5
--- /dev/null
+++ b/cachelib/allocator/BackgroundMoverStrategy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+
+namespace facebook {
+namespace cachelib {
+
+struct MemoryDescriptorType {
+ MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) :
+ tid_(tid), pid_(pid), cid_(cid) {}
+ TierId tid_;
+ PoolId pid_;
+ ClassId cid_;
+};
+
+// Base class for background eviction strategy.
+class BackgroundMoverStrategy {
+ public:
+  virtual std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVec) = 0;
+};
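+
+// Illustrative only (not part of this patch): a trivial strategy that
+// requests the same batch size for every class it is given. Real strategies
+// (e.g. FreeThresholdStrategy) derive the batch from per-class free space.
+//
+//   class FixedBatchStrategy : public BackgroundMoverStrategy {
+//    public:
+//     explicit FixedBatchStrategy(size_t batch) : batch_(batch) {}
+//     std::vector<size_t> calculateBatchSizes(
+//         const CacheBase& cache,
+//         std::vector<MemoryDescriptorType> acVec) override {
+//       return std::vector<size_t>(acVec.size(), batch_);
+//     }
+//
+//    private:
+//     const size_t batch_;
+//   };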
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index b659770d82..30e5431e45 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,8 @@ add_library (cachelib_allocator
CCacheManager.cpp
ContainerTypes.cpp
FreeMemStrategy.cpp
+ DynamicFreeThresholdStrategy.cpp
+ FreeThresholdStrategy.cpp
HitsPerSlabStrategy.cpp
LruTailAgeStrategy.cpp
MarginalHitsOptimizeStrategy.cpp
@@ -117,6 +119,8 @@ if (BUILD_TESTS)
add_test (tests/ChainedHashTest.cpp)
add_test (tests/AllocatorResizeTypeTest.cpp)
add_test (tests/AllocatorHitStatsTypeTest.cpp)
+ add_test (tests/AllocatorMemoryTiersTest.cpp)
+ add_test (tests/MemoryTiersTest.cpp)
add_test (tests/MultiAllocatorTest.cpp)
add_test (tests/NvmAdmissionPolicyTest.cpp)
add_test (tests/CacheAllocatorConfigTest.cpp)
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index 2511a18291..589614ee3b 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -83,6 +83,9 @@ class CacheBase {
CacheBase(CacheBase&&) = default;
CacheBase& operator=(CacheBase&&) = default;
+ // TODO: come up with some reasonable number
+ static constexpr unsigned kMaxTiers = 2;
+
// Get a string referring to the cache name for this cache
virtual const std::string getCacheName() const = 0;
@@ -93,6 +96,12 @@ class CacheBase {
//
// @param poolId The pool id to query
virtual const MemoryPool& getPool(PoolId poolId) const = 0;
+
+ // Get the reference to a memory pool using a tier id, for stats purposes
+ //
+ // @param poolId The pool id to query
+ // @param tierId The tier of the pool id
+ virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0;
// Get Pool specific stats (regular pools). This includes stats from the
// Memory Pool and also the cache.
@@ -100,6 +109,9 @@ class CacheBase {
// @param poolId the pool id
virtual PoolStats getPoolStats(PoolId poolId) const = 0;
+ virtual AllocationClassBaseStat getAllocationClassStats(
+ TierId, PoolId pid, ClassId cid) const = 0;
+
// @param poolId the pool id
virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0;
diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h
index 8c7fec31c4..378a30f611 100644
--- a/cachelib/allocator/CacheAllocator-inl.h
+++ b/cachelib/allocator/CacheAllocator-inl.h
@@ -16,12 +16,21 @@
#pragma once
+#include <numeric>
+
namespace facebook {
namespace cachelib {
template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(Config config)
: CacheAllocator(InitMemType::kNone, config) {
+ // TODO(MEMORY_TIER)
+  if (getNumTiers() > 1 || std::holds_alternative<FileShmSegmentOpts>(
+ memoryTierConfigs[0].getShmTypeOpts())) {
+ throw std::runtime_error(
+ "Using custom memory tier or using more than one tier is only "
+ "supported for Shared Memory.");
+ }
initCommon(false);
}
@@ -29,12 +38,14 @@ template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(SharedMemNewT, Config config)
: CacheAllocator(InitMemType::kMemNew, config) {
initCommon(false);
- shmManager_->removeShm(detail::kShmInfoName);
+ shmManager_->removeShm(detail::kShmInfoName,
+ PosixSysVSegmentOpts(config_.isUsingPosixShm()));
}
template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(SharedMemAttachT, Config config)
: CacheAllocator(InitMemType::kMemAttach, config) {
+ /* TODO - per tier? */
for (auto pid : *metadata_.compactCachePools()) {
isCompactCachePool_[pid] = true;
}
@@ -44,7 +55,8 @@ CacheAllocator<CacheTrait>::CacheAllocator(SharedMemAttachT, Config config)
// We will create a new info shm segment on shutDown(). If we don't remove
// this info shm segment here and the new info shm segment's size is larger
// than this one, creating new one will fail.
- shmManager_->removeShm(detail::kShmInfoName);
+ shmManager_->removeShm(detail::kShmInfoName,
+ PosixSysVSegmentOpts(config_.isUsingPosixShm()));
}
template <typename CacheTrait>
@@ -53,12 +65,13 @@ CacheAllocator<CacheTrait>::CacheAllocator(
: isOnShm_{type != InitMemType::kNone ? true
: config.memMonitoringEnabled()},
config_(config.validate()),
+ memoryTierConfigs(config.getMemoryTierConfigs()),
tempShm_(type == InitMemType::kNone && isOnShm_
- ? std::make_unique(config_.size)
+ ? std::make_unique(config_.getCacheSize())
: nullptr),
shmManager_(type != InitMemType::kNone
                ? std::make_unique<ShmManager>(config_.cacheDir,
- config_.usePosixShm)
+ config_.isUsingPosixShm())
: nullptr),
deserializer_(type == InitMemType::kMemAttach ? createDeserializer()
: nullptr),
@@ -67,20 +80,23 @@ CacheAllocator<CacheTrait>::CacheAllocator(
: serialization::CacheAllocatorMetadata{}},
allocator_(initAllocator(type)),
compactCacheManager_(type != InitMemType::kMemAttach
-                             ? std::make_unique<CCacheManager>(*allocator_)
-                             : restoreCCacheManager()),
+                             ? std::make_unique<CCacheManager>(*allocator_[0] /* TODO: per tier */)
+                             : restoreCCacheManager(0 /* TODO: per tier */)),
compressor_(createPtrCompressor()),
mmContainers_(type == InitMemType::kMemAttach
? deserializeMMContainers(*deserializer_, compressor_)
- : MMContainers{}),
+ : MMContainers{getNumTiers()}),
accessContainer_(initAccessContainer(
- type, detail::kShmHashTableName, config.accessConfig)),
+ type, detail::kShmHashTableName, config.accessConfig, config_.isUsingPosixShm())),
chainedItemAccessContainer_(
initAccessContainer(type,
detail::kShmChainedItemHashTableName,
- config.chainedItemAccessConfig)),
+ config.chainedItemAccessConfig,
+ config_.isUsingPosixShm())),
chainedItemLocks_(config_.chainedItemsLockPower,
                  std::make_shared<MurmurHash2>()),
+ movesMap_(kShards),
+ moveLock_(kShards),
cacheCreationTime_{
type != InitMemType::kMemAttach
? util::getCurrentTimeSec()
@@ -105,39 +121,99 @@ CacheAllocator<CacheTrait>::~CacheAllocator() {
}
template <typename CacheTrait>
-std::unique_ptr<MemoryAllocator>
-CacheAllocator<CacheTrait>::createNewMemoryAllocator() {
+ShmSegmentOpts CacheAllocator<CacheTrait>::createShmCacheOpts(TierId tid) {
ShmSegmentOpts opts;
opts.alignment = sizeof(Slab);
+ opts.typeOpts = memoryTierConfigs[tid].getShmTypeOpts();
+ opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind();
+  if (auto* v = std::get_if<PosixSysVSegmentOpts>(&opts.typeOpts)) {
+ v->usePosix = config_.usePosixShm;
+ }
+
+ return opts;
+}
+
+template <typename CacheTrait>
+size_t CacheAllocator<CacheTrait>::memoryTierSize(TierId tid) const
+{
+ auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL,
+ [](const size_t i, const MemoryTierCacheConfig& config){
+ return i + config.getRatio();
+ });
+
+ return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions);
+}
+
+template <typename CacheTrait>
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::createPrivateAllocator() {
+  std::vector<std::unique_ptr<MemoryAllocator>> allocators;
+
+  if (isOnShm_)
+    allocators.emplace_back(std::make_unique<MemoryAllocator>(
+        getAllocatorConfig(config_),
+        tempShm_->getAddr(),
+        config_.getCacheSize()));
+  else
+    allocators.emplace_back(std::make_unique<MemoryAllocator>(
+        getAllocatorConfig(config_),
+        config_.getCacheSize()));
+
+ return allocators;
+}
+
+template <typename CacheTrait>
+std::unique_ptr<MemoryAllocator>
+CacheAllocator<CacheTrait>::createNewMemoryAllocator(TierId tid) {
+ size_t tierSize = memoryTierSize(tid);
  return std::make_unique<MemoryAllocator>(
getAllocatorConfig(config_),
shmManager_
- ->createShm(detail::kShmCacheName, config_.size,
- config_.slabMemoryBaseAddr, opts)
+ ->createShm(detail::kShmCacheName + std::to_string(tid),
+ tierSize, config_.slabMemoryBaseAddr,
+ createShmCacheOpts(tid))
.addr,
- config_.size);
+ tierSize);
}
template <typename CacheTrait>
std::unique_ptr<MemoryAllocator>
-CacheAllocator<CacheTrait>::restoreMemoryAllocator() {
-  ShmSegmentOpts opts;
-  opts.alignment = sizeof(Slab);
+CacheAllocator<CacheTrait>::restoreMemoryAllocator(TierId tid) {
  return std::make_unique<MemoryAllocator>(
      deserializer_->deserialize<MemoryAllocator::SerializationType>(),
shmManager_
- ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, opts)
- .addr,
- config_.size,
+ ->attachShm(detail::kShmCacheName + std::to_string(tid),
+ config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr,
+ memoryTierSize(tid),
config_.disableFullCoredump);
}
+template <typename CacheTrait>
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::createAllocators() {
+  std::vector<std::unique_ptr<MemoryAllocator>> allocators;
+ for (int tid = 0; tid < getNumTiers(); tid++) {
+ allocators.emplace_back(createNewMemoryAllocator(tid));
+ }
+ return allocators;
+}
+
+template <typename CacheTrait>
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::restoreAllocators() {
+  std::vector<std::unique_ptr<MemoryAllocator>> allocators;
+ for (int tid = 0; tid < getNumTiers(); tid++) {
+ allocators.emplace_back(restoreMemoryAllocator(tid));
+ }
+ return allocators;
+}
+
template <typename CacheTrait>
std::unique_ptr<CCacheManager>
-CacheAllocator<CacheTrait>::restoreCCacheManager() {
+CacheAllocator<CacheTrait>::restoreCCacheManager(TierId tid) {
  return std::make_unique<CCacheManager>(
      deserializer_->deserialize<CCacheManager::SerializationType>(),
- *allocator_);
+ *allocator_[tid]);
}
template <typename CacheTrait>
@@ -226,23 +302,30 @@ void CacheAllocator<CacheTrait>::initWorkers() {
config_.poolOptimizeStrategy,
config_.ccacheOptimizeStepSizePercent);
}
+
+ if (config_.backgroundEvictorEnabled()) {
+ startNewBackgroundEvictor(config_.backgroundEvictorInterval,
+ config_.backgroundEvictorStrategy,
+ config_.backgroundEvictorThreads);
+ }
+
+ if (config_.backgroundPromoterEnabled()) {
+ startNewBackgroundPromoter(config_.backgroundPromoterInterval,
+ config_.backgroundPromoterStrategy,
+ config_.backgroundPromoterThreads);
+ }
}
template <typename CacheTrait>
-std::unique_ptr<MemoryAllocator> CacheAllocator<CacheTrait>::initAllocator(
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::initAllocator(
    InitMemType type) {
if (type == InitMemType::kNone) {
-    if (isOnShm_ == true) {
-      return std::make_unique<MemoryAllocator>(
-          getAllocatorConfig(config_), tempShm_->getAddr(), config_.size);
-    } else {
-      return std::make_unique<MemoryAllocator>(getAllocatorConfig(config_),
-                                               config_.size);
-    }
+ return createPrivateAllocator();
} else if (type == InitMemType::kMemNew) {
- return createNewMemoryAllocator();
+ return createAllocators();
} else if (type == InitMemType::kMemAttach) {
- return restoreMemoryAllocator();
+ return restoreAllocators();
}
// Invalid type
@@ -255,7 +338,8 @@ template <typename CacheTrait>
std::unique_ptr<typename CacheAllocator<CacheTrait>::AccessContainer>
CacheAllocator<CacheTrait>::initAccessContainer(InitMemType type,
const std::string name,
- AccessConfig config) {
+ AccessConfig config,
+ bool usePosixShm) {
if (type == InitMemType::kNone) {
    return std::make_unique<AccessContainer>(
config, compressor_,
@@ -268,7 +352,7 @@ CacheAllocator<CacheTrait>::initAccessContainer(InitMemType type,
name,
AccessContainer::getRequiredSize(config.getNumBuckets()),
nullptr,
- ShmSegmentOpts(config.getPageSize()))
+ ShmSegmentOpts(config.getPageSize(), false, usePosixShm))
.addr,
compressor_,
[this](Item* it) -> WriteHandle { return acquire(it); });
@@ -276,7 +360,8 @@ CacheAllocator<CacheTrait>::initAccessContainer(InitMemType type,
    return std::make_unique<AccessContainer>(
        deserializer_->deserialize<AccessSerializationType>(),
config,
- shmManager_->attachShm(name),
+ shmManager_->attachShm(name, nullptr,
+ ShmSegmentOpts(config.getPageSize(), false, usePosixShm)),
compressor_,
[this](Item* it) -> WriteHandle { return acquire(it); });
}
@@ -289,7 +374,8 @@ CacheAllocator::initAccessContainer(InitMemType type,
template <typename CacheTrait>
std::unique_ptr<Deserializer> CacheAllocator<CacheTrait>::createDeserializer() {
- auto infoAddr = shmManager_->attachShm(detail::kShmInfoName);
+ auto infoAddr = shmManager_->attachShm(detail::kShmInfoName, nullptr,
+ ShmSegmentOpts(PageSizeT::NORMAL, false, config_.isUsingPosixShm()));
  return std::make_unique<Deserializer>(
      reinterpret_cast<uint8_t*>(infoAddr.addr),
      reinterpret_cast<uint8_t*>(infoAddr.addr) + infoAddr.size);
@@ -309,13 +395,31 @@ CacheAllocator<CacheTrait>::allocate(PoolId poolId,
ttlSecs == 0 ? 0 : creationTime + ttlSecs);
}
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) {
+ // TODO: should we also work on lower tiers? should we have separate set of params?
+ if (tid == 1) return false;
+ return getAllocationClassStats(tid, pid, cid).approxFreePercent <= config_.lowEvictionAcWatermark;
+}
+
+template <typename CacheTrait>
+size_t CacheAllocator<CacheTrait>::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) {
+  XDCHECK(numWorkers);
+
+  // TODO: come up with better sharding (e.g. use some hashing)
+ return (tid + pid + cid) % numWorkers;
+}
+
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
-CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
- typename Item::Key key,
- uint32_t size,
- uint32_t creationTime,
- uint32_t expiryTime) {
+CacheAllocator<CacheTrait>::allocateInternalTier(TierId tid,
+ PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromBgThread) {
util::LatencyTracker tracker{stats().allocateLatency_};
SCOPE_FAIL { stats_.invalidAllocs.inc(); };
@@ -324,13 +428,21 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
const auto requiredSize = Item::getRequiredSize(key, size);
// the allocation class in our memory allocator.
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+ util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]};
+ // TODO: per-tier
(*stats_.allocAttempts)[pid][cid].inc();
-
- void* memory = allocator_->allocate(pid, requiredSize);
+
+ void* memory = allocator_[tid]->allocate(pid, requiredSize);
+
+ if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) {
+ backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp();
+ }
+ // TODO: Today isEvictionDisabled means do not evict from memory (DRAM).
+ // Should we support eviction between memory tiers (e.g. from DRAM to PMEM)?
if (memory == nullptr && !config_.isEvictionDisabled()) {
- memory = findEviction(pid, cid);
+ memory = findEviction(tid, pid, cid);
}
WriteHandle handle;
@@ -341,7 +453,7 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
// for example.
SCOPE_FAIL {
// free back the memory to the allocator since we failed.
- allocator_->free(memory);
+ allocator_[tid]->free(memory);
};
handle = acquire(new (memory) Item(key, size, creationTime, expiryTime));
@@ -352,7 +464,7 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
}
} else { // failed to allocate memory.
- (*stats_.allocFailures)[pid][cid].inc();
+ (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier
// wake up rebalancer
if (poolRebalancer_) {
poolRebalancer_->wakeUp();
@@ -369,6 +481,22 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
return handle;
}
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromBgThread) {
+  /* TODO: consult an admission policy to pick the starting tier */
+  for (TierId tid = 0; tid < getNumTiers(); ++tid) {
+ auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread);
+ if (handle) return handle;
+ }
+ return {};
+}
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::allocateChainedItem(const ReadHandle& parent,
@@ -399,21 +527,28 @@ CacheAllocator<CacheTrait>::allocateChainedItemInternal(
// number of bytes required for this item
const auto requiredSize = ChainedItem::getRequiredSize(size);
- const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId;
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+ // TODO: is this correct?
+ auto tid = getTierId(*parent);
+ const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId;
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+
+ util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]};
+
+ // TODO: per-tier? Right now stats_ are not used in any public periodic
+ // worker
(*stats_.allocAttempts)[pid][cid].inc();
- void* memory = allocator_->allocate(pid, requiredSize);
+ void* memory = allocator_[tid]->allocate(pid, requiredSize);
if (memory == nullptr) {
- memory = findEviction(pid, cid);
+ memory = findEviction(tid, pid, cid);
}
if (memory == nullptr) {
(*stats_.allocFailures)[pid][cid].inc();
return WriteHandle{};
}
- SCOPE_FAIL { allocator_->free(memory); };
+ SCOPE_FAIL { allocator_[tid]->free(memory); };
auto child = acquire(
new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size,
@@ -722,8 +857,8 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
throw std::runtime_error(
folly::sformat("cannot release this item: {}", it.toString()));
}
-
- const auto allocInfo = allocator_->getAllocInfo(it.getMemory());
+ const auto tid = getTierId(it);
+ const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory());
if (ctx == RemoveContext::kEviction) {
const auto timeNow = util::getCurrentTimeSec();
@@ -747,8 +882,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
folly::sformat("Can not recycle a chained item {}, toRecyle",
it.toString(), toRecycle->toString()));
}
-
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
return ReleaseRes::kReleased;
}
@@ -807,7 +941,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
auto next = head->getNext(compressor_);
const auto childInfo =
-        allocator_->getAllocInfo(static_cast<const void*>(head));
+        allocator_[tid]->getAllocInfo(static_cast<const void*>(head));
(*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub(
util::getFragmentation(*this, *head));
@@ -840,7 +974,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
XDCHECK(ReleaseRes::kReleased != res);
res = ReleaseRes::kRecycled;
} else {
- allocator_->free(head);
+ allocator_[tid]->free(head);
}
}
@@ -855,7 +989,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
res = ReleaseRes::kRecycled;
} else {
XDCHECK(it.isDrained());
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
}
return res;
@@ -927,6 +1061,25 @@ bool CacheAllocator<CacheTrait>::replaceInMMContainer(Item& oldItem,
}
}
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::replaceInMMContainer(Item* oldItem,
+                                                      Item& newItem) {
+ return replaceInMMContainer(*oldItem, newItem);
+}
+
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::replaceInMMContainer(EvictionIterator& oldItemIt,
+                                                      Item& newItem) {
+ auto& oldContainer = getMMContainer(*oldItemIt);
+ auto& newContainer = getMMContainer(newItem);
+
+ // This function is used for eviction across tiers
+ XDCHECK(&oldContainer != &newContainer);
+ oldContainer.remove(oldItemIt);
+
+ return newContainer.add(newItem);
+}
+
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::replaceChainedItemInMMContainer(
    Item& oldItem, Item& newItem) {
@@ -1072,6 +1225,157 @@ CacheAllocator<CacheTrait>::insertOrReplace(const WriteHandle& handle) {
return replaced;
}
+/* The next two methods are used to asynchronously move an Item between memory
+ * tiers.
+ *
+ * The thread that moves the Item allocates a new Item in the tier we are
+ * moving to and calls moveRegularItemWithSync(). This method does the
+ * following:
+ *  1. Creates a MoveCtx and puts it into the movesMap.
+ *  2. Updates the access container with the new item from the tier we are
+ *     moving to. This Item has the kIncomplete flag set.
+ *  3. Copies data from the old Item to the new one.
+ *  4. Unsets the kIncomplete flag and notifies the MoveCtx.
+ *
+ * Concurrent threads that are getting a handle to the same key:
+ *  1. When a handle is created, it checks if the kIncomplete flag is set.
+ *  2. If so, the Handle implementation creates a waitContext and adds it to
+ *     the MoveCtx by calling addWaitContextForMovingItem().
+ *  3. They wait until the moving thread completes its job.
+ */
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::addWaitContextForMovingItem(
+    folly::StringPiece key, std::shared_ptr<WaitContext<ReadHandle>> waiter) {
+ auto shard = getShardForKey(key);
+ auto& movesMap = getMoveMapForShard(shard);
+ auto lock = getMoveLockForShard(shard);
+ auto it = movesMap.find(key);
+ if (it == movesMap.end()) {
+ return false;
+ }
+ auto ctx = it->second.get();
+ ctx->addWaiter(std::move(waiter));
+ return true;
+}
+
+template <typename CacheTrait>
+template <typename P>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::moveRegularItemWithSync(
+    Item& oldItem, WriteHandle& newItemHdl, P&& predicate) {
+ XDCHECK(oldItem.isMoving());
+ // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_
+ // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_};
+
+ if (!oldItem.isAccessible() || oldItem.isExpired()) {
+ return {};
+ }
+
+ XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize());
+ XDCHECK_NE(getTierId(oldItem), getTierId(*newItemHdl));
+
+ // take care of the flags before we expose the item to be accessed. this
+ // will ensure that when another thread removes the item from RAM, we issue
+ // a delete accordingly. See D7859775 for an example
+ if (oldItem.isNvmClean()) {
+ newItemHdl->markNvmClean();
+ }
+
+ folly::StringPiece key(oldItem.getKey());
+ auto shard = getShardForKey(key);
+ auto& movesMap = getMoveMapForShard(shard);
+ MoveCtx* ctx(nullptr);
+ {
+ auto lock = getMoveLockForShard(shard);
+    auto res = movesMap.try_emplace(key, std::make_unique<MoveCtx>());
+ if (!res.second) {
+ return {};
+ }
+ ctx = res.first->second.get();
+ }
+
+ auto resHdl = WriteHandle{};
+ auto guard = folly::makeGuard([key, this, ctx, shard, &resHdl]() {
+ auto& movesMap = getMoveMapForShard(shard);
+ if (resHdl)
+ resHdl->unmarkIncomplete();
+ auto lock = getMoveLockForShard(shard);
+ ctx->setItemHandle(std::move(resHdl));
+ movesMap.erase(key);
+ });
+
+  // TODO: Possibly we can use markMoving() instead. But today the
+  // moveOnSlabRelease logic assumes that we mark the old Item as moving
+  // and then copy it and replace the old Item with the new one in the access
+  // container. Furthermore, an Item can be marked as Moving only
+  // if it is linked to an MM container. In our case we mark the new Item
+  // and update the access container before the new Item is ready (content is
+  // copied).
+ newItemHdl->markIncomplete();
+
+ // Inside the access container's lock, this checks if the old item is
+ // accessible and its refcount is zero. If the item is not accessible,
+ // there is no point to replace it since it had already been removed
+ // or in the process of being removed. If the item is in cache but the
+ // refcount is non-zero, it means user could be attempting to remove
+ // this item through an API such as remove(ItemHandle). In this case,
+ // it is unsafe to replace the old item with a new one, so we should
+ // also abort.
+ if (!accessContainer_->replaceIf(oldItem, *newItemHdl,
+ predicate)) {
+ return {};
+ }
+
+ if (config_.moveCb) {
+ // Execute the move callback. We cannot make any guarantees about the
+ // consistency of the old item beyond this point, because the callback can
+ // do more than a simple memcpy() e.g. update external references. If there
+ // are any remaining handles to the old item, it is the caller's
+ // responsibility to invalidate them. The move can only fail after this
+ // statement if the old item has been removed or replaced, in which case it
+ // should be fine for it to be left in an inconsistent state.
+ config_.moveCb(oldItem, *newItemHdl, nullptr);
+ } else {
+ std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(),
+ oldItem.getSize());
+ }
+
+ // Inside the MM container's lock, this checks if the old item exists to
+ // make sure that no other thread removed it, and only then replaces it.
+ if (!replaceInMMContainer(oldItem, *newItemHdl)) {
+ accessContainer_->remove(*newItemHdl);
+ return acquire(&oldItem);
+ }
+
+ // Replacing into the MM container was successful, but someone could have
+ // called insertOrReplace() or remove() before or after the
+ // replaceInMMContainer() operation, which would invalidate newItemHdl.
+ if (!newItemHdl->isAccessible()) {
+ removeFromMMContainer(*newItemHdl);
+ return acquire(&oldItem);
+ }
+
+ // no one can add or remove chained items at this point
+ if (oldItem.hasChainedItem()) {
+ // safe to acquire handle for a moving Item
+ auto oldHandle = acquire(&oldItem);
+ XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString();
+ XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString();
+ try {
+ auto l = chainedItemLocks_.lockExclusive(oldItem.getKey());
+ transferChainLocked(oldHandle, newItemHdl);
+ } catch (const std::exception& e) {
+ // this should never happen because we drained all the handles.
+ XLOGF(DFATAL, "{}", e.what());
+ throw;
+ }
+
+ XDCHECK(!oldItem.hasChainedItem());
+ XDCHECK(newItemHdl->hasChainedItem());
+ }
+ newItemHdl.unmarkNascent();
+ resHdl = std::move(newItemHdl); // guard will assign it to ctx under lock
+ return acquire(&oldItem);
+}
+
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem,
                                                 WriteHandle& newItemHdl) {
@@ -1214,48 +1518,78 @@ bool CacheAllocator<CacheTrait>::moveChainedItem(ChainedItem& oldItem,
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::Item*
-CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) {
-  auto& mmContainer = getMMContainer(pid, cid);
+CacheAllocator<CacheTrait>::findEviction(TierId tid, PoolId pid, ClassId cid) {
+  auto& mmContainer = getMMContainer(tid, pid, cid);
// Keep searching for a candidate until we were able to evict it
// or until the search limit has been exhausted
unsigned int searchTries = 0;
- auto itr = mmContainer.getEvictionIterator();
while ((config_.evictionSearchTries == 0 ||
- config_.evictionSearchTries > searchTries) &&
- itr) {
+ config_.evictionSearchTries > searchTries)) {
++searchTries;
(*stats_.evictionAttempts)[pid][cid].inc();
- Item* candidate = itr.get();
+ Item* toRecycle = nullptr;
+ Item* candidate = nullptr;
+
+ mmContainer.withEvictionIterator([this, &candidate, &toRecycle, &searchTries](auto &&itr){
+ while ((config_.evictionSearchTries == 0 ||
+ config_.evictionSearchTries > searchTries) && itr) {
+ ++searchTries;
+
+ auto *toRecycle_ = itr.get();
+ auto *candidate_ = toRecycle_->isChainedItem()
+ ? &toRecycle_->asChainedItem().getParentItem(compressor_)
+ : toRecycle_;
+
+        // make sure no other thread is evicting the item
+ if (candidate_->getRefCount() == 0 && candidate_->markMoving()) {
+ toRecycle = toRecycle_;
+ candidate = candidate_;
+ return;
+ }
+
+ ++itr;
+ }
+ });
+
+ if (!toRecycle)
+ continue;
+
+ XDCHECK(toRecycle);
+ XDCHECK(candidate);
+
// for chained items, the ownership of the parent can change. We try to
// evict what we think as parent and see if the eviction of parent
// recycles the child we intend to.
auto toReleaseHandle =
- itr->isChainedItem()
- ? advanceIteratorAndTryEvictChainedItem(itr)
- : advanceIteratorAndTryEvictRegularItem(mmContainer, itr);
+ evictNormalItem(*candidate, true /* skipIfTokenInvalid */);
+ auto ref = candidate->unmarkMoving();
- if (toReleaseHandle) {
- if (toReleaseHandle->hasChainedItem()) {
+ if (toReleaseHandle || ref == 0u) {
+ if (candidate->hasChainedItem()) {
(*stats_.chainedItemEvictions)[pid][cid].inc();
} else {
(*stats_.regularItemEvictions)[pid][cid].inc();
}
+ } else {
+ if (candidate->hasChainedItem()) {
+ stats_.evictFailParentAC.inc();
+ } else {
+ stats_.evictFailAC.inc();
+ }
+ }
+ if (toReleaseHandle) {
if (auto eventTracker = getEventTracker()) {
eventTracker->record(
AllocatorApiEvent::DRAM_EVICT, toReleaseHandle->getKey(),
AllocatorApiResult::EVICTED, toReleaseHandle->getSize(),
toReleaseHandle->getConfiguredTTL().count());
}
- // Invalidate iterator since later on we may use this mmContainer
- // again, which cannot be done unless we drop this iterator
- itr.destroy();
- // we must be the last handle and for chained items, this will be
- // the parent.
- XDCHECK(toReleaseHandle.get() == candidate || candidate->isChainedItem());
+ XDCHECK(toReleaseHandle.get() == candidate);
+ XDCHECK(toRecycle == candidate || toRecycle->isChainedItem());
XDCHECK_EQ(1u, toReleaseHandle->getRefCount());
// We manually release the item here because we don't want to
@@ -1271,15 +1605,18 @@ CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) {
// recycle the candidate.
if (ReleaseRes::kRecycled ==
releaseBackToAllocator(itemToRelease, RemoveContext::kEviction,
- /* isNascent */ false, candidate)) {
- return candidate;
+ /* isNascent */ false, toRecycle)) {
+ return toRecycle;
+ }
+  } else if (ref == 0u) {
+    // it's safe to recycle the item here as there are no more
+    // references and the item could not have been marked as moving
+    // by another thread since it's detached from the MMContainer.
+ if (ReleaseRes::kRecycled ==
+ releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false, toRecycle)) {
+ return toRecycle;
}
- }
-
- // If we destroyed the itr to possibly evict and failed, we restart
- // from the beginning again
- if (!itr) {
- itr.resetToBegin();
}
}
return nullptr;
@@ -1335,141 +1672,84 @@ bool CacheAllocator<CacheTrait>::shouldWriteToNvmCacheExclusive(
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
-CacheAllocator<CacheTrait>::advanceIteratorAndTryEvictRegularItem(
- MMContainer& mmContainer, EvictionIterator& itr) {
- // we should flush this to nvmcache if it is not already present in nvmcache
- // and the item is not expired.
- Item& item = *itr;
- const bool evictToNvmCache = shouldWriteToNvmCache(item);
-
- auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey())
- : typename NvmCacheT::PutToken{};
- // record the in-flight eviciton. If not, we move on to next item to avoid
- // stalling eviction.
- if (evictToNvmCache && !token.isValid()) {
- ++itr;
- stats_.evictFailConcurrentFill.inc();
- return WriteHandle{};
- }
-
- // If there are other accessors, we should abort. Acquire a handle here since
- // if we remove the item from both access containers and mm containers
- // below, we will need a handle to ensure proper cleanup in case we end up
- // not evicting this item
- auto evictHandle = accessContainer_->removeIf(item, &itemEvictionPredicate);
-
- if (!evictHandle) {
- ++itr;
- stats_.evictFailAC.inc();
- return evictHandle;
- }
+CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(
+    TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+  if (item.isChainedItem()) return {}; // TODO: we do not support ChainedItem yet
+  if (item.isExpired()) {
+ auto handle = removeIf(item, [](const Item& it) {
+ return it.getRefCount() == 0;
+ });
+
+ if (handle) { return handle; }
+ }
+
+ TierId nextTier = tid; // TODO - calculate this based on some admission policy
+ while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers
+ // allocateInternal might trigger another eviction
+ auto newItemHdl = allocateInternalTier(nextTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
- mmContainer.remove(itr);
-  XDCHECK_EQ(reinterpret_cast<uintptr_t>(evictHandle.get()),
-             reinterpret_cast<uintptr_t>(&item));
- XDCHECK(!evictHandle->isInMMContainer());
- XDCHECK(!evictHandle->isAccessible());
-
- // If the item is now marked as moving, that means its corresponding slab is
- // being released right now. So, we look for the next item that is eligible
- // for eviction. It is safe to destroy the handle here since the moving bit
- // is set. Iterator was already advance by the remove call above.
- if (evictHandle->isMoving()) {
- stats_.evictFailMove.inc();
- return WriteHandle{};
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ return moveRegularItemWithSync(item, newItemHdl, itemMovingPredicate);
+ }
}
- // Invalidate iterator since later on if we are not evicting this
- // item, we may need to rely on the handle we created above to ensure
- // proper cleanup if the item's raw refcount has dropped to 0.
- // And since this item may be a parent item that has some child items
- // in this very same mmContainer, we need to make sure we drop this
- // exclusive iterator so we can gain access to it when we're cleaning
- // up the child items
- itr.destroy();
-
- // Ensure that there are no accessors after removing from the access
- // container
- XDCHECK(evictHandle->getRefCount() == 1);
-
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) {
- XDCHECK(token.isValid());
- nvmCache_->put(evictHandle, std::move(token));
- }
- return evictHandle;
+ return {};
}
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
-CacheAllocator<CacheTrait>::advanceIteratorAndTryEvictChainedItem(
- EvictionIterator& itr) {
- XDCHECK(itr->isChainedItem());
-
- ChainedItem* candidate = &itr->asChainedItem();
- ++itr;
-
- // The parent could change at any point through transferChain. However, if
- // that happens, we would realize that the releaseBackToAllocator return
- // kNotRecycled and we would try another chained item, leading to transient
- // failure.
- auto& parent = candidate->getParentItem(compressor_);
-
- const bool evictToNvmCache = shouldWriteToNvmCache(parent);
-
- auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey())
- : typename NvmCacheT::PutToken{};
-
- // if token is invalid, return. iterator is already advanced.
- if (evictToNvmCache && !token.isValid()) {
- stats_.evictFailConcurrentFill.inc();
- return WriteHandle{};
- }
-
- // check if the parent exists in the hashtable and refcount is drained.
- auto parentHandle =
- accessContainer_->removeIf(parent, &itemEvictionPredicate);
- if (!parentHandle) {
- stats_.evictFailParentAC.inc();
- return parentHandle;
- }
-
- // Invalidate iterator since later on we may use the mmContainer
- // associated with this iterator which cannot be done unless we
- // drop this iterator
- //
- // This must be done once we know the parent is not nullptr.
- // Since we can very well be the last holder of this parent item,
- // which may have a chained item that is linked in this MM container.
- itr.destroy();
-
- // Ensure we have the correct parent and we're the only user of the
- // parent, then free it from access container. Otherwise, we abort
-  XDCHECK_EQ(reinterpret_cast<uintptr_t>(&parent),
-             reinterpret_cast<uintptr_t>(parentHandle.get()));
- XDCHECK_EQ(1u, parent.getRefCount());
+CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread);
+}
- removeFromMMContainer(*parentHandle);
+template <typename CacheTrait>
+bool
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(
+    TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+  TierId nextTier = tid;
+  while (nextTier > 0) { // try to promote up to the next memory tier
+ auto toPromoteTier = nextTier - 1;
+ --nextTier;
- XDCHECK(!parent.isInMMContainer());
- XDCHECK(!parent.isAccessible());
+ // allocateInternal might trigger another eviction
+ auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
- // We need to make sure the parent is not marked as moving
- // and we're the only holder of the parent item. Safe to destroy the handle
- // here since moving bit is set.
- if (parentHandle->isMoving()) {
- stats_.evictFailParentMove.inc();
- return WriteHandle{};
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ auto predicate = [&](const Item& item){
+ return item.getRefCount() == 0 || config_.numDuplicateElements > 0;
+ };
+ if (moveRegularItemWithSync(item, newItemHdl, predicate)) {
+ return true;
+ }
+ }
}
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) {
- XDCHECK(token.isValid());
- XDCHECK(parentHandle->hasChainedItem());
- nvmCache_->put(parentHandle, std::move(token));
- }
+ return false;
+}
- return parentHandle;
+template <typename CacheTrait>
+bool
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread);
}
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::RemoveRes
CacheAllocator<CacheTrait>::remove(typename Item::Key key) {
@@ -1670,33 +1950,56 @@ void CacheAllocator<CacheTrait>::invalidateNvm(Item& item) {
}
}
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTierId(const Item& item) const {
+ return getTierId(item.getMemory());
+}
+
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTierId(const void* ptr) const {
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ if (allocator_[tid]->isMemoryInAllocator(ptr))
+ return tid;
+ }
+
+ throw std::invalid_argument("Item does not belong to any tier!");
+}
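+
+// getTierId() resolves a pointer to its owning tier by a linear scan over
+// the per-tier allocators; e.g. with two tiers (DRAM and a slower tier) at
+// most two isMemoryInAllocator() checks run before the lookup hits or throws.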
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
CacheAllocator<CacheTrait>::getMMContainer(const Item& item) const noexcept {
+ const auto tid = getTierId(item);
const auto allocInfo =
- allocator_->getAllocInfo(static_cast<const void*>(&item));
- return getMMContainer(allocInfo.poolId, allocInfo.classId);
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item));
+ return getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
}
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
-CacheAllocator<CacheTrait>::getMMContainer(PoolId pid,
+CacheAllocator<CacheTrait>::getMMContainer(TierId tid,
+ PoolId pid,
ClassId cid) const noexcept {
- XDCHECK_LT(static_cast<size_t>(pid), mmContainers_.size());
- XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[pid].size());
- return *mmContainers_[pid][cid];
+ XDCHECK_LT(static_cast<size_t>(tid), mmContainers_.size());
+ XDCHECK_LT(static_cast<size_t>(pid), mmContainers_[tid].size());
+ XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[tid][pid].size());
+ return *mmContainers_[tid][pid][cid];
}
template <typename CacheTrait>
MMContainerStat CacheAllocator<CacheTrait>::getMMContainerStat(
- PoolId pid, ClassId cid) const noexcept {
- if (static_cast<size_t>(pid) >= mmContainers_.size()) {
+ TierId tid, PoolId pid, ClassId cid) const noexcept {
+ if (static_cast<size_t>(tid) >= mmContainers_.size()) {
return MMContainerStat{};
}
- if (static_cast<size_t>(cid) >= mmContainers_[pid].size()) {
+ if (static_cast<size_t>(pid) >= mmContainers_[tid].size()) {
return MMContainerStat{};
}
- return mmContainers_[pid][cid] ? mmContainers_[pid][cid]->getStats()
+ if (static_cast<size_t>(cid) >= mmContainers_[tid][pid].size()) {
+ return MMContainerStat{};
+ }
+ return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats()
: MMContainerStat{};
}
@@ -1863,8 +2166,9 @@ void CacheAllocator<CacheTrait>::markUseful(const ReadHandle& handle,
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item,
AccessMode mode) {
+ const auto tid = getTierId(item);
const auto allocInfo =
- allocator_->getAllocInfo(static_cast<const void*>(&item));
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item));
(*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc();
// track recently accessed items if needed
@@ -1872,14 +2176,15 @@ bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item,
ring_->trackItem(reinterpret_cast<uintptr_t>(&item), item.getSize());
}
- auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId);
+ auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
return mmContainer.recordAccess(item, mode);
}
template <typename CacheTrait>
uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const {
+ const auto tid = getTierId(item);
const auto allocSize =
- allocator_->getAllocInfo(static_cast<const void*>(&item)).allocSize;
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item)).allocSize;
return item.isChainedItem()
? allocSize - ChainedItem::getRequiredSize(0)
: allocSize - Item::getRequiredSize(item.getKey(), 0);
@@ -1888,8 +2193,11 @@ uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const {
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ReadHandle
CacheAllocator<CacheTrait>::getSampleItem() {
+ // TODO: is using random tier a good idea?
+ auto tid = folly::Random::rand32() % getNumTiers();
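+ // Note that a uniformly random tier over-samples items in the smaller
+ // tier: each tier is picked with probability 1/numTiers regardless of
+ // how many items it actually holds.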
+
const auto* item =
- reinterpret_cast<const Item*>(allocator_->getRandomAlloc());
+ reinterpret_cast<const Item*>(allocator_[tid]->getRandomAlloc());
if (!item) {
return ReadHandle{};
}
@@ -1904,26 +2212,34 @@ CacheAllocator<CacheTrait>::getSampleItem() {
template <typename CacheTrait>
std::vector<std::string> CacheAllocator<CacheTrait>::dumpEvictionIterator(
- PoolId pid, ClassId cid, size_t numItems) {
+ PoolId pid, ClassId cid, size_t numItems) {
if (numItems == 0) {
return {};
}
- if (static_cast<size_t>(pid) >= mmContainers_.size() ||
- static_cast<size_t>(cid) >= mmContainers_[pid].size()) {
+ // Always evict from the lowest layer.
+ int tid = getNumTiers() - 1;
+
+ if (static_cast<size_t>(tid) >= mmContainers_.size() ||
+ static_cast<size_t>(pid) >= mmContainers_[tid].size() ||
+ static_cast<size_t>(cid) >= mmContainers_[tid][pid].size()) {
throw std::invalid_argument(
- folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid));
+ folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid));
}
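+ // The dump starts at the lowest tier (highest tid) and walks toward tier
+ // 0 until numItems entries are collected; e.g. with 2 tiers and
+ // numItems = 10, up to 10 items come from tier 1 before tier 0 is read.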
std::vector<std::string> content;
- auto& mm = *mmContainers_[pid][cid];
- auto evictItr = mm.getEvictionIterator();
size_t i = 0;
- while (evictItr && i < numItems) {
- content.push_back(evictItr->toString());
- ++evictItr;
- ++i;
+ while (i < numItems && tid >= 0) {
+ auto& mm = *mmContainers_[tid][pid][cid];
+ auto evictItr = mm.getEvictionIterator();
+ while (evictItr && i < numItems) {
+ content.push_back(evictItr->toString());
+ ++evictItr;
+ ++i;
+ }
+
+ --tid;
}
return content;
@@ -2101,19 +2417,50 @@ PoolId CacheAllocator<CacheTrait>::addPool(
std::shared_ptr resizeStrategy,
bool ensureProvisionable) {
folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_);
- auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable);
+
+ PoolId pid = 0;
+ size_t totalCacheSize = 0;
+
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ totalCacheSize += allocator_[tid]->getMemorySize();
+ }
+
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ auto tierSizeRatio =
+ static_cast<double>(allocator_[tid]->getMemorySize()) / totalCacheSize;
+ size_t tierPoolSize = static_cast<size_t>(tierSizeRatio * size);
+
+ // TODO: what if we manage to add pool only in one tier?
+ // we should probably remove that on failure
+ auto res = allocator_[tid]->addPool(
+ name, tierPoolSize, allocSizes, ensureProvisionable);
+ XDCHECK(tid == 0 || res == pid);
+ pid = res;
+ }
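+ // Example: with a 16 GB DRAM tier and a 48 GB second tier, totalCacheSize
+ // is 64 GB, so addPool(name, 4 GB) creates a 1 GB slice of the pool in
+ // tier 0 (16/64 * 4 GB) and a 3 GB slice in tier 1 (48/64 * 4 GB).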
+
createMMContainers(pid, std::move(config));
setRebalanceStrategy(pid, std::move(rebalanceStrategy));
setResizeStrategy(pid, std::move(resizeStrategy));
+
+ if (backgroundEvictor_.size()) {
+ for (size_t id = 0; id < backgroundEvictor_.size(); id++)
+ backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0));
+ }
+
+ if (backgroundPromoter_.size()) {
+ for (size_t id = 0; id < backgroundPromoter_.size(); id++)
+ backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1));
+ }
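+ // After a pool is added, each background evictor/promoter is handed a
+ // recomputed slice of the cache (the last argument to
+ // getAssignedMemoryToBgWorker() is 0 for the evictor pass and 1 for the
+ // promoter pass) so the new pool's allocation classes are covered as well.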
+
return pid;
}
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::overridePoolRebalanceStrategy(
PoolId pid, std::shared_ptr<RebalanceStrategy> rebalanceStrategy) {
- if (static_cast<size_t>(pid) >= mmContainers_.size()) {
+ if (static_cast<size_t>(pid) >= mmContainers_[0].size()) {
throw std::invalid_argument(folly::sformat(
- "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size()));
+ "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size()));
}
setRebalanceStrategy(pid, std::move(rebalanceStrategy));
}
@@ -2121,9 +2468,9 @@ void CacheAllocator<CacheTrait>::overridePoolRebalanceStrategy(
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::overridePoolResizeStrategy(
PoolId pid, std::shared_ptr<RebalanceStrategy> resizeStrategy) {
- if (static_cast<size_t>(pid) >= mmContainers_.size()) {
+ if (static_cast<size_t>(pid) >= mmContainers_[0].size()) {
throw std::invalid_argument(folly::sformat(
- "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size()));
+ "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size()));
}
setResizeStrategy(pid, std::move(resizeStrategy));
}
@@ -2135,14 +2482,14 @@ void CacheAllocator<CacheTrait>::overridePoolOptimizeStrategy(
}
template <typename CacheTrait>
-void CacheAllocator<CacheTrait>::overridePoolConfig(PoolId pid,
+void CacheAllocator