Skip to content

Commit 3ab0551

Browse files
byrnedjvinser52
authored andcommitted
fix race in moveRegularItemWith sync where insertOrReplace can cause move to fail
- updated slab release logic for move failure, but there is still an issue with slab movement. currently investigating.
1 parent e7b3709 commit 3ab0551

File tree

4 files changed

+154
-8
lines changed

4 files changed

+154
-8
lines changed

cachelib/allocator/CacheAllocator-inl.h

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,28 +1479,43 @@ CacheAllocator<CacheTrait>::getNextCandidate(TierId tid,
14791479
auto evictedToNext = lastTier ? nullptr
14801480
: tryEvictToNextMemoryTier(*candidate, false);
14811481
if (!evictedToNext) {
1482-
if (!token.isValid()) {
1482+
//if insertOrReplace was called during move
1483+
//then candidate will not be accessible (failed replace during tryEvict)
1484+
// - therefore this was why we failed to
1485+
// evict to the next tier and insertOrReplace
1486+
// will remove from NVM cache
1487+
//however, if candidate is accessible
1488+
//that means the allocation in the next
1489+
//tier failed - so we will continue to
1490+
//evict the item to NVM cache
1491+
bool failedToReplace = !candidate->isAccessible();
1492+
if (!token.isValid() && !failedToReplace) {
14831493
token = createPutToken(*candidate);
14841494
}
1485-
// tryEvictToNextMemoryTier should only fail if allocation of the new item fails
1486-
// in that case, it should be still possible to mark item as exclusive.
1495+
// tryEvictToNextMemoryTier can fail if:
1496+
// a) allocation of the new item fails in that case,
1497+
// it should be still possible to mark item for eviction.
1498+
// b) another thread calls insertOrReplace and the item
1499+
// is no longer accessible
14871500
//
14881501
// in case that we are on the last tier, we whould have already marked
14891502
// as exclusive since we will not be moving the item to the next tier
14901503
// but rather just evicting all together, no need to
1491-
// markExclusiveWhenMoving
1504+
// markForEvictionWhenMoving
14921505
auto ret = lastTier ? true : candidate->markForEvictionWhenMoving();
14931506
XDCHECK(ret);
14941507

14951508
unlinkItemForEviction(*candidate);
1509+
1510+
if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)
1511+
&& !failedToReplace) {
1512+
nvmCache_->put(*candidate, std::move(token));
1513+
}
14961514
// wake up any readers that wait for the move to complete
14971515
// it's safe to do now, as we have the item marked exclusive and
14981516
// no other reader can be added to the waiters list
14991517
wakeUpWaiters(candidate->getKey(), {});
15001518

1501-
if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) {
1502-
nvmCache_->put(*candidate, std::move(token));
1503-
}
15041519
} else {
15051520
XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
15061521
XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());

cachelib/allocator/CacheItem.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ class BaseAllocatorTest;
4646
template <typename AllocatorT>
4747
class AllocatorHitStatsTest;
4848

49+
template <typename AllocatorT>
50+
class AllocatorMemoryTiersTest;
51+
4952
template <typename AllocatorT>
5053
class MapTest;
5154

@@ -469,6 +472,8 @@ class CACHELIB_PACKED_ATTR CacheItem {
469472
FRIEND_TEST(ItemTest, NonStringKey);
470473
template <typename AllocatorT>
471474
friend class facebook::cachelib::tests::AllocatorHitStatsTest;
475+
template <typename AllocatorT>
476+
friend class facebook::cachelib::tests::AllocatorMemoryTiersTest;
472477
};
473478

474479
// A chained item has a hook pointing to the next chained item. The hook is

cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@ namespace cachelib {
2121
namespace tests {
2222

2323
using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest<LruAllocator>;
24-
24+
//using LruTestAllocatorMemoryTiersTest = AllocatorMemoryTiersTest<LruTestAllocator>;
2525
// TODO(MEMORY_TIER): add more tests with different eviction policies
2626
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); }
2727
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); }
2828
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); }
2929
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); }
3030
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); }
3131
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); }
32+
TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEvictionWithReader) { this->testMultiTiersReplaceDuringEvictionWithReader(); }
3233

3334
} // end of namespace tests
3435
} // end of namespace cachelib

cachelib/allocator/tests/AllocatorMemoryTiersTest.h

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
#include "cachelib/allocator/FreeThresholdStrategy.h"
2323
#include "cachelib/allocator/PromotionStrategy.h"
2424

25+
#include <fcntl.h>
26+
#include <unistd.h>
27+
#include <ctype.h>
28+
#include <semaphore.h>
2529
#include <folly/synchronization/Latch.h>
2630

2731
namespace facebook {
@@ -234,6 +238,127 @@ class AllocatorMemoryTiersTest : public AllocatorTest<AllocatorT> {
234238
testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
235239

236240
t->join();
241+
242+
}
243+
244+
245+
void gdb_sync1() {}
246+
void gdb_sync2() {}
247+
void gdb_sync3() {}
248+
using ReadHandle = typename AllocatorT::ReadHandle;
249+
void testMultiTiersReplaceDuringEvictionWithReader() {
250+
sem_unlink ("/gdb1_sem");
251+
sem_t *sem = sem_open ("/gdb1_sem", O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0);
252+
int gdbfd = open("/tmp/gdb1.gdb",O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);
253+
char gdbcmds[] =
254+
"set attached=1\n"
255+
"break gdb_sync1\n"
256+
"break gdb_sync2\n"
257+
"break moveRegularItem\n"
258+
"c\n"
259+
"set scheduler-locking on\n"
260+
"thread 1\n"
261+
"c\n"
262+
"thread 4\n"
263+
"c\n"
264+
"thread 5\n"
265+
"break nativeFutexWaitImpl thread 5\n"
266+
"c\n"
267+
"thread 4\n"
268+
"break nativeFutexWaitImpl thread 4\n"
269+
"c\n"
270+
"thread 1\n"
271+
"break releaseBackToAllocator\n"
272+
"c\n"
273+
"c\n"
274+
"thread 5\n"
275+
"c\n"
276+
"thread 4\n"
277+
"c\n"
278+
"thread 1\n"
279+
"break gdb_sync3\n"
280+
"c\n"
281+
"quit\n";
282+
int ret = write(gdbfd,gdbcmds,strlen(gdbcmds));
283+
int ppid = getpid(); //parent pid
284+
//int pid = 0;
285+
int pid = fork();
286+
if (pid == 0) {
287+
sem_wait(sem);
288+
sem_close(sem);
289+
sem_unlink("/gdb1_sem");
290+
char cmdpid[256];
291+
sprintf(cmdpid,"%d",ppid);
292+
int f = execlp("gdb","gdb","--pid",cmdpid,"--batch-silent","--command=/tmp/gdb1.gdb",(char*) 0);
293+
ASSERT(f != -1);
294+
}
295+
sem_post(sem);
296+
//wait for gdb to run
297+
int attached = 0;
298+
while (attached == 0);
299+
300+
std::unique_ptr<AllocatorT> alloc;
301+
PoolId pool;
302+
bool quit = false;
303+
304+
typename AllocatorT::Config config;
305+
config.setCacheSize(4 * Slab::kSize);
306+
config.enableCachePersistence("/tmp");
307+
config.configureMemoryTiers({
308+
MemoryTierCacheConfig::fromShm()
309+
.setRatio(1).setMemBind(std::string("0")),
310+
MemoryTierCacheConfig::fromShm()
311+
.setRatio(1).setMemBind(std::string("0"))
312+
});
313+
314+
alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
315+
ASSERT(alloc != nullptr);
316+
pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize);
317+
318+
int i = 0;
319+
typename AllocatorT::Item* evicted;
320+
std::unique_ptr<std::thread> t;
321+
std::unique_ptr<std::thread> r;
322+
while(!quit) {
323+
auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size());
324+
ASSERT(handle != nullptr);
325+
if (i == 1) {
326+
evicted = static_cast<typename AllocatorT::Item*>(handle.get());
327+
folly::Latch latch_t(1);
328+
t = std::make_unique<std::thread>([&](){
329+
auto handleNew = alloc->allocate(pool, std::to_string(1), std::string("new value").size());
330+
ASSERT(handleNew != nullptr);
331+
latch_t.count_down();
332+
//first breakpoint will be this one because
333+
//thread 1 still has more items to fill up the
334+
//cache before an evict is evicted
335+
gdb_sync1();
336+
ASSERT(evicted->isMoving());
337+
//need to suspend thread 1 - who is doing the eviction
338+
//gdb will do this for us
339+
folly::Latch latch(1);
340+
r = std::make_unique<std::thread>([&](){
341+
ASSERT(evicted->isMoving());
342+
latch.count_down();
343+
auto handleEvict = alloc->find(std::to_string(1));
344+
//does find block until done moving?? yes
345+
while (evicted->isMarkedForEviction()); //move will fail
346+
XDCHECK(handleEvict == nullptr) << handleEvict->toString();
347+
ASSERT(handleEvict == nullptr);
348+
});
349+
latch.wait();
350+
gdb_sync2();
351+
alloc->insertOrReplace(handleNew);
352+
ASSERT(!evicted->isAccessible()); //move failed
353+
quit = true;
354+
});
355+
latch_t.wait();
356+
}
357+
ASSERT_NO_THROW(alloc->insertOrReplace(handle));
358+
}
359+
t->join();
360+
r->join();
361+
gdb_sync3();
237362
}
238363
};
239364
} // namespace tests

0 commit comments

Comments
 (0)