Skip to content

Commit ce52f9c

Browse files
authored
[libc] Search empty bits after failed allocation (#149910)
Summary: The scheme we use to find a free bit is to just do a random walk. This works very well up until you start to completely saturate the bitfield. Because the result of the fetch_or yields the previous value, we can search it for any known empty bits and use one of them as our next guess. This effectively increases our likelihood of finding a match after two tries by 32x since the distribution is random. This *massively* improves performance when a lot of memory is allocated without freeing, as it now doesn't take a one-in-a-million shot to fill that last bit. A follow-up change could improve this further by only *mostly* filling the slab, allowing 1% to be free at all times.
1 parent df1dd80 commit ce52f9c

File tree

1 file changed

+13
-3
lines changed

1 file changed

+13
-3
lines changed

libc/src/__support/GPU/allocator.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,12 +256,18 @@ struct Slab {
256256
// The uniform mask represents which lanes contain a uniform target pointer.
257257
// We attempt to place these next to each other.
258258
void *result = nullptr;
259+
uint32_t after = ~0u;
260+
uint32_t old_index = 0;
259261
for (uint64_t mask = lane_mask; mask;
260262
mask = gpu::ballot(lane_mask, !result)) {
261263
if (result)
262264
continue;
263265

264-
uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
266+
// We try using any known empty bits from the previous attempt first.
267+
uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
268+
~after ? (old_index & ~(BITS_IN_WORD - 1)) +
269+
cpp::countr_zero(~after)
270+
: impl::xorshift32(state));
265271

266272
uint32_t id = impl::lane_count(uniform & mask);
267273
uint32_t index = (start + id) % usable_bits(chunk_size);
@@ -271,8 +277,9 @@ struct Slab {
271277
// Get the mask of bits destined for the same slot and coalesce it.
272278
uint64_t match = uniform & gpu::match_any(mask, slot);
273279
uint32_t length = cpp::popcount(match);
274-
uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
275-
<< bit;
280+
uint32_t bitmask = gpu::shuffle(
281+
mask, cpp::countr_zero(match),
282+
static_cast<uint32_t>((uint64_t(1) << length) - 1) << bit);
276283

277284
uint32_t before = 0;
278285
if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
@@ -283,6 +290,9 @@ struct Slab {
283290
result = ptr_from_index(index, chunk_size);
284291
else
285292
sleep_briefly();
293+
294+
after = before | bitmask;
295+
old_index = index;
286296
}
287297

288298
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);

0 commit comments

Comments
 (0)