Skip to content
2 changes: 1 addition & 1 deletion Include/internal/pycore_pymem.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ extern wchar_t *_PyMem_DefaultRawWcsdup(const wchar_t *str);
extern int _PyMem_DebugEnabled(void);

// Enqueue a pointer to be freed possibly after some delay.
extern void _PyMem_FreeDelayed(void *ptr);
extern void _PyMem_FreeDelayed(void *ptr, size_t size);

// Enqueue an object to be freed possibly after some delay
#ifdef Py_GIL_DISABLED
Expand Down
31 changes: 25 additions & 6 deletions Include/internal/pycore_qsbr.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,21 @@ struct _qsbr_thread_state {
// Thread state (or NULL)
PyThreadState *tstate;

// Used to defer advancing write sequence a fixed number of times
int deferrals;
// Number of held items added by this thread since the last write sequence
// advance
int deferred_count;

// Estimate for the amount of memory that is held by this thread since
// the last write sequence advance
size_t deferred_memory;

// Amount of memory in mimalloc pages deferred from collection. When
// deferred, they are prevented from being used for a different size class
// and in a different thread.
size_t deferred_page_memory;

// True if the deferred memory frees should be processed.
bool should_process;

// Is this thread state allocated?
bool allocated;
Expand Down Expand Up @@ -109,11 +122,17 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
extern uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared);

// Batches requests to advance the write sequence. This advances the write
// sequence every N calls, which reduces overhead but increases time to
// reclamation. Returns the new goal.
// Return the next value for the write sequence (current plus the increment).
extern uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
_Py_qsbr_shared_next(struct _qsbr_shared *shared);

// Report whether this thread's QSBR state has flagged its deferred memory
// frees for processing, i.e. whether they should now be examined to see if
// they can be released safely.
static inline bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
{
    const bool process_pending = qsbr->should_process;
    return process_pending;
}

// Have the read sequences advanced to the given goal? If this returns true,
// it is safe to reclaim any memory tagged with the goal (or earlier goal).
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Limit excess memory usage in the :term:`free threading` build when a
large dictionary or list is resized and accessed by multiple threads.
2 changes: 1 addition & 1 deletion Objects/codeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3370,7 +3370,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
}
memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
_Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
_PyMem_FreeDelayed(tlbc);
_PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *));
tlbc = new_tlbc;
}
char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
Expand Down
4 changes: 2 additions & 2 deletions Objects/dictobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr)
{
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(keys);
_PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys));
return;
}
#endif
Expand Down Expand Up @@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr)
assert(values->embedded == 0);
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(values);
_PyMem_FreeDelayed(values, values_size_from_count(values->capacity));
return;
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion Objects/listobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr)
#ifdef Py_GIL_DISABLED
_PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item);
if (use_qsbr) {
_PyMem_FreeDelayed(array);
size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *);
_PyMem_FreeDelayed(array, size);
}
else {
PyMem_Free(array);
Expand Down
86 changes: 80 additions & 6 deletions Objects/obmalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,29 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page)

}

#ifdef Py_GIL_DISABLED

// If we are deferring collection of more than this amount of memory for
// mimalloc pages, advance the write sequence.  Advancing allows these
// pages to be re-used in a different thread or for a different size class.
// The replacement list is parenthesized so the macro always expands to a
// single value, whatever operators surround it at the point of use.
#define QSBR_PAGE_MEM_LIMIT (4096*20)

// Return true if the global write sequence should be advanced for a mimalloc
// page that is deferred from collection.
//
// The page's size (capacity * block size) is added to the running total kept
// in qsbr->deferred_page_memory.  Once that total exceeds
// QSBR_PAGE_MEM_LIMIT the total is reset to zero and true is returned;
// otherwise the total keeps accumulating and false is returned.
static bool
should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
{
    size_t bsize = mi_page_block_size(page);
    size_t page_size = page->capacity*bsize;
    qsbr->deferred_page_memory += page_size;
    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
        // Start a fresh accounting window for the next batch of pages.
        qsbr->deferred_page_memory = 0;
        return true;
    }
    return false;
}
#endif

static bool
_PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
{
Expand All @@ -139,7 +162,14 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)

_PyMem_mi_page_clear_qsbr(page);
page->retire_expire = 0;
page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);

if (should_advance_qsbr_for_page(tstate->qsbr, page)) {
page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared);
}
else {
page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared);
}

llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
return false;
}
Expand Down Expand Up @@ -1141,8 +1171,38 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
}
}


#ifdef Py_GIL_DISABLED

// For deferred advance on free: the number of deferred items before advancing
// the write sequence.  This is based on WORK_ITEMS_PER_CHUNK.  We ideally
// want to process a chunk before it overflows.
#define QSBR_DEFERRED_LIMIT 127

// If the deferred memory exceeds 1 MiB, advance the write sequence.  This
// helps limit memory usage due to QSBR delaying frees too long.
// The replacement list is parenthesized so the macro always expands to a
// single value, whatever operators surround it at the point of use.
#define QSBR_FREE_MEM_LIMIT (1024*1024)

// Return true if the global write sequence should be advanced for a deferred
// memory free.
//
// Each call counts one more deferred item and adds `size` bytes to the
// thread's deferred-memory estimate.  When either the item count exceeds
// QSBR_DEFERRED_LIMIT or the estimate exceeds QSBR_FREE_MEM_LIMIT, both
// counters are reset, qsbr->should_process is set, and true is returned.
// Otherwise the counters keep accumulating and false is returned.
static bool
should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
{
    qsbr->deferred_count++;
    qsbr->deferred_memory += size;
    if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||
            qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
        // Start a fresh accounting window and flag the queued frees so
        // that they get processed.
        qsbr->deferred_count = 0;
        qsbr->deferred_memory = 0;
        qsbr->should_process = true;
        return true;
    }
    return false;
}
#endif

static void
free_delayed(uintptr_t ptr)
free_delayed(uintptr_t ptr, size_t size)
{
#ifndef Py_GIL_DISABLED
free_work_item(ptr, NULL, NULL);
Expand Down Expand Up @@ -1200,23 +1260,32 @@ free_delayed(uintptr_t ptr)
}

assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
uint64_t seq;
if (should_advance_qsbr_for_free(tstate->qsbr, size)) {
seq = _Py_qsbr_advance(tstate->qsbr->shared);
}
else {
seq = _Py_qsbr_shared_next(tstate->qsbr->shared);
}
buf->array[buf->wr_idx].ptr = ptr;
buf->array[buf->wr_idx].qsbr_goal = seq;
buf->wr_idx++;

if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
// Normally the processing of delayed items is done from the eval
// breaker. Processing here is a safety measure to ensure too much
// work does not accumulate.
_PyMem_ProcessDelayed((PyThreadState *)tstate);
}
#endif
}

// Hand `ptr` to free_delayed() so that it is freed possibly after some delay.
// A NULL `ptr` is ignored.  In the two-argument form, `size` is passed
// through to free_delayed() unchanged.
void
_PyMem_FreeDelayed(void *ptr)
_PyMem_FreeDelayed(void *ptr, size_t size)
{
// The low bit of the pointer must be clear here; _PyObject_XDecRefDelayed()
// sets that bit on the values it passes to free_delayed().
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed((uintptr_t)ptr);
free_delayed((uintptr_t)ptr, size);
}
}

Expand All @@ -1226,7 +1295,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed(((uintptr_t)ptr)|0x01);
// We use 0 as the size since we don't have an easy way to know the
// actual size. If we are freeing many objects, the write sequence
// will be advanced due to QSBR_DEFERRED_LIMIT.
free_delayed(((uintptr_t)ptr)|0x01, 0);
}
}
#endif
Expand Down Expand Up @@ -1302,6 +1374,8 @@ _PyMem_ProcessDelayed(PyThreadState *tstate)
PyInterpreterState *interp = tstate->interp;
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;

tstate_impl->qsbr->should_process = false;

// Process thread-local work
process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);

Expand Down
4 changes: 4 additions & 0 deletions Python/ceval_gil.c
Original file line number Diff line number Diff line change
Expand Up @@ -1387,6 +1387,10 @@ _Py_HandlePending(PyThreadState *tstate)
_Py_unset_eval_breaker_bit(tstate, _PY_EVAL_EXPLICIT_MERGE_BIT);
_Py_brc_merge_refcounts(tstate);
}
/* Process deferred memory frees held by QSBR */
if (_Py_qsbr_should_process(((_PyThreadStateImpl *)tstate)->qsbr)) {
_PyMem_ProcessDelayed(tstate);
}
#endif

/* GC scheduled to run */
Expand Down
12 changes: 2 additions & 10 deletions Python/qsbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@
// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
// the write sequence.
#define QSBR_DEFERRED_LIMIT 10

// Allocate a QSBR thread state from the freelist
static struct _qsbr_thread_state *
qsbr_allocate(struct _qsbr_shared *shared)
Expand Down Expand Up @@ -117,13 +113,9 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
}

// _Py_qsbr_deferred_advance(): counts calls in qsbr->deferrals; while the
// count is below QSBR_DEFERRED_LIMIT it returns the current shared sequence
// plus QSBR_INCR, otherwise it resets the count and returns the result of
// _Py_qsbr_advance().
// _Py_qsbr_shared_next(): returns the value reported by
// _Py_qsbr_shared_current() plus QSBR_INCR.
uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
_Py_qsbr_shared_next(struct _qsbr_shared *shared)
{
if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}
qsbr->deferrals = 0;
return _Py_qsbr_advance(qsbr->shared);
return _Py_qsbr_shared_current(shared) + QSBR_INCR;
}

static uint64_t
Expand Down
Loading