Cache bin: rewrite to track more state.

With this, we track the empty, full, and low water states together. This
simplifies much of the tracking logic, since we no longer need the
cache_bin_info_t for state queries (except for some debugging).
David Goldblatt, 2020-03-03 18:32:36 -08:00, committed by David Goldblatt
parent fef0b1ffe4
commit 397da03865
2 changed files with 129 additions and 135 deletions
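The change replaces the compressed cur_ptr union with a plain stack_head pointer plus three 16-bit snapshots of its low bits (full, empty, and low water). Below is a minimal standalone sketch of how such fields answer the fast-path state queries; the names mirror the diff, but this is illustrative only, not the jemalloc implementation itself.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch (stand-in names).  The stack grows down: "full" is the
 * lowest address in the owned array, "empty" is one past the highest.
 */
typedef struct {
    void **stack_head;           /* next item to pop */
    uint16_t low_bits_low_water; /* head's low bits at the low-water mark */
    uint16_t low_bits_full;      /* head's low bits when the bin is full */
    uint16_t low_bits_empty;     /* head's low bits when the bin is empty */
} sketch_bin_t;

/* Wraparound-safe byte distance from "earlier" to "later". */
static inline uint16_t
sketch_diff(uint16_t earlier, uint16_t later) {
    return (uint16_t)(later - earlier); /* wraps mod 2^16 by design */
}

static inline unsigned
sketch_ncached(const sketch_bin_t *bin) {
    return sketch_diff((uint16_t)(uintptr_t)bin->stack_head,
        bin->low_bits_empty) / sizeof(void *);
}

static inline int
sketch_is_full(const sketch_bin_t *bin) {
    return (uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_full;
}

static inline int
sketch_is_empty(const sketch_bin_t *bin) {
    return (uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_empty;
}

static inline unsigned
sketch_low_water(const sketch_bin_t *bin) {
    return sketch_diff(bin->low_bits_low_water, bin->low_bits_empty) /
        sizeof(void *);
}

None of these queries consult cache_bin_info_t; after the change, ncached_max is only needed for sizing and debug asserts, which is what the commit message means by not needing the info struct for state queries.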

include/jemalloc/internal/cache_bin.h

@@ -35,67 +35,53 @@ struct cache_bin_stats_s {
*/
typedef struct cache_bin_info_s cache_bin_info_t;
struct cache_bin_info_s {
/* The size of the bin stack, i.e. ncached_max * sizeof(ptr). */
cache_bin_sz_t stack_size;
cache_bin_sz_t ncached_max;
};
typedef struct cache_bin_s cache_bin_t;
struct cache_bin_s {
/*
* The cache bin stack is represented using 3 pointers: cur_ptr,
* low_water and full, optimized for the fast path efficiency.
*
* low addr ==> high addr
* |----|----|----|item1|item2|.....................|itemN|
* full cur empty
* (ncached == N; full + ncached_max == empty)
*
* Data directly stored:
* 1) cur_ptr points to the current item to be allocated, i.e. *cur_ptr.
* 2) full points to the top of the stack (i.e. ncached == ncached_max),
* which is compared against on free_fastpath to check "is_full".
* 3) low_water indicates a low water mark of ncached.
* Range of low_water is [cur, empty], i.e. values of [ncached, 0].
*
* The empty position (ncached == 0) is derived via full + ncached_max
* and not accessed in the common case (guarded behind low_water).
*
* On 64-bit, 2 of the 3 pointers (full and low water) are compressed by
* omitting the high 32 bits. Overflow of the half pointers is avoided
* when allocating / initializing the stack space. As a result,
* cur_ptr.lowbits can be safely used for pointer comparisons.
* The stack grows down. Whenever the bin is nonempty, the head points
* to an array entry containing a valid allocation. When it is empty,
* the head points to one element past the owned array.
*/
union {
void **ptr;
struct {
/* highbits never accessed directly. */
#if (LG_SIZEOF_PTR == 3 && defined(JEMALLOC_BIG_ENDIAN))
uint32_t __highbits;
#endif
uint32_t lowbits;
#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
uint32_t __highbits;
#endif
};
} cur_ptr;
void **stack_head;
/*
* The low bits of the address of the first item in the stack that
* hasn't been used since the last GC, to track the low water mark (min
* # of cached items).
*
* Since the stack grows down, this is a higher address than
* low_bits_full.
*/
uint16_t low_bits_low_water;
/*
* The low bits of the value that stack_head will take on when the array
* is full. (But remember that stack_head always points to a valid item
* when the array is nonempty -- this is in the array).
*
* Recall that since the stack grows down, this is the lowest address in
* the array.
*/
uint16_t low_bits_full;
/*
* The low bits of the value that stack_head will take on when the array
* is empty.
*
* The stack grows down -- this is one past the highest address in the
* array.
*/
uint16_t low_bits_empty;
/*
* stack_head and stats are both modified frequently. Let's keep them
* close so that they have a higher chance of being on the same
* cacheline, thus fewer write-backs.
*/
cache_bin_stats_t tstats;
/*
* Points to the first item that hasn't been used since last GC, to
* track the low water mark (min # of cached).
*/
uint32_t low_water_position;
/*
* Points to the position when the cache is full.
*
* To make use of adjacent cacheline prefetch, the items in the avail
* stack go to higher addresses for newer allocations (i.e. cur_ptr++).
*/
uint32_t full_position;
};
typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
@@ -118,30 +104,51 @@ struct cache_bin_array_descriptor_s {
/* Returns ncached_max: Upper limit on ncached. */
static inline cache_bin_sz_t
cache_bin_info_ncached_max(cache_bin_info_t *info) {
return info->stack_size / sizeof(void *);
return info->ncached_max;
}
/*
* Asserts that the pointer associated with earlier is <= the one associated
* with later.
*/
static inline void
cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
if (earlier > later) {
assert(bin->low_bits_full > bin->low_bits_empty);
}
}
/*
* Internal -- does difference calculations that handle wraparound correctly.
* Earlier must be associated with the position earlier in memory.
*/
static inline uint16_t
cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
cache_bin_assert_earlier(bin, earlier, later);
return later - earlier;
}
static inline cache_bin_sz_t
cache_bin_ncached_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t n = (cache_bin_sz_t)((info->stack_size +
bin->full_position - bin->cur_ptr.lowbits) / sizeof(void *));
cache_bin_sz_t diff = cache_bin_diff(bin,
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
cache_bin_sz_t n = diff / sizeof(void *);
assert(n <= cache_bin_info_ncached_max(info));
assert(n == 0 || *(bin->cur_ptr.ptr) != NULL);
assert(n == 0 || *(bin->stack_head) != NULL);
return n;
}
static inline void **
cache_bin_empty_position_get(cache_bin_t *bin, cache_bin_info_t *info) {
void **ret = bin->cur_ptr.ptr + cache_bin_ncached_get(bin, info);
/* Low bits overflow disallowed when allocating the space. */
assert((uint32_t)(uintptr_t)ret >= bin->cur_ptr.lowbits);
cache_bin_sz_t diff = cache_bin_diff(bin,
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
uintptr_t empty_bits = (uintptr_t)bin->stack_head + diff;
void **ret = (void **)empty_bits;
/* Can also be computed via (full_position + ncached_max) | highbits. */
uintptr_t lowbits = bin->full_position + info->stack_size;
uintptr_t highbits = (uintptr_t)bin->cur_ptr.ptr &
~(((uint64_t)1 << 32) - 1);
assert(ret == (void **)(lowbits | highbits));
assert(ret >= bin->stack_head);
return ret;
}
@@ -149,20 +156,29 @@ cache_bin_empty_position_get(cache_bin_t *bin, cache_bin_info_t *info) {
static inline void
cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) {
assert(cache_bin_ncached_get(bin, info) == 0);
assert(cache_bin_empty_position_get(bin, info) == bin->cur_ptr.ptr);
assert(cache_bin_empty_position_get(bin, info) == bin->stack_head);
}
/*
* Get low water, but without any of the correctness checking we do for the
* caller-usable version, in case we are temporarily breaking invariants
* (e.g. during flush, ncached may briefly drop below low_water).
*/
static inline cache_bin_sz_t
cache_bin_low_water_get_internal(cache_bin_t *bin, cache_bin_info_t *info) {
return cache_bin_diff(bin, bin->low_bits_low_water,
bin->low_bits_empty) / sizeof(void *);
}
/* Returns the numeric value of low water in [0, ncached]. */
static inline cache_bin_sz_t
cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
cache_bin_sz_t low_water = ncached_max -
(cache_bin_sz_t)((bin->low_water_position - bin->full_position) /
sizeof(void *));
assert(low_water <= ncached_max);
cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin, info);
assert(low_water <= cache_bin_info_ncached_max(info));
assert(low_water <= cache_bin_ncached_get(bin, info));
assert(bin->low_water_position >= bin->cur_ptr.lowbits);
cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head,
bin->low_bits_low_water);
return low_water;
}
@@ -173,20 +189,7 @@ cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
*/
static inline void
cache_bin_low_water_set(cache_bin_t *bin) {
bin->low_water_position = bin->cur_ptr.lowbits;
}
/*
* This is an internal implementation detail -- users should only affect ncached
* via single-item pushes or batch fills.
*/
static inline void
cache_bin_ncached_set(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_sz_t n) {
bin->cur_ptr.lowbits = bin->full_position + info->stack_size
- n * sizeof(void *);
assert(n <= cache_bin_info_ncached_max(info));
assert(n == 0 || *bin->cur_ptr.ptr != NULL);
bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
}
static inline void
@@ -198,38 +201,35 @@ cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
}
JEMALLOC_ALWAYS_INLINE void *
cache_bin_alloc_easy_impl(cache_bin_t *bin, cache_bin_info_t *info,
bool *success, const bool adjust_low_water) {
cache_bin_alloc_easy_impl(cache_bin_t *bin, bool *success,
const bool adjust_low_water) {
/*
* This may read from the empty position; however the loaded value won't
* be used. It's safe because the stack has one more slot reserved.
*/
void *ret = *(bin->cur_ptr.ptr++);
void *ret = *bin->stack_head;
uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head;
void **new_head = bin->stack_head + 1;
/*
* Check for both bin->ncached == 0 and ncached < low_water in a single
* branch. When adjust_low_water is true, this also avoids accessing
* the cache_bin_info_t (which is on a separate cacheline / page) in
* the common case.
* Note that the low water mark is at most empty; if we pass this check,
* we know we're non-empty.
*/
if (unlikely(bin->cur_ptr.lowbits > bin->low_water_position)) {
if (unlikely(low_bits == bin->low_bits_low_water)) {
if (adjust_low_water) {
uint32_t empty_position = bin->full_position +
info->stack_size;
if (unlikely(bin->cur_ptr.lowbits > empty_position)) {
/* Over-allocated; revert. */
bin->cur_ptr.ptr--;
assert(bin->cur_ptr.lowbits == empty_position);
if (unlikely(low_bits == bin->low_bits_empty)) {
*success = false;
return NULL;
}
bin->low_water_position = bin->cur_ptr.lowbits;
/* Overflow should be impossible. */
assert(bin->low_bits_low_water
< (uint16_t)(uintptr_t)new_head);
bin->low_bits_low_water = (uint16_t)(uintptr_t)new_head;
} else {
bin->cur_ptr.ptr--;
assert(bin->cur_ptr.lowbits == bin->low_water_position);
*success = false;
return NULL;
}
}
bin->stack_head = new_head;
/*
* success (instead of ret) should be checked upon the return of this
@@ -246,22 +246,27 @@ cache_bin_alloc_easy_impl(cache_bin_t *bin, cache_bin_info_t *info,
JEMALLOC_ALWAYS_INLINE void *
cache_bin_alloc_easy_reduced(cache_bin_t *bin, bool *success) {
/* We don't look at info if we're not adjusting low-water. */
return cache_bin_alloc_easy_impl(bin, NULL, success, false);
return cache_bin_alloc_easy_impl(bin, success, false);
}
JEMALLOC_ALWAYS_INLINE void *
cache_bin_alloc_easy(cache_bin_t *bin, cache_bin_info_t *info, bool *success) {
return cache_bin_alloc_easy_impl(bin, info, success, true);
/* We don't use info now, but we may want to in the future. */
(void)info;
return cache_bin_alloc_easy_impl(bin, success, true);
}
JEMALLOC_ALWAYS_INLINE bool
cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
if (unlikely(bin->cur_ptr.lowbits == bin->full_position)) {
uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head;
if (unlikely(low_bits == bin->low_bits_full)) {
return false;
}
*(--bin->cur_ptr.ptr) = ptr;
assert(bin->cur_ptr.lowbits >= bin->full_position);
bin->stack_head--;
*bin->stack_head = ptr;
cache_bin_assert_earlier(bin, bin->low_bits_full,
(uint16_t)(uintptr_t)bin->stack_head);
return true;
}
@@ -279,8 +284,8 @@ struct cache_bin_ptr_array_s {
static inline void
cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) {
arr->ptr = cache_bin_empty_position_get(bin, info) - nfill;
assert(cache_bin_ncached_get(bin, info) == 0);
arr->ptr = cache_bin_empty_position_get(bin, info) - nfill;
}
/*
@@ -292,12 +297,12 @@ static inline void
cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) {
assert(cache_bin_ncached_get(bin, info) == 0);
void **empty_position = cache_bin_empty_position_get(bin, info);
if (nfilled < arr->n) {
void **empty_position = cache_bin_empty_position_get(bin, info);
memmove(empty_position - nfilled, empty_position - arr->n,
nfilled * sizeof(void *));
}
cache_bin_ncached_set(bin, info, nfilled);
bin->stack_head = empty_position - nfilled;
}
static inline void
@@ -326,11 +331,12 @@ static inline void
cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
unsigned rem = cache_bin_ncached_get(bin, info) - nflushed;
memmove(bin->cur_ptr.ptr + nflushed, bin->cur_ptr.ptr,
memmove(bin->stack_head + nflushed, bin->stack_head,
rem * sizeof(void *));
cache_bin_ncached_set(bin, info, rem);
if (bin->cur_ptr.lowbits > bin->low_water_position) {
bin->low_water_position = bin->cur_ptr.lowbits;
bin->stack_head = bin->stack_head + nflushed;
if (cache_bin_ncached_get(bin, info)
< cache_bin_low_water_get_internal(bin, info)) {
bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
}
}
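The uint16_t arithmetic above stays correct even when a bin's backing array straddles a 64 KiB boundary, because cache_bin_diff() relies on unsigned wraparound; cache_bin_assert_earlier() tolerates earlier > later only in exactly that straddling case (low_bits_full > low_bits_empty). A small self-contained example with hypothetical low-bit values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void) {
    /*
     * Hypothetical bin whose backing array straddles a 64 KiB boundary: the
     * full (lowest) position has low bits 0xfff0 and the empty position has
     * low bits 0x0010, so the low bits wrapped even though the full address
     * is really below the empty address.
     */
    uint16_t low_bits_full = 0xfff0;
    uint16_t low_bits_empty = 0x0010;

    /* 0x0010 - 0xfff0 wraps mod 2^16 to 0x0020 = 32 bytes. */
    uint16_t diff = (uint16_t)(low_bits_empty - low_bits_full);
    assert(diff == 0x0020);

    /* With 8-byte pointers, that is a 4-slot distance. */
    printf("distance: %u bytes, %u slots\n", (unsigned)diff,
        (unsigned)(diff / sizeof(void *)));
    return 0;
}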

src/cache_bin.c

@@ -8,7 +8,7 @@ cache_bin_info_init(cache_bin_info_t *info,
cache_bin_sz_t ncached_max) {
size_t stack_size = (size_t)ncached_max * sizeof(void *);
assert(stack_size < ((size_t)1 << (sizeof(cache_bin_sz_t) * 8)));
info->stack_size = (cache_bin_sz_t)stack_size;
info->ncached_max = (cache_bin_sz_t)ncached_max;
}
void
@@ -23,23 +23,14 @@ cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
*/
*size = sizeof(void *) * 2;
for (szind_t i = 0; i < ninfos; i++) {
*size += infos[i].stack_size;
*size += infos[i].ncached_max * sizeof(void *);
}
/*
* 1) Align to at least PAGE, to minimize the # of TLBs needed by the
* Align to at least PAGE, to minimize the # of TLBs needed by the
* smaller sizes; also helps if the larger sizes don't get used at all.
* 2) On 32-bit the pointers won't be compressed; use minimal alignment.
*/
if (LG_SIZEOF_PTR < 3 || *size < PAGE) {
*alignment = PAGE;
} else {
/*
* Align pow2 to avoid overflow the cache bin compressed
* pointers.
*/
*alignment = pow2_ceil_zu(*size);
}
*alignment = PAGE;
}
void
@@ -53,10 +44,6 @@ cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc,
cache_bin_info_compute_alloc(infos, ninfos, &computed_size,
&computed_alignment);
assert(((uintptr_t)alloc & (computed_alignment - 1)) == 0);
/* And that alignment should disallow overflow. */
uint32_t lowbits = (uint32_t)((uintptr_t)alloc + computed_size);
assert((uint32_t)(uintptr_t)alloc < lowbits);
}
/*
* Leave a noticeable mark pattern on the boundaries, in case a bug
@@ -81,7 +68,6 @@ cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc,
void
cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
size_t *cur_offset) {
assert(sizeof(bin->cur_ptr) == sizeof(void *));
/*
* The full_position points to the lowest available space. Allocations
* will access the slots toward higher addresses (for the benefit of
@@ -89,21 +75,23 @@ cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
*/
void *stack_cur = (void *)((uintptr_t)alloc + *cur_offset);
void *full_position = stack_cur;
uint32_t bin_stack_size = info->stack_size;
uint16_t bin_stack_size = info->ncached_max * sizeof(void *);
*cur_offset += bin_stack_size;
void *empty_position = (void *)((uintptr_t)alloc + *cur_offset);
/* Init to the empty position. */
bin->cur_ptr.ptr = empty_position;
bin->low_water_position = bin->cur_ptr.lowbits;
bin->full_position = (uint32_t)(uintptr_t)full_position;
assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size);
bin->stack_head = (void **)empty_position;
bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
bin->low_bits_full = (uint16_t)(uintptr_t)full_position;
bin->low_bits_empty = (uint16_t)(uintptr_t)empty_position;
assert(cache_bin_diff(bin, bin->low_bits_full,
(uint16_t)(uintptr_t) bin->stack_head) == bin_stack_size);
assert(cache_bin_ncached_get(bin, info) == 0);
assert(cache_bin_empty_position_get(bin, info) == empty_position);
}
bool
cache_bin_still_zero_initialized(cache_bin_t *bin) {
return bin->cur_ptr.ptr == NULL;
return bin->stack_head == NULL;
}
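Finally, a hedged sketch of the invariants cache_bin_init() establishes above, again with stand-in names rather than the real API: a fresh bin's head sits at the empty position, the low-water mark starts at empty (a low water of zero), and the full-to-head distance covers the whole stack.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int
main(void) {
    /* Stand-in for one bin's slice of the tcache bin-stack region. */
    const unsigned ncached_max = 8;
    size_t stack_size = ncached_max * sizeof(void *);
    void **stack = malloc(stack_size);
    assert(stack != NULL);

    /* Freshly initialized state: head at the empty position. */
    void **stack_head = stack + ncached_max;
    uint16_t low_bits_full = (uint16_t)(uintptr_t)stack;
    uint16_t low_bits_empty = (uint16_t)(uintptr_t)stack_head;
    uint16_t low_bits_low_water = low_bits_empty; /* low water == 0 */

    /* Full-to-head spans the whole stack; head-to-empty is zero. */
    assert((uint16_t)((uint16_t)(uintptr_t)stack_head - low_bits_full) ==
        (uint16_t)stack_size);
    assert((uint16_t)(low_bits_empty - (uint16_t)(uintptr_t)stack_head) == 0);

    /* A dalloc-style push moves the head down one slot... */
    int item;
    stack_head--;
    *stack_head = &item;
    /* ...and ncached, derived purely from the low bits, becomes 1. */
    assert((uint16_t)(low_bits_empty - (uint16_t)(uintptr_t)stack_head) /
        sizeof(void *) == 1);

    (void)low_bits_low_water;
    free(stack);
    return 0;
}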