Redesign the cache bin metadata for the fast path.

Implement pointer-based metadata for tcache bins --
- 3 pointers are maintained to represent each bin;
- 2 of the pointers are compressed on 64-bit;
- is_full / is_empty are determined through pointer comparison.

Compared to the previous counter-based design --
- the fast path speeds up by ~15% in benchmarks
- direct pointer comparison and dereference
- no need to access tcache_bin_info in the common case
This commit is contained in:
Qi Wang
2019-08-09 22:12:47 -07:00
committed by Qi Wang
parent d2dddfb82a
commit 7599c82d48
9 changed files with 340 additions and 122 deletions

View File

@@ -13,7 +13,6 @@
* of the tcache at all.
*/
/*
* The count of the number of cached allocations in a bin. We make this signed
* so that negative numbers can encode "invalid" states (e.g. a low water mark
@@ -39,29 +38,67 @@ struct cache_bin_info_s {
/* Upper limit on ncached. */
cache_bin_sz_t ncached_max;
};
extern cache_bin_info_t *tcache_bin_info;
typedef struct cache_bin_s cache_bin_t;
struct cache_bin_s {
/* Min # cached since last GC. */
cache_bin_sz_t low_water;
/* # of cached objects. */
cache_bin_sz_t ncached;
/*
* ncached and stats are both modified frequently. Let's keep them
* The cache bin stack is represented using 3 pointers: cur_ptr,
* low_water and full, optimized for the fast path efficiency.
*
* low addr ==> high addr
* |----|----|----|item1|item2|.....................|itemN|
* full cur empty
* (ncached == N; full + ncached_max == empty)
*
* Data directly stored:
* 1) cur_ptr points to the current item to be allocated, i.e. *cur_ptr.
* 2) full points to the top of the stack (i.e. ncached == ncached_max),
* which is compared against on free_fastpath to check "is_full".
* 3) low_water indicates a low water mark of ncached.
* Range of low_water is [cur, empty + 1], i.e. values of [ncached, -1].
*
* The empty position (ncached == 0) is derived via full + ncached_max
* and not accessed in the common case (guarded behind low_water).
*
* On 64-bit, 2 of the 3 pointers (full and low water) are compressed by
* omitting the high 32 bits. Overflow of the half pointers is avoided
* when allocating / initializing the stack space. As a result,
* cur_ptr.lowbits can be safely used for pointer comparisons.
*/
union {
void **ptr;
struct {
/* highbits never accessed directly. */
#if (LG_SIZEOF_PTR == 3 && defined(JEMALLOC_BIG_ENDIAN))
uint32_t __highbits;
#endif
uint32_t lowbits;
#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
uint32_t __highbits;
#endif
};
} cur_ptr;
/*
* cur_ptr and stats are both modified frequently. Let's keep them
* close so that they have a higher chance of being on the same
* cacheline, thus less write-backs.
*/
cache_bin_stats_t tstats;
/*
* Stack of available objects.
* Points to the first item that hasn't been used since last GC, to
* track the low water mark (min # of cached). It may point to
* empty_position + 1, which indicates the cache has been depleted and
* refilled (low_water == -1).
*/
uint32_t low_water_position;
/*
* Points to the position when the cache is full.
*
* To make use of adjacent cacheline prefetch, the items in the avail
* stack goes to higher address for newer allocations. avail points
* just above the available space, which means that
* avail[-ncached, ... -1] are available items and the lowest item will
* be allocated first.
* stack goes to higher address for newer allocations (i.e. cur_ptr++).
*/
void **avail;
uint32_t full_position;
};
typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
@@ -76,6 +113,67 @@ struct cache_bin_array_descriptor_s {
cache_bin_t *bins_large;
};
/*
* None of the cache_bin_*_get / _set functions is used on the fast path, which
* relies on pointer comparisons to determine if the cache is full / empty.
*/
/*
 * Returns the number of items currently cached in the bin.
 *
 * The count is derived rather than stored: the byte distance between the
 * current position and the full position is converted to a slot count and
 * subtracted from the bin's ncached_max.
 */
static inline cache_bin_sz_t
cache_bin_ncached_get(cache_bin_t *bin, szind_t ind) {
	/* Byte distance from the full position to the current position. */
	uint32_t dist_from_full = bin->cur_ptr.lowbits - bin->full_position;
	cache_bin_sz_t ncached = tcache_bin_info[ind].ncached_max -
	    dist_from_full / sizeof(void *);

	assert(ncached >= 0 && ncached <= tcache_bin_info[ind].ncached_max);
	/* A non-empty bin must have a non-NULL item at the current position. */
	assert(ncached == 0 || *(bin->cur_ptr.ptr) != NULL);
	return ncached;
}
/*
 * Returns the full-width address of the empty position, i.e. the current
 * position advanced by the number of cached items.
 */
static inline void **
cache_bin_empty_position_get(cache_bin_t *bin, szind_t ind) {
	cache_bin_sz_t ncached = cache_bin_ncached_get(bin, ind);
	void **empty_position = bin->cur_ptr.ptr + ncached;

	/* Low bits overflow disallowed when allocating the space. */
	assert((uint32_t)(uintptr_t)empty_position >= bin->cur_ptr.lowbits);
	assert(bin->full_position + tcache_bin_info[ind].ncached_max *
	    sizeof(void *) > bin->full_position);
	/*
	 * Cross-check: the same address can be rebuilt from the low bits
	 * (full_position + ncached_max slots) OR-ed with the high 32 bits of
	 * the current pointer.
	 */
	assert(empty_position == (void **)((uintptr_t)(bin->full_position +
	    tcache_bin_info[ind].ncached_max * sizeof(void *)) |
	    (uintptr_t)((uintptr_t)bin->cur_ptr.ptr &
	    ~(((uint64_t)1 << 32) - 1))));
	return empty_position;
}
/*
 * Convenience accessor: returns the slot immediately below the empty
 * position, which holds the bottom item of the stack.
 */
static inline void **
cache_bin_bottom_item_get(cache_bin_t *bin, szind_t ind) {
	void **empty_position = cache_bin_empty_position_get(bin, ind);
	void **bottom_item = empty_position - 1;

	/* When the bin holds anything, the bottom slot must be populated. */
	assert(cache_bin_ncached_get(bin, ind) == 0 || *bottom_item != NULL);
	return bottom_item;
}
/*
 * Translates the stored low water position into its numeric value, which
 * lies in [-1, ncached].
 */
static inline cache_bin_sz_t
cache_bin_low_water_get(cache_bin_t *bin, szind_t ind) {
	/* Byte distance from the full position to the low water position. */
	uint32_t dist_from_full = bin->low_water_position -
	    bin->full_position;
	cache_bin_sz_t water_mark = tcache_bin_info[ind].ncached_max -
	    dist_from_full / sizeof(void *);

	assert(water_mark >= -1 && water_mark <=
	    tcache_bin_info[ind].ncached_max);
	/* The low water mark can never exceed what is currently cached. */
	assert(water_mark <= cache_bin_ncached_get(bin, ind));
	assert(bin->low_water_position >= bin->cur_ptr.lowbits);
	return water_mark;
}
/*
 * Repositions the current pointer so that the bin reports n cached items.
 *
 * Only the low 32 bits of cur_ptr are written; they are set to the full
 * position plus (ncached_max - n) slots.
 */
static inline void
cache_bin_ncached_set(cache_bin_t *bin, szind_t ind, cache_bin_sz_t n) {
	cache_bin_sz_t slots_from_full = tcache_bin_info[ind].ncached_max - n;

	bin->cur_ptr.lowbits = bin->full_position +
	    slots_from_full * sizeof(void *);

	assert(n >= 0 && n <= tcache_bin_info[ind].ncached_max);
	/* After the update, a non-empty bin must expose a non-NULL item. */
	assert(n == 0 || *bin->cur_ptr.ptr != NULL);
}
static inline void
cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
cache_bin_t *bins_small, cache_bin_t *bins_large) {
@@ -85,19 +183,24 @@ cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
}
JEMALLOC_ALWAYS_INLINE void *
cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
void *ret;
bin->ncached--;
cache_bin_alloc_easy(cache_bin_t *bin, bool *success, cache_bin_sz_t ind) {
/*
* Check for both bin->ncached == 0 and ncached < low_water
* in a single branch.
* This may read from the empty position; however the loaded value won't
* be used. It's safe because the stack has one more slot reserved.
*/
if (unlikely(bin->ncached <= bin->low_water)) {
bin->low_water = bin->ncached;
if (bin->ncached == -1) {
bin->ncached = 0;
void *ret = *(bin->cur_ptr.ptr++);
/*
* Check for both bin->ncached == 0 and ncached < low_water in a single
* branch. This also avoids accessing tcache_bin_info (which is on a
* separate cacheline / page) in the common case.
*/
if (unlikely(bin->cur_ptr.lowbits >= bin->low_water_position)) {
bin->low_water_position = bin->cur_ptr.lowbits;
uint32_t empty_position = bin->full_position +
tcache_bin_info[ind].ncached_max * sizeof(void *);
if (bin->cur_ptr.lowbits > empty_position) {
bin->cur_ptr.ptr--;
assert(bin->cur_ptr.lowbits == empty_position);
*success = false;
return NULL;
}
@@ -111,19 +214,18 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
* cacheline).
*/
*success = true;
ret = *(bin->avail - (bin->ncached + 1));
return ret;
}
JEMALLOC_ALWAYS_INLINE bool
cache_bin_dalloc_easy(cache_bin_t *bin, cache_bin_info_t *bin_info, void *ptr) {
if (unlikely(bin->ncached == bin_info->ncached_max)) {
cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
if (unlikely(bin->cur_ptr.lowbits == bin->full_position)) {
return false;
}
assert(bin->ncached < bin_info->ncached_max);
bin->ncached++;
*(bin->avail - bin->ncached) = ptr;
*(--bin->cur_ptr.ptr) = ptr;
assert(bin->cur_ptr.lowbits >= bin->full_position);
return true;
}

View File

@@ -130,8 +130,8 @@ tcache_available(tsd_t *tsd) {
if (likely(tsd_tcache_enabled_get(tsd))) {
/* Associated arena == NULL implies tcache init in progress. */
assert(tsd_tcachep_get(tsd)->arena == NULL ||
tcache_small_bin_get(tsd_tcachep_get(tsd), 0)->avail !=
NULL);
tcache_small_bin_get(tsd_tcachep_get(tsd), 0)->cur_ptr.ptr
!= NULL);
return true;
}

View File

@@ -4,8 +4,6 @@
extern bool opt_tcache;
extern ssize_t opt_lg_tcache_max;
extern cache_bin_info_t *tcache_bin_info;
/*
* Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more
* large-object bins.

View File

@@ -48,7 +48,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
assert(binind < SC_NBINS);
bin = tcache_small_bin_get(tcache, binind);
ret = cache_bin_alloc_easy(bin, &tcache_success);
ret = cache_bin_alloc_easy(bin, &tcache_success, binind);
assert(tcache_success == (ret != NULL));
if (unlikely(!tcache_success)) {
bool tcache_hard_success;
@@ -109,7 +109,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
assert(binind >= SC_NBINS &&binind < nhbins);
bin = tcache_large_bin_get(tcache, binind);
ret = cache_bin_alloc_easy(bin, &tcache_success);
ret = cache_bin_alloc_easy(bin, &tcache_success, binind);
assert(tcache_success == (ret != NULL));
if (unlikely(!tcache_success)) {
/*
@@ -164,7 +164,6 @@ JEMALLOC_ALWAYS_INLINE void
tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
bool slow_path) {
cache_bin_t *bin;
cache_bin_info_t *bin_info;
assert(tcache_salloc(tsd_tsdn(tsd), ptr)
<= SC_SMALL_MAXCLASS);
@@ -174,11 +173,10 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
}
bin = tcache_small_bin_get(tcache, binind);
bin_info = &tcache_bin_info[binind];
if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) {
tcache_bin_flush_small(tsd, tcache, bin, binind,
(bin_info->ncached_max >> 1));
bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
tcache_bin_info[binind].ncached_max >> 1);
bool ret = cache_bin_dalloc_easy(bin, ptr);
assert(ret);
}
@@ -189,7 +187,6 @@ JEMALLOC_ALWAYS_INLINE void
tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
bool slow_path) {
cache_bin_t *bin;
cache_bin_info_t *bin_info;
assert(tcache_salloc(tsd_tsdn(tsd), ptr)
> SC_SMALL_MAXCLASS);
@@ -200,11 +197,10 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
}
bin = tcache_large_bin_get(tcache, binind);
bin_info = &tcache_bin_info[binind];
if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) {
tcache_bin_flush_large(tsd, tcache, bin, binind,
(bin_info->ncached_max >> 1));
bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
tcache_bin_info[binind].ncached_max >> 1);
bool ret = cache_bin_dalloc_easy(bin, ptr);
assert(ret);
}