Redesign the cache bin metadata for fast path.

Implement pointer-based metadata for tcache bins:
- 3 pointers are maintained to represent each bin;
- 2 of the pointers are compressed to 32 bits on 64-bit platforms;
- is_full / is_empty checks are done through pointer comparison.

Compared to the previous counter-based design:
- ~15% fast-path speedup in benchmarks;
- direct pointer comparison and dereference;
- no need to access tcache_bin_info in the common case.
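Concretely, the three pointers are the current position, the low-water position, and the full position, with the empty position derivable as full + stack size; the latter two are stored as 32-bit lowbits on 64-bit platforms. A minimal sketch of the shape, assuming a little-endian 64-bit target; all names here are hypothetical stand-ins for the real cache_bin.h definitions:

#include <stdbool.h>
#include <stdint.h>

typedef struct {
	union {
		void **ptr;       /* current position in the bin stack */
		uint32_t lowbits; /* low 32 bits of ptr (little-endian) */
	} cur_ptr;
	uint32_t low_water_position; /* compressed low-water pointer */
	uint32_t full_position;      /* compressed lowest stack address */
} cache_bin_sketch_t;

/* stack_size == ncached_max * sizeof(void *), from tcache_bin_info. */
static inline bool
bin_is_full(const cache_bin_sketch_t *bin) {
	return bin->cur_ptr.lowbits == bin->full_position;
}

static inline bool
bin_is_empty(const cache_bin_sketch_t *bin, uint32_t stack_size) {
	/* The empty position is full_position + stack_size. */
	return bin->cur_ptr.lowbits == bin->full_position + stack_size;
}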
commit 7599c82d48 (parent d2dddfb82a)
Qi Wang, 2019-08-09 22:12:47 -07:00 (committed by Qi Wang)
9 changed files with 340 additions and 122 deletions

src/arena.c

@@ -202,12 +202,13 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
for (szind_t i = 0; i < SC_NBINS; i++) {
cache_bin_t *tbin = &descriptor->bins_small[i];
arena_stats_accum_zu(&astats->tcache_bytes,
- tbin->ncached * sz_index2size(i));
+ cache_bin_ncached_get(tbin, i) * sz_index2size(i));
}
for (szind_t i = 0; i < nhbins - SC_NBINS; i++) {
cache_bin_t *tbin = &descriptor->bins_large[i];
arena_stats_accum_zu(&astats->tcache_bytes,
- tbin->ncached * sz_index2size(i));
+ cache_bin_ncached_get(tbin, i + SC_NBINS) *
+ sz_index2size(i));
}
}
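For stats like the above, ncached is no longer a stored counter; it is derived from the compressed pointers. A hedged sketch, continuing the hypothetical types from the commit-message sketch:

static inline uint32_t
ncached_sketch(const cache_bin_sketch_t *bin, uint32_t stack_size) {
	uint32_t empty_lowbits = bin->full_position + stack_size;
	/* Distance from cur_ptr up to the empty position, in slots. */
	return (empty_lowbits - bin->cur_ptr.lowbits) / (uint32_t)sizeof(void *);
}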
malloc_mutex_prof_read(tsdn,
@@ -1381,7 +1382,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
unsigned i, nfill, cnt;
- assert(tbin->ncached == 0);
+ assert(cache_bin_ncached_get(tbin, binind) == 0);
if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
prof_idump(tsdn);
@@ -1390,6 +1391,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
unsigned binshard;
bin_t *bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
+ void **empty_position = cache_bin_empty_position_get(tbin, binind);
for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
tcache->lg_fill_div[binind]); i < nfill; i += cnt) {
extent_t *slab;
@@ -1400,7 +1402,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tofill : extent_nfree_get(slab);
arena_slab_reg_alloc_batch(
slab, &bin_infos[binind], cnt,
- tbin->avail - nfill + i);
+ empty_position - nfill + i);
} else {
cnt = 1;
void *ptr = arena_bin_malloc_hard(tsdn, arena, bin,
@@ -1412,18 +1414,18 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
*/
if (ptr == NULL) {
if (i > 0) {
- memmove(tbin->avail - i,
- tbin->avail - nfill,
+ memmove(empty_position - i,
+ empty_position - nfill,
i * sizeof(void *));
}
break;
}
/* Insert such that low regions get used first. */
- *(tbin->avail - nfill + i) = ptr;
+ *(empty_position - nfill + i) = ptr;
}
if (config_fill && unlikely(opt_junk_alloc)) {
for (unsigned j = 0; j < cnt; j++) {
- void* ptr = *(tbin->avail - nfill + i + j);
+ void* ptr = *(empty_position - nfill + i + j);
arena_alloc_junk_small(ptr, &bin_infos[binind],
true);
}
@@ -1437,7 +1439,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tbin->tstats.nrequests = 0;
}
malloc_mutex_unlock(tsdn, &bin->lock);
- tbin->ncached = i;
+ cache_bin_ncached_set(tbin, binind, i);
arena_decay_tick(tsdn, arena);
}
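The fill path now anchors its writes at the empty position rather than the old tbin->avail base. A hedged sketch of recovering that full-width pointer from the compressed positions (valid only because the stack region never straddles a 2^32 boundary; see the alignment logic in src/tcache.c below):

static inline void **
empty_position_sketch(const cache_bin_sketch_t *bin, uint32_t stack_size) {
	uint32_t empty_lowbits = bin->full_position + stack_size;
	/* Apply the lowbits delta to the full-width current pointer. */
	uintptr_t delta = (uintptr_t)(empty_lowbits - bin->cur_ptr.lowbits);
	return (void **)((uintptr_t)bin->cur_ptr.ptr + delta);
}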

src/jemalloc.c

@@ -2368,7 +2368,7 @@ je_malloc(size_t size) {
cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
bool tcache_success;
- void* ret = cache_bin_alloc_easy(bin, &tcache_success);
+ void *ret = cache_bin_alloc_easy(bin, &tcache_success, ind);
if (tcache_success) {
if (config_stats) {
@@ -2846,8 +2846,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
}
cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind);
- cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind];
- if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) {
+ if (!cache_bin_dalloc_easy(bin, ptr)) {
return false;
}
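A hedged sketch of the two fast paths these hunks switch to, again using the hypothetical helpers above. The real cache_bin_alloc_easy folds the empty check into a single low-water comparison; the two checks are separated here for clarity, and the speculative load is safe only because of the padding slot reserved in src/tcache.c:

static inline void *
alloc_easy_sketch(cache_bin_sketch_t *bin, bool *success, uint32_t stack_size) {
	void *ret = *bin->cur_ptr.ptr;       /* may read the padding slot */
	if (bin_is_empty(bin, stack_size)) { /* pointer comparison, no counter */
		*success = false;
		return NULL;
	}
	bin->cur_ptr.ptr++;                  /* pop toward higher addresses */
	*success = true;
	return ret;
}

static inline bool
dalloc_easy_sketch(cache_bin_sketch_t *bin, void *ptr) {
	if (bin_is_full(bin)) { /* single compressed-pointer comparison */
		return false;
	}
	*(--bin->cur_ptr.ptr) = ptr;
	return true;
}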

src/tcache.c

@@ -14,7 +14,16 @@ bool opt_tcache = true;
ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
cache_bin_info_t *tcache_bin_info;
static unsigned stack_nelms; /* Total stack elms per tcache. */
+ /*
+ * For the total bin stack region (per tcache), reserve 2 more slots so that 1)
+ * the empty position can be safely read on the fast path before checking
+ * "is_empty"; and 2) the low_water == -1 case can go beyond the empty position
+ * by 1 step safely (i.e. no overflow).
+ */
+ static const unsigned total_stack_padding = sizeof(void *) * 2;
+ /* Total stack size required (per tcache). Include the padding above. */
+ static uint32_t total_stack_bytes;
unsigned nhbins;
size_t tcache_maxclass;
@@ -47,14 +56,16 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
} else {
tbin = tcache_large_bin_get(tcache, binind);
}
- if (tbin->low_water > 0) {
+ cache_bin_sz_t low_water = cache_bin_low_water_get(tbin, binind);
+ cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+ if (low_water > 0) {
/*
* Flush (ceiling) 3/4 of the objects below the low water mark.
*/
if (binind < SC_NBINS) {
tcache_bin_flush_small(tsd, tcache, tbin, binind,
- tbin->ncached - tbin->low_water + (tbin->low_water
- >> 2));
+ ncached - low_water + (low_water >> 2));
/*
* Reduce fill count by 2X. Limit lg_fill_div such that
* the fill count is always at least 1.
@@ -66,10 +77,10 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
}
} else {
tcache_bin_flush_large(tsd, tcache, tbin, binind,
- tbin->ncached - tbin->low_water + (tbin->low_water
- >> 2));
+ ncached - low_water + (low_water >> 2));
}
- } else if (tbin->low_water < 0) {
+ } else if (low_water < 0) {
+ assert(low_water == -1);
/*
* Increase fill count by 2X for small bins. Make sure
* lg_fill_div stays greater than 0.
@@ -78,7 +89,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
tcache->lg_fill_div[binind]--;
}
}
- tbin->low_water = tbin->ncached;
+ tbin->low_water_position = tbin->cur_ptr.lowbits;
tcache->next_gc_bin++;
if (tcache->next_gc_bin == nhbins) {
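In the new scheme a larger cur_ptr.lowbits means fewer cached items, so the GC pass above records the low-water mark as the maximum lowbits seen since the last pass and recovers the counter view on demand. A hedged sketch of that recovery, using the hypothetical types above; the -1 encoding relies on the second reserved padding slot:

static inline int32_t
low_water_sketch(const cache_bin_sketch_t *bin, uint32_t stack_size) {
	uint32_t empty_lowbits = bin->full_position + stack_size;
	/*
	 * A low_water_position one slot past the empty position encodes the
	 * case where the bin went empty and an allocation was still
	 * attempted, i.e. low_water == -1.
	 */
	return (int32_t)(empty_lowbits - bin->low_water_position)
	    / (int32_t)sizeof(void *);
}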
@@ -97,7 +108,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
if (config_prof) {
tcache->prof_accumbytes = 0;
}
- ret = cache_bin_alloc_easy(tbin, tcache_success);
+ ret = cache_bin_alloc_easy(tbin, tcache_success, binind);
return ret;
}
@@ -117,9 +128,10 @@ tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind,
*/
szind_t szind;
size_t sz_sum = binind * nflush;
+ void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
for (unsigned i = 0 ; i < nflush; i++) {
rtree_extent_szind_read(tsdn, &extents_rtree,
- rtree_ctx, (uintptr_t)*(tbin->avail - 1 - i), true,
+ rtree_ctx, (uintptr_t)*(bottom_item - i), true,
&extents[i], &szind);
sz_sum -= szind;
}
@@ -137,13 +149,15 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
bool merged_stats = false;
assert(binind < SC_NBINS);
- assert((cache_bin_sz_t)rem <= tbin->ncached);
+ cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+ assert((cache_bin_sz_t)rem <= ncached);
arena_t *arena = tcache->arena;
assert(arena != NULL);
- unsigned nflush = tbin->ncached - rem;
+ unsigned nflush = ncached - rem;
VARIABLE_ARRAY(extent_t *, item_extent, nflush);
+ void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
/* Look up extent once per item. */
if (config_opt_safety_checks) {
tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind,
@@ -151,7 +165,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
} else {
for (unsigned i = 0 ; i < nflush; i++) {
item_extent[i] = iealloc(tsd_tsdn(tsd),
- *(tbin->avail - 1 - i));
+ *(bottom_item - i));
}
}
while (nflush > 0) {
@@ -181,7 +195,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
}
unsigned ndeferred = 0;
for (unsigned i = 0; i < nflush; i++) {
- void *ptr = *(tbin->avail - 1 - i);
+ void *ptr = *(bottom_item - i);
extent = item_extent[i];
assert(ptr != NULL && extent != NULL);
@@ -196,7 +210,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
* locked. Stash the object, so that it can be
* handled in a future pass.
*/
- *(tbin->avail - 1 - ndeferred) = ptr;
+ *(bottom_item - ndeferred) = ptr;
item_extent[ndeferred] = extent;
ndeferred++;
}
@@ -219,11 +233,11 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
}
- memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+ memmove(tbin->cur_ptr.ptr + (ncached - rem), tbin->cur_ptr.ptr, rem *
sizeof(void *));
- tbin->ncached = rem;
- if (tbin->ncached < tbin->low_water) {
- tbin->low_water = tbin->ncached;
+ cache_bin_ncached_set(tbin, binind, rem);
+ if (tbin->cur_ptr.lowbits > tbin->low_water_position) {
+ tbin->low_water_position = tbin->cur_ptr.lowbits;
}
}
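As the flush hunks above show, flushing walks from the bottom item (the slot just below the empty position, holding the oldest cached object) and then slides the rem surviving newest items up against the empty position before resetting cur_ptr. A hedged sketch of the bottom-item lookup, reusing the helpers above:

static inline void **
bottom_item_sketch(const cache_bin_sketch_t *bin, uint32_t stack_size) {
	/* The oldest cached item sits just below the empty position. */
	return empty_position_sketch(bin, stack_size) - 1;
}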
@@ -233,17 +247,19 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
bool merged_stats = false;
assert(binind < nhbins);
- assert((cache_bin_sz_t)rem <= tbin->ncached);
+ cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+ assert((cache_bin_sz_t)rem <= ncached);
arena_t *tcache_arena = tcache->arena;
assert(tcache_arena != NULL);
- unsigned nflush = tbin->ncached - rem;
+ unsigned nflush = ncached - rem;
VARIABLE_ARRAY(extent_t *, item_extent, nflush);
+ void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
#ifndef JEMALLOC_EXTRA_SIZE_CHECK
/* Look up extent once per item. */
for (unsigned i = 0 ; i < nflush; i++) {
- item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
+ item_extent[i] = iealloc(tsd_tsdn(tsd), *(bottom_item - i));
}
#else
tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush,
@@ -266,7 +282,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
}
for (unsigned i = 0; i < nflush; i++) {
- void *ptr = *(tbin->avail - 1 - i);
+ void *ptr = *(bottom_item - i);
assert(ptr != NULL);
extent = item_extent[i];
if (extent_arena_ind_get(extent) == locked_arena_ind) {
@@ -295,7 +311,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
unsigned ndeferred = 0;
for (unsigned i = 0; i < nflush; i++) {
- void *ptr = *(tbin->avail - 1 - i);
+ void *ptr = *(bottom_item - i);
extent = item_extent[i];
assert(ptr != NULL && extent != NULL);
@@ -308,7 +324,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
* Stash the object, so that it can be handled
* in a future pass.
*/
- *(tbin->avail - 1 - ndeferred) = ptr;
+ *(bottom_item - ndeferred) = ptr;
item_extent[ndeferred] = extent;
ndeferred++;
}
@@ -330,11 +346,11 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
tbin->tstats.nrequests = 0;
}
- memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+ memmove(tbin->cur_ptr.ptr + (ncached - rem), tbin->cur_ptr.ptr, rem *
sizeof(void *));
- tbin->ncached = rem;
- if (tbin->ncached < tbin->low_water) {
- tbin->low_water = tbin->ncached;
+ cache_bin_ncached_set(tbin, binind, rem);
+ if (tbin->cur_ptr.lowbits > tbin->low_water_position) {
+ tbin->low_water_position = tbin->cur_ptr.lowbits;
}
}
@@ -406,9 +422,43 @@ tsd_tcache_enabled_data_init(tsd_t *tsd) {
return false;
}
/* Initialize auto tcache (embedded in TSD). */
+ static bool
+ tcache_bin_init(cache_bin_t *bin, szind_t ind, uintptr_t *stack_cur) {
+ cassert(sizeof(bin->cur_ptr) == sizeof(void *));
+ /*
+ * The full_position points to the lowest available space. Allocations
+ * will access the slots toward higher addresses (for the benefit of
+ * adjacent prefetch).
+ */
+ void *full_position = (void *)*stack_cur;
+ uint32_t bin_stack_size = tcache_bin_info[ind].ncached_max *
+ sizeof(void *);
+ *stack_cur += bin_stack_size;
+ void *empty_position = (void *)*stack_cur;
+ /* Init to the empty position. */
+ bin->cur_ptr.ptr = empty_position;
+ bin->low_water_position = bin->cur_ptr.lowbits;
+ bin->full_position = (uint32_t)(uintptr_t)full_position;
+ assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size);
+ assert(cache_bin_ncached_get(bin, ind) == 0);
+ assert(cache_bin_empty_position_get(bin, ind) == empty_position);
+ return false;
+ }
+ /* Sanity check only. */
+ static bool
+ tcache_bin_lowbits_overflowable(void *ptr) {
+ uint32_t lowbits = (uint32_t)((uintptr_t)ptr + total_stack_bytes);
+ return lowbits < (uint32_t)(uintptr_t)ptr;
+ }
static void
tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
+ assert(!tcache_bin_lowbits_overflowable(avail_stack));
memset(&tcache->link, 0, sizeof(ql_elm(tcache_t)));
tcache->prof_accumbytes = 0;
tcache->next_gc_bin = 0;
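A hedged usage example of the init logic above: carve one bin out of a heap region the way tcache_bin_init carves bins out of the per-tcache stack, padding slots included. The sketch helpers from earlier sections stand in for the real cache_bin functions, and the pow2 alignment guarantee of tcache_bin_stack_alignment is ignored for brevity:

#include <assert.h>
#include <stdlib.h>

int
main(void) {
	enum { NCACHED_MAX = 4 };
	uint32_t stack_size = NCACHED_MAX * sizeof(void *);
	/* Bin stack plus the two trailing padding slots. */
	void **region = malloc(stack_size + 2 * sizeof(void *));
	cache_bin_sketch_t bin;
	bin.cur_ptr.ptr = region + NCACHED_MAX; /* the empty position */
	bin.full_position = (uint32_t)(uintptr_t)region;
	bin.low_water_position = bin.cur_ptr.lowbits;
	assert(ncached_sketch(&bin, stack_size) == 0);
	assert(bin_is_empty(&bin, stack_size));
	assert(!bin_is_full(&bin));

	/* Round-trip one object through the fast paths. */
	void *obj = &stack_size;
	assert(dalloc_easy_sketch(&bin, obj));
	assert(ncached_sketch(&bin, stack_size) == 1);
	bool success;
	assert(alloc_easy_sketch(&bin, &success, stack_size) == obj && success);
	free(region);
	return 0;
}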
@@ -416,41 +466,43 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
ticker_init(&tcache->gc_ticker, TCACHE_GC_INCR);
- size_t stack_offset = 0;
assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
memset(tcache->bins_small, 0, sizeof(cache_bin_t) * SC_NBINS);
memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS));
unsigned i = 0;
+ uintptr_t stack_cur = (uintptr_t)avail_stack;
for (; i < SC_NBINS; i++) {
tcache->lg_fill_div[i] = 1;
- stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
- /*
- * avail points past the available space. Allocations will
- * access the slots toward higher addresses (for the benefit of
- * prefetch).
- */
- tcache_small_bin_get(tcache, i)->avail =
- (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+ cache_bin_t *bin = tcache_small_bin_get(tcache, i);
+ tcache_bin_init(bin, i, &stack_cur);
}
for (; i < nhbins; i++) {
- stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
- tcache_large_bin_get(tcache, i)->avail =
- (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+ cache_bin_t *bin = tcache_large_bin_get(tcache, i);
+ tcache_bin_init(bin, i, &stack_cur);
}
- assert(stack_offset == stack_nelms * sizeof(void *));
+ /* Sanity check that the whole stack is used. */
+ size_t stack_offset = stack_cur - (uintptr_t)avail_stack;
+ assert(stack_offset + total_stack_padding == total_stack_bytes);
}
+ static size_t
+ tcache_bin_stack_alignment (size_t size) {
+ /* Align pow2 to avoid overflow the cache bin compressed pointers. */
+ return (LG_SIZEOF_PTR == 3) ? pow2_ceil_zu(size) : CACHELINE;
+ }
/* Initialize auto tcache (embedded in TSD). */
bool
tsd_tcache_data_init(tsd_t *tsd) {
tcache_t *tcache = tsd_tcachep_get_unsafe(tsd);
- assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
- size_t size = stack_nelms * sizeof(void *);
+ assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr == NULL);
/* Avoid false cacheline sharing. */
- size = sz_sa2u(size, CACHELINE);
- void *avail_array = ipallocztm(tsd_tsdn(tsd), size, CACHELINE, true,
- NULL, true, arena_get(TSDN_NULL, 0, true));
+ size_t size = sz_sa2u(total_stack_bytes, CACHELINE);
+ void *avail_array = ipallocztm(tsd_tsdn(tsd), size,
+ tcache_bin_stack_alignment(size), true, NULL, true,
+ arena_get(TSDN_NULL, 0, true));
if (avail_array == NULL) {
return true;
}
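The alignment rule works because both the chosen alignment and 2^32 are powers of two: if the region's base is aligned to its size rounded up to a power of two (and that size fits in 32 bits), no offset within the region can carry out of the low 32 bits, so lowbits arithmetic never wraps mid-region. A hedged stand-in sketch of the rule:

#include <stddef.h>

/* Stand-in for jemalloc's pow2_ceil_zu: round up to a power of two. */
static size_t
pow2_ceil_sketch(size_t x) {
	size_t p = 1;
	while (p < x) {
		p <<= 1;
	}
	return p;
}

static size_t
stack_alignment_sketch(size_t size) {
	/* On 64-bit, align to pow2 so compressed pointers cannot wrap;
	 * 64 stands in for CACHELINE on 32-bit. */
	return sizeof(void *) == 8 ? pow2_ceil_sketch(size) : 64;
}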
@@ -485,25 +537,24 @@ tsd_tcache_data_init(tsd_t *tsd) {
/* Created manual tcache for tcache.create mallctl. */
tcache_t *
tcache_create_explicit(tsd_t *tsd) {
- tcache_t *tcache;
- size_t size, stack_offset;
- size = sizeof(tcache_t);
+ size_t size = sizeof(tcache_t);
/* Naturally align the pointer stacks. */
size = PTR_CEILING(size);
- stack_offset = size;
- size += stack_nelms * sizeof(void *);
+ size_t stack_offset = size;
+ size += total_stack_bytes;
/* Avoid false cacheline sharing. */
size = sz_sa2u(size, CACHELINE);
- tcache = ipallocztm(tsd_tsdn(tsd), size, CACHELINE, true, NULL, true,
+ tcache_t *tcache = ipallocztm(tsd_tsdn(tsd), size,
+ tcache_bin_stack_alignment(size), true, NULL, true,
arena_get(TSDN_NULL, 0, true));
if (tcache == NULL) {
return NULL;
}
- tcache_init(tsd, tcache,
- (void *)((uintptr_t)tcache + (uintptr_t)stack_offset));
+ void *avail_array = (void *)((uintptr_t)tcache +
+ (uintptr_t)stack_offset);
+ tcache_init(tsd, tcache, avail_array);
tcache_arena_associate(tsd_tsdn(tsd), tcache, arena_ichoose(tsd, NULL));
return tcache;
@@ -553,9 +604,12 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
if (tsd_tcache) {
/* Release the avail array for the TSD embedded auto tcache. */
- void *avail_array =
- (void *)((uintptr_t)tcache_small_bin_get(tcache, 0)->avail -
- (uintptr_t)tcache_bin_info[0].ncached_max * sizeof(void *));
+ cache_bin_t *bin = tcache_small_bin_get(tcache, 0);
+ assert(cache_bin_ncached_get(bin, 0) == 0);
+ assert(cache_bin_empty_position_get(bin, 0) ==
+ bin->cur_ptr.ptr);
+ void *avail_array = bin->cur_ptr.ptr -
+ tcache_bin_info[0].ncached_max;
idalloctm(tsd_tsdn(tsd), avail_array, NULL, NULL, true, true);
} else {
/* Release both the tcache struct and avail array. */
@@ -587,16 +641,17 @@ tcache_cleanup(tsd_t *tsd) {
if (!tcache_available(tsd)) {
assert(tsd_tcache_enabled_get(tsd) == false);
if (config_debug) {
- assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
+ assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr
+ == NULL);
}
return;
}
assert(tsd_tcache_enabled_get(tsd));
- assert(tcache_small_bin_get(tcache, 0)->avail != NULL);
+ assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr != NULL);
tcache_destroy(tsd, tcache, true);
if (config_debug) {
- tcache_small_bin_get(tcache, 0)->avail = NULL;
+ tcache_small_bin_get(tcache, 0)->cur_ptr.ptr = NULL;
}
}
@@ -755,8 +810,8 @@ tcache_boot(tsdn_t *tsdn) {
if (tcache_bin_info == NULL) {
return true;
}
unsigned i, stack_nelms;
stack_nelms = 0;
unsigned i;
for (i = 0; i < SC_NBINS; i++) {
if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
tcache_bin_info[i].ncached_max =
@@ -775,6 +830,7 @@ tcache_boot(tsdn_t *tsdn) {
tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
stack_nelms += tcache_bin_info[i].ncached_max;
}
+ total_stack_bytes = stack_nelms * sizeof(void *) + total_stack_padding;
return false;
}