diff --git a/Makefile.in b/Makefile.in
index ef75d8ac..7584f598 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -178,6 +178,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/bit_util.c \
 	$(srcroot)test/unit/binshard.c \
 	$(srcroot)test/unit/buf_writer.c \
+	$(srcroot)test/unit/cache_bin.c \
 	$(srcroot)test/unit/ckh.c \
 	$(srcroot)test/unit/decay.c \
 	$(srcroot)test/unit/div.c \
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index d14556a3..67180cfa 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -13,7 +13,6 @@
  * of the tcache at all.
  */
 
-
 /*
  * The count of the number of cached allocations in a bin. We make this signed
  * so that negative numbers can encode "invalid" states (e.g. a low water mark
@@ -39,29 +38,67 @@ struct cache_bin_info_s {
 	/* Upper limit on ncached. */
 	cache_bin_sz_t ncached_max;
 };
+extern cache_bin_info_t *tcache_bin_info;
 
 typedef struct cache_bin_s cache_bin_t;
 struct cache_bin_s {
-	/* Min # cached since last GC. */
-	cache_bin_sz_t low_water;
-	/* # of cached objects. */
-	cache_bin_sz_t ncached;
 	/*
-	 * ncached and stats are both modified frequently. Let's keep them
+	 * The cache bin stack is represented using 3 pointers: cur_ptr,
+	 * low_water and full, optimized for fast path efficiency.
+	 *
+	 * low addr                                          ==> high addr
+	 * |----|----|----|item1|item2|.....................|itemN|
+	 * full            cur                                     empty
+	 * (ncached == N; full + ncached_max == empty)
+	 *
+	 * Data directly stored:
+	 * 1) cur_ptr points to the current item to be allocated, i.e. *cur_ptr.
+	 * 2) full points to the top of the stack (i.e. ncached == ncached_max),
+	 *    which is compared against on free_fastpath to check "is_full".
+	 * 3) low_water indicates a low water mark of ncached.
+	 *    Range of low_water is [cur, empty + 1], i.e. values of
+	 *    [ncached, -1].
+	 *
+	 * The empty position (ncached == 0) is derived via full + ncached_max
+	 * and not accessed in the common case (guarded behind low_water).
+	 *
+	 * On 64-bit, 2 of the 3 pointers (full and low water) are compressed by
+	 * omitting the high 32 bits. Overflow of the half pointers is avoided
+	 * when allocating / initializing the stack space. As a result,
+	 * cur_ptr.lowbits can be safely used for pointer comparisons.
+	 */
+	union {
+		void **ptr;
+		struct {
+			/* highbits never accessed directly. */
+#if (LG_SIZEOF_PTR == 3 && defined(JEMALLOC_BIG_ENDIAN))
+			uint32_t __highbits;
+#endif
+			uint32_t lowbits;
+#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
+			uint32_t __highbits;
+#endif
+		};
+	} cur_ptr;
+	/*
+	 * cur_ptr and stats are both modified frequently. Let's keep them
 	 * close so that they have a higher chance of being on the same
 	 * cacheline, thus less write-backs.
 	 */
 	cache_bin_stats_t tstats;
 	/*
-	 * Stack of available objects.
+	 * Points to the first item that hasn't been used since last GC, to
+	 * track the low water mark (min # of cached). It may point to
+	 * empty_position + 1, which indicates the cache has been depleted and
+	 * refilled (low_water == -1).
+	 */
+	uint32_t low_water_position;
+	/*
+	 * Points to the position when the cache is full.
 	 *
 	 * To make use of adjacent cacheline prefetch, the items in the avail
-	 * stack goes to higher address for newer allocations. avail points
-	 * just above the available space, which means that
-	 * avail[-ncached, ... -1] are available items and the lowest item will
-	 * be allocated first.
+	 * stack go to higher addresses for newer allocations (i.e. cur_ptr++).
 	 */
-	void **avail;
+	uint32_t full_position;
 };
 
 typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
@@ -76,6 +113,67 @@ struct cache_bin_array_descriptor_s {
 	cache_bin_t *bins_large;
 };
 
+/*
+ * None of the cache_bin_*_get / _set functions is used on the fast path, which
+ * relies on pointer comparisons to determine if the cache is full / empty.
+ */
+static inline cache_bin_sz_t
+cache_bin_ncached_get(cache_bin_t *bin, szind_t ind) {
+	cache_bin_sz_t n = tcache_bin_info[ind].ncached_max -
+	    (bin->cur_ptr.lowbits - bin->full_position) / sizeof(void *);
+	assert(n >= 0 && n <= tcache_bin_info[ind].ncached_max);
+	assert(n == 0 || *(bin->cur_ptr.ptr) != NULL);
+
+	return n;
+}
+
+static inline void **
+cache_bin_empty_position_get(cache_bin_t *bin, szind_t ind) {
+	void **ret = bin->cur_ptr.ptr + cache_bin_ncached_get(bin, ind);
+	/* Low bits overflow disallowed when allocating the space. */
+	assert((uint32_t)(uintptr_t)ret >= bin->cur_ptr.lowbits);
+	assert(bin->full_position + tcache_bin_info[ind].ncached_max *
+	    sizeof(void *) > bin->full_position);
+
+	/* Can also be computed via (full_position + ncached_max) | highbits. */
+	assert(ret == (void **)((uintptr_t)(bin->full_position +
+	    tcache_bin_info[ind].ncached_max * sizeof(void *)) |
+	    (uintptr_t)((uintptr_t)bin->cur_ptr.ptr &
+	    ~(((uint64_t)1 << 32) - 1))));
+
+	return ret;
+}
+
+/* Returns the position of the bottom item on the stack; for convenience. */
+static inline void **
+cache_bin_bottom_item_get(cache_bin_t *bin, szind_t ind) {
+	void **bottom = cache_bin_empty_position_get(bin, ind) - 1;
+	assert(cache_bin_ncached_get(bin, ind) == 0 || *bottom != NULL);
+
+	return bottom;
+}
+
+/* Returns the numeric value of low water in [-1, ncached]. */
+static inline cache_bin_sz_t
+cache_bin_low_water_get(cache_bin_t *bin, szind_t ind) {
+	cache_bin_sz_t low_water = tcache_bin_info[ind].ncached_max -
+	    (bin->low_water_position - bin->full_position) / sizeof(void *);
+	assert(low_water >= -1 && low_water <=
+	    tcache_bin_info[ind].ncached_max);
+	assert(low_water <= cache_bin_ncached_get(bin, ind));
+	assert(bin->low_water_position >= bin->cur_ptr.lowbits);
+
+	return low_water;
+}
+
+static inline void
+cache_bin_ncached_set(cache_bin_t *bin, szind_t ind, cache_bin_sz_t n) {
+	bin->cur_ptr.lowbits = bin->full_position +
+	    (tcache_bin_info[ind].ncached_max - n) * sizeof(void *);
+	assert(n >= 0 && n <= tcache_bin_info[ind].ncached_max);
+	assert(n == 0 || *bin->cur_ptr.ptr != NULL);
+}
+
 static inline void
 cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
     cache_bin_t *bins_small, cache_bin_t *bins_large) {
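To make the compressed-pointer arithmetic in these getters concrete, here is a minimal standalone C sketch (all names are hypothetical, not jemalloc source). It mirrors cache_bin_ncached_get() and the empty-position reconstruction asserted in cache_bin_empty_position_get(), assuming the stack never straddles a 4 GiB boundary — the invariant the patch later enforces via tcache_bin_stack_alignment():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NCACHED_MAX 4 /* hypothetical; the real bound lives in tcache_bin_info */

int
main(void) {
	/* One spare slot, mirroring the patch's total_stack_padding. */
	void *stack[NCACHED_MAX + 1] = {0};
	void **full = &stack[0];
	void **cur = &stack[NCACHED_MAX]; /* empty position: ncached == 0 */

	/* Only the 32-bit low halves are stored for full (and low_water). */
	uint32_t full_lowbits = (uint32_t)(uintptr_t)full;

	/* Cache one item: the stack grows toward lower addresses. */
	static int item;
	*--cur = &item;
	uint32_t cur_lowbits = (uint32_t)(uintptr_t)cur;

	/* ncached, derived exactly as in cache_bin_ncached_get(). */
	uint32_t ncached = NCACHED_MAX -
	    (cur_lowbits - full_lowbits) / sizeof(void *);
	assert(ncached == 1);

	/* Empty position rebuilt from full_lowbits plus cur's high bits, as
	 * in the assertion inside cache_bin_empty_position_get(). */
	uintptr_t highbits = (uintptr_t)cur & ~(uintptr_t)UINT32_MAX;
	void **empty = (void **)(highbits |
	    (uintptr_t)(full_lowbits + NCACHED_MAX * sizeof(void *)));
	assert(empty == &stack[NCACHED_MAX]);

	printf("ncached = %u\n", (unsigned)ncached);
	return 0;
}

In the real struct, lowbits overlays the low half of cur_ptr.ptr through the union, so the two stay in sync without the manual recomputation done here.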
@@ -85,19 +183,24 @@ cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
 }
 
 JEMALLOC_ALWAYS_INLINE void *
-cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
-	void *ret;
-
-	bin->ncached--;
-
+cache_bin_alloc_easy(cache_bin_t *bin, bool *success, cache_bin_sz_t ind) {
 	/*
-	 * Check for both bin->ncached == 0 and ncached < low_water
-	 * in a single branch.
+	 * This may read from the empty position; however the loaded value won't
+	 * be used. It's safe because the stack has one more slot reserved.
 	 */
-	if (unlikely(bin->ncached <= bin->low_water)) {
-		bin->low_water = bin->ncached;
-		if (bin->ncached == -1) {
-			bin->ncached = 0;
+	void *ret = *(bin->cur_ptr.ptr++);
+	/*
+	 * Check for both bin->ncached == 0 and ncached < low_water in a single
+	 * branch. This also avoids accessing tcache_bin_info (which is on a
+	 * separate cacheline / page) in the common case.
+	 */
+	if (unlikely(bin->cur_ptr.lowbits >= bin->low_water_position)) {
+		bin->low_water_position = bin->cur_ptr.lowbits;
+		uint32_t empty_position = bin->full_position +
+		    tcache_bin_info[ind].ncached_max * sizeof(void *);
+		if (bin->cur_ptr.lowbits > empty_position) {
+			bin->cur_ptr.ptr--;
+			assert(bin->cur_ptr.lowbits == empty_position);
 			*success = false;
 			return NULL;
 		}
@@ -111,19 +214,18 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
 	 * cacheline).
 	 */
 	*success = true;
-	ret = *(bin->avail - (bin->ncached + 1));
 
 	return ret;
 }
 
 JEMALLOC_ALWAYS_INLINE bool
-cache_bin_dalloc_easy(cache_bin_t *bin, cache_bin_info_t *bin_info, void *ptr) {
-	if (unlikely(bin->ncached == bin_info->ncached_max)) {
+cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
+	if (unlikely(bin->cur_ptr.lowbits == bin->full_position)) {
 		return false;
 	}
-	assert(bin->ncached < bin_info->ncached_max);
-	bin->ncached++;
-	*(bin->avail - bin->ncached) = ptr;
+
+	*(--bin->cur_ptr.ptr) = ptr;
+	assert(bin->cur_ptr.lowbits >= bin->full_position);
 
 	return true;
 }
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
index ddde9b4e..fedbd862 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
@@ -130,8 +130,8 @@ tcache_available(tsd_t *tsd) {
 	if (likely(tsd_tcache_enabled_get(tsd))) {
 		/* Associated arena == NULL implies tcache init in progress. */
 		assert(tsd_tcachep_get(tsd)->arena == NULL ||
-		    tcache_small_bin_get(tsd_tcachep_get(tsd), 0)->avail !=
-		    NULL);
+		    tcache_small_bin_get(tsd_tcachep_get(tsd), 0)->cur_ptr.ptr
+		    != NULL);
 		return true;
 	}
 
diff --git a/include/jemalloc/internal/tcache_externs.h b/include/jemalloc/internal/tcache_externs.h
index 266f246b..2060bb19 100644
--- a/include/jemalloc/internal/tcache_externs.h
+++ b/include/jemalloc/internal/tcache_externs.h
@@ -4,8 +4,6 @@
 extern bool	opt_tcache;
 extern ssize_t	opt_lg_tcache_max;
 
-extern cache_bin_info_t	*tcache_bin_info;
-
 /*
  * Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more
  * large-object bins.
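The single unlikely() branch in cache_bin_alloc_easy() above does double duty, and the low_water == -1 encoding is subtle. A hedged standalone model (toy_* names are hypothetical, not jemalloc API; same no-4-GiB-straddle assumption as the earlier sketch):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NCACHED_MAX 4 /* hypothetical */

typedef struct {
	void **cur;         /* stands in for cur_ptr.ptr */
	uint32_t low_water; /* stands in for low_water_position */
	uint32_t full;      /* stands in for full_position */
} toy_bin_t;

static void *
toy_alloc(toy_bin_t *bin, bool *success) {
	/* May load the reserved slot past the empty position; the value is
	 * discarded in that case, which is why the stack is padded. */
	void *ret = *(bin->cur++);
	uint32_t lowbits = (uint32_t)(uintptr_t)bin->cur;
	/* One branch covers both "hit a new low water mark" and "empty". */
	if (lowbits >= bin->low_water) {
		bin->low_water = lowbits; /* may end at empty + 1, i.e. -1 */
		uint32_t empty = bin->full + NCACHED_MAX * sizeof(void *);
		if (lowbits > empty) {
			bin->cur--; /* roll back to the empty position */
			*success = false;
			return NULL;
		}
	}
	*success = true;
	return ret;
}

int
main(void) {
	void *stack[NCACHED_MAX + 1] = {0};
	static int a, b;
	stack[NCACHED_MAX - 2] = &a; /* two cached items */
	stack[NCACHED_MAX - 1] = &b;

	toy_bin_t bin;
	bin.cur = &stack[NCACHED_MAX - 2];
	bin.full = (uint32_t)(uintptr_t)&stack[0];
	bin.low_water = (uint32_t)(uintptr_t)bin.cur; /* as set after GC */

	bool ok;
	assert(toy_alloc(&bin, &ok) == &a && ok);
	assert(toy_alloc(&bin, &ok) == &b && ok);
	assert(toy_alloc(&bin, &ok) == NULL && !ok); /* depleted */

	/* low_water sits one slot past empty: the low_water == -1 state. */
	uint32_t empty = bin.full + NCACHED_MAX * sizeof(void *);
	printf("low_water - empty = %u bytes\n",
	    (unsigned)(bin.low_water - empty));
	return 0;
}

Note how the common path (non-empty, above low water) touches neither tcache_bin_info nor the full position, which is the point of the layout.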
diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index 4815774b..4f7e02a8 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -48,7 +48,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
 
 	assert(binind < SC_NBINS);
 	bin = tcache_small_bin_get(tcache, binind);
-	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	ret = cache_bin_alloc_easy(bin, &tcache_success, binind);
 	assert(tcache_success == (ret != NULL));
 	if (unlikely(!tcache_success)) {
 		bool tcache_hard_success;
@@ -109,7 +109,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
 
 	assert(binind >= SC_NBINS && binind < nhbins);
 	bin = tcache_large_bin_get(tcache, binind);
-	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	ret = cache_bin_alloc_easy(bin, &tcache_success, binind);
 	assert(tcache_success == (ret != NULL));
 	if (unlikely(!tcache_success)) {
 		/*
@@ -164,7 +164,6 @@ JEMALLOC_ALWAYS_INLINE void
 tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
     bool slow_path) {
 	cache_bin_t *bin;
-	cache_bin_info_t *bin_info;
 
 	assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SC_SMALL_MAXCLASS);
 
@@ -174,11 +173,10 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 	}
 
 	bin = tcache_small_bin_get(tcache, binind);
-	bin_info = &tcache_bin_info[binind];
-	if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
+	if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) {
 		tcache_bin_flush_small(tsd, tcache, bin, binind,
-		    (bin_info->ncached_max >> 1));
-		bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
+		    tcache_bin_info[binind].ncached_max >> 1);
+		bool ret = cache_bin_dalloc_easy(bin, ptr);
 		assert(ret);
 	}
 
@@ -189,7 +187,6 @@ JEMALLOC_ALWAYS_INLINE void
 tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
     bool slow_path) {
 	cache_bin_t *bin;
-	cache_bin_info_t *bin_info;
 
 	assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SC_SMALL_MAXCLASS);
 
@@ -200,11 +197,10 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 	}
 
 	bin = tcache_large_bin_get(tcache, binind);
-	bin_info = &tcache_bin_info[binind];
-	if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
+	if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) {
 		tcache_bin_flush_large(tsd, tcache, bin, binind,
-		    (bin_info->ncached_max >> 1));
-		bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
+		    tcache_bin_info[binind].ncached_max >> 1);
+		bool ret = cache_bin_dalloc_easy(bin, ptr);
 		assert(ret);
 	}
 
diff --git a/src/arena.c b/src/arena.c
index e956c394..23d0294b 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -202,12 +202,13 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 		for (szind_t i = 0; i < SC_NBINS; i++) {
 			cache_bin_t *tbin = &descriptor->bins_small[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
-			    tbin->ncached * sz_index2size(i));
+			    cache_bin_ncached_get(tbin, i) * sz_index2size(i));
 		}
 		for (szind_t i = 0; i < nhbins - SC_NBINS; i++) {
 			cache_bin_t *tbin = &descriptor->bins_large[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
-			    tbin->ncached * sz_index2size(i));
+			    cache_bin_ncached_get(tbin, i + SC_NBINS) *
+			    sz_index2size(i));
 		}
 	}
 	malloc_mutex_prof_read(tsdn,
@@ -1381,7 +1382,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
 	unsigned i, nfill, cnt;
 
-	assert(tbin->ncached == 0);
+	assert(cache_bin_ncached_get(tbin, binind) == 0);
 
 	if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
 		prof_idump(tsdn);
@@ -1390,6 +1391,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 
 	unsigned binshard;
 	bin_t *bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
+	void **empty_position = cache_bin_empty_position_get(tbin, binind);
 	for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
 	    tcache->lg_fill_div[binind]); i < nfill; i += cnt) {
 		extent_t *slab;
@@ -1400,7 +1402,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 			    tofill : extent_nfree_get(slab);
 			arena_slab_reg_alloc_batch(
 			    slab, &bin_infos[binind], cnt,
-			    tbin->avail - nfill + i);
+			    empty_position - nfill + i);
 		} else {
 			cnt = 1;
 			void *ptr = arena_bin_malloc_hard(tsdn, arena, bin,
@@ -1412,18 +1414,18 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 			 */
 			if (ptr == NULL) {
 				if (i > 0) {
-					memmove(tbin->avail - i,
-					    tbin->avail - nfill,
+					memmove(empty_position - i,
+					    empty_position - nfill,
 					    i * sizeof(void *));
 				}
 				break;
 			}
 			/* Insert such that low regions get used first. */
-			*(tbin->avail - nfill + i) = ptr;
+			*(empty_position - nfill + i) = ptr;
 		}
 		if (config_fill && unlikely(opt_junk_alloc)) {
 			for (unsigned j = 0; j < cnt; j++) {
-				void* ptr = *(tbin->avail - nfill + i + j);
+				void* ptr = *(empty_position - nfill + i + j);
 				arena_alloc_junk_small(ptr, &bin_infos[binind],
 				    true);
 			}
@@ -1437,7 +1439,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 		tbin->tstats.nrequests = 0;
 	}
 	malloc_mutex_unlock(tsdn, &bin->lock);
-	tbin->ncached = i;
+	cache_bin_ncached_set(tbin, binind, i);
 	arena_decay_tick(tsdn, arena);
 }
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index dec987c5..75a40277 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -2368,7 +2368,7 @@ je_malloc(size_t size) {
 
 	cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
 	bool tcache_success;
-	void* ret = cache_bin_alloc_easy(bin, &tcache_success);
+	void *ret = cache_bin_alloc_easy(bin, &tcache_success, ind);
 
 	if (tcache_success) {
 		if (config_stats) {
@@ -2846,8 +2846,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	}
 
 	cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind);
-	cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind];
-	if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) {
+	if (!cache_bin_dalloc_easy(bin, ptr)) {
 		return false;
 	}
 
diff --git a/src/tcache.c b/src/tcache.c
index c5fe67a9..d282e1fa 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -14,7 +14,16 @@ bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 
 cache_bin_info_t	*tcache_bin_info;
-static unsigned		stack_nelms; /* Total stack elms per tcache. */
+/*
+ * For the total bin stack region (per tcache), reserve 2 more slots so that 1)
+ * the empty position can be safely read on the fast path before checking
+ * "is_empty"; and 2) the low_water == -1 case can go beyond the empty position
+ * by 1 step safely (i.e. no overflow).
+ */
+static const unsigned total_stack_padding = sizeof(void *) * 2;
+
+/* Total stack size required (per tcache). Include the padding above. */
+static uint32_t total_stack_bytes;
 
 unsigned	nhbins;
 size_t		tcache_maxclass;
@@ -47,14 +56,16 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 	} else {
 		tbin = tcache_large_bin_get(tcache, binind);
 	}
-	if (tbin->low_water > 0) {
+
+	cache_bin_sz_t low_water = cache_bin_low_water_get(tbin, binind);
+	cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+	if (low_water > 0) {
 		/*
 		 * Flush (ceiling) 3/4 of the objects below the low water mark.
 		 */
 		if (binind < SC_NBINS) {
 			tcache_bin_flush_small(tsd, tcache, tbin, binind,
-			    tbin->ncached - tbin->low_water + (tbin->low_water
-			    >> 2));
+			    ncached - low_water + (low_water >> 2));
 			/*
 			 * Reduce fill count by 2X. Limit lg_fill_div such that
 			 * the fill count is always at least 1.
@@ -66,10 +77,10 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 		}
 	} else {
 		tcache_bin_flush_large(tsd, tcache, tbin, binind,
-		    tbin->ncached - tbin->low_water + (tbin->low_water
-		    >> 2));
+		    ncached - low_water + (low_water >> 2));
 	}
-	} else if (tbin->low_water < 0) {
+	} else if (low_water < 0) {
+		assert(low_water == -1);
 		/*
 		 * Increase fill count by 2X for small bins. Make sure
 		 * lg_fill_div stays greater than 0.
@@ -78,7 +89,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 			tcache->lg_fill_div[binind]--;
 		}
 	}
-	tbin->low_water = tbin->ncached;
+	tbin->low_water_position = tbin->cur_ptr.lowbits;
 
 	tcache->next_gc_bin++;
 	if (tcache->next_gc_bin == nhbins) {
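One easy misreading of tcache_event_hard() above: the last argument to tcache_bin_flush_small/_large is rem, the number of items to keep, not the number to flush. A small arithmetic sketch with hypothetical numbers:

#include <assert.h>
#include <stdio.h>

int
main(void) {
	/* Hypothetical GC snapshot. */
	int ncached = 12, low_water = 8;

	/* rem = items kept; the flushed count is (ceiling) 3/4 of low_water. */
	int rem = ncached - low_water + (low_water >> 2);
	int flushed = ncached - rem;
	assert(rem == 6 && flushed == low_water - (low_water >> 2));

	/* Fill-count damping, as in tcache_event_hard(): halve the next
	 * refill when items sat unused (low_water > 0); double it, with
	 * lg_fill_div floored at 1, after a depleted (low_water == -1) pass. */
	unsigned ncached_max = 64, lg_fill_div = 1;
	lg_fill_div++; /* the low_water > 0 case above */
	printf("keep=%d flush=%d next_refill=%u\n",
	    rem, flushed, ncached_max >> lg_fill_div);
	return 0;
}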
@@ -97,7 +108,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 	if (config_prof) {
 		tcache->prof_accumbytes = 0;
 	}
-	ret = cache_bin_alloc_easy(tbin, tcache_success);
+	ret = cache_bin_alloc_easy(tbin, tcache_success, binind);
 
 	return ret;
 }
@@ -117,9 +128,10 @@ tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind,
 	 */
 	szind_t szind;
 	size_t sz_sum = binind * nflush;
+	void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
 	for (unsigned i = 0 ; i < nflush; i++) {
 		rtree_extent_szind_read(tsdn, &extents_rtree,
-		    rtree_ctx, (uintptr_t)*(tbin->avail - 1 - i), true,
+		    rtree_ctx, (uintptr_t)*(bottom_item - i), true,
 		    &extents[i], &szind);
 		sz_sum -= szind;
 	}
@@ -137,13 +149,15 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 	bool merged_stats = false;
 
 	assert(binind < SC_NBINS);
-	assert((cache_bin_sz_t)rem <= tbin->ncached);
+	cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+	assert((cache_bin_sz_t)rem <= ncached);
 
 	arena_t *arena = tcache->arena;
 	assert(arena != NULL);
-	unsigned nflush = tbin->ncached - rem;
+	unsigned nflush = ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
 
+	void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
 	/* Look up extent once per item. */
 	if (config_opt_safety_checks) {
 		tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind,
@@ -151,7 +165,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 	} else {
 		for (unsigned i = 0 ; i < nflush; i++) {
 			item_extent[i] = iealloc(tsd_tsdn(tsd),
-			    *(tbin->avail - 1 - i));
+			    *(bottom_item - i));
 		}
 	}
 	while (nflush > 0) {
@@ -181,7 +195,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		}
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
-			void *ptr = *(tbin->avail - 1 - i);
+			void *ptr = *(bottom_item - i);
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
@@ -196,7 +210,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 				 * locked. Stash the object, so that it can be
 				 * handled in a future pass.
 				 */
-				*(tbin->avail - 1 - ndeferred) = ptr;
+				*(bottom_item - ndeferred) = ptr;
 				item_extent[ndeferred] = extent;
 				ndeferred++;
 			}
@@ -219,11 +233,11 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
 	}
 
-	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	memmove(tbin->cur_ptr.ptr + (ncached - rem), tbin->cur_ptr.ptr, rem *
 	    sizeof(void *));
-	tbin->ncached = rem;
-	if (tbin->ncached < tbin->low_water) {
-		tbin->low_water = tbin->ncached;
+	cache_bin_ncached_set(tbin, binind, rem);
+	if (tbin->cur_ptr.lowbits > tbin->low_water_position) {
+		tbin->low_water_position = tbin->cur_ptr.lowbits;
 	}
 }
 
@@ -233,17 +247,19 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 	bool merged_stats = false;
 
 	assert(binind < nhbins);
-	assert((cache_bin_sz_t)rem <= tbin->ncached);
+	cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+	assert((cache_bin_sz_t)rem <= ncached);
 
 	arena_t *tcache_arena = tcache->arena;
 	assert(tcache_arena != NULL);
-	unsigned nflush = tbin->ncached - rem;
+	unsigned nflush = ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
 
+	void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
 #ifndef JEMALLOC_EXTRA_SIZE_CHECK
 	/* Look up extent once per item. */
 	for (unsigned i = 0 ; i < nflush; i++) {
-		item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
+		item_extent[i] = iealloc(tsd_tsdn(tsd), *(bottom_item - i));
 	}
 #else
 	tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush,
@@ -266,7 +282,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 		    malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
 		}
 		for (unsigned i = 0; i < nflush; i++) {
-			void *ptr = *(tbin->avail - 1 - i);
+			void *ptr = *(bottom_item - i);
 			assert(ptr != NULL);
 			extent = item_extent[i];
 			if (extent_arena_ind_get(extent) == locked_arena_ind) {
@@ -295,7 +311,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
-			void *ptr = *(tbin->avail - 1 - i);
+			void *ptr = *(bottom_item - i);
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
@@ -308,7 +324,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 				 * Stash the object, so that it can be handled
 				 * in a future pass.
 				 */
-				*(tbin->avail - 1 - ndeferred) = ptr;
+				*(bottom_item - ndeferred) = ptr;
 				item_extent[ndeferred] = extent;
 				ndeferred++;
 			}
@@ -330,11 +346,11 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 		tbin->tstats.nrequests = 0;
 	}
 
-	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	memmove(tbin->cur_ptr.ptr + (ncached - rem), tbin->cur_ptr.ptr, rem *
 	    sizeof(void *));
-	tbin->ncached = rem;
-	if (tbin->ncached < tbin->low_water) {
-		tbin->low_water = tbin->ncached;
+	cache_bin_ncached_set(tbin, binind, rem);
+	if (tbin->cur_ptr.lowbits > tbin->low_water_position) {
+		tbin->low_water_position = tbin->cur_ptr.lowbits;
 	}
 }
 
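The flush epilogues above move the survivors with a memmove whose direction flipped relative to the old avail-based code. A hedged sketch of what it does (hypothetical names, not jemalloc API): the nflush items nearest the empty position — the stalest ones — are flushed, and the rem newest items slide toward the empty position so cache_bin_ncached_set() can recompute cur_ptr:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NCACHED_MAX 8 /* hypothetical */

int
main(void) {
	void *stack[NCACHED_MAX + 1] = {0};
	static int items[NCACHED_MAX];
	unsigned ncached = 6, rem = 2;

	/* Cached pointers occupy cur .. empty - 1; cur holds the newest. */
	void **empty = &stack[NCACHED_MAX];
	void **cur = empty - ncached;
	for (unsigned i = 0; i < ncached; i++) {
		cur[i] = &items[i]; /* items[0] newest ... items[5] stalest */
	}

	/* Flush the (ncached - rem) stalest, then slide the rem newest
	 * down, exactly like the epilogue's memmove. */
	memmove(cur + (ncached - rem), cur, rem * sizeof(void *));
	cur += ncached - rem; /* what cache_bin_ncached_set() computes */

	assert(cur == empty - rem);
	assert(cur[0] == &items[0] && cur[1] == &items[1]);
	printf("kept the %u newest items\n", rem);
	return 0;
}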
+	 */
+	void *full_position = (void *)*stack_cur;
+	uint32_t bin_stack_size = tcache_bin_info[ind].ncached_max *
+	    sizeof(void *);
+
+	*stack_cur += bin_stack_size;
+	void *empty_position = (void *)*stack_cur;
+
+	/* Init to the empty position. */
+	bin->cur_ptr.ptr = empty_position;
+	bin->low_water_position = bin->cur_ptr.lowbits;
+	bin->full_position = (uint32_t)(uintptr_t)full_position;
+	assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size);
+	assert(cache_bin_ncached_get(bin, ind) == 0);
+	assert(cache_bin_empty_position_get(bin, ind) == empty_position);
+
+	return false;
+}
+
+/* Sanity check only. */
+static bool
+tcache_bin_lowbits_overflowable(void *ptr) {
+	uint32_t lowbits = (uint32_t)((uintptr_t)ptr + total_stack_bytes);
+	return lowbits < (uint32_t)(uintptr_t)ptr;
+}
+
 static void
 tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
+	assert(!tcache_bin_lowbits_overflowable(avail_stack));
+
 	memset(&tcache->link, 0, sizeof(ql_elm(tcache_t)));
 	tcache->prof_accumbytes = 0;
 	tcache->next_gc_bin = 0;
@@ -416,41 +466,43 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
 
 	ticker_init(&tcache->gc_ticker, TCACHE_GC_INCR);
 
-	size_t stack_offset = 0;
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
 	memset(tcache->bins_small, 0, sizeof(cache_bin_t) * SC_NBINS);
 	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS));
 	unsigned i = 0;
+	uintptr_t stack_cur = (uintptr_t)avail_stack;
 	for (; i < SC_NBINS; i++) {
 		tcache->lg_fill_div[i] = 1;
-		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
-		/*
-		 * avail points past the available space. Allocations will
-		 * access the slots toward higher addresses (for the benefit of
-		 * prefetch).
-		 */
-		tcache_small_bin_get(tcache, i)->avail =
-		    (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+		cache_bin_t *bin = tcache_small_bin_get(tcache, i);
+		tcache_bin_init(bin, i, &stack_cur);
 	}
 	for (; i < nhbins; i++) {
-		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
-		tcache_large_bin_get(tcache, i)->avail =
-		    (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+		cache_bin_t *bin = tcache_large_bin_get(tcache, i);
+		tcache_bin_init(bin, i, &stack_cur);
 	}
-	assert(stack_offset == stack_nelms * sizeof(void *));
+
+	/* Sanity check that the whole stack is used. */
+	size_t stack_offset = stack_cur - (uintptr_t)avail_stack;
+	assert(stack_offset + total_stack_padding == total_stack_bytes);
+}
+
+static size_t
+tcache_bin_stack_alignment(size_t size) {
+	/* Align pow2 to avoid overflowing the cache bin compressed pointers. */
+	return (LG_SIZEOF_PTR == 3) ? pow2_ceil_zu(size) : CACHELINE;
 }
 
 /* Initialize auto tcache (embedded in TSD). */
 bool
 tsd_tcache_data_init(tsd_t *tsd) {
 	tcache_t *tcache = tsd_tcachep_get_unsafe(tsd);
-	assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
-	size_t size = stack_nelms * sizeof(void *);
+	assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr == NULL);
 	/* Avoid false cacheline sharing. */
-	size = sz_sa2u(size, CACHELINE);
-
-	void *avail_array = ipallocztm(tsd_tsdn(tsd), size, CACHELINE, true,
-	    NULL, true, arena_get(TSDN_NULL, 0, true));
+	size_t size = sz_sa2u(total_stack_bytes, CACHELINE);
+	void *avail_array = ipallocztm(tsd_tsdn(tsd), size,
+	    tcache_bin_stack_alignment(size), true, NULL, true,
+	    arena_get(TSDN_NULL, 0, true));
 	if (avail_array == NULL) {
 		return true;
 	}
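Why pow2 alignment makes the 32-bit comparisons safe: a block aligned to pow2_ceil(size) cannot cross a multiple of that power of two, hence cannot cross a 4 GiB boundary either, so lowbits never wrap within the stack. A hedged sketch (64-bit addresses modeled with uint64_t; the pow2_ceil and lowbits_overflowable helpers here are illustrative, not jemalloc's own):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the patch's tcache_bin_lowbits_overflowable() check. */
static int
lowbits_overflowable(uint64_t base, uint64_t size) {
	uint32_t lowbits = (uint32_t)(base + size);
	return lowbits < (uint32_t)base;
}

static uint64_t
pow2_ceil(uint64_t x) {
	uint64_t p = 1;
	while (p < x) {
		p <<= 1;
	}
	return p;
}

int
main(void) {
	uint64_t size = 1000; /* hypothetical total_stack_bytes */
	uint64_t align = pow2_ceil(size); /* 1024 */

	/* An aligned base close to a 4 GiB boundary still fits: the next
	 * align-multiple is the boundary itself, so [base, base + size)
	 * cannot cross it. */
	uint64_t base = 0x100000000ULL - align;
	assert(!lowbits_overflowable(base, size));

	/* An unaligned base just below the boundary would wrap lowbits,
	 * which is exactly what the runtime assert guards against. */
	assert(lowbits_overflowable(0x100000000ULL - 0x80, size));

	printf("align = %llu\n", (unsigned long long)align);
	return 0;
}

On 32-bit builds the function falls back to CACHELINE alignment, since there the low bits are the whole pointer and cannot wrap within a valid allocation.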
@@ -485,25 +537,24 @@
 /* Created manual tcache for tcache.create mallctl. */
 tcache_t *
 tcache_create_explicit(tsd_t *tsd) {
-	tcache_t *tcache;
-	size_t size, stack_offset;
-
-	size = sizeof(tcache_t);
+	size_t size = sizeof(tcache_t);
 	/* Naturally align the pointer stacks. */
 	size = PTR_CEILING(size);
-	stack_offset = size;
-	size += stack_nelms * sizeof(void *);
+	size_t stack_offset = size;
+	size += total_stack_bytes;
 	/* Avoid false cacheline sharing. */
 	size = sz_sa2u(size, CACHELINE);
 
-	tcache = ipallocztm(tsd_tsdn(tsd), size, CACHELINE, true, NULL, true,
+	tcache_t *tcache = ipallocztm(tsd_tsdn(tsd), size,
+	    tcache_bin_stack_alignment(size), true, NULL, true,
 	    arena_get(TSDN_NULL, 0, true));
 	if (tcache == NULL) {
 		return NULL;
 	}
 
-	tcache_init(tsd, tcache,
-	    (void *)((uintptr_t)tcache + (uintptr_t)stack_offset));
+	void *avail_array = (void *)((uintptr_t)tcache +
+	    (uintptr_t)stack_offset);
+	tcache_init(tsd, tcache, avail_array);
 	tcache_arena_associate(tsd_tsdn(tsd), tcache, arena_ichoose(tsd, NULL));
 
 	return tcache;
@@ -553,9 +604,12 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 
 	if (tsd_tcache) {
 		/* Release the avail array for the TSD embedded auto tcache. */
-		void *avail_array =
-		    (void *)((uintptr_t)tcache_small_bin_get(tcache, 0)->avail -
-		    (uintptr_t)tcache_bin_info[0].ncached_max * sizeof(void *));
+		cache_bin_t *bin = tcache_small_bin_get(tcache, 0);
+		assert(cache_bin_ncached_get(bin, 0) == 0);
+		assert(cache_bin_empty_position_get(bin, 0) ==
+		    bin->cur_ptr.ptr);
+		void *avail_array = bin->cur_ptr.ptr -
+		    tcache_bin_info[0].ncached_max;
 		idalloctm(tsd_tsdn(tsd), avail_array, NULL, NULL, true, true);
 	} else {
 		/* Release both the tcache struct and avail array. */
@@ -587,16 +641,17 @@ tcache_cleanup(tsd_t *tsd) {
 	if (!tcache_available(tsd)) {
 		assert(tsd_tcache_enabled_get(tsd) == false);
 		if (config_debug) {
-			assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
+			assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr
+			    == NULL);
 		}
 		return;
 	}
 	assert(tsd_tcache_enabled_get(tsd));
-	assert(tcache_small_bin_get(tcache, 0)->avail != NULL);
+	assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr != NULL);
 
 	tcache_destroy(tsd, tcache, true);
 	if (config_debug) {
-		tcache_small_bin_get(tcache, 0)->avail = NULL;
+		tcache_small_bin_get(tcache, 0)->cur_ptr.ptr = NULL;
 	}
 }
 
@@ -755,8 +810,8 @@ tcache_boot(tsdn_t *tsdn) {
 	if (tcache_bin_info == NULL) {
 		return true;
 	}
+	unsigned i, stack_nelms;
 	stack_nelms = 0;
-	unsigned i;
 	for (i = 0; i < SC_NBINS; i++) {
 		if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
 			tcache_bin_info[i].ncached_max =
@@ -775,6 +830,7 @@ tcache_boot(tsdn_t *tsdn) {
 		tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
 		stack_nelms += tcache_bin_info[i].ncached_max;
 	}
+	total_stack_bytes = stack_nelms * sizeof(void *) + total_stack_padding;
 
 	return false;
 }
diff --git a/test/unit/cache_bin.c b/test/unit/cache_bin.c
new file mode 100644
index 00000000..74cf24cb
--- /dev/null
+++ b/test/unit/cache_bin.c
@@ -0,0 +1,64 @@
+#include "test/jemalloc_test.h"
+
+cache_bin_t test_bin;
+
+TEST_BEGIN(test_cache_bin) {
+	cache_bin_t *bin = &test_bin;
+	cassert(PAGE > TCACHE_NSLOTS_SMALL_MAX * sizeof(void *));
+	/* Page aligned to make sure lowbits not overflowable. */
+	void **stack = mallocx(PAGE, MALLOCX_TCACHE_NONE | MALLOCX_ALIGN(PAGE));
+
+	assert_ptr_not_null(stack, "Unexpected mallocx failure");
+	/* Initialize to empty; bin 0. */
+	cache_bin_sz_t ncached_max = tcache_bin_info[0].ncached_max;
+	void **empty_position = stack + ncached_max;
+	bin->cur_ptr.ptr = empty_position;
+	bin->low_water_position = bin->cur_ptr.lowbits;
+	bin->full_position = (uint32_t)(uintptr_t)stack;
+	assert_ptr_eq(cache_bin_empty_position_get(bin, 0), empty_position,
+	    "Incorrect empty position");
+	/* Not using assert_zu etc on cache_bin_sz_t since it may change. */
+	assert_true(cache_bin_ncached_get(bin, 0) == 0, "Incorrect cache size");
+
+	bool success;
+	void *ret = cache_bin_alloc_easy(bin, &success, 0);
+	assert_false(success, "Empty cache bin should not alloc");
+	assert_true(cache_bin_low_water_get(bin, 0) == -1,
+	    "Incorrect low water mark");
+
+	cache_bin_ncached_set(bin, 0, 0);
+	assert_ptr_eq(bin->cur_ptr.ptr, empty_position, "Bin should be empty");
+	for (cache_bin_sz_t i = 1; i < ncached_max + 1; i++) {
+		success = cache_bin_dalloc_easy(bin, (void *)(uintptr_t)i);
+		assert_true(success && cache_bin_ncached_get(bin, 0) == i,
+		    "Bin dalloc failure");
+	}
+	success = cache_bin_dalloc_easy(bin, (void *)1);
+	assert_false(success, "Bin should be full");
+	assert_ptr_eq(bin->cur_ptr.ptr, stack, "Incorrect bin cur_ptr");
+
+	cache_bin_ncached_set(bin, 0, ncached_max);
+	assert_ptr_eq(bin->cur_ptr.ptr, stack, "cur_ptr should not change");
+	/* Emulate low water after refill. */
+	bin->low_water_position = bin->full_position;
+	for (cache_bin_sz_t i = ncached_max; i > 0; i--) {
+		ret = cache_bin_alloc_easy(bin, &success, 0);
+		cache_bin_sz_t ncached = cache_bin_ncached_get(bin, 0);
+		assert_true(success && ncached == i - 1,
+		    "Cache bin alloc failure");
+		assert_ptr_eq(ret, (void *)(uintptr_t)i, "Bin alloc failure");
+		assert_true(cache_bin_low_water_get(bin, 0) == ncached,
+		    "Incorrect low water mark");
+	}
+
+	ret = cache_bin_alloc_easy(bin, &success, 0);
+	assert_false(success, "Empty cache bin should not alloc.");
+	assert_ptr_eq(bin->cur_ptr.ptr, stack + ncached_max,
+	    "Bin should be empty");
+}
+TEST_END
+
+int
+main(void) {
+	return test(test_cache_bin);
+}