diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h
index 6b41aa6f..451be10f 100644
--- a/include/jemalloc/internal/base.h
+++ b/include/jemalloc/internal/base.h
@@ -73,6 +73,9 @@ struct base_s {
 	/* Heap of extents that track unused trailing space within blocks. */
 	edata_heap_t avail[SC_NSIZES];
 
+	/* Contains reusable base edata (used by tcache_stacks currently). */
+	edata_avail_t edata_avail;
+
 	/* Stats, only maintained if config_stats. */
 	size_t allocated;
 	size_t resident;
@@ -101,6 +104,8 @@ extent_hooks_t *base_extent_hooks_set(base_t *base,
     extent_hooks_t *extent_hooks);
 void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
 edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
+void *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size);
+void b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack);
 void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
     size_t *resident, size_t *mapped, size_t *n_thp);
 void base_prefork(tsdn_t *tsdn, base_t *base);
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index 4cfc3f1d..78ac3295 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -704,5 +704,6 @@ void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
  * not cache_bin_init was called on it.
  */
 bool cache_bin_still_zero_initialized(cache_bin_t *bin);
+bool cache_bin_stack_use_thp(void);
 
 #endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h
index baf5187f..17befd92 100644
--- a/include/jemalloc/internal/edata.h
+++ b/include/jemalloc/internal/edata.h
@@ -621,7 +621,8 @@ edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
 }
 
 static inline void
-edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
+edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn,
+    bool reused) {
 	edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
 	edata_addr_set(edata, addr);
 	edata_bsize_set(edata, bsize);
@@ -629,7 +630,8 @@ edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
 	edata_szind_set(edata, SC_NSIZES);
 	edata_sn_set(edata, sn);
 	edata_state_set(edata, extent_state_active);
-	edata_guarded_set(edata, false);
+	/* See comments in base_edata_is_reused. */
+	edata_guarded_set(edata, reused);
 	edata_zeroed_set(edata, true);
 	edata_committed_set(edata, true);
 	/*
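Taken together, the header changes add a per-base cache of reusable edata_t (base_t.edata_avail) and a b0-only API for tcache bin stacks. A minimal round-trip sketch of that API follows (illustration only, not part of the patch; the helper name and the tsdn argument are assumed):

/* Sketch: allocate and release a tcache bin stack through b0. */
static void
tcache_stack_roundtrip_sketch(tsdn_t *tsdn, size_t stack_size) {
	void *stack = b0_alloc_tcache_stack(tsdn, stack_size);
	if (stack == NULL) {
		return;	/* OOM: the caller fails or falls back. */
	}
	/* ... use as the flat cache_bin stack area ... */
	b0_dalloc_tcache_stack(tsdn, stack);	/* zeroed and recycled into b0 */
}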
diff --git a/src/base.c b/src/base.c
index 8e4606d0..e1dfe604 100644
--- a/src/base.c
+++ b/src/base.c
@@ -110,6 +110,16 @@ label_done:
 	}
 }
 
+static inline bool
+base_edata_is_reused(edata_t *edata) {
+	/*
+	 * Borrow the guarded bit to mark recycled extents, i.e. those returned
+	 * to base for reuse (currently only tcache bin stacks).  The bit is
+	 * only needed to skip stats updating for such extents.
+	 */
+	return edata_guarded_get(edata);
+}
+
 static void
 base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr,
     size_t size) {
@@ -118,7 +128,7 @@ base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr,
 	sn = *extent_sn_next;
 	(*extent_sn_next)++;
 
-	edata_binit(edata, addr, size, sn);
+	edata_binit(edata, addr, size, sn, false /* is_reused */);
 }
 
 static size_t
@@ -185,24 +195,57 @@ base_extent_bump_alloc_helper(edata_t *edata, size_t *gap_size, size_t size,
 	assert(edata_bsize_get(edata) >= *gap_size + size);
 	edata_binit(edata, (void *)((byte_t *)edata_addr_get(edata) + *gap_size
 	    + size), edata_bsize_get(edata) - *gap_size - size,
-	    edata_sn_get(edata));
+	    edata_sn_get(edata), base_edata_is_reused(edata));
 	return ret;
 }
 
 static void
-base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size,
-    void *addr, size_t size) {
-	if (edata_bsize_get(edata) > 0) {
-		/*
-		 * Compute the index for the largest size class that does not
-		 * exceed extent's size.
-		 */
-		szind_t index_floor =
-		    sz_size2index(edata_bsize_get(edata) + 1) - 1;
-		edata_heap_insert(&base->avail[index_floor], edata);
+base_edata_heap_insert(tsdn_t *tsdn, base_t *base, edata_t *edata) {
+	malloc_mutex_assert_owner(tsdn, &base->mtx);
+
+	size_t bsize = edata_bsize_get(edata);
+	assert(bsize > 0);
+	/*
+	 * Compute the index for the largest size class that does not exceed
+	 * extent's size.
+	 */
+	szind_t index_floor = sz_size2index(bsize + 1) - 1;
+	edata_heap_insert(&base->avail[index_floor], edata);
+}
+
+/*
+ * Must only be called by top-level functions, since it may call base_alloc
+ * internally when the cache is empty.
+ */
+static edata_t *
+base_alloc_base_edata(tsdn_t *tsdn, base_t *base) {
+	edata_t *edata;
+
+	malloc_mutex_lock(tsdn, &base->mtx);
+	edata = edata_avail_first(&base->edata_avail);
+	if (edata != NULL) {
+		edata_avail_remove(&base->edata_avail, edata);
+	}
+	malloc_mutex_unlock(tsdn, &base->mtx);
+
+	if (edata == NULL) {
+		edata = base_alloc_edata(tsdn, base);
 	}
 
-	if (config_stats) {
+	return edata;
+}
+
+static void
+base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, edata_t *edata,
+    size_t gap_size, void *addr, size_t size) {
+	if (edata_bsize_get(edata) > 0) {
+		base_edata_heap_insert(tsdn, base, edata);
+	} else {
+		/* Freed base edata_t stored in edata_avail. */
+		edata_avail_insert(&base->edata_avail, edata);
+	}
+
+	if (config_stats && !base_edata_is_reused(edata)) {
 		base->allocated += size;
 		/*
 		 * Add one PAGE to base_resident for every page boundary that is
@@ -224,13 +267,13 @@ base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size,
 }
 
 static void *
-base_extent_bump_alloc(base_t *base, edata_t *edata, size_t size,
+base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, edata_t *edata, size_t size,
     size_t alignment) {
 	void *ret;
 	size_t gap_size;
 
 	ret = base_extent_bump_alloc_helper(edata, &gap_size, size, alignment);
-	base_extent_bump_alloc_post(base, edata, gap_size, ret, size);
+	base_extent_bump_alloc_post(tsdn, base, edata, gap_size, ret, size);
 	return ret;
 }
 
@@ -384,6 +427,8 @@ base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
 	for (szind_t i = 0; i < SC_NSIZES; i++) {
 		edata_heap_new(&base->avail[i]);
 	}
+	edata_avail_new(&base->edata_avail);
+
 	if (config_stats) {
 		base->allocated = sizeof(base_block_t);
 		base->resident = PAGE_CEILING(sizeof(base_block_t));
@@ -395,8 +440,12 @@ base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
 		assert(base->resident <= base->mapped);
 		assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
 	}
-	base_extent_bump_alloc_post(base, &block->edata, gap_size, base,
+
+	/* Locking here is only necessary because of assertions. */
+	malloc_mutex_lock(tsdn, &base->mtx);
+	base_extent_bump_alloc_post(tsdn, base, &block->edata, gap_size, base,
 	    base_size);
+	malloc_mutex_unlock(tsdn, &base->mtx);
 	return base;
 }
 
@@ -457,7 +506,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
 		goto label_return;
 	}
 
-	ret = base_extent_bump_alloc(base, edata, usize, alignment);
+	ret = base_extent_bump_alloc(tsdn, base, edata, usize, alignment);
 	if (esn != NULL) {
 		*esn = (size_t)edata_sn_get(edata);
 	}
@@ -491,6 +540,74 @@ base_alloc_edata(tsdn_t *tsdn, base_t *base) {
 	return edata;
 }
 
+static inline void
+b0_alloc_header_size(size_t *header_size, size_t *alignment) {
+	*alignment = QUANTUM;
+	*header_size = QUANTUM > sizeof(edata_t *) ? QUANTUM :
+	    sizeof(edata_t *);
+}
+
+/*
+ * Each piece allocated here is managed by a separate edata, because it is
+ * bump allocated and cannot be merged back into the original base_block.
+ * This means it is not general purpose: 1) the pieces are neither page
+ * aligned nor page sized, and 2) the requested size should not be too small
+ * (since each piece comes with an edata_t).  Only used for tcache bin stacks.
+ */
+void *
+b0_alloc_tcache_stack(tsdn_t *tsdn, size_t stack_size) {
+	base_t *base = b0get();
+	edata_t *edata = base_alloc_base_edata(tsdn, base);
+	if (edata == NULL) {
+		return NULL;
+	}
+
+	/*
+	 * Reserve room for the header, which stores a pointer to the managing
+	 * edata_t.  The header is located right before the returned address,
+	 * so that the edata can be retrieved on dalloc.  Round the size up to
+	 * usize to improve reusability -- otherwise freed stacks would be
+	 * reinserted under the next smaller size class.
+	 */
+	size_t esn, alignment, header_size;
+	b0_alloc_header_size(&header_size, &alignment);
+
+	size_t alloc_size = sz_s2u(stack_size + header_size);
+	void *addr = base_alloc_impl(tsdn, base, alloc_size, alignment, &esn);
+	if (addr == NULL) {
+		edata_avail_insert(&base->edata_avail, edata);
+		return NULL;
+	}
+
+	/* Set is_reused: see comments in base_edata_is_reused. */
+	edata_binit(edata, addr, alloc_size, esn, true /* is_reused */);
+	*(edata_t **)addr = edata;
+
+	return (byte_t *)addr + header_size;
+}
+
+void
+b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack) {
+	/* edata_t pointer stored in header. */
+	size_t alignment, header_size;
+	b0_alloc_header_size(&header_size, &alignment);
+
+	edata_t *edata = *(edata_t **)((byte_t *)tcache_stack - header_size);
+	void *addr = edata_addr_get(edata);
+	size_t bsize = edata_bsize_get(edata);
+	/* Marked as "reused" to avoid double counting stats. */
+	assert(base_edata_is_reused(edata));
+	assert(addr != NULL && bsize > 0);
+
+	/* Zero out since base_alloc returns zeroed memory. */
+	memset(addr, 0, bsize);
+
+	base_t *base = b0get();
+	malloc_mutex_lock(tsdn, &base->mtx);
+	base_edata_heap_insert(tsdn, base, edata);
+	malloc_mutex_unlock(tsdn, &base->mtx);
+}
+
 void
 base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
     size_t *mapped, size_t *n_thp) {
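The pointer handed out by b0_alloc_tcache_stack() keeps its owning edata_t immediately before the usable area. A sketch of the layout implied by b0_alloc_header_size(), where header_size = max(QUANTUM, sizeof(edata_t *)):

/*
 * addr (bump-allocated, sz_s2u(stack_size + header_size) bytes in total):
 *
 *   +----------------+---------+--------------------------------------+
 *   | edata_t *edata | padding | tcache bin stacks ...                |
 *   +----------------+---------+--------------------------------------+
 *   ^ addr                     ^ addr + header_size (returned pointer)
 *
 * b0_dalloc_tcache_stack() reads the edata_t * back from
 * (byte_t *)tcache_stack - header_size, zeroes the whole extent, and
 * re-inserts it into base->avail via base_edata_heap_insert().
 */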
diff --git a/src/cache_bin.c b/src/cache_bin.c
index 03577084..2ad2062d 100644
--- a/src/cache_bin.c
+++ b/src/cache_bin.c
@@ -14,6 +14,17 @@ cache_bin_info_init(cache_bin_info_t *info,
 	info->ncached_max = (cache_bin_sz_t)ncached_max;
 }
 
+bool
+cache_bin_stack_use_thp(void) {
+	/*
+	 * If metadata_thp is enabled, allocate the tcache stack from the base
+	 * allocator for efficiency gains.  The downside, however, is that the
+	 * base allocator never purges freed memory: after many threads have
+	 * terminated, it may cache a fair amount of memory that is never reused.
+	 */
+	return metadata_thp_enabled();
+}
+
 void
 cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
     size_t *size, size_t *alignment) {
@@ -31,10 +42,11 @@ cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
 	}
 
 	/*
-	 * Align to at least PAGE, to minimize the # of TLBs needed by the
-	 * smaller sizes; also helps if the larger sizes don't get used at all.
+	 * When not using THP, align to at least PAGE, to minimize the # of TLBs
+	 * needed by the smaller sizes; also helps if the larger sizes don't get
+	 * used at all.
 	 */
-	*alignment = PAGE;
+	*alignment = cache_bin_stack_use_thp() ? QUANTUM : PAGE;
 }
 
 void
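cache_bin_stack_use_thp() simply mirrors the metadata_thp setting, so the base-backed stack path is opt-in through an existing option rather than a new knob. A sketch of the assumed enablement (the option itself predates this patch):

/*
 * Enablement sketch: run with MALLOC_CONF="metadata_thp:always" so that
 * metadata_thp_enabled() returns true; tcache bin stacks are then carved
 * out of b0 instead of being page-aligned arena allocations.
 */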
diff --git a/src/tcache.c b/src/tcache.c
index ae68c08b..2c0a7e2e 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -2,6 +2,7 @@
 #include "jemalloc/internal/jemalloc_internal_includes.h"
 
 #include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/base.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/safety_check.h"
 #include "jemalloc/internal/san.h"
@@ -814,10 +815,17 @@ tsd_tcache_data_init(tsd_t *tsd, arena_t *arena) {
 	tcache_bin_info_compute(tcache_bin_info, tcache_nhbins);
 	cache_bin_info_compute_alloc(tcache_bin_info, tcache_nhbins,
 	    &size, &alignment);
-	size = sz_sa2u(size, alignment);
 
-	void *mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL,
-	    true, arena_get(TSDN_NULL, 0, true));
+	void *mem;
+	if (cache_bin_stack_use_thp()) {
+		/* Alignment is ignored since it comes from THP. */
+		assert(alignment == QUANTUM);
+		mem = b0_alloc_tcache_stack(tsd_tsdn(tsd), size);
+	} else {
+		size = sz_sa2u(size, alignment);
+		mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL,
+		    true, arena_get(TSDN_NULL, 0, true));
+	}
 	if (mem == NULL) {
 		return true;
 	}
@@ -925,8 +933,12 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 		cache_bin_t *cache_bin = &tcache->bins[0];
 		cache_bin_assert_empty(cache_bin, &cache_bin->bin_info);
 	}
-	idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL, true,
-	    true);
+	if (tsd_tcache && cache_bin_stack_use_thp()) {
+		b0_dalloc_tcache_stack(tsd_tsdn(tsd), tcache_slow->dyn_alloc);
+	} else {
+		idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL,
+		    true, true);
+	}
 
 	/*
 	 * The deallocation and tcache flush above may not trigger decay since
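Note the asymmetry in tcache_destroy() above: the b0 path is taken only for tsd tcaches. A plausible reading (an assumption -- the explicit-tcache allocation path is not shown in this diff) is that explicit tcaches obtain the tcache_t and its bin stacks from a single arena allocation, so only the tsd tcache's dyn_alloc can point into b0:

/*
 * Teardown sketch (assumption): explicit tcache -> idalloctm() always;
 * tsd tcache -> b0_dalloc_tcache_stack() iff cache_bin_stack_use_thp().
 */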
diff --git a/test/unit/tcache_max.c b/test/unit/tcache_max.c
index 0a563c2f..6481504e 100644
--- a/test/unit/tcache_max.c
+++ b/test/unit/tcache_max.c
@@ -215,6 +215,29 @@ tcache_max2nhbins(size_t tcache_max) {
 	return sz_size2index(tcache_max) + 1;
 }
 
+static void
+validate_tcache_stack(tcache_t *tcache) {
+	/* Assume bins[0] is enabled. */
+	void *tcache_stack = tcache->bins[0].stack_head;
+	bool expect_found = cache_bin_stack_use_thp() ? true : false;
+
+	/* Walk through all blocks to see if the stack is within range. */
+	base_t *base = b0get();
+	base_block_t *next = base->blocks;
+	bool found = false;
+	do {
+		base_block_t *block = next;
+		if ((byte_t *)tcache_stack >= (byte_t *)block &&
+		    (byte_t *)tcache_stack < ((byte_t *)block + block->size)) {
+			found = true;
+			break;
+		}
+		next = block->next;
+	} while (next != NULL);
+
+	expect_true(found == expect_found, "Unexpected tcache stack source");
+}
+
 static void *
 tcache_check(void *arg) {
 	size_t old_tcache_max, new_tcache_max, min_tcache_max, sz;
@@ -235,6 +258,7 @@ tcache_check(void *arg) {
 	tcache_nhbins = tcache_nhbins_get(tcache);
 	expect_zu_eq(tcache_nhbins, (size_t)global_do_not_change_nhbins,
 	    "Unexpected default value for tcache_nhbins");
+	validate_tcache_stack(tcache);
 
 	/*
 	 * Close the tcache and test the set.
@@ -280,6 +304,7 @@ tcache_check(void *arg) {
 	    "Unexpected mallctl() failure");
 	expect_zu_eq(old_tcache_max, min_tcache_max,
 	    "Unexpected value for tcache_max");
+	validate_tcache_stack(tcache);
 
 	/*
 	 * Check the thread's tcache_max and nhbins both through mallctl
@@ -303,6 +328,7 @@ tcache_check(void *arg) {
 			test_tcache_max_impl(new_tcache_max, alloc_option,
 			    dalloc_option);
 		}
+		validate_tcache_stack(tcache);
 	}
 
 	return NULL;