From f4a0f32d340985de477bbe329ecdaecd69ed1055 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 27 Oct 2015 15:12:10 -0700 Subject: [PATCH] Fast-path improvement: reduce # of branches and unnecessary operations. - Combine multiple runtime branches into a single malloc_slow check. - Avoid calling arena_choose / size2index / index2size on fast path. - A few micro optimizations. --- include/jemalloc/internal/arena.h | 63 +++--- .../jemalloc/internal/jemalloc_internal.h.in | 62 +++--- include/jemalloc/internal/prof.h | 6 +- include/jemalloc/internal/tcache.h | 116 +++++++---- src/arena.c | 26 +-- src/ckh.c | 10 +- src/huge.c | 6 +- src/jemalloc.c | 192 +++++++++++++----- src/prof.c | 37 ++-- src/quarantine.c | 20 +- src/tcache.c | 33 +-- 11 files changed, 357 insertions(+), 214 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9e2375ce..9715ad93 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -461,8 +461,10 @@ extern arena_dalloc_junk_small_t *arena_dalloc_junk_small; void arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info); #endif void arena_quarantine_junk_small(void *ptr, size_t usize); -void *arena_malloc_small(arena_t *arena, size_t size, bool zero); -void *arena_malloc_large(arena_t *arena, size_t size, bool zero); +void *arena_malloc_small(arena_t *arena, size_t size, szind_t ind, + bool zero); +void *arena_malloc_large(arena_t *arena, size_t size, szind_t ind, + bool zero); void *arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, bool zero, tcache_t *tcache); void arena_prof_promoted(const void *ptr, size_t size); @@ -558,11 +560,11 @@ prof_tctx_t *arena_prof_tctx_get(const void *ptr); void arena_prof_tctx_set(const void *ptr, size_t usize, prof_tctx_t *tctx); void arena_prof_tctx_reset(const void *ptr, size_t usize, const void *old_ptr, prof_tctx_t *old_tctx); -void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, - tcache_t *tcache); +void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind, + bool zero, tcache_t *tcache, bool slow_path); arena_t *arena_aalloc(const void *ptr); size_t arena_salloc(const void *ptr, bool demote); -void arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +void arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path); void arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); #endif @@ -1158,34 +1160,34 @@ arena_prof_tctx_reset(const void *ptr, size_t usize, const void *old_ptr, } JEMALLOC_ALWAYS_INLINE void * -arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, - tcache_t *tcache) +arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind, + bool zero, tcache_t *tcache, bool slow_path) { assert(size != 0); + if (likely(tcache != NULL)) { + if (likely(size <= SMALL_MAXCLASS)) { + return (tcache_alloc_small(tsd, arena, tcache, size, + ind, zero, slow_path)); + } + if (likely(size <= tcache_maxclass)) { + return (tcache_alloc_large(tsd, arena, tcache, size, + ind, zero, slow_path)); + } + /* (size > tcache_maxclass) case falls through. 
*/ + assert(size > tcache_maxclass); + } + arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) return (NULL); - if (likely(size <= SMALL_MAXCLASS)) { - if (likely(tcache != NULL)) { - return (tcache_alloc_small(tsd, arena, tcache, size, - zero)); - } else - return (arena_malloc_small(arena, size, zero)); - } else if (likely(size <= large_maxclass)) { - /* - * Initialize tcache after checking size in order to avoid - * infinite recursion during tcache initialization. - */ - if (likely(tcache != NULL) && size <= tcache_maxclass) { - return (tcache_alloc_large(tsd, arena, tcache, size, - zero)); - } else - return (arena_malloc_large(arena, size, zero)); - } else - return (huge_malloc(tsd, arena, size, zero, tcache)); + if (likely(size <= SMALL_MAXCLASS)) + return (arena_malloc_small(arena, size, ind, zero)); + if (likely(size <= large_maxclass)) + return (arena_malloc_large(arena, size, ind, zero)); + return (huge_malloc(tsd, arena, size, zero, tcache)); } JEMALLOC_ALWAYS_INLINE arena_t * @@ -1251,7 +1253,7 @@ arena_salloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void -arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) +arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { arena_chunk_t *chunk; size_t pageind, mapbits; @@ -1268,7 +1270,8 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) if (likely(tcache != NULL)) { szind_t binind = arena_ptr_small_binind_get(ptr, mapbits); - tcache_dalloc_small(tsd, tcache, ptr, binind); + tcache_dalloc_small(tsd, tcache, ptr, binind, + slow_path); } else { arena_dalloc_small(extent_node_arena_get( &chunk->node), chunk, ptr, pageind); @@ -1283,7 +1286,7 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) if (likely(tcache != NULL) && size - large_pad <= tcache_maxclass) { tcache_dalloc_large(tsd, tcache, ptr, size - - large_pad); + large_pad, slow_path); } else { arena_dalloc_large(extent_node_arena_get( &chunk->node), chunk, ptr); @@ -1319,7 +1322,7 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) /* Small allocation. */ if (likely(tcache != NULL)) { szind_t binind = size2index(size); - tcache_dalloc_small(tsd, tcache, ptr, binind); + tcache_dalloc_small(tsd, tcache, ptr, binind, true); } else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; @@ -1331,7 +1334,7 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) PAGE_MASK) == 0); if (likely(tcache != NULL) && size <= tcache_maxclass) - tcache_dalloc_large(tsd, tcache, ptr, size); + tcache_dalloc_large(tsd, tcache, ptr, size, true); else { arena_dalloc_large(extent_node_arena_get( &chunk->node), chunk, ptr); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 654cd088..d31da4ca 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -437,7 +437,7 @@ extern unsigned ncpus; * index2size_tab encodes the same information as could be computed (at * unacceptable cost in some code paths) by index2size_compute(). */ -extern size_t const index2size_tab[NSIZES]; +extern size_t const index2size_tab[NSIZES+1]; /* * size2index_tab is a compact lookup table that rounds request sizes up to * size classes. 
In order to reduce cache footprint, the table is compressed, @@ -624,7 +624,7 @@ JEMALLOC_ALWAYS_INLINE size_t index2size(szind_t index) { - assert(index < NSIZES); + assert(index <= NSIZES); return (index2size_lookup(index)); } @@ -823,12 +823,14 @@ arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, #ifndef JEMALLOC_ENABLE_INLINE arena_t *iaalloc(const void *ptr); size_t isalloc(const void *ptr, bool demote); -void *iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, - bool is_metadata, arena_t *arena); -void *imalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena); -void *imalloc(tsd_t *tsd, size_t size); -void *icalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena); -void *icalloc(tsd_t *tsd, size_t size); +void *iallocztm(tsd_t *tsd, size_t size, szind_t ind, bool zero, + tcache_t *tcache, bool is_metadata, arena_t *arena, bool slow_path); +void *imalloct(tsd_t *tsd, size_t size, szind_t ind, tcache_t *tcache, + arena_t *arena); +void *imalloc(tsd_t *tsd, size_t size, szind_t ind, bool slow_path); +void *icalloct(tsd_t *tsd, size_t size, szind_t ind, tcache_t *tcache, + arena_t *arena); +void *icalloc(tsd_t *tsd, size_t size, szind_t ind); void *ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, tcache_t *tcache, bool is_metadata, arena_t *arena); void *ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, @@ -837,10 +839,11 @@ void *ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero); size_t ivsalloc(const void *ptr, bool demote); size_t u2rz(size_t usize); size_t p2rz(const void *ptr); -void idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata); +void idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata, + bool slow_path); void idalloct(tsd_t *tsd, void *ptr, tcache_t *tcache); void idalloc(tsd_t *tsd, void *ptr); -void iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +void iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path); void isdalloct(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); void isqalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); void *iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, @@ -881,14 +884,14 @@ isalloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void * -iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, bool is_metadata, - arena_t *arena) +iallocztm(tsd_t *tsd, size_t size, szind_t ind, bool zero, tcache_t *tcache, + bool is_metadata, arena_t *arena, bool slow_path) { void *ret; assert(size != 0); - ret = arena_malloc(tsd, arena, size, zero, tcache); + ret = arena_malloc(tsd, arena, size, ind, zero, tcache, slow_path); if (config_stats && is_metadata && likely(ret != NULL)) { arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, config_prof)); @@ -897,31 +900,33 @@ iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, bool is_metadata } JEMALLOC_ALWAYS_INLINE void * -imalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena) +imalloct(tsd_t *tsd, size_t size, szind_t ind, tcache_t *tcache, arena_t *arena) { - return (iallocztm(tsd, size, false, tcache, false, arena)); + return (iallocztm(tsd, size, ind, false, tcache, false, arena, true)); } JEMALLOC_ALWAYS_INLINE void * -imalloc(tsd_t *tsd, size_t size) +imalloc(tsd_t *tsd, size_t size, szind_t ind, bool slow_path) { - return (iallocztm(tsd, size, false, tcache_get(tsd, true), false, NULL)); + return (iallocztm(tsd, size, ind, false, tcache_get(tsd, true), false, + NULL, slow_path)); } 
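The slow_path argument threaded through these always-inline wrappers is a compile-time constant at every call site, which is what lets the compiler generate separate fast and slow bodies and drop the dead branches from the fast one. A minimal, self-contained C sketch of that pattern follows; my_alloc, my_alloc_body and opt_junk are hypothetical stand-ins for illustration only, not jemalloc APIs:

	#include <stdbool.h>
	#include <stdlib.h>
	#include <string.h>

	static bool opt_junk = false;	/* rarely-enabled runtime option */

	static inline void *
	my_alloc_body(size_t size, bool slow_path)
	{
		void *ret = malloc(size);

		/* With slow_path statically false, this branch is eliminated. */
		if (slow_path && opt_junk && ret != NULL)
			memset(ret, 0xa5, size);
		return (ret);
	}

	void *
	my_alloc(size_t size)
	{
		/* One runtime branch selects one of two specialized bodies. */
		if (!opt_junk)
			return (my_alloc_body(size, false));
		return (my_alloc_body(size, true));
	}

Compiled with optimization, the slow_path == false instantiation contains no reference to opt_junk at all; the same effect is what the patch relies on when it comments that imalloc_body() is inlined so fast and slow paths are generated separately with a statically known slow_path.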
JEMALLOC_ALWAYS_INLINE void * -icalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena) +icalloct(tsd_t *tsd, size_t size, szind_t ind, tcache_t *tcache, arena_t *arena) { - return (iallocztm(tsd, size, true, tcache, false, arena)); + return (iallocztm(tsd, size, ind, true, tcache, false, arena, true)); } JEMALLOC_ALWAYS_INLINE void * -icalloc(tsd_t *tsd, size_t size) +icalloc(tsd_t *tsd, size_t size, szind_t ind) { - return (iallocztm(tsd, size, true, tcache_get(tsd, true), false, NULL)); + return (iallocztm(tsd, size, ind, true, tcache_get(tsd, true), false, + NULL, true)); } JEMALLOC_ALWAYS_INLINE void * @@ -997,7 +1002,8 @@ p2rz(const void *ptr) } JEMALLOC_ALWAYS_INLINE void -idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata) +idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata, + bool slow_path) { assert(ptr != NULL); @@ -1006,31 +1012,31 @@ idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata) config_prof)); } - arena_dalloc(tsd, ptr, tcache); + arena_dalloc(tsd, ptr, tcache, slow_path); } JEMALLOC_ALWAYS_INLINE void idalloct(tsd_t *tsd, void *ptr, tcache_t *tcache) { - idalloctm(tsd, ptr, tcache, false); + idalloctm(tsd, ptr, tcache, false, true); } JEMALLOC_ALWAYS_INLINE void idalloc(tsd_t *tsd, void *ptr) { - idalloctm(tsd, ptr, tcache_get(tsd, false), false); + idalloctm(tsd, ptr, tcache_get(tsd, false), false, true); } JEMALLOC_ALWAYS_INLINE void -iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) +iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { - if (config_fill && unlikely(opt_quarantine)) + if (slow_path && config_fill && unlikely(opt_quarantine)) quarantine(tsd, ptr); else - idalloctm(tsd, ptr, tcache, false); + idalloctm(tsd, ptr, tcache, false, slow_path); } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index e5198c3e..a25502a9 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -436,16 +436,16 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, cassert(config_prof); tdata = prof_tdata_get(tsd, true); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) tdata = NULL; if (tdata_out != NULL) *tdata_out = tdata; - if (tdata == NULL) + if (unlikely(tdata == NULL)) return (true); - if (tdata->bytes_until_sample >= usize) { + if (likely(tdata->bytes_until_sample >= usize)) { if (update) tdata->bytes_until_sample -= usize; return (true); diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 5079cd26..c2921405 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -70,6 +70,13 @@ struct tcache_bin_s { int low_water; /* Min # cached since last GC. */ unsigned lg_fill_div; /* Fill (ncached_max >> lg_fill_div). */ unsigned ncached; /* # of cached objects. */ + /* + * To make use of adjacent cacheline prefetch, the items in the avail + * stack goes to higher address for newer allocations. avail points + * just above the available space, which means that + * avail[-ncached, ... 1] are available items and the lowest item will + * be allocated first. + */ void **avail; /* Stack of available objects. 
*/ }; @@ -126,7 +133,7 @@ extern tcaches_t *tcaches; size_t tcache_salloc(const void *ptr); void tcache_event_hard(tsd_t *tsd, tcache_t *tcache); void *tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache, - tcache_bin_t *tbin, szind_t binind); + tcache_bin_t *tbin, szind_t binind, bool *tcache_success); void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, szind_t binind, unsigned rem); void tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, @@ -155,15 +162,15 @@ void tcache_flush(void); bool tcache_enabled_get(void); tcache_t *tcache_get(tsd_t *tsd, bool create); void tcache_enabled_set(bool enabled); -void *tcache_alloc_easy(tcache_bin_t *tbin); +void *tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success); void *tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, - size_t size, bool zero); + size_t size, szind_t ind, bool zero, bool slow_path); void *tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, - size_t size, bool zero); + size_t size, szind_t ind, bool zero, bool slow_path); void tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, - szind_t binind); + szind_t binind, bool slow_path); void tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, - size_t size); + size_t size, bool slow_path); tcache_t *tcaches_get(tsd_t *tsd, unsigned ind); #endif @@ -247,44 +254,69 @@ tcache_event(tsd_t *tsd, tcache_t *tcache) } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_easy(tcache_bin_t *tbin) +tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success) { void *ret; if (unlikely(tbin->ncached == 0)) { tbin->low_water = -1; + *tcache_success = false; return (NULL); } + /* + * tcache_success (instead of ret) should be checked upon the return of + * this function. We avoid checking (ret == NULL) because there is + * never a null stored on the avail stack (which is unknown to the + * compiler), and eagerly checking ret would cause pipeline stall + * (waiting for the cacheline). + */ + *tcache_success = true; + ret = *(tbin->avail - tbin->ncached); tbin->ncached--; + if (unlikely((int)tbin->ncached < tbin->low_water)) tbin->low_water = tbin->ncached; - ret = tbin->avail[tbin->ncached]; + return (ret); } JEMALLOC_ALWAYS_INLINE void * tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, - bool zero) + szind_t binind, bool zero, bool slow_path) { void *ret; - szind_t binind; - size_t usize; tcache_bin_t *tbin; + bool tcache_success; + size_t usize JEMALLOC_CC_SILENCE_INIT(0); - binind = size2index(size); assert(binind < NBINS); tbin = &tcache->tbins[binind]; - usize = index2size(binind); - ret = tcache_alloc_easy(tbin); - if (unlikely(ret == NULL)) { - ret = tcache_alloc_small_hard(tsd, arena, tcache, tbin, binind); - if (ret == NULL) + ret = tcache_alloc_easy(tbin, &tcache_success); + assert(tcache_success == (ret != NULL)); + if (unlikely(!tcache_success)) { + bool tcache_hard_success; + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + + ret = tcache_alloc_small_hard(tsd, arena, tcache, tbin, binind, + &tcache_hard_success); + if (tcache_hard_success == false) return (NULL); } - assert(tcache_salloc(ret) == usize); + + assert(ret); + /* + * Only compute usize if required. The checks in the following if + * statement are all static. 
+ */ + if (config_prof || (slow_path && config_fill) || unlikely(zero)) { + usize = index2size(binind); + assert(tcache_salloc(ret) == usize); + } if (likely(!zero)) { - if (config_fill) { + if (slow_path && config_fill) { if (unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], false); @@ -292,7 +324,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, memset(ret, 0, usize); } } else { - if (config_fill && unlikely(opt_junk_alloc)) { + if (slow_path && config_fill && unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } @@ -309,28 +341,38 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, JEMALLOC_ALWAYS_INLINE void * tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, - bool zero) + szind_t binind, bool zero, bool slow_path) { void *ret; - szind_t binind; - size_t usize; tcache_bin_t *tbin; + bool tcache_success; + size_t usize JEMALLOC_CC_SILENCE_INIT(0); - binind = size2index(size); - usize = index2size(binind); - assert(usize <= tcache_maxclass); assert(binind < nhbins); tbin = &tcache->tbins[binind]; - ret = tcache_alloc_easy(tbin); - if (unlikely(ret == NULL)) { + ret = tcache_alloc_easy(tbin, &tcache_success); + assert(tcache_success == (ret != NULL)); + if (unlikely(!tcache_success)) { /* * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. */ - ret = arena_malloc_large(arena, usize, zero); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + + usize = index2size(binind); + assert(usize <= tcache_maxclass); + ret = arena_malloc_large(arena, usize, binind, zero); if (ret == NULL) return (NULL); } else { + /* Only compute usize on demand */ + if (config_prof || (slow_path && config_fill) || unlikely(zero)) { + usize = index2size(binind); + assert(usize <= tcache_maxclass); + } + if (config_prof && usize == LARGE_MINCLASS) { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); @@ -340,7 +382,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, BININD_INVALID); } if (likely(!zero)) { - if (config_fill) { + if (slow_path && config_fill) { if (unlikely(opt_junk_alloc)) memset(ret, 0xa5, usize); else if (unlikely(opt_zero)) @@ -360,14 +402,15 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind) +tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, + bool slow_path) { tcache_bin_t *tbin; tcache_bin_info_t *tbin_info; assert(tcache_salloc(ptr) <= SMALL_MAXCLASS); - if (config_fill && unlikely(opt_junk_free)) + if (slow_path && config_fill && unlikely(opt_junk_free)) arena_dalloc_junk_small(ptr, &arena_bin_info[binind]); tbin = &tcache->tbins[binind]; @@ -377,14 +420,15 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind) (tbin_info->ncached_max >> 1)); } assert(tbin->ncached < tbin_info->ncached_max); - tbin->avail[tbin->ncached] = ptr; tbin->ncached++; + *(tbin->avail - tbin->ncached) = ptr; tcache_event(tsd, tcache); } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t size) +tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t size, + bool slow_path) { szind_t binind; tcache_bin_t *tbin; @@ -396,7 +440,7 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t 
size) binind = size2index(size); - if (config_fill && unlikely(opt_junk_free)) + if (slow_path && config_fill && unlikely(opt_junk_free)) arena_dalloc_junk_large(ptr, size); tbin = &tcache->tbins[binind]; @@ -406,8 +450,8 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t size) (tbin_info->ncached_max >> 1), tcache); } assert(tbin->ncached < tbin_info->ncached_max); - tbin->avail[tbin->ncached] = ptr; tbin->ncached++; + *(tbin->avail - tbin->ncached) = ptr; tcache_event(tsd, tcache); } diff --git a/src/arena.c b/src/arena.c index 844d721c..143afb9a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1990,11 +1990,10 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, szind_t binind, /* * OOM. tbin->avail isn't yet filled down to its first * element, so the successful allocations (if any) must - * be moved to the base of tbin->avail before bailing - * out. + * be moved just before tbin->avail before bailing out. */ if (i > 0) { - memmove(tbin->avail, &tbin->avail[nfill - i], + memmove(tbin->avail - i, tbin->avail - nfill, i * sizeof(void *)); } break; @@ -2004,7 +2003,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, szind_t binind, true); } /* Insert such that low regions get used first. */ - tbin->avail[nfill - 1 - i] = ptr; + *(tbin->avail - nfill + i) = ptr; } if (config_stats) { bin->stats.nmalloc += i; @@ -2125,14 +2124,12 @@ arena_quarantine_junk_small(void *ptr, size_t usize) } void * -arena_malloc_small(arena_t *arena, size_t size, bool zero) +arena_malloc_small(arena_t *arena, size_t size, szind_t binind, bool zero) { void *ret; arena_bin_t *bin; arena_run_t *run; - szind_t binind; - binind = size2index(size); assert(binind < NBINS); bin = &arena->bins[binind]; size = index2size(binind); @@ -2179,7 +2176,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) } void * -arena_malloc_large(arena_t *arena, size_t size, bool zero) +arena_malloc_large(arena_t *arena, size_t size, szind_t binind, bool zero) { void *ret; size_t usize; @@ -2189,7 +2186,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) UNUSED bool idump; /* Large allocation. */ - usize = s2u(size); + usize = index2size(binind); malloc_mutex_lock(&arena->lock); if (config_cache_oblivious) { uint64_t r; @@ -2214,7 +2211,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) ret = (void *)((uintptr_t)arena_miscelm_to_rpages(miscelm) + random_offset); if (config_stats) { - szind_t index = size2index(usize) - NBINS; + szind_t index = binind - NBINS; arena->stats.nmalloc_large++; arena->stats.nrequests_large++; @@ -2336,7 +2333,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE && (usize & PAGE_MASK) == 0))) { /* Small; alignment doesn't require special run placement. */ - ret = arena_malloc(tsd, arena, usize, zero, tcache); + ret = arena_malloc(tsd, arena, usize, size2index(usize), zero, + tcache, true); } else if (usize <= large_maxclass && alignment <= PAGE) { /* * Large; alignment doesn't require special run placement. @@ -2344,7 +2342,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, * the base of the run, so do some bit manipulation to retrieve * the base. 
*/ - ret = arena_malloc(tsd, arena, usize, zero, tcache); + ret = arena_malloc(tsd, arena, usize, size2index(usize), zero, + tcache, true); if (config_cache_oblivious) ret = (void *)((uintptr_t)ret & ~PAGE_MASK); } else { @@ -2823,7 +2822,8 @@ arena_ralloc_move_helper(tsd_t *tsd, arena_t *arena, size_t usize, { if (alignment == 0) - return (arena_malloc(tsd, arena, usize, zero, tcache)); + return (arena_malloc(tsd, arena, usize, size2index(usize), zero, + tcache, true)); usize = sa2u(usize, alignment); if (usize == 0) return (NULL); diff --git a/src/ckh.c b/src/ckh.c index 53a1c1ef..e4328d22 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -283,12 +283,12 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloctm(tsd, tab, tcache_get(tsd, false), true); + idalloctm(tsd, tab, tcache_get(tsd, false), true, true); break; } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true); + idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; } @@ -330,7 +330,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloctm(tsd, tab, tcache_get(tsd, false), true); + idalloctm(tsd, tab, tcache_get(tsd, false), true, true); #ifdef CKH_COUNT ckh->nshrinks++; #endif @@ -338,7 +338,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true); + idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; #ifdef CKH_COUNT @@ -421,7 +421,7 @@ ckh_delete(tsd_t *tsd, ckh_t *ckh) (unsigned long long)ckh->nrelocs); #endif - idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true); + idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true); if (config_debug) memset(ckh, 0x5a, sizeof(ckh_t)); } diff --git a/src/huge.c b/src/huge.c index 1e9a6651..c1fa3795 100644 --- a/src/huge.c +++ b/src/huge.c @@ -75,7 +75,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, arena = arena_choose(tsd, arena); if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena, size, alignment, &is_zeroed)) == NULL) { - idalloctm(tsd, node, tcache, true); + idalloctm(tsd, node, tcache, true, true); return (NULL); } @@ -83,7 +83,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, if (huge_node_set(ret, node)) { arena_chunk_dalloc_huge(arena, ret, size); - idalloctm(tsd, node, tcache, true); + idalloctm(tsd, node, tcache, true, true); return (NULL); } @@ -372,7 +372,7 @@ huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) extent_node_size_get(node)); arena_chunk_dalloc_huge(extent_node_arena_get(node), extent_node_addr_get(node), extent_node_size_get(node)); - idalloctm(tsd, node, tcache, true); + idalloctm(tsd, node, tcache, true, true); } arena_t * diff --git a/src/jemalloc.c b/src/jemalloc.c index 5a2d3240..eed6331d 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -70,12 +70,29 @@ typedef enum { } malloc_init_t; static malloc_init_t malloc_init_state = malloc_init_uninitialized; +/* 0 should be the common case. Set to true to trigger initialization. */ +static bool malloc_slow = true; + +/* When malloc_slow != 0, set the corresponding bits for sanity check. 
*/ +enum { + flag_opt_junk_alloc = (1U), + flag_opt_junk_free = (1U << 1), + flag_opt_quarantine = (1U << 2), + flag_opt_zero = (1U << 3), + flag_opt_utrace = (1U << 4), + flag_in_valgrind = (1U << 5), + flag_opt_xmalloc = (1U << 6) +}; +static uint8_t malloc_slow_flags; + +/* Last entry for overflow detection only. */ JEMALLOC_ALIGNED(CACHELINE) -const size_t index2size_tab[NSIZES] = { +const size_t index2size_tab[NSIZES+1] = { #define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ ((ZU(1)<: Error in malloc(): " + "out of memory\n"); + abort(); + } + set_errno(ENOMEM); + } + if (config_stats && likely(ret != NULL)) { + assert(usize == isalloc(ret, config_prof)); + *tsd_thread_allocatedp_get(tsd) += usize; + } } JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN @@ -1424,21 +1488,20 @@ je_malloc(size_t size) if (size == 0) size = 1; - ret = imalloc_body(size, &tsd, &usize); - if (unlikely(ret == NULL)) { - if (config_xmalloc && unlikely(opt_xmalloc)) { - malloc_write(": Error in malloc(): " - "out of memory\n"); - abort(); - } - set_errno(ENOMEM); + if (likely(!malloc_slow)) { + /* + * imalloc_body() is inlined so that fast and slow paths are + * generated separately with statically known slow_path. + */ + ret = imalloc_body(size, &tsd, &usize, false); + imalloc_post_check(ret, tsd, usize, false); + } else { + ret = imalloc_body(size, &tsd, &usize, true); + imalloc_post_check(ret, tsd, usize, true); + UTRACE(0, size, ret); + JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, usize, false); } - if (config_stats && likely(ret != NULL)) { - assert(usize == isalloc(ret, config_prof)); - *tsd_thread_allocatedp_get(tsd) += usize; - } - UTRACE(0, size, ret); - JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, usize, false); + return (ret); } @@ -1576,34 +1639,35 @@ je_aligned_alloc(size_t alignment, size_t size) } static void * -icalloc_prof_sample(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) +icalloc_prof_sample(tsd_t *tsd, size_t usize, szind_t ind, prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = icalloc(tsd, LARGE_MINCLASS); + szind_t ind_large = size2index(LARGE_MINCLASS); + p = icalloc(tsd, LARGE_MINCLASS, ind_large); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else - p = icalloc(tsd, usize); + p = icalloc(tsd, usize, ind); return (p); } JEMALLOC_ALWAYS_INLINE_C void * -icalloc_prof(tsd_t *tsd, size_t usize) +icalloc_prof(tsd_t *tsd, size_t usize, szind_t ind) { void *p; prof_tctx_t *tctx; tctx = prof_alloc_prep(tsd, usize, prof_active_get_unlocked(), true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) - p = icalloc_prof_sample(tsd, usize, tctx); + p = icalloc_prof_sample(tsd, usize, ind, tctx); else - p = icalloc(tsd, usize); + p = icalloc(tsd, usize, ind); if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, true); return (NULL); @@ -1621,6 +1685,7 @@ je_calloc(size_t num, size_t size) void *ret; tsd_t *tsd; size_t num_size; + szind_t ind; size_t usize JEMALLOC_CC_SILENCE_INIT(0); if (unlikely(malloc_init())) { @@ -1650,17 +1715,18 @@ je_calloc(size_t num, size_t size) goto label_return; } + ind = size2index(num_size); if (config_prof && opt_prof) { - usize = s2u(num_size); + usize = index2size(ind); if (unlikely(usize == 0)) { ret = NULL; goto label_return; } - ret = icalloc_prof(tsd, usize); + ret = icalloc_prof(tsd, usize, ind); } else { if (config_stats || (config_valgrind && unlikely(in_valgrind))) - usize = s2u(num_size); - ret = icalloc(tsd, num_size); + usize = index2size(ind); + ret = 
icalloc(tsd, num_size, ind); } label_return: @@ -1725,7 +1791,7 @@ irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize) } JEMALLOC_INLINE_C void -ifree(tsd_t *tsd, void *ptr, tcache_t *tcache) +ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { size_t usize; UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1740,10 +1806,15 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache) usize = isalloc(ptr, config_prof); if (config_stats) *tsd_thread_deallocatedp_get(tsd) += usize; - if (config_valgrind && unlikely(in_valgrind)) - rzsize = p2rz(ptr); - iqalloc(tsd, ptr, tcache); - JEMALLOC_VALGRIND_FREE(ptr, rzsize); + + if (likely(!slow_path)) + iqalloc(tsd, ptr, tcache, false); + else { + if (config_valgrind && unlikely(in_valgrind)) + rzsize = p2rz(ptr); + iqalloc(tsd, ptr, tcache, true); + JEMALLOC_VALGRIND_FREE(ptr, rzsize); + } } JEMALLOC_INLINE_C void @@ -1780,7 +1851,7 @@ je_realloc(void *ptr, size_t size) /* realloc(ptr, 0) is equivalent to free(ptr). */ UTRACE(ptr, 0, 0); tsd = tsd_fetch(); - ifree(tsd, ptr, tcache_get(tsd, false)); + ifree(tsd, ptr, tcache_get(tsd, false), true); return (NULL); } size = 1; @@ -1807,7 +1878,10 @@ je_realloc(void *ptr, size_t size) } } else { /* realloc(NULL, size) is equivalent to malloc(size). */ - ret = imalloc_body(size, &tsd, &usize); + if (likely(!malloc_slow)) + ret = imalloc_body(size, &tsd, &usize, false); + else + ret = imalloc_body(size, &tsd, &usize, true); } if (unlikely(ret == NULL)) { @@ -1836,7 +1910,10 @@ je_free(void *ptr) UTRACE(ptr, 0, 0); if (likely(ptr != NULL)) { tsd_t *tsd = tsd_fetch(); - ifree(tsd, ptr, tcache_get(tsd, false)); + if (likely(!malloc_slow)) + ifree(tsd, ptr, tcache_get(tsd, false), false); + else + ifree(tsd, ptr, tcache_get(tsd, false), true); } } @@ -1965,12 +2042,14 @@ JEMALLOC_ALWAYS_INLINE_C void * imallocx_flags(tsd_t *tsd, size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena) { + szind_t ind; + ind = size2index(usize); if (unlikely(alignment != 0)) return (ipalloct(tsd, usize, alignment, zero, tcache, arena)); if (unlikely(zero)) - return (icalloct(tsd, usize, tcache, arena)); - return (imalloct(tsd, usize, tcache, arena)); + return (icalloct(tsd, usize, ind, tcache, arena)); + return (imalloct(tsd, usize, ind, tcache, arena)); } static void * @@ -2034,9 +2113,10 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) arena_t *arena; if (likely(flags == 0)) { + szind_t ind = size2index(size); if (config_stats || (config_valgrind && unlikely(in_valgrind))) - *usize = s2u(size); - return (imalloc(tsd, size)); + *usize = index2size(ind); + return (imalloc(tsd, size, ind, true)); } if (unlikely(imallocx_flags_decode_hard(tsd, size, flags, usize, @@ -2375,7 +2455,7 @@ je_dallocx(void *ptr, int flags) tcache = tcache_get(tsd, false); UTRACE(ptr, 0, 0); - ifree(tsd_fetch(), ptr, tcache); + ifree(tsd_fetch(), ptr, tcache, true); } JEMALLOC_ALWAYS_INLINE_C size_t diff --git a/src/prof.c b/src/prof.c index 5d2b9598..199e63e4 100644 --- a/src/prof.c +++ b/src/prof.c @@ -551,9 +551,9 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) /* * Create a single allocation that has space for vec of length bt->len. 
*/ - prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, offsetof(prof_gctx_t, - vec) + (bt->len * sizeof(void *)), false, tcache_get(tsd, true), - true, NULL); + size_t size = offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *)); + prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, size, + size2index(size), false, tcache_get(tsd, true), true, NULL, true); if (gctx == NULL) return (NULL); gctx->lock = prof_gctx_mutex_choose(); @@ -594,7 +594,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx, prof_leave(tsd, tdata_self); /* Destroy gctx. */ malloc_mutex_unlock(gctx->lock); - idalloctm(tsd, gctx, tcache_get(tsd, false), true); + idalloctm(tsd, gctx, tcache_get(tsd, false), true, true); } else { /* * Compensate for increment in prof_tctx_destroy() or @@ -701,7 +701,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) prof_tdata_destroy(tsd, tdata, false); if (destroy_tctx) - idalloctm(tsd, tctx, tcache_get(tsd, false), true); + idalloctm(tsd, tctx, tcache_get(tsd, false), true, true); } static bool @@ -730,7 +730,8 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. */ prof_leave(tsd, tdata); - idalloctm(tsd, gctx.v, tcache_get(tsd, false), true); + idalloctm(tsd, gctx.v, tcache_get(tsd, false), true, + true); return (true); } new_gctx = true; @@ -789,8 +790,9 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) /* Link a prof_tctx_t into gctx for this thread. */ tcache = tcache_get(tsd, true); - ret.v = iallocztm(tsd, sizeof(prof_tctx_t), false, tcache, true, - NULL); + ret.v = iallocztm(tsd, sizeof(prof_tctx_t), + size2index(sizeof(prof_tctx_t)), false, tcache, true, NULL, + true); if (ret.p == NULL) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); @@ -810,7 +812,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) if (error) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); - idalloctm(tsd, ret.v, tcache, true); + idalloctm(tsd, ret.v, tcache, true, true); return (NULL); } malloc_mutex_lock(gctx->lock); @@ -1211,7 +1213,7 @@ prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) tctx_tree_remove(&gctx->tctxs, to_destroy); idalloctm(tsd, to_destroy, - tcache_get(tsd, false), true); + tcache_get(tsd, false), true, true); } else next = NULL; } while (next != NULL); @@ -1714,8 +1716,8 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, /* Initialize an empty cache for this thread. 
*/ tcache = tcache_get(tsd, true); - tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t), false, - tcache, true, NULL); + tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t), + size2index(sizeof(prof_tdata_t)), false, tcache, true, NULL, true); if (tdata == NULL) return (NULL); @@ -1729,7 +1731,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloctm(tsd, tdata, tcache, true); + idalloctm(tsd, tdata, tcache, true, true); return (NULL); } @@ -1784,9 +1786,9 @@ prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, tcache = tcache_get(tsd, false); if (tdata->thread_name != NULL) - idalloctm(tsd, tdata->thread_name, tcache, true); + idalloctm(tsd, tdata->thread_name, tcache, true, true); ckh_delete(tsd, &tdata->bt2tctx); - idalloctm(tsd, tdata, tcache, true); + idalloctm(tsd, tdata, tcache, true, true); } static void @@ -1947,7 +1949,8 @@ prof_thread_name_alloc(tsd_t *tsd, const char *thread_name) if (size == 1) return (""); - ret = iallocztm(tsd, size, false, tcache_get(tsd, true), true, NULL); + ret = iallocztm(tsd, size, size2index(size), false, tcache_get(tsd, + true), true, NULL, true); if (ret == NULL) return (NULL); memcpy(ret, thread_name, size); @@ -1980,7 +1983,7 @@ prof_thread_name_set(tsd_t *tsd, const char *thread_name) if (tdata->thread_name != NULL) { idalloctm(tsd, tdata->thread_name, tcache_get(tsd, false), - true); + true, true); tdata->thread_name = NULL; } if (strlen(s) > 0) diff --git a/src/quarantine.c b/src/quarantine.c index 6c43dfca..ff8801cb 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -23,12 +23,14 @@ static quarantine_t * quarantine_init(tsd_t *tsd, size_t lg_maxobjs) { quarantine_t *quarantine; + size_t size; assert(tsd_nominal(tsd)); - quarantine = (quarantine_t *)iallocztm(tsd, offsetof(quarantine_t, objs) - + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)), false, - tcache_get(tsd, true), true, NULL); + size = offsetof(quarantine_t, objs) + ((ZU(1) << lg_maxobjs) * + sizeof(quarantine_obj_t)); + quarantine = (quarantine_t *)iallocztm(tsd, size, size2index(size), + false, tcache_get(tsd, true), true, NULL, true); if (quarantine == NULL) return (NULL); quarantine->curbytes = 0; @@ -55,7 +57,7 @@ quarantine_alloc_hook_work(tsd_t *tsd) if (tsd_quarantine_get(tsd) == NULL) tsd_quarantine_set(tsd, quarantine); else - idalloctm(tsd, quarantine, tcache_get(tsd, false), true); + idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true); } static quarantine_t * @@ -87,7 +89,7 @@ quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b * sizeof(quarantine_obj_t)); } - idalloctm(tsd, quarantine, tcache_get(tsd, false), true); + idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true); tsd_quarantine_set(tsd, ret); return (ret); @@ -98,7 +100,7 @@ quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine) { quarantine_obj_t *obj = &quarantine->objs[quarantine->first]; assert(obj->usize == isalloc(obj->ptr, config_prof)); - idalloctm(tsd, obj->ptr, NULL, false); + idalloctm(tsd, obj->ptr, NULL, false, true); quarantine->curbytes -= obj->usize; quarantine->curobjs--; quarantine->first = (quarantine->first + 1) & ((ZU(1) << @@ -123,7 +125,7 @@ quarantine(tsd_t *tsd, void *ptr) assert(opt_quarantine); if ((quarantine = tsd_quarantine_get(tsd)) == NULL) { - idalloctm(tsd, ptr, NULL, false); + idalloctm(tsd, ptr, NULL, false, true); return; } /* @@ -162,7 +164,7 @@ 
quarantine(tsd_t *tsd, void *ptr) } } else { assert(quarantine->curbytes == 0); - idalloctm(tsd, ptr, NULL, false); + idalloctm(tsd, ptr, NULL, false, true); } } @@ -177,7 +179,7 @@ quarantine_cleanup(tsd_t *tsd) quarantine = tsd_quarantine_get(tsd); if (quarantine != NULL) { quarantine_drain(tsd, quarantine, 0); - idalloctm(tsd, quarantine, tcache_get(tsd, false), true); + idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true); tsd_quarantine_set(tsd, NULL); } } diff --git a/src/tcache.c b/src/tcache.c index fdafd0c6..78c62300 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -72,7 +72,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) void * tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache, - tcache_bin_t *tbin, szind_t binind) + tcache_bin_t *tbin, szind_t binind, bool *tcache_success) { void *ret; @@ -80,7 +80,7 @@ tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache, tcache->prof_accumbytes : 0); if (config_prof) tcache->prof_accumbytes = 0; - ret = tcache_alloc_easy(tbin); + ret = tcache_alloc_easy(tbin, tcache_success); return (ret); } @@ -102,7 +102,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( - tbin->avail[0]); + *(tbin->avail - 1)); arena_t *bin_arena = extent_node_arena_get(&chunk->node); arena_bin_t *bin = &bin_arena->bins[binind]; @@ -122,7 +122,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, } ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = tbin->avail[i]; + ptr = *(tbin->avail - 1 - i); assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (extent_node_arena_get(&chunk->node) == bin_arena) { @@ -139,7 +139,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, * locked. Stash the object, so that it can be * handled in a future pass. */ - tbin->avail[ndeferred] = ptr; + *(tbin->avail - 1 - ndeferred) = ptr; ndeferred++; } } @@ -158,8 +158,8 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, malloc_mutex_unlock(&bin->lock); } - memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], - rem * sizeof(void *)); + memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem * + sizeof(void *)); tbin->ncached = rem; if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -182,7 +182,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( - tbin->avail[0]); + *(tbin->avail - 1)); arena_t *locked_arena = extent_node_arena_get(&chunk->node); UNUSED bool idump; @@ -206,7 +206,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, } ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = tbin->avail[i]; + ptr = *(tbin->avail - 1 - i); assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (extent_node_arena_get(&chunk->node) == @@ -220,7 +220,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, * Stash the object, so that it can be handled * in a future pass. 
*/ - tbin->avail[ndeferred] = ptr; + *(tbin->avail - 1 - ndeferred) = ptr; ndeferred++; } } @@ -241,8 +241,8 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, malloc_mutex_unlock(&arena->lock); } - memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], - rem * sizeof(void *)); + memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem * + sizeof(void *)); tbin->ncached = rem; if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -333,9 +333,14 @@ tcache_create(tsd_t *tsd, arena_t *arena) assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nhbins; i++) { tcache->tbins[i].lg_fill_div = 1; + stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); + /* + * avail points past the available space. Allocations will + * access the slots toward higher addresses (for the benefit of + * prefetch). + */ tcache->tbins[i].avail = (void **)((uintptr_t)tcache + (uintptr_t)stack_offset); - stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } return (tcache); @@ -379,7 +384,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) arena_prof_accum(arena, tcache->prof_accumbytes)) prof_idump(); - idalloctm(tsd, tcache, false, true); + idalloctm(tsd, tcache, false, true, true); } void