diff --git a/ChangeLog b/ChangeLog index 33139f9b..b6fa3661 100644 --- a/ChangeLog +++ b/ChangeLog @@ -101,6 +101,9 @@ found in the git revision history: run fragmentation, smaller runs reduce external fragmentation for small size classes, and packed (less uniformly aligned) metadata layout improves CPU cache set distribution. + - Randomly distribute large allocation base pointer alignment relative to page + boundaries in order to more uniformly utilize CPU cache sets. This can be + disabled via the --disable-cache-oblivious configure option. - Micro-optimize the fast paths for the public API functions. - Refactor thread-specific data to reside in a single structure. This assures that only a single TLS read is necessary per call into the public API. diff --git a/INSTALL b/INSTALL index cd760ca2..8d396874 100644 --- a/INSTALL +++ b/INSTALL @@ -185,6 +185,15 @@ any of the following arguments (not a definitive list) to 'configure': thread-local variables via the __thread keyword. If TLS is available, jemalloc uses it for several purposes. +--disable-cache-oblivious + Disable cache-oblivious large allocation alignment for large allocation + requests with no alignment constraints. If this feature is disabled, all + large allocations are page-aligned as an implementation artifact, which can + severely harm CPU cache utilization. However, the cache-oblivious layout + comes at the cost of one extra page per large allocation, which in the + most extreme case increases physical memory usage for the 16 KiB size class + to 20 KiB. + --with-xslroot= Specify where to find DocBook XSL stylesheets when building the documentation. diff --git a/configure.ac b/configure.ac index 05c0d56d..bb6f3a32 100644 --- a/configure.ac +++ b/configure.ac @@ -952,6 +952,23 @@ if test "x$enable_xmalloc" = "x1" ; then fi AC_SUBST([enable_xmalloc]) +dnl Support cache-oblivious allocation alignment by default. +AC_ARG_ENABLE([cache-oblivious], + [AS_HELP_STRING([--disable-cache-oblivious], + [Disable support for cache-oblivious allocation alignment])], +[if test "x$enable_cache_oblivious" = "xno" ; then + enable_cache_oblivious="0" +else + enable_cache_oblivious="1" +fi +], +[enable_cache_oblivious="1"] +) +if test "x$enable_cache_oblivious" = "x1" ; then + AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ]) +fi +AC_SUBST([enable_cache_oblivious]) + dnl ============================================================================ dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. dnl One of those two functions should (theoretically) exist on all platforms @@ -1663,4 +1680,5 @@ AC_MSG_RESULT([xmalloc : ${enable_xmalloc}]) AC_MSG_RESULT([munmap : ${enable_munmap}]) AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) AC_MSG_RESULT([tls : ${enable_tls}]) +AC_MSG_RESULT([cache-oblivious : ${enable_cache_oblivious}]) AC_MSG_RESULT([===============================================================================]) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index dff99fb4..fba1b81f 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -290,6 +290,12 @@ struct arena_s { uint64_t prof_accumbytes; + /* + * PRNG state for cache index randomization of large allocation base + * pointers. 
+ */ + uint64_t offset_state; + dss_prec_t dss_prec; /* @@ -394,7 +400,15 @@ struct arena_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -extern ssize_t opt_lg_dirty_mult; +static const size_t large_pad = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + PAGE +#else + 0 +#endif + ; + +extern ssize_t opt_lg_dirty_mult; extern arena_bin_info_t arena_bin_info[NBINS]; @@ -475,7 +489,7 @@ void arena_stats_merge(arena_t *arena, const char **dss, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); arena_t *arena_new(unsigned ind); -void arena_boot(void); +bool arena_boot(void); void arena_prefork(arena_t *arena); void arena_postfork_parent(arena_t *arena); void arena_postfork_child(arena_t *arena); @@ -721,7 +735,7 @@ arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0); assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags); arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags); @@ -734,7 +748,7 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0); arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK)); } @@ -747,7 +761,7 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t mapbits = arena_mapbitsp_read(mapbitsp); size_t unzeroed; - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert((flags & CHUNK_MAP_DIRTY) == flags); unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags @@ -762,7 +776,8 @@ arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, size_t mapbits = arena_mapbitsp_read(mapbitsp); assert(binind <= BININD_INVALID); - assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS); + assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS + + large_pad); arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) | (binind << CHUNK_MAP_BININD_SHIFT)); } @@ -1107,13 +1122,16 @@ arena_salloc(const void *ptr, bool demote) * end up looking at binind to determine that ptr is a * small allocation. 
*/ - assert(((uintptr_t)ptr & PAGE_MASK) == 0); - ret = arena_mapbits_large_size_get(chunk, pageind); + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); + ret = arena_mapbits_large_size_get(chunk, pageind) - + large_pad; assert(ret != 0); - assert(pageind + (ret>>LG_PAGE) <= chunk_npages); + assert(pageind + ((ret+large_pad)>>LG_PAGE) <= + chunk_npages); assert(arena_mapbits_dirty_get(chunk, pageind) == arena_mapbits_dirty_get(chunk, - pageind+(ret>>LG_PAGE)-1)); + pageind+((ret+large_pad)>>LG_PAGE)-1)); } else { /* * Small allocation (possibly promoted to a large @@ -1157,11 +1175,13 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) size_t size = arena_mapbits_large_size_get(chunk, pageind); - assert(((uintptr_t)ptr & PAGE_MASK) == 0); + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); - if (likely(tcache != NULL) && size <= tcache_maxclass) - tcache_dalloc_large(tsd, tcache, ptr, size); - else { + if (likely(tcache != NULL) && size <= tcache_maxclass) { + tcache_dalloc_large(tsd, tcache, ptr, size - + large_pad); + } else { arena_dalloc_large(extent_node_arena_get( &chunk->node), chunk, ptr); } @@ -1188,7 +1208,7 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) */ assert(((uintptr_t)ptr & PAGE_MASK) == 0); size = arena_mapbits_large_size_get(chunk, - pageind); + pageind) - large_pad; } } assert(s2u(size) == s2u(arena_salloc(ptr, false))); @@ -1205,7 +1225,8 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) &chunk->node), chunk, ptr, pageind); } } else { - assert(((uintptr_t)ptr & PAGE_MASK) == 0); + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); if (likely(tcache != NULL) && size <= tcache_maxclass) tcache_dalloc_large(tsd, tcache, ptr, size); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index b398f31e..910ebf76 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -126,6 +126,13 @@ static const bool config_ivsalloc = false #endif ; +static const bool config_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; #ifdef JEMALLOC_C11ATOMICS #include diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index a943d23c..ed8347a7 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -192,6 +192,12 @@ */ #undef JEMALLOC_IVSALLOC +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#undef JEMALLOC_CACHE_OBLIVIOUS + /* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ diff --git a/include/jemalloc/internal/prng.h b/include/jemalloc/internal/prng.h index c6b17972..216d0ef4 100644 --- a/include/jemalloc/internal/prng.h +++ b/include/jemalloc/internal/prng.h @@ -26,22 +26,22 @@ * const uint32_t a, c : See above discussion. */ #define prng32(r, lg_range, state, a, c) do { \ - assert(lg_range > 0); \ - assert(lg_range <= 32); \ + assert((lg_range) > 0); \ + assert((lg_range) <= 32); \ \ r = (state * (a)) + (c); \ state = r; \ - r >>= (32 - lg_range); \ + r >>= (32 - (lg_range)); \ } while (false) /* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. 
*/ #define prng64(r, lg_range, state, a, c) do { \ - assert(lg_range > 0); \ - assert(lg_range <= 64); \ + assert((lg_range) > 0); \ + assert((lg_range) <= 64); \ \ r = (state * (a)) + (c); \ state = r; \ - r >>= (64 - lg_range); \ + r >>= (64 - (lg_range)); \ } while (false) #endif /* JEMALLOC_H_TYPES */ diff --git a/src/arena.c b/src/arena.c index 30410683..a053adf0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -12,6 +12,8 @@ size_t map_bias; size_t map_misc_offset; size_t arena_maxrun; /* Max run size for arenas. */ size_t arena_maxclass; /* Max size class for arenas. */ +static size_t small_maxrun; /* Max run size used for small size classes. */ +static bool *small_run_tab; /* Valid small run page multiples. */ unsigned nlclasses; /* Number of large size classes. */ unsigned nhclasses; /* Number of huge size classes. */ @@ -56,33 +58,102 @@ arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_misc_t, rb_link, arena_run_comp) +static size_t +run_quantize(size_t size) +{ + size_t qsize; + + assert(size != 0); + assert(size == PAGE_CEILING(size)); + + /* Don't change sizes that are valid small run sizes. */ + if (size <= small_maxrun && small_run_tab[size >> LG_PAGE]) + return (size); + + /* + * Round down to the nearest run size that can actually be requested + * during normal large allocation. Add large_pad so that cache index + * randomization can offset the allocation from the page boundary. + */ + qsize = index2size(size2index(size - large_pad + 1) - 1) + large_pad; + if (qsize <= SMALL_MAXCLASS + large_pad) + return (run_quantize(size - large_pad)); + assert(qsize <= size); + return (qsize); +} + +static size_t +run_quantize_next(size_t size) +{ + size_t large_run_size_next; + + assert(size != 0); + assert(size == PAGE_CEILING(size)); + + /* + * Return the next quantized size greater than the input size. + * Quantized sizes comprise the union of run sizes that back small + * region runs, and run sizes that back large regions with no explicit + * alignment constraints. + */ + + if (size > SMALL_MAXCLASS) { + large_run_size_next = PAGE_CEILING(index2size(size2index(size - + large_pad) + 1) + large_pad); + } else + large_run_size_next = SIZE_T_MAX; + if (size >= small_maxrun) + return (large_run_size_next); + + while (true) { + size += PAGE; + assert(size <= small_maxrun); + if (small_run_tab[size >> LG_PAGE]) { + if (large_run_size_next < size) + return (large_run_size_next); + return (size); + } + } +} + +static size_t +run_quantize_first(size_t size) +{ + size_t qsize = run_quantize(size); + + if (qsize < size) { + /* + * Skip a quantization that may have an adequately large run, + * because under-sized runs may be mixed in. This only happens + * when an unusual size is requested, i.e. for aligned + * allocation, and is just one of several places where linear + * search would potentially find sufficiently aligned available + * memory somewhere lower. 
+ */ + qsize = run_quantize_next(size); + } + return (qsize); +} + JEMALLOC_INLINE_C int arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { int ret; uintptr_t a_miscelm = (uintptr_t)a; - size_t a_size; - size_t b_size = arena_miscelm_to_bits(b) & ~PAGE_MASK; - index_t a_index, b_index; + size_t a_qsize; + size_t b_qsize = run_quantize(arena_miscelm_to_bits(b) & ~PAGE_MASK); if (a_miscelm & CHUNK_MAP_KEY) { - a_size = a_miscelm & ~PAGE_MASK; - assert(a_size == s2u(a_size)); + size_t a_size = a_miscelm & ~PAGE_MASK; + a_qsize = run_quantize(a_size); } else - a_size = arena_miscelm_to_bits(a) & ~PAGE_MASK; + a_qsize = run_quantize(arena_miscelm_to_bits(a) & ~PAGE_MASK); /* - * Compute the index of the largest size class that the run can satisfy - * a request for. + * Compare based on quantized size rather than size, in order to sort + * equally useful runs only by address. */ - a_index = size2index(a_size + 1) - 1; - b_index = size2index(b_size + 1) - 1; - - /* - * Compare based on size class index rather than size, in order to - * sort equally useful runs only by address. - */ - ret = (a_index > b_index) - (a_index < b_index); + ret = (a_qsize > b_qsize) - (a_qsize < b_qsize); if (ret == 0) { if (!(a_miscelm & CHUNK_MAP_KEY)) { uintptr_t b_miscelm = (uintptr_t)b; @@ -913,7 +984,7 @@ static arena_run_t * arena_run_first_fit(arena_t *arena, size_t size) { arena_run_t *run; - index_t index, max_index; + size_t search_size, max_size; assert(size == s2u(size)); assert(size == PAGE_CEILING(size)); @@ -924,14 +995,14 @@ arena_run_first_fit(arena_t *arena, size_t size) * and choose the lowest of the runs found. */ run = NULL; - for (index = size2index(size), max_index = size2index(arena_maxclass); - index <= max_index;) { + for (search_size = run_quantize_first(size), max_size = + run_quantize(arena_maxclass + large_pad); search_size <= max_size; + search_size = run_quantize_next(search_size)) { arena_run_t *currun; arena_chunk_t *currun_chunk; size_t currun_pageind, currun_size; - size_t usize = PAGE_CEILING(index2size(index)); - arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *)(usize | - CHUNK_MAP_KEY); + arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *) + (search_size | CHUNK_MAP_KEY); arena_chunk_map_misc_t *miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm == NULL) @@ -939,12 +1010,13 @@ arena_run_first_fit(arena_t *arena, size_t size) currun = &miscelm->run; if (run == NULL || (uintptr_t)currun < (uintptr_t)run) run = currun; + /* Skip iteration(s) if run is larger than the search size. */ currun_chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(currun); currun_pageind = arena_miscelm_to_pageind(miscelm); currun_size = arena_mapbits_unallocated_size_get(currun_chunk, currun_pageind); - assert(size2index(currun_size) + 1 > index); - index = size2index(currun_size) + 1; + assert(currun_size >= search_size); + search_size = currun_size; } return (run); @@ -966,7 +1038,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) arena_run_t *run; assert(size <= arena_maxrun); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); /* Search the arena's chunks for the lowest best fit. 
 	 */
 	run = arena_run_alloc_large_helper(arena, size, zero);
@@ -994,7 +1066,7 @@
 static arena_run_t *
 arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind)
 {
-	arena_run_t *run = arena_run_first_fit(arena, PAGE_CEILING(size));
+	arena_run_t *run = arena_run_first_fit(arena, size);
 	if (run != NULL)
 		arena_run_split_small(arena, run, size, binind);
 	return (run);
@@ -1007,7 +1079,7 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind)
 {
 	arena_run_t *run;
 	assert(size <= arena_maxrun);
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 	assert(binind != BININD_INVALID);
 	/* Search the arena's chunks for the lowest best fit. */
@@ -1965,6 +2037,8 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 {
 	void *ret;
 	size_t usize;
+	uint64_t r;
+	uintptr_t random_offset;
 	arena_run_t *run;
 	arena_chunk_map_misc_t *miscelm;
 	UNUSED bool idump;
@@ -1972,13 +2046,25 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 	/* Large allocation. */
 	usize = s2u(size);
 	malloc_mutex_lock(&arena->lock);
-	run = arena_run_alloc_large(arena, usize, zero);
+	if (config_cache_oblivious) {
+		/*
+		 * Compute a uniformly distributed offset within the first page
+		 * that is a multiple of the cacheline size, e.g. [0 .. 64) * 64
+		 * for 4 KiB pages and 64-byte cachelines.
+		 */
+		prng64(r, LG_PAGE - LG_CACHELINE, arena->offset_state,
+		    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
+		random_offset = ((uintptr_t)r) << LG_CACHELINE;
+	} else
+		random_offset = 0;
+	run = arena_run_alloc_large(arena, usize + large_pad, zero);
 	if (run == NULL) {
 		malloc_mutex_unlock(&arena->lock);
 		return (NULL);
 	}
 	miscelm = arena_run_to_miscelm(run);
-	ret = arena_miscelm_to_rpages(miscelm);
+	ret = (void *)((uintptr_t)arena_miscelm_to_rpages(miscelm) +
+	    random_offset);
 	if (config_stats) {
 		index_t index = size2index(usize) - NBINS;
@@ -2019,14 +2105,14 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 	arena_chunk_map_misc_t *miscelm;
 	void *rpages;
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 	arena = arena_choose(tsd, arena);
 	if (unlikely(arena == NULL))
 		return (NULL);
 	alignment = PAGE_CEILING(alignment);
-	alloc_size = size + alignment - PAGE;
+	alloc_size = size + large_pad + alignment - PAGE;
 	malloc_mutex_lock(&arena->lock);
 	run = arena_run_alloc_large(arena, alloc_size, false);
@@ -2041,7 +2127,7 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 	leadsize = ALIGNMENT_CEILING((uintptr_t)rpages, alignment) -
 	    (uintptr_t)rpages;
 	assert(alloc_size >= leadsize + size);
-	trailsize = alloc_size - leadsize - size;
+	trailsize = alloc_size - leadsize - size - large_pad;
 	if (leadsize != 0) {
 		arena_chunk_map_misc_t *head_miscelm = miscelm;
 		arena_run_t *head_run = run;
@@ -2055,10 +2141,10 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 		    alloc_size - leadsize);
 	}
 	if (trailsize != 0) {
-		arena_run_trim_tail(arena, chunk, run, size + trailsize, size,
-		    false);
+		arena_run_trim_tail(arena, chunk, run, size + large_pad +
+		    trailsize, size + large_pad, false);
 	}
-	arena_run_init_large(arena, run, size, zero);
+	arena_run_init_large(arena, run, size + large_pad, zero);
 	ret = arena_miscelm_to_rpages(miscelm);
 	if (config_stats) {
@@ -2088,7 +2174,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
 {
 	void *ret;
-	if (usize <= SMALL_MAXCLASS && alignment < PAGE)
+	if (usize <=
SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE + && (usize & PAGE_MASK) == 0))) ret = arena_malloc(tsd, arena, usize, zero, tcache); else { if (likely(usize <= arena_maxclass)) { @@ -2292,7 +2379,8 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run = &miscelm->run; if (config_fill || config_stats) { - size_t usize = arena_mapbits_large_size_get(chunk, pageind); + size_t usize = arena_mapbits_large_size_get(chunk, pageind) - + large_pad; if (!junked) arena_dalloc_junk_large(ptr, usize); @@ -2341,7 +2429,8 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, * allocations. */ malloc_mutex_lock(&arena->lock); - arena_run_trim_tail(arena, chunk, run, oldsize, size, true); + arena_run_trim_tail(arena, chunk, run, oldsize + large_pad, size + + large_pad, true); if (config_stats) { index_t oldindex = size2index(oldsize) - NBINS; index_t index = size2index(size) - NBINS; @@ -2370,7 +2459,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t followsize; size_t usize_min = s2u(size); - assert(oldsize == arena_mapbits_large_size_get(chunk, pageind)); + assert(oldsize == arena_mapbits_large_size_get(chunk, pageind) - + large_pad); /* Try to extend the run. */ assert(usize_min > oldsize); @@ -2391,7 +2481,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, while (oldsize + followsize < usize) usize = index2size(size2index(usize)-1); assert(usize >= usize_min); - splitsize = usize - oldsize; + splitsize = usize - oldsize + large_pad; run = &arena_miscelm_get(chunk, pageind+npages)->run; arena_run_split_large(arena, run, splitsize, zero); @@ -2755,6 +2845,18 @@ arena_new(unsigned ind) if (config_prof) arena->prof_accumbytes = 0; + if (config_cache_oblivious) { + /* + * A nondeterministic seed based on the address of arena reduces + * the likelihood of lockstep non-uniform cache index + * utilization among identical concurrent processes, but at the + * cost of test repeatability. For debug builds, instead use a + * deterministic seed. + */ + arena->offset_state = config_debug ? 
ind : + (uint64_t)(uintptr_t)arena; + } + arena->dss_prec = chunk_dss_prec_get(); arena->spare = NULL; @@ -2890,6 +2992,9 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info) bin_info->reg0_offset = actual_run_size - (actual_nregs * bin_info->reg_interval) - pad_size + bin_info->redzone_size; + if (actual_run_size > small_maxrun) + small_maxrun = actual_run_size; + assert(bin_info->reg0_offset - bin_info->redzone_size + (bin_info->nregs * bin_info->reg_interval) + pad_size == bin_info->run_size); } @@ -2899,7 +3004,7 @@ bin_info_init(void) { arena_bin_info_t *bin_info; -#define BIN_INFO_INIT_bin_yes(index, size) \ +#define BIN_INFO_INIT_bin_yes(index, size) \ bin_info = &arena_bin_info[index]; \ bin_info->reg_size = size; \ bin_info_run_size_calc(bin_info); \ @@ -2913,7 +3018,33 @@ bin_info_init(void) #undef SC } -void +static bool +small_run_size_init(void) +{ + + assert(small_maxrun != 0); + + small_run_tab = (bool *)base_alloc(sizeof(bool) * (small_maxrun >> + LG_PAGE)); + if (small_run_tab == NULL) + return (true); + +#define TAB_INIT_bin_yes(index, size) { \ + arena_bin_info_t *bin_info = &arena_bin_info[index]; \ + small_run_tab[bin_info->run_size >> LG_PAGE] = true; \ + } +#define TAB_INIT_bin_no(index, size) +#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ + TAB_INIT_bin_##bin(index, (ZU(1)< b_index) - (a_index < b_index); + /* + * Compare based on quantized size rather than size, in order to sort + * equally useful extents only by address. + */ + ret = (a_qsize > b_qsize) - (a_qsize < b_qsize); if (ret == 0) { uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a); uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b); diff --git a/src/jemalloc.c b/src/jemalloc.c index a2d1c5c2..7f266526 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1182,7 +1182,8 @@ malloc_init_hard_a0_locked(void) return (true); if (config_prof) prof_boot1(); - arena_boot(); + if (arena_boot()) + return (true); if (config_tcache && tcache_boot()) return (true); if (malloc_mutex_init(&arenas_lock))
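
A few notes on the mechanics the patch relies on follow, each with a small standalone sketch; none of the code below is part of the patch itself. First, the cache-oblivious layout described in the INSTALL text boils down to two compile-time constants, config_cache_oblivious and large_pad. The sketch below illustrates that gating pattern with JEMALLOC_CACHE_OBLIVIOUS and PAGE defined locally as stand-ins (they are not the real jemalloc definitions), and prints the worst case mentioned in INSTALL, where a 16 KiB request is backed by a 20 KiB run.

#include <stddef.h>
#include <stdio.h>

/*
 * Standalone illustration of the compile-time gating used above.
 * JEMALLOC_CACHE_OBLIVIOUS and PAGE are local stand-ins, not the real
 * jemalloc definitions; comment the #define out to model
 * --disable-cache-oblivious.
 */
#define JEMALLOC_CACHE_OBLIVIOUS
#define PAGE	((size_t)4096)

static const size_t large_pad =
#ifdef JEMALLOC_CACHE_OBLIVIOUS
    PAGE
#else
    0
#endif
    ;

int
main(void)
{
	/* Worst case from INSTALL: a 16 KiB request backed by a padded run. */
	size_t usize = (size_t)16 * 1024;

	printf("large_pad = %zu, run footprint = %zu KiB\n", large_pad,
	    (usize + large_pad) / 1024);
	return (0);
}

Because large_pad is a compile-time constant, the extra arithmetic added to the allocation paths should reduce to plain constants (and disappear entirely) when the option is disabled.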
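The prng64() macro exercised in arena_malloc_large() is a linear congruential generator whose high bits are kept, since the low bits of an LCG are the least random. A minimal function-style sketch of the same computation, using the Knuth/MMIX multiplier and increment from the call site, might look as follows; lcg64_bits() is a hypothetical helper name, not a jemalloc API.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Advance a 64-bit linear congruential generator and keep the top lg_range
 * bits of the new state, mirroring what a prng64()-style macro computes.
 */
static uint64_t
lcg64_bits(uint64_t *state, unsigned lg_range)
{

	assert(lg_range > 0 && lg_range <= 64);
	*state = (*state * UINT64_C(6364136223846793005)) +
	    UINT64_C(1442695040888963407);
	return (*state >> (64 - lg_range));
}

int
main(void)
{
	uint64_t state = 1;
	unsigned i;

	/* Six bits per call: uniformly distributed values in [0, 64). */
	for (i = 0; i < 8; i++)
		printf("%u\n", (unsigned)lcg64_bits(&state, 6));
	return (0);
}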
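Putting the pieces together, the randomization needs only LG_PAGE - LG_CACHELINE bits per large allocation. The sketch below assumes 4 KiB pages and 64-byte cache lines (the example values used in the patch comment); "state" stands in for arena->offset_state, and the assertions show why one page of padding (large_pad) is enough to absorb every possible offset.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the offset computation in arena_malloc_large(), assuming
 * LG_PAGE = 12 and LG_CACHELINE = 6.
 */
#define LG_PAGE		12
#define LG_CACHELINE	6
#define PAGE		((uintptr_t)1 << LG_PAGE)
#define CACHELINE	((uintptr_t)1 << LG_CACHELINE)

int
main(void)
{
	uint64_t state = 1;
	unsigned i;

	for (i = 0; i < 4; i++) {
		uint64_t r;
		uintptr_t random_offset;

		state = (state * UINT64_C(6364136223846793005)) +
		    UINT64_C(1442695040888963407);
		/* LG_PAGE - LG_CACHELINE = 6 bits: r is in [0, 64). */
		r = state >> (64 - (LG_PAGE - LG_CACHELINE));
		random_offset = ((uintptr_t)r) << LG_CACHELINE;

		/* The offset is a cacheline multiple within the pad page. */
		assert(random_offset < PAGE);
		assert((random_offset & (CACHELINE - 1)) == 0);
		printf("offset = %zu bytes\n", (size_t)random_offset);
	}
	return (0);
}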
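The run_quantize() machinery exists so that the runs_avail tree orders free runs by a quantized size first and by address second; runs that can satisfy the same set of requests are then handed out lowest-address-first, which keeps the first-fit search in arena_run_first_fit() cheap. The sketch below illustrates only the ordering idea: quantize() is a hypothetical power-of-two stand-in, whereas the real code quantizes to jemalloc's run and size-class boundaries via run_quantize().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	size_t		size;	/* usable size of the free run */
	uintptr_t	addr;	/* base address of the run */
} avail_run_t;

/* Hypothetical stand-in: round down to a power of two. */
static size_t
quantize(size_t size)
{
	size_t q = 1;

	while ((q << 1) <= size)
		q <<= 1;
	return (q);
}

/* Order by quantized size, then by address, as arena_avail_comp() does. */
static int
avail_run_comp(const void *ap, const void *bp)
{
	const avail_run_t *a = ap, *b = bp;
	size_t a_qsize = quantize(a->size);
	size_t b_qsize = quantize(b->size);
	int ret = (a_qsize > b_qsize) - (a_qsize < b_qsize);

	if (ret == 0)
		ret = (a->addr > b->addr) - (a->addr < b->addr);
	return (ret);
}

int
main(void)
{
	/* Two 4-page runs and one 5-page run; all quantize to 16 KiB here. */
	avail_run_t runs[] = {
		{ 5 * 4096, 0x30000 },
		{ 4 * 4096, 0x50000 },
		{ 4 * 4096, 0x10000 },
	};
	size_t i;

	qsort(runs, sizeof(runs) / sizeof(runs[0]), sizeof(runs[0]),
	    avail_run_comp);
	for (i = 0; i < sizeof(runs) / sizeof(runs[0]); i++) {
		printf("%zu bytes @ 0x%lx\n", runs[i].size,
		    (unsigned long)runs[i].addr);
	}
	return (0);
}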
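Finally, the seeding policy in arena_new() trades repeatability for independence between processes: debug builds seed offset_state with the arena index so tests are reproducible, while release builds derive the seed from the arena's own address so that identical concurrent processes are unlikely to walk the same offset sequence in lockstep. A small stand-in sketch (arena_s and config_debug here are local copies, not the real definitions):

#include <stdint.h>
#include <stdio.h>

typedef struct arena_s {
	uint64_t	offset_state;
} arena_t;

static const int config_debug = 0;

static void
arena_offset_state_init(arena_t *arena, unsigned ind)
{

	/* Deterministic in debug builds, address-derived otherwise. */
	arena->offset_state = config_debug ? (uint64_t)ind :
	    (uint64_t)(uintptr_t)arena;
}

int
main(void)
{
	arena_t a0, a1;

	arena_offset_state_init(&a0, 0);
	arena_offset_state_init(&a1, 1);
	printf("%llx %llx\n", (unsigned long long)a0.offset_state,
	    (unsigned long long)a1.offset_state);
	return (0);
}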