From 3fa9a2fad83a3014d5069b5a2530a0cfb8d8d197 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 7 Mar 2010 15:34:14 -0800 Subject: [PATCH] Simplify tcache object caching. Use chains of cached objects, rather than using arrays of pointers. Since tcache_bin_t is no longer dynamically sized, convert tcache_t's tbin to an array of structures, rather than an array of pointers. This implicitly removes tcache_bin_{create,destroy}(), which further simplifies the fast path for malloc/free. Use cacheline alignment for tcache_t allocations. Remove runtime configuration option for number of tcache bin slots, and replace it with a boolean option for enabling/disabling tcache. Limit the number of tcache objects to the lesser of TCACHE_NSLOTS_MAX and 2X the number of regions per run for the size class. For GC-triggered flush, discard 3/4 of the objects below the low water mark, rather than 1/2. --- jemalloc/doc/jemalloc.3.in | 11 +- jemalloc/include/jemalloc/internal/arena.h | 6 +- jemalloc/include/jemalloc/internal/tcache.h | 107 ++++------- jemalloc/src/arena.c | 60 +++--- jemalloc/src/ctl.c | 6 +- jemalloc/src/jemalloc.c | 9 +- jemalloc/src/stats.c | 21 +-- jemalloc/src/tcache.c | 197 ++++++++------------ 8 files changed, 171 insertions(+), 246 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 272a5b9b..197e68c3 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -376,8 +376,7 @@ will disable dirty page purging. @roff_tcache@.Ev JEMALLOC_OPTIONS=14g @roff_tcache@will disable garbage collection. @roff_tcache@.It H -@roff_tcache@Double/halve the number of thread-specific cache slots per size -@roff_tcache@class. +@roff_tcache@Enable/disable thread-specific caching. @roff_tcache@When there are multiple threads, each thread uses a @roff_tcache@thread-specific cache for small and medium objects. @roff_tcache@Thread-specific caching allows many allocations to be satisfied @@ -386,11 +385,7 @@ will disable dirty page purging. @roff_tcache@See the @roff_tcache@.Dq G @roff_tcache@option for related tuning information. -@roff_tcache@The default number of cache slots is 128; -@roff_tcache@.Ev JEMALLOC_OPTIONS=7h -@roff_tcache@will disable thread-specific caching. -@roff_tcache@Note that one cache slot per size class is not a valid -@roff_tcache@configuration due to implementation details. +@roff_tcache@This option is enabled by default. @roff_prof@.It I @roff_prof@Double/halve the average interval between memory profile dumps, as @roff_prof@measured in bytes of allocation activity. @@ -773,7 +768,7 @@ option. @roff_xmalloc@option. @roff_xmalloc@.Ed .\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "opt.lg_tcache_nslots (size_t) r-" +@roff_tcache@.It Sy "opt.tcache (bool) r-" @roff_tcache@.Bd -ragged -offset indent -compact @roff_tcache@See the @roff_tcache@.Dq H diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index c4803a3b..e950d592 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -18,7 +18,11 @@ #ifdef JEMALLOC_TINY /* Smallest size class to support. 
*/ -# define LG_TINY_MIN 1 +# ifdef JEMALLOC_TCACHE +# define LG_TINY_MIN LG_SIZEOF_PTR +# else +# define LG_TINY_MIN 1 +# endif #endif /* diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index b499f525..fa238bf3 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -6,10 +6,13 @@ typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; /* - * Default number of cache slots for each bin in the thread cache (0: - * disabled). + * Absolute maximum number of cache slots for each bin in the thread cache. + * This is an additional constraint beyond that imposed as: twice the number of + * regions per run for this size class. + * + * This constant must be an even number. */ -#define LG_TCACHE_NSLOTS_DEFAULT 7 +#define TCACHE_NSLOTS_MAX 200 /* * (1U << opt_lg_tcache_gc_sweep) is the approximate number of allocation * events between full GC sweeps (-1: disabled). Integer rounding may cause @@ -29,7 +32,8 @@ struct tcache_bin_s { unsigned low_water; /* Min # cached since last GC. */ unsigned high_water; /* Max # cached since last GC. */ unsigned ncached; /* # of cached objects. */ - void *slots[1]; /* Dynamically sized. */ + unsigned ncached_max; /* Upper limit on ncached. */ + void *avail; /* Chain of available objects. */ }; struct tcache_s { @@ -42,26 +46,20 @@ struct tcache_s { arena_t *arena; /* This thread's arena. */ unsigned ev_cnt; /* Event count since incremental GC. */ unsigned next_gc_bin; /* Next bin to GC. */ - tcache_bin_t *tbins[1]; /* Dynamically sized. */ + tcache_bin_t tbins[1]; /* Dynamically sized. */ }; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -extern size_t opt_lg_tcache_nslots; +extern bool opt_tcache; extern ssize_t opt_lg_tcache_gc_sweep; /* Map of thread-specific caches. */ extern __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); -/* - * Number of cache slots for each bin in the thread cache, or 0 if tcache is - * disabled. - */ -extern size_t tcache_nslots; - /* Number of tcache allocation/deallocation events between incremental GCs. */ extern unsigned tcache_gc_incr; @@ -71,10 +69,7 @@ void tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ); tcache_t *tcache_create(arena_t *arena); -void tcache_bin_destroy(tcache_t *tcache, tcache_bin_t *tbin, - unsigned binind); void *tcache_alloc_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind); -tcache_bin_t *tcache_bin_create(arena_t *arena); void tcache_destroy(tcache_t *tcache); #ifdef JEMALLOC_STATS void tcache_stats_merge(tcache_t *tcache, arena_t *arena); @@ -99,7 +94,7 @@ tcache_get(void) { tcache_t *tcache; - if (isthreaded == false || tcache_nslots == 0) + if ((isthreaded & opt_tcache) == false) return (NULL); tcache = tcache_tls; @@ -124,37 +119,24 @@ tcache_event(tcache_t *tcache) tcache->ev_cnt++; assert(tcache->ev_cnt <= tcache_gc_incr); - if (tcache->ev_cnt >= tcache_gc_incr) { + if (tcache->ev_cnt == tcache_gc_incr) { size_t binind = tcache->next_gc_bin; - tcache_bin_t *tbin = tcache->tbins[binind]; + tcache_bin_t *tbin = &tcache->tbins[binind]; - if (tbin != NULL) { - if (tbin->high_water == 0) { - /* - * This bin went completely unused for an - * entire GC cycle, so throw away the tbin. 
- */ - assert(tbin->ncached == 0); - tcache_bin_destroy(tcache, tbin, binind); - tcache->tbins[binind] = NULL; - } else { - if (tbin->low_water > 0) { - /* - * Flush (ceiling) half of the objects - * below the low water mark. - */ - tcache_bin_flush(tbin, binind, - tbin->ncached - (tbin->low_water >> - 1) - (tbin->low_water & 1) + if (tbin->low_water > 0) { + /* + * Flush (ceiling) 3/4 of the objects below the low + * water mark. + */ + tcache_bin_flush(tbin, binind, tbin->ncached - + tbin->low_water + (tbin->low_water >> 2) #ifdef JEMALLOC_PROF - , tcache + , tcache #endif - ); - } - tbin->low_water = tbin->ncached; - tbin->high_water = tbin->ncached; - } + ); } + tbin->low_water = tbin->ncached; + tbin->high_water = tbin->ncached; tcache->next_gc_bin++; if (tcache->next_gc_bin == nbins) @@ -166,21 +148,24 @@ tcache_event(tcache_t *tcache) JEMALLOC_INLINE void * tcache_bin_alloc(tcache_bin_t *tbin) { + void *ret; if (tbin->ncached == 0) return (NULL); tbin->ncached--; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; - return (tbin->slots[tbin->ncached]); + ret = tbin->avail; + tbin->avail = *(void **)ret; + return (ret); } JEMALLOC_INLINE void * tcache_alloc(tcache_t *tcache, size_t size, bool zero) { void *ret; - tcache_bin_t *tbin; size_t binind; + tcache_bin_t *tbin; if (size <= small_maxclass) binind = small_size2bin[size]; @@ -189,14 +174,7 @@ tcache_alloc(tcache_t *tcache, size_t size, bool zero) lg_mspace); } assert(binind < nbins); - tbin = tcache->tbins[binind]; - if (tbin == NULL) { - tbin = tcache_bin_create(tcache->arena); - if (tbin == NULL) - return (NULL); - tcache->tbins[binind] = tbin; - } - + tbin = &tcache->tbins[binind]; ret = tcache_bin_alloc(tbin); if (ret == NULL) { ret = tcache_alloc_hard(tcache, tbin, binind); @@ -250,29 +228,20 @@ tcache_dalloc(tcache_t *tcache, void *ptr) #ifdef JEMALLOC_FILL if (opt_junk) - memset(ptr, 0x5a, arena->bins[binind].reg_size); + memset(ptr, 0x5a, bin->reg_size); #endif - tbin = tcache->tbins[binind]; - if (tbin == NULL) { - tbin = tcache_bin_create(choose_arena()); - if (tbin == NULL) { - malloc_mutex_lock(&arena->lock); - arena_dalloc_bin(arena, chunk, ptr, mapelm); - malloc_mutex_unlock(&arena->lock); - return; - } - tcache->tbins[binind] = tbin; - } - - if (tbin->ncached == tcache_nslots) - tcache_bin_flush(tbin, binind, (tcache_nslots >> 1) + tbin = &tcache->tbins[binind]; + if (tbin->ncached == tbin->ncached_max) { + tcache_bin_flush(tbin, binind, (tbin->ncached_max >> 1) #ifdef JEMALLOC_PROF , tcache #endif ); - assert(tbin->ncached < tcache_nslots); - tbin->slots[tbin->ncached] = ptr; + } + assert(tbin->ncached < tbin->ncached_max); + *(void **)ptr = tbin->avail; + tbin->avail = ptr; tbin->ncached++; if (tbin->ncached > tbin->high_water) tbin->high_water = tbin->ncached; diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 248fcddf..1153a0b5 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -53,13 +53,26 @@ static malloc_mutex_t purge_lock; static const uint8_t const_small_size2bin[STATIC_PAGE_SIZE - 255] = { S2B_1(0xffU) /* 0 */ #if (LG_QUANTUM == 4) -/* 64-bit system ************************/ +/* 16-byte quantum **********************/ # ifdef JEMALLOC_TINY - S2B_2(0) /* 2 */ - S2B_2(1) /* 4 */ - S2B_4(2) /* 8 */ - S2B_8(3) /* 16 */ +# if (LG_TINY_MIN == 1) + S2B_2(0) /* 2 */ + S2B_2(1) /* 4 */ + S2B_4(2) /* 8 */ + S2B_8(3) /* 16 */ # define S2B_QMIN 3 +# elif (LG_TINY_MIN == 2) + S2B_4(0) /* 4 */ + S2B_4(1) /* 8 */ + S2B_8(2) /* 16 */ +# define S2B_QMIN 2 +# elif (LG_TINY_MIN == 
3) + S2B_8(0) /* 8 */ + S2B_8(1) /* 16 */ +# define S2B_QMIN 1 +# else +# error "Unsupported LG_TINY_MIN" +# endif # else S2B_16(0) /* 16 */ # define S2B_QMIN 0 @@ -73,12 +86,20 @@ static const uint8_t const_small_size2bin[STATIC_PAGE_SIZE - 255] = { S2B_16(S2B_QMIN + 7) /* 128 */ # define S2B_CMIN (S2B_QMIN + 8) #else -/* 32-bit system ************************/ +/* 8-byte quantum ***********************/ # ifdef JEMALLOC_TINY - S2B_2(0) /* 2 */ - S2B_2(1) /* 4 */ - S2B_4(2) /* 8 */ +# if (LG_TINY_MIN == 1) + S2B_2(0) /* 2 */ + S2B_2(1) /* 4 */ + S2B_4(2) /* 8 */ # define S2B_QMIN 2 +# elif (LG_TINY_MIN == 2) + S2B_4(0) /* 4 */ + S2B_4(1) /* 8 */ +# define S2B_QMIN 1 +# else +# error "Unsupported LG_TINY_MIN" +# endif # else S2B_8(0) /* 8 */ # define S2B_QMIN 0 @@ -1048,28 +1069,15 @@ arena_tcache_fill(arena_t *arena, tcache_bin_t *tbin, size_t binind #ifdef JEMALLOC_PROF arena_prof_accum(arena, prof_accumbytes); #endif - for (i = 0, nfill = (tcache_nslots >> 1); i < nfill; i++) { + for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_bin_malloc_easy(arena, bin, run); else ptr = arena_bin_malloc_hard(arena, bin); - if (ptr == NULL) { - if (i > 0) { - /* - * Move valid pointers to the base of - * tbin->slots. - */ - memmove(&tbin->slots[0], - &tbin->slots[nfill - i], - i * sizeof(void *)); - } + if (ptr == NULL) break; - } - /* - * Fill slots such that the objects lowest in memory come last. - * This causes tcache to use low objects first. - */ - tbin->slots[nfill - 1 - i] = ptr; + *(void **)ptr = tbin->avail; + tbin->avail = ptr; } #ifdef JEMALLOC_STATS bin->stats.nfills++; diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 33ddbb51..28e9368c 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -64,7 +64,7 @@ CTL_PROTO(opt_xmalloc) CTL_PROTO(opt_zero) #endif #ifdef JEMALLOC_TCACHE -CTL_PROTO(opt_lg_tcache_nslots) +CTL_PROTO(opt_tcache) CTL_PROTO(opt_lg_tcache_gc_sweep) #endif #ifdef JEMALLOC_PROF @@ -230,7 +230,7 @@ static const ctl_node_t opt_node[] = { {NAME("zero"), CTL(opt_zero)}, #endif #ifdef JEMALLOC_TCACHE - {NAME("lg_tcache_nslots"), CTL(opt_lg_tcache_nslots)}, + {NAME("tcache"), CTL(opt_tcache)}, {NAME("lg_tcache_gc_sweep"), CTL(opt_lg_tcache_gc_sweep)}, #endif #ifdef JEMALLOC_PROF @@ -1070,7 +1070,7 @@ CTL_RO_GEN(opt_xmalloc, opt_xmalloc, bool) CTL_RO_GEN(opt_zero, opt_zero, bool) #endif #ifdef JEMALLOC_TCACHE -CTL_RO_GEN(opt_lg_tcache_nslots, opt_lg_tcache_nslots, size_t) +CTL_RO_GEN(opt_tcache, opt_tcache, bool) CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t) #endif #ifdef JEMALLOC_PROF diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index dc7879f4..49c2b0b0 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -482,13 +482,10 @@ MALLOC_OUT: opt_lg_tcache_gc_sweep++; break; case 'h': - if (opt_lg_tcache_nslots > 0) - opt_lg_tcache_nslots--; + opt_tcache = false; break; case 'H': - if (opt_lg_tcache_nslots + 1 < - (sizeof(size_t) << 3)) - opt_lg_tcache_nslots++; + opt_tcache = true; break; #endif #ifdef JEMALLOC_PROF @@ -729,7 +726,7 @@ MALLOC_OUT: * default. 
*/ #ifdef JEMALLOC_TCACHE - if (tcache_nslots + if (opt_tcache # ifdef JEMALLOC_PROF /* * Profile data storage concurrency is directly linked to diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 311a5f24..236d7f87 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -440,6 +440,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0)) == 0) write_cb(cbopaque, bv ? "F" : "f"); + if ((err = JEMALLOC_P(mallctl)("opt.tcache", &bv, &bsz, NULL, + 0)) == 0) + write_cb(cbopaque, bv ? "H" : "h"); if ((err = JEMALLOC_P(mallctl)("opt.junk", &bv, &bsz, NULL, 0)) == 0) write_cb(cbopaque, bv ? "J" : "j"); @@ -550,21 +553,13 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, "Min active:dirty page ratio per arena: N/A\n"); } - if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_nslots", &sv, + if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv, &ssz, NULL, 0)) == 0) { - size_t tcache_nslots, tcache_gc_sweep; - - tcache_nslots = (1U << sv); - write_cb(cbopaque, - "Thread cache slots per size class: "); - write_cb(cbopaque, tcache_nslots ? - umax2s(tcache_nslots, 10, s) : "N/A"); - write_cb(cbopaque, "\n"); - - CTL_GET("opt.lg_tcache_gc_sweep", &ssv, ssize_t); - tcache_gc_sweep = (1U << ssv); + size_t tcache_gc_sweep = (1U << ssv); + bool tcache_enabled; + CTL_GET("opt.tcache", &tcache_enabled, bool); write_cb(cbopaque, "Thread cache GC sweep interval: "); - write_cb(cbopaque, tcache_nslots && ssv >= 0 ? + write_cb(cbopaque, tcache_enabled && ssv >= 0 ? umax2s(tcache_gc_sweep, 10, s) : "N/A"); write_cb(cbopaque, "\n"); } diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index e1b10310..dcb72c61 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -4,7 +4,7 @@ /******************************************************************************/ /* Data. */ -size_t opt_lg_tcache_nslots = LG_TCACHE_NSLOTS_DEFAULT; +bool opt_tcache = true; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; /* Map of thread-specific caches. */ @@ -16,7 +16,6 @@ __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); */ static pthread_key_t tcache_tsd; -size_t tcache_nslots; unsigned tcache_gc_incr; /******************************************************************************/ @@ -51,16 +50,14 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - arena_chunk_t *chunk; - arena_t *arena; - void *ptr; - unsigned i, ndeferred, ncached; + void *flush, *deferred, *ptr; + unsigned i, nflush, ndeferred; - for (ndeferred = tbin->ncached - rem; ndeferred > 0;) { - ncached = ndeferred; + for (flush = tbin->avail, nflush = tbin->ncached - rem; flush != NULL; + flush = deferred, nflush = ndeferred) { /* Lock the arena associated with the first object. */ - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(tbin->slots[0]); - arena = chunk->arena; + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_t *arena = chunk->arena; malloc_mutex_lock(&arena->lock); #ifdef JEMALLOC_PROF if (arena == tcache->arena) { @@ -68,9 +65,12 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem tcache->prof_accumbytes = 0; } #endif - /* Deallocate every object that belongs to the locked arena. 
*/ - for (i = ndeferred = 0; i < ncached; i++) { - ptr = tbin->slots[i]; + deferred = NULL; + ndeferred = 0; + for (i = 0; i < nflush; i++) { + ptr = flush; + assert(ptr != NULL); + flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { size_t pageind = (((uintptr_t)ptr - @@ -85,7 +85,8 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem * Stash the object, so that it can be handled * in a future pass. */ - tbin->slots[ndeferred] = ptr; + *(void **)ptr = deferred; + deferred = ptr; ndeferred++; } } @@ -105,98 +106,41 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem } #endif malloc_mutex_unlock(&arena->lock); + + if (flush != NULL) { + /* + * This was the first pass, and rem cached objects + * remain. + */ + tbin->avail = flush; + } } - if (rem > 0) { - /* - * Shift the remaining valid pointers to the base of the slots - * array. - */ - memmove(&tbin->slots[0], &tbin->slots[tbin->ncached - rem], - rem * sizeof(void *)); - } tbin->ncached = rem; } -tcache_bin_t * -tcache_bin_create(arena_t *arena) -{ - tcache_bin_t *ret; - size_t tsize; - - tsize = sizeof(tcache_bin_t) + (sizeof(void *) * (tcache_nslots - 1)); - if (tsize <= small_maxclass) - ret = (tcache_bin_t *)arena_malloc_small(arena, tsize, false); - else if (tsize <= bin_maxclass) - ret = (tcache_bin_t *)arena_malloc_medium(arena, tsize, false); - else - ret = (tcache_bin_t *)imalloc(tsize); - if (ret == NULL) - return (NULL); -#ifdef JEMALLOC_STATS - memset(&ret->tstats, 0, sizeof(tcache_bin_stats_t)); -#endif - ret->low_water = 0; - ret->high_water = 0; - ret->ncached = 0; - - return (ret); -} - -void -tcache_bin_destroy(tcache_t *tcache, tcache_bin_t *tbin, unsigned binind) -{ - arena_t *arena; - arena_chunk_t *chunk; - size_t pageind, tsize; - arena_chunk_map_t *mapelm; - - chunk = CHUNK_ADDR2BASE(tbin); - arena = chunk->arena; - pageind = (((uintptr_t)tbin - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; - -#ifdef JEMALLOC_STATS - if (tbin->tstats.nrequests != 0) { - arena_t *arena = tcache->arena; - arena_bin_t *bin = &arena->bins[binind]; - malloc_mutex_lock(&arena->lock); - bin->stats.nrequests += tbin->tstats.nrequests; - if (bin->reg_size <= small_maxclass) - arena->stats.nmalloc_small += tbin->tstats.nrequests; - else - arena->stats.nmalloc_medium += tbin->tstats.nrequests; - malloc_mutex_unlock(&arena->lock); - } -#endif - - assert(tbin->ncached == 0); - tsize = sizeof(tcache_bin_t) + (sizeof(void *) * (tcache_nslots - 1)); - if (tsize <= bin_maxclass) { - malloc_mutex_lock(&arena->lock); - arena_dalloc_bin(arena, chunk, tbin, mapelm); - malloc_mutex_unlock(&arena->lock); - } else - idalloc(tbin); -} - tcache_t * tcache_create(arena_t *arena) { tcache_t *tcache; + size_t size; + unsigned i; - if (sizeof(tcache_t) + (sizeof(tcache_bin_t *) * (nbins - 1)) <= - small_maxclass) { - tcache = (tcache_t *)arena_malloc_small(arena, sizeof(tcache_t) - + (sizeof(tcache_bin_t *) * (nbins - 1)), true); - } else if (sizeof(tcache_t) + (sizeof(tcache_bin_t *) * (nbins - 1)) <= - bin_maxclass) { - tcache = (tcache_t *)arena_malloc_medium(arena, sizeof(tcache_t) - + (sizeof(tcache_bin_t *) * (nbins - 1)), true); - } else { - tcache = (tcache_t *)icalloc(sizeof(tcache_t) + - (sizeof(tcache_bin_t *) * (nbins - 1))); - } + size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nbins - 1)); + /* + * Round up to the nearest multiple of the cacheline size, in order to + * avoid the possibility of false cacheline sharing. 
+ * + * That this works relies on the same logic as in ipalloc(). + */ + size = (size + CACHELINE_MASK) & (-CACHELINE); + + if (size <= small_maxclass) + tcache = (tcache_t *)arena_malloc_small(arena, size, true); + else if (size <= bin_maxclass) + tcache = (tcache_t *)arena_malloc_medium(arena, size, true); + else + tcache = (tcache_t *)icalloc(size); if (tcache == NULL) return (NULL); @@ -210,6 +154,14 @@ tcache_create(arena_t *arena) #endif tcache->arena = arena; + assert((TCACHE_NSLOTS_MAX & 1U) == 0); + for (i = 0; i < nbins; i++) { + if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_MAX) { + tcache->tbins[i].ncached_max = (arena->bins[i].nregs << + 1); + } else + tcache->tbins[i].ncached_max = TCACHE_NSLOTS_MAX; + } tcache_tls = tcache; pthread_setspecific(tcache_tsd, tcache); @@ -231,15 +183,29 @@ tcache_destroy(tcache_t *tcache) #endif for (i = 0; i < nbins; i++) { - tcache_bin_t *tbin = tcache->tbins[i]; - if (tbin != NULL) { - tcache_bin_flush(tbin, i, 0 + tcache_bin_t *tbin = &tcache->tbins[i]; + tcache_bin_flush(tbin, i, 0 #ifdef JEMALLOC_PROF - , tcache + , tcache #endif - ); - tcache_bin_destroy(tcache, tbin, i); + ); + +#ifdef JEMALLOC_STATS + if (tbin->tstats.nrequests != 0) { + arena_t *arena = tcache->arena; + arena_bin_t *bin = &arena->bins[i]; + malloc_mutex_lock(&arena->lock); + bin->stats.nrequests += tbin->tstats.nrequests; + if (bin->reg_size <= small_maxclass) { + arena->stats.nmalloc_small += + tbin->tstats.nrequests; + } else { + arena->stats.nmalloc_medium += + tbin->tstats.nrequests; + } + malloc_mutex_unlock(&arena->lock); } +#endif } #ifdef JEMALLOC_PROF @@ -286,21 +252,17 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) /* Merge and reset tcache stats. */ for (i = 0; i < mbin0; i++) { arena_bin_t *bin = &arena->bins[i]; - tcache_bin_t *tbin = tcache->tbins[i]; - if (tbin != NULL) { - bin->stats.nrequests += tbin->tstats.nrequests; - arena->stats.nmalloc_small += tbin->tstats.nrequests; - tbin->tstats.nrequests = 0; - } + tcache_bin_t *tbin = &tcache->tbins[i]; + bin->stats.nrequests += tbin->tstats.nrequests; + arena->stats.nmalloc_small += tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; } for (; i < nbins; i++) { arena_bin_t *bin = &arena->bins[i]; - tcache_bin_t *tbin = tcache->tbins[i]; - if (tbin != NULL) { - bin->stats.nrequests += tbin->tstats.nrequests; - arena->stats.nmalloc_medium += tbin->tstats.nrequests; - tbin->tstats.nrequests = 0; - } + tcache_bin_t *tbin = &tcache->tbins[i]; + bin->stats.nrequests += tbin->tstats.nrequests; + arena->stats.nmalloc_medium += tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; } } #endif @@ -309,9 +271,7 @@ void tcache_boot(void) { - if (opt_lg_tcache_nslots > 0) { - tcache_nslots = (1U << opt_lg_tcache_nslots); - + if (opt_tcache) { /* Compute incremental GC event threshold. */ if (opt_lg_tcache_gc_sweep >= 0) { tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) / @@ -319,10 +279,7 @@ tcache_boot(void) 0) ? 0 : 1); } else tcache_gc_incr = 0; - } else - tcache_nslots = 0; - if (tcache_nslots != 0) { if (pthread_key_create(&tcache_tsd, tcache_thread_cleanup) != 0) { malloc_write(