Simplify tcache object caching.

Use chains of cached objects, rather than using arrays of pointers.

Since tcache_bin_t is no longer dynamically sized, convert tcache_t's tbins
to an array of structures, rather than an array of pointers.  This implicitly
removes tcache_bin_{create,destroy}(), which further simplifies the fast path
for malloc/free.

Use cacheline alignment for tcache_t allocations.

Remove the runtime configuration option for the number of tcache bin slots,
and replace it with a boolean option for enabling/disabling tcache.

Limit the number of tcache objects to the lesser of TCACHE_NSLOTS_MAX and 2X
the number of regions per run for the size class.

For GC-triggered flush, discard 3/4 of the objects below the low water mark,
rather than 1/2.
parent 2caa4715ed
commit 3fa9a2fad8
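The central data-structure change is visible in the tcache_bin_s hunk below: each bin's dynamically sized `slots` array of pointers is replaced by a singly linked chain threaded through the cached objects themselves, with the first word of every free object holding the pointer to the next one. A minimal self-contained sketch of the resulting fast paths (illustrative only; the type and function names are invented for the example, and low/high water tracking is omitted):

	typedef struct {
		unsigned	ncached;	/* # of cached objects. */
		unsigned	ncached_max;	/* Upper limit on ncached. */
		void		*avail;		/* Chain of available objects. */
	} bin_sketch_t;

	/* Pop: the head object's first word points to the next object. */
	static void *
	sketch_alloc(bin_sketch_t *tbin)
	{
		void *ret;

		if (tbin->ncached == 0)
			return (NULL);
		tbin->ncached--;
		ret = tbin->avail;
		tbin->avail = *(void **)ret;
		return (ret);
	}

	/* Push: link the freed object onto the head of the chain. */
	static void
	sketch_dalloc(bin_sketch_t *tbin, void *ptr)
	{

		*(void **)ptr = tbin->avail;
		tbin->avail = ptr;
		tbin->ncached++;
	}

Because the chain borrows storage from the cached objects, a bin's memory footprint no longer depends on its slot count, which is what lets tcache_bin_t become a fixed-size structure embedded directly in tcache_t. It also explains the LG_TINY_MIN change below: when tcache is enabled, every cacheable object must be at least pointer-sized to hold the link.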
@@ -376,8 +376,7 @@ will disable dirty page purging.
 @roff_tcache@.Ev JEMALLOC_OPTIONS=14g
 @roff_tcache@will disable garbage collection.
 @roff_tcache@.It H
-@roff_tcache@Double/halve the number of thread-specific cache slots per size
-@roff_tcache@class.
+@roff_tcache@Enable/disable thread-specific caching.
 @roff_tcache@When there are multiple threads, each thread uses a
 @roff_tcache@thread-specific cache for small and medium objects.
 @roff_tcache@Thread-specific caching allows many allocations to be satisfied
@@ -386,11 +385,7 @@ will disable dirty page purging.
 @roff_tcache@See the
 @roff_tcache@.Dq G
 @roff_tcache@option for related tuning information.
-@roff_tcache@The default number of cache slots is 128;
-@roff_tcache@.Ev JEMALLOC_OPTIONS=7h
-@roff_tcache@will disable thread-specific caching.
-@roff_tcache@Note that one cache slot per size class is not a valid
-@roff_tcache@configuration due to implementation details.
+@roff_tcache@This option is enabled by default.
 @roff_prof@.It I
 @roff_prof@Double/halve the average interval between memory profile dumps, as
 @roff_prof@measured in bytes of allocation activity.
@@ -773,7 +768,7 @@ option.
 @roff_xmalloc@option.
 @roff_xmalloc@.Ed
 .\"-----------------------------------------------------------------------------
-@roff_tcache@.It Sy "opt.lg_tcache_nslots (size_t) r-"
+@roff_tcache@.It Sy "opt.tcache (bool) r-"
 @roff_tcache@.Bd -ragged -offset indent -compact
 @roff_tcache@See the
 @roff_tcache@.Dq H
@@ -18,7 +18,11 @@
 
 #ifdef JEMALLOC_TINY
    /* Smallest size class to support. */
-#  define LG_TINY_MIN		1
+#  ifdef JEMALLOC_TCACHE
+#    define LG_TINY_MIN		LG_SIZEOF_PTR
+#  else
+#    define LG_TINY_MIN		1
+#  endif
 #endif
 
 /*
@@ -6,10 +6,13 @@ typedef struct tcache_bin_s tcache_bin_t;
 typedef struct tcache_s tcache_t;
 
 /*
- * Default number of cache slots for each bin in the thread cache (0:
- * disabled).
+ * Absolute maximum number of cache slots for each bin in the thread cache.
+ * This is an additional constraint beyond that imposed as: twice the number of
+ * regions per run for this size class.
+ *
+ * This constant must be an even number.
  */
-#define	LG_TCACHE_NSLOTS_DEFAULT	7
+#define	TCACHE_NSLOTS_MAX		200
 /*
  * (1U << opt_lg_tcache_gc_sweep) is the approximate number of allocation
  * events between full GC sweeps (-1: disabled).  Integer rounding may cause
@@ -29,7 +32,8 @@ struct tcache_bin_s {
 	unsigned	low_water;	/* Min # cached since last GC. */
 	unsigned	high_water;	/* Max # cached since last GC. */
 	unsigned	ncached;	/* # of cached objects. */
-	void		*slots[1];	/* Dynamically sized. */
+	unsigned	ncached_max;	/* Upper limit on ncached. */
+	void		*avail;		/* Chain of available objects. */
 };
 
 struct tcache_s {
@@ -42,26 +46,20 @@ struct tcache_s {
 	arena_t		*arena;		/* This thread's arena. */
 	unsigned	ev_cnt;		/* Event count since incremental GC. */
 	unsigned	next_gc_bin;	/* Next bin to GC. */
-	tcache_bin_t	*tbins[1];	/* Dynamically sized. */
+	tcache_bin_t	tbins[1];	/* Dynamically sized. */
 };
 
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
 
-extern size_t	opt_lg_tcache_nslots;
+extern bool	opt_tcache;
 extern ssize_t	opt_lg_tcache_gc_sweep;
 
 /* Map of thread-specific caches. */
 extern __thread tcache_t	*tcache_tls
     JEMALLOC_ATTR(tls_model("initial-exec"));
 
-/*
- * Number of cache slots for each bin in the thread cache, or 0 if tcache is
- * disabled.
- */
-extern size_t	tcache_nslots;
-
 /* Number of tcache allocation/deallocation events between incremental GCs. */
 extern unsigned	tcache_gc_incr;
 
@@ -71,10 +69,7 @@ void	tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem
 #endif
     );
 tcache_t *tcache_create(arena_t *arena);
-void	tcache_bin_destroy(tcache_t *tcache, tcache_bin_t *tbin,
-    unsigned binind);
 void	*tcache_alloc_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind);
-tcache_bin_t	*tcache_bin_create(arena_t *arena);
 void	tcache_destroy(tcache_t *tcache);
 #ifdef JEMALLOC_STATS
 void	tcache_stats_merge(tcache_t *tcache, arena_t *arena);
@@ -99,7 +94,7 @@ tcache_get(void)
 {
 	tcache_t *tcache;
 
-	if (isthreaded == false || tcache_nslots == 0)
+	if ((isthreaded & opt_tcache) == false)
 		return (NULL);
 
 	tcache = tcache_tls;
@@ -124,37 +119,24 @@ tcache_event(tcache_t *tcache)
 
 	tcache->ev_cnt++;
 	assert(tcache->ev_cnt <= tcache_gc_incr);
-	if (tcache->ev_cnt >= tcache_gc_incr) {
+	if (tcache->ev_cnt == tcache_gc_incr) {
 		size_t binind = tcache->next_gc_bin;
-		tcache_bin_t *tbin = tcache->tbins[binind];
+		tcache_bin_t *tbin = &tcache->tbins[binind];
 
-		if (tbin != NULL) {
-			if (tbin->high_water == 0) {
-				/*
-				 * This bin went completely unused for an
-				 * entire GC cycle, so throw away the tbin.
-				 */
-				assert(tbin->ncached == 0);
-				tcache_bin_destroy(tcache, tbin, binind);
-				tcache->tbins[binind] = NULL;
-			} else {
-				if (tbin->low_water > 0) {
-					/*
-					 * Flush (ceiling) half of the objects
-					 * below the low water mark.
-					 */
-					tcache_bin_flush(tbin, binind,
-					    tbin->ncached - (tbin->low_water >>
-					    1) - (tbin->low_water & 1)
+		if (tbin->low_water > 0) {
+			/*
+			 * Flush (ceiling) 3/4 of the objects below the low
+			 * water mark.
+			 */
+			tcache_bin_flush(tbin, binind, tbin->ncached -
+			    tbin->low_water + (tbin->low_water >> 2)
 #ifdef JEMALLOC_PROF
-					    , tcache
+			    , tcache
 #endif
-					    );
-				}
-				tbin->low_water = tbin->ncached;
-				tbin->high_water = tbin->ncached;
-			}
-		}
+			    );
+		}
+		tbin->low_water = tbin->ncached;
+		tbin->high_water = tbin->ncached;
 
 		tcache->next_gc_bin++;
 		if (tcache->next_gc_bin == nbins)
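The flush expression above keeps `tbin->ncached - tbin->low_water + (tbin->low_water >> 2)` objects, i.e. it discards `low_water - (low_water >> 2)` of them, which is 3/4 of the low-water count rounded up — the ceiling the comment promises. A standalone check of that arithmetic (illustrative only, not part of the commit):

	#include <assert.h>

	/* Number of objects a GC pass discards, per the expression above. */
	static unsigned
	gc_flush_count(unsigned low_water)
	{

		return (low_water - (low_water >> 2));
	}

	int
	main(void)
	{

		assert(gc_flush_count(7) == 6);	/* ceil(0.75 * 7) == 6. */
		assert(gc_flush_count(8) == 6);	/* 0.75 * 8 == 6. */
		assert(gc_flush_count(1) == 1);	/* ceil(0.75 * 1) == 1. */
		return (0);
	}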
@@ -166,21 +148,24 @@ tcache_event(tcache_t *tcache)
 JEMALLOC_INLINE void *
 tcache_bin_alloc(tcache_bin_t *tbin)
 {
+	void *ret;
 
 	if (tbin->ncached == 0)
 		return (NULL);
 	tbin->ncached--;
 	if (tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
-	return (tbin->slots[tbin->ncached]);
+	ret = tbin->avail;
+	tbin->avail = *(void **)ret;
+	return (ret);
 }
 
 JEMALLOC_INLINE void *
 tcache_alloc(tcache_t *tcache, size_t size, bool zero)
 {
 	void *ret;
-	tcache_bin_t *tbin;
 	size_t binind;
+	tcache_bin_t *tbin;
 
 	if (size <= small_maxclass)
 		binind = small_size2bin[size];
@@ -189,14 +174,7 @@ tcache_alloc(tcache_t *tcache, size_t size, bool zero)
 		    lg_mspace);
 	}
 	assert(binind < nbins);
-	tbin = tcache->tbins[binind];
-	if (tbin == NULL) {
-		tbin = tcache_bin_create(tcache->arena);
-		if (tbin == NULL)
-			return (NULL);
-		tcache->tbins[binind] = tbin;
-	}
-
+	tbin = &tcache->tbins[binind];
 	ret = tcache_bin_alloc(tbin);
 	if (ret == NULL) {
 		ret = tcache_alloc_hard(tcache, tbin, binind);
@@ -250,29 +228,20 @@ tcache_dalloc(tcache_t *tcache, void *ptr)
 
 #ifdef JEMALLOC_FILL
 	if (opt_junk)
-		memset(ptr, 0x5a, arena->bins[binind].reg_size);
+		memset(ptr, 0x5a, bin->reg_size);
 #endif
 
-	tbin = tcache->tbins[binind];
-	if (tbin == NULL) {
-		tbin = tcache_bin_create(choose_arena());
-		if (tbin == NULL) {
-			malloc_mutex_lock(&arena->lock);
-			arena_dalloc_bin(arena, chunk, ptr, mapelm);
-			malloc_mutex_unlock(&arena->lock);
-			return;
-		}
-		tcache->tbins[binind] = tbin;
-	}
-
-	if (tbin->ncached == tcache_nslots)
-		tcache_bin_flush(tbin, binind, (tcache_nslots >> 1)
+	tbin = &tcache->tbins[binind];
+	if (tbin->ncached == tbin->ncached_max) {
+		tcache_bin_flush(tbin, binind, (tbin->ncached_max >> 1)
 #ifdef JEMALLOC_PROF
 		    , tcache
 #endif
 		    );
-	assert(tbin->ncached < tcache_nslots);
-	tbin->slots[tbin->ncached] = ptr;
+	}
+	assert(tbin->ncached < tbin->ncached_max);
+	*(void **)ptr = tbin->avail;
+	tbin->avail = ptr;
 	tbin->ncached++;
 	if (tbin->ncached > tbin->high_water)
 		tbin->high_water = tbin->ncached;
@@ -53,13 +53,26 @@ static malloc_mutex_t	purge_lock;
 static const uint8_t	const_small_size2bin[STATIC_PAGE_SIZE - 255] = {
 	S2B_1(0xffU)		/*    0 */
 #if (LG_QUANTUM == 4)
-/* 64-bit system ************************/
+/* 16-byte quantum **********************/
 #  ifdef JEMALLOC_TINY
-	S2B_2(0)		/*    2 */
-	S2B_2(1)		/*    4 */
-	S2B_4(2)		/*    8 */
-	S2B_8(3)		/*   16 */
-#    define S2B_QMIN 3
+#    if (LG_TINY_MIN == 1)
+	S2B_2(0)		/*    2 */
+	S2B_2(1)		/*    4 */
+	S2B_4(2)		/*    8 */
+	S2B_8(3)		/*   16 */
+#      define S2B_QMIN 3
+#    elif (LG_TINY_MIN == 2)
+	S2B_4(0)		/*    4 */
+	S2B_4(1)		/*    8 */
+	S2B_8(2)		/*   16 */
+#      define S2B_QMIN 2
+#    elif (LG_TINY_MIN == 3)
+	S2B_8(0)		/*    8 */
+	S2B_8(1)		/*   16 */
+#      define S2B_QMIN 1
+#    else
+#      error "Unsupported LG_TINY_MIN"
+#    endif
 #  else
 	S2B_16(0)		/*   16 */
 #    define S2B_QMIN 0
@@ -73,12 +86,20 @@ static const uint8_t	const_small_size2bin[STATIC_PAGE_SIZE - 255] = {
 	S2B_16(S2B_QMIN + 7)	/*  128 */
 #  define S2B_CMIN (S2B_QMIN + 8)
 #else
-/* 32-bit system ************************/
+/* 8-byte quantum ***********************/
 #  ifdef JEMALLOC_TINY
-	S2B_2(0)		/*    2 */
-	S2B_2(1)		/*    4 */
-	S2B_4(2)		/*    8 */
-#    define S2B_QMIN 2
+#    if (LG_TINY_MIN == 1)
+	S2B_2(0)		/*    2 */
+	S2B_2(1)		/*    4 */
+	S2B_4(2)		/*    8 */
+#      define S2B_QMIN 2
+#    elif (LG_TINY_MIN == 2)
+	S2B_4(0)		/*    4 */
+	S2B_4(1)		/*    8 */
+#      define S2B_QMIN 1
+#    else
+#      error "Unsupported LG_TINY_MIN"
+#    endif
 #  else
 	S2B_8(0)		/*    8 */
 #    define S2B_QMIN 0
@@ -1048,28 +1069,15 @@ arena_tcache_fill(arena_t *arena, tcache_bin_t *tbin, size_t binind
 #ifdef JEMALLOC_PROF
 	arena_prof_accum(arena, prof_accumbytes);
 #endif
-	for (i = 0, nfill = (tcache_nslots >> 1); i < nfill; i++) {
+	for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) {
 		if ((run = bin->runcur) != NULL && run->nfree > 0)
 			ptr = arena_bin_malloc_easy(arena, bin, run);
 		else
 			ptr = arena_bin_malloc_hard(arena, bin);
-		if (ptr == NULL) {
-			if (i > 0) {
-				/*
-				 * Move valid pointers to the base of
-				 * tbin->slots.
-				 */
-				memmove(&tbin->slots[0],
-				    &tbin->slots[nfill - i],
-				    i * sizeof(void *));
-			}
+		if (ptr == NULL)
 			break;
-		}
-		/*
-		 * Fill slots such that the objects lowest in memory come last.
-		 * This causes tcache to use low objects first.
-		 */
-		tbin->slots[nfill - 1 - i] = ptr;
+		*(void **)ptr = tbin->avail;
+		tbin->avail = ptr;
 	}
 #ifdef JEMALLOC_STATS
 	bin->stats.nfills++;
@@ -64,7 +64,7 @@ CTL_PROTO(opt_xmalloc)
 CTL_PROTO(opt_zero)
 #endif
 #ifdef JEMALLOC_TCACHE
-CTL_PROTO(opt_lg_tcache_nslots)
+CTL_PROTO(opt_tcache)
 CTL_PROTO(opt_lg_tcache_gc_sweep)
 #endif
 #ifdef JEMALLOC_PROF
@@ -230,7 +230,7 @@ static const ctl_node_t opt_node[] = {
 	{NAME("zero"),	CTL(opt_zero)},
 #endif
 #ifdef JEMALLOC_TCACHE
-	{NAME("lg_tcache_nslots"),	CTL(opt_lg_tcache_nslots)},
+	{NAME("tcache"),	CTL(opt_tcache)},
 	{NAME("lg_tcache_gc_sweep"),	CTL(opt_lg_tcache_gc_sweep)},
 #endif
 #ifdef JEMALLOC_PROF
@@ -1070,7 +1070,7 @@ CTL_RO_GEN(opt_xmalloc, opt_xmalloc, bool)
 CTL_RO_GEN(opt_zero, opt_zero, bool)
 #endif
 #ifdef JEMALLOC_TCACHE
-CTL_RO_GEN(opt_lg_tcache_nslots, opt_lg_tcache_nslots, size_t)
+CTL_RO_GEN(opt_tcache, opt_tcache, bool)
 CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t)
 #endif
 #ifdef JEMALLOC_PROF
@@ -482,13 +482,10 @@ MALLOC_OUT:
 					opt_lg_tcache_gc_sweep++;
 				break;
 			case 'h':
-				if (opt_lg_tcache_nslots > 0)
-					opt_lg_tcache_nslots--;
+				opt_tcache = false;
 				break;
 			case 'H':
-				if (opt_lg_tcache_nslots + 1 <
-				    (sizeof(size_t) << 3))
-					opt_lg_tcache_nslots++;
+				opt_tcache = true;
 				break;
 #endif
 #ifdef JEMALLOC_PROF
@@ -729,7 +726,7 @@ MALLOC_OUT:
 	 * default.
 	 */
 #ifdef JEMALLOC_TCACHE
-	if (tcache_nslots
+	if (opt_tcache
 # ifdef JEMALLOC_PROF
 	/*
 	 * Profile data storage concurrency is directly linked to
@@ -440,6 +440,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
 		    == 0)
 			write_cb(cbopaque, bv ? "F" : "f");
+		if ((err = JEMALLOC_P(mallctl)("opt.tcache", &bv, &bsz, NULL,
+		    0)) == 0)
+			write_cb(cbopaque, bv ? "H" : "h");
 		if ((err = JEMALLOC_P(mallctl)("opt.junk", &bv, &bsz, NULL, 0))
 		    == 0)
 			write_cb(cbopaque, bv ? "J" : "j");
@@ -550,21 +553,13 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 			write_cb(cbopaque,
 			    "Min active:dirty page ratio per arena: N/A\n");
 		}
-		if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_nslots", &sv,
+		if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv,
 		    &ssz, NULL, 0)) == 0) {
-			size_t tcache_nslots, tcache_gc_sweep;
-
-			tcache_nslots = (1U << sv);
-			write_cb(cbopaque,
-			    "Thread cache slots per size class: ");
-			write_cb(cbopaque, tcache_nslots ?
-			    umax2s(tcache_nslots, 10, s) : "N/A");
-			write_cb(cbopaque, "\n");
-
-			CTL_GET("opt.lg_tcache_gc_sweep", &ssv, ssize_t);
-			tcache_gc_sweep = (1U << ssv);
+			size_t tcache_gc_sweep = (1U << ssv);
+			bool tcache_enabled;
+			CTL_GET("opt.tcache", &tcache_enabled, bool);
 			write_cb(cbopaque, "Thread cache GC sweep interval: ");
-			write_cb(cbopaque, tcache_nslots && ssv >= 0 ?
+			write_cb(cbopaque, tcache_enabled && ssv >= 0 ?
 			    umax2s(tcache_gc_sweep, 10, s) : "N/A");
 			write_cb(cbopaque, "\n");
 		}
@@ -4,7 +4,7 @@
 /******************************************************************************/
 /* Data. */
 
-size_t	opt_lg_tcache_nslots = LG_TCACHE_NSLOTS_DEFAULT;
+bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT;
 
 /* Map of thread-specific caches. */
@@ -16,7 +16,6 @@ __thread tcache_t	*tcache_tls JEMALLOC_ATTR(tls_model("initial-exec"));
  */
 static pthread_key_t		tcache_tsd;
 
-size_t				tcache_nslots;
 unsigned			tcache_gc_incr;
 
 /******************************************************************************/
@@ -51,16 +50,14 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem
 #endif
     )
 {
-	arena_chunk_t *chunk;
-	arena_t *arena;
-	void *ptr;
-	unsigned i, ndeferred, ncached;
+	void *flush, *deferred, *ptr;
+	unsigned i, nflush, ndeferred;
 
-	for (ndeferred = tbin->ncached - rem; ndeferred > 0;) {
-		ncached = ndeferred;
+	for (flush = tbin->avail, nflush = tbin->ncached - rem; flush != NULL;
+	    flush = deferred, nflush = ndeferred) {
 		/* Lock the arena associated with the first object. */
-		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(tbin->slots[0]);
-		arena = chunk->arena;
+		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush);
+		arena_t *arena = chunk->arena;
 		malloc_mutex_lock(&arena->lock);
 #ifdef JEMALLOC_PROF
 		if (arena == tcache->arena) {
@@ -68,9 +65,12 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem
 			tcache->prof_accumbytes = 0;
 		}
 #endif
-		/* Deallocate every object that belongs to the locked arena. */
-		for (i = ndeferred = 0; i < ncached; i++) {
-			ptr = tbin->slots[i];
+		deferred = NULL;
+		ndeferred = 0;
+		for (i = 0; i < nflush; i++) {
+			ptr = flush;
+			assert(ptr != NULL);
+			flush = *(void **)ptr;
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (chunk->arena == arena) {
 				size_t pageind = (((uintptr_t)ptr -
@@ -85,7 +85,8 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem
 				 * Stash the object, so that it can be handled
 				 * in a future pass.
 				 */
-				tbin->slots[ndeferred] = ptr;
+				*(void **)ptr = deferred;
+				deferred = ptr;
 				ndeferred++;
 			}
 		}
@@ -105,98 +106,41 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem
 		}
 #endif
 		malloc_mutex_unlock(&arena->lock);
+
+		if (flush != NULL) {
+			/*
+			 * This was the first pass, and rem cached objects
+			 * remain.
+			 */
+			tbin->avail = flush;
+		}
 	}
 
-	if (rem > 0) {
-		/*
-		 * Shift the remaining valid pointers to the base of the slots
-		 * array.
-		 */
-		memmove(&tbin->slots[0], &tbin->slots[tbin->ncached - rem],
-		    rem * sizeof(void *));
-	}
 	tbin->ncached = rem;
 }
 
-tcache_bin_t *
-tcache_bin_create(arena_t *arena)
-{
-	tcache_bin_t *ret;
-	size_t tsize;
-
-	tsize = sizeof(tcache_bin_t) + (sizeof(void *) * (tcache_nslots - 1));
-	if (tsize <= small_maxclass)
-		ret = (tcache_bin_t *)arena_malloc_small(arena, tsize, false);
-	else if (tsize <= bin_maxclass)
-		ret = (tcache_bin_t *)arena_malloc_medium(arena, tsize, false);
-	else
-		ret = (tcache_bin_t *)imalloc(tsize);
-	if (ret == NULL)
-		return (NULL);
-#ifdef JEMALLOC_STATS
-	memset(&ret->tstats, 0, sizeof(tcache_bin_stats_t));
-#endif
-	ret->low_water = 0;
-	ret->high_water = 0;
-	ret->ncached = 0;
-
-	return (ret);
-}
-
-void
-tcache_bin_destroy(tcache_t *tcache, tcache_bin_t *tbin, unsigned binind)
-{
-	arena_t *arena;
-	arena_chunk_t *chunk;
-	size_t pageind, tsize;
-	arena_chunk_map_t *mapelm;
-
-	chunk = CHUNK_ADDR2BASE(tbin);
-	arena = chunk->arena;
-	pageind = (((uintptr_t)tbin - (uintptr_t)chunk) >> PAGE_SHIFT);
-	mapelm = &chunk->map[pageind];
-
-#ifdef JEMALLOC_STATS
-	if (tbin->tstats.nrequests != 0) {
-		arena_t *arena = tcache->arena;
-		arena_bin_t *bin = &arena->bins[binind];
-		malloc_mutex_lock(&arena->lock);
-		bin->stats.nrequests += tbin->tstats.nrequests;
-		if (bin->reg_size <= small_maxclass)
-			arena->stats.nmalloc_small += tbin->tstats.nrequests;
-		else
-			arena->stats.nmalloc_medium += tbin->tstats.nrequests;
-		malloc_mutex_unlock(&arena->lock);
-	}
-#endif
-
-	assert(tbin->ncached == 0);
-	tsize = sizeof(tcache_bin_t) + (sizeof(void *) * (tcache_nslots - 1));
-	if (tsize <= bin_maxclass) {
-		malloc_mutex_lock(&arena->lock);
-		arena_dalloc_bin(arena, chunk, tbin, mapelm);
-		malloc_mutex_unlock(&arena->lock);
-	} else
-		idalloc(tbin);
-}
-
 tcache_t *
 tcache_create(arena_t *arena)
 {
 	tcache_t *tcache;
+	size_t size;
+	unsigned i;
 
-	if (sizeof(tcache_t) + (sizeof(tcache_bin_t *) * (nbins - 1)) <=
-	    small_maxclass) {
-		tcache = (tcache_t *)arena_malloc_small(arena, sizeof(tcache_t)
-		    + (sizeof(tcache_bin_t *) * (nbins - 1)), true);
-	} else if (sizeof(tcache_t) + (sizeof(tcache_bin_t *) * (nbins - 1)) <=
-	    bin_maxclass) {
-		tcache = (tcache_t *)arena_malloc_medium(arena, sizeof(tcache_t)
-		    + (sizeof(tcache_bin_t *) * (nbins - 1)), true);
-	} else {
-		tcache = (tcache_t *)icalloc(sizeof(tcache_t) +
-		    (sizeof(tcache_bin_t *) * (nbins - 1)));
-	}
+	size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nbins - 1));
+	/*
+	 * Round up to the nearest multiple of the cacheline size, in order to
+	 * avoid the possibility of false cacheline sharing.
+	 *
+	 * That this works relies on the same logic as in ipalloc().
+	 */
+	size = (size + CACHELINE_MASK) & (-CACHELINE);
+
+	if (size <= small_maxclass)
+		tcache = (tcache_t *)arena_malloc_small(arena, size, true);
+	else if (size <= bin_maxclass)
+		tcache = (tcache_t *)arena_malloc_medium(arena, size, true);
+	else
+		tcache = (tcache_t *)icalloc(size);
 
 	if (tcache == NULL)
 		return (NULL);
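The rounding in tcache_create() above relies on CACHELINE being a power of two and CACHELINE_MASK being CACHELINE - 1; `(size + CACHELINE_MASK) & (-CACHELINE)` then rounds size up to the next cacheline multiple. A standalone illustration (the constant values here are assumptions for the example; the real definitions live in jemalloc's internal headers):

	#include <assert.h>
	#include <stddef.h>

	#define	CACHELINE	((size_t)64)	/* Assumed power of two. */
	#define	CACHELINE_MASK	(CACHELINE - 1)

	int
	main(void)
	{
		size_t size;

		/* Round up to the nearest multiple of the cacheline size. */
		size = (200 + CACHELINE_MASK) & (-CACHELINE);
		assert(size == 256);	/* 200 rounds up to 256. */

		size = (128 + CACHELINE_MASK) & (-CACHELINE);
		assert(size == 128);	/* Exact multiples are unchanged. */
		return (0);
	}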
@@ -210,6 +154,14 @@ tcache_create(arena_t *arena)
 #endif
 
 	tcache->arena = arena;
+	assert((TCACHE_NSLOTS_MAX & 1U) == 0);
+	for (i = 0; i < nbins; i++) {
+		if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_MAX) {
+			tcache->tbins[i].ncached_max = (arena->bins[i].nregs <<
+			    1);
+		} else
+			tcache->tbins[i].ncached_max = TCACHE_NSLOTS_MAX;
+	}
 
 	tcache_tls = tcache;
 	pthread_setspecific(tcache_tsd, tcache);
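The loop above caps each bin at the lesser of TCACHE_NSLOTS_MAX and twice the number of regions per run for the size class, so bins with few regions per run never cache more than two runs' worth of objects. The same computation as a one-liner (a sketch; the helper name is invented, and the constant is the one defined earlier in the diff):

	#define	TCACHE_NSLOTS_MAX	200

	/* Per-bin cache limit: min(TCACHE_NSLOTS_MAX, 2 * nregs). */
	static unsigned
	tbin_ncached_max(unsigned nregs)
	{

		return ((nregs << 1) <= TCACHE_NSLOTS_MAX ?
		    (nregs << 1) : TCACHE_NSLOTS_MAX);
	}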
@@ -231,15 +183,29 @@ tcache_destroy(tcache_t *tcache)
 #endif
 
 	for (i = 0; i < nbins; i++) {
-		tcache_bin_t *tbin = tcache->tbins[i];
-		if (tbin != NULL) {
-			tcache_bin_flush(tbin, i, 0
+		tcache_bin_t *tbin = &tcache->tbins[i];
+		tcache_bin_flush(tbin, i, 0
 #ifdef JEMALLOC_PROF
-			    , tcache
+		    , tcache
 #endif
-			    );
-			tcache_bin_destroy(tcache, tbin, i);
-		}
+		    );
+
+#ifdef JEMALLOC_STATS
+		if (tbin->tstats.nrequests != 0) {
+			arena_t *arena = tcache->arena;
+			arena_bin_t *bin = &arena->bins[i];
+			malloc_mutex_lock(&arena->lock);
+			bin->stats.nrequests += tbin->tstats.nrequests;
+			if (bin->reg_size <= small_maxclass) {
+				arena->stats.nmalloc_small +=
+				    tbin->tstats.nrequests;
+			} else {
+				arena->stats.nmalloc_medium +=
+				    tbin->tstats.nrequests;
+			}
+			malloc_mutex_unlock(&arena->lock);
+		}
+#endif
 	}
 
 #ifdef JEMALLOC_PROF
@@ -286,21 +252,17 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena)
 	/* Merge and reset tcache stats. */
 	for (i = 0; i < mbin0; i++) {
 		arena_bin_t *bin = &arena->bins[i];
-		tcache_bin_t *tbin = tcache->tbins[i];
-		if (tbin != NULL) {
-			bin->stats.nrequests += tbin->tstats.nrequests;
-			arena->stats.nmalloc_small += tbin->tstats.nrequests;
-			tbin->tstats.nrequests = 0;
-		}
+		tcache_bin_t *tbin = &tcache->tbins[i];
+		bin->stats.nrequests += tbin->tstats.nrequests;
+		arena->stats.nmalloc_small += tbin->tstats.nrequests;
+		tbin->tstats.nrequests = 0;
 	}
 	for (; i < nbins; i++) {
 		arena_bin_t *bin = &arena->bins[i];
-		tcache_bin_t *tbin = tcache->tbins[i];
-		if (tbin != NULL) {
-			bin->stats.nrequests += tbin->tstats.nrequests;
-			arena->stats.nmalloc_medium += tbin->tstats.nrequests;
-			tbin->tstats.nrequests = 0;
-		}
+		tcache_bin_t *tbin = &tcache->tbins[i];
+		bin->stats.nrequests += tbin->tstats.nrequests;
+		arena->stats.nmalloc_medium += tbin->tstats.nrequests;
+		tbin->tstats.nrequests = 0;
 	}
 }
 #endif
@@ -309,9 +271,7 @@ void
 tcache_boot(void)
 {
 
-	if (opt_lg_tcache_nslots > 0) {
-		tcache_nslots = (1U << opt_lg_tcache_nslots);
-
+	if (opt_tcache) {
 		/* Compute incremental GC event threshold. */
 		if (opt_lg_tcache_gc_sweep >= 0) {
 			tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) /
@@ -319,10 +279,7 @@ tcache_boot(void)
 			    0) ? 0 : 1);
 		} else
 			tcache_gc_incr = 0;
-	} else
-		tcache_nslots = 0;
 
-	if (tcache_nslots != 0) {
 		if (pthread_key_create(&tcache_tsd, tcache_thread_cleanup) !=
 		    0) {
 			malloc_write(