From 66cd953514a18477eb49732e40d5c2ab5f1b12c5 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 22 Apr 2016 14:34:14 -0700 Subject: [PATCH] Do not allocate metadata via non-auto arenas, nor tcaches. This assures that all internally allocated metadata come from the first opt_narenas arenas, i.e. the automatically multiplexed arenas. --- include/jemalloc/internal/arena.h | 28 +++-- include/jemalloc/internal/huge.h | 7 +- .../jemalloc/internal/jemalloc_internal.h.in | 21 +++- include/jemalloc/internal/private_symbols.txt | 2 + include/jemalloc/internal/tcache.h | 10 +- include/jemalloc/internal/tsd.h | 2 + src/arena.c | 29 +++-- src/ckh.c | 16 +-- src/ctl.c | 2 +- src/huge.c | 28 ++--- src/jemalloc.c | 112 ++++++++++++------ src/prof.c | 42 +++---- src/quarantine.c | 8 +- src/tcache.c | 20 ++-- 14 files changed, 192 insertions(+), 135 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 2130e9a0..103a4c91 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -290,10 +290,18 @@ struct arena_s { unsigned ind; /* - * Number of threads currently assigned to this arena. This field is - * synchronized via atomic operations. + * Number of threads currently assigned to this arena, synchronized via + * atomic operations. Each thread has two distinct assignments, one for + * application-serving allocation, and the other for internal metadata + * allocation. Internal metadata must not be allocated from arenas + * created via the arenas.extend mallctl, because the arena..reset + * mallctl indiscriminately discards all allocations for the affected + * arena. + * + * 0: Application allocation. + * 1: Internal metadata allocation. */ - unsigned nthreads; + unsigned nthreads[2]; /* * There are three classes of arena operations from a locking @@ -541,7 +549,7 @@ void arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info); void arena_quarantine_junk_small(void *ptr, size_t usize); void *arena_malloc_large(tsd_t *tsd, arena_t *arena, szind_t ind, bool zero); void *arena_malloc_hard(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind, - bool zero, tcache_t *tcache); + bool zero); void *arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, bool zero, tcache_t *tcache); void arena_prof_promoted(tsd_t *tsd, const void *ptr, size_t size); @@ -583,9 +591,9 @@ void arena_stats_merge(tsd_t *tsd, arena_t *arena, unsigned *nthreads, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); -unsigned arena_nthreads_get(arena_t *arena); -void arena_nthreads_inc(arena_t *arena); -void arena_nthreads_dec(arena_t *arena); +unsigned arena_nthreads_get(arena_t *arena, bool internal); +void arena_nthreads_inc(arena_t *arena, bool internal); +void arena_nthreads_dec(arena_t *arena, bool internal); arena_t *arena_new(tsd_t *tsd, unsigned ind); bool arena_boot(void); void arena_prefork(tsd_t *tsd, arena_t *arena); @@ -1320,7 +1328,7 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind, bool zero, assert(size > tcache_maxclass); } - return (arena_malloc_hard(tsd, arena, size, ind, zero, tcache)); + return (arena_malloc_hard(tsd, arena, size, ind, zero)); } JEMALLOC_ALWAYS_INLINE arena_t * @@ -1426,7 +1434,7 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) } } } else - huge_dalloc(tsd, ptr, tcache); + huge_dalloc(tsd, ptr); } JEMALLOC_ALWAYS_INLINE void @@ -1477,7 +1485,7 @@ arena_sdalloc(tsd_t 
*tsd, void *ptr, size_t size, tcache_t *tcache) } } } else - huge_dalloc(tsd, ptr, tcache); + huge_dalloc(tsd, ptr); } # endif /* JEMALLOC_ARENA_INLINE_B */ #endif diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index f19d3368..9de2055d 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -9,10 +9,9 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t usize, bool zero, - tcache_t *tcache); +void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t usize, bool zero); void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, - bool zero, tcache_t *tcache); + bool zero); bool huge_ralloc_no_move(tsd_t *tsd, void *ptr, size_t oldsize, size_t usize_min, size_t usize_max, bool zero); void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, @@ -21,7 +20,7 @@ void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, typedef void (huge_dalloc_junk_t)(tsd_t *, void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif -void huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +void huge_dalloc(tsd_t *tsd, void *ptr); arena_t *huge_aalloc(const void *ptr); size_t huge_salloc(tsd_t *tsd, const void *ptr); prof_tctx_t *huge_prof_tctx_get(tsd_t *tsd, const void *ptr); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index ddceabca..fe58c1c6 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -443,6 +443,9 @@ extern bool in_valgrind; /* Number of CPUs. */ extern unsigned ncpus; +/* Number of arenas used for automatic multiplexing of threads and arenas. */ +extern unsigned narenas_auto; + /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. @@ -469,10 +472,11 @@ void bootstrap_free(void *ptr); unsigned narenas_total_get(void); arena_t *arena_init(tsd_t *tsd, unsigned ind); arena_tdata_t *arena_tdata_get_hard(tsd_t *tsd, unsigned ind); -arena_t *arena_choose_hard(tsd_t *tsd); +arena_t *arena_choose_hard(tsd_t *tsd, bool internal); void arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind); void thread_allocated_cleanup(tsd_t *tsd); void thread_deallocated_cleanup(tsd_t *tsd); +void iarena_cleanup(tsd_t *tsd); void arena_cleanup(tsd_t *tsd); void arenas_tdata_cleanup(tsd_t *tsd); void narenas_tdata_cleanup(tsd_t *tsd); @@ -546,7 +550,7 @@ size_t s2u_compute(size_t size); size_t s2u_lookup(size_t size); size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment); -arena_t *arena_choose(tsd_t *tsd, arena_t *arena); +arena_t *arena_choose(tsd_t *tsd, arena_t *arena, bool internal); arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind, bool refresh_if_missing); arena_t *arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing); @@ -784,15 +788,16 @@ sa2u(size_t size, size_t alignment) /* Choose an arena based on a per-thread value. */ JEMALLOC_INLINE arena_t * -arena_choose(tsd_t *tsd, arena_t *arena) +arena_choose(tsd_t *tsd, arena_t *arena, bool internal) { arena_t *ret; if (arena != NULL) return (arena); - if (unlikely((ret = tsd_arena_get(tsd)) == NULL)) - ret = arena_choose_hard(tsd); + ret = internal ? 
tsd_iarena_get(tsd) : tsd_arena_get(tsd); + if (unlikely(ret == NULL)) + ret = arena_choose_hard(tsd, internal); return (ret); } @@ -935,6 +940,8 @@ iallocztm(tsd_t *tsd, size_t size, szind_t ind, bool zero, tcache_t *tcache, void *ret; assert(size != 0); + assert(!is_metadata || tcache == NULL); + assert(!is_metadata || arena == NULL || arena->ind < narenas_auto); ret = arena_malloc(tsd, arena, size, ind, zero, tcache, slow_path); if (config_stats && is_metadata && likely(ret != NULL)) { @@ -982,6 +989,8 @@ ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, assert(usize != 0); assert(usize == sa2u(usize, alignment)); + assert(!is_metadata || tcache == NULL); + assert(!is_metadata || arena == NULL || arena->ind < narenas_auto); ret = arena_palloc(tsd, arena, usize, alignment, zero, tcache); assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); @@ -1052,6 +1061,8 @@ idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata, { assert(ptr != NULL); + assert(!is_metadata || tcache == NULL); + assert(!is_metadata || iaalloc(ptr)->ind < narenas_auto); if (config_stats && is_metadata) { arena_metadata_allocated_sub(iaalloc(ptr), isalloc(tsd, ptr, config_prof)); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index c8799cba..eacc7c62 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -286,6 +286,7 @@ huge_ralloc_no_move huge_salloc iaalloc iallocztm +iarena_cleanup icalloc icalloct idalloc @@ -342,6 +343,7 @@ malloc_write map_bias map_misc_offset mb_write +narenas_auto narenas_tdata_cleanup narenas_total_get ncpus diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 1aa64631..82724304 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -293,7 +293,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, assert(tcache_success == (ret != NULL)); if (unlikely(!tcache_success)) { bool tcache_hard_success; - arena = arena_choose(tsd, arena); + arena = arena_choose(tsd, arena, false); if (unlikely(arena == NULL)) return (NULL); @@ -354,7 +354,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. 
*/ - arena = arena_choose(tsd, arena); + arena = arena_choose(tsd, arena, false); if (unlikely(arena == NULL)) return (NULL); @@ -459,8 +459,10 @@ JEMALLOC_ALWAYS_INLINE tcache_t * tcaches_get(tsd_t *tsd, unsigned ind) { tcaches_t *elm = &tcaches[ind]; - if (unlikely(elm->tcache == NULL)) - elm->tcache = tcache_create(tsd, arena_choose(tsd, NULL)); + if (unlikely(elm->tcache == NULL)) { + elm->tcache = tcache_create(tsd, arena_choose(tsd, NULL, + false)); + } return (elm->tcache); } #endif diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index b23b3b4c..1a1b5c32 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -536,6 +536,7 @@ struct tsd_init_head_s { O(thread_allocated, uint64_t) \ O(thread_deallocated, uint64_t) \ O(prof_tdata, prof_tdata_t *) \ + O(iarena, arena_t *) \ O(arena, arena_t *) \ O(arenas_tdata, arena_tdata_t *) \ O(narenas_tdata, unsigned) \ @@ -552,6 +553,7 @@ struct tsd_init_head_s { NULL, \ NULL, \ NULL, \ + NULL, \ 0, \ false, \ tcache_enabled_default, \ diff --git a/src/arena.c b/src/arena.c index 15023cf9..0da832e2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2478,10 +2478,10 @@ arena_malloc_large(tsd_t *tsd, arena_t *arena, szind_t binind, bool zero) void * arena_malloc_hard(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind, - bool zero, tcache_t *tcache) + bool zero) { - arena = arena_choose(tsd, arena); + arena = arena_choose(tsd, arena, false); if (unlikely(arena == NULL)) return (NULL); @@ -2489,7 +2489,7 @@ arena_malloc_hard(tsd_t *tsd, arena_t *arena, size_t size, szind_t ind, return (arena_malloc_small(tsd, arena, ind, zero)); if (likely(size <= large_maxclass)) return (arena_malloc_large(tsd, arena, ind, zero)); - return (huge_malloc(tsd, arena, index2size(ind), zero, tcache)); + return (huge_malloc(tsd, arena, index2size(ind), zero)); } /* Only handles large allocations that require more than page alignment. 
*/ @@ -2506,7 +2506,7 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, assert(usize == PAGE_CEILING(usize)); - arena = arena_choose(tsd, arena); + arena = arena_choose(tsd, arena, false); if (unlikely(arena == NULL)) return (NULL); @@ -2606,10 +2606,9 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, ret = arena_palloc_large(tsd, arena, usize, alignment, zero); } else if (likely(alignment <= chunksize)) - ret = huge_malloc(tsd, arena, usize, zero, tcache); + ret = huge_malloc(tsd, arena, usize, zero); else { - ret = huge_palloc(tsd, arena, usize, alignment, zero, - tcache); + ret = huge_palloc(tsd, arena, usize, alignment, zero); } } return (ret); @@ -3211,7 +3210,7 @@ arena_basic_stats_merge_locked(arena_t *arena, unsigned *nthreads, size_t *nactive, size_t *ndirty) { - *nthreads += arena_nthreads_get(arena); + *nthreads += arena_nthreads_get(arena, false); *dss = dss_prec_names[arena->dss_prec]; *lg_dirty_mult = arena->lg_dirty_mult; *decay_time = arena->decay_time; @@ -3294,24 +3293,24 @@ arena_stats_merge(tsd_t *tsd, arena_t *arena, unsigned *nthreads, } unsigned -arena_nthreads_get(arena_t *arena) +arena_nthreads_get(arena_t *arena, bool internal) { - return (atomic_read_u(&arena->nthreads)); + return (atomic_read_u(&arena->nthreads[internal])); } void -arena_nthreads_inc(arena_t *arena) +arena_nthreads_inc(arena_t *arena, bool internal) { - atomic_add_u(&arena->nthreads, 1); + atomic_add_u(&arena->nthreads[internal], 1); } void -arena_nthreads_dec(arena_t *arena) +arena_nthreads_dec(arena_t *arena, bool internal) { - atomic_sub_u(&arena->nthreads, 1); + atomic_sub_u(&arena->nthreads[internal], 1); } arena_t * @@ -3338,7 +3337,7 @@ arena_new(tsd_t *tsd, unsigned ind) return (NULL); arena->ind = ind; - arena->nthreads = 0; + arena->nthreads[0] = arena->nthreads[1] = 0; if (malloc_mutex_init(&arena->lock, "arena", WITNESS_RANK_ARENA)) return (NULL); diff --git a/src/ckh.c b/src/ckh.c index 07b49dd2..aa9803e8 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -271,7 +271,7 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) goto label_return; } tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, - true, NULL); + true, arena_choose(tsd, NULL, true)); if (tab == NULL) { ret = true; goto label_return; @@ -283,12 +283,12 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloctm(tsd, tab, tcache_get(tsd, false), true, true); + idalloctm(tsd, tab, NULL, true, true); break; } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true); + idalloctm(tsd, ckh->tab, NULL, true, true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; } @@ -315,7 +315,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) if (unlikely(usize == 0 || usize > HUGE_MAXCLASS)) return; tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true, - NULL); + arena_choose(tsd, NULL, true)); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -330,7 +330,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloctm(tsd, tab, tcache_get(tsd, false), true, true); + idalloctm(tsd, tab, NULL, true, true); #ifdef CKH_COUNT ckh->nshrinks++; #endif @@ -338,7 +338,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) } /* Rebuilding failed, so back out partially rebuilt table. 
*/ - idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true); + idalloctm(tsd, ckh->tab, NULL, true, true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; #ifdef CKH_COUNT @@ -392,7 +392,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, goto label_return; } ckh->tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true, - NULL); + arena_choose(tsd, NULL, true)); if (ckh->tab == NULL) { ret = true; goto label_return; @@ -421,7 +421,7 @@ ckh_delete(tsd_t *tsd, ckh_t *ckh) (unsigned long long)ckh->nrelocs); #endif - idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true, true); + idalloctm(tsd, ckh->tab, NULL, true, true); if (config_debug) memset(ckh, JEMALLOC_FREE_JUNK, sizeof(ckh_t)); } diff --git a/src/ctl.c b/src/ctl.c index 50faee7b..fad2fdd7 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1304,7 +1304,7 @@ thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, arena_t *oldarena; unsigned newind, oldind; - oldarena = arena_choose(tsd, NULL); + oldarena = arena_choose(tsd, NULL, false); if (oldarena == NULL) return (EAGAIN); diff --git a/src/huge.c b/src/huge.c index 3a802dee..bac2425f 100644 --- a/src/huge.c +++ b/src/huge.c @@ -31,18 +31,17 @@ huge_node_unset(const void *ptr, const extent_node_t *node) } void * -huge_malloc(tsd_t *tsd, arena_t *arena, size_t usize, bool zero, - tcache_t *tcache) +huge_malloc(tsd_t *tsd, arena_t *arena, size_t usize, bool zero) { assert(usize == s2u(usize)); - return (huge_palloc(tsd, arena, usize, chunksize, zero, tcache)); + return (huge_palloc(tsd, arena, usize, chunksize, zero)); } void * huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, - bool zero, tcache_t *tcache) + bool zero) { void *ret; size_t ausize; @@ -58,7 +57,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, /* Allocate an extent node with which to track the chunk. */ node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), - CACHELINE, false, tcache, true, arena); + CACHELINE, false, NULL, true, arena_choose(tsd, NULL, true)); if (node == NULL) return (NULL); @@ -67,10 +66,10 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, * it is possible to make correct junk/zero fill decisions below. */ is_zeroed = zero; - arena = arena_choose(tsd, arena); + arena = arena_choose(tsd, arena, false); if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(tsd, arena, usize, alignment, &is_zeroed)) == NULL) { - idalloctm(tsd, node, tcache, true, true); + idalloctm(tsd, node, NULL, true, true); return (NULL); } @@ -78,7 +77,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, if (huge_node_set(tsd, ret, node)) { arena_chunk_dalloc_huge(tsd, arena, ret, usize); - idalloctm(tsd, node, tcache, true, true); + idalloctm(tsd, node, NULL, true, true); return (NULL); } @@ -331,12 +330,12 @@ huge_ralloc_no_move(tsd_t *tsd, void *ptr, size_t oldsize, size_t usize_min, static void * huge_ralloc_move_helper(tsd_t *tsd, arena_t *arena, size_t usize, - size_t alignment, bool zero, tcache_t *tcache) + size_t alignment, bool zero) { if (alignment <= chunksize) - return (huge_malloc(tsd, arena, usize, zero, tcache)); - return (huge_palloc(tsd, arena, usize, alignment, zero, tcache)); + return (huge_malloc(tsd, arena, usize, zero)); + return (huge_palloc(tsd, arena, usize, alignment, zero)); } void * @@ -358,8 +357,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t usize, * different size class. 
In that case, fall back to allocating new * space and copying. */ - ret = huge_ralloc_move_helper(tsd, arena, usize, alignment, zero, - tcache); + ret = huge_ralloc_move_helper(tsd, arena, usize, alignment, zero); if (ret == NULL) return (NULL); @@ -370,7 +368,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t usize, } void -huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) +huge_dalloc(tsd_t *tsd, void *ptr) { extent_node_t *node; arena_t *arena; @@ -386,7 +384,7 @@ huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) extent_node_size_get(node)); arena_chunk_dalloc_huge(tsd, extent_node_arena_get(node), extent_node_addr_get(node), extent_node_size_get(node)); - idalloctm(tsd, node, tcache, true, true); + idalloctm(tsd, node, NULL, true, true); arena_decay_tick(tsd, arena); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 7543dff1..3bd39c3c 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -60,7 +60,7 @@ static malloc_mutex_t arenas_lock; arena_t **arenas; static unsigned narenas_total; /* Use narenas_total_*(). */ static arena_t *a0; /* arenas[0]; read-only after initialization. */ -static unsigned narenas_auto; /* Read-only after initialization. */ +unsigned narenas_auto; /* Read-only after initialization. */ typedef enum { malloc_init_uninitialized = 3, @@ -318,8 +318,8 @@ a0ialloc(size_t size, bool zero, bool is_metadata) if (unlikely(malloc_init_a0())) return (NULL); - return (iallocztm(NULL, size, size2index(size), zero, false, - is_metadata, arena_get(NULL, 0, false), true)); + return (iallocztm(NULL, size, size2index(size), zero, NULL, + is_metadata, arena_get(NULL, 0, true), true)); } static void @@ -451,15 +451,19 @@ arena_init(tsd_t *tsd, unsigned ind) } static void -arena_bind(tsd_t *tsd, unsigned ind) +arena_bind(tsd_t *tsd, unsigned ind, bool internal) { arena_t *arena; arena = arena_get(tsd, ind, false); - arena_nthreads_inc(arena); + arena_nthreads_inc(arena, internal); - if (tsd_nominal(tsd)) - tsd_arena_set(tsd, arena); + if (tsd_nominal(tsd)) { + if (internal) + tsd_iarena_set(tsd, arena); + else + tsd_arena_set(tsd, arena); + } } void @@ -469,19 +473,22 @@ arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind) oldarena = arena_get(tsd, oldind, false); newarena = arena_get(tsd, newind, false); - arena_nthreads_dec(oldarena); - arena_nthreads_inc(newarena); + arena_nthreads_dec(oldarena, false); + arena_nthreads_inc(newarena, false); tsd_arena_set(tsd, newarena); } static void -arena_unbind(tsd_t *tsd, unsigned ind) +arena_unbind(tsd_t *tsd, unsigned ind, bool internal) { arena_t *arena; arena = arena_get(tsd, ind, false); - arena_nthreads_dec(arena); - tsd_arena_set(tsd, NULL); + arena_nthreads_dec(arena, internal); + if (internal) + tsd_iarena_set(tsd, NULL); + else + tsd_arena_set(tsd, NULL); } arena_tdata_t * @@ -562,14 +569,24 @@ label_return: /* Slow path, called only by arena_choose(). */ arena_t * -arena_choose_hard(tsd_t *tsd) +arena_choose_hard(tsd_t *tsd, bool internal) { - arena_t *ret; + arena_t *ret JEMALLOC_CC_SILENCE_INIT(NULL); if (narenas_auto > 1) { - unsigned i, choose, first_null; + unsigned i, j, choose[2], first_null; + + /* + * Determine binding for both non-internal and internal + * allocation. + * + * choose[0]: For application allocation. + * choose[1]: For internal metadata allocation. 
+ */ + + for (j = 0; j < 2; j++) + choose[j] = 0; - choose = 0; first_null = narenas_auto; malloc_mutex_lock(tsd, &arenas_lock); assert(arena_get(tsd, 0, false) != NULL); @@ -579,10 +596,13 @@ arena_choose_hard(tsd_t *tsd) * Choose the first arena that has the lowest * number of threads assigned to it. */ - if (arena_nthreads_get(arena_get(tsd, i, false)) - < arena_nthreads_get(arena_get(tsd, choose, - false))) - choose = i; + for (j = 0; j < 2; j++) { + if (arena_nthreads_get(arena_get(tsd, i, + false), !!j) < + arena_nthreads_get(arena_get(tsd, + choose[j], false), !!j)) + choose[j] = i; + } } else if (first_null == narenas_auto) { /* * Record the index of the first uninitialized @@ -597,27 +617,35 @@ arena_choose_hard(tsd_t *tsd) } } - if (arena_nthreads_get(arena_get(tsd, choose, false)) == 0 - || first_null == narenas_auto) { - /* - * Use an unloaded arena, or the least loaded arena if - * all arenas are already initialized. - */ - ret = arena_get(tsd, choose, false); - } else { - /* Initialize a new arena. */ - choose = first_null; - ret = arena_init_locked(tsd, choose); - if (ret == NULL) { - malloc_mutex_unlock(tsd, &arenas_lock); - return (NULL); + for (j = 0; j < 2; j++) { + if (arena_nthreads_get(arena_get(tsd, choose[j], false), + !!j) == 0 || first_null != narenas_auto) { + /* + * Use an unloaded arena, or the least loaded + * arena if all arenas are already initialized. + */ + if (!!j == internal) + ret = arena_get(tsd, choose[j], false); + } else { + arena_t *arena; + + /* Initialize a new arena. */ + choose[j] = first_null; + arena = arena_init_locked(tsd, choose[j]); + if (arena == NULL) { + malloc_mutex_unlock(tsd, &arenas_lock); + return (NULL); + } + if (!!j == internal) + ret = arena; } + arena_bind(tsd, choose[j], !!j); } - arena_bind(tsd, choose); malloc_mutex_unlock(tsd, &arenas_lock); } else { ret = arena_get(tsd, 0, false); - arena_bind(tsd, 0); + arena_bind(tsd, 0, false); + arena_bind(tsd, 0, true); } return (ret); @@ -637,6 +665,16 @@ thread_deallocated_cleanup(tsd_t *tsd) /* Do nothing. */ } +void +iarena_cleanup(tsd_t *tsd) +{ + arena_t *iarena; + + iarena = tsd_iarena_get(tsd); + if (iarena != NULL) + arena_unbind(tsd, iarena->ind, true); +} + void arena_cleanup(tsd_t *tsd) { @@ -644,7 +682,7 @@ arena_cleanup(tsd_t *tsd) arena = tsd_arena_get(tsd); if (arena != NULL) - arena_unbind(tsd, arena->ind); + arena_unbind(tsd, arena->ind, false); } void diff --git a/src/prof.c b/src/prof.c index 520bf90a..82604632 100644 --- a/src/prof.c +++ b/src/prof.c @@ -554,7 +554,8 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) */ size_t size = offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *)); prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, size, - size2index(size), false, tcache_get(tsd, true), true, NULL, true); + size2index(size), false, NULL, true, arena_get(NULL, 0, true), + true); if (gctx == NULL) return (NULL); gctx->lock = prof_gctx_mutex_choose(); @@ -595,7 +596,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx, prof_leave(tsd, tdata_self); /* Destroy gctx. 
*/ malloc_mutex_unlock(tsd, gctx->lock); - idalloctm(tsd, gctx, tcache_get(tsd, false), true, true); + idalloctm(tsd, gctx, NULL, true, true); } else { /* * Compensate for increment in prof_tctx_destroy() or @@ -706,7 +707,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) prof_tdata_destroy(tsd, tdata, false); if (destroy_tctx) - idalloctm(tsd, tctx, tcache_get(tsd, false), true, true); + idalloctm(tsd, tctx, NULL, true, true); } static bool @@ -735,8 +736,7 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. */ prof_leave(tsd, tdata); - idalloctm(tsd, gctx.v, tcache_get(tsd, false), true, - true); + idalloctm(tsd, gctx.v, NULL, true, true); return (true); } new_gctx = true; @@ -780,7 +780,6 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) ret.p->prepared = true; malloc_mutex_unlock(tsd, tdata->lock); if (not_found) { - tcache_t *tcache; void *btkey; prof_gctx_t *gctx; bool new_gctx, error; @@ -794,10 +793,9 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) return (NULL); /* Link a prof_tctx_t into gctx for this thread. */ - tcache = tcache_get(tsd, true); ret.v = iallocztm(tsd, sizeof(prof_tctx_t), - size2index(sizeof(prof_tctx_t)), false, tcache, true, NULL, - true); + size2index(sizeof(prof_tctx_t)), false, NULL, true, + arena_choose(tsd, NULL, true), true); if (ret.p == NULL) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); @@ -817,7 +815,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) if (error) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); - idalloctm(tsd, ret.v, tcache, true, true); + idalloctm(tsd, ret.v, NULL, true, true); return (NULL); } malloc_mutex_lock(tsd, gctx->lock); @@ -1238,8 +1236,8 @@ prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) to_destroy); tctx_tree_remove(&gctx->tctxs, to_destroy); - idalloctm(tsd, to_destroy, - tcache_get(tsd, false), true, true); + idalloctm(tsd, to_destroy, NULL, true, + true); } else next = NULL; } while (next != NULL); @@ -1771,14 +1769,13 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, char *thread_name, bool active) { prof_tdata_t *tdata; - tcache_t *tcache; cassert(config_prof); /* Initialize an empty cache for this thread. 
*/ - tcache = tcache_get(tsd, true); tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t), - size2index(sizeof(prof_tdata_t)), false, tcache, true, NULL, true); + size2index(sizeof(prof_tdata_t)), false, NULL, true, arena_get(NULL, + 0, true), true); if (tdata == NULL) return (NULL); @@ -1792,7 +1789,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloctm(tsd, tdata, tcache, true, true); + idalloctm(tsd, tdata, NULL, true, true); return (NULL); } @@ -1848,7 +1845,6 @@ static void prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { - tcache_t *tcache; malloc_mutex_assert_owner(tsd, &tdatas_mtx); @@ -1859,11 +1855,10 @@ prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, assert(prof_tdata_should_destroy_unlocked(tsd, tdata, even_if_attached)); - tcache = tcache_get(tsd, false); if (tdata->thread_name != NULL) - idalloctm(tsd, tdata->thread_name, tcache, true, true); + idalloctm(tsd, tdata->thread_name, NULL, true, true); ckh_delete(tsd, &tdata->bt2tctx); - idalloctm(tsd, tdata, tcache, true, true); + idalloctm(tsd, tdata, NULL, true, true); } static void @@ -2023,8 +2018,8 @@ prof_thread_name_alloc(tsd_t *tsd, const char *thread_name) if (size == 1) return (""); - ret = iallocztm(tsd, size, size2index(size), false, tcache_get(tsd, - true), true, NULL, true); + ret = iallocztm(tsd, size, size2index(size), false, NULL, true, + arena_get(NULL, 0, true), true); if (ret == NULL) return (NULL); memcpy(ret, thread_name, size); @@ -2056,8 +2051,7 @@ prof_thread_name_set(tsd_t *tsd, const char *thread_name) return (EAGAIN); if (tdata->thread_name != NULL) { - idalloctm(tsd, tdata->thread_name, tcache_get(tsd, false), - true, true); + idalloctm(tsd, tdata->thread_name, NULL, true, true); tdata->thread_name = NULL; } if (strlen(s) > 0) diff --git a/src/quarantine.c b/src/quarantine.c index 6cb74b37..ff1637ec 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -30,7 +30,7 @@ quarantine_init(tsd_t *tsd, size_t lg_maxobjs) size = offsetof(quarantine_t, objs) + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)); quarantine = (quarantine_t *)iallocztm(tsd, size, size2index(size), - false, tcache_get(tsd, true), true, NULL, true); + false, NULL, true, arena_get(NULL, 0, true), true); if (quarantine == NULL) return (NULL); quarantine->curbytes = 0; @@ -57,7 +57,7 @@ quarantine_alloc_hook_work(tsd_t *tsd) if (tsd_quarantine_get(tsd) == NULL) tsd_quarantine_set(tsd, quarantine); else - idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true); + idalloctm(tsd, quarantine, NULL, true, true); } static quarantine_t * @@ -89,7 +89,7 @@ quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b * sizeof(quarantine_obj_t)); } - idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true); + idalloctm(tsd, quarantine, NULL, true, true); tsd_quarantine_set(tsd, ret); return (ret); @@ -179,7 +179,7 @@ quarantine_cleanup(tsd_t *tsd) quarantine = tsd_quarantine_get(tsd); if (quarantine != NULL) { quarantine_drain(tsd, quarantine, 0); - idalloctm(tsd, quarantine, tcache_get(tsd, false), true, true); + idalloctm(tsd, quarantine, NULL, true, true); tsd_quarantine_set(tsd, NULL); } } diff --git a/src/tcache.c b/src/tcache.c index a9539f64..ca867c72 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -97,7 +97,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, assert(binind < 
NBINS); assert(rem <= tbin->ncached); - arena = arena_choose(tsd, NULL); + arena = arena_choose(tsd, NULL, false); assert(arena != NULL); for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. */ @@ -179,7 +179,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, assert(binind < nhbins); assert(rem <= tbin->ncached); - arena = arena_choose(tsd, NULL); + arena = arena_choose(tsd, NULL, false); assert(arena != NULL); for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ @@ -307,7 +307,7 @@ tcache_get_hard(tsd_t *tsd) tcache_enabled_set(false); /* Memoize. */ return (NULL); } - arena = arena_choose(tsd, NULL); + arena = arena_choose(tsd, NULL, false); if (unlikely(arena == NULL)) return (NULL); return (tcache_create(tsd, arena)); @@ -328,8 +328,8 @@ tcache_create(tsd_t *tsd, arena_t *arena) /* Avoid false cacheline sharing. */ size = sa2u(size, CACHELINE); - tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, - arena_get(tsd, 0, false)); + tcache = ipallocztm(tsd, size, CACHELINE, true, NULL, true, + arena_get(NULL, 0, true)); if (tcache == NULL) return (NULL); @@ -359,7 +359,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) arena_t *arena; unsigned i; - arena = arena_choose(tsd, NULL); + arena = arena_choose(tsd, NULL, false); tcache_arena_dissociate(tsd, tcache, arena); for (i = 0; i < NBINS; i++) { @@ -391,7 +391,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) arena_prof_accum(tsd, arena, tcache->prof_accumbytes)) prof_idump(tsd); - idalloctm(tsd, tcache, false, true, true); + idalloctm(tsd, tcache, NULL, true, true); } void @@ -446,6 +446,7 @@ tcache_stats_merge(tsd_t *tsd, tcache_t *tcache, arena_t *arena) bool tcaches_create(tsd_t *tsd, unsigned *r_ind) { + arena_t *arena; tcache_t *tcache; tcaches_t *elm; @@ -458,7 +459,10 @@ tcaches_create(tsd_t *tsd, unsigned *r_ind) if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX) return (true); - tcache = tcache_create(tsd, arena_get(tsd, 0, false)); + arena = arena_choose(tsd, NULL, true); + if (unlikely(arena == NULL)) + return (true); + tcache = tcache_create(tsd, arena); if (tcache == NULL) return (true);
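
For illustration (these helpers are not part of the patch): every internal
metadata call site converted above follows the same shape: no tcache,
is_metadata == true, and an arena obtained with the new internal flag so that
it lies within the first opt_narenas (automatic) arenas.  A minimal sketch of
that pattern follows, assuming the jemalloc-internal declarations visible in
the hunks above (arena_choose(), iallocztm(), idalloctm(), size2index()) are
in scope; the imeta_* wrappers themselves are hypothetical.

#include "jemalloc/internal/jemalloc_internal.h"

/* Allocate internal metadata from an automatic arena, bypassing tcaches. */
static void *
imeta_alloc(tsd_t *tsd, size_t size)
{
	arena_t *iarena;

	/* internal == true: never selects an arenas.extend arena. */
	iarena = arena_choose(tsd, NULL, true);
	if (unlikely(iarena == NULL))
		return (NULL);
	/*
	 * tcache == NULL and is_metadata == true satisfy the new asserts in
	 * iallocztm(); iarena->ind < narenas_auto by construction.
	 */
	return (iallocztm(tsd, size, size2index(size), false, NULL, true,
	    iarena, true));
}

/* Free internal metadata; again no tcache is involved. */
static void
imeta_dalloc(tsd_t *tsd, void *ptr)
{

	idalloctm(tsd, ptr, NULL, true, true);
}

The same goal explains why huge_malloc(), huge_palloc(), and huge_dalloc()
lose their tcache parameter: a tcache fills itself from, and flushes back to,
its associated arena, which may be a manually created one, so routing huge
extent-node metadata through a tcache would defeat the ind < narenas_auto
guarantee stated in the commit message.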
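
The two-slot nthreads counter added to arena_s is read with the same boolean
index; the reporting helper below is hypothetical and exists only to
illustrate that index, while arena_nthreads_get() and malloc_printf() are
existing internals.

/* Report both load counters for one arena (illustration only). */
static void
arena_report_load(arena_t *arena)
{
	unsigned app, meta;

	app = arena_nthreads_get(arena, false);		/* nthreads[0] */
	meta = arena_nthreads_get(arena, true);		/* nthreads[1] */
	malloc_printf("arena %u: %u application thread(s), %u internal "
	    "metadata binding(s)\n", arena->ind, app, meta);
}

Because arena_choose_hard() now balances choose[0] and choose[1]
independently, a thread's application arena and its metadata arena may
differ, and the two bindings are torn down separately (arena_cleanup() for
the former, the new iarena_cleanup() for the latter).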