From ec532e2c5c0b25fb7ab09383fe5a274583a90def Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Thu, 2 Feb 2017 17:02:05 -0800
Subject: [PATCH] Implement per-CPU arena.

The new feature, opt.percpu_arena, determines thread-arena association
dynamically based on CPU id. Three modes are supported: "percpu", "phycpu"
and disabled.

"percpu" uses the current core id (with help from sched_getcpu()) directly
as the arena index, while "phycpu" assigns threads on the same physical CPU
to the same arena. In other words, "percpu" means # of arenas == # of CPUs,
while "phycpu" has # of arenas == 1/2 * (# of CPUs). Note that there is no
runtime check yet for whether hyper-threading is actually enabled.

When enabled, threads are migrated between arenas whenever a CPU change is
detected. In the current design, to reduce the overhead of reading the CPU
id, each arena tracks the thread that accessed it most recently; the CPU id
is read, and the arena updated if necessary, only when a different thread
comes in.
---
 configure.ac                                  |   9 ++
 include/jemalloc/internal/arena_externs.h     |   4 +
 include/jemalloc/internal/arena_inlines_a.h   |  25 ++++
 include/jemalloc/internal/arena_structs_b.h   |   7 +
 include/jemalloc/internal/arena_types.h       |  16 ++
 .../jemalloc/internal/jemalloc_internal.h.in  | 123 +++++++++++++---
 .../internal/jemalloc_internal_defs.h.in      |   3 +
 include/jemalloc/internal/private_symbols.txt |   4 +
 src/arena.c                                   |  10 ++
 src/ctl.c                                     |  18 ++-
 src/jemalloc.c                                | 139 +++++++++++++++---
 src/stats.c                                   |   1 +
 src/tcache.c                                  |  11 +-
 test/integration/thread_arena.c               |  25 +++-
 test/unit/mallctl.c                           |  39 +++--
 test/unit/stats.c                             | 100 ++++++-------
 16 files changed, 415 insertions(+), 119 deletions(-)

diff --git a/configure.ac b/configure.ac
index 0095caf1..96b105f3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1598,6 +1598,15 @@ if test "x$have_secure_getenv" = "x1" ; then
   AC_DEFINE([JEMALLOC_HAVE_SECURE_GETENV], [ ])
 fi
 
+dnl Check if the GNU-specific sched_getcpu function exists.
+AC_CHECK_FUNC([sched_getcpu],
+              [have_sched_getcpu="1"],
+              [have_sched_getcpu="0"]
+             )
+if test "x$have_sched_getcpu" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_SCHED_GETCPU], [ ])
+fi
+
 dnl Check if the Solaris/BSD issetugid function exists.
AC_CHECK_FUNC([issetugid], [have_issetugid="1"], diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 2df55184..349bae99 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -13,6 +13,10 @@ extern ssize_t opt_decay_time; extern const arena_bin_info_t arena_bin_info[NBINS]; +extern percpu_arena_mode_t percpu_arena_mode; +extern const char *opt_percpu_arena; +extern const char *percpu_arena_mode_names[]; + void arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, szind_t szind, uint64_t nrequests); void arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, diff --git a/include/jemalloc/internal/arena_inlines_a.h b/include/jemalloc/internal/arena_inlines_a.h index ea7e0995..9dd5304c 100644 --- a/include/jemalloc/internal/arena_inlines_a.h +++ b/include/jemalloc/internal/arena_inlines_a.h @@ -7,6 +7,7 @@ void arena_internal_add(arena_t *arena, size_t size); void arena_internal_sub(arena_t *arena, size_t size); size_t arena_internal_get(arena_t *arena); bool arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes); +void percpu_arena_update(tsd_t *tsd, unsigned cpu); #endif /* JEMALLOC_ENABLE_INLINE */ #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) @@ -42,6 +43,30 @@ arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) { return prof_accum_add(tsdn, &arena->prof_accum, accumbytes); } +JEMALLOC_INLINE void +percpu_arena_update(tsd_t *tsd, unsigned cpu) { + assert(have_percpu_arena); + arena_t *oldarena = tsd_arena_get(tsd); + assert(oldarena != NULL); + unsigned oldind = arena_ind_get(oldarena); + + if (oldind != cpu) { + unsigned newind = cpu; + arena_t *newarena = arena_get(tsd_tsdn(tsd), newind, true); + assert(newarena != NULL); + + /* Set new arena/tcache associations. */ + arena_migrate(tsd, oldind, newind); + if (config_tcache) { + tcache_t *tcache = tsd_tcache_get(tsd); + if (tcache) { + tcache_arena_reassociate(tsd_tsdn(tsd), tcache, + newarena); + } + } + } +} + #endif /* (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) */ #endif /* JEMALLOC_INTERNAL_ARENA_INLINES_A_H */ diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h index ebcdbc4d..ba8bb8ad 100644 --- a/include/jemalloc/internal/arena_structs_b.h +++ b/include/jemalloc/internal/arena_structs_b.h @@ -125,6 +125,13 @@ struct arena_s { */ unsigned nthreads[2]; + /* + * When percpu_arena is enabled, to amortize the cost of reading / + * updating the current CPU id, track the most recent thread accessing + * this arena, and only read CPU if there is a mismatch. + */ + tsdn_t *last_thd; + /* Synchronization: internal. */ arena_stats_t stats; diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index d821be45..067c9ee9 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -19,4 +19,20 @@ typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; typedef struct arena_tdata_s arena_tdata_t; +typedef enum { + percpu_arena_disabled = 0, + percpu_arena = 1, + per_phycpu_arena = 2, /* i.e. hyper threads share arena. 
*/ + + percpu_arena_mode_limit = 3 +} percpu_arena_mode_t; + +#ifdef JEMALLOC_PERCPU_ARENA +#define PERCPU_ARENA_MODE_DEFAULT percpu_arena +#define OPT_PERCPU_ARENA_DEFAULT "percpu" +#else +#define PERCPU_ARENA_MODE_DEFAULT percpu_arena_disabled +#define OPT_PERCPU_ARENA_DEFAULT "disabled" +#endif + #endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 8d2ec7dd..97b41bb0 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -145,6 +145,17 @@ static const bool have_thp = false #endif ; +#ifdef JEMALLOC_HAVE_SCHED_GETCPU +/* Currently percpu_arena depends on sched_getcpu. */ +#define JEMALLOC_PERCPU_ARENA +#endif +static const bool have_percpu_arena = +#ifdef JEMALLOC_PERCPU_ARENA + true +#else + false +#endif + ; #if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) #include @@ -220,6 +231,9 @@ typedef unsigned pszind_t; /* Size class index type. */ typedef unsigned szind_t; +/* Processor / core id type. */ +typedef int malloc_cpuid_t; + /* * Flags bits: * @@ -455,7 +469,7 @@ extern unsigned narenas_auto; * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. */ -extern arena_t **arenas; +extern arena_t *arenas[]; /* * pind2sz_tab encodes the same information as could be computed by @@ -548,6 +562,10 @@ arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind, bool refresh_if_missing); arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing); ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind); +malloc_cpuid_t malloc_getcpu(void); +unsigned percpu_arena_choose(void); +unsigned percpu_arena_ind_limit(void); + #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) @@ -818,32 +836,53 @@ sa2u(size_t size, size_t alignment) { return usize; } -/* Choose an arena based on a per-thread value. */ -JEMALLOC_INLINE arena_t * -arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) { - arena_t *ret; +JEMALLOC_ALWAYS_INLINE malloc_cpuid_t +malloc_getcpu(void) { + assert(have_percpu_arena); +#if defined(JEMALLOC_HAVE_SCHED_GETCPU) + return (malloc_cpuid_t)sched_getcpu(); +#else + not_reached(); + return -1; +#endif +} - if (arena != NULL) { - return arena; +/* Return the chosen arena index based on current cpu. */ +JEMALLOC_ALWAYS_INLINE unsigned +percpu_arena_choose(void) { + unsigned arena_ind; + assert(have_percpu_arena && (percpu_arena_mode != percpu_arena_disabled)); + + malloc_cpuid_t cpuid = malloc_getcpu(); + assert(cpuid >= 0); + if ((percpu_arena_mode == percpu_arena) || + ((unsigned)cpuid < ncpus / 2)) { + arena_ind = cpuid; + } else { + assert(percpu_arena_mode == per_phycpu_arena); + /* Hyper threads on the same physical CPU share arena. */ + arena_ind = cpuid - ncpus / 2; } - ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd); - if (unlikely(ret == NULL)) { - ret = arena_choose_hard(tsd, internal); + return arena_ind; +} + +/* Return the limit of percpu auto arena range, i.e. arenas[0...ind_limit). */ +JEMALLOC_ALWAYS_INLINE unsigned +percpu_arena_ind_limit(void) { + assert(have_percpu_arena && (percpu_arena_mode != percpu_arena_disabled)); + if (percpu_arena_mode == per_phycpu_arena && ncpus > 1) { + if (ncpus % 2) { + /* This likely means a misconfig. 
*/ + return ncpus / 2 + 1; + } + return ncpus / 2; + } else { + return ncpus; } - - return ret; } -JEMALLOC_INLINE arena_t * -arena_choose(tsd_t *tsd, arena_t *arena) { - return arena_choose_impl(tsd, arena, false); -} -JEMALLOC_INLINE arena_t * -arena_ichoose(tsd_t *tsd, arena_t *arena) { - return arena_choose_impl(tsd, arena, true); -} JEMALLOC_INLINE arena_tdata_t * arena_tdata_get(tsd_t *tsd, unsigned ind, bool refresh_if_missing) { @@ -912,6 +951,50 @@ extent_t *iealloc(tsdn_t *tsdn, const void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +/* Choose an arena based on a per-thread value. */ +JEMALLOC_INLINE arena_t * +arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) { + arena_t *ret; + + if (arena != NULL) { + return arena; + } + + ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd); + if (unlikely(ret == NULL)) { + ret = arena_choose_hard(tsd, internal); + } + + assert(ret != NULL); + /* + * Note that for percpu arena, if the current arena is outside of the + * auto percpu arena range, (i.e. thread is assigned to a manually + * managed arena), then percpu arena is skipped. + */ + if (have_percpu_arena && (percpu_arena_mode != percpu_arena_disabled) && + (arena_ind_get(ret) < percpu_arena_ind_limit()) && + (ret->last_thd != tsd_tsdn(tsd))) { + unsigned ind = percpu_arena_choose(); + if (arena_ind_get(ret) != ind) { + percpu_arena_update(tsd, ind); + ret = tsd_arena_get(tsd); + } + ret->last_thd = tsd_tsdn(tsd); + } + + return ret; +} + +JEMALLOC_INLINE arena_t * +arena_choose(tsd_t *tsd, arena_t *arena) { + return arena_choose_impl(tsd, arena, false); +} + +JEMALLOC_INLINE arena_t * +arena_ichoose(tsd_t *tsd, arena_t *arena) { + return arena_choose_impl(tsd, arena, true); +} + JEMALLOC_ALWAYS_INLINE extent_t * iealloc(tsdn_t *tsdn, const void *ptr) { return extent_lookup(tsdn, ptr, true); diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index b2e0077e..500f4274 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -308,6 +308,9 @@ /* Adaptive mutex support in pthreads. */ #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP +/* GNU specific sched_getcpu support */ +#undef JEMALLOC_HAVE_SCHED_GETCPU + /* * If defined, jemalloc symbols are not exported (doesn't work when * JEMALLOC_PREFIX is not defined). diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 64bea334..c0211e58 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -258,6 +258,7 @@ large_salloc lg_floor lg_prof_sample malloc_cprintf +malloc_getcpu malloc_mutex_assert_not_owner malloc_mutex_assert_owner malloc_mutex_boot @@ -330,6 +331,9 @@ pages_purge_forced pages_purge_lazy pages_trim pages_unmap +percpu_arena_choose +percpu_arena_ind_limit +percpu_arena_update pind2sz pind2sz_compute pind2sz_lookup diff --git a/src/arena.c b/src/arena.c index 43bad81c..a3a1fdd7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -4,6 +4,15 @@ /******************************************************************************/ /* Data. 
*/ +const char *percpu_arena_mode_names[] = { + "disabled", + "percpu", + "phycpu" +}; + +const char *opt_percpu_arena = OPT_PERCPU_ARENA_DEFAULT; +percpu_arena_mode_t percpu_arena_mode = PERCPU_ARENA_MODE_DEFAULT; + ssize_t opt_decay_time = DECAY_TIME_DEFAULT; static ssize_t decay_time_default; @@ -1629,6 +1638,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } arena->nthreads[0] = arena->nthreads[1] = 0; + arena->last_thd = NULL; if (config_stats) { if (arena_stats_init(tsdn, &arena->stats)) { diff --git a/src/ctl.c b/src/ctl.c index 831877b0..d4ab699f 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -72,6 +72,7 @@ CTL_PROTO(config_xmalloc) CTL_PROTO(opt_abort) CTL_PROTO(opt_dss) CTL_PROTO(opt_narenas) +CTL_PROTO(opt_percpu_arena) CTL_PROTO(opt_decay_time) CTL_PROTO(opt_stats_print) CTL_PROTO(opt_junk) @@ -229,6 +230,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("abort"), CTL(opt_abort)}, {NAME("dss"), CTL(opt_dss)}, {NAME("narenas"), CTL(opt_narenas)}, + {NAME("percpu_arena"), CTL(opt_percpu_arena)}, {NAME("decay_time"), CTL(opt_decay_time)}, {NAME("stats_print"), CTL(opt_stats_print)}, {NAME("junk"), CTL(opt_junk)}, @@ -1284,6 +1286,7 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool) CTL_RO_NL_GEN(opt_abort, opt_abort, bool) CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) +CTL_RO_NL_GEN(opt_percpu_arena, opt_percpu_arena, const char *) CTL_RO_NL_GEN(opt_decay_time, opt_decay_time, ssize_t) CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool) CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, const char *) @@ -1317,10 +1320,10 @@ thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, if (oldarena == NULL) { return EAGAIN; } - newind = oldind = arena_ind_get(oldarena); WRITE(newind, unsigned); READ(oldind, unsigned); + if (newind != oldind) { arena_t *newarena; @@ -1330,6 +1333,19 @@ thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, goto label_return; } + if (have_percpu_arena && + (percpu_arena_mode != percpu_arena_disabled)) { + if (newind < percpu_arena_ind_limit()) { + /* + * If perCPU arena is enabled, thread_arena + * control is not allowed for the auto arena + * range. + */ + ret = EPERM; + goto label_return; + } + } + /* Initialize arena if necessary. */ newarena = arena_get(tsd_tsdn(tsd), newind, true); if (newarena == NULL) { diff --git a/src/jemalloc.c b/src/jemalloc.c index ecfecf9c..ce84b3cf 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -56,7 +56,8 @@ static malloc_mutex_t arenas_lock; * arenas. arenas[narenas_auto..narenas_total) are only used if the application * takes some action to create them and allocate from them. */ -arena_t **arenas; +JEMALLOC_ALIGNED(CACHELINE) +arena_t *arenas[MALLOCX_ARENA_MAX + 1]; static unsigned narenas_total; /* Use narenas_total_*(). */ static arena_t *a0; /* arenas[0]; read-only after initialization. */ unsigned narenas_auto; /* Read-only after initialization. 
*/ @@ -543,6 +544,16 @@ arena_t * arena_choose_hard(tsd_t *tsd, bool internal) { arena_t *ret JEMALLOC_CC_SILENCE_INIT(NULL); + if (have_percpu_arena && percpu_arena_mode != percpu_arena_disabled) { + unsigned choose = percpu_arena_choose(); + ret = arena_get(tsd_tsdn(tsd), choose, true); + assert(ret != NULL); + arena_bind(tsd, arena_ind_get(ret), false); + arena_bind(tsd, arena_ind_get(ret), true); + + return ret; + } + if (narenas_auto > 1) { unsigned i, j, choose[2], first_null; @@ -1095,6 +1106,30 @@ malloc_conf_init(void) { "lg_tcache_max", -1, (sizeof(size_t) << 3) - 1) } + if (strncmp("percpu_arena", k, klen) == 0) { + int i; + bool match = false; + for (i = 0; i < percpu_arena_mode_limit; i++) { + if (strncmp(percpu_arena_mode_names[i], + v, vlen) == 0) { + if (!have_percpu_arena) { + malloc_conf_error( + "No getcpu support", + k, klen, v, vlen); + } + percpu_arena_mode = i; + opt_percpu_arena = + percpu_arena_mode_names[i]; + match = true; + break; + } + } + if (!match) { + malloc_conf_error("Invalid conf value", + k, klen, v, vlen); + } + continue; + } if (config_prof) { CONF_HANDLE_BOOL(opt_prof, "prof", true) CONF_HANDLE_CHAR_P(opt_prof_prefix, @@ -1204,8 +1239,6 @@ malloc_init_hard_a0_locked() { * malloc_ncpus(). */ narenas_auto = 1; - narenas_total_set(narenas_auto); - arenas = &a0; memset(arenas, 0, sizeof(arena_t *) * narenas_auto); /* * Initialize one arena here. The rest are lazily created in @@ -1215,7 +1248,7 @@ malloc_init_hard_a0_locked() { == NULL) { return true; } - + a0 = arena_get(TSDN_NULL, 0, false); malloc_init_state = malloc_init_a0_initialized; return false; @@ -1255,23 +1288,76 @@ malloc_init_hard_recursible(void) { return false; } -static bool -malloc_init_hard_finish(tsdn_t *tsdn) { - if (malloc_mutex_boot()) { - return true; +static unsigned +malloc_narenas_default(void) { + assert(ncpus > 0); + /* + * For SMP systems, create more than one arena per CPU by + * default. + */ + if (ncpus > 1) { + return ncpus << 2; + } else { + return 1; } +} - if (opt_narenas == 0) { - /* - * For SMP systems, create more than one arena per CPU by - * default. - */ - if (ncpus > 1) { - opt_narenas = ncpus << 2; +static bool +malloc_init_narenas(void) { + assert(ncpus > 0); + + if (percpu_arena_mode != percpu_arena_disabled) { + if (!have_percpu_arena || malloc_getcpu() < 0) { + percpu_arena_mode = percpu_arena_disabled; + malloc_printf(": perCPU arena getcpu() not " + "available. Setting narenas to %u.\n", opt_narenas ? + opt_narenas : malloc_narenas_default()); + if (opt_abort) { + abort(); + } } else { - opt_narenas = 1; + if (ncpus > MALLOCX_ARENA_MAX) { + malloc_printf(": narenas w/ percpu" + "arena beyond limit (%d)\n", ncpus); + if (opt_abort) { + abort(); + } + return true; + } + if ((percpu_arena_mode == per_phycpu_arena) && + (ncpus % 2 != 0)) { + malloc_printf(": invalid " + "configuration -- per physical CPU arena " + "with odd number (%u) of CPUs (no hyper " + "threading?).\n", ncpus); + if (opt_abort) + abort(); + } + unsigned n = percpu_arena_ind_limit(); + if (opt_narenas < n) { + /* + * If narenas is specified with percpu_arena + * enabled, actual narenas is set as the greater + * of the two. percpu_arena_choose will be free + * to use any of the arenas based on CPU + * id. This is conservative (at a small cost) + * but ensures correctness. + * + * If for some reason the ncpus determined at + * boot is not the actual number (e.g. because + * of affinity setting from numactl), reserving + * narenas this way provides a workaround for + * percpu_arena. 
+ */ + opt_narenas = n; + } } } + if (opt_narenas == 0) { + opt_narenas = malloc_narenas_default(); + } + assert(opt_narenas > 0); + narenas_auto = opt_narenas; /* * Limit the number of arenas to the indexing range of MALLOCX_ARENA(). @@ -1283,14 +1369,13 @@ malloc_init_hard_finish(tsdn_t *tsdn) { } narenas_total_set(narenas_auto); - /* Allocate and initialize arenas. */ - arenas = (arena_t **)base_alloc(tsdn, a0->base, sizeof(arena_t *) * - (MALLOCX_ARENA_MAX+1), CACHELINE); - if (arenas == NULL) { + return false; +} + +static bool +malloc_init_hard_finish(void) { + if (malloc_mutex_boot()) return true; - } - /* Copy the pointer to the one arena that was already initialized. */ - arena_set(0, a0); malloc_init_state = malloc_init_initialized; malloc_slow_flag_init(); @@ -1328,12 +1413,18 @@ malloc_init_hard(void) { } malloc_mutex_lock(tsd_tsdn(tsd), &init_lock); + /* Need this before prof_boot2 (for allocation). */ + if (malloc_init_narenas()) { + malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); + return true; + } + if (config_prof && prof_boot2(tsd)) { malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); return true; } - if (malloc_init_hard_finish(tsd_tsdn(tsd))) { + if (malloc_init_hard_finish()) { malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); return true; } diff --git a/src/stats.c b/src/stats.c index ae360e1b..776fb862 100644 --- a/src/stats.c +++ b/src/stats.c @@ -621,6 +621,7 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque, OPT_WRITE_BOOL(abort, ",") OPT_WRITE_CHAR_P(dss, ",") OPT_WRITE_UNSIGNED(narenas, ",") + OPT_WRITE_CHAR_P(percpu_arena, ",") OPT_WRITE_SSIZE_T_MUTABLE(decay_time, arenas.decay_time, ",") OPT_WRITE_CHAR_P(junk, ",") OPT_WRITE_BOOL(zero, ",") diff --git a/src/tcache.c b/src/tcache.c index 78570663..266bd1f5 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -357,12 +357,8 @@ tcache_create(tsdn_t *tsdn, arena_t *arena) { static void tcache_destroy(tsd_t *tsd, tcache_t *tcache) { - arena_t *arena; unsigned i; - arena = arena_choose(tsd, NULL); - tcache_arena_dissociate(tsd_tsdn(tsd), tcache); - for (i = 0; i < NBINS; i++) { tcache_bin_t *tbin = &tcache->tbins[i]; tcache_bin_flush_small(tsd, tcache, tbin, i, 0); @@ -381,6 +377,13 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) { } } + /* + * Get arena after flushing -- when using percpu arena, the associated + * arena could change during flush. 
+ */ + arena_t *arena = arena_choose(tsd, NULL); + tcache_arena_dissociate(tsd_tsdn(tsd), tcache); + if (config_prof && tcache->prof_accumbytes > 0 && arena_prof_accum(tsd_tsdn(tsd), arena, tcache->prof_accumbytes)) { prof_idump(tsd_tsdn(tsd)); diff --git a/test/integration/thread_arena.c b/test/integration/thread_arena.c index 9991a42f..1e5ec05d 100644 --- a/test/integration/thread_arena.c +++ b/test/integration/thread_arena.c @@ -37,10 +37,16 @@ thd_start(void *arg) { return NULL; } +static void +mallctl_failure(int err) { + char buf[BUFERROR_BUF]; + + buferror(err, buf, sizeof(buf)); + test_fail("Error in mallctl(): %s", buf); +} + TEST_BEGIN(test_thread_arena) { void *p; - unsigned arena_ind; - size_t size; int err; thd_t thds[NTHREADS]; unsigned i; @@ -48,13 +54,15 @@ TEST_BEGIN(test_thread_arena) { p = malloc(1); assert_ptr_not_null(p, "Error in malloc()"); - size = sizeof(arena_ind); - if ((err = mallctl("thread.arena", (void *)&arena_ind, &size, NULL, - 0))) { - char buf[BUFERROR_BUF]; + unsigned arena_ind, old_arena_ind; + size_t sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.create", (void *)&arena_ind, &sz, NULL, 0), + 0, "Arena creation failure"); - buferror(err, buf, sizeof(buf)); - test_fail("Error in mallctl(): %s", buf); + size_t size = sizeof(arena_ind); + if ((err = mallctl("thread.arena", (void *)&old_arena_ind, &size, + (void *)&arena_ind, sizeof(arena_ind))) != 0) { + mallctl_failure(err); } for (i = 0; i < NTHREADS; i++) { @@ -67,6 +75,7 @@ TEST_BEGIN(test_thread_arena) { thd_join(thds[i], (void *)&join_ret); assert_zd_eq(join_ret, 0, "Unexpected thread join error"); } + free(p); } TEST_END diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index c931e378..1aedbe8a 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -160,6 +160,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(bool, abort, always); TEST_MALLCTL_OPT(const char *, dss, always); TEST_MALLCTL_OPT(unsigned, narenas, always); + TEST_MALLCTL_OPT(const char *, percpu_arena, always); TEST_MALLCTL_OPT(ssize_t, decay_time, always); TEST_MALLCTL_OPT(bool, stats_print, always); TEST_MALLCTL_OPT(const char *, junk, fill); @@ -327,20 +328,38 @@ TEST_BEGIN(test_tcache) { TEST_END TEST_BEGIN(test_thread_arena) { - unsigned arena_old, arena_new, narenas; - size_t sz = sizeof(unsigned); + unsigned old_arena_ind, new_arena_ind, narenas; + const char *opt_percpu_arena; + size_t sz = sizeof(opt_percpu_arena); + assert_d_eq(mallctl("opt.percpu_arena", &opt_percpu_arena, &sz, NULL, + 0), 0, "Unexpected mallctl() failure"); + + sz = sizeof(unsigned); assert_d_eq(mallctl("arenas.narenas", (void *)&narenas, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); assert_u_eq(narenas, opt_narenas, "Number of arenas incorrect"); - arena_new = narenas - 1; - assert_d_eq(mallctl("thread.arena", (void *)&arena_old, &sz, - (void *)&arena_new, sizeof(unsigned)), 0, - "Unexpected mallctl() failure"); - arena_new = 0; - assert_d_eq(mallctl("thread.arena", (void *)&arena_old, &sz, - (void *)&arena_new, sizeof(unsigned)), 0, - "Unexpected mallctl() failure"); + + if (strcmp(opt_percpu_arena, "disabled") == 0) { + new_arena_ind = narenas - 1; + assert_d_eq(mallctl("thread.arena", (void *)&old_arena_ind, &sz, + (void *)&new_arena_ind, sizeof(unsigned)), 0, + "Unexpected mallctl() failure"); + new_arena_ind = 0; + assert_d_eq(mallctl("thread.arena", (void *)&old_arena_ind, &sz, + (void *)&new_arena_ind, sizeof(unsigned)), 0, + "Unexpected mallctl() failure"); + } else { + assert_d_eq(mallctl("thread.arena", (void 
*)&old_arena_ind, &sz, + NULL, 0), 0, "Unexpected mallctl() failure"); + new_arena_ind = percpu_arena_ind_limit() - 1; + if (old_arena_ind != new_arena_ind) { + assert_d_eq(mallctl("thread.arena", + (void *)&old_arena_ind, &sz, (void *)&new_arena_ind, + sizeof(unsigned)), EPERM, "thread.arena ctl " + "should not be allowed with percpu arena"); + } + } } TEST_END diff --git a/test/unit/stats.c b/test/unit/stats.c index 948132cb..c458d3f9 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -33,7 +33,7 @@ TEST_BEGIN(test_stats_large) { size_t sz; int expected = config_stats ? 0 : ENOENT; - p = mallocx(SMALL_MAXCLASS+1, 0); + p = mallocx(SMALL_MAXCLASS+1, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), @@ -66,7 +66,6 @@ TEST_BEGIN(test_stats_large) { TEST_END TEST_BEGIN(test_stats_arenas_summary) { - unsigned arena; void *little, *large; uint64_t epoch; size_t sz; @@ -74,13 +73,9 @@ TEST_BEGIN(test_stats_arenas_summary) { size_t mapped; uint64_t npurge, nmadvise, purged; - arena = 0; - assert_d_eq(mallctl("thread.arena", NULL, NULL, (void *)&arena, - sizeof(arena)), 0, "Unexpected mallctl() failure"); - - little = mallocx(SMALL_MAXCLASS, 0); + little = mallocx(SMALL_MAXCLASS, MALLOCX_ARENA(0)); assert_ptr_not_null(little, "Unexpected mallocx() failure"); - large = mallocx((1U << LG_LARGE_MINCLASS), 0); + large = mallocx((1U << LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); assert_ptr_not_null(large, "Unexpected mallocx() failure"); dallocx(little, 0); @@ -128,7 +123,6 @@ no_lazy_lock(void) { } TEST_BEGIN(test_stats_arenas_small) { - unsigned arena; void *p; size_t sz, allocated; uint64_t epoch, nmalloc, ndalloc, nrequests; @@ -136,11 +130,7 @@ TEST_BEGIN(test_stats_arenas_small) { no_lazy_lock(); /* Lazy locking would dodge tcache testing. */ - arena = 0; - assert_d_eq(mallctl("thread.arena", NULL, NULL, (void *)&arena, - sizeof(arena)), 0, "Unexpected mallctl() failure"); - - p = mallocx(SMALL_MAXCLASS, 0); + p = mallocx(SMALL_MAXCLASS, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("thread.tcache.flush", NULL, NULL, NULL, 0), @@ -178,17 +168,12 @@ TEST_BEGIN(test_stats_arenas_small) { TEST_END TEST_BEGIN(test_stats_arenas_large) { - unsigned arena; void *p; size_t sz, allocated; uint64_t epoch, nmalloc, ndalloc; int expected = config_stats ? 0 : ENOENT; - arena = 0; - assert_d_eq(mallctl("thread.arena", NULL, NULL, (void *)&arena, - sizeof(arena)), 0, "Unexpected mallctl() failure"); - - p = mallocx((1U << LG_LARGE_MINCLASS), 0); + p = mallocx((1U << LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), @@ -217,20 +202,29 @@ TEST_BEGIN(test_stats_arenas_large) { } TEST_END +static void +gen_mallctl_str(char *cmd, char *name, unsigned arena_ind) { + sprintf(cmd, "stats.arenas.%u.bins.0.%s", arena_ind, name); +} + TEST_BEGIN(test_stats_arenas_bins) { - unsigned arena; void *p; size_t sz, curslabs, curregs; uint64_t epoch, nmalloc, ndalloc, nrequests, nfills, nflushes; uint64_t nslabs, nreslabs; int expected = config_stats ? 
0 : ENOENT; - arena = 0; - assert_d_eq(mallctl("thread.arena", NULL, NULL, (void *)&arena, - sizeof(arena)), 0, "Unexpected mallctl() failure"); + unsigned arena_ind, old_arena_ind; + sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.create", (void *)&arena_ind, &sz, NULL, 0), + 0, "Arena creation failure"); + sz = sizeof(arena_ind); + assert_d_eq(mallctl("thread.arena", (void *)&old_arena_ind, &sz, + (void *)&arena_ind, sizeof(arena_ind)), 0, + "Unexpected mallctl() failure"); - p = mallocx(arena_bin_info[0].reg_size, 0); - assert_ptr_not_null(p, "Unexpected mallocx() failure"); + p = malloc(arena_bin_info[0].reg_size); + assert_ptr_not_null(p, "Unexpected malloc() failure"); assert_d_eq(mallctl("thread.tcache.flush", NULL, NULL, NULL, 0), config_tcache ? 0 : ENOENT, "Unexpected mallctl() result"); @@ -238,33 +232,40 @@ TEST_BEGIN(test_stats_arenas_bins) { assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), 0, "Unexpected mallctl() failure"); + char cmd[128]; sz = sizeof(uint64_t); - assert_d_eq(mallctl("stats.arenas.0.bins.0.nmalloc", (void *)&nmalloc, - &sz, NULL, 0), expected, "Unexpected mallctl() result"); - assert_d_eq(mallctl("stats.arenas.0.bins.0.ndalloc", (void *)&ndalloc, - &sz, NULL, 0), expected, "Unexpected mallctl() result"); - assert_d_eq(mallctl("stats.arenas.0.bins.0.nrequests", - (void *)&nrequests, &sz, NULL, 0), expected, + gen_mallctl_str(cmd, "nmalloc", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&nmalloc, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "ndalloc", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&ndalloc, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "nrequests", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&nrequests, &sz, NULL, 0), expected, "Unexpected mallctl() result"); sz = sizeof(size_t); - assert_d_eq(mallctl("stats.arenas.0.bins.0.curregs", (void *)&curregs, - &sz, NULL, 0), expected, "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "curregs", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&curregs, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); sz = sizeof(uint64_t); - assert_d_eq(mallctl("stats.arenas.0.bins.0.nfills", (void *)&nfills, - &sz, NULL, 0), config_tcache ? expected : ENOENT, - "Unexpected mallctl() result"); - assert_d_eq(mallctl("stats.arenas.0.bins.0.nflushes", (void *)&nflushes, - &sz, NULL, 0), config_tcache ? expected : ENOENT, - "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "nfills", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&nfills, &sz, NULL, 0), + config_tcache ? expected : ENOENT, "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "nflushes", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&nflushes, &sz, NULL, 0), + config_tcache ? 
expected : ENOENT, "Unexpected mallctl() result"); - assert_d_eq(mallctl("stats.arenas.0.bins.0.nslabs", (void *)&nslabs, - &sz, NULL, 0), expected, "Unexpected mallctl() result"); - assert_d_eq(mallctl("stats.arenas.0.bins.0.nreslabs", (void *)&nreslabs, - &sz, NULL, 0), expected, "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "nslabs", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&nslabs, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "nreslabs", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&nreslabs, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); sz = sizeof(size_t); - assert_d_eq(mallctl("stats.arenas.0.bins.0.curslabs", (void *)&curslabs, - &sz, NULL, 0), expected, "Unexpected mallctl() result"); + gen_mallctl_str(cmd, "curslabs", arena_ind); + assert_d_eq(mallctl(cmd, (void *)&curslabs, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); if (config_stats) { assert_u64_gt(nmalloc, 0, @@ -292,21 +293,16 @@ TEST_BEGIN(test_stats_arenas_bins) { TEST_END TEST_BEGIN(test_stats_arenas_lextents) { - unsigned arena; void *p; uint64_t epoch, nmalloc, ndalloc; size_t curlextents, sz, hsize; int expected = config_stats ? 0 : ENOENT; - arena = 0; - assert_d_eq(mallctl("thread.arena", NULL, NULL, (void *)&arena, - sizeof(arena)), 0, "Unexpected mallctl() failure"); - sz = sizeof(size_t); assert_d_eq(mallctl("arenas.lextent.0.size", (void *)&hsize, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); - p = mallocx(hsize, 0); + p = mallocx(hsize, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)),
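
As a usage sketch (not part of the diff above): with a jemalloc carrying this patch on a system where sched_getcpu() is available, the mode is selected through the existing malloc_conf mechanism and can be read back through the "opt.percpu_arena" mallctl added in src/ctl.c. The program below is illustrative only; the option name and the "disabled"/"percpu"/"phycpu" strings come from the patch itself, and the install path of jemalloc.h is assumed.

/* Illustrative only -- build and link against a jemalloc that has this patch. */
#include <stdio.h>
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

/* Compile-time default; MALLOC_CONF="percpu_arena:percpu" at runtime also works. */
const char *malloc_conf = "percpu_arena:percpu";

int
main(void) {
	const char *mode;
	size_t sz = sizeof(mode);

	/* Reads back one of "disabled", "percpu" or "phycpu". */
	if (mallctl("opt.percpu_arena", (void *)&mode, &sz, NULL, 0) != 0) {
		fprintf(stderr, "opt.percpu_arena not supported by this build\n");
		return 1;
	}
	printf("percpu_arena: %s\n", mode);

	/*
	 * With "percpu"/"phycpu" active, this allocation is served by the
	 * arena associated with the calling thread's current CPU.
	 */
	void *p = malloc(64);
	free(p);
	return 0;
}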