diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index 382883cd..6ab6baa7 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -14,7 +14,10 @@
  * of the tcache at all.
  */
 
-/* The size in bytes of each cache bin stack. */
+/*
+ * The size in bytes of each cache bin stack. We also use this to indicate
+ * *counts* of individual objects.
+ */
 typedef uint16_t cache_bin_sz_t;
 
 typedef struct cache_bin_stats_s cache_bin_stats_t;
@@ -311,4 +314,31 @@ cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info,
 	}
 }
 
+/*
+ * Initialize a cache_bin_info to represent up to the given number of items in
+ * the cache_bins it is associated with.
+ */
+void cache_bin_info_init(cache_bin_info_t *bin_info,
+    cache_bin_sz_t ncached_max);
+/*
+ * Given an array of initialized cache_bin_info_ts, determine how big an
+ * allocation is required to initialize a full set of cache_bin_ts.
+ */
+void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
+    size_t *size, size_t *alignment);
+
+/*
+ * Actually initialize some cache bins. Callers should allocate the backing
+ * memory indicated by a call to cache_bin_info_compute_alloc. They should
+ * then preincrement, call init once for each bin and info, and then call
+ * cache_bin_postincrement. *cur_offset will then be the total size of the
+ * allocation.
+ */
+void cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos,
+    void *alloc, size_t *cur_offset);
+void cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos,
+    void *alloc, size_t *cur_offset);
+void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
+    size_t *cur_offset);
+
 #endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
diff --git a/src/cache_bin.c b/src/cache_bin.c
index 454cb475..260c1b77 100644
--- a/src/cache_bin.c
+++ b/src/cache_bin.c
@@ -1,3 +1,104 @@
 #include "jemalloc/internal/jemalloc_preamble.h"
 #include "jemalloc/internal/jemalloc_internal_includes.h"
 
+#include "jemalloc/internal/bit_util.h"
+
+void
+cache_bin_info_init(cache_bin_info_t *info,
+    cache_bin_sz_t ncached_max) {
+	size_t stack_size = (size_t)ncached_max * sizeof(void *);
+	assert(stack_size < ((size_t)1 << (sizeof(cache_bin_sz_t) * 8)));
+	info->stack_size = (cache_bin_sz_t)stack_size;
+}
+
+void
+cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
+    size_t *size, size_t *alignment) {
+	/*
+	 * For the total bin stack region (per tcache), reserve 2 more slots
+	 * so that
+	 * 1) the empty position can be safely read on the fast path before
+	 *    checking "is_empty"; and
+	 * 2) the cur_ptr can go beyond the empty position by 1 step safely on
+	 *    the fast path (i.e. no overflow).
+	 */
+	*size = sizeof(void *) * 2;
+	for (szind_t i = 0; i < ninfos; i++) {
+		*size += infos[i].stack_size;
+	}
+
+	/*
+	 * 1) Align to at least PAGE, to minimize the # of TLBs needed by the
+	 * smaller sizes; also helps if the larger sizes don't get used at all.
+	 * 2) On 32-bit the pointers won't be compressed; use minimal alignment.
+	 */
+	if (LG_SIZEOF_PTR < 3 || *size < PAGE) {
+		*alignment = PAGE;
+	} else {
+		/*
+		 * Align pow2 to avoid overflowing the cache bin compressed
+		 * pointers.
+		 */
+		*alignment = pow2_ceil_zu(*size);
+	}
+}
+
+void
+cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc,
+    size_t *cur_offset) {
+	if (config_debug) {
+		size_t computed_size;
+		size_t computed_alignment;
+
+		/* Pointer should be as aligned as we asked for. */
+		cache_bin_info_compute_alloc(infos, ninfos, &computed_size,
+		    &computed_alignment);
+		assert(((uintptr_t)alloc & (computed_alignment - 1)) == 0);
+
+		/* And that alignment should disallow overflow. */
+		uint32_t lowbits = (uint32_t)((uintptr_t)alloc + computed_size);
+		assert((uint32_t)(uintptr_t)alloc < lowbits);
+	}
+	/*
+	 * Leave a noticeable mark pattern on the boundaries, in case a bug
+	 * starts leaking those. Make it look like the junk pattern but be
+	 * distinct from it.
+	 */
+	uintptr_t preceding_ptr_junk = (uintptr_t)0x7a7a7a7a7a7a7a7aULL;
+	*(uintptr_t *)((uintptr_t)alloc + *cur_offset) = preceding_ptr_junk;
+	*cur_offset += sizeof(void *);
+}
+
+void
+cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc,
+    size_t *cur_offset) {
+	/* Note: a7 vs. 7a above -- this tells you which pointer leaked. */
+	uintptr_t trailing_ptr_junk = (uintptr_t)0xa7a7a7a7a7a7a7a7ULL;
+	*(uintptr_t *)((uintptr_t)alloc + *cur_offset) = trailing_ptr_junk;
+	*cur_offset += sizeof(void *);
+}
+
+void
+cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
+    size_t *cur_offset) {
+	assert(sizeof(bin->cur_ptr) == sizeof(void *));
+	/*
+	 * The full_position points to the lowest available space. Allocations
+	 * will access the slots toward higher addresses (for the benefit of
+	 * adjacent prefetch).
+	 */
+	void *stack_cur = (void *)((uintptr_t)alloc + *cur_offset);
+	void *full_position = stack_cur;
+	uint32_t bin_stack_size = info->stack_size;
+
+	*cur_offset += bin_stack_size;
+	void *empty_position = (void *)((uintptr_t)alloc + *cur_offset);
+
+	/* Init to the empty position. */
+	bin->cur_ptr.ptr = empty_position;
+	bin->low_water_position = bin->cur_ptr.lowbits;
+	bin->full_position = (uint32_t)(uintptr_t)full_position;
+	assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size);
+	assert(cache_bin_ncached_get(bin, info) == 0);
+	assert(cache_bin_empty_position_get(bin, info) == empty_position);
+}
diff --git a/src/tcache.c b/src/tcache.c
index e7188585..48f06b70 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -14,16 +14,10 @@ bool opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 
 cache_bin_info_t	*tcache_bin_info;
-/*
- * For the total bin stack region (per tcache), reserve 2 more slots so that 1)
- * the empty position can be safely read on the fast path before checking
- * "is_empty"; and 2) the cur_ptr can go beyond the empty position by 1 step
- * safely on the fast path (i.e. no overflow).
- */
-static const unsigned total_stack_padding = sizeof(void *) * 2;
 
-/* Total stack size required (per tcache). Include the padding above. */
-static uint32_t total_stack_bytes;
+/* Total bin stack memory required (per tcache), and its alignment. */
+static size_t tcache_bin_alloc_size;
+static size_t tcache_bin_alloc_alignment;
 
 unsigned	nhbins;
 size_t		tcache_maxclass;
@@ -430,43 +424,8 @@ tsd_tcache_enabled_data_init(tsd_t *tsd) {
 	return false;
 }
 
-static bool
-tcache_bin_init(cache_bin_t *bin, szind_t ind, uintptr_t *stack_cur) {
-	assert(sizeof(bin->cur_ptr) == sizeof(void *));
-	/*
-	 * The full_position points to the lowest available space. Allocations
-	 * will access the slots toward higher addresses (for the benefit of
-	 * adjacent prefetch).
-	 */
-	void *full_position = (void *)*stack_cur;
-	uint32_t bin_stack_size = tcache_bin_info[ind].stack_size;
-
-	*stack_cur += bin_stack_size;
-	void *empty_position = (void *)*stack_cur;
-
-	/* Init to the empty position. */
-	bin->cur_ptr.ptr = empty_position;
-	bin->low_water_position = bin->cur_ptr.lowbits;
-	bin->full_position = (uint32_t)(uintptr_t)full_position;
-	assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size);
-	assert(cache_bin_ncached_get(bin, &tcache_bin_info[ind]) == 0);
-	assert(cache_bin_empty_position_get(bin, &tcache_bin_info[ind])
-	    == empty_position);
-
-	return false;
-}
-
-/* Sanity check only. */
-static bool
-tcache_bin_lowbits_overflowable(void *ptr) {
-	uint32_t lowbits = (uint32_t)((uintptr_t)ptr + total_stack_bytes);
-	return lowbits < (uint32_t)(uintptr_t)ptr;
-}
-
 static void
 tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
-	assert(!tcache_bin_lowbits_overflowable(avail_stack));
-
 	memset(&tcache->link, 0, sizeof(ql_elm(tcache_t)));
 	tcache->next_gc_bin = 0;
 	tcache->arena = NULL;
@@ -476,35 +435,25 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
 	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS));
 
 	unsigned i = 0;
-	uintptr_t stack_cur = (uintptr_t)avail_stack;
+	size_t cur_offset = 0;
+	cache_bin_preincrement(tcache_bin_info, nhbins, avail_stack,
+	    &cur_offset);
 	for (; i < SC_NBINS; i++) {
 		tcache->lg_fill_div[i] = 1;
 		tcache->bin_refilled[i] = false;
 		cache_bin_t *bin = tcache_small_bin_get(tcache, i);
-		tcache_bin_init(bin, i, &stack_cur);
+		cache_bin_init(bin, &tcache_bin_info[i], avail_stack,
+		    &cur_offset);
 	}
 	for (; i < nhbins; i++) {
 		cache_bin_t *bin = tcache_large_bin_get(tcache, i);
-		tcache_bin_init(bin, i, &stack_cur);
+		cache_bin_init(bin, &tcache_bin_info[i], avail_stack,
+		    &cur_offset);
 	}
-
+	cache_bin_postincrement(tcache_bin_info, nhbins, avail_stack,
+	    &cur_offset);
 	/* Sanity check that the whole stack is used. */
-	size_t stack_offset = stack_cur - (uintptr_t)avail_stack;
-	assert(stack_offset + total_stack_padding == total_stack_bytes);
-}
-
-static size_t
-tcache_bin_stack_alignment (size_t size) {
-	/*
-	 * 1) Align to at least PAGE, to minimize the # of TLBs needed by the
-	 * smaller sizes; also helps if the larger sizes don't get used at all.
-	 * 2) On 32-bit the pointers won't be compressed; use minimal alignment.
-	 */
-	if (LG_SIZEOF_PTR < 3 || size < PAGE) {
-		return PAGE;
-	}
-	/* Align pow2 to avoid overflow the cache bin compressed pointers. */
-	return pow2_ceil_zu(size);
+	assert(cur_offset == tcache_bin_alloc_size);
 }
 
 /* Initialize auto tcache (embedded in TSD). */
@@ -512,8 +461,8 @@ bool
 tsd_tcache_data_init(tsd_t *tsd) {
 	tcache_t *tcache = tsd_tcachep_get_unsafe(tsd);
 	assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr == NULL);
-	size_t alignment = tcache_bin_stack_alignment(total_stack_bytes);
-	size_t size = sz_sa2u(total_stack_bytes, alignment);
+	size_t alignment = tcache_bin_alloc_alignment;
+	size_t size = sz_sa2u(tcache_bin_alloc_size, alignment);
 
 	void *avail_array = ipallocztm(tsd_tsdn(tsd), size, alignment, true,
 	    NULL, true, arena_get(TSDN_NULL, 0, true));
@@ -551,22 +500,27 @@
 /* Created manual tcache for tcache.create mallctl. */
 tcache_t *
 tcache_create_explicit(tsd_t *tsd) {
-	size_t size = sizeof(tcache_t);
+	/*
+	 * We place the cache bin stacks, then the tcache_t, then a pointer to
+	 * the beginning of the whole allocation (for freeing). This makes
+	 * sure the cache bins have the requested alignment.
+	 */
+	size_t size = tcache_bin_alloc_size + sizeof(tcache_t) + sizeof(void *);
 	/* Naturally align the pointer stacks. */
 	size = PTR_CEILING(size);
-	size_t stack_offset = size;
-	size += total_stack_bytes;
-	size_t alignment = tcache_bin_stack_alignment(size);
-	size = sz_sa2u(size, alignment);
+	size = sz_sa2u(size, tcache_bin_alloc_alignment);
 
-	tcache_t *tcache = ipallocztm(tsd_tsdn(tsd), size, alignment, true,
-	    NULL, true, arena_get(TSDN_NULL, 0, true));
-	if (tcache == NULL) {
+	void *mem = ipallocztm(tsd_tsdn(tsd), size, tcache_bin_alloc_alignment,
+	    true, NULL, true, arena_get(TSDN_NULL, 0, true));
+	if (mem == NULL) {
 		return NULL;
 	}
+	void *avail_array = mem;
+	tcache_t *tcache = (void *)((uintptr_t)avail_array +
+	    tcache_bin_alloc_size);
+	void **head_ptr = (void *)((uintptr_t)avail_array +
+	    tcache_bin_alloc_size + sizeof(tcache_t));
+	tcache_init(tsd, tcache, avail_array);
+	*head_ptr = mem;
 
-	void *avail_array = (void *)((uintptr_t)tcache +
-	    (uintptr_t)stack_offset);
-	tcache_init(tsd, tcache, avail_array);
-
 	tcache_arena_associate(tsd_tsdn(tsd), tcache, arena_ichoose(tsd, NULL));
@@ -617,8 +573,11 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 		    tcache_bin_info[0].stack_size);
 		idalloctm(tsd_tsdn(tsd), avail_array, NULL, NULL, true, true);
 	} else {
+		/* See the comment at the top of tcache_create_explicit. */
+		void **mem_begin = (void **)((uintptr_t)tcache +
+		    sizeof(tcache_t));
 		/* Release both the tcache struct and avail array. */
-		idalloctm(tsd_tsdn(tsd), tcache, NULL, NULL, true, true);
+		idalloctm(tsd_tsdn(tsd), *mem_begin, NULL, NULL, true, true);
 	}
 
 	/*
@@ -816,7 +774,6 @@ tcache_boot(tsdn_t *tsdn, base_t *base) {
 		return true;
 	}
 	unsigned i, ncached_max;
-	total_stack_bytes = 0;
 	for (i = 0; i < SC_NBINS; i++) {
 		if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
 			ncached_max = TCACHE_NSLOTS_SMALL_MIN;
@@ -826,18 +783,13 @@
 		} else {
 			ncached_max = TCACHE_NSLOTS_SMALL_MAX;
 		}
-		unsigned stack_size = ncached_max * sizeof(void *);
-		assert(stack_size < ((uint64_t)1 <<
-		    (sizeof(cache_bin_sz_t) * 8)));
-		tcache_bin_info[i].stack_size = stack_size;
-		total_stack_bytes += stack_size;
+		cache_bin_info_init(&tcache_bin_info[i], ncached_max);
 	}
 	for (; i < nhbins; i++) {
-		unsigned stack_size = TCACHE_NSLOTS_LARGE * sizeof(void *);
-		tcache_bin_info[i].stack_size = stack_size;
-		total_stack_bytes += stack_size;
+		cache_bin_info_init(&tcache_bin_info[i], TCACHE_NSLOTS_LARGE);
 	}
-	total_stack_bytes += total_stack_padding;
+	cache_bin_info_compute_alloc(tcache_bin_info, i, &tcache_bin_alloc_size,
+	    &tcache_bin_alloc_alignment);
 
 	return false;
 }
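
Reviewer notes (not part of the patch):

The new cache_bin lifecycle is: cache_bin_info_init each bin, cache_bin_info_compute_alloc the whole set, allocate, then preincrement / init-per-bin / postincrement, exactly the sequence tcache_init now follows. A minimal sketch of a caller, assuming the jemalloc-internal headers above; "example_bins_create" and "alloc_fn" are hypothetical names, the per-bin capacity of 20 is arbitrary, and error handling is elided:

/*
 * Sketch only: "example_bins_create" and "alloc_fn" are hypothetical;
 * everything else is the API this patch adds.
 */
static bool
example_bins_create(cache_bin_t *bins, cache_bin_info_t *infos,
    szind_t nbins, void *(*alloc_fn)(size_t size, size_t alignment)) {
	size_t size, alignment;

	/* Size each bin's stack; here every bin holds up to 20 objects. */
	for (szind_t i = 0; i < nbins; i++) {
		cache_bin_info_init(&infos[i], 20);
	}
	/* One allocation backs all of the bins' stacks. */
	cache_bin_info_compute_alloc(infos, nbins, &size, &alignment);
	void *alloc = alloc_fn(size, alignment);
	if (alloc == NULL) {
		return true;
	}

	/* Bracket the per-bin inits with pre-/postincrement. */
	size_t cur_offset = 0;
	cache_bin_preincrement(infos, nbins, alloc, &cur_offset);
	for (szind_t i = 0; i < nbins; i++) {
		cache_bin_init(&bins[i], &infos[i], alloc, &cur_offset);
	}
	cache_bin_postincrement(infos, nbins, alloc, &cur_offset);

	/* The protocol consumes exactly the bytes computed above. */
	assert(cur_offset == size);
	return false;
}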
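On the 64-bit alignment choice in cache_bin_info_compute_alloc: a worked example with a made-up total, showing the lowbits arithmetic the pow2 rounding protects.

/*
 * Hypothetical numbers. With base % alignment == 0 and size strictly less
 * than alignment, the low 32 bits of base are at most 2^32 - alignment, so
 * adding size cannot wrap the uint32_t lowbits that the compressed cur_ptr
 * scheme depends on. cache_bin_preincrement asserts this in debug mode.
 */
size_t size = 40960;			/* e.g. sum of stacks + 2 slots */
size_t alignment = pow2_ceil_zu(size);	/* -> 65536 */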
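Finally, the manual-tcache layout that tcache_create_explicit builds and tcache_destroy unwinds, sketched from the comment and offsets in the patch:

/*
 *   mem -> +------------------------------+  (bin-stack alignment)
 *          | cache bin stacks             |  tcache_bin_alloc_size bytes,
 *          |                              |  including the two junk slots
 *          +------------------------------+
 *          | tcache_t                     |
 *          +------------------------------+
 *          | void *head_ptr == mem        |
 *          +------------------------------+
 *
 * tcache_destroy recovers the block to free from the tcache_t alone:
 */
void **head_ptr = (void **)((uintptr_t)tcache + sizeof(tcache_t));
void *to_free = *head_ptr;	/* == mem, the ipallocztm() result */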