diff --git a/Makefile.in b/Makefile.in
index ef75d8ac..7584f598 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -178,6 +178,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/bit_util.c \
 	$(srcroot)test/unit/binshard.c \
 	$(srcroot)test/unit/buf_writer.c \
+	$(srcroot)test/unit/cache_bin.c \
 	$(srcroot)test/unit/ckh.c \
 	$(srcroot)test/unit/decay.c \
 	$(srcroot)test/unit/div.c \
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index d14556a3..67180cfa 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -13,7 +13,6 @@
  * of the tcache at all.
  */
 
-
 /*
  * The count of the number of cached allocations in a bin. We make this signed
  * so that negative numbers can encode "invalid" states (e.g. a low water mark
@@ -39,29 +38,67 @@ struct cache_bin_info_s {
 	/* Upper limit on ncached. */
 	cache_bin_sz_t ncached_max;
 };
+extern cache_bin_info_t *tcache_bin_info;
 
 typedef struct cache_bin_s cache_bin_t;
 struct cache_bin_s {
-	/* Min # cached since last GC. */
-	cache_bin_sz_t low_water;
-	/* # of cached objects. */
-	cache_bin_sz_t ncached;
 	/*
-	 * ncached and stats are both modified frequently. Let's keep them
+	 * The cache bin stack is represented using 3 pointers: cur_ptr,
+	 * low_water and full, optimized for fast path efficiency.
+	 *
+	 * low addr                                          ==> high addr
+	 * |----|----|----|item1|item2|.....................|itemN|
+	 * full            cur                                     empty
+	 * (ncached == N; full + ncached_max == empty)
+	 *
+	 * Data directly stored:
+	 * 1) cur_ptr points to the current item to be allocated, i.e. *cur_ptr.
+	 * 2) full points to the top of the stack (i.e. ncached == ncached_max),
+	 *    which is compared against on free_fastpath to check "is_full".
+	 * 3) low_water indicates a low water mark of ncached.
+	 *    Range of low_water is [cur, empty + 1], i.e. values of
+	 *    [ncached, -1].
+	 *
+	 * The empty position (ncached == 0) is derived via full + ncached_max
+	 * and not accessed in the common case (guarded behind low_water).
+	 *
+	 * On 64-bit, 2 of the 3 pointers (full and low water) are compressed by
+	 * omitting the high 32 bits. Overflow of the half pointers is avoided
+	 * when allocating / initializing the stack space. As a result,
+	 * cur_ptr.lowbits can be safely used for pointer comparisons.
+	 */
+	union {
+		void **ptr;
+		struct {
+			/* highbits never accessed directly. */
+#if (LG_SIZEOF_PTR == 3 && defined(JEMALLOC_BIG_ENDIAN))
+			uint32_t __highbits;
+#endif
+			uint32_t lowbits;
+#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
+			uint32_t __highbits;
+#endif
+		};
+	} cur_ptr;
+	/*
+	 * cur_ptr and stats are both modified frequently. Let's keep them
 	 * close so that they have a higher chance of being on the same
 	 * cacheline, thus less write-backs.
 	 */
 	cache_bin_stats_t tstats;
 	/*
-	 * Stack of available objects.
+	 * Points to the first item that hasn't been used since last GC, to
+	 * track the low water mark (min # of cached). It may point to
+	 * empty_position + 1, which indicates the cache has been depleted and
+	 * refilled (low_water == -1).
+	 */
+	uint32_t low_water_position;
+	/*
+	 * Points to the position when the cache is full.
 	 *
 	 * To make use of adjacent cacheline prefetch, the items in the avail
-	 * stack goes to higher address for newer allocations. avail points
-	 * just above the available space, which means that
-	 * avail[-ncached, ... -1] are available items and the lowest item will
-	 * be allocated first.
+	 * stack go to higher addresses for newer allocations (i.e. cur_ptr++).
 	 */
-	void **avail;
+	uint32_t full_position;
 };
 
 typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
@@ -76,6 +113,67 @@ struct cache_bin_array_descriptor_s {
 	cache_bin_t *bins_large;
 };
 
+/*
+ * None of the cache_bin_*_get / _set functions is used on the fast path, which
+ * relies on pointer comparisons to determine if the cache is full / empty.
+ */
+static inline cache_bin_sz_t
+cache_bin_ncached_get(cache_bin_t *bin, szind_t ind) {
+	cache_bin_sz_t n = tcache_bin_info[ind].ncached_max -
+	    (bin->cur_ptr.lowbits - bin->full_position) / sizeof(void *);
+	assert(n >= 0 && n <= tcache_bin_info[ind].ncached_max);
+	assert(n == 0 || *(bin->cur_ptr.ptr) != NULL);
+
+	return n;
+}
+
+static inline void **
+cache_bin_empty_position_get(cache_bin_t *bin, szind_t ind) {
+	void **ret = bin->cur_ptr.ptr + cache_bin_ncached_get(bin, ind);
+	/* Low bits overflow disallowed when allocating the space. */
+	assert((uint32_t)(uintptr_t)ret >= bin->cur_ptr.lowbits);
+	assert(bin->full_position + tcache_bin_info[ind].ncached_max *
+	    sizeof(void *) > bin->full_position);
+
+	/* Can also be computed via (full_position + ncached_max) | highbits. */
+	assert(ret == (void **)((uintptr_t)(bin->full_position +
+	    tcache_bin_info[ind].ncached_max * sizeof(void *)) |
+	    (uintptr_t)((uintptr_t)bin->cur_ptr.ptr &
+	    ~(((uint64_t)1 << 32) - 1))));
+
+	return ret;
+}
+
+/* Returns the position of the bottom item on the stack; for convenience. */
+static inline void **
+cache_bin_bottom_item_get(cache_bin_t *bin, szind_t ind) {
+	void **bottom = cache_bin_empty_position_get(bin, ind) - 1;
+	assert(cache_bin_ncached_get(bin, ind) == 0 || *bottom != NULL);
+
+	return bottom;
+}
+
+/* Returns the numeric value of low water in [-1, ncached]. */
+static inline cache_bin_sz_t
+cache_bin_low_water_get(cache_bin_t *bin, szind_t ind) {
+	cache_bin_sz_t low_water = tcache_bin_info[ind].ncached_max -
+	    (bin->low_water_position - bin->full_position) / sizeof(void *);
+	assert(low_water >= -1 && low_water <=
+	    tcache_bin_info[ind].ncached_max);
+	assert(low_water <= cache_bin_ncached_get(bin, ind));
+	assert(bin->low_water_position >= bin->cur_ptr.lowbits);
+
+	return low_water;
+}
+
+static inline void
+cache_bin_ncached_set(cache_bin_t *bin, szind_t ind, cache_bin_sz_t n) {
+	bin->cur_ptr.lowbits = bin->full_position +
+	    (tcache_bin_info[ind].ncached_max - n) * sizeof(void *);
+	assert(n >= 0 && n <= tcache_bin_info[ind].ncached_max);
+	assert(n == 0 || *bin->cur_ptr.ptr != NULL);
+}
+
 static inline void
 cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
     cache_bin_t *bins_small, cache_bin_t *bins_large) {
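To make the compressed-pointer arithmetic in these getters concrete, here is a minimal standalone C sketch (all names are hypothetical, not jemalloc source). It mirrors cache_bin_ncached_get() and the empty-position reconstruction asserted in cache_bin_empty_position_get(), assuming the stack never straddles a 4 GiB boundary — the invariant the patch later enforces via tcache_bin_stack_alignment():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NCACHED_MAX 4 /* hypothetical; the real bound lives in tcache_bin_info */

int
main(void) {
	/* One spare slot, mirroring the patch's total_stack_padding. */
	void *stack[NCACHED_MAX + 1] = {0};
	void **full = &stack[0];
	void **cur = &stack[NCACHED_MAX]; /* empty position: ncached == 0 */

	/* Only the 32-bit low halves are stored for full (and low_water). */
	uint32_t full_lowbits = (uint32_t)(uintptr_t)full;

	/* Cache one item: the stack grows toward lower addresses. */
	static int item;
	*--cur = &item;
	uint32_t cur_lowbits = (uint32_t)(uintptr_t)cur;

	/* ncached, derived exactly as in cache_bin_ncached_get(). */
	uint32_t ncached = NCACHED_MAX -
	    (cur_lowbits - full_lowbits) / sizeof(void *);
	assert(ncached == 1);

	/* Empty position rebuilt from full_lowbits plus cur's high bits, as
	 * in the assertion inside cache_bin_empty_position_get(). */
	uintptr_t highbits = (uintptr_t)cur & ~(uintptr_t)UINT32_MAX;
	void **empty = (void **)(highbits |
	    (uintptr_t)(full_lowbits + NCACHED_MAX * sizeof(void *)));
	assert(empty == &stack[NCACHED_MAX]);

	printf("ncached = %u\n", (unsigned)ncached);
	return 0;
}

In the real struct, lowbits overlays the low half of cur_ptr.ptr through the union, so the two stay in sync without the manual recomputation done here.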
@@ -85,19 +183,24 @@ cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
 }
 
 JEMALLOC_ALWAYS_INLINE void *
-cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
-	void *ret;
-
-	bin->ncached--;
-
+cache_bin_alloc_easy(cache_bin_t *bin, bool *success, cache_bin_sz_t ind) {
 	/*
-	 * Check for both bin->ncached == 0 and ncached < low_water
-	 * in a single branch.
+	 * This may read from the empty position; however the loaded value won't
+	 * be used. It's safe because the stack has one more slot reserved.
 	 */
-	if (unlikely(bin->ncached <= bin->low_water)) {
-		bin->low_water = bin->ncached;
-		if (bin->ncached == -1) {
-			bin->ncached = 0;
+	void *ret = *(bin->cur_ptr.ptr++);
+	/*
+	 * Check for both bin->ncached == 0 and ncached < low_water in a single
+	 * branch. This also avoids accessing tcache_bin_info (which is on a
+	 * separate cacheline / page) in the common case.
+	 */
+	if (unlikely(bin->cur_ptr.lowbits >= bin->low_water_position)) {
+		bin->low_water_position = bin->cur_ptr.lowbits;
+		uint32_t empty_position = bin->full_position +
+		    tcache_bin_info[ind].ncached_max * sizeof(void *);
+		if (bin->cur_ptr.lowbits > empty_position) {
+			bin->cur_ptr.ptr--;
+			assert(bin->cur_ptr.lowbits == empty_position);
 			*success = false;
 			return NULL;
 		}
@@ -111,19 +214,18 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
 	 * cacheline).
 	 */
 	*success = true;
-	ret = *(bin->avail - (bin->ncached + 1));
 
 	return ret;
 }
 
 JEMALLOC_ALWAYS_INLINE bool
-cache_bin_dalloc_easy(cache_bin_t *bin, cache_bin_info_t *bin_info, void *ptr) {
-	if (unlikely(bin->ncached == bin_info->ncached_max)) {
+cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
+	if (unlikely(bin->cur_ptr.lowbits == bin->full_position)) {
 		return false;
 	}
-	assert(bin->ncached < bin_info->ncached_max);
-	bin->ncached++;
-	*(bin->avail - bin->ncached) = ptr;
+
+	*(--bin->cur_ptr.ptr) = ptr;
+	assert(bin->cur_ptr.lowbits >= bin->full_position);
 
 	return true;
 }
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
index ddde9b4e..fedbd862 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
@@ -130,8 +130,8 @@ tcache_available(tsd_t *tsd) {
 	if (likely(tsd_tcache_enabled_get(tsd))) {
 		/* Associated arena == NULL implies tcache init in progress. */
 		assert(tsd_tcachep_get(tsd)->arena == NULL ||
-		    tcache_small_bin_get(tsd_tcachep_get(tsd), 0)->avail !=
-		    NULL);
+		    tcache_small_bin_get(tsd_tcachep_get(tsd), 0)->cur_ptr.ptr
+		    != NULL);
 		return true;
 	}
 
diff --git a/include/jemalloc/internal/tcache_externs.h b/include/jemalloc/internal/tcache_externs.h
index 266f246b..2060bb19 100644
--- a/include/jemalloc/internal/tcache_externs.h
+++ b/include/jemalloc/internal/tcache_externs.h
@@ -4,8 +4,6 @@
 extern bool	opt_tcache;
 extern ssize_t	opt_lg_tcache_max;
 
-extern cache_bin_info_t	*tcache_bin_info;
-
 /*
  * Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more
  * large-object bins.
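The single unlikely() branch in cache_bin_alloc_easy() above does double duty, and the low_water == -1 encoding is subtle. A hedged standalone model (toy_* names are hypothetical, not jemalloc API; same no-4-GiB-straddle assumption as the earlier sketch):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NCACHED_MAX 4 /* hypothetical */

typedef struct {
	void **cur;         /* stands in for cur_ptr.ptr */
	uint32_t low_water; /* stands in for low_water_position */
	uint32_t full;      /* stands in for full_position */
} toy_bin_t;

static void *
toy_alloc(toy_bin_t *bin, bool *success) {
	/* May load the reserved slot past the empty position; the value is
	 * discarded in that case, which is why the stack is padded. */
	void *ret = *(bin->cur++);
	uint32_t lowbits = (uint32_t)(uintptr_t)bin->cur;
	/* One branch covers both "hit a new low water mark" and "empty". */
	if (lowbits >= bin->low_water) {
		bin->low_water = lowbits; /* may end at empty + 1, i.e. -1 */
		uint32_t empty = bin->full + NCACHED_MAX * sizeof(void *);
		if (lowbits > empty) {
			bin->cur--; /* roll back to the empty position */
			*success = false;
			return NULL;
		}
	}
	*success = true;
	return ret;
}

int
main(void) {
	void *stack[NCACHED_MAX + 1] = {0};
	static int a, b;
	stack[NCACHED_MAX - 2] = &a; /* two cached items */
	stack[NCACHED_MAX - 1] = &b;

	toy_bin_t bin;
	bin.cur = &stack[NCACHED_MAX - 2];
	bin.full = (uint32_t)(uintptr_t)&stack[0];
	bin.low_water = (uint32_t)(uintptr_t)bin.cur; /* as set after GC */

	bool ok;
	assert(toy_alloc(&bin, &ok) == &a && ok);
	assert(toy_alloc(&bin, &ok) == &b && ok);
	assert(toy_alloc(&bin, &ok) == NULL && !ok); /* depleted */

	/* low_water sits one slot past empty: the low_water == -1 state. */
	uint32_t empty = bin.full + NCACHED_MAX * sizeof(void *);
	printf("low_water - empty = %u bytes\n",
	    (unsigned)(bin.low_water - empty));
	return 0;
}

Note how the common path (non-empty, above low water) touches neither tcache_bin_info nor the full position, which is the point of the layout.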
diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index 4815774b..4f7e02a8 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -48,7 +48,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
 
 	assert(binind < SC_NBINS);
 	bin = tcache_small_bin_get(tcache, binind);
-	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	ret = cache_bin_alloc_easy(bin, &tcache_success, binind);
 	assert(tcache_success == (ret != NULL));
 	if (unlikely(!tcache_success)) {
 		bool tcache_hard_success;
@@ -109,7 +109,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
 
 	assert(binind >= SC_NBINS && binind < nhbins);
 	bin = tcache_large_bin_get(tcache, binind);
-	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	ret = cache_bin_alloc_easy(bin, &tcache_success, binind);
 	assert(tcache_success == (ret != NULL));
 	if (unlikely(!tcache_success)) {
 		/*
@@ -164,7 +164,6 @@ JEMALLOC_ALWAYS_INLINE void
 tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
     bool slow_path) {
 	cache_bin_t *bin;
-	cache_bin_info_t *bin_info;
 
 	assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SC_SMALL_MAXCLASS);
 
@@ -174,11 +173,10 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 	}
 
 	bin = tcache_small_bin_get(tcache, binind);
-	bin_info = &tcache_bin_info[binind];
-	if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
+	if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) {
 		tcache_bin_flush_small(tsd, tcache, bin, binind,
-		    (bin_info->ncached_max >> 1));
-		bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
+		    tcache_bin_info[binind].ncached_max >> 1);
+		bool ret = cache_bin_dalloc_easy(bin, ptr);
 		assert(ret);
 	}
 
@@ -189,7 +187,6 @@ JEMALLOC_ALWAYS_INLINE void
 tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
     bool slow_path) {
 	cache_bin_t *bin;
-	cache_bin_info_t *bin_info;
 
 	assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SC_SMALL_MAXCLASS);
 
@@ -200,11 +197,10 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 	}
 
 	bin = tcache_large_bin_get(tcache, binind);
-	bin_info = &tcache_bin_info[binind];
-	if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
+	if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) {
 		tcache_bin_flush_large(tsd, tcache, bin, binind,
-		    (bin_info->ncached_max >> 1));
-		bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
+		    tcache_bin_info[binind].ncached_max >> 1);
+		bool ret = cache_bin_dalloc_easy(bin, ptr);
 		assert(ret);
 	}
 
diff --git a/src/arena.c b/src/arena.c
index e956c394..23d0294b 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -202,12 +202,13 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 		for (szind_t i = 0; i < SC_NBINS; i++) {
 			cache_bin_t *tbin = &descriptor->bins_small[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
-			    tbin->ncached * sz_index2size(i));
+			    cache_bin_ncached_get(tbin, i) * sz_index2size(i));
 		}
 		for (szind_t i = 0; i < nhbins - SC_NBINS; i++) {
 			cache_bin_t *tbin = &descriptor->bins_large[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
-			    tbin->ncached * sz_index2size(i));
+			    cache_bin_ncached_get(tbin, i + SC_NBINS) *
+			    sz_index2size(i));
 		}
 	}
 	malloc_mutex_prof_read(tsdn,
@@ -1381,7 +1382,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
 	unsigned i, nfill, cnt;
 
-	assert(tbin->ncached == 0);
+	assert(cache_bin_ncached_get(tbin, binind) == 0);
 
 	if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
 		prof_idump(tsdn);
@@ -1390,6 +1391,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 
 	unsigned binshard;
 	bin_t *bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
+	void **empty_position = cache_bin_empty_position_get(tbin, binind);
 	for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
 	    tcache->lg_fill_div[binind]); i < nfill; i += cnt) {
 		extent_t *slab;
@@ -1400,7 +1402,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 			    tofill : extent_nfree_get(slab);
 			arena_slab_reg_alloc_batch(
 			    slab, &bin_infos[binind], cnt,
-			    tbin->avail - nfill + i);
+			    empty_position - nfill + i);
 		} else {
 			cnt = 1;
 			void *ptr = arena_bin_malloc_hard(tsdn, arena, bin,
@@ -1412,18 +1414,18 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 			 */
 			if (ptr == NULL) {
 				if (i > 0) {
-					memmove(tbin->avail - i,
-					    tbin->avail - nfill,
+					memmove(empty_position - i,
+					    empty_position - nfill,
 					    i * sizeof(void *));
 				}
 				break;
 			}
 			/* Insert such that low regions get used first. */
-			*(tbin->avail - nfill + i) = ptr;
+			*(empty_position - nfill + i) = ptr;
 		}
 		if (config_fill && unlikely(opt_junk_alloc)) {
 			for (unsigned j = 0; j < cnt; j++) {
-				void* ptr = *(tbin->avail - nfill + i + j);
+				void* ptr = *(empty_position - nfill + i + j);
 				arena_alloc_junk_small(ptr, &bin_infos[binind],
 				    true);
 			}
@@ -1437,7 +1439,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 		tbin->tstats.nrequests = 0;
 	}
 	malloc_mutex_unlock(tsdn, &bin->lock);
-	tbin->ncached = i;
+	cache_bin_ncached_set(tbin, binind, i);
 	arena_decay_tick(tsdn, arena);
 }
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index dec987c5..75a40277 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -2368,7 +2368,7 @@ je_malloc(size_t size) {
 
 	cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
 	bool tcache_success;
-	void* ret = cache_bin_alloc_easy(bin, &tcache_success);
+	void *ret = cache_bin_alloc_easy(bin, &tcache_success, ind);
 
 	if (tcache_success) {
 		if (config_stats) {
@@ -2846,8 +2846,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	}
 
 	cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind);
-	cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind];
-	if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) {
+	if (!cache_bin_dalloc_easy(bin, ptr)) {
 		return false;
 	}
 
diff --git a/src/tcache.c b/src/tcache.c
index c5fe67a9..d282e1fa 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -14,7 +14,16 @@ bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 
 cache_bin_info_t	*tcache_bin_info;
-static unsigned		stack_nelms; /* Total stack elms per tcache. */
+/*
+ * For the total bin stack region (per tcache), reserve 2 more slots so that 1)
+ * the empty position can be safely read on the fast path before checking
+ * "is_empty"; and 2) the low_water == -1 case can go beyond the empty position
+ * by 1 step safely (i.e. no overflow).
+ */
+static const unsigned total_stack_padding = sizeof(void *) * 2;
+
+/* Total stack size required (per tcache). Include the padding above. */
+static uint32_t total_stack_bytes;
 
 unsigned	nhbins;
 size_t		tcache_maxclass;
@@ -47,14 +56,16 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 	} else {
 		tbin = tcache_large_bin_get(tcache, binind);
 	}
-	if (tbin->low_water > 0) {
+
+	cache_bin_sz_t low_water = cache_bin_low_water_get(tbin, binind);
+	cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+	if (low_water > 0) {
 		/*
 		 * Flush (ceiling) 3/4 of the objects below the low water mark.
 		 */
 		if (binind < SC_NBINS) {
 			tcache_bin_flush_small(tsd, tcache, tbin, binind,
-			    tbin->ncached - tbin->low_water + (tbin->low_water
-			    >> 2));
+			    ncached - low_water + (low_water >> 2));
 			/*
 			 * Reduce fill count by 2X. Limit lg_fill_div such that
 			 * the fill count is always at least 1.
@@ -66,10 +77,10 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 		}
 	} else {
 		tcache_bin_flush_large(tsd, tcache, tbin, binind,
-		    tbin->ncached - tbin->low_water + (tbin->low_water
-		    >> 2));
+		    ncached - low_water + (low_water >> 2));
 	}
-	} else if (tbin->low_water < 0) {
+	} else if (low_water < 0) {
+		assert(low_water == -1);
 		/*
 		 * Increase fill count by 2X for small bins. Make sure
 		 * lg_fill_div stays greater than 0.
@@ -78,7 +89,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 			tcache->lg_fill_div[binind]--;
 		}
 	}
-	tbin->low_water = tbin->ncached;
+	tbin->low_water_position = tbin->cur_ptr.lowbits;
 
 	tcache->next_gc_bin++;
 	if (tcache->next_gc_bin == nhbins) {
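One easy misreading of tcache_event_hard() above: the last argument to tcache_bin_flush_small/_large is rem, the number of items to keep, not the number to flush. A small arithmetic sketch with hypothetical numbers:

#include <assert.h>
#include <stdio.h>

int
main(void) {
	/* Hypothetical GC snapshot. */
	int ncached = 12, low_water = 8;

	/* rem = items kept; the flushed count is (ceiling) 3/4 of low_water. */
	int rem = ncached - low_water + (low_water >> 2);
	int flushed = ncached - rem;
	assert(rem == 6 && flushed == low_water - (low_water >> 2));

	/* Fill-count damping, as in tcache_event_hard(): halve the next
	 * refill when items sat unused (low_water > 0); double it, with
	 * lg_fill_div floored at 1, after a depleted (low_water == -1) pass. */
	unsigned ncached_max = 64, lg_fill_div = 1;
	lg_fill_div++; /* the low_water > 0 case above */
	printf("keep=%d flush=%d next_refill=%u\n",
	    rem, flushed, ncached_max >> lg_fill_div);
	return 0;
}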
@@ -97,7 +108,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 	if (config_prof) {
 		tcache->prof_accumbytes = 0;
 	}
-	ret = cache_bin_alloc_easy(tbin, tcache_success);
+	ret = cache_bin_alloc_easy(tbin, tcache_success, binind);
 
 	return ret;
 }
@@ -117,9 +128,10 @@ tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind,
 	 */
 	szind_t szind;
 	size_t sz_sum = binind * nflush;
+	void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
 	for (unsigned i = 0 ; i < nflush; i++) {
 		rtree_extent_szind_read(tsdn, &extents_rtree,
-		    rtree_ctx, (uintptr_t)*(tbin->avail - 1 - i), true,
+		    rtree_ctx, (uintptr_t)*(bottom_item - i), true,
 		    &extents[i], &szind);
 		sz_sum -= szind;
 	}
@@ -137,13 +149,15 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 	bool merged_stats = false;
 
 	assert(binind < SC_NBINS);
-	assert((cache_bin_sz_t)rem <= tbin->ncached);
+	cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+	assert((cache_bin_sz_t)rem <= ncached);
 
 	arena_t *arena = tcache->arena;
 	assert(arena != NULL);
-	unsigned nflush = tbin->ncached - rem;
+	unsigned nflush = ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
 
+	void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
 	/* Look up extent once per item. */
 	if (config_opt_safety_checks) {
 		tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind,
@@ -151,7 +165,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 	} else {
 		for (unsigned i = 0 ; i < nflush; i++) {
 			item_extent[i] = iealloc(tsd_tsdn(tsd),
-			    *(tbin->avail - 1 - i));
+			    *(bottom_item - i));
 		}
 	}
 	while (nflush > 0) {
@@ -181,7 +195,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		}
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
-			void *ptr = *(tbin->avail - 1 - i);
+			void *ptr = *(bottom_item - i);
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
@@ -196,7 +210,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 				 * locked. Stash the object, so that it can be
 				 * handled in a future pass.
 				 */
-				*(tbin->avail - 1 - ndeferred) = ptr;
+				*(bottom_item - ndeferred) = ptr;
 				item_extent[ndeferred] = extent;
 				ndeferred++;
 			}
@@ -219,11 +233,11 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
 	}
 
-	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	memmove(tbin->cur_ptr.ptr + (ncached - rem), tbin->cur_ptr.ptr, rem *
 	    sizeof(void *));
-	tbin->ncached = rem;
-	if (tbin->ncached < tbin->low_water) {
-		tbin->low_water = tbin->ncached;
+	cache_bin_ncached_set(tbin, binind, rem);
+	if (tbin->cur_ptr.lowbits > tbin->low_water_position) {
+		tbin->low_water_position = tbin->cur_ptr.lowbits;
 	}
 }
 
@@ -233,17 +247,19 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 	bool merged_stats = false;
 
 	assert(binind < nhbins);
-	assert((cache_bin_sz_t)rem <= tbin->ncached);
+	cache_bin_sz_t ncached = cache_bin_ncached_get(tbin, binind);
+	assert((cache_bin_sz_t)rem <= ncached);
 
 	arena_t *tcache_arena = tcache->arena;
 	assert(tcache_arena != NULL);
-	unsigned nflush = tbin->ncached - rem;
+	unsigned nflush = ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
 
+	void **bottom_item = cache_bin_bottom_item_get(tbin, binind);
 #ifndef JEMALLOC_EXTRA_SIZE_CHECK
 	/* Look up extent once per item. */
 	for (unsigned i = 0 ; i < nflush; i++) {
-		item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
+		item_extent[i] = iealloc(tsd_tsdn(tsd), *(bottom_item - i));
 	}
 #else
 	tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush,
@@ -266,7 +282,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 		    malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
 		}
 		for (unsigned i = 0; i < nflush; i++) {
-			void *ptr = *(tbin->avail - 1 - i);
+			void *ptr = *(bottom_item - i);
 			assert(ptr != NULL);
 			extent = item_extent[i];
 			if (extent_arena_ind_get(extent) == locked_arena_ind) {
@@ -295,7 +311,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
-			void *ptr = *(tbin->avail - 1 - i);
+			void *ptr = *(bottom_item - i);
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
@@ -308,7 +324,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 				 * Stash the object, so that it can be handled
 				 * in a future pass.
 				 */
-				*(tbin->avail - 1 - ndeferred) = ptr;
+				*(bottom_item - ndeferred) = ptr;
 				item_extent[ndeferred] = extent;
 				ndeferred++;
 			}
@@ -330,11 +346,11 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
 		tbin->tstats.nrequests = 0;
 	}
 
-	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	memmove(tbin->cur_ptr.ptr + (ncached - rem), tbin->cur_ptr.ptr, rem *
 	    sizeof(void *));
-	tbin->ncached = rem;
-	if (tbin->ncached < tbin->low_water) {
-		tbin->low_water = tbin->ncached;
+	cache_bin_ncached_set(tbin, binind, rem);
+	if (tbin->cur_ptr.lowbits > tbin->low_water_position) {
+		tbin->low_water_position = tbin->cur_ptr.lowbits;
 	}
 }
 
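The flush epilogues above move the survivors with a memmove whose direction flipped relative to the old avail-based code. A hedged sketch of what it does (hypothetical names, not jemalloc API): the nflush items nearest the empty position — the stalest ones — are flushed, and the rem newest items slide toward the empty position so cache_bin_ncached_set() can recompute cur_ptr:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NCACHED_MAX 8 /* hypothetical */

int
main(void) {
	void *stack[NCACHED_MAX + 1] = {0};
	static int items[NCACHED_MAX];
	unsigned ncached = 6, rem = 2;

	/* Cached pointers occupy cur .. empty - 1; cur holds the newest. */
	void **empty = &stack[NCACHED_MAX];
	void **cur = empty - ncached;
	for (unsigned i = 0; i < ncached; i++) {
		cur[i] = &items[i]; /* items[0] newest ... items[5] stalest */
	}

	/* Flush the (ncached - rem) stalest, then slide the rem newest
	 * down, exactly like the epilogue's memmove. */
	memmove(cur + (ncached - rem), cur, rem * sizeof(void *));
	cur += ncached - rem; /* what cache_bin_ncached_set() computes */

	assert(cur == empty - rem);
	assert(cur[0] == &items[0] && cur[1] == &items[1]);
	printf("kept the %u newest items\n", rem);
	return 0;
}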
+	 */
+	void *full_position = (void *)*stack_cur;
+	uint32_t bin_stack_size = tcache_bin_info[ind].ncached_max *
+	    sizeof(void *);
+
+	*stack_cur += bin_stack_size;
+	void *empty_position = (void *)*stack_cur;
+
+	/* Init to the empty position. */
+	bin->cur_ptr.ptr = empty_position;
+	bin->low_water_position = bin->cur_ptr.lowbits;
+	bin->full_position = (uint32_t)(uintptr_t)full_position;
+	assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size);
+	assert(cache_bin_ncached_get(bin, ind) == 0);
+	assert(cache_bin_empty_position_get(bin, ind) == empty_position);
+
+	return false;
+}
+
+/* Sanity check only. */
+static bool
+tcache_bin_lowbits_overflowable(void *ptr) {
+	uint32_t lowbits = (uint32_t)((uintptr_t)ptr + total_stack_bytes);
+	return lowbits < (uint32_t)(uintptr_t)ptr;
+}
+
 static void
 tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
+	assert(!tcache_bin_lowbits_overflowable(avail_stack));
+
 	memset(&tcache->link, 0, sizeof(ql_elm(tcache_t)));
 	tcache->prof_accumbytes = 0;
 	tcache->next_gc_bin = 0;
@@ -416,41 +466,43 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
 
 	ticker_init(&tcache->gc_ticker, TCACHE_GC_INCR);
 
-	size_t stack_offset = 0;
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
 	memset(tcache->bins_small, 0, sizeof(cache_bin_t) * SC_NBINS);
 	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS));
 	unsigned i = 0;
+	uintptr_t stack_cur = (uintptr_t)avail_stack;
 	for (; i < SC_NBINS; i++) {
 		tcache->lg_fill_div[i] = 1;
-		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
-		/*
-		 * avail points past the available space. Allocations will
-		 * access the slots toward higher addresses (for the benefit of
-		 * prefetch).
-		 */
-		tcache_small_bin_get(tcache, i)->avail =
-		    (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+		cache_bin_t *bin = tcache_small_bin_get(tcache, i);
+		tcache_bin_init(bin, i, &stack_cur);
 	}
 	for (; i < nhbins; i++) {
-		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
-		tcache_large_bin_get(tcache, i)->avail =
-		    (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+		cache_bin_t *bin = tcache_large_bin_get(tcache, i);
+		tcache_bin_init(bin, i, &stack_cur);
 	}
-	assert(stack_offset == stack_nelms * sizeof(void *));
+
+	/* Sanity check that the whole stack is used. */
+	size_t stack_offset = stack_cur - (uintptr_t)avail_stack;
+	assert(stack_offset + total_stack_padding == total_stack_bytes);
+}
+
+static size_t
+tcache_bin_stack_alignment(size_t size) {
+	/* Align pow2 to avoid overflowing the cache bin compressed pointers. */
+	return (LG_SIZEOF_PTR == 3) ? pow2_ceil_zu(size) : CACHELINE;
 }
 
 /* Initialize auto tcache (embedded in TSD). */
 bool
 tsd_tcache_data_init(tsd_t *tsd) {
 	tcache_t *tcache = tsd_tcachep_get_unsafe(tsd);
-	assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
-	size_t size = stack_nelms * sizeof(void *);
+	assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr == NULL);
 	/* Avoid false cacheline sharing. */
-	size = sz_sa2u(size, CACHELINE);
-
-	void *avail_array = ipallocztm(tsd_tsdn(tsd), size, CACHELINE, true,
-	    NULL, true, arena_get(TSDN_NULL, 0, true));
+	size_t size = sz_sa2u(total_stack_bytes, CACHELINE);
+	void *avail_array = ipallocztm(tsd_tsdn(tsd), size,
+	    tcache_bin_stack_alignment(size), true, NULL, true,
+	    arena_get(TSDN_NULL, 0, true));
 	if (avail_array == NULL) {
 		return true;
 	}
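Why pow2 alignment makes the 32-bit comparisons safe: a block aligned to pow2_ceil(size) cannot cross a multiple of that power of two, hence cannot cross a 4 GiB boundary either, so lowbits never wrap within the stack. A hedged sketch (64-bit addresses modeled with uint64_t; the pow2_ceil and lowbits_overflowable helpers here are illustrative, not jemalloc's own):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the patch's tcache_bin_lowbits_overflowable() check. */
static int
lowbits_overflowable(uint64_t base, uint64_t size) {
	uint32_t lowbits = (uint32_t)(base + size);
	return lowbits < (uint32_t)base;
}

static uint64_t
pow2_ceil(uint64_t x) {
	uint64_t p = 1;
	while (p < x) {
		p <<= 1;
	}
	return p;
}

int
main(void) {
	uint64_t size = 1000; /* hypothetical total_stack_bytes */
	uint64_t align = pow2_ceil(size); /* 1024 */

	/* An aligned base close to a 4 GiB boundary still fits: the next
	 * align-multiple is the boundary itself, so [base, base + size)
	 * cannot cross it. */
	uint64_t base = 0x100000000ULL - align;
	assert(!lowbits_overflowable(base, size));

	/* An unaligned base just below the boundary would wrap lowbits,
	 * which is exactly what the runtime assert guards against. */
	assert(lowbits_overflowable(0x100000000ULL - 0x80, size));

	printf("align = %llu\n", (unsigned long long)align);
	return 0;
}

On 32-bit builds the function falls back to CACHELINE alignment, since there the low bits are the whole pointer and cannot wrap within a valid allocation.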
@@ -485,25 +537,24 @@
 /* Created manual tcache for tcache.create mallctl. */
 tcache_t *
 tcache_create_explicit(tsd_t *tsd) {
-	tcache_t *tcache;
-	size_t size, stack_offset;
-
-	size = sizeof(tcache_t);
+	size_t size = sizeof(tcache_t);
 	/* Naturally align the pointer stacks. */
 	size = PTR_CEILING(size);
-	stack_offset = size;
-	size += stack_nelms * sizeof(void *);
+	size_t stack_offset = size;
+	size += total_stack_bytes;
 	/* Avoid false cacheline sharing. */
 	size = sz_sa2u(size, CACHELINE);
 
-	tcache = ipallocztm(tsd_tsdn(tsd), size, CACHELINE, true, NULL, true,
+	tcache_t *tcache = ipallocztm(tsd_tsdn(tsd), size,
+	    tcache_bin_stack_alignment(size), true, NULL, true,
 	    arena_get(TSDN_NULL, 0, true));
 	if (tcache == NULL) {
 		return NULL;
 	}
 
-	tcache_init(tsd, tcache,
-	    (void *)((uintptr_t)tcache + (uintptr_t)stack_offset));
+	void *avail_array = (void *)((uintptr_t)tcache +
+	    (uintptr_t)stack_offset);
+	tcache_init(tsd, tcache, avail_array);
 	tcache_arena_associate(tsd_tsdn(tsd), tcache, arena_ichoose(tsd, NULL));
 
 	return tcache;
@@ -553,9 +604,12 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 
 	if (tsd_tcache) {
 		/* Release the avail array for the TSD embedded auto tcache. */
-		void *avail_array =
-		    (void *)((uintptr_t)tcache_small_bin_get(tcache, 0)->avail -
-		    (uintptr_t)tcache_bin_info[0].ncached_max * sizeof(void *));
+		cache_bin_t *bin = tcache_small_bin_get(tcache, 0);
+		assert(cache_bin_ncached_get(bin, 0) == 0);
+		assert(cache_bin_empty_position_get(bin, 0) ==
+		    bin->cur_ptr.ptr);
+		void *avail_array = bin->cur_ptr.ptr -
+		    tcache_bin_info[0].ncached_max;
 		idalloctm(tsd_tsdn(tsd), avail_array, NULL, NULL, true, true);
 	} else {
 		/* Release both the tcache struct and avail array. */
@@ -587,16 +641,17 @@ tcache_cleanup(tsd_t *tsd) {
 	if (!tcache_available(tsd)) {
 		assert(tsd_tcache_enabled_get(tsd) == false);
 		if (config_debug) {
-			assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
+			assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr
+			    == NULL);
 		}
 		return;
 	}
 	assert(tsd_tcache_enabled_get(tsd));
-	assert(tcache_small_bin_get(tcache, 0)->avail != NULL);
+	assert(tcache_small_bin_get(tcache, 0)->cur_ptr.ptr != NULL);
 
 	tcache_destroy(tsd, tcache, true);
 	if (config_debug) {
-		tcache_small_bin_get(tcache, 0)->avail = NULL;
+		tcache_small_bin_get(tcache, 0)->cur_ptr.ptr = NULL;
 	}
 }
 
@@ -755,8 +810,8 @@ tcache_boot(tsdn_t *tsdn) {
 	if (tcache_bin_info == NULL) {
 		return true;
 	}
+	unsigned i, stack_nelms;
 	stack_nelms = 0;
-	unsigned i;
 	for (i = 0; i < SC_NBINS; i++) {
 		if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
 			tcache_bin_info[i].ncached_max =
@@ -775,6 +830,7 @@ tcache_boot(tsdn_t *tsdn) {
 		tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
 		stack_nelms += tcache_bin_info[i].ncached_max;
 	}
+	total_stack_bytes = stack_nelms * sizeof(void *) + total_stack_padding;
 
 	return false;
 }
diff --git a/test/unit/cache_bin.c b/test/unit/cache_bin.c
new file mode 100644
index 00000000..74cf24cb
--- /dev/null
+++ b/test/unit/cache_bin.c
@@ -0,0 +1,64 @@
+#include "test/jemalloc_test.h"
+
+cache_bin_t test_bin;
+
+TEST_BEGIN(test_cache_bin) {
+	cache_bin_t *bin = &test_bin;
+	cassert(PAGE > TCACHE_NSLOTS_SMALL_MAX * sizeof(void *));
+	/* Page aligned to make sure lowbits not overflowable. */
+	void **stack = mallocx(PAGE, MALLOCX_TCACHE_NONE | MALLOCX_ALIGN(PAGE));
+
+	assert_ptr_not_null(stack, "Unexpected mallocx failure");
+	/* Initialize to empty; bin 0. */
+	cache_bin_sz_t ncached_max = tcache_bin_info[0].ncached_max;
+	void **empty_position = stack + ncached_max;
+	bin->cur_ptr.ptr = empty_position;
+	bin->low_water_position = bin->cur_ptr.lowbits;
+	bin->full_position = (uint32_t)(uintptr_t)stack;
+	assert_ptr_eq(cache_bin_empty_position_get(bin, 0), empty_position,
+	    "Incorrect empty position");
+	/* Not using assert_zu etc on cache_bin_sz_t since it may change. */
+	assert_true(cache_bin_ncached_get(bin, 0) == 0, "Incorrect cache size");
+
+	bool success;
+	void *ret = cache_bin_alloc_easy(bin, &success, 0);
+	assert_false(success, "Empty cache bin should not alloc");
+	assert_true(cache_bin_low_water_get(bin, 0) == -1,
+	    "Incorrect low water mark");
+
+	cache_bin_ncached_set(bin, 0, 0);
+	assert_ptr_eq(bin->cur_ptr.ptr, empty_position, "Bin should be empty");
+	for (cache_bin_sz_t i = 1; i < ncached_max + 1; i++) {
+		success = cache_bin_dalloc_easy(bin, (void *)(uintptr_t)i);
+		assert_true(success && cache_bin_ncached_get(bin, 0) == i,
+		    "Bin dalloc failure");
+	}
+	success = cache_bin_dalloc_easy(bin, (void *)1);
+	assert_false(success, "Bin should be full");
+	assert_ptr_eq(bin->cur_ptr.ptr, stack, "Incorrect bin cur_ptr");
+
+	cache_bin_ncached_set(bin, 0, ncached_max);
+	assert_ptr_eq(bin->cur_ptr.ptr, stack, "cur_ptr should not change");
+	/* Emulate low water after refill. */
+	bin->low_water_position = bin->full_position;
+	for (cache_bin_sz_t i = ncached_max; i > 0; i--) {
+		ret = cache_bin_alloc_easy(bin, &success, 0);
+		cache_bin_sz_t ncached = cache_bin_ncached_get(bin, 0);
+		assert_true(success && ncached == i - 1,
+		    "Cache bin alloc failure");
+		assert_ptr_eq(ret, (void *)(uintptr_t)i, "Bin alloc failure");
+		assert_true(cache_bin_low_water_get(bin, 0) == ncached,
+		    "Incorrect low water mark");
+	}
+
+	ret = cache_bin_alloc_easy(bin, &success, 0);
+	assert_false(success, "Empty cache bin should not alloc.");
+	assert_ptr_eq(bin->cur_ptr.ptr, stack + ncached_max,
+	    "Bin should be empty");
+}
+TEST_END
+
+int
+main(void) {
+	return test(test_cache_bin);
+}