diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index d88f3d12..7e08f6b2 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -15,57 +15,30 @@
 
 /*
  * Thread-Specific-Data layout
- * --- data accessed on tcache fast path: state, rtree_ctx, stats ---
- * s: state
- * m: thread_allocated
- * k: thread_allocated_next_event_fast
- * f: thread_deallocated
- * h: thread_deallocated_next_event_fast
- * c: rtree_ctx (rtree cache accessed on deallocation)
- * t: tcache
- * --- data not accessed on tcache fast path: arena-related fields ---
- * e: tcache_enabled
- * d: arenas_tdata_bypass
- * r: reentrancy_level
- * n: narenas_tdata
- * l: thread_allocated_last_event
- * j: thread_allocated_next_event
- * q: thread_deallocated_last_event
- * u: thread_deallocated_next_event
- * g: tcache_gc_event_wait
- * y: tcache_gc_dalloc_event_wait
- * w: prof_sample_event_wait (config_prof)
- * x: prof_sample_last_event (config_prof)
- * z: stats_interval_event_wait
- * e: stats_interval_last_event
- * p: prof_tdata (config_prof)
- * v: prng_state
- * i: iarena
- * a: arena
- * o: arenas_tdata
- * b: binshards
- * Loading TSD data is on the critical path of basically all malloc operations.
- * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
- * Use a compact layout to reduce cache footprint.
- * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
- * |---------------------------- 1st cacheline ----------------------------|
- * | sedrnnnn mmmmmmmm kkkkkkkk ffffffff hhhhhhhh [c * 24 ........ ........]|
- * |---------------------------- 2nd cacheline ----------------------------|
- * | [c * 64 ........ ........ ........ ........ ........ ........ ........]|
- * |---------------------------- 3nd cacheline ----------------------------|
- * | [c * 40 ........ ........ ........ .......] llllllll jjjjjjjj qqqqqqqq |
- * +---------------------------- 4th cacheline ----------------------------+
- * | uuuuuuuu gggggggg yyyyyyyy wwwwwwww xxxxxxxx zzzzzzzz eeeeeeee pppppppp |
- * +---------------------------- 5th and after ----------------------------+
- * | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b * 40; then embedded tcache ..... |
- * +-------------------------------------------------------------------------+
- * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
  *
- * The elements after rtree_ctx and before tcache aren't really needed on tcache
- * fast path.  However we have a number of unused tcache bins and witnesses
- * (never touched unless config_debug) at the end of tcache, so we place them
- * there to avoid breaking the cachelines and possibly paging in an extra page.
+ * At least some thread-local data gets touched on the fast-path of almost all
+ * malloc operations.  But much of it is only necessary down slow-paths, or in
+ * testing.  We want to colocate the fast-path data so that it can live on the
+ * same cacheline if possible.  So we define three tiers of hotness:
+ * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
+ * TSD_DATA_SLOW: Touched down slow paths.  "Slow" here is sort of general;
+ *     there are "semi-slow" paths like "not a sized deallocation, but can still
+ *     live in the tcache".  We'll want to keep these closer to the fast-path
+ *     data.
+ * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
+ *
+ * An additional concern is that the larger tcache bins won't be used (we have a
+ * bin per size class, but by default only cache relatively small objects).  So
+ * the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the
+ * TSD_DATA_SLOWER tier.
+ *
+ * As a result of all this, we put the slow data first, then the fast data, then
+ * the slower data, while keeping the tcache as the last element of the fast
+ * data (so that the fast -> slower transition happens midway through the
+ * tcache).  While we don't yet play alignment tricks to guarantee it, this
+ * increases our odds of getting some cache/page locality on fast paths.
  */
+
 #ifdef JEMALLOC_JET
 typedef void (*test_callback_t)(int *);
 # define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
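The rewritten comment is the heart of the change: fields are now grouped by hotness tier rather than hand-packed against the byte diagram deleted above. As a toy illustration of the cacheline reasoning (a standalone sketch; the struct and field names are hypothetical, not jemalloc's mangled TSD fields), offsetof() shows contiguous hot fields landing on the same line:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64

typedef struct {
	int32_t slow_a;			/* slow tier: cold data first */
	int32_t slow_b;
	uint64_t fast_allocated;	/* fast tier: hot fields, contiguous */
	uint64_t fast_deallocated;
	uint64_t slower_debug;		/* slower tier: debug-only data last */
} toy_tsd_t;

int
main(void) {
	size_t first = offsetof(toy_tsd_t, fast_allocated);
	size_t last = offsetof(toy_tsd_t, fast_deallocated)
	    + sizeof(uint64_t) - 1;
	/* Both hot fields land on the same 64-byte line here. */
	printf("hot fields span cachelines %zu..%zu\n",
	    first / CACHELINE, last / CACHELINE);
	return 0;
}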
@@ -79,16 +52,11 @@ typedef void (*test_callback_t)(int *);
 #endif
 
 /* O(name, type, nullable type) */
-#define MALLOC_TSD \
+#define TSD_DATA_SLOW \
 	O(tcache_enabled, bool, bool) \
 	O(arenas_tdata_bypass, bool, bool) \
 	O(reentrancy_level, int8_t, int8_t) \
 	O(narenas_tdata, uint32_t, uint32_t) \
-	O(thread_allocated, uint64_t, uint64_t) \
-	O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
-	O(thread_deallocated, uint64_t, uint64_t) \
-	O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
-	O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
 	O(thread_allocated_last_event, uint64_t, uint64_t) \
 	O(thread_allocated_next_event, uint64_t, uint64_t) \
 	O(thread_deallocated_last_event, uint64_t, uint64_t) \
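The O() list above is an X-macro: the field list is written once, and each expansion site defines O to emit whatever it needs (a struct field, a getter, an initializer entry) before undefining it. A minimal standalone sketch of the pattern, using toy names rather than jemalloc's:

#include <stdint.h>
#include <stdio.h>

/* O(name, type) -- the field list, written exactly once. */
#define TOY_TSD \
	O(allocated, uint64_t) \
	O(level, int8_t)

/* Expansion 1: struct fields. */
typedef struct {
#define O(n, t) t n;
	TOY_TSD
#undef O
} toy_t;

/* Expansion 2: one getter per field. */
#define O(n, t) \
static inline t toy_##n##_get(toy_t *toy) { return toy->n; }
TOY_TSD
#undef O

int
main(void) {
	toy_t toy = {/* allocated */ 42, /* level */ 0};
	printf("%llu\n", (unsigned long long)toy_allocated_get(&toy));
	return 0;
}

The hunks that follow split the old single MALLOC_TSD list into the three hotness tiers, starting with the slow tier's initializer.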
@@ -104,28 +72,13 @@ typedef void (*test_callback_t)(int *);
 	O(iarena, arena_t *, arena_t *) \
 	O(arena, arena_t *, arena_t *) \
 	O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
-	O(binshards, tsd_binshards_t, tsd_binshards_t)\
-	O(tcache, tcache_t, tcache_t) \
-	O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
-	MALLOC_TEST_TSD
+	O(binshards, tsd_binshards_t, tsd_binshards_t)
 
-/*
- * TE_MIN_START_WAIT should not exceed the minimal allocation usize.
- */
-#define TE_MIN_START_WAIT ((uint64_t)1U)
-#define TE_MAX_START_WAIT UINT64_MAX
-
-#define TSD_INITIALIZER { \
-	/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
+#define TSD_DATA_SLOW_INITIALIZER \
 	/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
 	/* arenas_tdata_bypass */ false, \
 	/* reentrancy_level */ 0, \
 	/* narenas_tdata */ 0, \
-	/* thread_allocated */ 0, \
-	/* thread_allocated_next_event_fast */ 0, \
-	/* thread_deallocated */ 0, \
-	/* thread_deallocated_next_event_fast */ 0, \
-	/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
 	/* thread_allocated_last_event */ 0, \
 	/* thread_allocated_next_event */ TE_MIN_START_WAIT, \
 	/* thread_deallocated_last_event */ 0, \
@@ -141,10 +94,46 @@ typedef void (*test_callback_t)(int *);
 	/* iarena */ NULL, \
 	/* arena */ NULL, \
 	/* arenas_tdata */ NULL, \
-	/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
-	/* tcache */ TCACHE_ZERO_INITIALIZER, \
+	/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER,
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_FAST \
+	O(thread_allocated, uint64_t, uint64_t) \
+	O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
+	O(thread_deallocated, uint64_t, uint64_t) \
+	O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
+	O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
+	O(tcache, tcache_t, tcache_t)
+
+#define TSD_DATA_FAST_INITIALIZER \
+	/* thread_allocated */ 0, \
+	/* thread_allocated_next_event_fast */ 0, \
+	/* thread_deallocated */ 0, \
+	/* thread_deallocated_next_event_fast */ 0, \
+	/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
+	/* tcache */ TCACHE_ZERO_INITIALIZER,
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_SLOWER \
+	O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
+	MALLOC_TEST_TSD
+
+#define TSD_DATA_SLOWER_INITIALIZER \
 	/* witness */ WITNESS_TSD_INITIALIZER \
-	/* test data */ MALLOC_TEST_TSD_INITIALIZER \
+	/* test data */ MALLOC_TEST_TSD_INITIALIZER
+
+
+/*
+ * TE_MIN_START_WAIT should not exceed the minimal allocation usize.
+ */
+#define TE_MIN_START_WAIT ((uint64_t)1U)
+#define TE_MAX_START_WAIT UINT64_MAX
+
+#define TSD_INITIALIZER { \
+	TSD_DATA_SLOW_INITIALIZER \
+	/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
+	TSD_DATA_FAST_INITIALIZER \
+	TSD_DATA_SLOWER_INITIALIZER \
 }
 
 void *malloc_tsd_malloc(size_t size);
@@ -235,14 +224,17 @@ struct tsd_s {
 	 * setters below.
 	 */
 
+#define O(n, t, nt) \
+	t TSD_MANGLE(n);
+
+	TSD_DATA_SLOW
 	/*
 	 * We manually limit the state to just a single byte.  Unless the 8-bit
 	 * atomics are unavailable (which is rare).
 	 */
 	tsd_state_t state;
-#define O(n, t, nt) \
-	t TSD_MANGLE(n);
-MALLOC_TSD
+	TSD_DATA_FAST
+	TSD_DATA_SLOWER
 #undef O
 };
 
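Note the ordering in tsd_s: the slow tier comes first, then the one-byte state, then the fast and slower tiers, matching the layout comment. TSD_INITIALIZER above splices the per-tier initializer macros together positionally; each tier macro ends in a trailing comma, so the concatenation only works if the macro order matches the declaration order exactly. A toy sketch of that composition rule (hypothetical names, not jemalloc's):

#include <stdio.h>

typedef struct {
	int slow_a;	/* slow tier */
	int state;	/* wedged between the tiers, as in tsd_s */
	int fast_b;	/* fast tier */
} toy_t;

/* Each piece ends in a trailing comma so the pieces chain together. */
#define TOY_SLOW_INITIALIZER	/* slow_a */ 1,
#define TOY_FAST_INITIALIZER	/* fast_b */ 3,
#define TOY_INITIALIZER {		\
	TOY_SLOW_INITIALIZER		\
	/* state */ 2,			\
	TOY_FAST_INITIALIZER		\
}

int
main(void) {
	toy_t toy = TOY_INITIALIZER;	/* expands to { 1, 2, 3, } */
	printf("%d %d %d\n", toy.slow_a, toy.state, toy.fast_b);
	return 0;
}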
@@ -308,7 +300,9 @@ JEMALLOC_ALWAYS_INLINE t * \
 tsd_##n##p_get_unsafe(tsd_t *tsd) { \
 	return &tsd->TSD_MANGLE(n); \
 }
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
 #undef O
 
 /* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
@@ -327,7 +321,9 @@ tsd_##n##p_get(tsd_t *tsd) { \
 	    state == tsd_state_minimal_initialized); \
 	return tsd_##n##p_get_unsafe(tsd); \
 }
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
 #undef O
 
 /*
@@ -343,7 +339,9 @@ tsdn_##n##p_get(tsdn_t *tsdn) { \
 	tsd_t *tsd = tsdn_tsd(tsdn); \
 	return (nt *)tsd_##n##p_get(tsd); \
 }
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
 #undef O
 
 /* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
@@ -352,7 +350,9 @@ JEMALLOC_ALWAYS_INLINE t \
 tsd_##n##_get(tsd_t *tsd) { \
 	return *tsd_##n##p_get(tsd); \
 }
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
 #undef O
 
 /* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
@@ -363,7 +363,9 @@ tsd_##n##_set(tsd_t *tsd, t val) { \
 	    tsd_state_get(tsd) != tsd_state_minimal_initialized); \
 	*tsd_##n##p_get(tsd) = val; \
 }
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
 #undef O
 
 JEMALLOC_ALWAYS_INLINE void
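Every expansion site that previously named MALLOC_TSD now names all three tier lists, so the set of generated accessors, and therefore the API seen by callers, is unchanged. Completing the earlier toy sketch (hypothetical names again) to mirror the three-list expansion:

#include <stdint.h>
#include <stdio.h>

/* Three toy tier lists in place of one monolithic list. */
#define TOY_SLOW	O(level, int8_t)
#define TOY_FAST	O(allocated, uint64_t)
#define TOY_SLOWER	O(debug_count, uint64_t)

typedef struct {
#define O(n, t) t n;
	TOY_SLOW
	TOY_FAST
	TOY_SLOWER
#undef O
} toy_t;

/* Each expansion site now names all three lists where it named one. */
#define O(n, t) \
static inline void toy_##n##_set(toy_t *toy, t val) { toy->n = val; }
TOY_SLOW
TOY_FAST
TOY_SLOWER
#undef O

int
main(void) {
	toy_t toy = {0};
	toy_allocated_set(&toy, 42);	/* generated setter, same as before */
	printf("%llu\n", (unsigned long long)toy.allocated);
	return 0;
}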