diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h index 8a05eaed..3ceb4702 100644 --- a/include/jemalloc/internal/thread_event.h +++ b/include/jemalloc/internal/thread_event.h @@ -27,6 +27,7 @@ void thread_event_trigger(tsd_t *tsd, bool delay_event); void thread_event_rollback(tsd_t *tsd, size_t diff); void thread_event_update(tsd_t *tsd); void thread_event_boot(); +void thread_event_recompute_fast_threshold(tsd_t *tsd); void tsd_thread_event_init(tsd_t *tsd); /* @@ -43,9 +44,7 @@ void tsd_thread_event_init(tsd_t *tsd); /* List of all thread event counters. */ #define ITERATE_OVER_ALL_COUNTERS \ C(thread_allocated) \ - C(thread_allocated_next_event_fast) \ C(thread_allocated_last_event) \ - C(thread_allocated_next_event) \ ITERATE_OVER_ALL_EVENTS \ C(prof_sample_last_event) @@ -81,6 +80,60 @@ ITERATE_OVER_ALL_COUNTERS */ #undef E +/* + * Two malloc fastpath getters -- use the unsafe getters since tsd may be + * non-nominal, in which case the fast_threshold will be set to 0. This allows + * checking for events and tsd non-nominal in a single branch. + * + * Note that these can only be used on the fastpath. + */ +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_malloc_fastpath(tsd_t *tsd) { + return *tsd_thread_allocatedp_get_unsafe(tsd); +} + +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_next_event_malloc_fastpath(tsd_t *tsd) { + uint64_t v = *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd); + assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + return v; +} + +/* Below 3 for next_event_fast. */ +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_next_event_fast_get(tsd_t *tsd) { + uint64_t v = tsd_thread_allocated_next_event_fast_get(tsd); + assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + return v; +} + +JEMALLOC_ALWAYS_INLINE void +thread_allocated_next_event_fast_set(tsd_t *tsd, uint64_t v) { + assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + *tsd_thread_allocated_next_event_fastp_get(tsd) = v; +} + +JEMALLOC_ALWAYS_INLINE void +thread_allocated_next_event_fast_set_non_nominal(tsd_t *tsd) { + /* + * Set the fast threshold to zero when tsd is non-nominal. Use the + * unsafe getter as this may get called during tsd init and clean up. + */ + *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd) = 0; +} + +/* For next_event. Setter also updates the fast threshold. 
*/ +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_next_event_get(tsd_t *tsd) { + return tsd_thread_allocated_next_event_get(tsd); +} + +JEMALLOC_ALWAYS_INLINE void +thread_allocated_next_event_set(tsd_t *tsd, uint64_t v) { + *tsd_thread_allocated_next_eventp_get(tsd) = v; + thread_event_recompute_fast_threshold(tsd); +} + /* * The function checks in debug mode whether the thread event counters are in * a consistent state, which forms the invariants before and after each round diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 6332a003..961fc1f5 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -110,7 +110,7 @@ typedef void (*test_callback_t)(int *); /* reentrancy_level */ 0, \ /* narenas_tdata */ 0, \ /* thread_allocated */ 0, \ - /* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \ + /* thread_allocated_next_event_fast */ 0, \ /* thread_deallocated */ 0, \ /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \ /* thread_allocated_last_event */ 0, \ diff --git a/src/jemalloc.c b/src/jemalloc.c index e25e064b..af72d41a 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2354,7 +2354,7 @@ je_malloc(size_t size) { } tsd_t *tsd = tsd_get(false); - if (unlikely((size > SC_LOOKUP_MAXCLASS) || !tsd || !tsd_fast(tsd))) { + if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) { return malloc_default(size); } @@ -2373,13 +2373,17 @@ je_malloc(size_t size) { assert(ind < SC_NBINS); assert(size <= SC_SMALL_MAXCLASS); - uint64_t thread_allocated_after = thread_allocated_get(tsd) + usize; - assert(thread_allocated_next_event_fast_get(tsd) <= - THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); - if (unlikely(thread_allocated_after >= - thread_allocated_next_event_fast_get(tsd))) { + uint64_t allocated = thread_allocated_malloc_fastpath(tsd); + uint64_t threshold = thread_allocated_next_event_malloc_fastpath(tsd); + /* + * Check for events and tsd non-nominal (fast_threshold will be set to + * 0) in a single branch. + */ + uint64_t allocated_after = allocated + usize; + if (unlikely(allocated_after >= threshold)) { return malloc_default(size); } + assert(tsd_fast(tsd)); tcache_t *tcache = tsd_tcachep_get(tsd); cache_bin_t *bin = tcache_small_bin_get(tcache, ind); @@ -2387,7 +2391,7 @@ je_malloc(size_t size) { void *ret = cache_bin_alloc_easy_reduced(bin, &tcache_success); if (tcache_success) { - thread_allocated_set(tsd, thread_allocated_after); + thread_allocated_set(tsd, allocated_after); if (config_stats) { bin->tstats.nrequests++; } diff --git a/src/thread_event.c b/src/thread_event.c index 9a1d0f9b..0657c841 100644 --- a/src/thread_event.c +++ b/src/thread_event.c @@ -103,10 +103,11 @@ thread_event_assert_invariants_debug(tsd_t *tsd) { uint64_t next_event_fast = thread_allocated_next_event_fast_get(tsd); assert(last_event != next_event); - if (next_event <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) { - assert(next_event_fast == next_event); - } else { + if (next_event > THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX || + !tsd_fast(tsd)) { assert(next_event_fast == 0U); + } else { + assert(next_event_fast == next_event); } /* The subtraction is intentionally susceptible to underflow. 
*/ @@ -128,15 +129,77 @@ thread_event_assert_invariants_debug(tsd_t *tsd) { (interval < min_wait && interval == THREAD_EVENT_MAX_INTERVAL)); } +/* + * Synchronization around the fast threshold in tsd -- + * There are two threads to consider in the synchronization here: + * - The owner of the tsd being updated by a slow path change + * - The remote thread, doing that slow path change. + * + * As a design constraint, we want to ensure that a slow-path transition cannot + * be ignored for arbitrarily long, and that if the remote thread causes a + * slow-path transition and then communicates with the owner thread that it has + * occurred, then the owner will go down the slow path on the next allocator + * operation (so that we don't want to just wait until the owner hits its slow + * path reset condition on its own). + * + * Here's our strategy to do that: + * + * The remote thread will update the slow-path stores to TSD variables, issue a + * SEQ_CST fence, and then update the TSD next_event_fast counter. The owner + * thread will update next_event_fast, issue an SEQ_CST fence, and then check + * its TSD to see if it's on the slow path. + + * This is fairly straightforward when 64-bit atomics are supported. Assume that + * the remote fence is sandwiched between two owner fences in the reset pathway. + * The case where there is no preceding or trailing owner fence (i.e. because + * the owner thread is near the beginning or end of its life) can be analyzed + * similarly. The owner store to next_event_fast preceding the earlier owner + * fence will be earlier in coherence order than the remote store to it, so that + * the owner thread will go down the slow path once the store becomes visible to + * it, which is no later than the time of the second fence. + + * The case where we don't support 64-bit atomics is trickier, since word + * tearing is possible. We'll repeat the same analysis, and look at the two + * owner fences sandwiching the remote fence. The next_event_fast stores done + * alongside the earlier owner fence cannot overwrite any of the remote stores + * (since they precede the earlier owner fence in sb, which precedes the remote + * fence in sc, which precedes the remote stores in sb). After the second owner + * fence there will be a re-check of the slow-path variables anyways, so the + * "owner will notice that it's on the slow path eventually" guarantee is + * satisfied. To make sure that the out-of-band-messaging constraint is as well, + * note that either the message passing is sequenced before the second owner + * fence (in which case the remote stores happen before the second set of owner + * stores, so malloc sees a value of zero for next_event_fast and goes down the + * slow path), or it is not (in which case the owner sees the tsd slow-path + * writes on its previous update). This leaves open the possibility that the + * remote thread will (at some arbitrary point in the future) zero out one half + * of the owner thread's next_event_fast, but that's always safe (it just sends + * it down the slow path earlier). + */ +void +thread_event_recompute_fast_threshold(tsd_t *tsd) { + if (tsd_state_get(tsd) != tsd_state_nominal) { + /* Check first because this is also called on purgatory. */ + thread_allocated_next_event_fast_set_non_nominal(tsd); + return; + } + uint64_t next_event = thread_allocated_next_event_get(tsd); + uint64_t next_event_fast = (next_event <= + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? 
next_event : 0U; + thread_allocated_next_event_fast_set(tsd, next_event_fast); + + atomic_fence(ATOMIC_SEQ_CST); + if (tsd_state_get(tsd) != tsd_state_nominal) { + thread_allocated_next_event_fast_set_non_nominal(tsd); + } +} + static void thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) { assert(wait <= THREAD_EVENT_MAX_START_WAIT); uint64_t next_event = thread_allocated_last_event_get(tsd) + (wait <= THREAD_EVENT_MAX_INTERVAL ? wait : THREAD_EVENT_MAX_INTERVAL); thread_allocated_next_event_set(tsd, next_event); - uint64_t next_event_fast = (next_event <= - THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? next_event : 0U; - thread_allocated_next_event_fast_set(tsd, next_event_fast); } static uint64_t diff --git a/src/tsd.c b/src/tsd.c index 6e0ee93c..17e9eed2 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -115,8 +115,11 @@ tsd_force_recompute(tsdn_t *tsdn) { ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) { assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED) <= tsd_state_nominal_max); - tsd_atomic_store(&remote_tsd->state, tsd_state_nominal_recompute, - ATOMIC_RELAXED); + tsd_atomic_store(&remote_tsd->state, + tsd_state_nominal_recompute, ATOMIC_RELAXED); + /* See comments in thread_event_recompute_fast_threshold(). */ + atomic_fence(ATOMIC_SEQ_CST); + thread_allocated_next_event_fast_set_non_nominal(remote_tsd); } malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock); } @@ -175,6 +178,8 @@ tsd_slow_update(tsd_t *tsd) { old_state = tsd_atomic_exchange(&tsd->state, new_state, ATOMIC_ACQUIRE); } while (old_state == tsd_state_nominal_recompute); + + thread_event_recompute_fast_threshold(tsd); } void @@ -213,6 +218,7 @@ tsd_state_set(tsd_t *tsd, uint8_t new_state) { tsd_slow_update(tsd); } } + thread_event_recompute_fast_threshold(tsd); } static bool diff --git a/test/unit/thread_event.c b/test/unit/thread_event.c index cf5b2e59..f016cc5d 100644 --- a/test/unit/thread_event.c +++ b/test/unit/thread_event.c @@ -7,8 +7,6 @@ TEST_BEGIN(test_next_event_fast_roll_back) { THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX - 8U); thread_allocated_next_event_set(tsd, THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); - thread_allocated_next_event_fast_set(tsd, - THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); #define E(event, condition) \ event##_event_wait_set(tsd, \ THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); @@ -27,7 +25,6 @@ TEST_BEGIN(test_next_event_fast_resume) { THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 8U); thread_allocated_next_event_set(tsd, THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U); - thread_allocated_next_event_fast_set(tsd, 0); #define E(event, condition) \ event##_event_wait_set(tsd, \ THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U);
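
Illustrative sketch (not jemalloc code): the standalone C program below mirrors the core idea of this patch, namely keeping a separate fast-path threshold that is zeroed whenever the thread is non-nominal or the real threshold does not fit the fast-path range, so that a single unsigned comparison on the allocation fast path covers both "an event is due" and "this thread must take the slow path". All names here (FAST_MAX, thread_state_t, recompute_fast_threshold, fast_path_alloc, slow_path_alloc, the 1024-byte event interval) are hypothetical stand-ins, and the cross-thread SEQ_CST fence handshake used by tsd_force_recompute() / thread_event_recompute_fast_threshold() is deliberately omitted to keep the sketch single-threaded.

/*
 * Sketch only -- hypothetical names, not the jemalloc API.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Largest threshold the fast path can represent; larger values go slow. */
#define FAST_MAX ((uint64_t)1 << 32)

typedef struct {
	bool nominal;            /* thread state fully initialized and usable */
	uint64_t allocated;      /* bytes allocated so far */
	uint64_t next_event;     /* full-width threshold for the next event */
	uint64_t fast_threshold; /* 0 when non-nominal or next_event > FAST_MAX */
} thread_state_t;

/* Recompute the fast threshold whenever next_event or the state changes. */
static void
recompute_fast_threshold(thread_state_t *ts) {
	if (!ts->nominal) {
		/* Zero forces every allocation down the slow path. */
		ts->fast_threshold = 0;
		return;
	}
	ts->fast_threshold = (ts->next_event <= FAST_MAX) ? ts->next_event : 0;
}

static void
slow_path_alloc(thread_state_t *ts, uint64_t size) {
	/* Handle events, non-nominal state, oversized thresholds, etc. */
	ts->allocated += size;
	ts->next_event = ts->allocated + 1024; /* pick the next event point */
	recompute_fast_threshold(ts);
}

static void
fast_path_alloc(thread_state_t *ts, uint64_t size) {
	uint64_t after = ts->allocated + size;
	/*
	 * One branch: taken if an event is due, or if fast_threshold was
	 * zeroed (after >= 0 is always true for an unsigned value).
	 */
	if (after >= ts->fast_threshold) {
		slow_path_alloc(ts, size);
		return;
	}
	ts->allocated = after;
}

int
main(void) {
	thread_state_t ts = {true, 0, 1024, 0};
	recompute_fast_threshold(&ts);
	for (int i = 0; i < 100; i++) {
		fast_path_alloc(&ts, 64);
	}
	printf("allocated %llu, next_event %llu\n",
	    (unsigned long long)ts.allocated,
	    (unsigned long long)ts.next_event);
	return 0;
}

In the patch itself the same shape appears in je_malloc(): thread_allocated_next_event_malloc_fastpath() reads the possibly-zeroed fast threshold through the unsafe getter, and the single unlikely() comparison routes both event handling and non-nominal tsd to malloc_default().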