From 152c0ef954f19fc2bbe53fead9c62c9824f06109 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Tue, 3 Sep 2019 15:04:48 -0700 Subject: [PATCH] Build a general purpose thread event handler --- Makefile.in | 10 +- include/jemalloc/internal/prof_inlines_b.h | 62 +++-- include/jemalloc/internal/thread_event.h | 139 ++++++++++ include/jemalloc/internal/tsd.h | 76 ++++-- .../projects/vc2015/jemalloc/jemalloc.vcxproj | 3 +- .../vc2015/jemalloc/jemalloc.vcxproj.filters | 27 +- .../projects/vc2017/jemalloc/jemalloc.vcxproj | 3 +- .../vc2017/jemalloc/jemalloc.vcxproj.filters | 33 +-- src/jemalloc.c | 78 +++--- src/prof.c | 21 +- src/thread_event.c | 255 ++++++++++++++++++ test/unit/thread_event.c | 57 ++++ test/unit/thread_event.sh | 5 + 13 files changed, 630 insertions(+), 139 deletions(-) create mode 100644 include/jemalloc/internal/thread_event.h create mode 100644 src/thread_event.c create mode 100644 test/unit/thread_event.c create mode 100644 test/unit/thread_event.sh diff --git a/Makefile.in b/Makefile.in index fede961d..7eba7742 100644 --- a/Makefile.in +++ b/Makefile.in @@ -123,11 +123,12 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/prof_log.c \ $(srcroot)src/rtree.c \ $(srcroot)src/safety_check.c \ - $(srcroot)src/stats.c \ $(srcroot)src/sc.c \ + $(srcroot)src/stats.c \ $(srcroot)src/sz.c \ $(srcroot)src/tcache.c \ $(srcroot)src/test_hooks.c \ + $(srcroot)src/thread_event.c \ $(srcroot)src/ticker.c \ $(srcroot)src/tsd.c \ $(srcroot)src/witness.c @@ -176,9 +177,9 @@ TESTS_UNIT := \ $(srcroot)test/unit/background_thread.c \ $(srcroot)test/unit/background_thread_enable.c \ $(srcroot)test/unit/base.c \ + $(srcroot)test/unit/binshard.c \ $(srcroot)test/unit/bitmap.c \ $(srcroot)test/unit/bit_util.c \ - $(srcroot)test/unit/binshard.c \ $(srcroot)test/unit/buf_writer.c \ $(srcroot)test/unit/cache_bin.c \ $(srcroot)test/unit/ckh.c \ @@ -200,6 +201,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/math.c \ $(srcroot)test/unit/mq.c \ $(srcroot)test/unit/mtx.c \ + $(srcroot)test/unit/nstime.c \ $(srcroot)test/unit/pack.c \ $(srcroot)test/unit/pages.c \ $(srcroot)test/unit/ph.c \ @@ -218,9 +220,9 @@ TESTS_UNIT := \ $(srcroot)test/unit/retained.c \ $(srcroot)test/unit/rtree.c \ $(srcroot)test/unit/safety_check.c \ + $(srcroot)test/unit/sc.c \ $(srcroot)test/unit/seq.c \ $(srcroot)test/unit/SFMT.c \ - $(srcroot)test/unit/sc.c \ $(srcroot)test/unit/size_classes.c \ $(srcroot)test/unit/slab.c \ $(srcroot)test/unit/smoothstep.c \ @@ -228,8 +230,8 @@ TESTS_UNIT := \ $(srcroot)test/unit/stats.c \ $(srcroot)test/unit/stats_print.c \ $(srcroot)test/unit/test_hooks.c \ + $(srcroot)test/unit/thread_event.c \ $(srcroot)test/unit/ticker.c \ - $(srcroot)test/unit/nstime.c \ $(srcroot)test/unit/tsd.c \ $(srcroot)test/unit/witness.c \ $(srcroot)test/unit/zero.c \ diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 6b10f5bf..b4e65c05 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -3,6 +3,7 @@ #include "jemalloc/internal/safety_check.h" #include "jemalloc/internal/sz.h" +#include "jemalloc/internal/thread_event.h" JEMALLOC_ALWAYS_INLINE bool prof_gdump_get_unlocked(void) { @@ -79,24 +80,6 @@ prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, nstime_t t) { arena_prof_alloc_time_set(tsdn, ptr, t); } -JEMALLOC_ALWAYS_INLINE bool -prof_sample_check(tsd_t *tsd, size_t usize, bool update) { - ssize_t check = update ? 
0 : usize; - - int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd); - if (update) { - bytes_until_sample -= usize; - if (tsd_nominal(tsd)) { - tsd_bytes_until_sample_set(tsd, bytes_until_sample); - } - } - if (likely(bytes_until_sample >= check)) { - return true; - } - - return false; -} - JEMALLOC_ALWAYS_INLINE bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, prof_tdata_t **tdata_out) { @@ -105,7 +88,7 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, cassert(config_prof); /* Fastpath: no need to load tdata */ - if (likely(prof_sample_check(tsd, usize, update))) { + if (likely(prof_sample_event_wait_get(tsd) > 0)) { return true; } @@ -127,13 +110,40 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, return true; } - /* - * If this was the first creation of tdata, then - * prof_tdata_get() reset bytes_until_sample, so decrement and - * check it again - */ - if (!booted && prof_sample_check(tsd, usize, update)) { - return true; + if (!booted) { + /* + * If this was the first creation of tdata, then it means that + * the previous thread_event() relied on the wrong prof_sample + * wait time, and that it should have relied on the new + * prof_sample wait time just set by prof_tdata_get(), so we + * now manually check again. + * + * If the check fails, then even though we relied on the wrong + * prof_sample wait time, we're now actually in perfect shape, + * in the sense that we can pretend that we have used the right + * prof_sample wait time. + * + * If the check succeeds, then we are now in a tougher + * situation, in the sense that we cannot pretend that we have + * used the right prof_sample wait time. A straightforward + * solution would be to fully roll back thread_event(), set the + * right prof_sample wait time, and then redo thread_event(). + * A simpler way, which is implemented below, is to just set a + * new prof_sample wait time that is usize less, and do nothing + * else. Strictly speaking, the thread event handler may end + * up in a wrong state, since it has still recorded an event + * whereas in reality there may be no event. However, the + * difference in the wait time offsets the wrongly recorded + * event, so that, functionally, the countdown to the next + * event will behave exactly as if we have used the right + * prof_sample wait time in the first place. + */ + uint64_t wait = prof_sample_event_wait_get(tsd); + assert(wait > 0); + if (usize < wait) { + thread_prof_sample_event_update(tsd, wait - usize); + return true; + } } /* Compute new sample threshold. */ diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h new file mode 100644 index 00000000..08678b74 --- /dev/null +++ b/include/jemalloc/internal/thread_event.h @@ -0,0 +1,139 @@ +#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_H +#define JEMALLOC_INTERNAL_THREAD_EVENT_H + +#include "jemalloc/internal/tsd.h" + +/* + * Maximum threshold on thread_allocated_next_event_fast, so that there is no + * need to check overflow in malloc fast path. (The allocation size in malloc + * fast path never exceeds SC_LOOKUP_MAXCLASS.) + */ +#define THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX \ + (UINT64_MAX - SC_LOOKUP_MAXCLASS + 1U) + +/* + * The max interval helps make sure that malloc stays on the fast path in the + * common case, i.e. thread_allocated < thread_allocated_next_event_fast. 
+ * When thread_allocated is within an event's distance to + * THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX above, thread_allocated_next_event_fast + * is wrapped around and we fall back to the medium-fast path. The max interval + * makes sure that we're not staying on the fallback case for too long, even if + * there's no active event or if all active events have long wait times. + */ +#define THREAD_EVENT_MAX_INTERVAL ((uint64_t)(4U << 20)) + +void thread_event_assert_invariants_debug(tsd_t *tsd); +void thread_event_trigger(tsd_t *tsd, bool delay_event); +void thread_event_rollback(tsd_t *tsd, size_t diff); +void thread_event_update(tsd_t *tsd); +void thread_event_boot(); + +/* + * List of all events, in the following format: + * E(event, (condition)) + */ +#define ITERATE_OVER_ALL_EVENTS \ + E(prof_sample, (config_prof && opt_prof)) + +#define E(event, condition) \ + C(event##_event_wait) + +/* List of all thread event counters. */ +#define ITERATE_OVER_ALL_COUNTERS \ + C(thread_allocated) \ + C(thread_allocated_next_event_fast) \ + C(thread_allocated_last_event) \ + C(thread_allocated_next_event) \ + ITERATE_OVER_ALL_EVENTS + +/* Getters directly wrap TSD getters. */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE uint64_t \ +counter##_get(tsd_t *tsd) { \ + return tsd_##counter##_get(tsd); \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * Setters call the TSD pointer getters rather than the TSD setters, so that + * the counters can be modified even when TSD state is reincarnated or + * minimal_initialized: if an event is triggered in such cases, we will + * temporarily delay the event and let it be immediately triggered at the next + * allocation call. + */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE void \ +counter##_set(tsd_t *tsd, uint64_t v) { \ + *tsd_##counter##p_get(tsd) = v; \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * For generating _event_wait getter / setter functions for each individual + * event. + */ +#undef E + +/* + * The function checks in debug mode whether the thread event counters are in + * a consistent state, which forms the invariants before and after each round + * of thread event handling that we can rely on and need to promise. + * The invariants are only temporarily violated in the middle of: + * (a) thread_event() if an event is triggered (the thread_event_trigger() call + * at the end will restore the invariants), + * (b) thread_##event##_event_update() (the thread_event_update() call at the + * end will restore the invariants), or + * (c) thread_event_rollback() if the rollback falls below the last_event (the + * thread_event_update() call at the end will restore the invariants). + */ +JEMALLOC_ALWAYS_INLINE void +thread_event_assert_invariants(tsd_t *tsd) { + if (config_debug) { + thread_event_assert_invariants_debug(tsd); + } +} + +JEMALLOC_ALWAYS_INLINE void +thread_event(tsd_t *tsd, size_t usize) { + thread_event_assert_invariants(tsd); + + uint64_t thread_allocated_before = thread_allocated_get(tsd); + thread_allocated_set(tsd, thread_allocated_before + usize); + + /* The subtraction is intentionally susceptible to underflow. 
*/ + if (likely(usize < thread_allocated_next_event_get(tsd) - + thread_allocated_before)) { + thread_event_assert_invariants(tsd); + } else { + thread_event_trigger(tsd, false); + } +} + +#define E(event, condition) \ +JEMALLOC_ALWAYS_INLINE void \ +thread_##event##_event_update(tsd_t *tsd, uint64_t event_wait) { \ + thread_event_assert_invariants(tsd); \ + assert(condition); \ + assert(tsd_nominal(tsd)); \ + assert(tsd_reentrancy_level_get(tsd) == 0); \ + assert(event_wait > 0U); \ + if (THREAD_EVENT_MIN_START_WAIT > 1U && \ + unlikely(event_wait < THREAD_EVENT_MIN_START_WAIT)) { \ + event_wait = THREAD_EVENT_MIN_START_WAIT; \ + } \ + if (THREAD_EVENT_MAX_START_WAIT < UINT64_MAX && \ + unlikely(event_wait > THREAD_EVENT_MAX_START_WAIT)) { \ + event_wait = THREAD_EVENT_MAX_START_WAIT; \ + } \ + event##_event_wait_set(tsd, event_wait); \ + thread_event_update(tsd); \ +} + +ITERATE_OVER_ALL_EVENTS +#undef E + +#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_H */ diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index e2cc7747..14ad53d7 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -15,39 +15,45 @@ /* * Thread-Specific-Data layout - * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof --- + * --- data accessed on tcache fast path: state, rtree_ctx, stats --- * s: state * e: tcache_enabled * m: thread_allocated + * k: thread_allocated_next_event_fast * f: thread_deallocated - * b: bytes_until_sample (config_prof) - * p: prof_tdata (config_prof) * c: rtree_ctx (rtree cache accessed on deallocation) * t: tcache * --- data not accessed on tcache fast path: arena-related fields --- * d: arenas_tdata_bypass * r: reentrancy_level * x: narenas_tdata + * l: thread_allocated_last_event + * j: thread_allocated_next_event + * w: prof_sample_event_wait (config_prof) + * p: prof_tdata (config_prof) * v: offset_state * i: iarena * a: arena * o: arenas_tdata + * b: binshards * Loading TSD data is on the critical path of basically all malloc operations. * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective. * Use a compact layout to reduce cache footprint. * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+ * |---------------------------- 1st cacheline ----------------------------| - * | sedrxxxx vvvvvvvv mmmmmmmm ffffffff bbbbbbbb pppppppp [c * 16 .......] | + * | sedrxxxx mmmmmmmm kkkkkkkk ffffffff [c * 32 ........ ........ .......] | * |---------------------------- 2nd cacheline ----------------------------| * | [c * 64 ........ ........ ........ ........ ........ ........ .......] | * |---------------------------- 3nd cacheline ----------------------------| - * | [c * 48 ........ ........ ........ ........ .......] iiiiiiii aaaaaaaa | + * | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp | * +---------------------------- 4th cacheline ----------------------------+ - * | oooooooo [t...... ........ ........ ........ ........ ........ ........ | + * | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ | + * +---------------------------- 5th cacheline ----------------------------+ + * | ..b][t.. ........ ........ ........ ........ ........ ........ ........ | * +-------------------------------------------------------------------------+ * Note: the entire tcache is embedded into TSD and spans multiple cachelines. 
* - * The last 3 members (i, a and o) before tcache isn't really needed on tcache + * The elements after rtree_ctx and before tcache aren't really needed on tcache * fast path. However we have a number of unused tcache bins and witnesses * (never touched unless config_debug) at the end of tcache, so we place them * there to avoid breaking the cachelines and possibly paging in an extra page. @@ -64,18 +70,21 @@ typedef void (*test_callback_t)(int *); # define MALLOC_TEST_TSD_INITIALIZER #endif -/* O(name, type, nullable type */ +/* O(name, type, nullable type) */ #define MALLOC_TSD \ O(tcache_enabled, bool, bool) \ O(arenas_tdata_bypass, bool, bool) \ O(reentrancy_level, int8_t, int8_t) \ O(narenas_tdata, uint32_t, uint32_t) \ - O(offset_state, uint64_t, uint64_t) \ O(thread_allocated, uint64_t, uint64_t) \ + O(thread_allocated_next_event_fast, uint64_t, uint64_t) \ O(thread_deallocated, uint64_t, uint64_t) \ - O(bytes_until_sample, int64_t, int64_t) \ - O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \ + O(thread_allocated_last_event, uint64_t, uint64_t) \ + O(thread_allocated_next_event, uint64_t, uint64_t) \ + O(prof_sample_event_wait, uint64_t, uint64_t) \ + O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ + O(offset_state, uint64_t, uint64_t) \ O(iarena, arena_t *, arena_t *) \ O(arena, arena_t *, arena_t *) \ O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ @@ -84,25 +93,34 @@ typedef void (*test_callback_t)(int *); O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ MALLOC_TEST_TSD +/* + * THREAD_EVENT_MIN_START_WAIT should not exceed the minimal allocation usize. + */ +#define THREAD_EVENT_MIN_START_WAIT ((uint64_t)1U) +#define THREAD_EVENT_MAX_START_WAIT UINT64_MAX + #define TSD_INITIALIZER { \ - ATOMIC_INIT(tsd_state_uninitialized), \ - TCACHE_ENABLED_ZERO_INITIALIZER, \ - false, \ - 0, \ - 0, \ - 0, \ - 0, \ - 0, \ - 0, \ - NULL, \ - RTREE_CTX_ZERO_INITIALIZER, \ - NULL, \ - NULL, \ - NULL, \ - TSD_BINSHARDS_ZERO_INITIALIZER, \ - TCACHE_ZERO_INITIALIZER, \ - WITNESS_TSD_INITIALIZER \ - MALLOC_TEST_TSD_INITIALIZER \ + /* state */ ATOMIC_INIT(tsd_state_uninitialized), \ + /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \ + /* arenas_tdata_bypass */ false, \ + /* reentrancy_level */ 0, \ + /* narenas_tdata */ 0, \ + /* thread_allocated */ 0, \ + /* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \ + /* thread_deallocated */ 0, \ + /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \ + /* thread_allocated_last_event */ 0, \ + /* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \ + /* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \ + /* prof_tdata */ NULL, \ + /* offset_state */ 0, \ + /* iarena */ NULL, \ + /* arena */ NULL, \ + /* arenas_tdata */ NULL, \ + /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \ + /* tcache */ TCACHE_ZERO_INITIALIZER, \ + /* witness */ WITNESS_TSD_INITIALIZER \ + /* test data */ MALLOC_TEST_TSD_INITIALIZER \ } void *malloc_tsd_malloc(size_t size); diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj index a9683384..5838e933 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj @@ -63,15 +63,16 @@ + + - {8D6BB292-9E1C-413D-9F98-4864BDC1514A} diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters index bc40883b..3551ba5e 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters +++ 
b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters @@ -16,6 +16,9 @@ Source Files + + Source Files + Source Files @@ -25,6 +28,9 @@ Source Files + + Source Files + Source Files @@ -46,6 +52,9 @@ Source Files + + Source Files + Source Files @@ -76,6 +85,9 @@ Source Files + + Source Files + Source Files @@ -91,6 +103,9 @@ Source Files + + Source Files + Source Files @@ -100,17 +115,5 @@ Source Files - - Source Files - - - Source Files - - - Source Files - - - Source Files - diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj index 72a57e56..b9d4f681 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj @@ -63,15 +63,16 @@ + + - {8D6BB292-9E1C-413D-9F98-4864BDC1514A} diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters index 622b93f1..3551ba5e 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters @@ -16,6 +16,9 @@ Source Files + + Source Files + Source Files @@ -25,6 +28,9 @@ Source Files + + Source Files + Source Files @@ -46,6 +52,9 @@ Source Files + + Source Files + Source Files @@ -76,6 +85,9 @@ Source Files + + Source Files + Source Files @@ -88,6 +100,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -97,20 +115,5 @@ Source Files - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - diff --git a/src/jemalloc.c b/src/jemalloc.c index 88064df4..63a1e302 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -18,6 +18,7 @@ #include "jemalloc/internal/spin.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" +#include "jemalloc/internal/thread_event.h" #include "jemalloc/internal/util.h" /******************************************************************************/ @@ -1530,6 +1531,7 @@ malloc_init_hard_a0_locked() { prof_boot0(); } malloc_conf_init(&sc_data, bin_shard_sizes); + thread_event_boot(); sz_boot(&sc_data); bin_info_boot(&sc_data, bin_shard_sizes); @@ -2128,6 +2130,8 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { dopts->arena_ind = 0; } + thread_event(tsd, usize); + /* * If dopts->alignment > 0, then ind is still 0, but usize was computed * in the previous if statement. Down the positive alignment path, @@ -2136,20 +2140,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { /* If profiling is on, get our profiling context. */ if (config_prof && opt_prof) { - /* - * The fast path modifies bytes_until_sample regardless of - * prof_active. We reset it to be the sample interval, so that - * there won't be excessive routings to the slow path, and that - * when prof_active is turned on later, the counting for - * sampling can immediately resume as normal (though the very - * first sampling interval is not randomized). 
- */ - if (unlikely(tsd_bytes_until_sample_get(tsd) < 0) && - !prof_active_get_unlocked()) { - tsd_bytes_until_sample_set(tsd, - (ssize_t)(1 << lg_prof_sample)); - } - prof_tctx_t *tctx = prof_alloc_prep( tsd, usize, prof_active_get_unlocked(), true); @@ -2167,24 +2157,17 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { } if (unlikely(allocation == NULL)) { + thread_event_rollback(tsd, usize); prof_alloc_rollback(tsd, tctx, true); goto label_oom; } prof_malloc(tsd_tsdn(tsd), allocation, usize, &alloc_ctx, tctx); } else { assert(!opt_prof); - /* - * The fast path modifies bytes_until_sample regardless of - * opt_prof. We reset it to a huge value here, so as to - * minimize the triggering for slow path. - */ - if (config_prof && - unlikely(tsd_bytes_until_sample_get(tsd) < 0)) { - tsd_bytes_until_sample_set(tsd, SSIZE_MAX); - } allocation = imalloc_no_sample(sopts, dopts, tsd, size, usize, ind); if (unlikely(allocation == NULL)) { + thread_event_rollback(tsd, usize); goto label_oom; } } @@ -2197,7 +2180,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { || ((uintptr_t)allocation & (dopts->alignment - 1)) == ZU(0)); assert(usize == isalloc(tsd_tsdn(tsd), allocation)); - *tsd_thread_allocatedp_get(tsd) += usize; if (sopts->slow) { UTRACE(0, size, allocation); @@ -2373,7 +2355,12 @@ je_malloc(size_t size) { } szind_t ind = sz_size2index_lookup(size); - /* usize is always needed to increment thread_allocated. */ + /* + * The thread_allocated counter in tsd serves as a general purpose + * accumulator for bytes of allocation to trigger different types of + * events. usize is always needed to advance thread_allocated, though + * it's not always needed in the core allocation logic. + */ size_t usize = sz_index2size(ind); /* * Fast path relies on size being a bin. @@ -2382,19 +2369,12 @@ je_malloc(size_t size) { assert(ind < SC_NBINS); assert(size <= SC_SMALL_MAXCLASS); - if (config_prof) { - int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd); - bytes_until_sample -= usize; - tsd_bytes_until_sample_set(tsd, bytes_until_sample); - - if (unlikely(bytes_until_sample < 0)) { - /* - * Avoid a prof_active check on the fastpath. - * If prof_active is false, bytes_until_sample will be - * reset in slow path. 
- */ - return malloc_default(size); - } + uint64_t thread_allocated_after = thread_allocated_get(tsd) + usize; + assert(thread_allocated_next_event_fast_get(tsd) <= + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + if (unlikely(thread_allocated_after >= + thread_allocated_next_event_fast_get(tsd))) { + return malloc_default(size); } cache_bin_t *bin = tcache_small_bin_get(tcache, ind); @@ -2402,7 +2382,7 @@ je_malloc(size_t size) { void *ret = cache_bin_alloc_easy_reduced(bin, &tcache_success); if (tcache_success) { - *tsd_thread_allocatedp_get(tsd) += usize; + thread_allocated_set(tsd, thread_allocated_after); if (config_stats) { bin->tstats.nrequests++; } @@ -3116,9 +3096,11 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { goto label_oom; } + thread_event(tsd, usize); p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, zero, tcache, arena, &alloc_ctx, &hook_args); if (unlikely(p == NULL)) { + thread_event_rollback(tsd, usize); goto label_oom; } } else { @@ -3128,10 +3110,10 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { goto label_oom; } usize = isalloc(tsd_tsdn(tsd), p); + thread_event(tsd, usize); } assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); - *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; UTRACE(ptr, size, p); @@ -3307,6 +3289,7 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, usize_max = SC_LARGE_MAXCLASS; } } + thread_event(tsd, usize_max); tctx = prof_alloc_prep(tsd, usize_max, prof_active, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { @@ -3316,6 +3299,18 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); } + if (usize <= usize_max) { + thread_event_rollback(tsd, usize_max - usize); + } else { + /* + * For downsizing request, usize_max can be less than usize. + * We here further increase thread event counters so as to + * record the true usize, and then when the execution goes back + * to xallocx(), the entire usize will be rolled back if it's + * equal to the old usize. 
+ */ + thread_event(tsd, usize - usize_max); + } if (usize == old_usize) { prof_alloc_rollback(tsd, tctx, false); return usize; @@ -3373,12 +3368,13 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { } else { usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); + thread_event(tsd, usize); } if (unlikely(usize == old_usize)) { + thread_event_rollback(tsd, usize); goto label_not_resized; } - *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; label_not_resized: diff --git a/src/prof.c b/src/prof.c index fc0c7d8a..7e219dc3 100644 --- a/src/prof.c +++ b/src/prof.c @@ -5,6 +5,7 @@ #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/assert.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/thread_event.h" /* * This file implements the profiling "APIs" needed by other parts of jemalloc, @@ -471,8 +472,11 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { return; } + tsd_t *tsd = tsd_fetch(); + if (lg_prof_sample == 0) { - tsd_bytes_until_sample_set(tsd_fetch(), 0); + thread_prof_sample_event_update(tsd, + THREAD_EVENT_MIN_START_WAIT); return; } @@ -480,11 +484,11 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { * Compute sample interval as a geometrically distributed random * variable with mean (2^lg_prof_sample). * - * __ __ - * | log(u) | 1 - * tdata->bytes_until_sample = | -------- |, where p = --------------- - * | log(1-p) | lg_prof_sample - * 2 + * __ __ + * | log(u) | 1 + * bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 * * For more information on the math, see: * @@ -499,10 +503,7 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { uint64_t bytes_until_sample = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; - if (bytes_until_sample > SSIZE_MAX) { - bytes_until_sample = SSIZE_MAX; - } - tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample); + thread_prof_sample_event_update(tsd, bytes_until_sample); #endif } diff --git a/src/thread_event.c b/src/thread_event.c new file mode 100644 index 00000000..c6542f46 --- /dev/null +++ b/src/thread_event.c @@ -0,0 +1,255 @@ +#define JEMALLOC_THREAD_EVENT_C_ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/thread_event.h" + +/* + * There's no lock for thread_event_active because write is only done in + * malloc_init(), where init_lock there serves as the guard, and ever since + * then thread_event_active becomes read only. + */ +static bool thread_event_active = false; + +/* Event handler function signatures. 
*/ +#define E(event, condition) \ +static void thread_##event##_event_handler(tsd_t *tsd); + +ITERATE_OVER_ALL_EVENTS +#undef E + +static uint64_t +thread_allocated_next_event_compute(tsd_t *tsd) { + uint64_t wait = THREAD_EVENT_MAX_START_WAIT; + bool no_event_on = true; + +#define E(event, condition) \ + if (condition) { \ + no_event_on = false; \ + uint64_t event_wait = \ + event##_event_wait_get(tsd); \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + if (event_wait > 0U && event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + assert(no_event_on == !thread_event_active); + assert(wait <= THREAD_EVENT_MAX_START_WAIT); + return wait; +} + +void +thread_event_assert_invariants_debug(tsd_t *tsd) { + uint64_t thread_allocated = thread_allocated_get(tsd); + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t next_event = thread_allocated_next_event_get(tsd); + uint64_t next_event_fast = thread_allocated_next_event_fast_get(tsd); + + assert(last_event != next_event); + if (next_event <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) { + assert(next_event_fast == next_event); + } else { + assert(next_event_fast == 0U); + } + + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t interval = next_event - last_event; + + /* The subtraction is intentionally susceptible to underflow. */ + assert(thread_allocated - last_event < interval); + + uint64_t min_wait = thread_allocated_next_event_compute(tsd); + + /* + * next_event should have been pushed up only except when no event is + * on and the TSD is just initialized. The last_event == 0U guard + * below is stronger than needed, but having an exactly accurate guard + * is more complicated to implement. + */ + assert((!thread_event_active && last_event == 0U) || + interval == min_wait || + (interval < min_wait && interval == THREAD_EVENT_MAX_INTERVAL)); +} + +static void +thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) { + assert(wait <= THREAD_EVENT_MAX_START_WAIT); + uint64_t next_event = thread_allocated_last_event_get(tsd) + (wait <= + THREAD_EVENT_MAX_INTERVAL ? wait : THREAD_EVENT_MAX_INTERVAL); + thread_allocated_next_event_set(tsd, next_event); + uint64_t next_event_fast = (next_event <= + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? next_event : 0U; + thread_allocated_next_event_fast_set(tsd, next_event_fast); +} + +static void +thread_prof_sample_event_handler(tsd_t *tsd) { + assert(config_prof && opt_prof); + assert(prof_sample_event_wait_get(tsd) == 0U); + if (!prof_active_get_unlocked()) { + /* + * If prof_active is off, we reset prof_sample_event_wait to be + * the sample interval when it drops to 0, so that there won't + * be excessive routings to the slow path, and that when + * prof_active is turned on later, the counting for sampling + * can immediately resume as normal. 
+ */ + thread_prof_sample_event_update(tsd, + (uint64_t)(1 << lg_prof_sample)); + } +} + +static uint64_t +thread_event_trigger_batch_update(tsd_t *tsd, uint64_t accumbytes, + bool allow_event_trigger) { + uint64_t wait = THREAD_EVENT_MAX_START_WAIT; + +#define E(event, condition) \ + if (condition) { \ + uint64_t event_wait = event##_event_wait_get(tsd); \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + if (event_wait > accumbytes) { \ + event_wait -= accumbytes; \ + } else { \ + event_wait = 0U; \ + if (!allow_event_trigger) { \ + event_wait = \ + THREAD_EVENT_MIN_START_WAIT; \ + } \ + } \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + event##_event_wait_set(tsd, event_wait); \ + /* \ + * If there is a single event, then the remaining wait \ + * time may become zero, and we rely on either the \ + * event handler or a thread_event_update() call later \ + * to properly set next_event; if there are multiple \ + * events, then here we can get the minimum remaining \ + * wait time to the next already set event. \ + */ \ + if (event_wait > 0U && event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + assert(wait <= THREAD_EVENT_MAX_START_WAIT); + return wait; +} + +void +thread_event_trigger(tsd_t *tsd, bool delay_event) { + /* usize has already been added to thread_allocated. */ + uint64_t thread_allocated_after = thread_allocated_get(tsd); + + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t accumbytes = thread_allocated_after - + thread_allocated_last_event_get(tsd); + + /* Make sure that accumbytes cannot overflow uint64_t. */ + cassert(THREAD_EVENT_MAX_INTERVAL <= + UINT64_MAX - SC_LARGE_MAXCLASS + 1); + + thread_allocated_last_event_set(tsd, thread_allocated_after); + bool allow_event_trigger = !delay_event && tsd_nominal(tsd) && + tsd_reentrancy_level_get(tsd) == 0; + uint64_t wait = thread_event_trigger_batch_update(tsd, accumbytes, + allow_event_trigger); + thread_event_adjust_thresholds_helper(tsd, wait); + + thread_event_assert_invariants(tsd); + +#define E(event, condition) \ + if (condition && event##_event_wait_get(tsd) == 0U) { \ + assert(allow_event_trigger); \ + thread_##event##_event_handler(tsd); \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + thread_event_assert_invariants(tsd); +} + +void +thread_event_rollback(tsd_t *tsd, size_t diff) { + thread_event_assert_invariants(tsd); + + if (diff == 0U) { + return; + } + + uint64_t thread_allocated = thread_allocated_get(tsd); + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t thread_allocated_rollback = thread_allocated - diff; + thread_allocated_set(tsd, thread_allocated_rollback); + + uint64_t last_event = thread_allocated_last_event_get(tsd); + /* Both subtractions are intentionally susceptible to underflow. */ + if (thread_allocated_rollback - last_event <= + thread_allocated - last_event) { + thread_event_assert_invariants(tsd); + return; + } + + thread_allocated_last_event_set(tsd, thread_allocated_rollback); + + /* The subtraction is intentionally susceptible to underflow. 
*/ + uint64_t wait_diff = last_event - thread_allocated_rollback; + assert(wait_diff <= diff); + +#define E(event, condition) \ + if (condition) { \ + uint64_t event_wait = event##_event_wait_get(tsd); \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + if (event_wait > 0U) { \ + if (wait_diff > \ + THREAD_EVENT_MAX_START_WAIT - event_wait) { \ + event_wait = \ + THREAD_EVENT_MAX_START_WAIT; \ + } else { \ + event_wait += wait_diff; \ + } \ + assert(event_wait <= \ + THREAD_EVENT_MAX_START_WAIT); \ + event##_event_wait_set(tsd, event_wait); \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + thread_event_update(tsd); +} + +void +thread_event_update(tsd_t *tsd) { + uint64_t wait = thread_allocated_next_event_compute(tsd); + thread_event_adjust_thresholds_helper(tsd, wait); + + uint64_t last_event = thread_allocated_last_event_get(tsd); + + /* Both subtractions are intentionally susceptible to underflow. */ + if (thread_allocated_get(tsd) - last_event >= + thread_allocated_next_event_get(tsd) - last_event) { + thread_event_trigger(tsd, true); + } else { + thread_event_assert_invariants(tsd); + } +} + +void thread_event_boot() { +#define E(event, condition) \ + if (condition) { \ + thread_event_active = true; \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E +} diff --git a/test/unit/thread_event.c b/test/unit/thread_event.c new file mode 100644 index 00000000..6817262b --- /dev/null +++ b/test/unit/thread_event.c @@ -0,0 +1,57 @@ +#include "test/jemalloc_test.h" + +TEST_BEGIN(test_next_event_fast_roll_back) { + tsd_t *tsd = tsd_fetch(); + thread_allocated_last_event_set(tsd, 0); + thread_allocated_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX - 8U); + thread_allocated_next_event_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + thread_allocated_next_event_fast_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + prof_sample_event_wait_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + void *p = malloc(16U); + assert_ptr_not_null(p, "malloc() failed"); + free(p); +} +TEST_END + +TEST_BEGIN(test_next_event_fast_resume) { + tsd_t *tsd = tsd_fetch(); + thread_allocated_last_event_set(tsd, 0); + thread_allocated_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 8U); + thread_allocated_next_event_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U); + thread_allocated_next_event_fast_set(tsd, 0); + prof_sample_event_wait_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U); + void *p = malloc(SC_LOOKUP_MAXCLASS); + assert_ptr_not_null(p, "malloc() failed"); + free(p); +} +TEST_END + +TEST_BEGIN(test_event_rollback) { + tsd_t *tsd = tsd_fetch(); + const uint64_t diff = THREAD_EVENT_MAX_INTERVAL >> 2; + size_t count = 10; + uint64_t thread_allocated = thread_allocated_get(tsd); + while (count-- != 0) { + thread_event_rollback(tsd, diff); + uint64_t thread_allocated_after = thread_allocated_get(tsd); + assert_u64_eq(thread_allocated - thread_allocated_after, diff, + "thread event counters are not properly rolled back"); + thread_allocated = thread_allocated_after; + } +} +TEST_END + +int +main(void) { + return test( + test_next_event_fast_roll_back, + test_next_event_fast_resume, + test_event_rollback); +} diff --git a/test/unit/thread_event.sh b/test/unit/thread_event.sh new file mode 100644 index 00000000..8fcc7d8a --- /dev/null +++ b/test/unit/thread_event.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ "x${enable_prof}" = "x1" ] ; then + export MALLOC_CONF="prof:true,lg_prof_sample:0" +fi
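
A minimal standalone sketch of the accounting scheme that thread_event.{h,c} introduce, for readers following the patch: one per-thread byte counter is compared against a precomputed threshold on every allocation, and only when the threshold is crossed does the slow path subtract the accumulated bytes from each active event's wait time, fire the handlers whose wait reached zero, and recompute the threshold as last_event plus the minimum remaining wait (capped at the max interval). All names below (event_advance, event_trigger, sample_wait, the 64 KiB reset in the handler) are illustrative stand-ins rather than jemalloc identifiers, and the sketch models a single event with no rollback and no fast-path threshold; in the real code the extra thread_allocated_next_event_fast threshold is additionally clamped to UINT64_MAX - SC_LOOKUP_MAXCLASS + 1 so that the malloc fast path needs no overflow check.

/*
 * Toy model of the thread event accounting: a per-thread byte counter
 * ("allocated") is compared against a precomputed threshold
 * ("next_event"); only when the threshold is crossed does the slow path
 * run, charge the accumulated bytes against the event's wait time, fire
 * the handler when the wait reaches zero, and recompute the threshold.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EVENT_MAX_INTERVAL ((uint64_t)(4U << 20))	/* mirrors THREAD_EVENT_MAX_INTERVAL */

static uint64_t allocated;	/* total bytes allocated by this thread */
static uint64_t last_event;	/* value of "allocated" at the last trigger */
static uint64_t next_event;	/* value of "allocated" that triggers next */
static uint64_t sample_wait = 1;	/* stand-in event: bytes until it fires */

static void
event_update(void) {
	uint64_t wait = sample_wait < EVENT_MAX_INTERVAL ?
	    sample_wait : EVENT_MAX_INTERVAL;
	next_event = last_event + wait;
	/* Invariants in the spirit of thread_event_assert_invariants_debug(). */
	assert(next_event != last_event);
	assert(allocated - last_event < next_event - last_event);
}

static void
event_trigger(void) {
	uint64_t accumbytes = allocated - last_event;
	last_event = allocated;
	sample_wait = sample_wait > accumbytes ? sample_wait - accumbytes : 0;
	if (sample_wait == 0) {
		printf("sample event at %llu bytes\n",
		    (unsigned long long)allocated);
		sample_wait = 64 * 1024;	/* handler chooses the next wait */
	}
	event_update();
}

/* One comparison on every allocation; usize is the usable size. */
static void
event_advance(uint64_t usize) {
	uint64_t before = allocated;
	allocated += usize;
	/* Underflow-tolerant form of "allocated crossed next_event". */
	if (usize >= next_event - before) {
		event_trigger();
	}
}

int
main(void) {
	event_update();
	for (int i = 0; i < 64; i++) {
		event_advance(4096);
	}
	return 0;
}

Compiled as a plain C program, the loop leaves the single-comparison path only when the counter crosses the threshold (here roughly every 64 KiB), mirroring how thread_event() in the patch falls through to thread_event_trigger() only when thread_allocated crosses thread_allocated_next_event.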
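
The patch keeps the geometric sampling-interval draw in prof_sample_threshold_update() and only changes where the result lands (thread_prof_sample_event_update() instead of tsd_bytes_until_sample_set()). Below is a hedged, self-contained illustration of that draw; lg_prof_sample = 19 and libc rand() are assumptions made purely for the demo, standing in for the per-tdata PRNG state the real code uses.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void) {
	unsigned lg_prof_sample = 19;	/* mean interval 2^19 = 512 KiB */
	double p = 1.0 / (double)((uint64_t)1U << lg_prof_sample);
	double sum = 0.0;
	int n = 100000;
	for (int i = 0; i < n; i++) {
		/* u uniform in (0, 1). */
		double u = ((double)rand() + 1.0) / ((double)RAND_MAX + 2.0);
		/* bytes_until_sample = floor(log(u) / log(1 - p)) + 1, as in prof.c. */
		uint64_t wait = (uint64_t)(log(u) / log(1.0 - p)) + 1;
		sum += (double)wait;
	}
	printf("empirical mean: %.0f bytes (expected ~%llu)\n", sum / n,
	    (unsigned long long)((uint64_t)1U << lg_prof_sample));
	return 0;
}

Built with the math library (e.g. cc demo.c -lm), the empirical mean comes out close to 2^lg_prof_sample, which is why prof_sample_event_wait averages one sample event per 2^lg_prof_sample allocated bytes.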