Pull prof_accumbytes into thread event handler

Yinan Zhang 2019-10-14 09:35:51 -07:00
parent 152c0ef954
commit 198f02e797
16 changed files with 148 additions and 177 deletions
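
The net effect of the change: instead of each tcache carrying its own prof_accumbytes counter that must be flushed into an arena on fill, flush and teardown, the bytes that drive interval dumps (idump) are now taken from the per-thread allocation counters that already feed the thread event system and folded into a single global accumulator in prof.c. The toy program below models that flow in plain C; the names (idump_accum, prof_interval, prof_sample_last_event) merely mirror identifiers in the diff, and none of this is jemalloc API.

/*
 * Toy model of the new accumulation flow; illustrative only, no jemalloc
 * internals.  Each thread keeps a running "bytes allocated" counter; at a
 * prof-sample event the delta since the previous sample event is added to
 * one global accumulator, and an interval dump fires when prof_interval is
 * crossed.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t prof_interval = 1000;  /* dump roughly every 1000 bytes */
static uint64_t idump_accumbytes = 0;  /* stand-in for prof_idump_accumulated */

/* Returns nonzero when the accumulator crosses prof_interval. */
static int
idump_accum(uint64_t accumbytes) {
	uint64_t a1 = idump_accumbytes + accumbytes;
	int overflow = (a1 >= prof_interval);
	if (overflow) {
		a1 %= prof_interval;  /* coalesce multiple crossings into one dump */
	}
	idump_accumbytes = a1;
	return overflow;
}

int
main(void) {
	uint64_t thread_allocated = 0;   /* per-thread allocation counter */
	uint64_t last_sample_event = 0;  /* stand-in for prof_sample_last_event */
	const uint64_t allocs[] = {300, 500, 400, 2600};

	for (unsigned i = 0; i < sizeof(allocs) / sizeof(allocs[0]); i++) {
		thread_allocated += allocs[i];
		/* At a prof-sample event (here, after every allocation for
		 * simplicity), fold the delta into the global accumulator. */
		uint64_t delta = thread_allocated - last_sample_event;
		last_sample_event = thread_allocated;
		if (idump_accum(delta)) {
			printf("idump at %llu bytes allocated\n",
			    (unsigned long long)thread_allocated);
		}
	}
	return 0;
}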

@@ -49,7 +49,7 @@ void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
cache_bin_t *tbin, szind_t binind);
void arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info,
bool zero);

@@ -21,17 +21,6 @@ arena_internal_get(arena_t *arena) {
return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
}
static inline bool
arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
cassert(config_prof);
if (likely(prof_interval == 0 || !prof_active_get_unlocked())) {
return false;
}
return prof_accum_add(tsdn, &arena->prof_accum, accumbytes);
}
static inline void
percpu_arena_update(tsd_t *tsd, unsigned cpu) {
assert(have_percpu_arena);

@@ -24,7 +24,7 @@ arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
if (tcache_available(tsd)) {
tcache_t *tcache = tcache_get(tsd);
if (tcache->arena != NULL) {
/* See comments in tcache_data_init().*/
/* See comments in tsd_tcache_data_init().*/
assert(tcache->arena ==
arena_get(tsd_tsdn(tsd), 0, false));
if (tcache->arena != ret) {

@@ -33,13 +33,7 @@ extern bool prof_active;
/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
extern bool prof_gdump_val;
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
* profile dump when it reaches this threshold. The effect is that the
* interval between profile dumps averages prof_interval, though the actual
* interval between dumps will tend to be sporadic, and the interval will be a
* maximum of approximately (prof_interval * narenas).
*/
/* Profile dump interval, measured in bytes allocated. */
extern uint64_t prof_interval;
/*
@@ -50,6 +44,10 @@ extern size_t lg_prof_sample;
extern bool prof_booted;
/* Functions only accessed in prof_inlines_a.h */
bool prof_idump_accum_impl(tsdn_t *tsdn, uint64_t accumbytes);
void prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize);
void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
prof_tctx_t *tctx);
@@ -73,7 +71,7 @@ void prof_cnt_all(uint64_t *curobjs, uint64_t *curbytes, uint64_t *accumobjs,
#endif
int prof_getpid(void);
void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind);
bool prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum);
bool prof_accum_init(tsdn_t *tsdn);
void prof_idump(tsdn_t *tsdn);
bool prof_mdump(tsd_t *tsd, const char *filename);
void prof_gdump(tsdn_t *tsdn);

@@ -3,74 +3,6 @@
#include "jemalloc/internal/mutex.h"
static inline bool
prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
uint64_t accumbytes) {
cassert(config_prof);
bool overflow;
uint64_t a0, a1;
/*
* If the application allocates fast enough (and/or if idump is slow
* enough), extreme overflow here (a1 >= prof_interval * 2) can cause
* idump trigger coalescing. This is an intentional mechanism that
* avoids rate-limiting allocation.
*/
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
do {
a1 = a0 + accumbytes;
assert(a1 >= a0);
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
} while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_accum->mtx);
a0 = prof_accum->accumbytes;
a1 = a0 + accumbytes;
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
prof_accum->accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_accum->mtx);
#endif
return overflow;
}
static inline void
prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum,
size_t usize) {
cassert(config_prof);
/*
* Cancel out as much of the excessive prof_accumbytes increase as
* possible without underflowing. Interval-triggered dumps occur
* slightly more often than intended as a result of incomplete
* canceling.
*/
uint64_t a0, a1;
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
do {
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
} while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_accum->mtx);
a0 = prof_accum->accumbytes;
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
prof_accum->accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_accum->mtx);
#endif
}
JEMALLOC_ALWAYS_INLINE void
prof_active_assert() {
cassert(config_prof);
@@ -93,4 +25,26 @@ prof_active_get_unlocked(void) {
return prof_active;
}
JEMALLOC_ALWAYS_INLINE bool
prof_idump_accum(tsdn_t *tsdn, uint64_t accumbytes) {
cassert(config_prof);
if (prof_interval == 0 || !prof_active_get_unlocked()) {
return false;
}
return prof_idump_accum_impl(tsdn, accumbytes);
}
JEMALLOC_ALWAYS_INLINE void
prof_idump_rollback(tsdn_t *tsdn, size_t usize) {
cassert(config_prof);
if (prof_interval == 0 || !prof_active_get_unlocked()) {
return;
}
prof_idump_rollback_impl(tsdn, usize);
}
#endif /* JEMALLOC_INTERNAL_PROF_INLINES_A_H */

@@ -93,9 +93,6 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
if (config_stats) {
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
}
tcache_event(tsd, tcache);
return ret;
}
@@ -151,9 +148,6 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
if (config_stats) {
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
}
}
tcache_event(tsd, tcache);

@@ -16,10 +16,9 @@ struct tcache_s {
* together at the start of this struct.
*/
/* Cleared after arena_prof_accum(). */
uint64_t prof_accumbytes;
/* Drives incremental GC. */
ticker_t gc_ticker;
/*
* The pointer stacks associated with bins follow as a contiguous array.
* During tcache initialization, the avail pointer in each element of

@@ -47,8 +47,8 @@ typedef struct tcaches_s tcaches_t;
#define TCACHE_GC_INCR \
((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))
/* Used in TSD static initializer only. Real init in tcache_data_init(). */
#define TCACHE_ZERO_INITIALIZER {0}
/* Used in TSD static initializer only. Real init in tsd_tcache_data_init(). */
#define TCACHE_ZERO_INITIALIZER {{0}}
/* Used in TSD static initializer only. Will be initialized to opt_tcache. */
#define TCACHE_ENABLED_ZERO_INITIALIZER false

@@ -44,7 +44,8 @@ void thread_event_boot();
C(thread_allocated_next_event_fast) \
C(thread_allocated_last_event) \
C(thread_allocated_next_event) \
ITERATE_OVER_ALL_EVENTS
ITERATE_OVER_ALL_EVENTS \
C(prof_sample_last_event)
/* Getters directly wrap TSD getters. */
#define C(counter) \
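
The new per-thread counter only has to be appended to this list because the header generates its accessors through an X-macro: each name handed to C(...) expands into a getter that wraps the corresponding TSD field. A minimal standalone illustration of the pattern follows (invented struct and macro names, not jemalloc code).

/*
 * Minimal X-macro illustration; invented names.  Listing a counter once in
 * FOR_EACH_COUNTER is enough to get an accessor generated for it, which is
 * why the commit only appends C(prof_sample_last_event) above.
 */
#include <stdint.h>
#include <stdio.h>

struct tsd {
	uint64_t thread_allocated;
	uint64_t prof_sample_last_event;
};

#define FOR_EACH_COUNTER		\
	C(thread_allocated)		\
	C(prof_sample_last_event)

/* Expand one getter per listed counter. */
#define C(counter)				\
static uint64_t				\
counter##_get(const struct tsd *tsd) {	\
	return tsd->counter;			\
}
FOR_EACH_COUNTER
#undef C

int
main(void) {
	struct tsd tsd = {.thread_allocated = 4096,
	    .prof_sample_last_event = 1024};
	printf("%llu %llu\n",
	    (unsigned long long)thread_allocated_get(&tsd),
	    (unsigned long long)prof_sample_last_event_get(&tsd));
	return 0;
}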

@@ -30,6 +30,7 @@
* l: thread_allocated_last_event
* j: thread_allocated_next_event
* w: prof_sample_event_wait (config_prof)
* x: prof_sample_last_event (config_prof)
* p: prof_tdata (config_prof)
* v: offset_state
* i: iarena
@@ -45,11 +46,11 @@
* |---------------------------- 2nd cacheline ----------------------------|
* | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
* |---------------------------- 3rd cacheline ----------------------------|
* | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp |
* | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww xxxxxxxx |
* +---------------------------- 4th cacheline ----------------------------+
* | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ |
* | pppppppp vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ |
* +---------------------------- 5th cacheline ----------------------------+
* | ..b][t.. ........ ........ ........ ........ ........ ........ ........ |
* | ........ ..b][t.. ........ ........ ........ ........ ........ ........ |
* +-------------------------------------------------------------------------+
* Note: the entire tcache is embedded into TSD and spans multiple cachelines.
*
@@ -83,6 +84,7 @@ typedef void (*test_callback_t)(int *);
O(thread_allocated_last_event, uint64_t, uint64_t) \
O(thread_allocated_next_event, uint64_t, uint64_t) \
O(prof_sample_event_wait, uint64_t, uint64_t) \
O(prof_sample_last_event, uint64_t, uint64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
O(offset_state, uint64_t, uint64_t) \
O(iarena, arena_t *, arena_t *) \
@@ -109,9 +111,10 @@ typedef void (*test_callback_t)(int *);
/* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \
/* thread_deallocated */ 0, \
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
/* thread_allocated_last_event */ 0, \
/* thread_allocated_last_event */ 0, \
/* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \
/* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \
/* prof_sample_last_event */ 0, \
/* prof_tdata */ NULL, \
/* offset_state */ 0, \
/* iarena */ NULL, \

@@ -1378,13 +1378,10 @@ arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind,
void
arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
cache_bin_t *tbin, szind_t binind) {
unsigned i, nfill, cnt;
assert(cache_bin_ncached_get(tbin, binind) == 0);
if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
prof_idump(tsdn);
}
tcache->bin_refilled[binind] = true;
unsigned binshard;
@@ -1484,10 +1481,8 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
bin->stats.nrequests++;
bin->stats.curregs++;
}
malloc_mutex_unlock(tsdn, &bin->lock);
if (config_prof && arena_prof_accum(tsdn, arena, usize)) {
prof_idump(tsdn);
}
if (!zero) {
if (config_fill) {
@@ -1565,14 +1560,13 @@ arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize) {
extent_t *extent = rtree_extent_read(tsdn, &extents_rtree, rtree_ctx,
(uintptr_t)ptr, true);
arena_t *arena = arena_get_from_extent(extent);
szind_t szind = sz_size2index(usize);
extent_szind_set(extent, szind);
rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr,
szind, false);
prof_accum_cancel(tsdn, &arena->prof_accum, usize);
prof_idump_rollback(tsdn, usize);
assert(isalloc(tsdn, ptr) == usize);
}
@@ -1982,7 +1976,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
}
if (config_prof) {
if (prof_accum_init(tsdn, &arena->prof_accum)) {
if (prof_accum_init(tsdn)) {
goto label_error;
}
}
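
arena_prof_promote() now calls prof_idump_rollback() where it used to call prof_accum_cancel(). A sampled small request is backed by an SC_LARGE_MINCLASS-sized allocation, so the idump accumulator gets credited with more bytes than the nominal usize; the rollback subtracts that excess, clamping at zero rather than underflowing. A standalone sketch of the arithmetic with hypothetical sizes (plain C, not jemalloc code):

/*
 * Rollback arithmetic with hypothetical numbers.  LARGE_MINCLASS stands in
 * for SC_LARGE_MINCLASS; accumbytes stands in for the global idump
 * accumulator.
 */
#include <stdint.h>
#include <stdio.h>

#define LARGE_MINCLASS 16384u

static uint64_t accumbytes;

/* Subtract the over-accounted bytes, clamping at zero to avoid underflow. */
static void
rollback(uint64_t usize) {
	uint64_t excess = LARGE_MINCLASS - usize;
	accumbytes = (accumbytes >= excess) ? accumbytes - excess : 0;
}

int
main(void) {
	/* A sampled 8-byte request was carved from a LARGE_MINCLASS extent,
	 * so the accumulator saw 16384 bytes instead of 8. */
	accumbytes = 20000;
	rollback(8);
	printf("%llu\n", (unsigned long long)accumbytes);  /* 3624 */

	/* If the accumulator already wrapped (a dump already fired), there may
	 * be too little left to cancel fully; the clamp to zero is why dumps
	 * can occur slightly more often than intended. */
	accumbytes = 100;
	rollback(8);
	printf("%llu\n", (unsigned long long)accumbytes);  /* 0 */
	return 0;
}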

@@ -2386,9 +2386,6 @@ je_malloc(size_t size) {
if (config_stats) {
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
}
LOG("core.malloc.exit", "result: %p", ret);

@@ -56,9 +56,6 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
extent_list_append(&arena->large, extent);
malloc_mutex_unlock(tsdn, &arena->large_mtx);
}
if (config_prof && arena_prof_accum(tsdn, arena, usize)) {
prof_idump(tsdn);
}
if (zero) {
assert(is_zeroed);

@@ -45,6 +45,9 @@ bool opt_prof_leak = false;
bool opt_prof_accum = false;
char opt_prof_prefix[PROF_DUMP_FILENAME_LEN];
/* Accessed via prof_idump_[accum/rollback](). */
static prof_accum_t prof_idump_accumulated;
/*
* Initialized as opt_prof_active, and accessed via
* prof_active_[gs]et{_unlocked,}().
@@ -586,21 +589,91 @@ prof_fdump(void) {
}
bool
prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum) {
prof_accum_init(tsdn_t *tsdn) {
cassert(config_prof);
#ifndef JEMALLOC_ATOMIC_U64
if (malloc_mutex_init(&prof_accum->mtx, "prof_accum",
if (malloc_mutex_init(&prof_idump_accumulated.mtx, "prof_accum",
WITNESS_RANK_PROF_ACCUM, malloc_mutex_rank_exclusive)) {
return true;
}
prof_accum->accumbytes = 0;
prof_idump_accumulated.accumbytes = 0;
#else
atomic_store_u64(&prof_accum->accumbytes, 0, ATOMIC_RELAXED);
atomic_store_u64(&prof_idump_accumulated.accumbytes, 0,
ATOMIC_RELAXED);
#endif
return false;
}
bool
prof_idump_accum_impl(tsdn_t *tsdn, uint64_t accumbytes) {
cassert(config_prof);
bool overflow;
uint64_t a0, a1;
/*
* If the application allocates fast enough (and/or if idump is slow
* enough), extreme overflow here (a1 >= prof_interval * 2) can cause
* idump trigger coalescing. This is an intentional mechanism that
* avoids rate-limiting allocation.
*/
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_idump_accumulated.accumbytes,
ATOMIC_RELAXED);
do {
a1 = a0 + accumbytes;
assert(a1 >= a0);
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
} while (!atomic_compare_exchange_weak_u64(
&prof_idump_accumulated.accumbytes, &a0, a1, ATOMIC_RELAXED,
ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_idump_accumulated.mtx);
a0 = prof_idump_accumulated.accumbytes;
a1 = a0 + accumbytes;
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
prof_idump_accumulated.accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_idump_accumulated.mtx);
#endif
return overflow;
}
void
prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize) {
cassert(config_prof);
/*
* Cancel out as much of the excessive accumbytes increase as possible
* without underflowing. Interval-triggered dumps occur slightly more
* often than intended as a result of incomplete canceling.
*/
uint64_t a0, a1;
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_idump_accumulated.accumbytes,
ATOMIC_RELAXED);
do {
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
} while (!atomic_compare_exchange_weak_u64(
&prof_idump_accumulated.accumbytes, &a0, a1, ATOMIC_RELAXED,
ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_idump_accumulated.mtx);
a0 = prof_idump_accumulated.accumbytes;
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
prof_idump_accumulated.accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_idump_accumulated.mtx);
#endif
}
bool
prof_dump_prefix_set(tsdn_t *tsdn, const char *prefix) {
cassert(config_prof);
@@ -641,7 +714,7 @@ prof_idump(tsdn_t *tsdn) {
return;
}
tdata = prof_tdata_get(tsd, false);
tdata = prof_tdata_get(tsd, true);
if (tdata == NULL) {
return;
}
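
On builds with 64-bit atomics, prof_idump_accum_impl() above performs the add-and-wrap in a single compare-and-swap loop, so a burst that jumps past several multiples of prof_interval still produces exactly one trigger (the "idump trigger coalescing" its comment describes). The sketch below reproduces that shape with standard C11 atomics; names, types and the interval value are illustrative, not the ones in the diff.

/*
 * Standalone sketch of the lock-free accumulate-and-wrap update, written
 * against C11 <stdatomic.h> rather than jemalloc's atomic wrappers.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t accumbytes;
static const uint64_t interval = 1u << 20;  /* 1 MiB, illustrative */

static bool
accum_add(uint64_t nbytes) {
	uint64_t a0 = atomic_load_explicit(&accumbytes, memory_order_relaxed);
	uint64_t a1;
	bool overflow;
	do {
		a1 = a0 + nbytes;
		overflow = (a1 >= interval);
		if (overflow) {
			a1 %= interval;  /* >= 2 * interval still yields one trigger */
		}
	} while (!atomic_compare_exchange_weak_explicit(&accumbytes, &a0, a1,
	    memory_order_relaxed, memory_order_relaxed));
	return overflow;
}

int
main(void) {
	/* One update crossing the interval three times over reports a single
	 * overflow, i.e. a single idump. */
	bool trigger = accum_add(3 * (1u << 20) + 123);
	printf("trigger=%d leftover=%llu\n", (int)trigger,
	    (unsigned long long)atomic_load(&accumbytes));
	return 0;
}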

@@ -106,11 +106,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
void *ret;
assert(tcache->arena != NULL);
arena_tcache_fill_small(tsdn, arena, tcache, tbin, binind,
config_prof ? tcache->prof_accumbytes : 0);
if (config_prof) {
tcache->prof_accumbytes = 0;
}
arena_tcache_fill_small(tsdn, arena, tcache, tbin, binind);
ret = cache_bin_alloc_easy(tbin, tcache_success, binind);
return ret;
@@ -181,14 +177,6 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
assert(binshard < bin_infos[binind].n_shards);
bin_t *bin = &bin_arena->bins[binind].bin_shards[binshard];
if (config_prof && bin_arena == arena) {
if (arena_prof_accum(tsd_tsdn(tsd), arena,
tcache->prof_accumbytes)) {
prof_idump(tsd_tsdn(tsd));
}
tcache->prof_accumbytes = 0;
}
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
if (config_stats && bin_arena == arena && !merged_stats) {
merged_stats = true;
@@ -274,11 +262,6 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
unsigned locked_arena_ind = extent_arena_ind_get(extent);
arena_t *locked_arena = arena_get(tsd_tsdn(tsd),
locked_arena_ind, false);
bool idump;
if (config_prof) {
idump = false;
}
bool lock_large = !arena_is_auto(locked_arena);
if (lock_large) {
@@ -295,11 +278,6 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
}
if ((config_prof || config_stats) &&
(locked_arena == tcache_arena)) {
if (config_prof) {
idump = arena_prof_accum(tsd_tsdn(tsd),
tcache_arena, tcache->prof_accumbytes);
tcache->prof_accumbytes = 0;
}
if (config_stats) {
merged_stats = true;
arena_stats_large_flush_nrequests_add(
@@ -332,9 +310,6 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
ndeferred++;
}
}
if (config_prof && idump) {
prof_idump(tsd_tsdn(tsd));
}
arena_decay_ticks(tsd_tsdn(tsd), locked_arena, nflush -
ndeferred);
nflush = ndeferred;
@@ -462,7 +437,6 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
assert(!tcache_bin_lowbits_overflowable(avail_stack));
memset(&tcache->link, 0, sizeof(ql_elm(tcache_t)));
tcache->prof_accumbytes = 0;
tcache->next_gc_bin = 0;
tcache->arena = NULL;
@@ -590,14 +564,6 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
assert(tbin->tstats.nrequests == 0);
}
}
if (config_prof && tcache->prof_accumbytes > 0) {
if (arena_prof_accum(tsd_tsdn(tsd), tcache->arena,
tcache->prof_accumbytes)) {
prof_idump(tsd_tsdn(tsd));
}
tcache->prof_accumbytes = 0;
}
}
void

@@ -18,6 +18,29 @@ static void thread_##event##_event_handler(tsd_t *tsd);
ITERATE_OVER_ALL_EVENTS
#undef E
static void
thread_prof_sample_event_handler(tsd_t *tsd) {
assert(config_prof && opt_prof);
assert(prof_sample_event_wait_get(tsd) == 0U);
uint64_t last_event = thread_allocated_last_event_get(tsd);
uint64_t last_sample_event = prof_sample_last_event_get(tsd);
prof_sample_last_event_set(tsd, last_event);
if (prof_idump_accum(tsd_tsdn(tsd), last_event - last_sample_event)) {
prof_idump(tsd_tsdn(tsd));
}
if (!prof_active_get_unlocked()) {
/*
* If prof_active is off, we reset prof_sample_event_wait to be
* the sample interval when it drops to 0, so that there won't
* be excessive routings to the slow path, and that when
* prof_active is turned on later, the counting for sampling
* can immediately resume as normal.
*/
thread_prof_sample_event_update(tsd,
(uint64_t)(1 << lg_prof_sample));
}
}
static uint64_t
thread_allocated_next_event_compute(tsd_t *tsd) {
uint64_t wait = THREAD_EVENT_MAX_START_WAIT;
@@ -86,23 +109,6 @@ thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) {
thread_allocated_next_event_fast_set(tsd, next_event_fast);
}
static void
thread_prof_sample_event_handler(tsd_t *tsd) {
assert(config_prof && opt_prof);
assert(prof_sample_event_wait_get(tsd) == 0U);
if (!prof_active_get_unlocked()) {
/*
* If prof_active is off, we reset prof_sample_event_wait to be
* the sample interval when it drops to 0, so that there won't
* be excessive routings to the slow path, and that when
* prof_active is turned on later, the counting for sampling
* can immediately resume as normal.
*/
thread_prof_sample_event_update(tsd,
(uint64_t)(1 << lg_prof_sample));
}
}
static uint64_t
thread_event_trigger_batch_update(tsd_t *tsd, uint64_t accumbytes,
bool allow_event_trigger) {