Pull prof_accumbytes into thread event handler

Author: Yinan Zhang
Date:   2019-10-14 09:35:51 -07:00
Parent: 152c0ef954
Commit: 198f02e797

16 changed files with 148 additions and 177 deletions
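
In outline, the hunks below drop two pieces of distributed profiling bookkeeping, the per-tcache prof_accumbytes counter and the per-arena prof_accum path (arena_prof_accum, prof_accum_add, prof_accum_cancel), in favor of a single per-thread checkpoint, prof_sample_last_event, stored in TSD and consumed by the thread event machinery; interval-triggered idumps now go through the new prof_idump_accum / prof_idump_rollback wrappers. A hedged sketch of the resulting handler flow follows the tsd.h hunks at the end.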

@@ -49,7 +49,7 @@ void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
 void arena_reset(tsd_t *tsd, arena_t *arena);
 void arena_destroy(tsd_t *tsd, arena_t *arena);
 void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
-    cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
+    cache_bin_t *tbin, szind_t binind);
 void arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info,
     bool zero);

@@ -21,17 +21,6 @@ arena_internal_get(arena_t *arena) {
     return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
 }
 
-static inline bool
-arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
-    cassert(config_prof);
-
-    if (likely(prof_interval == 0 || !prof_active_get_unlocked())) {
-        return false;
-    }
-
-    return prof_accum_add(tsdn, &arena->prof_accum, accumbytes);
-}
-
 static inline void
 percpu_arena_update(tsd_t *tsd, unsigned cpu) {
     assert(have_percpu_arena);

@@ -24,7 +24,7 @@ arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
     if (tcache_available(tsd)) {
         tcache_t *tcache = tcache_get(tsd);
         if (tcache->arena != NULL) {
-            /* See comments in tcache_data_init().*/
+            /* See comments in tsd_tcache_data_init().*/
             assert(tcache->arena ==
                 arena_get(tsd_tsdn(tsd), 0, false));
             if (tcache->arena != ret) {

@@ -33,13 +33,7 @@ extern bool prof_active;
 /* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
 extern bool prof_gdump_val;
 
-/*
- * Profile dump interval, measured in bytes allocated. Each arena triggers a
- * profile dump when it reaches this threshold. The effect is that the
- * interval between profile dumps averages prof_interval, though the actual
- * interval between dumps will tend to be sporadic, and the interval will be a
- * maximum of approximately (prof_interval * narenas).
- */
+/* Profile dump interval, measured in bytes allocated. */
 extern uint64_t prof_interval;
 
 /*
@@ -50,6 +44,10 @@ extern size_t lg_prof_sample;
 extern bool prof_booted;
 
+/* Functions only accessed in prof_inlines_a.h */
+bool prof_idump_accum_impl(tsdn_t *tsdn, uint64_t accumbytes);
+void prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize);
+
 void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
 void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
     prof_tctx_t *tctx);
@@ -73,7 +71,7 @@ void prof_cnt_all(uint64_t *curobjs, uint64_t *curbytes, uint64_t *accumobjs,
 #endif
 int prof_getpid(void);
 void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind);
-bool prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum);
+bool prof_accum_init(tsdn_t *tsdn);
 void prof_idump(tsdn_t *tsdn);
 bool prof_mdump(tsd_t *tsd, const char *filename);
 void prof_gdump(tsdn_t *tsdn);

@@ -3,74 +3,6 @@
 
 #include "jemalloc/internal/mutex.h"
 
-static inline bool
-prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
-    uint64_t accumbytes) {
-    cassert(config_prof);
-
-    bool overflow;
-    uint64_t a0, a1;
-
-    /*
-     * If the application allocates fast enough (and/or if idump is slow
-     * enough), extreme overflow here (a1 >= prof_interval * 2) can cause
-     * idump trigger coalescing. This is an intentional mechanism that
-     * avoids rate-limiting allocation.
-     */
-#ifdef JEMALLOC_ATOMIC_U64
-    a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
-    do {
-        a1 = a0 + accumbytes;
-        assert(a1 >= a0);
-        overflow = (a1 >= prof_interval);
-        if (overflow) {
-            a1 %= prof_interval;
-        }
-    } while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
-        a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
-#else
-    malloc_mutex_lock(tsdn, &prof_accum->mtx);
-    a0 = prof_accum->accumbytes;
-    a1 = a0 + accumbytes;
-    overflow = (a1 >= prof_interval);
-    if (overflow) {
-        a1 %= prof_interval;
-    }
-    prof_accum->accumbytes = a1;
-    malloc_mutex_unlock(tsdn, &prof_accum->mtx);
-#endif
-
-    return overflow;
-}
-
-static inline void
-prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum,
-    size_t usize) {
-    cassert(config_prof);
-
-    /*
-     * Cancel out as much of the excessive prof_accumbytes increase as
-     * possible without underflowing. Interval-triggered dumps occur
-     * slightly more often than intended as a result of incomplete
-     * canceling.
-     */
-    uint64_t a0, a1;
-#ifdef JEMALLOC_ATOMIC_U64
-    a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
-    do {
-        a1 = (a0 >= SC_LARGE_MINCLASS - usize)
-            ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
-    } while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
-        a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
-#else
-    malloc_mutex_lock(tsdn, &prof_accum->mtx);
-    a0 = prof_accum->accumbytes;
-    a1 = (a0 >= SC_LARGE_MINCLASS - usize)
-        ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
-    prof_accum->accumbytes = a1;
-    malloc_mutex_unlock(tsdn, &prof_accum->mtx);
-#endif
-}
-
 JEMALLOC_ALWAYS_INLINE void
 prof_active_assert() {
     cassert(config_prof);
@@ -93,4 +25,26 @@ prof_active_get_unlocked(void) {
     return prof_active;
 }
 
+JEMALLOC_ALWAYS_INLINE bool
+prof_idump_accum(tsdn_t *tsdn, uint64_t accumbytes) {
+    cassert(config_prof);
+
+    if (prof_interval == 0 || !prof_active_get_unlocked()) {
+        return false;
+    }
+
+    return prof_idump_accum_impl(tsdn, accumbytes);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_idump_rollback(tsdn_t *tsdn, size_t usize) {
+    cassert(config_prof);
+
+    if (prof_interval == 0 || !prof_active_get_unlocked()) {
+        return;
+    }
+
+    prof_idump_rollback_impl(tsdn, usize);
+}
+
 #endif /* JEMALLOC_INTERNAL_PROF_INLINES_A_H */
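
The removed prof_accum_add lives on as prof_idump_accum_impl (declared in prof_externs.h above; its body is in src/ and not shown in these hunks). The heart of it is an atomic accumulate-and-wrap: on overflow the counter is reduced modulo prof_interval, so an allocation burst that crosses the threshold several times over still produces a single dump trigger rather than rate-limiting allocation. A minimal self-contained C11 sketch of that pattern, using hypothetical names (accum_add, g_accum, g_interval) rather than jemalloc's own:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t g_accum;                /* bytes since last dump */
static uint64_t g_interval = UINT64_C(1) << 20; /* dump threshold: 1 MiB */

/* Add bytes; return true at most once per call when the running total
 * crosses the interval. Wrapping the counter modulo the interval coalesces
 * triggers, mirroring the comment in the removed code above. */
static bool
accum_add(uint64_t bytes) {
    uint64_t cur = atomic_load_explicit(&g_accum, memory_order_relaxed);
    uint64_t next;
    bool crossed;
    do {
        next = cur + bytes;
        crossed = (next >= g_interval);
        if (crossed) {
            next %= g_interval;
        }
    } while (!atomic_compare_exchange_weak_explicit(&g_accum, &cur, next,
        memory_order_relaxed, memory_order_relaxed));
    return crossed;
}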

@@ -93,9 +93,6 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
     if (config_stats) {
         bin->tstats.nrequests++;
     }
-    if (config_prof) {
-        tcache->prof_accumbytes += usize;
-    }
     tcache_event(tsd, tcache);
     return ret;
 }
@@ -151,9 +148,6 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
         if (config_stats) {
             bin->tstats.nrequests++;
         }
-        if (config_prof) {
-            tcache->prof_accumbytes += usize;
-        }
     }
     tcache_event(tsd, tcache);

@@ -16,10 +16,9 @@ struct tcache_s {
      * together at the start of this struct.
      */
 
-    /* Cleared after arena_prof_accum(). */
-    uint64_t prof_accumbytes;
     /* Drives incremental GC. */
     ticker_t gc_ticker;
+
     /*
      * The pointer stacks associated with bins follow as a contiguous array.
      * During tcache initialization, the avail pointer in each element of

@@ -47,8 +47,8 @@ typedef struct tcaches_s tcaches_t;
 #define TCACHE_GC_INCR \
     ((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))
 
-/* Used in TSD static initializer only. Real init in tcache_data_init(). */
-#define TCACHE_ZERO_INITIALIZER {0}
+/* Used in TSD static initializer only. Real init in tsd_tcache_data_init(). */
+#define TCACHE_ZERO_INITIALIZER {{0}}
 
 /* Used in TSD static initializer only. Will be initialized to opt_tcache. */
 #define TCACHE_ENABLED_ZERO_INITIALIZER false
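
The extra brace in TCACHE_ZERO_INITIALIZER follows from the struct change above: with the leading uint64_t prof_accumbytes removed from tcache_s, the first member is now the ticker_t struct gc_ticker, and the fully-braced zero initializer for a struct-typed first member is {{0}} (a bare {0} still compiles, but typically draws -Wmissing-braces style warnings). A reduced illustration with stand-in types, not jemalloc's definitions:

#include <stdint.h>

typedef struct { int32_t tick; int32_t nticks; } ticker_t; /* stand-in */

struct cache_before { uint64_t prof_accumbytes; ticker_t gc_ticker; };
struct cache_after  { ticker_t gc_ticker; };

static struct cache_before b = {0};   /* scalar first member: plain braces */
static struct cache_after  a = {{0}}; /* struct first member: nest braces */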

@@ -44,7 +44,8 @@ void thread_event_boot();
     C(thread_allocated_next_event_fast) \
     C(thread_allocated_last_event) \
     C(thread_allocated_next_event) \
-    ITERATE_OVER_ALL_EVENTS
+    ITERATE_OVER_ALL_EVENTS \
+    C(prof_sample_last_event)
 
 /* Getters directly wrap TSD getters. */
 #define C(counter) \
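
The C(...) list above is an X-macro: thread_event.h keeps one canonical list of per-thread counters and re-expands it under different definitions of C, so appending C(prof_sample_last_event) grows every generated accessor at once (per the comment, each getter directly wraps the TSD getter). A minimal sketch of the mechanism, with a hypothetical two-entry list and a stand-in tsd_t:

#include <stdint.h>

typedef struct {
    uint64_t thread_allocated;
    uint64_t prof_sample_last_event;
} tsd_t; /* stand-in for the real TSD */

#define FOR_EACH_COUNTER \
    C(thread_allocated) \
    C(prof_sample_last_event)

/* Expand the list once to stamp out one getter per counter. */
#define C(counter) \
    static inline uint64_t \
    counter##_get(tsd_t *tsd) { \
        return tsd->counter; \
    }
FOR_EACH_COUNTER
#undef C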

@@ -30,6 +30,7 @@
  *   l: thread_allocated_last_event
  *   j: thread_allocated_next_event
  *   w: prof_sample_event_wait (config_prof)
+ *   x: prof_sample_last_event (config_prof)
  *   p: prof_tdata (config_prof)
  *   v: offset_state
  *   i: iarena
@@ -45,11 +46,11 @@
  * |---------------------------- 2nd cacheline ----------------------------|
  * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
  * |---------------------------- 3nd cacheline ----------------------------|
- * | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp |
+ * | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww xxxxxxxx |
  * +---------------------------- 4th cacheline ----------------------------+
- * | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ |
+ * | pppppppp vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ |
  * +---------------------------- 5th cacheline ----------------------------+
- * | ..b][t.. ........ ........ ........ ........ ........ ........ ........ |
+ * | ........ ..b][t.. ........ ........ ........ ........ ........ ........ |
 * +-------------------------------------------------------------------------+
 * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
 *
@@ -83,6 +84,7 @@ typedef void (*test_callback_t)(int *);
     O(thread_allocated_last_event, uint64_t, uint64_t) \
     O(thread_allocated_next_event, uint64_t, uint64_t) \
     O(prof_sample_event_wait, uint64_t, uint64_t) \
+    O(prof_sample_last_event, uint64_t, uint64_t) \
     O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
     O(offset_state, uint64_t, uint64_t) \
     O(iarena, arena_t *, arena_t *) \
@@ -109,9 +111,10 @@ typedef void (*test_callback_t)(int *);
     /* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \
     /* thread_deallocated */ 0, \
     /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
     /* thread_allocated_last_event */ 0, \
     /* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \
     /* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \
+    /* prof_sample_last_event */ 0, \
     /* prof_tdata */ NULL, \
     /* offset_state */ 0, \
     /* iarena */ NULL, \
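
Putting the pieces together, the commit title implies that the thread event handler (in src/, not shown among these header hunks) now derives the old prof_accumbytes value on demand, as the delta between the running thread_allocated counter and the new prof_sample_last_event checkpoint. A hedged sketch of that flow; the accessors follow from the O(...) TSD field list above, but the handler body itself is an assumption, since the source hunks are absent:

/* Hypothetical shape of the prof sample event handler after this change. */
static void
prof_sample_event_handler_sketch(tsd_t *tsd) {
    uint64_t now = tsd_thread_allocated_get(tsd);
    uint64_t last = tsd_prof_sample_last_event_get(tsd);

    /* Record the checkpoint, then feed the per-thread delta into the
     * interval accumulator; true means prof_interval was crossed. */
    tsd_prof_sample_last_event_set(tsd, now);
    if (prof_idump_accum(tsd_tsdn(tsd), now - last)) {
        prof_idump(tsd_tsdn(tsd));
    }
}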