Pull prof_accumbytes into thread event handler

Author: Yinan Zhang
Date:   2019-10-14 09:35:51 -07:00
Parent: 152c0ef954
Commit: 198f02e797

16 changed files with 148 additions and 177 deletions
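
In outline, the hunks below drop two pieces of distributed profiling bookkeeping, the per-tcache prof_accumbytes counter and the per-arena prof_accum path (arena_prof_accum, prof_accum_add, prof_accum_cancel), in favor of a single per-thread checkpoint, prof_sample_last_event, stored in TSD and consumed by the thread event machinery; interval-triggered idumps now go through the new prof_idump_accum / prof_idump_rollback wrappers. A hedged sketch of the resulting handler flow follows the tsd.h hunks at the end.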

@@ -49,7 +49,7 @@ void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
 void arena_reset(tsd_t *tsd, arena_t *arena);
 void arena_destroy(tsd_t *tsd, arena_t *arena);
 void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
-    cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
+    cache_bin_t *tbin, szind_t binind);
 void arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info,
     bool zero);

@@ -21,17 +21,6 @@ arena_internal_get(arena_t *arena) {
     return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
 }
 
-static inline bool
-arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
-    cassert(config_prof);
-
-    if (likely(prof_interval == 0 || !prof_active_get_unlocked())) {
-        return false;
-    }
-
-    return prof_accum_add(tsdn, &arena->prof_accum, accumbytes);
-}
-
 static inline void
 percpu_arena_update(tsd_t *tsd, unsigned cpu) {
     assert(have_percpu_arena);

@@ -24,7 +24,7 @@ arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
     if (tcache_available(tsd)) {
         tcache_t *tcache = tcache_get(tsd);
         if (tcache->arena != NULL) {
-            /* See comments in tcache_data_init().*/
+            /* See comments in tsd_tcache_data_init().*/
             assert(tcache->arena ==
                 arena_get(tsd_tsdn(tsd), 0, false));
             if (tcache->arena != ret) {

@@ -33,13 +33,7 @@ extern bool prof_active;
 /* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
 extern bool prof_gdump_val;
 
-/*
- * Profile dump interval, measured in bytes allocated. Each arena triggers a
- * profile dump when it reaches this threshold. The effect is that the
- * interval between profile dumps averages prof_interval, though the actual
- * interval between dumps will tend to be sporadic, and the interval will be a
- * maximum of approximately (prof_interval * narenas).
- */
+/* Profile dump interval, measured in bytes allocated. */
 extern uint64_t prof_interval;
 
 /*
@@ -50,6 +44,10 @@ extern size_t lg_prof_sample;
 extern bool prof_booted;
 
+/* Functions only accessed in prof_inlines_a.h */
+bool prof_idump_accum_impl(tsdn_t *tsdn, uint64_t accumbytes);
+void prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize);
+
 void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
 void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
     prof_tctx_t *tctx);
@@ -73,7 +71,7 @@ void prof_cnt_all(uint64_t *curobjs, uint64_t *curbytes, uint64_t *accumobjs,
 #endif
 int prof_getpid(void);
 void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind);
-bool prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum);
+bool prof_accum_init(tsdn_t *tsdn);
 void prof_idump(tsdn_t *tsdn);
 bool prof_mdump(tsd_t *tsd, const char *filename);
 void prof_gdump(tsdn_t *tsdn);

@@ -3,74 +3,6 @@
 
 #include "jemalloc/internal/mutex.h"
 
-static inline bool
-prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
-    uint64_t accumbytes) {
-    cassert(config_prof);
-
-    bool overflow;
-    uint64_t a0, a1;
-
-    /*
-     * If the application allocates fast enough (and/or if idump is slow
-     * enough), extreme overflow here (a1 >= prof_interval * 2) can cause
-     * idump trigger coalescing. This is an intentional mechanism that
-     * avoids rate-limiting allocation.
-     */
-#ifdef JEMALLOC_ATOMIC_U64
-    a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
-    do {
-        a1 = a0 + accumbytes;
-        assert(a1 >= a0);
-        overflow = (a1 >= prof_interval);
-        if (overflow) {
-            a1 %= prof_interval;
-        }
-    } while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
-        a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
-#else
-    malloc_mutex_lock(tsdn, &prof_accum->mtx);
-    a0 = prof_accum->accumbytes;
-    a1 = a0 + accumbytes;
-    overflow = (a1 >= prof_interval);
-    if (overflow) {
-        a1 %= prof_interval;
-    }
-    prof_accum->accumbytes = a1;
-    malloc_mutex_unlock(tsdn, &prof_accum->mtx);
-#endif
-
-    return overflow;
-}
-
-static inline void
-prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum,
-    size_t usize) {
-    cassert(config_prof);
-
-    /*
-     * Cancel out as much of the excessive prof_accumbytes increase as
-     * possible without underflowing. Interval-triggered dumps occur
-     * slightly more often than intended as a result of incomplete
-     * canceling.
-     */
-    uint64_t a0, a1;
-#ifdef JEMALLOC_ATOMIC_U64
-    a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
-    do {
-        a1 = (a0 >= SC_LARGE_MINCLASS - usize)
-            ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
-    } while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
-        a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
-#else
-    malloc_mutex_lock(tsdn, &prof_accum->mtx);
-    a0 = prof_accum->accumbytes;
-    a1 = (a0 >= SC_LARGE_MINCLASS - usize)
-        ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
-    prof_accum->accumbytes = a1;
-    malloc_mutex_unlock(tsdn, &prof_accum->mtx);
-#endif
-}
-
 JEMALLOC_ALWAYS_INLINE void
 prof_active_assert() {
     cassert(config_prof);
@@ -93,4 +25,26 @@ prof_active_get_unlocked(void) {
     return prof_active;
 }
 
+JEMALLOC_ALWAYS_INLINE bool
+prof_idump_accum(tsdn_t *tsdn, uint64_t accumbytes) {
+    cassert(config_prof);
+
+    if (prof_interval == 0 || !prof_active_get_unlocked()) {
+        return false;
+    }
+
+    return prof_idump_accum_impl(tsdn, accumbytes);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_idump_rollback(tsdn_t *tsdn, size_t usize) {
+    cassert(config_prof);
+
+    if (prof_interval == 0 || !prof_active_get_unlocked()) {
+        return;
+    }
+
+    prof_idump_rollback_impl(tsdn, usize);
+}
+
 #endif /* JEMALLOC_INTERNAL_PROF_INLINES_A_H */
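
The removed prof_accum_add lives on as prof_idump_accum_impl (declared in prof_externs.h above; its body is in src/ and not shown in these hunks). The heart of it is an atomic accumulate-and-wrap: on overflow the counter is reduced modulo prof_interval, so an allocation burst that crosses the threshold several times over still produces a single dump trigger rather than rate-limiting allocation. A minimal self-contained C11 sketch of that pattern, using hypothetical names (accum_add, g_accum, g_interval) rather than jemalloc's own:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t g_accum;                /* bytes since last dump */
static uint64_t g_interval = UINT64_C(1) << 20; /* dump threshold: 1 MiB */

/* Add bytes; return true at most once per call when the running total
 * crosses the interval. Wrapping the counter modulo the interval coalesces
 * triggers, mirroring the comment in the removed code above. */
static bool
accum_add(uint64_t bytes) {
    uint64_t cur = atomic_load_explicit(&g_accum, memory_order_relaxed);
    uint64_t next;
    bool crossed;
    do {
        next = cur + bytes;
        crossed = (next >= g_interval);
        if (crossed) {
            next %= g_interval;
        }
    } while (!atomic_compare_exchange_weak_explicit(&g_accum, &cur, next,
        memory_order_relaxed, memory_order_relaxed));
    return crossed;
}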

@@ -93,9 +93,6 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
     if (config_stats) {
         bin->tstats.nrequests++;
     }
-    if (config_prof) {
-        tcache->prof_accumbytes += usize;
-    }
     tcache_event(tsd, tcache);
     return ret;
 }
@@ -151,9 +148,6 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
         if (config_stats) {
             bin->tstats.nrequests++;
         }
-        if (config_prof) {
-            tcache->prof_accumbytes += usize;
-        }
     }
     tcache_event(tsd, tcache);

@@ -16,10 +16,9 @@ struct tcache_s {
      * together at the start of this struct.
      */
 
-    /* Cleared after arena_prof_accum(). */
-    uint64_t prof_accumbytes;
     /* Drives incremental GC. */
     ticker_t gc_ticker;
+
     /*
      * The pointer stacks associated with bins follow as a contiguous array.
      * During tcache initialization, the avail pointer in each element of

@@ -47,8 +47,8 @@ typedef struct tcaches_s tcaches_t;
 #define TCACHE_GC_INCR \
     ((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))
 
-/* Used in TSD static initializer only. Real init in tcache_data_init(). */
-#define TCACHE_ZERO_INITIALIZER {0}
+/* Used in TSD static initializer only. Real init in tsd_tcache_data_init(). */
+#define TCACHE_ZERO_INITIALIZER {{0}}
 
 /* Used in TSD static initializer only. Will be initialized to opt_tcache. */
 #define TCACHE_ENABLED_ZERO_INITIALIZER false
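
The extra brace in TCACHE_ZERO_INITIALIZER follows from the struct change above: with the leading uint64_t prof_accumbytes removed from tcache_s, the first member is now the ticker_t struct gc_ticker, and the fully-braced zero initializer for a struct-typed first member is {{0}} (a bare {0} still compiles, but typically draws -Wmissing-braces style warnings). A reduced illustration with stand-in types, not jemalloc's definitions:

#include <stdint.h>

typedef struct { int32_t tick; int32_t nticks; } ticker_t; /* stand-in */

struct cache_before { uint64_t prof_accumbytes; ticker_t gc_ticker; };
struct cache_after  { ticker_t gc_ticker; };

static struct cache_before b = {0};   /* scalar first member: plain braces */
static struct cache_after  a = {{0}}; /* struct first member: nest braces */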

@@ -44,7 +44,8 @@ void thread_event_boot();
     C(thread_allocated_next_event_fast) \
     C(thread_allocated_last_event) \
     C(thread_allocated_next_event) \
-    ITERATE_OVER_ALL_EVENTS
+    ITERATE_OVER_ALL_EVENTS \
+    C(prof_sample_last_event)
 
 /* Getters directly wrap TSD getters. */
 #define C(counter) \
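
The C(...) list above is an X-macro: thread_event.h keeps one canonical list of per-thread counters and re-expands it under different definitions of C, so appending C(prof_sample_last_event) grows every generated accessor at once (per the comment, each getter directly wraps the TSD getter). A minimal sketch of the mechanism, with a hypothetical two-entry list and a stand-in tsd_t:

#include <stdint.h>

typedef struct {
    uint64_t thread_allocated;
    uint64_t prof_sample_last_event;
} tsd_t; /* stand-in for the real TSD */

#define FOR_EACH_COUNTER \
    C(thread_allocated) \
    C(prof_sample_last_event)

/* Expand the list once to stamp out one getter per counter. */
#define C(counter) \
    static inline uint64_t \
    counter##_get(tsd_t *tsd) { \
        return tsd->counter; \
    }
FOR_EACH_COUNTER
#undef C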

@@ -30,6 +30,7 @@
  *   l: thread_allocated_last_event
  *   j: thread_allocated_next_event
  *   w: prof_sample_event_wait (config_prof)
+ *   x: prof_sample_last_event (config_prof)
  *   p: prof_tdata (config_prof)
  *   v: offset_state
  *   i: iarena
@@ -45,11 +46,11 @@
  * |---------------------------- 2nd cacheline ----------------------------|
  * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
  * |---------------------------- 3nd cacheline ----------------------------|
- * | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp |
+ * | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww xxxxxxxx |
  * +---------------------------- 4th cacheline ----------------------------+
- * | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ |
+ * | pppppppp vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ |
  * +---------------------------- 5th cacheline ----------------------------+
- * | ..b][t.. ........ ........ ........ ........ ........ ........ ........ |
+ * | ........ ..b][t.. ........ ........ ........ ........ ........ ........ |
 * +-------------------------------------------------------------------------+
 * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
 *
@@ -83,6 +84,7 @@ typedef void (*test_callback_t)(int *);
     O(thread_allocated_last_event, uint64_t, uint64_t) \
     O(thread_allocated_next_event, uint64_t, uint64_t) \
     O(prof_sample_event_wait, uint64_t, uint64_t) \
+    O(prof_sample_last_event, uint64_t, uint64_t) \
     O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
     O(offset_state, uint64_t, uint64_t) \
     O(iarena, arena_t *, arena_t *) \
@@ -109,9 +111,10 @@ typedef void (*test_callback_t)(int *);
     /* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \
     /* thread_deallocated */ 0, \
     /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
     /* thread_allocated_last_event */ 0, \
     /* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \
     /* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \
+    /* prof_sample_last_event */ 0, \
     /* prof_tdata */ NULL, \
     /* offset_state */ 0, \
     /* iarena */ NULL, \
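
Putting the pieces together, the commit title implies that the thread event handler (in src/, not shown among these header hunks) now derives the old prof_accumbytes value on demand, as the delta between the running thread_allocated counter and the new prof_sample_last_event checkpoint. A hedged sketch of that flow; the accessors follow from the O(...) TSD field list above, but the handler body itself is an assumption, since the source hunks are absent:

/* Hypothetical shape of the prof sample event handler after this change. */
static void
prof_sample_event_handler_sketch(tsd_t *tsd) {
    uint64_t now = tsd_thread_allocated_get(tsd);
    uint64_t last = tsd_prof_sample_last_event_get(tsd);

    /* Record the checkpoint, then feed the per-thread delta into the
     * interval accumulator; true means prof_interval was crossed. */
    tsd_prof_sample_last_event_set(tsd, now);
    if (prof_idump_accum(tsd_tsdn(tsd), now - last)) {
        prof_idump(tsd_tsdn(tsd));
    }
}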