Pull prof_accumbytes into thread event handler

Yinan Zhang 2019-10-14 09:35:51 -07:00
parent 152c0ef954
commit 198f02e797
16 changed files with 148 additions and 177 deletions
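
The net effect of the change: instead of each tcache carrying its own prof_accumbytes counter that must be flushed into an arena on fill, flush and teardown, the bytes that drive interval dumps (idump) are now taken from the per-thread allocation counters that already feed the thread event system and folded into a single global accumulator in prof.c. The toy program below models that flow in plain C; the names (idump_accum, prof_interval, prof_sample_last_event) merely mirror identifiers in the diff, and none of this is jemalloc API.

/*
 * Toy model of the new accumulation flow; illustrative only, no jemalloc
 * internals.  Each thread keeps a running "bytes allocated" counter; at a
 * prof-sample event the delta since the previous sample event is added to
 * one global accumulator, and an interval dump fires when prof_interval is
 * crossed.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t prof_interval = 1000;  /* dump roughly every 1000 bytes */
static uint64_t idump_accumbytes = 0;  /* stand-in for prof_idump_accumulated */

/* Returns nonzero when the accumulator crosses prof_interval. */
static int
idump_accum(uint64_t accumbytes) {
	uint64_t a1 = idump_accumbytes + accumbytes;
	int overflow = (a1 >= prof_interval);
	if (overflow) {
		a1 %= prof_interval;  /* coalesce multiple crossings into one dump */
	}
	idump_accumbytes = a1;
	return overflow;
}

int
main(void) {
	uint64_t thread_allocated = 0;   /* per-thread allocation counter */
	uint64_t last_sample_event = 0;  /* stand-in for prof_sample_last_event */
	const uint64_t allocs[] = {300, 500, 400, 2600};

	for (unsigned i = 0; i < sizeof(allocs) / sizeof(allocs[0]); i++) {
		thread_allocated += allocs[i];
		/* At a prof-sample event (here, after every allocation for
		 * simplicity), fold the delta into the global accumulator. */
		uint64_t delta = thread_allocated - last_sample_event;
		last_sample_event = thread_allocated;
		if (idump_accum(delta)) {
			printf("idump at %llu bytes allocated\n",
			    (unsigned long long)thread_allocated);
		}
	}
	return 0;
}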

@@ -49,7 +49,7 @@ void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
cache_bin_t *tbin, szind_t binind);
void arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info,
bool zero);

@@ -21,17 +21,6 @@ arena_internal_get(arena_t *arena) {
return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
}
static inline bool
arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
cassert(config_prof);
if (likely(prof_interval == 0 || !prof_active_get_unlocked())) {
return false;
}
return prof_accum_add(tsdn, &arena->prof_accum, accumbytes);
}
static inline void
percpu_arena_update(tsd_t *tsd, unsigned cpu) {
assert(have_percpu_arena);

@@ -24,7 +24,7 @@ arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
if (tcache_available(tsd)) {
tcache_t *tcache = tcache_get(tsd);
if (tcache->arena != NULL) {
/* See comments in tcache_data_init().*/
/* See comments in tsd_tcache_data_init().*/
assert(tcache->arena ==
arena_get(tsd_tsdn(tsd), 0, false));
if (tcache->arena != ret) {

@@ -33,13 +33,7 @@ extern bool prof_active;
/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
extern bool prof_gdump_val;
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
* profile dump when it reaches this threshold. The effect is that the
* interval between profile dumps averages prof_interval, though the actual
* interval between dumps will tend to be sporadic, and the interval will be a
* maximum of approximately (prof_interval * narenas).
*/
/* Profile dump interval, measured in bytes allocated. */
extern uint64_t prof_interval;
/*
@@ -50,6 +44,10 @@ extern size_t lg_prof_sample;
extern bool prof_booted;
/* Functions only accessed in prof_inlines_a.h */
bool prof_idump_accum_impl(tsdn_t *tsdn, uint64_t accumbytes);
void prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize);
void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
prof_tctx_t *tctx);
@@ -73,7 +71,7 @@ void prof_cnt_all(uint64_t *curobjs, uint64_t *curbytes, uint64_t *accumobjs,
#endif
int prof_getpid(void);
void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind);
bool prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum);
bool prof_accum_init(tsdn_t *tsdn);
void prof_idump(tsdn_t *tsdn);
bool prof_mdump(tsd_t *tsd, const char *filename);
void prof_gdump(tsdn_t *tsdn);

@@ -3,74 +3,6 @@
#include "jemalloc/internal/mutex.h"
static inline bool
prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
uint64_t accumbytes) {
cassert(config_prof);
bool overflow;
uint64_t a0, a1;
/*
* If the application allocates fast enough (and/or if idump is slow
* enough), extreme overflow here (a1 >= prof_interval * 2) can cause
* idump trigger coalescing. This is an intentional mechanism that
* avoids rate-limiting allocation.
*/
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
do {
a1 = a0 + accumbytes;
assert(a1 >= a0);
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
} while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_accum->mtx);
a0 = prof_accum->accumbytes;
a1 = a0 + accumbytes;
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
prof_accum->accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_accum->mtx);
#endif
return overflow;
}
static inline void
prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum,
size_t usize) {
cassert(config_prof);
/*
* Cancel out as much of the excessive prof_accumbytes increase as
* possible without underflowing. Interval-triggered dumps occur
* slightly more often than intended as a result of incomplete
* canceling.
*/
uint64_t a0, a1;
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
do {
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
} while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_accum->mtx);
a0 = prof_accum->accumbytes;
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
prof_accum->accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_accum->mtx);
#endif
}
JEMALLOC_ALWAYS_INLINE void
prof_active_assert() {
cassert(config_prof);
@@ -93,4 +25,26 @@ prof_active_get_unlocked(void) {
return prof_active;
}
JEMALLOC_ALWAYS_INLINE bool
prof_idump_accum(tsdn_t *tsdn, uint64_t accumbytes) {
cassert(config_prof);
if (prof_interval == 0 || !prof_active_get_unlocked()) {
return false;
}
return prof_idump_accum_impl(tsdn, accumbytes);
}
JEMALLOC_ALWAYS_INLINE void
prof_idump_rollback(tsdn_t *tsdn, size_t usize) {
cassert(config_prof);
if (prof_interval == 0 || !prof_active_get_unlocked()) {
return;
}
prof_idump_rollback_impl(tsdn, usize);
}
#endif /* JEMALLOC_INTERNAL_PROF_INLINES_A_H */

@@ -93,9 +93,6 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
if (config_stats) {
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
}
tcache_event(tsd, tcache);
return ret;
}
@@ -151,9 +148,6 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
if (config_stats) {
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
}
}
tcache_event(tsd, tcache);

@@ -16,10 +16,9 @@ struct tcache_s {
* together at the start of this struct.
*/
/* Cleared after arena_prof_accum(). */
uint64_t prof_accumbytes;
/* Drives incremental GC. */
ticker_t gc_ticker;
/*
* The pointer stacks associated with bins follow as a contiguous array.
* During tcache initialization, the avail pointer in each element of

@@ -47,8 +47,8 @@ typedef struct tcaches_s tcaches_t;
#define TCACHE_GC_INCR \
((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))
/* Used in TSD static initializer only. Real init in tcache_data_init(). */
#define TCACHE_ZERO_INITIALIZER {0}
/* Used in TSD static initializer only. Real init in tsd_tcache_data_init(). */
#define TCACHE_ZERO_INITIALIZER {{0}}
/* Used in TSD static initializer only. Will be initialized to opt_tcache. */
#define TCACHE_ENABLED_ZERO_INITIALIZER false

@@ -44,7 +44,8 @@ void thread_event_boot();
C(thread_allocated_next_event_fast) \
C(thread_allocated_last_event) \
C(thread_allocated_next_event) \
ITERATE_OVER_ALL_EVENTS
ITERATE_OVER_ALL_EVENTS \
C(prof_sample_last_event)
/* Getters directly wrap TSD getters. */
#define C(counter) \
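
The new per-thread counter only has to be appended to this list because the header generates its accessors through an X-macro: each name handed to C(...) expands into a getter that wraps the corresponding TSD field. A minimal standalone illustration of the pattern follows (invented struct and macro names, not jemalloc code).

/*
 * Minimal X-macro illustration; invented names.  Listing a counter once in
 * FOR_EACH_COUNTER is enough to get an accessor generated for it, which is
 * why the commit only appends C(prof_sample_last_event) above.
 */
#include <stdint.h>
#include <stdio.h>

struct tsd {
	uint64_t thread_allocated;
	uint64_t prof_sample_last_event;
};

#define FOR_EACH_COUNTER		\
	C(thread_allocated)		\
	C(prof_sample_last_event)

/* Expand one getter per listed counter. */
#define C(counter)				\
static uint64_t				\
counter##_get(const struct tsd *tsd) {	\
	return tsd->counter;			\
}
FOR_EACH_COUNTER
#undef C

int
main(void) {
	struct tsd tsd = {.thread_allocated = 4096,
	    .prof_sample_last_event = 1024};
	printf("%llu %llu\n",
	    (unsigned long long)thread_allocated_get(&tsd),
	    (unsigned long long)prof_sample_last_event_get(&tsd));
	return 0;
}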

@@ -30,6 +30,7 @@
* l: thread_allocated_last_event
* j: thread_allocated_next_event
* w: prof_sample_event_wait (config_prof)
* x: prof_sample_last_event (config_prof)
* p: prof_tdata (config_prof)
* v: offset_state
* i: iarena
@@ -45,11 +46,11 @@
* |---------------------------- 2nd cacheline ----------------------------|
* | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
* |---------------------------- 3rd cacheline ----------------------------|
* | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp |
* | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww xxxxxxxx |
* +---------------------------- 4th cacheline ----------------------------+
* | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ |
* | pppppppp vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ |
* +---------------------------- 5th cacheline ----------------------------+
* | ..b][t.. ........ ........ ........ ........ ........ ........ ........ |
* | ........ ..b][t.. ........ ........ ........ ........ ........ ........ |
* +-------------------------------------------------------------------------+
* Note: the entire tcache is embedded into TSD and spans multiple cachelines.
*
@@ -83,6 +84,7 @@ typedef void (*test_callback_t)(int *);
O(thread_allocated_last_event, uint64_t, uint64_t) \
O(thread_allocated_next_event, uint64_t, uint64_t) \
O(prof_sample_event_wait, uint64_t, uint64_t) \
O(prof_sample_last_event, uint64_t, uint64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
O(offset_state, uint64_t, uint64_t) \
O(iarena, arena_t *, arena_t *) \
@@ -109,9 +111,10 @@ typedef void (*test_callback_t)(int *);
/* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \
/* thread_deallocated */ 0, \
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
/* thread_allocated_last_event */ 0, \
/* thread_allocated_last_event */ 0, \
/* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \
/* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \
/* prof_sample_last_event */ 0, \
/* prof_tdata */ NULL, \
/* offset_state */ 0, \
/* iarena */ NULL, \

@@ -1378,13 +1378,10 @@ arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind,
void
arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
cache_bin_t *tbin, szind_t binind) {
unsigned i, nfill, cnt;
assert(cache_bin_ncached_get(tbin, binind) == 0);
if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
prof_idump(tsdn);
}
tcache->bin_refilled[binind] = true;
unsigned binshard;
@@ -1484,10 +1481,8 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
bin->stats.nrequests++;
bin->stats.curregs++;
}
malloc_mutex_unlock(tsdn, &bin->lock);
if (config_prof && arena_prof_accum(tsdn, arena, usize)) {
prof_idump(tsdn);
}
if (!zero) {
if (config_fill) {
@@ -1565,14 +1560,13 @@ arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize) {
extent_t *extent = rtree_extent_read(tsdn, &extents_rtree, rtree_ctx,
(uintptr_t)ptr, true);
arena_t *arena = arena_get_from_extent(extent);
szind_t szind = sz_size2index(usize);
extent_szind_set(extent, szind);
rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr,
szind, false);
prof_accum_cancel(tsdn, &arena->prof_accum, usize);
prof_idump_rollback(tsdn, usize);
assert(isalloc(tsdn, ptr) == usize);
}
@@ -1982,7 +1976,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
}
if (config_prof) {
if (prof_accum_init(tsdn, &arena->prof_accum)) {
if (prof_accum_init(tsdn)) {
goto label_error;
}
}
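
arena_prof_promote() now calls prof_idump_rollback() where it used to call prof_accum_cancel(). A sampled small request is backed by an SC_LARGE_MINCLASS-sized allocation, so the idump accumulator gets credited with more bytes than the nominal usize; the rollback subtracts that excess, clamping at zero rather than underflowing. A standalone sketch of the arithmetic with hypothetical sizes (plain C, not jemalloc code):

/*
 * Rollback arithmetic with hypothetical numbers.  LARGE_MINCLASS stands in
 * for SC_LARGE_MINCLASS; accumbytes stands in for the global idump
 * accumulator.
 */
#include <stdint.h>
#include <stdio.h>

#define LARGE_MINCLASS 16384u

static uint64_t accumbytes;

/* Subtract the over-accounted bytes, clamping at zero to avoid underflow. */
static void
rollback(uint64_t usize) {
	uint64_t excess = LARGE_MINCLASS - usize;
	accumbytes = (accumbytes >= excess) ? accumbytes - excess : 0;
}

int
main(void) {
	/* A sampled 8-byte request was carved from a LARGE_MINCLASS extent,
	 * so the accumulator saw 16384 bytes instead of 8. */
	accumbytes = 20000;
	rollback(8);
	printf("%llu\n", (unsigned long long)accumbytes);  /* 3624 */

	/* If the accumulator already wrapped (a dump already fired), there may
	 * be too little left to cancel fully; the clamp to zero is why dumps
	 * can occur slightly more often than intended. */
	accumbytes = 100;
	rollback(8);
	printf("%llu\n", (unsigned long long)accumbytes);  /* 0 */
	return 0;
}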

@@ -2386,9 +2386,6 @@ je_malloc(size_t size) {
if (config_stats) {
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
}
LOG("core.malloc.exit", "result: %p", ret);

@@ -56,9 +56,6 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
extent_list_append(&arena->large, extent);
malloc_mutex_unlock(tsdn, &arena->large_mtx);
}
if (config_prof && arena_prof_accum(tsdn, arena, usize)) {
prof_idump(tsdn);
}
if (zero) {
assert(is_zeroed);

@@ -45,6 +45,9 @@ bool opt_prof_leak = false;
bool opt_prof_accum = false;
char opt_prof_prefix[PROF_DUMP_FILENAME_LEN];
/* Accessed via prof_idump_[accum/rollback](). */
static prof_accum_t prof_idump_accumulated;
/*
* Initialized as opt_prof_active, and accessed via
* prof_active_[gs]et{_unlocked,}().
@@ -586,21 +589,91 @@ prof_fdump(void) {
}
bool
prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum) {
prof_accum_init(tsdn_t *tsdn) {
cassert(config_prof);
#ifndef JEMALLOC_ATOMIC_U64
if (malloc_mutex_init(&prof_accum->mtx, "prof_accum",
if (malloc_mutex_init(&prof_idump_accumulated.mtx, "prof_accum",
WITNESS_RANK_PROF_ACCUM, malloc_mutex_rank_exclusive)) {
return true;
}
prof_accum->accumbytes = 0;
prof_idump_accumulated.accumbytes = 0;
#else
atomic_store_u64(&prof_accum->accumbytes, 0, ATOMIC_RELAXED);
atomic_store_u64(&prof_idump_accumulated.accumbytes, 0,
ATOMIC_RELAXED);
#endif
return false;
}
bool
prof_idump_accum_impl(tsdn_t *tsdn, uint64_t accumbytes) {
cassert(config_prof);
bool overflow;
uint64_t a0, a1;
/*
* If the application allocates fast enough (and/or if idump is slow
* enough), extreme overflow here (a1 >= prof_interval * 2) can cause
* idump trigger coalescing. This is an intentional mechanism that
* avoids rate-limiting allocation.
*/
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_idump_accumulated.accumbytes,
ATOMIC_RELAXED);
do {
a1 = a0 + accumbytes;
assert(a1 >= a0);
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
} while (!atomic_compare_exchange_weak_u64(
&prof_idump_accumulated.accumbytes, &a0, a1, ATOMIC_RELAXED,
ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_idump_accumulated.mtx);
a0 = prof_idump_accumulated.accumbytes;
a1 = a0 + accumbytes;
overflow = (a1 >= prof_interval);
if (overflow) {
a1 %= prof_interval;
}
prof_idump_accumulated.accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_idump_accumulated.mtx);
#endif
return overflow;
}
void
prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize) {
cassert(config_prof);
/*
* Cancel out as much of the excessive accumbytes increase as possible
* without underflowing. Interval-triggered dumps occur slightly more
* often than intended as a result of incomplete canceling.
*/
uint64_t a0, a1;
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_idump_accumulated.accumbytes,
ATOMIC_RELAXED);
do {
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
} while (!atomic_compare_exchange_weak_u64(
&prof_idump_accumulated.accumbytes, &a0, a1, ATOMIC_RELAXED,
ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_idump_accumulated.mtx);
a0 = prof_idump_accumulated.accumbytes;
a1 = (a0 >= SC_LARGE_MINCLASS - usize)
? a0 - (SC_LARGE_MINCLASS - usize) : 0;
prof_idump_accumulated.accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_idump_accumulated.mtx);
#endif
}
bool
prof_dump_prefix_set(tsdn_t *tsdn, const char *prefix) {
cassert(config_prof);
@@ -641,7 +714,7 @@ prof_idump(tsdn_t *tsdn) {
return;
}
tdata = prof_tdata_get(tsd, false);
tdata = prof_tdata_get(tsd, true);
if (tdata == NULL) {
return;
}
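
On builds with 64-bit atomics, prof_idump_accum_impl() above performs the add-and-wrap in a single compare-and-swap loop, so a burst that jumps past several multiples of prof_interval still produces exactly one trigger (the "idump trigger coalescing" its comment describes). The sketch below reproduces that shape with standard C11 atomics; names, types and the interval value are illustrative, not the ones in the diff.

/*
 * Standalone sketch of the lock-free accumulate-and-wrap update, written
 * against C11 <stdatomic.h> rather than jemalloc's atomic wrappers.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t accumbytes;
static const uint64_t interval = 1u << 20;  /* 1 MiB, illustrative */

static bool
accum_add(uint64_t nbytes) {
	uint64_t a0 = atomic_load_explicit(&accumbytes, memory_order_relaxed);
	uint64_t a1;
	bool overflow;
	do {
		a1 = a0 + nbytes;
		overflow = (a1 >= interval);
		if (overflow) {
			a1 %= interval;  /* >= 2 * interval still yields one trigger */
		}
	} while (!atomic_compare_exchange_weak_explicit(&accumbytes, &a0, a1,
	    memory_order_relaxed, memory_order_relaxed));
	return overflow;
}

int
main(void) {
	/* One update crossing the interval three times over reports a single
	 * overflow, i.e. a single idump. */
	bool trigger = accum_add(3 * (1u << 20) + 123);
	printf("trigger=%d leftover=%llu\n", (int)trigger,
	    (unsigned long long)atomic_load(&accumbytes));
	return 0;
}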

@@ -106,11 +106,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
void *ret;
assert(tcache->arena != NULL);
arena_tcache_fill_small(tsdn, arena, tcache, tbin, binind,
config_prof ? tcache->prof_accumbytes : 0);
if (config_prof) {
tcache->prof_accumbytes = 0;
}
arena_tcache_fill_small(tsdn, arena, tcache, tbin, binind);
ret = cache_bin_alloc_easy(tbin, tcache_success, binind);
return ret;
@@ -181,14 +177,6 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
assert(binshard < bin_infos[binind].n_shards);
bin_t *bin = &bin_arena->bins[binind].bin_shards[binshard];
if (config_prof && bin_arena == arena) {
if (arena_prof_accum(tsd_tsdn(tsd), arena,
tcache->prof_accumbytes)) {
prof_idump(tsd_tsdn(tsd));
}
tcache->prof_accumbytes = 0;
}
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
if (config_stats && bin_arena == arena && !merged_stats) {
merged_stats = true;
@@ -274,11 +262,6 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
unsigned locked_arena_ind = extent_arena_ind_get(extent);
arena_t *locked_arena = arena_get(tsd_tsdn(tsd),
locked_arena_ind, false);
bool idump;
if (config_prof) {
idump = false;
}
bool lock_large = !arena_is_auto(locked_arena);
if (lock_large) {
@@ -295,11 +278,6 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
}
if ((config_prof || config_stats) &&
(locked_arena == tcache_arena)) {
if (config_prof) {
idump = arena_prof_accum(tsd_tsdn(tsd),
tcache_arena, tcache->prof_accumbytes);
tcache->prof_accumbytes = 0;
}
if (config_stats) {
merged_stats = true;
arena_stats_large_flush_nrequests_add(
@@ -332,9 +310,6 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t
ndeferred++;
}
}
if (config_prof && idump) {
prof_idump(tsd_tsdn(tsd));
}
arena_decay_ticks(tsd_tsdn(tsd), locked_arena, nflush -
ndeferred);
nflush = ndeferred;
@@ -462,7 +437,6 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
assert(!tcache_bin_lowbits_overflowable(avail_stack));
memset(&tcache->link, 0, sizeof(ql_elm(tcache_t)));
tcache->prof_accumbytes = 0;
tcache->next_gc_bin = 0;
tcache->arena = NULL;
@@ -590,14 +564,6 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
assert(tbin->tstats.nrequests == 0);
}
}
if (config_prof && tcache->prof_accumbytes > 0) {
if (arena_prof_accum(tsd_tsdn(tsd), tcache->arena,
tcache->prof_accumbytes)) {
prof_idump(tsd_tsdn(tsd));
}
tcache->prof_accumbytes = 0;
}
}
void

@@ -18,6 +18,29 @@ static void thread_##event##_event_handler(tsd_t *tsd);
ITERATE_OVER_ALL_EVENTS
#undef E
static void
thread_prof_sample_event_handler(tsd_t *tsd) {
assert(config_prof && opt_prof);
assert(prof_sample_event_wait_get(tsd) == 0U);
uint64_t last_event = thread_allocated_last_event_get(tsd);
uint64_t last_sample_event = prof_sample_last_event_get(tsd);
prof_sample_last_event_set(tsd, last_event);
if (prof_idump_accum(tsd_tsdn(tsd), last_event - last_sample_event)) {
prof_idump(tsd_tsdn(tsd));
}
if (!prof_active_get_unlocked()) {
/*
* If prof_active is off, we reset prof_sample_event_wait to be
* the sample interval when it drops to 0, so that there won't
* be excessive routings to the slow path, and that when
* prof_active is turned on later, the counting for sampling
* can immediately resume as normal.
*/
thread_prof_sample_event_update(tsd,
(uint64_t)(1 << lg_prof_sample));
}
}
static uint64_t
thread_allocated_next_event_compute(tsd_t *tsd) {
uint64_t wait = THREAD_EVENT_MAX_START_WAIT;
@@ -86,23 +109,6 @@ thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) {
thread_allocated_next_event_fast_set(tsd, next_event_fast);
}
static void
thread_prof_sample_event_handler(tsd_t *tsd) {
assert(config_prof && opt_prof);
assert(prof_sample_event_wait_get(tsd) == 0U);
if (!prof_active_get_unlocked()) {
/*
* If prof_active is off, we reset prof_sample_event_wait to be
* the sample interval when it drops to 0, so that there won't
* be excessive routings to the slow path, and that when
* prof_active is turned on later, the counting for sampling
* can immediately resume as normal.
*/
thread_prof_sample_event_update(tsd,
(uint64_t)(1 << lg_prof_sample));
}
}
static uint64_t
thread_event_trigger_batch_update(tsd_t *tsd, uint64_t accumbytes,
bool allow_event_trigger) {