From 60993697d8bd3f8a07756091df397ed4044da921 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Mon, 3 Aug 2020 13:05:34 -0700
Subject: [PATCH] Prof: Add prof_unbias.

This gives more accurate attribution of bytes and counts to stack traces,
without introducing backwards incompatibilities in heap-profile parsing
tools. We track the ideal reported (to the end user) number of bytes more
carefully inside core jemalloc. When dumping heap profiles, instead of
outputting our counts directly, we output counts that will cause parsing
tools to give a result close to the value we want. We retain the old
version as an opt setting, to let users who are tracking values on a
per-component basis keep their metrics stable until they decide to switch.
---
 include/jemalloc/internal/prof_externs.h |   4 +
 include/jemalloc/internal/prof_structs.h |   4 +
 src/jemalloc.c                           |  10 ++
 src/prof.c                               |  65 +++++++++
 src/prof_data.c                          | 160 ++++++++++++++++++++++-
 5 files changed, 241 insertions(+), 2 deletions(-)

diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h
index 4579ab02..ba5933af 100644
--- a/include/jemalloc/internal/prof_externs.h
+++ b/include/jemalloc/internal/prof_externs.h
@@ -19,6 +19,7 @@ extern char opt_prof_prefix[
     PATH_MAX +
 #endif
     1];
+extern bool opt_prof_unbias;
 
 /* For recording recent allocations */
 extern ssize_t opt_prof_recent_alloc_max;
@@ -40,6 +41,9 @@ extern uint64_t prof_interval;
  * resets.
  */
 extern size_t lg_prof_sample;
+extern size_t prof_unbiased_sz[SC_NSIZES];
+extern size_t prof_shifted_unbiased_cnt[SC_NSIZES];
+void prof_unbias_map_init();
 
 extern bool prof_booted;
 
diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h
index 26942aa6..fbad6145 100644
--- a/include/jemalloc/internal/prof_structs.h
+++ b/include/jemalloc/internal/prof_structs.h
@@ -24,9 +24,13 @@ typedef struct {
 struct prof_cnt_s {
 	/* Profiling counters. */
 	uint64_t curobjs;
+	uint64_t curobjs_shifted_unbiased;
 	uint64_t curbytes;
+	uint64_t curbytes_unbiased;
 	uint64_t accumobjs;
+	uint64_t accumobjs_shifted_unbiased;
 	uint64_t accumbytes;
+	uint64_t accumbytes_unbiased;
 };
 
 typedef enum {
diff --git a/src/jemalloc.c b/src/jemalloc.c
index f2e5f8eb..ae9ef3d1 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1517,6 +1517,16 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
 				}
 				CONF_CONTINUE;
 			}
+			/*
+			 * Undocumented. When set to false, don't
+			 * correct for an unbiasing bug in jeprof
+			 * attribution. This can be handy if you want
+			 * to get consistent numbers from your binary
+			 * across different jemalloc versions, even if
+			 * those numbers are incorrect. The default is
+			 * true.
+			 */
+			CONF_HANDLE_BOOL(opt_prof_unbias, "prof_unbias")
 		}
 		if (config_log) {
 			if (CONF_MATCH("log")) {
diff --git a/src/prof.c b/src/prof.c
index 25735410..7b649e49 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -32,6 +32,7 @@ bool opt_prof_leak = false;
 bool opt_prof_accum = false;
 char opt_prof_prefix[PROF_DUMP_FILENAME_LEN];
 bool opt_prof_sys_thread_name = false;
+bool opt_prof_unbias = true;
 
 /* Accessed via prof_sample_event_handler().
*/ static counter_accum_t prof_idump_accumulated; @@ -60,6 +61,8 @@ static malloc_mutex_t prof_gdump_mtx; uint64_t prof_interval = 0; size_t lg_prof_sample; +size_t prof_unbiased_sz[SC_NSIZES]; +size_t prof_shifted_unbiased_cnt[SC_NSIZES]; static uint64_t next_thr_uid; static malloc_mutex_t next_thr_uid_mtx; @@ -69,6 +72,40 @@ bool prof_booted = false; /******************************************************************************/ +void prof_unbias_map_init() { + /* See the comment in prof_sample_new_event_wait */ +#ifdef JEMALLOC_PROF + for (szind_t i = 0; i < SC_NSIZES; i++) { + double sz = (double)sz_index2size(i); + double rate = (double)(ZU(1) << lg_prof_sample); + double div_val = 1.0 - exp(-sz / rate); + double unbiased_sz = sz / div_val; + /* + * The "true" right value for the unbiased count is + * 1.0/(1 - exp(-sz/rate)). The problem is, we keep the counts + * as integers (for a variety of reasons -- rounding errors + * could trigger asserts, and not all libcs can properly handle + * floating point arithmetic during malloc calls inside libc). + * Rounding to an integer, though, can lead to rounding errors + * of over 30% for sizes close to the sampling rate. So + * instead, we multiply by a constant, dividing the maximum + * possible roundoff error by that constant. To avoid overflow + * in summing up size_t values, the largest safe constant we can + * pick is the size of the smallest allocation. + */ + double cnt_shift = (double)(ZU(1) << SC_LG_TINY_MIN); + double shifted_unbiased_cnt = cnt_shift / div_val; + prof_unbiased_sz[i] = (size_t)round(unbiased_sz); + prof_shifted_unbiased_cnt[i] = (size_t)round( + shifted_unbiased_cnt); + } +#else + unreachable(); +#endif +} + +/******************************************************************************/ + void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx) { cassert(config_prof); @@ -96,12 +133,30 @@ prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size, ptr); prof_info_set(tsd, edata, tctx); + szind_t szind = sz_size2index(size); + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + /* + * We need to do these map lookups while holding the lock, to avoid the + * possibility of races with prof_reset calls, which update the map and + * then acquire the lock. This actually still leaves a data race on the + * contents of the unbias map, but we have not yet gone through and + * atomic-ified the prof module, and compilers are not yet causing us + * issues. The key thing is to make sure that, if we read garbage data, + * the prof_reset call is about to mark our tctx as expired before any + * dumping of our corrupted output is attempted. 
+	 */
+	size_t shifted_unbiased_cnt = prof_shifted_unbiased_cnt[szind];
+	size_t unbiased_bytes = prof_unbiased_sz[szind];
 	tctx->cnts.curobjs++;
+	tctx->cnts.curobjs_shifted_unbiased += shifted_unbiased_cnt;
 	tctx->cnts.curbytes += usize;
+	tctx->cnts.curbytes_unbiased += unbiased_bytes;
 	if (opt_prof_accum) {
 		tctx->cnts.accumobjs++;
+		tctx->cnts.accumobjs_shifted_unbiased += shifted_unbiased_cnt;
 		tctx->cnts.accumbytes += usize;
+		tctx->cnts.accumbytes_unbiased += unbiased_bytes;
 	}
 	bool record_recent = prof_recent_alloc_prepare(tsd, tctx);
 	tctx->prepared = false;
@@ -118,12 +173,21 @@ prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info) {
 	prof_tctx_t *tctx = prof_info->alloc_tctx;
 	assert((uintptr_t)tctx > (uintptr_t)1U);
 
+	szind_t szind = sz_size2index(usize);
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
 
 	assert(tctx->cnts.curobjs > 0);
 	assert(tctx->cnts.curbytes >= usize);
+	/*
+	 * It's not correct to do equivalent asserts for unbiased bytes, because
+	 * of the potential for races with prof.reset calls. The map contents
+	 * should really be atomic, but we have not atomic-ified the prof module
+	 * yet.
+	 */
 	tctx->cnts.curobjs--;
+	tctx->cnts.curobjs_shifted_unbiased -= prof_shifted_unbiased_cnt[szind];
 	tctx->cnts.curbytes -= usize;
+	tctx->cnts.curbytes_unbiased -= prof_unbiased_sz[szind];
 
 	prof_try_log(tsd, usize, prof_info);
 
@@ -517,6 +581,7 @@ prof_boot2(tsd_t *tsd, base_t *base) {
 		unsigned i;
 
 		lg_prof_sample = opt_lg_prof_sample;
+		prof_unbias_map_init();
 
 		prof_active = opt_prof_active;
 		if (malloc_mutex_init(&prof_active_mtx, "prof_active",
diff --git a/src/prof_data.c b/src/prof_data.c
index 6b441de1..ae9cd4b1 100644
--- a/src/prof_data.c
+++ b/src/prof_data.c
@@ -514,12 +514,121 @@ prof_dump_printf(write_cb_t *prof_dump_write, void *cbopaque,
 	prof_dump_write(cbopaque, buf);
 }
+
+/*
+ * Casting a double to a uint64_t may not necessarily be in range; this can be
+ * UB. I don't think this is practically possible with the cur counters, but
+ * plausibly could be with the accum counters.
+ */
+#ifdef JEMALLOC_PROF
+static uint64_t
+prof_double_uint64_cast(double d) {
+	/*
+	 * Note: UINT64_MAX + 1 is exactly representable as a double on all
+	 * reasonable platforms (certainly those we'll support). Writing this
+	 * as !(a < b) instead of (a >= b) means that we're NaN-safe.
+	 */
+	double rounded = round(d);
+	if (!(rounded < (double)UINT64_MAX)) {
+		return UINT64_MAX;
+	}
+	return (uint64_t)rounded;
+}
+#endif
+
+/*
+ * The unbiasing story is long. The jeprof unbiasing logic was copied from
+ * pprof. Both shared an issue: they unbiased using the average size of the
+ * allocations at a particular stack trace. This can work out OK if
+ * allocations are mostly of the same size given some stack, but not
+ * otherwise. We now internally track what the unbiased results ought to be.
+ * We can't just report them as they are though; they'll still go through the
+ * jeprof unbiasing process. Instead, we figure out what values we can feed
+ * *into* jeprof's unbiasing mechanism that will lead to getting the right
+ * values out.
+ *
+ * It'll unbias count and aggregate size as:
+ *
+ * c_out = c_in * 1/(1-exp(-s_in/c_in/R))
+ * s_out = s_in * 1/(1-exp(-s_in/c_in/R))
+ *
+ * We want to solve for the values of c_in and s_in that will
+ * give the c_out and s_out that we've computed internally.
+ *
+ * Let's do a change of variables (both to make the math easier and to make it
+ * easier to write):
+ * x = s_in / c_in
+ * y = s_in
+ * k = 1/R.
+ * + * Then + * c_out = y/x * 1/(1-exp(-k*x)) + * s_out = y * 1/(1-exp(-k*x)) + * + * The first equation gives: + * y = x * c_out * (1-exp(-k*x)) + * The second gives: + * y = s_out * (1-exp(-k*x)) + * So we have + * x = s_out / c_out. + * And all the other values fall out from that. + * + * This is all a fair bit of work. The thing we get out of it is that we don't + * break backwards compatibility with jeprof (and the various tools that have + * copied its unbiasing logic). Eventually, we anticipate a v3 heap profile + * dump format based on JSON, at which point I think much of this logic can get + * cleaned up (since we'll be taking a compatibility break there anyways). + */ +static void +prof_do_unbias(uint64_t c_out_shifted_i, uint64_t s_out_i, uint64_t *r_c_in, + uint64_t *r_s_in) { +#ifdef JEMALLOC_PROF + if (c_out_shifted_i == 0 || s_out_i == 0) { + *r_c_in = 0; + *r_s_in = 0; + return; + } + /* + * See the note in prof_unbias_map_init() to see why we take c_out in a + * shifted form. + */ + double c_out = (double)c_out_shifted_i + / (double)(ZU(1) << SC_LG_TINY_MIN); + double s_out = (double)s_out_i; + double R = (double)(ZU(1) << lg_prof_sample); + + double x = s_out / c_out; + double y = s_out * (1.0 - exp(-x / R)); + + double c_in = y / x; + double s_in = y; + + *r_c_in = prof_double_uint64_cast(c_in); + *r_s_in = prof_double_uint64_cast(s_in); +#else + unreachable(); +#endif +} + static void prof_dump_print_cnts(write_cb_t *prof_dump_write, void *cbopaque, const prof_cnt_t *cnts) { + uint64_t curobjs; + uint64_t curbytes; + uint64_t accumobjs; + uint64_t accumbytes; + if (opt_prof_unbias) { + prof_do_unbias(cnts->curobjs_shifted_unbiased, + cnts->curbytes_unbiased, &curobjs, &curbytes); + prof_do_unbias(cnts->accumobjs_shifted_unbiased, + cnts->accumbytes_unbiased, &accumobjs, &accumbytes); + } else { + curobjs = cnts->curobjs; + curbytes = cnts->curbytes; + accumobjs = cnts->accumobjs; + accumbytes = cnts->accumbytes; + } prof_dump_printf(prof_dump_write, cbopaque, "%"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]", - cnts->curobjs, cnts->curbytes, cnts->accumobjs, cnts->accumbytes); + curobjs, curbytes, accumobjs, accumbytes); } static void @@ -539,12 +648,20 @@ prof_tctx_merge_tdata(tsdn_t *tsdn, prof_tctx_t *tctx, prof_tdata_t *tdata) { memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curobjs_shifted_unbiased + += tctx->dump_cnts.curobjs_shifted_unbiased; tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + tdata->cnt_summed.curbytes_unbiased + += tctx->dump_cnts.curbytes_unbiased; if (opt_prof_accum) { tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumobjs_shifted_unbiased += + tctx->dump_cnts.accumobjs_shifted_unbiased; tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + tdata->cnt_summed.accumbytes_unbiased += + tctx->dump_cnts.accumbytes_unbiased; } break; case prof_tctx_state_dumping: @@ -558,10 +675,17 @@ prof_tctx_merge_gctx(tsdn_t *tsdn, prof_tctx_t *tctx, prof_gctx_t *gctx) { malloc_mutex_assert_owner(tsdn, gctx->lock); gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + gctx->cnt_summed.curobjs_shifted_unbiased + += tctx->dump_cnts.curobjs_shifted_unbiased; gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + gctx->cnt_summed.curbytes_unbiased += tctx->dump_cnts.curbytes_unbiased; if (opt_prof_accum) { gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + gctx->cnt_summed.accumobjs_shifted_unbiased + += 
tctx->dump_cnts.accumobjs_shifted_unbiased; gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + gctx->cnt_summed.accumbytes_unbiased + += tctx->dump_cnts.accumbytes_unbiased; } } @@ -757,11 +881,19 @@ prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, } arg->cnt_all->curobjs += tdata->cnt_summed.curobjs; + arg->cnt_all->curobjs_shifted_unbiased + += tdata->cnt_summed.curobjs_shifted_unbiased; arg->cnt_all->curbytes += tdata->cnt_summed.curbytes; + arg->cnt_all->curbytes_unbiased + += tdata->cnt_summed.curbytes_unbiased; if (opt_prof_accum) { arg->cnt_all->accumobjs += tdata->cnt_summed.accumobjs; + arg->cnt_all->accumobjs_shifted_unbiased + += tdata->cnt_summed.accumobjs_shifted_unbiased; arg->cnt_all->accumbytes += tdata->cnt_summed.accumbytes; + arg->cnt_all->accumbytes_unbiased += + tdata->cnt_summed.accumbytes_unbiased; } } else { tdata->dumping = false; @@ -814,8 +946,16 @@ prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx, (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) { assert(gctx->cnt_summed.curobjs == 0); assert(gctx->cnt_summed.curbytes == 0); + /* + * These asserts would not be correct -- see the comment on races + * in prof.c + * assert(gctx->cnt_summed.curobjs_unbiased == 0); + * assert(gctx->cnt_summed.curbytes_unbiased == 0); + */ assert(gctx->cnt_summed.accumobjs == 0); + assert(gctx->cnt_summed.accumobjs_shifted_unbiased == 0); assert(gctx->cnt_summed.accumbytes == 0); + assert(gctx->cnt_summed.accumbytes_unbiased == 0); return; } @@ -834,7 +974,7 @@ prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx, } /* - * See prof_sample_threshold_update() comment for why the body of this function + * See prof_sample_new_event_wait() comment for why the body of this function * is conditionally compiled. */ static void @@ -1120,6 +1260,7 @@ prof_reset(tsd_t *tsd, size_t lg_sample) { malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); lg_prof_sample = lg_sample; + prof_unbias_map_init(); next = NULL; do { @@ -1162,9 +1303,24 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) { assert(tctx->cnts.curobjs == 0); assert(tctx->cnts.curbytes == 0); + /* + * These asserts are not correct -- see the comment about races in + * prof.c + * + * assert(tctx->cnts.curobjs_shifted_unbiased == 0); + * assert(tctx->cnts.curbytes_unbiased == 0); + */ assert(!opt_prof_accum); assert(tctx->cnts.accumobjs == 0); assert(tctx->cnts.accumbytes == 0); + /* + * These ones are, since accumbyte counts never go down. Either + * prof_accum is off (in which case these should never have changed from + * their initial value of zero), or it's on (in which case we shouldn't + * be destroying this tctx). + */ + assert(tctx->cnts.accumobjs_shifted_unbiased == 0); + assert(tctx->cnts.accumbytes_unbiased == 0); prof_gctx_t *gctx = tctx->gctx;
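The round trip that the prof_do_unbias() comment derives can be checked with a
small standalone program. This is only an illustrative sketch, not part of the
patch: the constants (a sampling rate of 2^19, a tiny-class shift of 2^3, and a
4 KiB sampled allocation) are assumptions chosen for the example, and all names
below are local to it rather than jemalloc symbols.

/* Illustrative sketch; compile with -lm. */
#include <math.h>
#include <stdio.h>

int
main(void) {
	double R = (double)(1 << 19);        /* assumed lg_prof_sample = 19 */
	double cnt_shift = (double)(1 << 3); /* assumed SC_LG_TINY_MIN = 3 */
	double sz = 4096.0;                  /* one sampled allocation */

	/* What jemalloc now tracks internally for this sample. */
	double div_val = 1.0 - exp(-sz / R);
	double curobjs_shifted_unbiased = cnt_shift / div_val;
	double curbytes_unbiased = sz / div_val;

	/* prof_do_unbias(): the (c_in, s_in) pair written to the dump. */
	double c_out = curobjs_shifted_unbiased / cnt_shift;
	double s_out = curbytes_unbiased;
	double x = s_out / c_out;
	double y = s_out * (1.0 - exp(-x / R));
	double c_in = y / x;
	double s_in = y;

	/* jeprof's own unbiasing, applied to the dumped values. */
	double scale = 1.0 / (1.0 - exp(-s_in / c_in / R));
	printf("internal: %f objs, %f bytes\n", c_out, s_out);
	printf("jeprof:   %f objs, %f bytes\n", c_in * scale, s_in * scale);
	return 0;
}

Both lines print the same values (up to rounding), which is the compatibility
property the patch relies on: the dumped counts are chosen so that jeprof's
existing unbiasing reproduces the internally tracked unbiased totals.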