Implement sampling for heap profiling.
parent f3ff75289b
commit b9477e782b
@@ -647,6 +647,7 @@ if test "x$enable_tls" = "x0" ; then
   enable_prof="0"
 fi
 if test "x$enable_prof" = "x1" ; then
+  LIBS="$LIBS -lm"
   AC_DEFINE([JEMALLOC_PROF], [ ])
   if test "x$enable_prof_libunwind" = "x1" ; then
     AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
@@ -38,7 +38,7 @@
 .\" @(#)malloc.3	8.1 (Berkeley) 6/4/93
 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $
 .\"
-.Dd February 11, 2010
+.Dd March 1, 2010
 .Dt JEMALLOC 3
 .Os
 .Sh NAME
@@ -355,6 +355,9 @@ will disable dirty page purging.
 @roff_prof@.Dq B
 @roff_prof@option for backtrace depth control.
+@roff_prof@See the
+@roff_prof@.Dq S
+@roff_prof@option for probabilistic sampling control.
 @roff_prof@See the
 @roff_prof@.Dq I
 @roff_prof@option for information on interval-triggered profile dumping, and the
 @roff_prof@.Dq U
@@ -464,6 +467,15 @@ Double/halve the size of the maximum size class that is a multiple of the
 quantum (8 or 16 bytes, depending on architecture).
 Above this size, cacheline spacing is used for size classes.
 The default value is 128 bytes.
+@roff_prof@.It S
+@roff_prof@Double/halve the average interval between allocation samples, as
+@roff_prof@measured in bytes of allocation activity.
+@roff_prof@Increasing the sampling interval decreases profile fidelity, but
+@roff_prof@also decreases the computational overhead.
+@roff_prof@The default sample interval is one (i.e. all allocations are
+@roff_prof@sampled).
+@roff_prof@A sample interval greater than one implicitly disables leak
+@roff_prof@reporting.
 @roff_prof@.It U
 @roff_prof@Trigger a memory profile dump every time the total virtual memory
 @roff_prof@exceeds the previous maximum.
@@ -25,7 +25,7 @@
  * uint32_t state : Seed value.
  * const uint32_t a, c : See above discussion.
  */
-#define prn(r, lg_range, state, a, c) do {		\
+#define prn32(r, lg_range, state, a, c) do {		\
 	assert(lg_range > 0);				\
 	assert(lg_range <= 32);				\
 							\
@@ -34,6 +34,16 @@
 	r >>= (32 - lg_range);				\
 } while (false)
 
+/* Same as prn32(), but 64 bits of pseudo-randomness, using uint64_t. */
+#define prn64(r, lg_range, state, a, c) do {		\
+	assert(lg_range > 0);				\
+	assert(lg_range <= 64);				\
+							\
+	r = (state * (a)) + (c);			\
+	state = r;					\
+	r >>= (64 - lg_range);				\
+} while (false)
+
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
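As an aside on the new macro: prn64() is a plain linear congruential generator. Each step computes state = state * a + c and hands back the high lg_range bits of the new state, which are the best-distributed bits of an LCG (the low bits cycle with short periods). A minimal standalone sketch; the seed and Knuth's MMIX constants below are illustrative stand-ins for caller-supplied values, not from the commit:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* LCG step in the style of prn64(): keep the high lg_range bits. */
#define prn64(r, lg_range, state, a, c) do {		\
	assert(lg_range > 0);				\
	assert(lg_range <= 64);				\
							\
	r = (state * (a)) + (c);			\
	state = r;					\
	r >>= (64 - lg_range);				\
} while (false)

int
main(void)
{
	uint64_t state = 42;			/* Hypothetical seed. */
	uint64_t a = 6364136223846793005ULL;	/* Knuth MMIX multiplier. */
	uint64_t c = 1442695040888963407ULL;	/* Knuth MMIX increment. */
	uint64_t r;
	int i;

	for (i = 0; i < 4; i++) {
		prn64(r, 10, state, a, c);	/* Draw 10 pseudo-random bits. */
		printf("%llu\n", (unsigned long long)r);
	}
	return (0);
}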
@@ -8,6 +8,9 @@ typedef struct prof_thr_cnt_s prof_thr_cnt_t;
 typedef struct prof_ctx_s prof_ctx_t;
 typedef struct prof_s prof_t;
 
+/* Option defaults. */
+#define LG_PROF_BT_MAX_DEFAULT		2
+#define LG_PROF_SAMPLE_DEFAULT		0
 #define LG_PROF_INTERVAL_DEFAULT	30
 
 /*
@@ -16,7 +19,7 @@ typedef struct prof_s prof_t;
  * a hard-coded number of backtrace frame handlers, so increasing
  * LG_PROF_BT_MAX requires changing prof_backtrace().
  */
-#define LG_PROF_BT_MAX		7
+#define LG_PROF_BT_MAX		7 /* >= LG_PROF_BT_MAX_DEFAULT */
 #define PROF_BT_MAX		(1U << LG_PROF_BT_MAX)
 
 /* Initial hash table size. */
@@ -117,7 +120,8 @@ struct prof_ctx_s {
 
 extern bool opt_prof;
 extern size_t opt_lg_prof_bt_max;	/* Maximum backtrace depth. */
-extern size_t opt_lg_prof_interval;
+extern size_t opt_lg_prof_sample;	/* Mean bytes between samples. */
+extern size_t opt_lg_prof_interval;	/* lg(prof_interval). */
 extern bool opt_prof_udump;		/* High-water memory dumping. */
 extern bool opt_prof_leak;		/* Dump leak summary at exit. */
 
@@ -133,7 +137,7 @@ extern uint64_t prof_interval;
 bool prof_init(prof_t *prof, bool master);
 void prof_destroy(prof_t *prof);
 
-prof_thr_cnt_t *prof_alloc_prep(void);
+prof_thr_cnt_t *prof_alloc_prep(size_t size);
 prof_thr_cnt_t *prof_cnt_get(const void *ptr);
 void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);
 void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
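Since all three option defaults are lg (base-2 logarithm) values, the effective settings are powers of two. A quick illustrative translation of the defaults above (macro names mirror the header; the program itself is just a sketch):

#include <stdint.h>
#include <stdio.h>

#define LG_PROF_BT_MAX_DEFAULT		2
#define LG_PROF_SAMPLE_DEFAULT		0
#define LG_PROF_INTERVAL_DEFAULT	30

int
main(void)
{
	/* 2^2 == 4 backtrace frames by default. */
	printf("default backtrace depth: %u\n", 1U << LG_PROF_BT_MAX_DEFAULT);
	/* 2^0 == 1 byte between samples, i.e. every allocation is sampled. */
	printf("default sample interval: %llu bytes\n",
	    (unsigned long long)((uint64_t)1 << LG_PROF_SAMPLE_DEFAULT));
	/* 2^30 bytes (1 GiB) of allocation activity between dumps. */
	printf("default dump interval:   %llu bytes\n",
	    (unsigned long long)((uint64_t)1 << LG_PROF_INTERVAL_DEFAULT));
	return (0);
}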
@@ -100,7 +100,7 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key,
	 * Cycle through the cells in the bucket, starting at a random position.
	 * The randomness avoids worst-case search overhead as buckets fill up.
	 */
-	prn(offset, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
+	prn32(offset, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
	for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) {
		cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) +
		    ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))];
@@ -142,7 +142,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
		 * were an item for which both hashes indicated the same
		 * bucket.
		 */
-		prn(i, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
+		prn32(i, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
		cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i];
		assert(cell->key != NULL);

@@ -69,6 +69,7 @@ CTL_PROTO(opt_lg_tcache_gc_sweep)
 #ifdef JEMALLOC_PROF
 CTL_PROTO(opt_prof)
 CTL_PROTO(opt_lg_prof_bt_max)
+CTL_PROTO(opt_lg_prof_sample)
 CTL_PROTO(opt_lg_prof_interval)
 CTL_PROTO(opt_prof_udump)
 CTL_PROTO(opt_prof_leak)
@@ -234,6 +235,7 @@ static const ctl_node_t opt_node[] = {
 #ifdef JEMALLOC_PROF
	{NAME("prof"), CTL(opt_prof)},
	{NAME("lg_prof_bt_max"), CTL(opt_lg_prof_bt_max)},
+	{NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)},
	{NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)},
	{NAME("prof_udump"), CTL(opt_prof_udump)},
	{NAME("prof_leak"), CTL(opt_prof_leak)},
@@ -1066,6 +1068,7 @@ CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t)
 #ifdef JEMALLOC_PROF
 CTL_RO_GEN(opt_prof, opt_prof, bool)
 CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t)
+CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
 CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, size_t)
 CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool)
 CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)
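With the three ctl hooks above in place, the new option is visible through jemalloc's mallctl() interface alongside the existing prof options. A hedged sketch of a caller querying it, mirroring the mallctl usage visible in the stats_print() hunk further below (the include path is an assumed install location, and depending on build configuration the symbol may be prefixed via JEMALLOC_P()):

#include <stdio.h>
#include <jemalloc/jemalloc.h>	/* Assumed location of the mallctl() prototype. */

int
main(void)
{
	size_t sv, ssz = sizeof(size_t);

	/* "opt.lg_prof_sample" is the read-only node registered above. */
	if (mallctl("opt.lg_prof_sample", &sv, &ssz, NULL, 0) == 0) {
		printf("mean sample interval: %zu bytes (2^%zu)\n",
		    (size_t)1 << sv, sv);
	}
	return (0);
}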
@@ -582,6 +582,15 @@ MALLOC_OUT:
				opt_lg_qspace_max++;
				break;
 #ifdef JEMALLOC_PROF
+			case 's':
+				if (opt_lg_prof_sample > 0)
+					opt_lg_prof_sample--;
+				break;
+			case 'S':
+				if (opt_lg_prof_sample + 1 <
+				    (sizeof(uint64_t) << 3))
+					opt_lg_prof_sample++;
+				break;
			case 'u':
				opt_prof_udump = false;
				break;
@@ -870,7 +879,7 @@ JEMALLOC_P(malloc)(size_t size)
	}
 
 #ifdef JEMALLOC_PROF
-	if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
+	if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
		ret = NULL;
		goto OOM;
	}
@@ -955,7 +964,7 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
	}
 
 #ifdef JEMALLOC_PROF
-	if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
+	if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
		result = NULL;
		ret = EINVAL;
	} else
@@ -1030,7 +1039,7 @@ JEMALLOC_P(calloc)(size_t num, size_t size)
	}
 
 #ifdef JEMALLOC_PROF
-	if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
+	if (opt_prof && (cnt = prof_alloc_prep(num_size)) == NULL) {
		ret = NULL;
		goto RETURN;
	}
@@ -1106,7 +1115,7 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
	if (opt_prof) {
		old_size = isalloc(ptr);
		old_cnt = prof_cnt_get(ptr);
-		if ((cnt = prof_alloc_prep()) == NULL) {
+		if ((cnt = prof_alloc_prep(size)) == NULL) {
			ret = NULL;
			goto OOM;
		}
@@ -1144,7 +1153,7 @@ OOM:
		ret = NULL;
	} else {
 #ifdef JEMALLOC_PROF
-		if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
+		if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
			ret = NULL;
		} else
 #endif
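Per jemalloc's usual flag convention, each lowercase 's' in the option string halves the mean sample interval and each uppercase 'S' doubles it, with the exponent clamped so that 2^lg_prof_sample stays representable in a uint64_t. For example, a hypothetical option string of nineteen 'S' characters would raise lg_prof_sample from the default 0 to 19, i.e. a mean interval of 524288 bytes. A standalone sketch of just this clamping logic (variable names are local stand-ins):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	size_t lg_prof_sample = 0;	/* LG_PROF_SAMPLE_DEFAULT */
	const char *opts = "SSSSSSSSSSSSSSSSSSS";	/* Nineteen 'S' (illustrative). */
	const char *p;

	for (p = opts; *p != '\0'; p++) {
		switch (*p) {
		case 's':
			if (lg_prof_sample > 0)
				lg_prof_sample--;
			break;
		case 'S':
			/* sizeof(uint64_t) << 3 == 64 bits. */
			if (lg_prof_sample + 1 < (sizeof(uint64_t) << 3))
				lg_prof_sample++;
			break;
		}
	}
	printf("lg_prof_sample: %zu (mean interval %llu bytes)\n",
	    lg_prof_sample,
	    (unsigned long long)((uint64_t)1 << lg_prof_sample));
	return (0);
}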
@@ -12,11 +12,14 @@
 #include <libunwind.h>
 #endif
 
+#include <math.h>
+
 /******************************************************************************/
 /* Data. */
 
 bool opt_prof = false;
-size_t opt_lg_prof_bt_max = 2;
+size_t opt_lg_prof_bt_max = LG_PROF_BT_MAX_DEFAULT;
+size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
 size_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
 bool opt_prof_udump = false;
 bool opt_prof_leak = false;
@@ -52,6 +55,13 @@ static pthread_key_t bt2cnt_tsd;
 /* (1U << opt_lg_prof_bt_max). */
 static unsigned prof_bt_max;
 
+static __thread uint64_t prof_sample_prn_state
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+static __thread uint64_t prof_sample_threshold
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+static __thread uint64_t prof_sample_accum
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+
 static malloc_mutex_t prof_dump_seq_mtx;
 static uint64_t prof_dump_seq;
 static uint64_t prof_dump_iseq;
@@ -500,15 +510,27 @@ prof_lookup(prof_bt_t *bt)
 }
 
 prof_thr_cnt_t *
-prof_alloc_prep(void)
+prof_alloc_prep(size_t size)
 {
	prof_thr_cnt_t *ret;
	void *vec[prof_bt_max];
	prof_bt_t bt;
 
+	/*
+	 * Determine whether to capture a backtrace based on whether size is
+	 * enough for prof_accum to reach prof_sample_threshold.  However,
+	 * delay updating these variables until prof_{m,re}alloc(), because we
+	 * don't know for sure that the allocation will succeed.
+	 *
+	 * Use subtraction rather than addition to avoid potential integer
+	 * overflow.
+	 */
+	if (size >= prof_sample_threshold - prof_sample_accum) {
		bt_init(&bt, vec);
		prof_backtrace(&bt, 2, prof_bt_max);
		ret = prof_lookup(&bt);
+	} else
+		ret = (prof_thr_cnt_t *)(uintptr_t)1U;
 
	return (ret);
 }
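The comment about subtraction deserves unpacking: the naive test prof_sample_accum + size >= prof_sample_threshold can wrap around for a huge size, silently skipping a sample, whereas size >= prof_sample_threshold - prof_sample_accum cannot overflow, because prof_sample_accum never exceeds prof_sample_threshold. A self-contained illustration with made-up values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t threshold = 1000000;	/* stand-in for prof_sample_threshold */
	uint64_t accum = 999000;	/* stand-in for prof_sample_accum (< threshold) */
	uint64_t size = UINT64_MAX - 512;	/* pathologically large request */

	/* Naive form: accum + size wraps to a small value; wrongly false. */
	printf("naive test:         %d\n", accum + size >= threshold);
	/* Commit's form: threshold - accum cannot underflow; correctly true. */
	printf("overflow-safe test: %d\n", size >= threshold - accum);
	return (0);
}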
@@ -550,13 +572,68 @@ prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
		huge_prof_cnt_set(ptr, cnt);
 }
 
+static inline void
+prof_sample_threshold_update(void)
+{
+	uint64_t r;
+	double u;
+
+	/*
+	 * Compute prof_sample_threshold as a geometrically distributed random
+	 * variable with mean (2^opt_lg_prof_sample).
+	 */
+	prn64(r, 53, prof_sample_prn_state, (uint64_t)1125899906842625LLU,
+	    1058392653243283975);
+	u = (double)r * (1.0/9007199254740992.0L);
+	prof_sample_threshold = (uint64_t)(log(u) /
+	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+	    + (uint64_t)1U;
+}
+
+static inline void
+prof_sample_accum_update(size_t size)
+{
+
+	if (opt_lg_prof_sample == 0) {
+		/*
+		 * Don't bother with sampling logic, since sampling interval is
+		 * 1.
+		 */
+		return;
+	}
+
+	if (prof_sample_threshold == 0) {
+		/* Initialize.  Seed the prng differently for each thread. */
+		prof_sample_prn_state = (uint64_t)(uintptr_t)&size;
+		prof_sample_threshold_update();
+	}
+
+	/* Take care to avoid integer overflow. */
+	if (size >= prof_sample_threshold - prof_sample_accum) {
+		prof_sample_accum -= (prof_sample_threshold - size);
+		/*
+		 * Compute new geometrically distributed prof_sample_threshold.
+		 */
+		prof_sample_threshold_update();
+		while (prof_sample_accum >= prof_sample_threshold) {
+			prof_sample_accum -= prof_sample_threshold;
+			prof_sample_threshold_update();
+		}
+	} else
+		prof_sample_accum += size;
+}
+
 void
 prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
 {
	size_t size = isalloc(ptr);
 
-	prof_cnt_set(ptr, cnt);
+	assert(ptr != NULL);
+
+	prof_cnt_set(ptr, cnt);
+	prof_sample_accum_update(size);
 
+	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->epoch++;
		/*********/
		mb_write();
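The sampling math is the classic inverse-CDF draw from a geometric distribution: with per-byte sampling probability p = 2^-opt_lg_prof_sample and a uniform variate u, the next threshold is floor(log(u) / log(1 - p)) + 1, whose mean is 1/p = 2^opt_lg_prof_sample bytes. The magic constant 9007199254740992 is 2^53, scaling the 53-bit prn64() draw into a uniform double, and 1125899906842625 / 1058392653243283975 are the LCG multiplier and increment. A standalone sketch of the same math under stated assumptions (drand48() is a stand-in for prn64(); link with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t
geometric_threshold(unsigned lg_sample)
{
	/* Uniform in (0, 1]; avoids log(0). */
	double u = 1.0 - drand48();
	/* Per-byte sampling probability p = 2^-lg_sample. */
	double p = 1.0 / (double)((uint64_t)1 << lg_sample);

	/* Inverse CDF of the geometric distribution, shifted to >= 1. */
	return ((uint64_t)(log(u) / log(1.0 - p)) + 1);
}

int
main(void)
{
	unsigned lg_sample = 19;	/* Mean interval 2^19 == 524288 bytes. */
	uint64_t sum = 0;
	int i, n = 100000;

	for (i = 0; i < n; i++)
		sum += geometric_threshold(lg_sample);
	/* Empirical mean should land near 524288. */
	printf("mean threshold: %.1f\n", (double)sum / n);
	return (0);
}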
@@ -573,6 +650,7 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
		mb_write();
		/*********/
+	}
 }
 
 void
 prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
@@ -580,20 +658,23 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
 {
	size_t size = isalloc(ptr);
 
+	if (ptr != NULL) {
		prof_cnt_set(ptr, cnt);
+		prof_sample_accum_update(size);
+	}
 
-	if (old_cnt != NULL)
+	if ((uintptr_t)old_cnt > (uintptr_t)1U)
		old_cnt->epoch++;
-	if (cnt != NULL)
+	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write();
	/*********/
-	if (old_cnt != NULL) {
+	if ((uintptr_t)old_cnt > (uintptr_t)1U) {
		old_cnt->cnts.curobjs--;
		old_cnt->cnts.curbytes -= old_size;
	}
-	if (cnt != NULL) {
+	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		cnt->cnts.accumobjs++;
@@ -602,9 +683,9 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
	/*********/
	mb_write();
	/*********/
-	if (old_cnt != NULL)
+	if ((uintptr_t)old_cnt > (uintptr_t)1U)
		old_cnt->epoch++;
-	if (cnt != NULL)
+	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
@@ -614,6 +695,8 @@ void
 prof_free(const void *ptr)
 {
	prof_thr_cnt_t *cnt = prof_cnt_get(ptr);
 
+	if ((uintptr_t)cnt > (uintptr_t)1) {
+		size_t size = isalloc(ptr);
 
		cnt->epoch++;
@@ -630,6 +713,7 @@ prof_free(const void *ptr)
		mb_write();
		/*********/
+	}
 }
 
 static void
 prof_flush(void)
@@ -825,7 +909,13 @@ prof_dump(const char *filename, bool leakcheck)
	prof_write(umax2s(cnt_all.accumobjs, 10, buf));
	prof_write(": ");
	prof_write(umax2s(cnt_all.accumbytes, 10, buf));
+	if (opt_lg_prof_sample == 0)
		prof_write("] @ heapprofile\n");
+	else {
+		prof_write("] @ heap_v2/");
+		prof_write(umax2s((uint64_t)1U << opt_lg_prof_sample, 10, buf));
+		prof_write("\n");
+	}
 
	/* Dump per ctx profile stats. */
	for (tabind = 0; ckh_iter(&bt2ctx, &tabind, (void **)&bt, (void **)&ctx)
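The dump header is what lets pprof-style tooling know whether counts need rescaling: without sampling the profile keeps the original heap-profile marker, and with sampling it advertises the mean interval. Illustrative first lines (the "heap profile: ..." prefix comes from surrounding code not shown in this hunk, the counts are made up, and 524288 corresponds to a hypothetical lg_prof_sample of 19):

heap profile: 3: 768 [5: 1280] @ heapprofile
heap profile: 3: 768 [5: 1280] @ heap_v2/524288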
@@ -1104,6 +1194,14 @@ prof_boot0(void)
	 * initialized, so this function must be executed early.
	 */
 
+	if (opt_lg_prof_sample > 0) {
+		/*
+		 * Disable leak checking, since not all allocations will be
+		 * sampled.
+		 */
+		opt_prof_leak = false;
+	}
+
	if (opt_prof_leak && opt_prof == false) {
		/*
		 * Enable opt_prof, but in such a way that profiles are never
@@ -540,13 +540,18 @@ stats_print(void (*write4)(void *, const char *, const char *, const char *,
			    tcache_nslots && ssv >= 0 ? umax2s(tcache_gc_sweep,
			    10, s) : "N/A", "\n", "");
		}
-		if ((err = JEMALLOC_P(mallctl)("opt.lg_prof_bt_max", &sv, &ssz,
-		    NULL, 0)) == 0) {
+		if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
+		    == 0 && bv) {
+			xmallctl("opt.lg_prof_bt_max", &sv, &ssz, NULL, 0);
			write4(w4opaque, "Maximum profile backtrace depth: ",
			    umax2s((1U << sv), 10, s), "\n", "");
		}
-		if ((err = JEMALLOC_P(mallctl)("opt.lg_prof_interval", &sv,
-		    &ssz, NULL, 0)) == 0) {
 
+			xmallctl("opt.lg_prof_sample", &sv, &ssz, NULL, 0);
+			write4(w4opaque, "Average profile sample interval: ",
+			    umax2s((1U << sv), 10, s), "", "");
+			write4(w4opaque, " (2^", umax2s(sv, 10, s), ")\n", "");
+
+			xmallctl("opt.lg_prof_interval", &sv, &ssz, NULL, 0);
			write4(w4opaque, "Average profile dump interval: ",
			    umax2s((1U << sv), 10, s), "", "");
			write4(w4opaque, " (2^", umax2s(sv, 10, s), ")\n", "");
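For reference, with the code above and a hypothetical lg_prof_sample of 19, the profiling block of stats_print() would emit lines along these illustrative lines:

Maximum profile backtrace depth: 4
Average profile sample interval: 524288 (2^19)
Average profile dump interval: 1073741824 (2^30)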