Implement sampling for heap profiling.

Jason Evans 2010-03-01 20:15:26 -08:00
parent f3ff75289b
commit b9477e782b
9 changed files with 202 additions and 60 deletions

View File

@@ -647,6 +647,7 @@ if test "x$enable_tls" = "x0" ; then
enable_prof="0"
fi
if test "x$enable_prof" = "x1" ; then
LIBS="$LIBS -lm"
AC_DEFINE([JEMALLOC_PROF], [ ])
if test "x$enable_prof_libunwind" = "x1" ; then
AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])

View File

@@ -38,7 +38,7 @@
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
.\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $
.\"
.Dd February 11, 2010
.Dd March 1, 2010
.Dt JEMALLOC 3
.Os
.Sh NAME
@@ -355,6 +355,9 @@ will disable dirty page purging.
@roff_prof@.Dq B
@roff_prof@option for backtrace depth control.
@roff_prof@See the
@roff_prof@.Dq S
@roff_prof@option for probabilistic sampling control.
@roff_prof@See the
@roff_prof@.Dq I
@roff_prof@option for information on interval-triggered profile dumping, and the
@roff_prof@.Dq U
@@ -464,6 +467,15 @@ Double/halve the size of the maximum size class that is a multiple of the
quantum (8 or 16 bytes, depending on architecture).
Above this size, cacheline spacing is used for size classes.
The default value is 128 bytes.
@roff_prof@.It S
@roff_prof@Double/halve the average interval between allocation samples, as
@roff_prof@measured in bytes of allocation activity.
@roff_prof@Increasing the sampling interval decreases profile fidelity, but
@roff_prof@also decreases the computational overhead.
@roff_prof@The default sample interval is one (i.e. all allocations are
@roff_prof@sampled).
@roff_prof@A sample interval greater than one implicitly disables leak
@roff_prof@reporting.
@roff_prof@.It U
@roff_prof@Trigger a memory profile dump every time the total virtual memory
@roff_prof@exceeds the previous maximum.
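As a rough sketch of what the S option above implies (illustrative, not part of this commit): each S in the option string doubles opt_lg_prof_sample, so the average number of allocated bytes between samples is 1 << opt_lg_prof_sample. The standalone program below mirrors the case 's'/'S' handling added to jemalloc.c further down; the 19-character option string is a made-up example.

/* Illustrative only: mimic the s/S option handling and report the
 * resulting mean sample interval. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const char *opts = "SSSSSSSSSSSSSSSSSSS";  /* 19 doublings (hypothetical). */
    size_t lg_prof_sample = 0;  /* Default: every allocation is sampled. */
    const char *p;

    for (p = opts; *p != '\0'; p++) {
        if (*p == 'S' && lg_prof_sample + 1 < (sizeof(uint64_t) << 3))
            lg_prof_sample++;
        else if (*p == 's' && lg_prof_sample > 0)
            lg_prof_sample--;
    }
    /* 2^19 == 524288: roughly one sampled allocation per 512 KiB. */
    printf("average bytes between samples: %llu\n",
        (unsigned long long)((uint64_t)1U << lg_prof_sample));
    return (0);
}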

View File

@@ -25,7 +25,7 @@
* uint32_t state : Seed value.
* const uint32_t a, c : See above discussion.
*/
#define prn(r, lg_range, state, a, c) do { \
#define prn32(r, lg_range, state, a, c) do { \
assert(lg_range > 0); \
assert(lg_range <= 32); \
\
@@ -34,6 +34,16 @@
r >>= (32 - lg_range); \
} while (false)
/* Same as prn32(), but 64 bits of pseudo-randomness, using uint64_t. */
#define prn64(r, lg_range, state, a, c) do { \
assert(lg_range > 0); \
assert(lg_range <= 64); \
\
r = (state * (a)) + (c); \
state = r; \
r >>= (64 - lg_range); \
} while (false)
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
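A brief usage sketch (not from the tree): prn64() advances the caller-supplied LCG state and hands back the top lg_range bits of the new state, which is why prof.c below can ask for exactly 53 bits of pseudo-randomness. The macro body is repeated here so the example stands alone; the multiplier/increment are generic demo constants, not the ones used in prof.c.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Copy of the prn64() shape above, so this sketch compiles on its own. */
#define prn64(r, lg_range, state, a, c) do {    \
    assert(lg_range > 0);                       \
    assert(lg_range <= 64);                     \
                                                \
    r = (state * (a)) + (c);                    \
    state = r;                                  \
    r >>= (64 - lg_range);                      \
} while (false)

int
main(void)
{
    uint64_t state = 42;  /* Per-thread seed in real usage. */
    uint64_t r;
    unsigned i;

    for (i = 0; i < 4; i++) {
        /* Draw 53 bits per call, as prof_sample_threshold_update() does. */
        prn64(r, 53, state, (uint64_t)6364136223846793005ULL,
            (uint64_t)1442695040888963407ULL);
        printf("%llu\n", (unsigned long long)r);
    }
    return (0);
}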

View File

@@ -8,6 +8,9 @@ typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_s prof_t;
/* Option defaults. */
#define LG_PROF_BT_MAX_DEFAULT 2
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT 30
/*
@@ -16,7 +19,7 @@ typedef struct prof_s prof_t;
* a hard-coded number of backtrace frame handlers, so increasing
* LG_PROF_BT_MAX requires changing prof_backtrace().
*/
#define LG_PROF_BT_MAX 7
#define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
#define PROF_BT_MAX (1U << LG_PROF_BT_MAX)
/* Initial hash table size. */
@@ -117,7 +120,8 @@ struct prof_ctx_s {
extern bool opt_prof;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_interval;
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern size_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_udump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
@@ -133,7 +137,7 @@ extern uint64_t prof_interval;
bool prof_init(prof_t *prof, bool master);
void prof_destroy(prof_t *prof);
prof_thr_cnt_t *prof_alloc_prep(void);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_thr_cnt_t *prof_cnt_get(const void *ptr);
void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,

View File

@@ -100,7 +100,7 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key,
* Cycle through the cells in the bucket, starting at a random position.
* The randomness avoids worst-case search overhead as buckets fill up.
*/
prn(offset, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
prn32(offset, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) {
cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) +
((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))];
@@ -142,7 +142,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
* were an item for which both hashes indicated the same
* bucket.
*/
prn(i, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
prn32(i, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C);
cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i];
assert(cell->key != NULL);

View File

@@ -69,6 +69,7 @@ CTL_PROTO(opt_lg_tcache_gc_sweep)
#ifdef JEMALLOC_PROF
CTL_PROTO(opt_prof)
CTL_PROTO(opt_lg_prof_bt_max)
CTL_PROTO(opt_lg_prof_sample)
CTL_PROTO(opt_lg_prof_interval)
CTL_PROTO(opt_prof_udump)
CTL_PROTO(opt_prof_leak)
@@ -234,6 +235,7 @@ static const ctl_node_t opt_node[] = {
#ifdef JEMALLOC_PROF
{NAME("prof"), CTL(opt_prof)},
{NAME("lg_prof_bt_max"), CTL(opt_lg_prof_bt_max)},
{NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)},
{NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)},
{NAME("prof_udump"), CTL(opt_prof_udump)},
{NAME("prof_leak"), CTL(opt_prof_leak)},
@@ -1066,6 +1068,7 @@ CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t)
#ifdef JEMALLOC_PROF
CTL_RO_GEN(opt_prof, opt_prof, bool)
CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t)
CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, size_t)
CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool)
CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)

View File

@@ -582,6 +582,15 @@ MALLOC_OUT:
opt_lg_qspace_max++;
break;
#ifdef JEMALLOC_PROF
case 's':
if (opt_lg_prof_sample > 0)
opt_lg_prof_sample--;
break;
case 'S':
if (opt_lg_prof_sample + 1 <
(sizeof(uint64_t) << 3))
opt_lg_prof_sample++;
break;
case 'u':
opt_prof_udump = false;
break;
@@ -870,7 +879,7 @@ JEMALLOC_P(malloc)(size_t size)
}
#ifdef JEMALLOC_PROF
if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
ret = NULL;
goto OOM;
}
@@ -955,7 +964,7 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
}
#ifdef JEMALLOC_PROF
if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
result = NULL;
ret = EINVAL;
} else
@@ -1030,7 +1039,7 @@ JEMALLOC_P(calloc)(size_t num, size_t size)
}
#ifdef JEMALLOC_PROF
if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
if (opt_prof && (cnt = prof_alloc_prep(num_size)) == NULL) {
ret = NULL;
goto RETURN;
}
@@ -1106,7 +1115,7 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
if (opt_prof) {
old_size = isalloc(ptr);
old_cnt = prof_cnt_get(ptr);
if ((cnt = prof_alloc_prep()) == NULL) {
if ((cnt = prof_alloc_prep(size)) == NULL) {
ret = NULL;
goto OOM;
}
@@ -1144,7 +1153,7 @@ OOM:
ret = NULL;
} else {
#ifdef JEMALLOC_PROF
if (opt_prof && (cnt = prof_alloc_prep()) == NULL) {
if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
ret = NULL;
} else
#endif

View File

@@ -12,11 +12,14 @@
#include <libunwind.h>
#endif
#include <math.h>
/******************************************************************************/
/* Data. */
bool opt_prof = false;
size_t opt_lg_prof_bt_max = 2;
size_t opt_lg_prof_bt_max = LG_PROF_BT_MAX_DEFAULT;
size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
size_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
bool opt_prof_udump = false;
bool opt_prof_leak = false;
@@ -52,6 +55,13 @@ static pthread_key_t bt2cnt_tsd;
/* (1U << opt_lg_prof_bt_max). */
static unsigned prof_bt_max;
static __thread uint64_t prof_sample_prn_state
JEMALLOC_ATTR(tls_model("initial-exec"));
static __thread uint64_t prof_sample_threshold
JEMALLOC_ATTR(tls_model("initial-exec"));
static __thread uint64_t prof_sample_accum
JEMALLOC_ATTR(tls_model("initial-exec"));
static malloc_mutex_t prof_dump_seq_mtx;
static uint64_t prof_dump_seq;
static uint64_t prof_dump_iseq;
@@ -500,15 +510,27 @@ prof_lookup(prof_bt_t *bt)
}
prof_thr_cnt_t *
prof_alloc_prep(void)
prof_alloc_prep(size_t size)
{
prof_thr_cnt_t *ret;
void *vec[prof_bt_max];
prof_bt_t bt;
bt_init(&bt, vec);
prof_backtrace(&bt, 2, prof_bt_max);
ret = prof_lookup(&bt);
/*
* Determine whether to capture a backtrace based on whether size is
* enough for prof_sample_accum to reach prof_sample_threshold. However,
* delay updating these variables until prof_{m,re}alloc(), because we
* don't know for sure that the allocation will succeed.
*
* Use subtraction rather than addition to avoid potential integer
* overflow.
*/
if (size >= prof_sample_threshold - prof_sample_accum) {
bt_init(&bt, vec);
prof_backtrace(&bt, 2, prof_bt_max);
ret = prof_lookup(&bt);
} else
ret = (prof_thr_cnt_t *)(uintptr_t)1U;
return (ret);
}
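One thing worth spelling out (inferred from the consumers below, not stated explicitly in the diff): prof_alloc_prep() now has three possible results. NULL still means the backtrace/lookup machinery failed, a real prof_thr_cnt_t pointer means the allocation is sampled, and the sentinel (prof_thr_cnt_t *)1 means "not sampled", which is why prof_malloc(), prof_realloc(), and prof_free() below test (uintptr_t)cnt > (uintptr_t)1U before touching counters. A standalone sketch of that contract, with the jemalloc internals stubbed out (names ending in _stub are hypothetical):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t curobjs, curbytes; } thr_cnt_stub_t;

static thr_cnt_stub_t real_cnt;  /* Stands in for prof_lookup()'s result. */

/* Mirrors the shape of prof_alloc_prep(): a real counter when the request
 * pushes the accumulator past the threshold, the sentinel 1 otherwise. */
static thr_cnt_stub_t *
alloc_prep_stub(size_t size, uint64_t threshold, uint64_t accum)
{
    if (size >= threshold - accum)
        return (&real_cnt);                      /* Sampled. */
    return ((thr_cnt_stub_t *)(uintptr_t)1U);    /* Not sampled. */
}

int
main(void)
{
    /* With a 4 KiB threshold and 1 KiB already accumulated... */
    thr_cnt_stub_t *small = alloc_prep_stub(512, 4096, 1024);
    thr_cnt_stub_t *big = alloc_prep_stub(8192, 4096, 1024);

    /* ...only the large request gets real counters; consumers simply skip
     * the sentinel rather than treating it as an error. */
    printf("512-byte request sampled: %d\n",
        (uintptr_t)small > (uintptr_t)1U);
    printf("8192-byte request sampled: %d\n",
        (uintptr_t)big > (uintptr_t)1U);
    return (0);
}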
@@ -550,28 +572,84 @@ prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
huge_prof_cnt_set(ptr, cnt);
}
static inline void
prof_sample_threshold_update(void)
{
uint64_t r;
double u;
/*
* Compute prof_sample_threshold as a geometrically distributed random
* variable with mean (2^opt_lg_prof_sample).
*/
prn64(r, 53, prof_sample_prn_state, (uint64_t)1125899906842625LLU,
1058392653243283975);
u = (double)r * (1.0/9007199254740992.0L);
prof_sample_threshold = (uint64_t)(log(u) /
log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+ (uint64_t)1U;
}
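For intuition: this is inverse-transform sampling. With per-byte success probability p = 1/2^opt_lg_prof_sample, the quantity floor(log(u) / log(1 - p)) + 1 for u uniform in (0,1] is geometrically distributed with mean 1/p, so the expected gap between samples is 2^opt_lg_prof_sample bytes of allocation activity. A quick numerical check (illustrative only; drand48() stands in for the prn64()-based 53-bit draw, and lg_sample == 19 is an arbitrary value), which, like the configure change above, needs -lm at link time:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    const unsigned lg_sample = 19;  /* 2^19 == 524288 bytes. */
    const double p = 1.0 / (double)((uint64_t)1U << lg_sample);
    const unsigned ndraws = 1000000;
    double sum = 0.0;
    unsigned i;

    srand48(12345);
    for (i = 0; i < ndraws; i++) {
        double u = 1.0 - drand48();  /* Uniform in (0,1]; avoids log(0). */
        uint64_t threshold = (uint64_t)(log(u) / log(1.0 - p)) + 1;
        sum += (double)threshold;
    }
    printf("empirical mean: %.0f (expect ~%llu)\n", sum / ndraws,
        (unsigned long long)((uint64_t)1U << lg_sample));
    return (0);
}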
static inline void
prof_sample_accum_update(size_t size)
{
if (opt_lg_prof_sample == 0) {
/*
* Don't bother with sampling logic, since sampling interval is
* 1.
*/
return;
}
if (prof_sample_threshold == 0) {
/* Initialize. Seed the prng differently for each thread. */
prof_sample_prn_state = (uint64_t)(uintptr_t)&size;
prof_sample_threshold_update();
}
/* Take care to avoid integer overflow. */
if (size >= prof_sample_threshold - prof_sample_accum) {
prof_sample_accum -= (prof_sample_threshold - size);
/*
* Compute new geometrically distributed prof_sample_threshold.
*/
prof_sample_threshold_update();
while (prof_sample_accum >= prof_sample_threshold) {
prof_sample_accum -= prof_sample_threshold;
prof_sample_threshold_update();
}
} else
prof_sample_accum += size;
}
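The "avoid integer overflow" comment deserves a concrete example: prof_sample_accum is always kept below prof_sample_threshold, so threshold - accum cannot wrap, whereas accum + size can wrap for a huge request and would then spuriously skip a sample. A contrived standalone demonstration:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t threshold = 100, accum = 10;
    uint64_t size = UINT64_MAX - 5;  /* Absurdly large request. */

    /* The addition wraps around and misses the sample... */
    printf("accum + size >= threshold: %d\n", accum + size >= threshold);
    /* ...the subtraction cannot wrap, because accum < threshold holds. */
    printf("size >= threshold - accum: %d\n", size >= threshold - accum);
    return (0);
}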
void
prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
{
size_t size = isalloc(ptr);
prof_cnt_set(ptr, cnt);
assert(ptr != NULL);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
prof_cnt_set(ptr, cnt);
prof_sample_accum_update(size);
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
}
}
void
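The paired epoch increments bracketing the counter updates (with mb_write() in between) are the writer half of a seqlock-style protocol; the dump-side reader is not part of this hunk. Roughly, a reader would take a snapshot as in the sketch below (illustrative only: the struct is abbreviated, plain loads stand in for the volatile/atomic reads a real implementation would use, and C11 fences stand in for the read-side counterpart of mb_write()).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t epoch;
    uint64_t curobjs, curbytes, accumobjs, accumbytes;
} thr_cnt_stub_t;

/* Retry until the copy was taken while the epoch was even and unchanged,
 * i.e. no writer was in the middle of its paired epoch increments. */
static void
snapshot(const thr_cnt_stub_t *cnt, thr_cnt_stub_t *out)
{
    uint64_t start;

    do {
        start = cnt->epoch;
        atomic_thread_fence(memory_order_acquire);
        *out = *cnt;
        atomic_thread_fence(memory_order_acquire);
    } while ((start & 1) != 0 || cnt->epoch != start);
}

int
main(void)
{
    thr_cnt_stub_t cnt = {2, 3, 12288, 7, 28672};
    thr_cnt_stub_t snap;

    snapshot(&cnt, &snap);
    printf("curobjs=%llu curbytes=%llu\n",
        (unsigned long long)snap.curobjs,
        (unsigned long long)snap.curbytes);
    return (0);
}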
@@ -580,20 +658,23 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
{
size_t size = isalloc(ptr);
prof_cnt_set(ptr, cnt);
if (ptr != NULL) {
prof_cnt_set(ptr, cnt);
prof_sample_accum_update(size);
}
if (old_cnt != NULL)
if ((uintptr_t)old_cnt > (uintptr_t)1U)
old_cnt->epoch++;
if (cnt != NULL)
if ((uintptr_t)cnt > (uintptr_t)1U)
cnt->epoch++;
/*********/
mb_write();
/*********/
if (old_cnt != NULL) {
if ((uintptr_t)old_cnt > (uintptr_t)1U) {
old_cnt->cnts.curobjs--;
old_cnt->cnts.curbytes -= old_size;
}
if (cnt != NULL) {
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
@@ -602,9 +683,9 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
/*********/
mb_write();
/*********/
if (old_cnt != NULL)
if ((uintptr_t)old_cnt > (uintptr_t)1U)
old_cnt->epoch++;
if (cnt != NULL)
if ((uintptr_t)cnt > (uintptr_t)1U)
cnt->epoch++;
/*********/
mb_write(); /* Not strictly necessary. */
@@ -614,21 +695,24 @@ void
prof_free(const void *ptr)
{
prof_thr_cnt_t *cnt = prof_cnt_get(ptr);
size_t size = isalloc(ptr);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs--;
cnt->cnts.curbytes -= size;
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
if ((uintptr_t)cnt > (uintptr_t)1) {
size_t size = isalloc(ptr);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs--;
cnt->cnts.curbytes -= size;
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
}
}
static void
@@ -825,7 +909,13 @@ prof_dump(const char *filename, bool leakcheck)
prof_write(umax2s(cnt_all.accumobjs, 10, buf));
prof_write(": ");
prof_write(umax2s(cnt_all.accumbytes, 10, buf));
prof_write("] @ heapprofile\n");
if (opt_lg_prof_sample == 0)
prof_write("] @ heapprofile\n");
else {
prof_write("] @ heap_v2/");
prof_write(umax2s((uint64_t)1U << opt_lg_prof_sample, 10, buf));
prof_write("\n");
}
/* Dump per ctx profile stats. */
for (tabind = 0; ckh_iter(&bt2ctx, &tabind, (void **)&bt, (void **)&ctx)
@@ -1104,6 +1194,14 @@ prof_boot0(void)
* initialized, so this function must be executed early.
*/
if (opt_lg_prof_sample > 0) {
/*
* Disable leak checking, since not all allocations will be
* sampled.
*/
opt_prof_leak = false;
}
if (opt_prof_leak && opt_prof == false) {
/*
* Enable opt_prof, but in such a way that profiles are never

View File

@@ -540,13 +540,18 @@ stats_print(void (*write4)(void *, const char *, const char *, const char *,
tcache_nslots && ssv >= 0 ? umax2s(tcache_gc_sweep,
10, s) : "N/A", "\n", "");
}
if ((err = JEMALLOC_P(mallctl)("opt.lg_prof_bt_max", &sv, &ssz,
NULL, 0)) == 0) {
if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
== 0 && bv) {
xmallctl("opt.lg_prof_bt_max", &sv, &ssz, NULL, 0);
write4(w4opaque, "Maximum profile backtrace depth: ",
umax2s((1U << sv), 10, s), "\n", "");
}
if ((err = JEMALLOC_P(mallctl)("opt.lg_prof_interval", &sv,
&ssz, NULL, 0)) == 0) {
xmallctl("opt.lg_prof_sample", &sv, &ssz, NULL, 0);
write4(w4opaque, "Average profile sample interval: ",
umax2s((1U << sv), 10, s), "", "");
write4(w4opaque, " (2^", umax2s(sv, 10, s), ")\n", "");
xmallctl("opt.lg_prof_interval", &sv, &ssz, NULL, 0);
write4(w4opaque, "Average profile dump interval: ",
umax2s((1U << sv), 10, s), "", "");
write4(w4opaque, " (2^", umax2s(sv, 10, s), ")\n", "");