From b9477e782b07afa38c4b1dc0688e053be8a84dd8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 1 Mar 2010 20:15:26 -0800 Subject: [PATCH] Implement sampling for heap profiling. --- jemalloc/configure.ac | 1 + jemalloc/doc/jemalloc.3.in | 14 +- jemalloc/include/jemalloc/internal/prn.h | 12 +- jemalloc/include/jemalloc/internal/prof.h | 10 +- jemalloc/src/ckh.c | 4 +- jemalloc/src/ctl.c | 3 + jemalloc/src/jemalloc.c | 19 ++- jemalloc/src/prof.c | 184 +++++++++++++++++----- jemalloc/src/stats.c | 15 +- 9 files changed, 202 insertions(+), 60 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 5d038909..918be8b8 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -647,6 +647,7 @@ if test "x$enable_tls" = "x0" ; then enable_prof="0" fi if test "x$enable_prof" = "x1" ; then + LIBS="$LIBS -lm" AC_DEFINE([JEMALLOC_PROF], [ ]) if test "x$enable_prof_libunwind" = "x1" ; then AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 718ad84d..6e55ca07 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,7 +38,7 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd February 11, 2010 +.Dd March 1, 2010 .Dt JEMALLOC 3 .Os .Sh NAME @@ -355,6 +355,9 @@ will disable dirty page purging. @roff_prof@.Dq B @roff_prof@option for backtrace depth control. @roff_prof@See the +@roff_prof@.Dq S +@roff_prof@option for probabilistic sampling control. +@roff_prof@See the @roff_prof@.Dq I @roff_prof@option for information on interval-triggered profile dumping, and the @roff_prof@.Dq U @@ -464,6 +467,15 @@ Double/halve the size of the maximum size class that is a multiple of the quantum (8 or 16 bytes, depending on architecture). Above this size, cacheline spacing is used for size classes. The default value is 128 bytes. +@roff_prof@.It S +@roff_prof@Double/halve the average interval between allocation samples, as +@roff_prof@measured in bytes of allocation activity. +@roff_prof@Increasing the sampling interval decreases profile fidelity, but +@roff_prof@also decreases the computational overhead. +@roff_prof@The default sample interval is one (i.e. all allocations are +@roff_prof@sampled). +@roff_prof@A sample interval greater than one implicitly disables leak +@roff_prof@reporting. @roff_prof@.It U @roff_prof@Trigger a memory profile dump every time the total virtual memory @roff_prof@exceeds the previous maximum. diff --git a/jemalloc/include/jemalloc/internal/prn.h b/jemalloc/include/jemalloc/internal/prn.h index 502733c5..0709d708 100644 --- a/jemalloc/include/jemalloc/internal/prn.h +++ b/jemalloc/include/jemalloc/internal/prn.h @@ -25,7 +25,7 @@ * uint32_t state : Seed value. * const uint32_t a, c : See above discussion. */ -#define prn(r, lg_range, state, a, c) do { \ +#define prn32(r, lg_range, state, a, c) do { \ assert(lg_range > 0); \ assert(lg_range <= 32); \ \ @@ -34,6 +34,16 @@ r >>= (32 - lg_range); \ } while (false) +/* Same as prn32(), but 64 bits of pseudo-randomness, using uint64_t. 
*/ +#define prn64(r, lg_range, state, a, c) do { \ + assert(lg_range > 0); \ + assert(lg_range <= 64); \ + \ + r = (state * (a)) + (c); \ + state = r; \ + r >>= (64 - lg_range); \ +} while (false) + #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 44e11cbf..364ac0a2 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -8,6 +8,9 @@ typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; typedef struct prof_s prof_t; +/* Option defaults. */ +#define LG_PROF_BT_MAX_DEFAULT 2 +#define LG_PROF_SAMPLE_DEFAULT 0 #define LG_PROF_INTERVAL_DEFAULT 30 /* @@ -16,7 +19,7 @@ typedef struct prof_s prof_t; * a hard-coded number of backtrace frame handlers, so increasing * LG_PROF_BT_MAX requires changing prof_backtrace(). */ -#define LG_PROF_BT_MAX 7 +#define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ #define PROF_BT_MAX (1U << LG_PROF_BT_MAX) /* Initial hash table size. */ @@ -117,7 +120,8 @@ struct prof_ctx_s { extern bool opt_prof; extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ -extern size_t opt_lg_prof_interval; +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ +extern size_t opt_lg_prof_interval; /* lg(prof_interval). */ extern bool opt_prof_udump; /* High-water memory dumping. */ extern bool opt_prof_leak; /* Dump leak summary at exit. */ @@ -133,7 +137,7 @@ extern uint64_t prof_interval; bool prof_init(prof_t *prof, bool master); void prof_destroy(prof_t *prof); -prof_thr_cnt_t *prof_alloc_prep(void); +prof_thr_cnt_t *prof_alloc_prep(size_t size); prof_thr_cnt_t *prof_cnt_get(const void *ptr); void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index fd234a44..a0c4162a 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -100,7 +100,7 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, * Cycle through the cells in the bucket, starting at a random position. * The randomness avoids worst-case search overhead as buckets fill up. */ - prn(offset, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C); + prn32(offset, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C); for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))]; @@ -142,7 +142,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, * were an item for which both hashes indicated the same * bucket. 
*/ - prn(i, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C); + prn32(i, LG_CKH_BUCKET_CELLS, ckh->prn_state, CKH_A, CKH_C); cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; assert(cell->key != NULL); diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 55ad2a7c..36411e07 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -69,6 +69,7 @@ CTL_PROTO(opt_lg_tcache_gc_sweep) #ifdef JEMALLOC_PROF CTL_PROTO(opt_prof) CTL_PROTO(opt_lg_prof_bt_max) +CTL_PROTO(opt_lg_prof_sample) CTL_PROTO(opt_lg_prof_interval) CTL_PROTO(opt_prof_udump) CTL_PROTO(opt_prof_leak) @@ -234,6 +235,7 @@ static const ctl_node_t opt_node[] = { #ifdef JEMALLOC_PROF {NAME("prof"), CTL(opt_prof)}, {NAME("lg_prof_bt_max"), CTL(opt_lg_prof_bt_max)}, + {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, {NAME("prof_udump"), CTL(opt_prof_udump)}, {NAME("prof_leak"), CTL(opt_prof_leak)}, @@ -1066,6 +1068,7 @@ CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t) #ifdef JEMALLOC_PROF CTL_RO_GEN(opt_prof, opt_prof, bool) CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t) +CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t) CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, size_t) CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool) CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 22401d16..4d51e1a3 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -582,6 +582,15 @@ MALLOC_OUT: opt_lg_qspace_max++; break; #ifdef JEMALLOC_PROF + case 's': + if (opt_lg_prof_sample > 0) + opt_lg_prof_sample--; + break; + case 'S': + if (opt_lg_prof_sample + 1 < + (sizeof(uint64_t) << 3)) + opt_lg_prof_sample++; + break; case 'u': opt_prof_udump = false; break; @@ -870,7 +879,7 @@ JEMALLOC_P(malloc)(size_t size) } #ifdef JEMALLOC_PROF - if (opt_prof && (cnt = prof_alloc_prep()) == NULL) { + if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) { ret = NULL; goto OOM; } @@ -955,7 +964,7 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) } #ifdef JEMALLOC_PROF - if (opt_prof && (cnt = prof_alloc_prep()) == NULL) { + if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) { result = NULL; ret = EINVAL; } else @@ -1030,7 +1039,7 @@ JEMALLOC_P(calloc)(size_t num, size_t size) } #ifdef JEMALLOC_PROF - if (opt_prof && (cnt = prof_alloc_prep()) == NULL) { + if (opt_prof && (cnt = prof_alloc_prep(num_size)) == NULL) { ret = NULL; goto RETURN; } @@ -1106,7 +1115,7 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) if (opt_prof) { old_size = isalloc(ptr); old_cnt = prof_cnt_get(ptr); - if ((cnt = prof_alloc_prep()) == NULL) { + if ((cnt = prof_alloc_prep(size)) == NULL) { ret = NULL; goto OOM; } @@ -1144,7 +1153,7 @@ OOM: ret = NULL; } else { #ifdef JEMALLOC_PROF - if (opt_prof && (cnt = prof_alloc_prep()) == NULL) { + if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) { ret = NULL; } else #endif diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index d1bb4d02..d44084fb 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -12,11 +12,14 @@ #include #endif +#include + /******************************************************************************/ /* Data. 
*/ bool opt_prof = false; -size_t opt_lg_prof_bt_max = 2; +size_t opt_lg_prof_bt_max = LG_PROF_BT_MAX_DEFAULT; +size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; size_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; bool opt_prof_udump = false; bool opt_prof_leak = false; @@ -52,6 +55,13 @@ static pthread_key_t bt2cnt_tsd; /* (1U << opt_lg_prof_bt_max). */ static unsigned prof_bt_max; +static __thread uint64_t prof_sample_prn_state + JEMALLOC_ATTR(tls_model("initial-exec")); +static __thread uint64_t prof_sample_threshold + JEMALLOC_ATTR(tls_model("initial-exec")); +static __thread uint64_t prof_sample_accum + JEMALLOC_ATTR(tls_model("initial-exec")); + static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; static uint64_t prof_dump_iseq; @@ -500,15 +510,27 @@ prof_lookup(prof_bt_t *bt) } prof_thr_cnt_t * -prof_alloc_prep(void) +prof_alloc_prep(size_t size) { prof_thr_cnt_t *ret; void *vec[prof_bt_max]; prof_bt_t bt; - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); + /* + * Determine whether to capture a backtrace based on whether size is + * enough for prof_accum to reach prof_sample_threshold. However, + * delay updating these variables until prof_{m,re}alloc(), because we + * don't know for sure that the allocation will succeed. + * + * Use subtraction rather than addition to avoid potential integer + * overflow. + */ + if (size >= prof_sample_threshold - prof_sample_accum) { + bt_init(&bt, vec); + prof_backtrace(&bt, 2, prof_bt_max); + ret = prof_lookup(&bt); + } else + ret = (prof_thr_cnt_t *)(uintptr_t)1U; return (ret); } @@ -550,28 +572,84 @@ prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt) huge_prof_cnt_set(ptr, cnt); } +static inline void +prof_sample_threshold_update(void) +{ + uint64_t r; + double u; + + /* + * Compute prof_sample_threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + */ + prn64(r, 53, prof_sample_prn_state, (uint64_t)1125899906842625LLU, + 1058392653243283975); + u = (double)r * (1.0/9007199254740992.0L); + prof_sample_threshold = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +} + +static inline void +prof_sample_accum_update(size_t size) +{ + + if (opt_lg_prof_sample == 0) { + /* + * Don't bother with sampling logic, since sampling interval is + * 1. + */ + return; + } + + if (prof_sample_threshold == 0) { + /* Initialize. Seed the prng differently for each thread. */ + prof_sample_prn_state = (uint64_t)(uintptr_t)&size; + prof_sample_threshold_update(); + } + + /* Take care to avoid integer overflow. */ + if (size >= prof_sample_threshold - prof_sample_accum) { + prof_sample_accum -= (prof_sample_threshold - size); + /* + * Compute new geometrically distributed prof_sample_threshold. 
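+		 * A single large allocation can overshoot one or more
+		 * subsequent thresholds; the loop below merely consumes them
+		 * without capturing additional backtraces.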
+ */ + prof_sample_threshold_update(); + while (prof_sample_accum >= prof_sample_threshold) { + prof_sample_accum -= prof_sample_threshold; + prof_sample_threshold_update(); + } + } else + prof_sample_accum += size; +} + void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt) { size_t size = isalloc(ptr); - prof_cnt_set(ptr, cnt); + assert(ptr != NULL); - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ + prof_cnt_set(ptr, cnt); + prof_sample_accum_update(size); + + if ((uintptr_t)cnt > (uintptr_t)1U) { + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + } } void @@ -580,20 +658,23 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, { size_t size = isalloc(ptr); - prof_cnt_set(ptr, cnt); + if (ptr != NULL) { + prof_cnt_set(ptr, cnt); + prof_sample_accum_update(size); + } - if (old_cnt != NULL) + if ((uintptr_t)old_cnt > (uintptr_t)1U) old_cnt->epoch++; - if (cnt != NULL) + if ((uintptr_t)cnt > (uintptr_t)1U) cnt->epoch++; /*********/ mb_write(); /*********/ - if (old_cnt != NULL) { + if ((uintptr_t)old_cnt > (uintptr_t)1U) { old_cnt->cnts.curobjs--; old_cnt->cnts.curbytes -= old_size; } - if (cnt != NULL) { + if ((uintptr_t)cnt > (uintptr_t)1U) { cnt->cnts.curobjs++; cnt->cnts.curbytes += size; cnt->cnts.accumobjs++; @@ -602,9 +683,9 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, /*********/ mb_write(); /*********/ - if (old_cnt != NULL) + if ((uintptr_t)old_cnt > (uintptr_t)1U) old_cnt->epoch++; - if (cnt != NULL) + if ((uintptr_t)cnt > (uintptr_t)1U) cnt->epoch++; /*********/ mb_write(); /* Not strictly necessary. */ @@ -614,21 +695,24 @@ void prof_free(const void *ptr) { prof_thr_cnt_t *cnt = prof_cnt_get(ptr); - size_t size = isalloc(ptr); - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs--; - cnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ + if ((uintptr_t)cnt > (uintptr_t)1) { + size_t size = isalloc(ptr); + + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs--; + cnt->cnts.curbytes -= size; + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + } } static void @@ -825,7 +909,13 @@ prof_dump(const char *filename, bool leakcheck) prof_write(umax2s(cnt_all.accumobjs, 10, buf)); prof_write(": "); prof_write(umax2s(cnt_all.accumbytes, 10, buf)); - prof_write("] @ heapprofile\n"); + if (opt_lg_prof_sample == 0) + prof_write("] @ heapprofile\n"); + else { + prof_write("] @ heap_v2/"); + prof_write(umax2s((uint64_t)1U << opt_lg_prof_sample, 10, buf)); + prof_write("\n"); + } /* Dump per ctx profile stats. */ for (tabind = 0; ckh_iter(&bt2ctx, &tabind, (void **)&bt, (void **)&ctx) @@ -1104,6 +1194,14 @@ prof_boot0(void) * initialized, so this function must be executed early. */ + if (opt_lg_prof_sample > 0) { + /* + * Disable leak checking, since not all allocations will be + * sampled. 
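+		 * A leak summary computed from only the sampled subset of
+		 * objects would under-report leaked memory and be misleading.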
+ */ + opt_prof_leak = false; + } + if (opt_prof_leak && opt_prof == false) { /* * Enable opt_prof, but in such a way that profiles are never diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 7c6d8c93..f8c1731e 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -540,13 +540,18 @@ stats_print(void (*write4)(void *, const char *, const char *, const char *, tcache_nslots && ssv >= 0 ? umax2s(tcache_gc_sweep, 10, s) : "N/A", "\n", ""); } - if ((err = JEMALLOC_P(mallctl)("opt.lg_prof_bt_max", &sv, &ssz, - NULL, 0)) == 0) { + if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0)) + == 0 && bv) { + xmallctl("opt.lg_prof_bt_max", &sv, &ssz, NULL, 0); write4(w4opaque, "Maximum profile backtrace depth: ", umax2s((1U << sv), 10, s), "\n", ""); - } - if ((err = JEMALLOC_P(mallctl)("opt.lg_prof_interval", &sv, - &ssz, NULL, 0)) == 0) { + + xmallctl("opt.lg_prof_sample", &sv, &ssz, NULL, 0); + write4(w4opaque, "Average profile sample interval: ", + umax2s((1U << sv), 10, s), "", ""); + write4(w4opaque, " (2^", umax2s(sv, 10, s), ")\n", ""); + + xmallctl("opt.lg_prof_interval", &sv, &ssz, NULL, 0); write4(w4opaque, "Average profile dump interval: ", umax2s((1U << sv), 10, s), "", ""); write4(w4opaque, " (2^", umax2s(sv, 10, s), ")\n", "");