Make cumulative heap profile data optional.

Add the R option to control whether cumulative heap profile data
are maintained.  Add the T option to control the size of per thread
backtrace caches, primarily because when the R option is disabled,
backtraces that no longer have allocations associated with them are
discarded as soon as no thread caches refer to them.
Jason Evans 2010-10-02 15:18:50 -07:00
parent 4d5c09905e
commit a881cd2c61
8 changed files with 328 additions and 124 deletions
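
For illustration only (not part of this commit): with the option parsing added below, cumulative counts can be disabled and each per thread backtrace cache capped at 1024 entries by setting JEMALLOC_OPTIONS to "r11T". A minimal sketch, assuming options are parsed lazily at the first allocation; exporting the variable from the shell before the program starts avoids any ordering concerns.

#include <stdlib.h>

int
main(void)
{
	void *p;

	/*
	 * 'r' disables cumulative counts (opt_prof_accum = false); eleven
	 * 'T's raise opt_lg_prof_tcmax from -1 to 10, i.e. a 1024-entry
	 * per thread backtrace cache.
	 */
	setenv("JEMALLOC_OPTIONS", "r11T", 1);

	p = malloc(64);		/* First allocation: options take effect. */
	free(p);
	return (0);
}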

View File

@@ -484,6 +484,12 @@ will disable dirty page purging.
@roff_prof@.Dq S
@roff_prof@option for probabilistic sampling control.
@roff_prof@See the
@roff_prof@.Dq R
@roff_prof@option for control of cumulative sample reporting.
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option for control of per thread backtrace caching.
@roff_prof@See the
@roff_prof@.Dq I
@roff_prof@option for information on interval-triggered profile dumping, and the
@roff_prof@.Dq U
@@ -595,6 +601,18 @@ Double/halve the size of the maximum size class that is a multiple of the
quantum (8 or 16 bytes, depending on architecture).
Above this size, cacheline spacing is used for size classes.
The default value is 128 bytes.
@roff_prof@.It R
@roff_prof@Enable/disable reporting of cumulative object/byte counts in profile
@roff_prof@dumps.
@roff_prof@If this option is enabled, every unique backtrace must be stored for
@roff_prof@the duration of execution.
@roff_prof@Depending on the application, this can impose a large memory
@roff_prof@overhead, and the cumulative counts are not always of interest.
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option for control of per thread backtrace caching, which interacts
@roff_prof@with this option: unused backtraces can only be discarded when this
@roff_prof@option is disabled.
@roff_prof@This option is enabled by default.
@roff_prof@.It S
@roff_prof@Double/halve the average interval between allocation samples, as
@roff_prof@measured in bytes of allocation activity.
@@ -602,6 +620,22 @@ The default value is 128 bytes.
@roff_prof@also decreases the computational overhead.
@roff_prof@The default sample interval is one (i.e. all allocations are
@roff_prof@sampled).
@roff_prof@.It T
@roff_prof@Double/halve the maximum per thread backtrace cache used for heap
@roff_prof@profiling.
@roff_prof@A backtrace can only be discarded if the
@roff_prof@.Dq R
@roff_prof@option is disabled, and no thread caches currently refer to the
@roff_prof@backtrace.
@roff_prof@Therefore, a backtrace cache limit should be imposed if the
@roff_prof@intention is to limit how much memory is used by backtraces.
@roff_prof@By default, no limit is imposed.
@roff_prof@This is internally encoded as (1 << -1), and each
@roff_prof@.Dq T
@roff_prof@that is specified increments the shift amount.
@roff_prof@Therefore, e.g.
@roff_prof@.Ev JEMALLOC_OPTIONS=11T
@roff_prof@specifies a backtrace cache limit of 1024 backtraces.
@roff_prof@.It U
@roff_prof@Trigger a memory profile dump every time the total virtual memory
@roff_prof@exceeds the previous maximum.
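
A small worked example (not part of the man page or the commit) of the encoding described above for the T option: the shift starts at -1, each T adds one, and the resulting limit is 1 << shift. The helper name is invented for illustration.

#include <stdio.h>
#include <sys/types.h>

/* Hypothetical helper: number of 'T' flags --> lg of the cache limit. */
static ssize_t
lg_tcmax_from_flags(unsigned nflags)
{

	return ((ssize_t)nflags - 1);	/* Zero flags yields -1: no limit. */
}

int
main(void)
{
	ssize_t lg = lg_tcmax_from_flags(11);

	if (lg >= 0) {
		/* Prints: 11T -> lg 10 -> 1024 backtraces */
		printf("11T -> lg %zd -> %zu backtraces\n", lg,
		    (size_t)1 << lg);
	} else
		printf("no limit\n");
	return (0);
}
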
@@ -992,6 +1026,27 @@ option.
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.prof_accum (bool) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq R
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq S
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the

View File

@@ -24,6 +24,7 @@ extern bool isthreaded;
#endif
bool malloc_mutex_init(malloc_mutex_t *mutex);
void malloc_mutex_destroy(malloc_mutex_t *mutex);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/

View File

@@ -6,12 +6,13 @@ typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_s prof_t;
typedef struct prof_tcache_s prof_tcache_t;
/* Option defaults. */
#define LG_PROF_BT_MAX_DEFAULT 2
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1
#define LG_PROF_TCMAX_DEFAULT -1
/*
* Hard limit on stack backtrace depth. Note that the version of
@@ -41,9 +42,9 @@ struct prof_bt_s {
#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
prof_bt_t *bt;
unsigned nignore;
unsigned max;
prof_bt_t *bt;
unsigned nignore;
unsigned max;
} prof_unwind_data_t;
#endif
@@ -51,11 +52,11 @@ struct prof_cnt_s {
/*
* Profiling counters. An allocation/deallocation pair can operate on
* different prof_thr_cnt_t objects that are linked into the same
* prof_ctx_t sets_ql, so it is possible for the cur* counters to go
* prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
* negative. In principle it is possible for the *bytes counters to
* overflow/underflow, but a general solution would require some form
* of 128-bit counter solution; this implementation doesn't bother to
* solve that problem.
* overflow/underflow, but a general solution would require something
* like 128-bit counters; this implementation doesn't bother to solve
* that problem.
*/
int64_t curobjs;
int64_t curbytes;
@@ -64,15 +65,18 @@ struct prof_cnt_s {
};
struct prof_thr_cnt_s {
/* Linkage into prof_ctx_t's sets_ql. */
ql_elm(prof_thr_cnt_t) link;
/* Linkage into prof_ctx_t's cnts_ql. */
ql_elm(prof_thr_cnt_t) cnts_link;
/* Linkage into thread's LRU. */
ql_elm(prof_thr_cnt_t) lru_link;
/*
* Associated context. If a thread frees an object that it did not
* allocate, it is possible that the context is not cached in the
* thread's hash table, in which case it must be able to look up the
* context, insert a new prof_thr_cnt_t into the thread's hash table,
* and link it into the prof_ctx_t's sets_ql.
* and link it into the prof_ctx_t's cnts_ql.
*/
prof_ctx_t *ctx;
@@ -101,11 +105,11 @@ struct prof_ctx_s {
/* Associated backtrace. */
prof_bt_t *bt;
/* Protects cnt_merged and sets_ql. */
/* Protects cnt_merged and cnts_ql. */
malloc_mutex_t lock;
/* Temporary storage for aggregation during dump. */
prof_cnt_t cnt_dump;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* When threads exit, they merge their stats into cnt_merged. */
prof_cnt_t cnt_merged;
@@ -117,6 +121,24 @@ struct prof_ctx_s {
ql_head(prof_thr_cnt_t) cnts_ql;
};
/*
* Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread
* keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no others
* will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
* into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
* objects.
*/
struct prof_tcache_s {
/* (prof_bt_t *)-->(prof_thr_cnt_t *). */
ckh_t bt2cnt;
/* LRU for contents of bt2cnt. */
ql_head(prof_thr_cnt_t) lru_ql;
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
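
A minimal, self-contained sketch of the policy the prof_tcache_s comment above implies: a bounded per thread cache of backtrace counters with LRU eviction, where an evicted counter is first merged into its global context. All names and the array-based cache are stand-ins invented for illustration; jemalloc itself uses a ckh cuckoo hash plus the intrusive lru_ql list.

#include <stdio.h>
#include <string.h>

#define CACHE_SLOTS 4	/* Stands in for (1 << opt_lg_prof_tcmax). */

typedef struct {
	const char *bt;		/* Stand-in for a prof_bt_t key. */
	long curobjs;		/* Stand-in for prof_cnt_t. */
	unsigned long stamp;	/* Larger == more recently used. */
} slot_t;

static slot_t cache[CACHE_SLOTS];
static unsigned long lru_clock;

/* Stand-in for merging an evicted counter into its global prof_ctx_t. */
static void
merge_to_ctx(slot_t *s)
{

	printf("merged %s: curobjs=%ld\n", s->bt, s->curobjs);
}

static slot_t *
cache_lookup(const char *bt)
{
	slot_t *victim = &cache[0];
	int i;

	for (i = 0; i < CACHE_SLOTS; i++) {
		if (cache[i].bt != NULL && strcmp(cache[i].bt, bt) == 0) {
			cache[i].stamp = ++lru_clock;	/* LRU touch. */
			return (&cache[i]);
		}
		if (cache[i].stamp < victim->stamp)
			victim = &cache[i];
	}
	/* Miss: recycle the least recently used slot, merging it first. */
	if (victim->bt != NULL)
		merge_to_ctx(victim);
	victim->bt = bt;
	victim->curobjs = 0;
	victim->stamp = ++lru_clock;
	return (victim);
}

int
main(void)
{
	const char *bts[] = {"bt#1", "bt#2", "bt#3", "bt#4", "bt#5", "bt#1"};
	unsigned i;

	/* The fifth lookup evicts "bt#1", the least recently used entry. */
	for (i = 0; i < sizeof(bts) / sizeof(bts[0]); i++)
		cache_lookup(bts[i])->curobjs++;
	return (0);
}
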
@@ -129,11 +151,13 @@ extern bool opt_prof;
* to notice state changes.
*/
extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_udump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_udump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread backtrace cache). */
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
@@ -150,9 +174,6 @@ extern uint64_t prof_interval;
*/
extern bool prof_promote;
bool prof_init(prof_t *prof, bool master);
void prof_destroy(prof_t *prof);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);

View File

@@ -82,6 +82,8 @@ CTL_PROTO(opt_lg_prof_sample)
CTL_PROTO(opt_lg_prof_interval)
CTL_PROTO(opt_prof_udump)
CTL_PROTO(opt_prof_leak)
CTL_PROTO(opt_prof_accum)
CTL_PROTO(opt_lg_prof_tcmax)
#endif
CTL_PROTO(opt_stats_print)
CTL_PROTO(opt_lg_qspace_max)
@@ -260,6 +262,8 @@ static const ctl_node_t opt_node[] = {
{NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)},
{NAME("prof_udump"), CTL(opt_prof_udump)},
{NAME("prof_leak"), CTL(opt_prof_leak)},
{NAME("prof_accum"), CTL(opt_prof_accum)},
{NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)},
#endif
{NAME("stats_print"), CTL(opt_stats_print)},
{NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)},
@@ -1207,6 +1211,8 @@ CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool)
CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)
CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool)
CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t)
#endif
CTL_RO_GEN(opt_stats_print, opt_stats_print, bool)
CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t)

View File

@@ -512,6 +512,12 @@ MALLOC_OUT:
opt_lg_qspace_max++;
break;
#ifdef JEMALLOC_PROF
case 'r':
opt_prof_accum = false;
break;
case 'R':
opt_prof_accum = true;
break;
case 's':
if (opt_lg_prof_sample > 0)
opt_lg_prof_sample--;
@@ -521,6 +527,15 @@
(sizeof(uint64_t) << 3))
opt_lg_prof_sample++;
break;
case 't':
if (opt_lg_prof_tcmax >= 0)
opt_lg_prof_tcmax--;
break;
case 'T':
if (opt_lg_prof_tcmax + 1 <
(sizeof(size_t) << 3))
opt_lg_prof_tcmax++;
break;
case 'u':
opt_prof_udump = false;
break;

View File

@@ -72,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex)
return (false);
}
void
malloc_mutex_destroy(malloc_mutex_t *mutex)
{
if (pthread_mutex_destroy(mutex) != 0) {
malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
abort();
}
}

View File

@@ -24,47 +24,41 @@ size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
bool opt_prof_udump = false;
bool opt_prof_leak = false;
bool opt_prof_accum = true;
ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT;
uint64_t prof_interval;
bool prof_promote;
/*
* Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data
* structure that knows about all backtraces ever captured.
* structure that knows about all backtraces currently captured.
*/
static ckh_t bt2ctx;
static malloc_mutex_t bt2ctx_mtx;
/*
* Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread
* keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no others
* will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
* into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
* objects.
*/
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec"));
# define BT2CNT_GET() bt2cnt_tls
# define BT2CNT_SET(v) do { \
bt2cnt_tls = (v); \
pthread_setspecific(bt2cnt_tsd, (void *)(v)); \
static __thread prof_tcache_t *prof_tcache_tls
JEMALLOC_ATTR(tls_model("initial-exec"));
# define PROF_TCACHE_GET() prof_tcache_tls
# define PROF_TCACHE_SET(v) do { \
prof_tcache_tls = (v); \
pthread_setspecific(prof_tcache_tsd, (void *)(v)); \
} while (0)
#else
# define BT2CNT_GET() ((ckh_t *)pthread_getspecific(bt2cnt_tsd))
# define BT2CNT_SET(v) do { \
pthread_setspecific(bt2cnt_tsd, (void *)(v)); \
# define PROF_TCACHE_GET() ((prof_tcache_t *)pthread_getspecific(prof_tcache_tsd))
# define PROF_TCACHE_SET(v) do { \
pthread_setspecific(prof_tcache_tsd, (void *)(v)); \
} while (0)
#endif
/*
* Same contents as b2cnt_tls, but initialized such that the TSD destructor is
* called when a thread exits, so that bt2cnt_tls contents can be merged,
* called when a thread exits, so that prof_tcache_tls contents can be merged,
* unlinked, and deallocated.
*/
static pthread_key_t bt2cnt_tsd;
static pthread_key_t prof_tcache_tsd;
/* (1U << opt_lg_prof_bt_max). */
static unsigned prof_bt_max;
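
A minimal, self-contained sketch (not jemalloc code) of the pthread TSD destructor mechanism the comment above relies on: pthread_key_create() registers a destructor that runs at thread exit for any thread whose key value is non-NULL, which is what lets the per thread cache be merged and freed.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_key_t cache_tsd;

/* Runs automatically at thread exit when the key's value is non-NULL. */
static void
cache_cleanup(void *arg)
{

	printf("cleanup for %p\n", arg);	/* Merge/free would happen here. */
	free(arg);
}

static void *
thread_main(void *arg)
{

	(void)arg;
	pthread_setspecific(cache_tsd, malloc(64));	/* Stand-in cache. */
	return (NULL);
}

int
main(void)
{
	pthread_t thd;

	pthread_key_create(&cache_tsd, cache_cleanup);
	pthread_create(&thd, NULL, thread_main, NULL);
	pthread_join(thd, NULL);
	return (0);
}
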
@@ -137,6 +131,7 @@ static bool enq_udump;
static prof_bt_t *bt_dup(prof_bt_t *bt);
static void bt_init(prof_bt_t *bt, void **vec);
static void bt_destroy(prof_bt_t *bt);
#ifdef JEMALLOC_PROF_LIBGCC
static _Unwind_Reason_Code prof_unwind_init_callback(
struct _Unwind_Context *context, void *arg);
@@ -148,8 +143,10 @@ static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
static bool prof_flush(bool propagate_err);
static bool prof_write(const char *s, bool propagate_err);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
size_t *leak_nctx);
static void prof_ctx_destroy(prof_ctx_t *ctx);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt);
static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt,
bool propagate_err);
static bool prof_dump_maps(bool propagate_err);
@@ -160,7 +157,7 @@ static void prof_fdump(void);
static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1,
size_t *hash2);
static bool prof_bt_keycomp(const void *k1, const void *k2);
static void bt2cnt_thread_cleanup(void *arg);
static void prof_tcache_cleanup(void *arg);
#ifdef NO_TLS
static void prof_sample_state_thread_cleanup(void *arg);
#endif
@@ -175,6 +172,13 @@ bt_init(prof_bt_t *bt, void **vec)
bt->len = 0;
}
static void
bt_destroy(prof_bt_t *bt)
{
idalloc(bt);
}
static prof_bt_t *
bt_dup(prof_bt_t *bt)
{
@@ -487,23 +491,25 @@ prof_lookup(prof_bt_t *bt)
prof_thr_cnt_t *p;
void *v;
} ret;
ckh_t *bt2cnt = BT2CNT_GET();
prof_tcache_t *prof_tcache = PROF_TCACHE_GET();
if (bt2cnt == NULL) {
if (prof_tcache == NULL) {
/* Initialize an empty cache for this thread. */
bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t));
if (bt2cnt == NULL)
prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t));
if (prof_tcache == NULL)
return (NULL);
if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash,
prof_bt_keycomp)) {
idalloc(bt2cnt);
if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS,
prof_bt_hash, prof_bt_keycomp)) {
idalloc(prof_tcache);
return (NULL);
}
ql_new(&prof_tcache->lru_ql);
BT2CNT_SET(bt2cnt);
PROF_TCACHE_SET(prof_tcache);
}
if (ckh_search(bt2cnt, bt, NULL, &ret.v)) {
if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) {
union {
prof_bt_t *p;
void *v;
@@ -519,7 +525,6 @@ prof_lookup(prof_bt_t *bt)
*/
prof_enter();
if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) {
/* bt has never been seen before. Insert it. */
ctx.v = imalloc(sizeof(prof_ctx_t));
if (ctx.v == NULL) {
@@ -544,28 +549,60 @@
if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
/* OOM. */
prof_leave();
malloc_mutex_destroy(&ctx.p->lock);
idalloc(btkey.v);
idalloc(ctx.v);
return (NULL);
}
}
/*
* Acquire ctx's lock before releasing bt2ctx_mtx, in order to
* avoid a race condition with prof_ctx_destroy().
*/
malloc_mutex_lock(&ctx.p->lock);
prof_leave();
/* Link a prof_thr_cnt_t into ctx for this thread. */
ret.v = imalloc(sizeof(prof_thr_cnt_t));
if (ret.p == NULL)
return (NULL);
ql_elm_new(ret.p, link);
if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt)
== (ZU(1) << opt_lg_prof_tcmax)) {
assert(ckh_count(&prof_tcache->bt2cnt) > 0);
/*
* Flush the least recently used cnt in order to
* keep bt2cnt from becoming too large.
*/
ret.p = ql_last(&prof_tcache->lru_ql, lru_link);
assert(ret.v != NULL);
ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL,
NULL);
ql_remove(&prof_tcache->lru_ql, ret.p, lru_link);
prof_ctx_merge(ret.p->ctx, ret.p);
/* ret can now be re-used. */
} else {
assert(opt_lg_prof_tcmax < 0 ||
ckh_count(&prof_tcache->bt2cnt) < (ZU(1) <<
opt_lg_prof_tcmax));
/* Allocate and partially initialize a new cnt. */
ret.v = imalloc(sizeof(prof_thr_cnt_t));
if (ret.p == NULL)
return (NULL);
ql_elm_new(ret.p, cnts_link);
ql_elm_new(ret.p, lru_link);
}
/* Finish initializing ret. */
ret.p->ctx = ctx.p;
ret.p->epoch = 0;
memset(&ret.p->cnts, 0, sizeof(prof_cnt_t));
if (ckh_insert(bt2cnt, btkey.v, ret.v)) {
if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) {
idalloc(ret.v);
return (NULL);
}
malloc_mutex_lock(&ctx.p->lock);
ql_tail_insert(&ctx.p->cnts_ql, ret.p, link);
ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link);
ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
malloc_mutex_unlock(&ctx.p->lock);
} else {
/* Move ret to the front of the LRU. */
ql_remove(&prof_tcache->lru_ql, ret.p, lru_link);
ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link);
}
return (ret.p);
@@ -729,8 +766,10 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
/*********/
mb_write();
/*********/
@@ -796,8 +835,10 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
}
/*********/
mb_write();
@@ -896,15 +937,15 @@ prof_write(const char *s, bool propagate_err)
}
static void
prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
{
prof_thr_cnt_t *thr_cnt;
prof_cnt_t tcnt;
malloc_mutex_lock(&ctx->lock);
memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t));
ql_foreach(thr_cnt, &ctx->cnts_ql, link) {
memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t));
ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) {
volatile unsigned *epoch = &thr_cnt->epoch;
while (true) {
@@ -921,39 +962,101 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
break;
}
ctx->cnt_dump.curobjs += tcnt.curobjs;
ctx->cnt_dump.curbytes += tcnt.curbytes;
ctx->cnt_dump.accumobjs += tcnt.accumobjs;
ctx->cnt_dump.accumbytes += tcnt.accumbytes;
ctx->cnt_summed.curobjs += tcnt.curobjs;
ctx->cnt_summed.curbytes += tcnt.curbytes;
if (opt_prof_accum) {
ctx->cnt_summed.accumobjs += tcnt.accumobjs;
ctx->cnt_summed.accumbytes += tcnt.accumbytes;
}
if (tcnt.curobjs != 0)
(*leak_nctx)++;
}
/* Merge into cnt_all. */
cnt_all->curobjs += ctx->cnt_dump.curobjs;
cnt_all->curbytes += ctx->cnt_dump.curbytes;
cnt_all->accumobjs += ctx->cnt_dump.accumobjs;
cnt_all->accumbytes += ctx->cnt_dump.accumbytes;
/* Add to cnt_all. */
cnt_all->curobjs += ctx->cnt_summed.curobjs;
cnt_all->curbytes += ctx->cnt_summed.curbytes;
if (opt_prof_accum) {
cnt_all->accumobjs += ctx->cnt_summed.accumobjs;
cnt_all->accumbytes += ctx->cnt_summed.accumbytes;
}
malloc_mutex_unlock(&ctx->lock);
}
static void
prof_ctx_destroy(prof_ctx_t *ctx)
{
/*
* Check that ctx is still unused by any thread cache before destroying
* it. prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to
* avoid a race condition with this function.
*/
prof_enter();
malloc_mutex_lock(&ctx->lock);
if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) {
assert(ctx->cnt_merged.curbytes == 0);
assert(ctx->cnt_merged.accumobjs == 0);
assert(ctx->cnt_merged.accumbytes == 0);
/* Remove ctx from bt2ctx. */
ckh_remove(&bt2ctx, ctx->bt, NULL, NULL);
prof_leave();
/* Destroy ctx. */
malloc_mutex_unlock(&ctx->lock);
bt_destroy(ctx->bt);
malloc_mutex_destroy(&ctx->lock);
idalloc(ctx);
} else {
malloc_mutex_unlock(&ctx->lock);
prof_leave();
}
}
static void
prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
{
bool destroy;
/* Merge cnt stats and detach from ctx. */
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
ql_remove(&ctx->cnts_ql, cnt, cnts_link);
if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL &&
ctx->cnt_merged.curobjs == 0)
destroy = true;
else
destroy = false;
malloc_mutex_unlock(&ctx->lock);
if (destroy)
prof_ctx_destroy(ctx);
}
static bool
prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err)
{
char buf[UMAX2S_BUFSIZE];
unsigned i;
if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err)
if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) {
assert(ctx->cnt_summed.curbytes == 0);
assert(ctx->cnt_summed.accumobjs == 0);
assert(ctx->cnt_summed.accumbytes == 0);
return (false);
}
if (prof_write(umax2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err)
|| prof_write(": ", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf),
|| prof_write(umax2s(ctx->cnt_summed.curbytes, 10, buf),
propagate_err)
|| prof_write(" [", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf),
|| prof_write(umax2s(ctx->cnt_summed.accumobjs, 10, buf),
propagate_err)
|| prof_write(": ", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf),
|| prof_write(umax2s(ctx->cnt_summed.accumbytes, 10, buf),
propagate_err)
|| prof_write("] @", propagate_err))
return (true);
@@ -1060,7 +1163,7 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err)
leak_nctx = 0;
for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v)
== false;) {
prof_ctx_merge(ctx.p, &cnt_all, &leak_nctx);
prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx);
}
/* Dump profile header. */
@@ -1319,54 +1422,33 @@ prof_bt_keycomp(const void *k1, const void *k2)
}
static void
bt2cnt_thread_cleanup(void *arg)
prof_tcache_cleanup(void *arg)
{
ckh_t *bt2cnt;
prof_tcache_t *prof_tcache;
bt2cnt = BT2CNT_GET();
if (bt2cnt != NULL) {
ql_head(prof_thr_cnt_t) cnts_ql;
size_t tabind;
union {
prof_thr_cnt_t *p;
void *v;
} cnt;
/* Iteratively merge cnt's into the global stats. */
ql_new(&cnts_ql);
tabind = 0;
while (ckh_iter(bt2cnt, &tabind, NULL, &cnt.v) ==
false) {
prof_ctx_t *ctx = cnt.p->ctx;
/* Merge stats and detach from ctx. */
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs += cnt.p->cnts.curobjs;
ctx->cnt_merged.curbytes += cnt.p->cnts.curbytes;
ctx->cnt_merged.accumobjs += cnt.p->cnts.accumobjs;
ctx->cnt_merged.accumbytes += cnt.p->cnts.accumbytes;
ql_remove(&ctx->cnts_ql, cnt.p, link);
malloc_mutex_unlock(&ctx->lock);
/*
* Stash cnt for deletion after finishing with
* ckh_iter().
*/
ql_tail_insert(&cnts_ql, cnt.p, link);
}
prof_tcache = PROF_TCACHE_GET();
if (prof_tcache != NULL) {
prof_thr_cnt_t *cnt;
/*
* Delete the hash table now that cnts_ql has a list of all
* cnt's.
* Delete the hash table. All of its contents can still be
* iterated over via the LRU.
*/
ckh_delete(bt2cnt);
idalloc(bt2cnt);
BT2CNT_SET(NULL);
ckh_delete(&prof_tcache->bt2cnt);
/* Delete cnt's. */
while ((cnt.p = ql_last(&cnts_ql, link)) != NULL) {
ql_remove(&cnts_ql, cnt.p, link);
idalloc(cnt.v);
/*
* Iteratively merge cnt's into the global stats and delete
* them.
*/
while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) !=
NULL) {
prof_ctx_merge(cnt->ctx, cnt);
ql_remove(&prof_tcache->lru_ql, cnt, lru_link);
idalloc(cnt);
}
idalloc(prof_tcache);
PROF_TCACHE_SET(NULL);
}
}
@@ -1419,7 +1501,7 @@ prof_boot1(void)
return (true);
if (malloc_mutex_init(&bt2ctx_mtx))
return (true);
if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup)
if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup)
!= 0) {
malloc_write(
"<jemalloc>: Error in pthread_key_create()\n");

View File

@@ -469,6 +469,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "P" : "p");
if ((err = JEMALLOC_P(mallctl)("opt.prof_accum", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "R" : "r");
if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "U" : "u");
@@ -580,6 +583,17 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
write_cb(cbopaque, umax2s((1U << sv), 10, s));
write_cb(cbopaque, "\n");
CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t);
write_cb(cbopaque,
"Maximum per thread backtrace cache: ");
if (ssv >= 0) {
write_cb(cbopaque, umax2s((1U << ssv), 10, s));
write_cb(cbopaque, " (2^");
write_cb(cbopaque, umax2s(ssv, 10, s));
write_cb(cbopaque, ")\n");
} else
write_cb(cbopaque, "N/A\n");
CTL_GET("opt.lg_prof_sample", &sv, size_t);
write_cb(cbopaque, "Average profile sample interval: ");
write_cb(cbopaque, umax2s((1U << sv), 10, s));