Make cumulative heap profile data optional.

Add the R option to control whether cumulative heap profile data
are maintained.  Add the T option to control the size of per thread
backtrace caches, primarily because when the R option is specified,
backtraces that no longer have allocations associated with them are
discarded as soon as no thread caches refer to them.
This commit is contained in:
Jason Evans 2010-10-02 15:18:50 -07:00
parent 4d5c09905e
commit a881cd2c61
8 changed files with 328 additions and 124 deletions

View File

@ -484,6 +484,12 @@ will disable dirty page purging.
@roff_prof@.Dq S @roff_prof@.Dq S
@roff_prof@option for probabilistic sampling control. @roff_prof@option for probabilistic sampling control.
@roff_prof@See the @roff_prof@See the
@roff_prof@.Dq R
@roff_prof@option for control of cumulative sample reporting.
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option for control of per thread backtrace caching.
@roff_prof@See the
@roff_prof@.Dq I @roff_prof@.Dq I
@roff_prof@option for information on interval-triggered profile dumping, and the @roff_prof@option for information on interval-triggered profile dumping, and the
@roff_prof@.Dq U @roff_prof@.Dq U
@ -595,6 +601,18 @@ Double/halve the size of the maximum size class that is a multiple of the
quantum (8 or 16 bytes, depending on architecture). quantum (8 or 16 bytes, depending on architecture).
Above this size, cacheline spacing is used for size classes. Above this size, cacheline spacing is used for size classes.
The default value is 128 bytes. The default value is 128 bytes.
@roff_prof@.It R
@roff_prof@Enable/disable reporting of cumulative object/byte counts in profile
@roff_prof@dumps.
@roff_prof@If this option is enabled, every unique backtrace must be stored for
@roff_prof@the duration of execution.
@roff_prof@Depending on the application, this can impose a large memory
@roff_prof@overhead, and the cumulative counts are not always of interest.
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option for control of per thread backtrace caching, which has
@roff_prof@important interactions.
@roff_prof@This option is enabled by default.
@roff_prof@.It S @roff_prof@.It S
@roff_prof@Double/halve the average interval between allocation samples, as @roff_prof@Double/halve the average interval between allocation samples, as
@roff_prof@measured in bytes of allocation activity. @roff_prof@measured in bytes of allocation activity.
@ -602,6 +620,22 @@ The default value is 128 bytes.
@roff_prof@also decreases the computational overhead. @roff_prof@also decreases the computational overhead.
@roff_prof@The default sample interval is one (i.e. all allocations are @roff_prof@The default sample interval is one (i.e. all allocations are
@roff_prof@sampled). @roff_prof@sampled).
@roff_prof@.It T
@roff_prof@Double/halve the maximum per thread backtrace cache used for heap
@roff_prof@profiling.
@roff_prof@A backtrace can only be discarded if the
@roff_prof@.Dq R
@roff_prof@option is disabled, and no thread caches currently refer to the
@roff_prof@backtrace.
@roff_prof@Therefore, a backtrace cache limit should be imposed if the
@roff_prof@intention is to limit how much memory is used by backtraces.
@roff_prof@By default, no limit is imposed.
@roff_prof@This is internally encoded as (1 << -1), and each
@roff_prof@.Dq T
@roff_prof@that is specified increments the shift amount.
@roff_prof@Therefore, e.g.
@roff_prof@.Ev JEMALLOC_OPTIONS=11T
@roff_prof@specifies a backtrace cache limit of 1024 backtraces.
@roff_prof@.It U @roff_prof@.It U
@roff_prof@Trigger a memory profile dump every time the total virtual memory @roff_prof@Trigger a memory profile dump every time the total virtual memory
@roff_prof@exceeds the previous maximum. @roff_prof@exceeds the previous maximum.
@ -992,6 +1026,27 @@ option.
@roff_prof@option. @roff_prof@option.
@roff_prof@.Ed @roff_prof@.Ed
.\"----------------------------------------------------------------------------- .\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.prof_accum (bool) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq R
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq S
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" @roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact @roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the @roff_prof@See the

View File

@ -24,6 +24,7 @@ extern bool isthreaded;
#endif #endif
bool malloc_mutex_init(malloc_mutex_t *mutex); bool malloc_mutex_init(malloc_mutex_t *mutex);
void malloc_mutex_destroy(malloc_mutex_t *mutex);
#endif /* JEMALLOC_H_EXTERNS */ #endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/ /******************************************************************************/

View File

@ -6,12 +6,13 @@ typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t; typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_s prof_t; typedef struct prof_tcache_s prof_tcache_t;
/* Option defaults. */ /* Option defaults. */
#define LG_PROF_BT_MAX_DEFAULT 2 #define LG_PROF_BT_MAX_DEFAULT 2
#define LG_PROF_SAMPLE_DEFAULT 0 #define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1 #define LG_PROF_INTERVAL_DEFAULT -1
#define LG_PROF_TCMAX_DEFAULT -1
/* /*
* Hard limit on stack backtrace depth. Note that the version of * Hard limit on stack backtrace depth. Note that the version of
@ -41,9 +42,9 @@ struct prof_bt_s {
#ifdef JEMALLOC_PROF_LIBGCC #ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct { typedef struct {
prof_bt_t *bt; prof_bt_t *bt;
unsigned nignore; unsigned nignore;
unsigned max; unsigned max;
} prof_unwind_data_t; } prof_unwind_data_t;
#endif #endif
@ -51,11 +52,11 @@ struct prof_cnt_s {
/* /*
* Profiling counters. An allocation/deallocation pair can operate on * Profiling counters. An allocation/deallocation pair can operate on
* different prof_thr_cnt_t objects that are linked into the same * different prof_thr_cnt_t objects that are linked into the same
* prof_ctx_t sets_ql, so it is possible for the cur* counters to go * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
* negative. In principle it is possible for the *bytes counters to * negative. In principle it is possible for the *bytes counters to
* overflow/underflow, but a general solution would require some form * overflow/underflow, but a general solution would require something
* of 128-bit counter solution; this implementation doesn't bother to * like 128-bit counters; this implementation doesn't bother to solve
* solve that problem. * that problem.
*/ */
int64_t curobjs; int64_t curobjs;
int64_t curbytes; int64_t curbytes;
@ -64,15 +65,18 @@ struct prof_cnt_s {
}; };
struct prof_thr_cnt_s { struct prof_thr_cnt_s {
/* Linkage into prof_ctx_t's sets_ql. */ /* Linkage into prof_ctx_t's cnts_ql. */
ql_elm(prof_thr_cnt_t) link; ql_elm(prof_thr_cnt_t) cnts_link;
/* Linkage into thread's LRU. */
ql_elm(prof_thr_cnt_t) lru_link;
/* /*
* Associated context. If a thread frees an object that it did not * Associated context. If a thread frees an object that it did not
* allocate, it is possible that the context is not cached in the * allocate, it is possible that the context is not cached in the
* thread's hash table, in which case it must be able to look up the * thread's hash table, in which case it must be able to look up the
* context, insert a new prof_thr_cnt_t into the thread's hash table, * context, insert a new prof_thr_cnt_t into the thread's hash table,
* and link it into the prof_ctx_t's sets_ql. * and link it into the prof_ctx_t's cnts_ql.
*/ */
prof_ctx_t *ctx; prof_ctx_t *ctx;
@ -101,11 +105,11 @@ struct prof_ctx_s {
/* Associated backtrace. */ /* Associated backtrace. */
prof_bt_t *bt; prof_bt_t *bt;
/* Protects cnt_merged and sets_ql. */ /* Protects cnt_merged and cnts_ql. */
malloc_mutex_t lock; malloc_mutex_t lock;
/* Temporary storage for aggregation during dump. */ /* Temporary storage for summation during dump. */
prof_cnt_t cnt_dump; prof_cnt_t cnt_summed;
/* When threads exit, they merge their stats into cnt_merged. */ /* When threads exit, they merge their stats into cnt_merged. */
prof_cnt_t cnt_merged; prof_cnt_t cnt_merged;
@ -117,6 +121,24 @@ struct prof_ctx_s {
ql_head(prof_thr_cnt_t) cnts_ql; ql_head(prof_thr_cnt_t) cnts_ql;
}; };
/*
* Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread
* keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no others
* will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
* into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
* objects.
*/
struct prof_tcache_s {
/* (prof_bt_t *)-->(prof_thr_cnt_t *). */
ckh_t bt2cnt;
/* LRU for contents of bt2cnt. */
ql_head(prof_thr_cnt_t) lru_ql;
};
#endif /* JEMALLOC_H_STRUCTS */ #endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/ /******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS #ifdef JEMALLOC_H_EXTERNS
@ -129,11 +151,13 @@ extern bool opt_prof;
* to notice state changes. * to notice state changes.
*/ */
extern bool opt_prof_active; extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_udump; /* High-water memory dumping. */ extern bool opt_prof_udump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */ extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread backtrace cache) */
/* /*
* Profile dump interval, measured in bytes allocated. Each arena triggers a * Profile dump interval, measured in bytes allocated. Each arena triggers a
@ -150,9 +174,6 @@ extern uint64_t prof_interval;
*/ */
extern bool prof_promote; extern bool prof_promote;
bool prof_init(prof_t *prof, bool master);
void prof_destroy(prof_t *prof);
prof_thr_cnt_t *prof_alloc_prep(size_t size); prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_ctx_t *prof_ctx_get(const void *ptr); prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);

View File

@ -82,6 +82,8 @@ CTL_PROTO(opt_lg_prof_sample)
CTL_PROTO(opt_lg_prof_interval) CTL_PROTO(opt_lg_prof_interval)
CTL_PROTO(opt_prof_udump) CTL_PROTO(opt_prof_udump)
CTL_PROTO(opt_prof_leak) CTL_PROTO(opt_prof_leak)
CTL_PROTO(opt_prof_accum)
CTL_PROTO(opt_lg_prof_tcmax)
#endif #endif
CTL_PROTO(opt_stats_print) CTL_PROTO(opt_stats_print)
CTL_PROTO(opt_lg_qspace_max) CTL_PROTO(opt_lg_qspace_max)
@ -260,6 +262,8 @@ static const ctl_node_t opt_node[] = {
{NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)},
{NAME("prof_udump"), CTL(opt_prof_udump)}, {NAME("prof_udump"), CTL(opt_prof_udump)},
{NAME("prof_leak"), CTL(opt_prof_leak)}, {NAME("prof_leak"), CTL(opt_prof_leak)},
{NAME("prof_accum"), CTL(opt_prof_accum)},
{NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)},
#endif #endif
{NAME("stats_print"), CTL(opt_stats_print)}, {NAME("stats_print"), CTL(opt_stats_print)},
{NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)},
@ -1207,6 +1211,8 @@ CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool) CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool)
CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)
CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool)
CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t)
#endif #endif
CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) CTL_RO_GEN(opt_stats_print, opt_stats_print, bool)
CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t)

View File

@ -512,6 +512,12 @@ MALLOC_OUT:
opt_lg_qspace_max++; opt_lg_qspace_max++;
break; break;
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
case 'r':
opt_prof_accum = false;
break;
case 'R':
opt_prof_accum = true;
break;
case 's': case 's':
if (opt_lg_prof_sample > 0) if (opt_lg_prof_sample > 0)
opt_lg_prof_sample--; opt_lg_prof_sample--;
@ -521,6 +527,15 @@ MALLOC_OUT:
(sizeof(uint64_t) << 3)) (sizeof(uint64_t) << 3))
opt_lg_prof_sample++; opt_lg_prof_sample++;
break; break;
case 't':
if (opt_lg_prof_tcmax >= 0)
opt_lg_prof_tcmax--;
break;
case 'T':
if (opt_lg_prof_tcmax + 1 <
(sizeof(size_t) << 3))
opt_lg_prof_tcmax++;
break;
case 'u': case 'u':
opt_prof_udump = false; opt_prof_udump = false;
break; break;

View File

@ -72,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex)
return (false); return (false);
} }
void
malloc_mutex_destroy(malloc_mutex_t *mutex)
{
if (pthread_mutex_destroy(mutex) != 0) {
malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
abort();
}
}

View File

@ -24,47 +24,41 @@ size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
bool opt_prof_udump = false; bool opt_prof_udump = false;
bool opt_prof_leak = false; bool opt_prof_leak = false;
bool opt_prof_accum = true;
ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT;
uint64_t prof_interval; uint64_t prof_interval;
bool prof_promote; bool prof_promote;
/* /*
* Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data
* structure that knows about all backtraces ever captured. * structure that knows about all backtraces currently captured.
*/ */
static ckh_t bt2ctx; static ckh_t bt2ctx;
static malloc_mutex_t bt2ctx_mtx; static malloc_mutex_t bt2ctx_mtx;
/* /* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
* Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread
* keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no others
* will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
* into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
* objects.
*/
#ifndef NO_TLS #ifndef NO_TLS
static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec")); static __thread prof_tcache_t *prof_tcache_tls
# define BT2CNT_GET() bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec"));
# define BT2CNT_SET(v) do { \ # define PROF_TCACHE_GET() prof_tcache_tls
bt2cnt_tls = (v); \ # define PROF_TCACHE_SET(v) do { \
pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ prof_tcache_tls = (v); \
pthread_setspecific(prof_tcache_tsd, (void *)(v)); \
} while (0) } while (0)
#else #else
# define BT2CNT_GET() ((ckh_t *)pthread_getspecific(bt2cnt_tsd)) # define PROF_TCACHE_GET() ((ckh_t *)pthread_getspecific(prof_tcache_tsd))
# define BT2CNT_SET(v) do { \ # define PROF_TCACHE_SET(v) do { \
pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ pthread_setspecific(prof_tcache_tsd, (void *)(v)); \
} while (0) } while (0)
#endif #endif
/* /*
* Same contents as b2cnt_tls, but initialized such that the TSD destructor is * Same contents as b2cnt_tls, but initialized such that the TSD destructor is
* called when a thread exits, so that bt2cnt_tls contents can be merged, * called when a thread exits, so that prof_tcache_tls contents can be merged,
* unlinked, and deallocated. * unlinked, and deallocated.
*/ */
static pthread_key_t bt2cnt_tsd; static pthread_key_t prof_tcache_tsd;
/* (1U << opt_lg_prof_bt_max). */ /* (1U << opt_lg_prof_bt_max). */
static unsigned prof_bt_max; static unsigned prof_bt_max;
@ -137,6 +131,7 @@ static bool enq_udump;
static prof_bt_t *bt_dup(prof_bt_t *bt); static prof_bt_t *bt_dup(prof_bt_t *bt);
static void bt_init(prof_bt_t *bt, void **vec); static void bt_init(prof_bt_t *bt, void **vec);
static void bt_destroy(prof_bt_t *bt);
#ifdef JEMALLOC_PROF_LIBGCC #ifdef JEMALLOC_PROF_LIBGCC
static _Unwind_Reason_Code prof_unwind_init_callback( static _Unwind_Reason_Code prof_unwind_init_callback(
struct _Unwind_Context *context, void *arg); struct _Unwind_Context *context, void *arg);
@ -148,8 +143,10 @@ static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
static bool prof_flush(bool propagate_err); static bool prof_flush(bool propagate_err);
static bool prof_write(const char *s, bool propagate_err); static bool prof_write(const char *s, bool propagate_err);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
size_t *leak_nctx); size_t *leak_nctx);
static void prof_ctx_destroy(prof_ctx_t *ctx);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt);
static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt,
bool propagate_err); bool propagate_err);
static bool prof_dump_maps(bool propagate_err); static bool prof_dump_maps(bool propagate_err);
@ -160,7 +157,7 @@ static void prof_fdump(void);
static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1,
size_t *hash2); size_t *hash2);
static bool prof_bt_keycomp(const void *k1, const void *k2); static bool prof_bt_keycomp(const void *k1, const void *k2);
static void bt2cnt_thread_cleanup(void *arg); static void prof_tcache_cleanup(void *arg);
#ifdef NO_TLS #ifdef NO_TLS
static void prof_sample_state_thread_cleanup(void *arg); static void prof_sample_state_thread_cleanup(void *arg);
#endif #endif
@ -175,6 +172,13 @@ bt_init(prof_bt_t *bt, void **vec)
bt->len = 0; bt->len = 0;
} }
static void
bt_destroy(prof_bt_t *bt)
{
idalloc(bt);
}
static prof_bt_t * static prof_bt_t *
bt_dup(prof_bt_t *bt) bt_dup(prof_bt_t *bt)
{ {
@ -487,23 +491,25 @@ prof_lookup(prof_bt_t *bt)
prof_thr_cnt_t *p; prof_thr_cnt_t *p;
void *v; void *v;
} ret; } ret;
ckh_t *bt2cnt = BT2CNT_GET(); prof_tcache_t *prof_tcache = PROF_TCACHE_GET();
if (bt2cnt == NULL) { if (prof_tcache == NULL) {
/* Initialize an empty cache for this thread. */ /* Initialize an empty cache for this thread. */
bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t)); prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t));
if (bt2cnt == NULL) if (prof_tcache == NULL)
return (NULL); return (NULL);
if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash,
prof_bt_keycomp)) { if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS,
idalloc(bt2cnt); prof_bt_hash, prof_bt_keycomp)) {
idalloc(prof_tcache);
return (NULL); return (NULL);
} }
ql_new(&prof_tcache->lru_ql);
BT2CNT_SET(bt2cnt); PROF_TCACHE_SET(prof_tcache);
} }
if (ckh_search(bt2cnt, bt, NULL, &ret.v)) { if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) {
union { union {
prof_bt_t *p; prof_bt_t *p;
void *v; void *v;
@ -519,7 +525,6 @@ prof_lookup(prof_bt_t *bt)
*/ */
prof_enter(); prof_enter();
if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) {
/* bt has never been seen before. Insert it. */ /* bt has never been seen before. Insert it. */
ctx.v = imalloc(sizeof(prof_ctx_t)); ctx.v = imalloc(sizeof(prof_ctx_t));
if (ctx.v == NULL) { if (ctx.v == NULL) {
@ -544,28 +549,60 @@ prof_lookup(prof_bt_t *bt)
if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
/* OOM. */ /* OOM. */
prof_leave(); prof_leave();
malloc_mutex_destroy(&ctx.p->lock);
idalloc(btkey.v); idalloc(btkey.v);
idalloc(ctx.v); idalloc(ctx.v);
return (NULL); return (NULL);
} }
} }
/*
* Acquire ctx's lock before releasing bt2ctx_mtx, in order to
* avoid a race condition with prof_ctx_destroy().
*/
malloc_mutex_lock(&ctx.p->lock);
prof_leave(); prof_leave();
/* Link a prof_thd_cnt_t into ctx for this thread. */ /* Link a prof_thd_cnt_t into ctx for this thread. */
ret.v = imalloc(sizeof(prof_thr_cnt_t)); if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt)
if (ret.p == NULL) == (ZU(1) << opt_lg_prof_tcmax)) {
return (NULL); assert(ckh_count(&prof_tcache->bt2cnt) > 0);
ql_elm_new(ret.p, link); /*
* Flush the least recently used cnt in order to
* keep bt2cnt from becoming too large.
*/
ret.p = ql_last(&prof_tcache->lru_ql, lru_link);
assert(ret.v != NULL);
ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL,
NULL);
ql_remove(&prof_tcache->lru_ql, ret.p, lru_link);
prof_ctx_merge(ret.p->ctx, ret.p);
/* ret can now be re-used. */
} else {
assert(opt_lg_prof_tcmax < 0 ||
ckh_count(&prof_tcache->bt2cnt) < (ZU(1) <<
opt_lg_prof_tcmax));
/* Allocate and partially initialize a new cnt. */
ret.v = imalloc(sizeof(prof_thr_cnt_t));
if (ret.p == NULL)
return (NULL);
ql_elm_new(ret.p, cnts_link);
ql_elm_new(ret.p, lru_link);
}
/* Finish initializing ret. */
ret.p->ctx = ctx.p; ret.p->ctx = ctx.p;
ret.p->epoch = 0; ret.p->epoch = 0;
memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); memset(&ret.p->cnts, 0, sizeof(prof_cnt_t));
if (ckh_insert(bt2cnt, btkey.v, ret.v)) { if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) {
idalloc(ret.v); idalloc(ret.v);
return (NULL); return (NULL);
} }
malloc_mutex_lock(&ctx.p->lock); ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link);
ql_tail_insert(&ctx.p->cnts_ql, ret.p, link); ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
malloc_mutex_unlock(&ctx.p->lock); malloc_mutex_unlock(&ctx.p->lock);
} else {
/* Move ret to the front of the LRU. */
ql_remove(&prof_tcache->lru_ql, ret.p, lru_link);
ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link);
} }
return (ret.p); return (ret.p);
@ -729,8 +766,10 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
/*********/ /*********/
cnt->cnts.curobjs++; cnt->cnts.curobjs++;
cnt->cnts.curbytes += size; cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++; if (opt_prof_accum) {
cnt->cnts.accumbytes += size; cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
/*********/ /*********/
mb_write(); mb_write();
/*********/ /*********/
@ -796,8 +835,10 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
if ((uintptr_t)cnt > (uintptr_t)1U) { if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++; cnt->cnts.curobjs++;
cnt->cnts.curbytes += size; cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++; if (opt_prof_accum) {
cnt->cnts.accumbytes += size; cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
} }
/*********/ /*********/
mb_write(); mb_write();
@ -896,15 +937,15 @@ prof_write(const char *s, bool propagate_err)
} }
static void static void
prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
{ {
prof_thr_cnt_t *thr_cnt; prof_thr_cnt_t *thr_cnt;
prof_cnt_t tcnt; prof_cnt_t tcnt;
malloc_mutex_lock(&ctx->lock); malloc_mutex_lock(&ctx->lock);
memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t)); memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t));
ql_foreach(thr_cnt, &ctx->cnts_ql, link) { ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) {
volatile unsigned *epoch = &thr_cnt->epoch; volatile unsigned *epoch = &thr_cnt->epoch;
while (true) { while (true) {
@ -921,39 +962,101 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
break; break;
} }
ctx->cnt_dump.curobjs += tcnt.curobjs; ctx->cnt_summed.curobjs += tcnt.curobjs;
ctx->cnt_dump.curbytes += tcnt.curbytes; ctx->cnt_summed.curbytes += tcnt.curbytes;
ctx->cnt_dump.accumobjs += tcnt.accumobjs; if (opt_prof_accum) {
ctx->cnt_dump.accumbytes += tcnt.accumbytes; ctx->cnt_summed.accumobjs += tcnt.accumobjs;
ctx->cnt_summed.accumbytes += tcnt.accumbytes;
}
if (tcnt.curobjs != 0) if (tcnt.curobjs != 0)
(*leak_nctx)++; (*leak_nctx)++;
} }
/* Merge into cnt_all. */ /* Add to cnt_all. */
cnt_all->curobjs += ctx->cnt_dump.curobjs; cnt_all->curobjs += ctx->cnt_summed.curobjs;
cnt_all->curbytes += ctx->cnt_dump.curbytes; cnt_all->curbytes += ctx->cnt_summed.curbytes;
cnt_all->accumobjs += ctx->cnt_dump.accumobjs; if (opt_prof_accum) {
cnt_all->accumbytes += ctx->cnt_dump.accumbytes; cnt_all->accumobjs += ctx->cnt_summed.accumobjs;
cnt_all->accumbytes += ctx->cnt_summed.accumbytes;
}
malloc_mutex_unlock(&ctx->lock); malloc_mutex_unlock(&ctx->lock);
} }
static void
prof_ctx_destroy(prof_ctx_t *ctx)
{
/*
* Check that ctx is still unused by any thread cache before destroying
* it. prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to
* avoid a race condition with this function.
*/
prof_enter();
malloc_mutex_lock(&ctx->lock);
if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) {
assert(ctx->cnt_merged.curbytes == 0);
assert(ctx->cnt_merged.accumobjs == 0);
assert(ctx->cnt_merged.accumbytes == 0);
/* Remove ctx from bt2ctx. */
ckh_remove(&bt2ctx, ctx->bt, NULL, NULL);
prof_leave();
/* Destroy ctx. */
malloc_mutex_unlock(&ctx->lock);
bt_destroy(ctx->bt);
malloc_mutex_destroy(&ctx->lock);
idalloc(ctx);
} else {
malloc_mutex_unlock(&ctx->lock);
prof_leave();
}
}
static void
prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
{
bool destroy;
/* Merge cnt stats and detach from ctx. */
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
ql_remove(&ctx->cnts_ql, cnt, cnts_link);
if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL &&
ctx->cnt_merged.curobjs == 0)
destroy = true;
else
destroy = false;
malloc_mutex_unlock(&ctx->lock);
if (destroy)
prof_ctx_destroy(ctx);
}
static bool static bool
prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err) prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err)
{ {
char buf[UMAX2S_BUFSIZE]; char buf[UMAX2S_BUFSIZE];
unsigned i; unsigned i;
if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err) if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) {
assert(ctx->cnt_summed.curbytes == 0);
assert(ctx->cnt_summed.accumobjs == 0);
assert(ctx->cnt_summed.accumbytes == 0);
return (false);
}
if (prof_write(umax2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err)
|| prof_write(": ", propagate_err) || prof_write(": ", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf), || prof_write(umax2s(ctx->cnt_summed.curbytes, 10, buf),
propagate_err) propagate_err)
|| prof_write(" [", propagate_err) || prof_write(" [", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf), || prof_write(umax2s(ctx->cnt_summed.accumobjs, 10, buf),
propagate_err) propagate_err)
|| prof_write(": ", propagate_err) || prof_write(": ", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf), || prof_write(umax2s(ctx->cnt_summed.accumbytes, 10, buf),
propagate_err) propagate_err)
|| prof_write("] @", propagate_err)) || prof_write("] @", propagate_err))
return (true); return (true);
@ -1060,7 +1163,7 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err)
leak_nctx = 0; leak_nctx = 0;
for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v)
== false;) { == false;) {
prof_ctx_merge(ctx.p, &cnt_all, &leak_nctx); prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx);
} }
/* Dump profile header. */ /* Dump profile header. */
@ -1319,54 +1422,33 @@ prof_bt_keycomp(const void *k1, const void *k2)
} }
static void static void
bt2cnt_thread_cleanup(void *arg) prof_tcache_cleanup(void *arg)
{ {
ckh_t *bt2cnt; prof_tcache_t *prof_tcache;
bt2cnt = BT2CNT_GET(); prof_tcache = PROF_TCACHE_GET();
if (bt2cnt != NULL) { if (prof_tcache != NULL) {
ql_head(prof_thr_cnt_t) cnts_ql; prof_thr_cnt_t *cnt;
size_t tabind;
union {
prof_thr_cnt_t *p;
void *v;
} cnt;
/* Iteratively merge cnt's into the global stats. */
ql_new(&cnts_ql);
tabind = 0;
while (ckh_iter(bt2cnt, &tabind, NULL, &cnt.v) ==
false) {
prof_ctx_t *ctx = cnt.p->ctx;
/* Merge stats and detach from ctx. */
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs += cnt.p->cnts.curobjs;
ctx->cnt_merged.curbytes += cnt.p->cnts.curbytes;
ctx->cnt_merged.accumobjs += cnt.p->cnts.accumobjs;
ctx->cnt_merged.accumbytes += cnt.p->cnts.accumbytes;
ql_remove(&ctx->cnts_ql, cnt.p, link);
malloc_mutex_unlock(&ctx->lock);
/*
* Stash cnt for deletion after finishing with
* ckh_iter().
*/
ql_tail_insert(&cnts_ql, cnt.p, link);
}
/* /*
* Delete the hash table now that cnts_ql has a list of all * Delete the hash table. All of its contents can still be
* cnt's. * iterated over via the LRU.
*/ */
ckh_delete(bt2cnt); ckh_delete(&prof_tcache->bt2cnt);
idalloc(bt2cnt);
BT2CNT_SET(NULL);
/* Delete cnt's. */ /*
while ((cnt.p = ql_last(&cnts_ql, link)) != NULL) { * Iteratively merge cnt's into the global stats and delete
ql_remove(&cnts_ql, cnt.p, link); * them.
idalloc(cnt.v); */
while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) !=
NULL) {
prof_ctx_merge(cnt->ctx, cnt);
ql_remove(&prof_tcache->lru_ql, cnt, lru_link);
idalloc(cnt);
} }
idalloc(prof_tcache);
PROF_TCACHE_SET(NULL);
} }
} }
@ -1419,7 +1501,7 @@ prof_boot1(void)
return (true); return (true);
if (malloc_mutex_init(&bt2ctx_mtx)) if (malloc_mutex_init(&bt2ctx_mtx))
return (true); return (true);
if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup) if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup)
!= 0) { != 0) {
malloc_write( malloc_write(
"<jemalloc>: Error in pthread_key_create()\n"); "<jemalloc>: Error in pthread_key_create()\n");

View File

@ -469,6 +469,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz, if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz,
NULL, 0)) == 0) NULL, 0)) == 0)
write_cb(cbopaque, bv ? "P" : "p"); write_cb(cbopaque, bv ? "P" : "p");
if ((err = JEMALLOC_P(mallctl)("opt.prof_accum", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "R" : "r");
if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz, if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz,
NULL, 0)) == 0) NULL, 0)) == 0)
write_cb(cbopaque, bv ? "U" : "u"); write_cb(cbopaque, bv ? "U" : "u");
@ -580,6 +583,17 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
write_cb(cbopaque, umax2s((1U << sv), 10, s)); write_cb(cbopaque, umax2s((1U << sv), 10, s));
write_cb(cbopaque, "\n"); write_cb(cbopaque, "\n");
CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t);
write_cb(cbopaque,
"Maximum per thread backtrace cache: ");
if (ssv >= 0) {
write_cb(cbopaque, umax2s((1U << ssv), 10, s));
write_cb(cbopaque, " (2^");
write_cb(cbopaque, umax2s(ssv, 10, s));
write_cb(cbopaque, ")\n");
} else
write_cb(cbopaque, "N/A\n");
CTL_GET("opt.lg_prof_sample", &sv, size_t); CTL_GET("opt.lg_prof_sample", &sv, size_t);
write_cb(cbopaque, "Average profile sample interval: "); write_cb(cbopaque, "Average profile sample interval: ");
write_cb(cbopaque, umax2s((1U << sv), 10, s)); write_cb(cbopaque, umax2s((1U << sv), 10, s));