Make cumulative heap profile data optional.

Add the R option to control whether cumulative heap profile data
are maintained.  Add the T option to control the size of per thread
backtrace caches, primarily because when the R option is disabled,
backtraces that no longer have allocations associated with them are
discarded as soon as no thread caches refer to them.
Jason Evans 2010-10-02 15:18:50 -07:00
parent 4d5c09905e
commit a881cd2c61
8 changed files with 328 additions and 124 deletions
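
For illustration only (not part of this commit): with the option parsing added below, cumulative counts can be disabled and each per thread backtrace cache capped at 1024 entries by setting JEMALLOC_OPTIONS to "r11T". A minimal sketch, assuming options are parsed lazily at the first allocation; exporting the variable from the shell before the program starts avoids any ordering concerns.

#include <stdlib.h>

int
main(void)
{
	void *p;

	/*
	 * 'r' disables cumulative counts (opt_prof_accum = false); eleven
	 * 'T's raise opt_lg_prof_tcmax from -1 to 10, i.e. a 1024-entry
	 * per thread backtrace cache.
	 */
	setenv("JEMALLOC_OPTIONS", "r11T", 1);

	p = malloc(64);		/* First allocation: options take effect. */
	free(p);
	return (0);
}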

View File

@@ -484,6 +484,12 @@ will disable dirty page purging.
@roff_prof@.Dq S
@roff_prof@option for probabilistic sampling control.
@roff_prof@See the
@roff_prof@.Dq R
@roff_prof@option for control of cumulative sample reporting.
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option for control of per thread backtrace caching.
@roff_prof@See the
@roff_prof@.Dq I
@roff_prof@option for information on interval-triggered profile dumping, and the
@roff_prof@.Dq U
@@ -595,6 +601,18 @@ Double/halve the size of the maximum size class that is a multiple of the
quantum (8 or 16 bytes, depending on architecture).
Above this size, cacheline spacing is used for size classes.
The default value is 128 bytes.
@roff_prof@.It R
@roff_prof@Enable/disable reporting of cumulative object/byte counts in profile
@roff_prof@dumps.
@roff_prof@If this option is enabled, every unique backtrace must be stored for
@roff_prof@the duration of execution.
@roff_prof@Depending on the application, this can impose a large memory
@roff_prof@overhead, and the cumulative counts are not always of interest.
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option for control of per thread backtrace caching, which interacts
@roff_prof@with this option: unused backtraces can only be discarded when this
@roff_prof@option is disabled.
@roff_prof@This option is enabled by default.
@roff_prof@.It S
@roff_prof@Double/halve the average interval between allocation samples, as
@roff_prof@measured in bytes of allocation activity.
@@ -602,6 +620,22 @@ The default value is 128 bytes.
@roff_prof@also decreases the computational overhead.
@roff_prof@The default sample interval is one (i.e. all allocations are
@roff_prof@sampled).
@roff_prof@.It T
@roff_prof@Double/halve the maximum per thread backtrace cache used for heap
@roff_prof@profiling.
@roff_prof@A backtrace can only be discarded if the
@roff_prof@.Dq R
@roff_prof@option is disabled, and no thread caches currently refer to the
@roff_prof@backtrace.
@roff_prof@Therefore, a backtrace cache limit should be imposed if the
@roff_prof@intention is to limit how much memory is used by backtraces.
@roff_prof@By default, no limit is imposed.
@roff_prof@This is internally encoded as (1 << -1), and each
@roff_prof@.Dq T
@roff_prof@that is specified increments the shift amount.
@roff_prof@Therefore, e.g.
@roff_prof@.Ev JEMALLOC_OPTIONS=11T
@roff_prof@specifies a backtrace cache limit of 1024 backtraces.
@roff_prof@.It U
@roff_prof@Trigger a memory profile dump every time the total virtual memory
@roff_prof@exceeds the previous maximum.
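
A small worked example (not part of the man page or the commit) of the encoding described above for the T option: the shift starts at -1, each T adds one, and the resulting limit is 1 << shift. The helper name is invented for illustration.

#include <stdio.h>
#include <sys/types.h>

/* Hypothetical helper: number of 'T' flags --> lg of the cache limit. */
static ssize_t
lg_tcmax_from_flags(unsigned nflags)
{

	return ((ssize_t)nflags - 1);	/* Zero flags yields -1: no limit. */
}

int
main(void)
{
	ssize_t lg = lg_tcmax_from_flags(11);

	if (lg >= 0) {
		/* Prints: 11T -> lg 10 -> 1024 backtraces */
		printf("11T -> lg %zd -> %zu backtraces\n", lg,
		    (size_t)1 << lg);
	} else
		printf("no limit\n");
	return (0);
}
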
@@ -992,6 +1026,27 @@ option.
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.prof_accum (bool) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq R
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq T
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the
@roff_prof@.Dq S
@roff_prof@option.
@roff_prof@.Ed
.\"-----------------------------------------------------------------------------
@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-"
@roff_prof@.Bd -ragged -offset indent -compact
@roff_prof@See the

View File

@@ -24,6 +24,7 @@ extern bool isthreaded;
#endif
bool malloc_mutex_init(malloc_mutex_t *mutex);
void malloc_mutex_destroy(malloc_mutex_t *mutex);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/

View File

@@ -6,12 +6,13 @@ typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_s prof_t;
typedef struct prof_tcache_s prof_tcache_t;
/* Option defaults. */
#define LG_PROF_BT_MAX_DEFAULT 2
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1
#define LG_PROF_TCMAX_DEFAULT -1
/*
* Hard limit on stack backtrace depth. Note that the version of
@@ -41,9 +42,9 @@ struct prof_bt_s {
#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
prof_bt_t *bt;
unsigned nignore;
unsigned max;
prof_bt_t *bt;
unsigned nignore;
unsigned max;
} prof_unwind_data_t;
#endif
@@ -51,11 +52,11 @@ struct prof_cnt_s {
/*
* Profiling counters. An allocation/deallocation pair can operate on
* different prof_thr_cnt_t objects that are linked into the same
* prof_ctx_t sets_ql, so it is possible for the cur* counters to go
* prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
* negative. In principle it is possible for the *bytes counters to
* overflow/underflow, but a general solution would require some form
* of 128-bit counter solution; this implementation doesn't bother to
* solve that problem.
* overflow/underflow, but a general solution would require something
* like 128-bit counters; this implementation doesn't bother to solve
* that problem.
*/
int64_t curobjs;
int64_t curbytes;
@@ -64,15 +65,18 @@ struct prof_cnt_s {
};
struct prof_thr_cnt_s {
/* Linkage into prof_ctx_t's sets_ql. */
ql_elm(prof_thr_cnt_t) link;
/* Linkage into prof_ctx_t's cnts_ql. */
ql_elm(prof_thr_cnt_t) cnts_link;
/* Linkage into thread's LRU. */
ql_elm(prof_thr_cnt_t) lru_link;
/*
* Associated context. If a thread frees an object that it did not
* allocate, it is possible that the context is not cached in the
* thread's hash table, in which case it must be able to look up the
* context, insert a new prof_thr_cnt_t into the thread's hash table,
* and link it into the prof_ctx_t's sets_ql.
* and link it into the prof_ctx_t's cnts_ql.
*/
prof_ctx_t *ctx;
@@ -101,11 +105,11 @@ struct prof_ctx_s {
/* Associated backtrace. */
prof_bt_t *bt;
/* Protects cnt_merged and sets_ql. */
/* Protects cnt_merged and cnts_ql. */
malloc_mutex_t lock;
/* Temporary storage for aggregation during dump. */
prof_cnt_t cnt_dump;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* When threads exit, they merge their stats into cnt_merged. */
prof_cnt_t cnt_merged;
@@ -117,6 +121,24 @@ struct prof_ctx_s {
ql_head(prof_thr_cnt_t) cnts_ql;
};
/*
* Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread
* keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no others
* will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
* into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
* objects.
*/
struct prof_tcache_s {
/* (prof_bt_t *)-->(prof_thr_cnt_t *). */
ckh_t bt2cnt;
/* LRU for contents of bt2cnt. */
ql_head(prof_thr_cnt_t) lru_ql;
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
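
A minimal, self-contained sketch of the policy the prof_tcache_s comment above implies: a bounded per thread cache of backtrace counters with LRU eviction, where an evicted counter is first merged into its global context. All names and the array-based cache are stand-ins invented for illustration; jemalloc itself uses a ckh cuckoo hash plus the intrusive lru_ql list.

#include <stdio.h>
#include <string.h>

#define CACHE_SLOTS 4	/* Stands in for (1 << opt_lg_prof_tcmax). */

typedef struct {
	const char *bt;		/* Stand-in for a prof_bt_t key. */
	long curobjs;		/* Stand-in for prof_cnt_t. */
	unsigned long stamp;	/* Larger == more recently used. */
} slot_t;

static slot_t cache[CACHE_SLOTS];
static unsigned long lru_clock;

/* Stand-in for merging an evicted counter into its global prof_ctx_t. */
static void
merge_to_ctx(slot_t *s)
{

	printf("merged %s: curobjs=%ld\n", s->bt, s->curobjs);
}

static slot_t *
cache_lookup(const char *bt)
{
	slot_t *victim = &cache[0];
	int i;

	for (i = 0; i < CACHE_SLOTS; i++) {
		if (cache[i].bt != NULL && strcmp(cache[i].bt, bt) == 0) {
			cache[i].stamp = ++lru_clock;	/* LRU touch. */
			return (&cache[i]);
		}
		if (cache[i].stamp < victim->stamp)
			victim = &cache[i];
	}
	/* Miss: recycle the least recently used slot, merging it first. */
	if (victim->bt != NULL)
		merge_to_ctx(victim);
	victim->bt = bt;
	victim->curobjs = 0;
	victim->stamp = ++lru_clock;
	return (victim);
}

int
main(void)
{
	const char *bts[] = {"bt#1", "bt#2", "bt#3", "bt#4", "bt#5", "bt#1"};
	unsigned i;

	/* The fifth lookup evicts "bt#1", the least recently used entry. */
	for (i = 0; i < sizeof(bts) / sizeof(bts[0]); i++)
		cache_lookup(bts[i])->curobjs++;
	return (0);
}
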
@@ -129,11 +151,13 @@ extern bool opt_prof;
* to notice state changes.
*/
extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_udump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_udump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread backtrace cache). */
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
@@ -150,9 +174,6 @@ extern uint64_t prof_interval;
*/
extern bool prof_promote;
bool prof_init(prof_t *prof, bool master);
void prof_destroy(prof_t *prof);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);

View File

@@ -82,6 +82,8 @@ CTL_PROTO(opt_lg_prof_sample)
CTL_PROTO(opt_lg_prof_interval)
CTL_PROTO(opt_prof_udump)
CTL_PROTO(opt_prof_leak)
CTL_PROTO(opt_prof_accum)
CTL_PROTO(opt_lg_prof_tcmax)
#endif
CTL_PROTO(opt_stats_print)
CTL_PROTO(opt_lg_qspace_max)
@@ -260,6 +262,8 @@ static const ctl_node_t opt_node[] = {
{NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)},
{NAME("prof_udump"), CTL(opt_prof_udump)},
{NAME("prof_leak"), CTL(opt_prof_leak)},
{NAME("prof_accum"), CTL(opt_prof_accum)},
{NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)},
#endif
{NAME("stats_print"), CTL(opt_stats_print)},
{NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)},
@@ -1207,6 +1211,8 @@ CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool)
CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)
CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool)
CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t)
#endif
CTL_RO_GEN(opt_stats_print, opt_stats_print, bool)
CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t)

View File

@@ -512,6 +512,12 @@ MALLOC_OUT:
opt_lg_qspace_max++;
break;
#ifdef JEMALLOC_PROF
case 'r':
opt_prof_accum = false;
break;
case 'R':
opt_prof_accum = true;
break;
case 's':
if (opt_lg_prof_sample > 0)
opt_lg_prof_sample--;
@@ -521,6 +527,15 @@
(sizeof(uint64_t) << 3))
opt_lg_prof_sample++;
break;
case 't':
if (opt_lg_prof_tcmax >= 0)
opt_lg_prof_tcmax--;
break;
case 'T':
if (opt_lg_prof_tcmax + 1 <
(sizeof(size_t) << 3))
opt_lg_prof_tcmax++;
break;
case 'u':
opt_prof_udump = false;
break;

View File

@@ -72,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex)
return (false);
}
void
malloc_mutex_destroy(malloc_mutex_t *mutex)
{
if (pthread_mutex_destroy(mutex) != 0) {
malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
abort();
}
}

View File

@@ -24,47 +24,41 @@ size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
bool opt_prof_udump = false;
bool opt_prof_leak = false;
bool opt_prof_accum = true;
ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT;
uint64_t prof_interval;
bool prof_promote;
/*
* Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data
* structure that knows about all backtraces ever captured.
* structure that knows about all backtraces currently captured.
*/
static ckh_t bt2ctx;
static malloc_mutex_t bt2ctx_mtx;
/*
* Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread
* keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no others
* will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data
* into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t
* objects.
*/
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec"));
# define BT2CNT_GET() bt2cnt_tls
# define BT2CNT_SET(v) do { \
bt2cnt_tls = (v); \
pthread_setspecific(bt2cnt_tsd, (void *)(v)); \
static __thread prof_tcache_t *prof_tcache_tls
JEMALLOC_ATTR(tls_model("initial-exec"));
# define PROF_TCACHE_GET() prof_tcache_tls
# define PROF_TCACHE_SET(v) do { \
prof_tcache_tls = (v); \
pthread_setspecific(prof_tcache_tsd, (void *)(v)); \
} while (0)
#else
# define BT2CNT_GET() ((ckh_t *)pthread_getspecific(bt2cnt_tsd))
# define BT2CNT_SET(v) do { \
pthread_setspecific(bt2cnt_tsd, (void *)(v)); \
# define PROF_TCACHE_GET() ((prof_tcache_t *)pthread_getspecific(prof_tcache_tsd))
# define PROF_TCACHE_SET(v) do { \
pthread_setspecific(prof_tcache_tsd, (void *)(v)); \
} while (0)
#endif
/*
* Same contents as b2cnt_tls, but initialized such that the TSD destructor is
* called when a thread exits, so that bt2cnt_tls contents can be merged,
* called when a thread exits, so that prof_tcache_tls contents can be merged,
* unlinked, and deallocated.
*/
static pthread_key_t bt2cnt_tsd;
static pthread_key_t prof_tcache_tsd;
/* (1U << opt_lg_prof_bt_max). */
static unsigned prof_bt_max;
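
A minimal, self-contained sketch (not jemalloc code) of the pthread TSD destructor mechanism the comment above relies on: pthread_key_create() registers a destructor that runs at thread exit for any thread whose key value is non-NULL, which is what lets the per thread cache be merged and freed.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_key_t cache_tsd;

/* Runs automatically at thread exit when the key's value is non-NULL. */
static void
cache_cleanup(void *arg)
{

	printf("cleanup for %p\n", arg);	/* Merge/free would happen here. */
	free(arg);
}

static void *
thread_main(void *arg)
{

	(void)arg;
	pthread_setspecific(cache_tsd, malloc(64));	/* Stand-in cache. */
	return (NULL);
}

int
main(void)
{
	pthread_t thd;

	pthread_key_create(&cache_tsd, cache_cleanup);
	pthread_create(&thd, NULL, thread_main, NULL);
	pthread_join(thd, NULL);
	return (0);
}
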
@@ -137,6 +131,7 @@ static bool enq_udump;
static prof_bt_t *bt_dup(prof_bt_t *bt);
static void bt_init(prof_bt_t *bt, void **vec);
static void bt_destroy(prof_bt_t *bt);
#ifdef JEMALLOC_PROF_LIBGCC
static _Unwind_Reason_Code prof_unwind_init_callback(
struct _Unwind_Context *context, void *arg);
@@ -148,8 +143,10 @@ static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
static bool prof_flush(bool propagate_err);
static bool prof_write(const char *s, bool propagate_err);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
size_t *leak_nctx);
static void prof_ctx_destroy(prof_ctx_t *ctx);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt);
static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt,
bool propagate_err);
static bool prof_dump_maps(bool propagate_err);
@@ -160,7 +157,7 @@ static void prof_fdump(void);
static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1,
size_t *hash2);
static bool prof_bt_keycomp(const void *k1, const void *k2);
static void bt2cnt_thread_cleanup(void *arg);
static void prof_tcache_cleanup(void *arg);
#ifdef NO_TLS
static void prof_sample_state_thread_cleanup(void *arg);
#endif
@@ -175,6 +172,13 @@ bt_init(prof_bt_t *bt, void **vec)
bt->len = 0;
}
static void
bt_destroy(prof_bt_t *bt)
{
idalloc(bt);
}
static prof_bt_t *
bt_dup(prof_bt_t *bt)
{
@@ -487,23 +491,25 @@ prof_lookup(prof_bt_t *bt)
prof_thr_cnt_t *p;
void *v;
} ret;
ckh_t *bt2cnt = BT2CNT_GET();
prof_tcache_t *prof_tcache = PROF_TCACHE_GET();
if (bt2cnt == NULL) {
if (prof_tcache == NULL) {
/* Initialize an empty cache for this thread. */
bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t));
if (bt2cnt == NULL)
prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t));
if (prof_tcache == NULL)
return (NULL);
if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash,
prof_bt_keycomp)) {
idalloc(bt2cnt);
if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS,
prof_bt_hash, prof_bt_keycomp)) {
idalloc(prof_tcache);
return (NULL);
}
ql_new(&prof_tcache->lru_ql);
BT2CNT_SET(bt2cnt);
PROF_TCACHE_SET(prof_tcache);
}
if (ckh_search(bt2cnt, bt, NULL, &ret.v)) {
if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) {
union {
prof_bt_t *p;
void *v;
@@ -519,7 +525,6 @@ prof_lookup(prof_bt_t *bt)
*/
prof_enter();
if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) {
/* bt has never been seen before. Insert it. */
ctx.v = imalloc(sizeof(prof_ctx_t));
if (ctx.v == NULL) {
@@ -544,28 +549,60 @@
if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
/* OOM. */
prof_leave();
malloc_mutex_destroy(&ctx.p->lock);
idalloc(btkey.v);
idalloc(ctx.v);
return (NULL);
}
}
/*
* Acquire ctx's lock before releasing bt2ctx_mtx, in order to
* avoid a race condition with prof_ctx_destroy().
*/
malloc_mutex_lock(&ctx.p->lock);
prof_leave();
/* Link a prof_thr_cnt_t into ctx for this thread. */
ret.v = imalloc(sizeof(prof_thr_cnt_t));
if (ret.p == NULL)
return (NULL);
ql_elm_new(ret.p, link);
if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt)
== (ZU(1) << opt_lg_prof_tcmax)) {
assert(ckh_count(&prof_tcache->bt2cnt) > 0);
/*
* Flush the least recently used cnt in order to
* keep bt2cnt from becoming too large.
*/
ret.p = ql_last(&prof_tcache->lru_ql, lru_link);
assert(ret.v != NULL);
ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL,
NULL);
ql_remove(&prof_tcache->lru_ql, ret.p, lru_link);
prof_ctx_merge(ret.p->ctx, ret.p);
/* ret can now be re-used. */
} else {
assert(opt_lg_prof_tcmax < 0 ||
ckh_count(&prof_tcache->bt2cnt) < (ZU(1) <<
opt_lg_prof_tcmax));
/* Allocate and partially initialize a new cnt. */
ret.v = imalloc(sizeof(prof_thr_cnt_t));
if (ret.p == NULL)
return (NULL);
ql_elm_new(ret.p, cnts_link);
ql_elm_new(ret.p, lru_link);
}
/* Finish initializing ret. */
ret.p->ctx = ctx.p;
ret.p->epoch = 0;
memset(&ret.p->cnts, 0, sizeof(prof_cnt_t));
if (ckh_insert(bt2cnt, btkey.v, ret.v)) {
if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) {
idalloc(ret.v);
return (NULL);
}
malloc_mutex_lock(&ctx.p->lock);
ql_tail_insert(&ctx.p->cnts_ql, ret.p, link);
ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link);
ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
malloc_mutex_unlock(&ctx.p->lock);
} else {
/* Move ret to the front of the LRU. */
ql_remove(&prof_tcache->lru_ql, ret.p, lru_link);
ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link);
}
return (ret.p);
@@ -729,8 +766,10 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
/*********/
mb_write();
/*********/
@@ -796,8 +835,10 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
}
/*********/
mb_write();
@@ -896,15 +937,15 @@ prof_write(const char *s, bool propagate_err)
}
static void
prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
{
prof_thr_cnt_t *thr_cnt;
prof_cnt_t tcnt;
malloc_mutex_lock(&ctx->lock);
memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t));
ql_foreach(thr_cnt, &ctx->cnts_ql, link) {
memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t));
ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) {
volatile unsigned *epoch = &thr_cnt->epoch;
while (true) {
@@ -921,39 +962,101 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
break;
}
ctx->cnt_dump.curobjs += tcnt.curobjs;
ctx->cnt_dump.curbytes += tcnt.curbytes;
ctx->cnt_dump.accumobjs += tcnt.accumobjs;
ctx->cnt_dump.accumbytes += tcnt.accumbytes;
ctx->cnt_summed.curobjs += tcnt.curobjs;
ctx->cnt_summed.curbytes += tcnt.curbytes;
if (opt_prof_accum) {
ctx->cnt_summed.accumobjs += tcnt.accumobjs;
ctx->cnt_summed.accumbytes += tcnt.accumbytes;
}
if (tcnt.curobjs != 0)
(*leak_nctx)++;
}
/* Merge into cnt_all. */
cnt_all->curobjs += ctx->cnt_dump.curobjs;
cnt_all->curbytes += ctx->cnt_dump.curbytes;
cnt_all->accumobjs += ctx->cnt_dump.accumobjs;
cnt_all->accumbytes += ctx->cnt_dump.accumbytes;
/* Add to cnt_all. */
cnt_all->curobjs += ctx->cnt_summed.curobjs;
cnt_all->curbytes += ctx->cnt_summed.curbytes;
if (opt_prof_accum) {
cnt_all->accumobjs += ctx->cnt_summed.accumobjs;
cnt_all->accumbytes += ctx->cnt_summed.accumbytes;
}
malloc_mutex_unlock(&ctx->lock);
}
static void
prof_ctx_destroy(prof_ctx_t *ctx)
{
/*
* Check that ctx is still unused by any thread cache before destroying
* it. prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to
* avoid a race condition with this function.
*/
prof_enter();
malloc_mutex_lock(&ctx->lock);
if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) {
assert(ctx->cnt_merged.curbytes == 0);
assert(ctx->cnt_merged.accumobjs == 0);
assert(ctx->cnt_merged.accumbytes == 0);
/* Remove ctx from bt2ctx. */
ckh_remove(&bt2ctx, ctx->bt, NULL, NULL);
prof_leave();
/* Destroy ctx. */
malloc_mutex_unlock(&ctx->lock);
bt_destroy(ctx->bt);
malloc_mutex_destroy(&ctx->lock);
idalloc(ctx);
} else {
malloc_mutex_unlock(&ctx->lock);
prof_leave();
}
}
static void
prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
{
bool destroy;
/* Merge cnt stats and detach from ctx. */
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
ql_remove(&ctx->cnts_ql, cnt, cnts_link);
if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL &&
ctx->cnt_merged.curobjs == 0)
destroy = true;
else
destroy = false;
malloc_mutex_unlock(&ctx->lock);
if (destroy)
prof_ctx_destroy(ctx);
}
static bool
prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err)
{
char buf[UMAX2S_BUFSIZE];
unsigned i;
if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err)
if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) {
assert(ctx->cnt_summed.curbytes == 0);
assert(ctx->cnt_summed.accumobjs == 0);
assert(ctx->cnt_summed.accumbytes == 0);
return (false);
}
if (prof_write(umax2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err)
|| prof_write(": ", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf),
|| prof_write(umax2s(ctx->cnt_summed.curbytes, 10, buf),
propagate_err)
|| prof_write(" [", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf),
|| prof_write(umax2s(ctx->cnt_summed.accumobjs, 10, buf),
propagate_err)
|| prof_write(": ", propagate_err)
|| prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf),
|| prof_write(umax2s(ctx->cnt_summed.accumbytes, 10, buf),
propagate_err)
|| prof_write("] @", propagate_err))
return (true);
@@ -1060,7 +1163,7 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err)
leak_nctx = 0;
for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v)
== false;) {
prof_ctx_merge(ctx.p, &cnt_all, &leak_nctx);
prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx);
}
/* Dump profile header. */
@@ -1319,54 +1422,33 @@ prof_bt_keycomp(const void *k1, const void *k2)
}
static void
bt2cnt_thread_cleanup(void *arg)
prof_tcache_cleanup(void *arg)
{
ckh_t *bt2cnt;
prof_tcache_t *prof_tcache;
bt2cnt = BT2CNT_GET();
if (bt2cnt != NULL) {
ql_head(prof_thr_cnt_t) cnts_ql;
size_t tabind;
union {
prof_thr_cnt_t *p;
void *v;
} cnt;
/* Iteratively merge cnt's into the global stats. */
ql_new(&cnts_ql);
tabind = 0;
while (ckh_iter(bt2cnt, &tabind, NULL, &cnt.v) ==
false) {
prof_ctx_t *ctx = cnt.p->ctx;
/* Merge stats and detach from ctx. */
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs += cnt.p->cnts.curobjs;
ctx->cnt_merged.curbytes += cnt.p->cnts.curbytes;
ctx->cnt_merged.accumobjs += cnt.p->cnts.accumobjs;
ctx->cnt_merged.accumbytes += cnt.p->cnts.accumbytes;
ql_remove(&ctx->cnts_ql, cnt.p, link);
malloc_mutex_unlock(&ctx->lock);
/*
* Stash cnt for deletion after finishing with
* ckh_iter().
*/
ql_tail_insert(&cnts_ql, cnt.p, link);
}
prof_tcache = PROF_TCACHE_GET();
if (prof_tcache != NULL) {
prof_thr_cnt_t *cnt;
/*
* Delete the hash table now that cnts_ql has a list of all
* cnt's.
* Delete the hash table. All of its contents can still be
* iterated over via the LRU.
*/
ckh_delete(bt2cnt);
idalloc(bt2cnt);
BT2CNT_SET(NULL);
ckh_delete(&prof_tcache->bt2cnt);
/* Delete cnt's. */
while ((cnt.p = ql_last(&cnts_ql, link)) != NULL) {
ql_remove(&cnts_ql, cnt.p, link);
idalloc(cnt.v);
/*
* Iteratively merge cnt's into the global stats and delete
* them.
*/
while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) !=
NULL) {
prof_ctx_merge(cnt->ctx, cnt);
ql_remove(&prof_tcache->lru_ql, cnt, lru_link);
idalloc(cnt);
}
idalloc(prof_tcache);
PROF_TCACHE_SET(NULL);
}
}
@@ -1419,7 +1501,7 @@ prof_boot1(void)
return (true);
if (malloc_mutex_init(&bt2ctx_mtx))
return (true);
if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup)
if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup)
!= 0) {
malloc_write(
"<jemalloc>: Error in pthread_key_create()\n");

View File

@@ -469,6 +469,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "P" : "p");
if ((err = JEMALLOC_P(mallctl)("opt.prof_accum", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "R" : "r");
if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "U" : "u");
@@ -580,6 +583,17 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
write_cb(cbopaque, umax2s((1U << sv), 10, s));
write_cb(cbopaque, "\n");
CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t);
write_cb(cbopaque,
"Maximum per thread backtrace cache: ");
if (ssv >= 0) {
write_cb(cbopaque, umax2s((1U << ssv), 10, s));
write_cb(cbopaque, " (2^");
write_cb(cbopaque, umax2s(ssv, 10, s));
write_cb(cbopaque, ")\n");
} else
write_cb(cbopaque, "N/A\n");
CTL_GET("opt.lg_prof_sample", &sv, size_t);
write_cb(cbopaque, "Average profile sample interval: ");
write_cb(cbopaque, umax2s((1U << sv), 10, s));