diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 308d0c65..8f4327f3 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1047,7 +1047,7 @@ malloc_conf = "xmalloc:true";]]> opt.lg_prof_sample - (ssize_t) + (size_t) r- [] @@ -1243,6 +1243,35 @@ malloc_conf = "xmalloc:true";]]> the developer may find manual flushing useful. + + + thread.prof.name + (const char *) + rw + [] + + Get/set the descriptive name associated with the calling + thread in memory profile dumps. An internal copy of the name string is + created, so the input string need not be maintained after this interface + completes execution. The output string of this interface should be + copied for non-ephemeral uses, because multiple implementation details + can cause asynchronous string deallocation. + + + + + thread.prof.active + (bool) + rw + [] + + Control whether sampling is currently active for the + calling thread. This is a deactivation mechanism in addition to prof.active; both must + be active for the calling thread to sample. This flag is enabled by + default. + + arena.<i>.purge @@ -1492,6 +1521,31 @@ malloc_conf = "xmalloc:true";]]> option. + + + prof.reset + (size_t) + -w + [] + + Reset all memory profile statistics, and optionally + update the sample rate (see opt.lg_prof_sample). + + + + + + prof.lg_sample + (size_t) + r- + [] + + Get the sample rate (see opt.lg_prof_sample). + + + prof.interval diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9351e3b0..f3f6426c 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -58,7 +58,7 @@ typedef struct arena_s arena_t; struct arena_chunk_map_s { #ifndef JEMALLOC_PROF /* - * Overlay prof_ctx in order to allow it to be referenced by dead code. + * Overlay prof_tctx in order to allow it to be referenced by dead code. * Such antics aren't warranted for per arena data structures, but * chunk map overhead accounts for a percentage of memory, rather than * being just a fixed cost. @@ -75,7 +75,7 @@ struct arena_chunk_map_s { rb_node(arena_chunk_map_t) rb_link; /* Profile counters, used for large object runs. */ - prof_ctx_t *prof_ctx; + prof_tctx_t *prof_tctx; #ifndef JEMALLOC_PROF }; /* union { ... 
}; */ #endif @@ -472,8 +472,8 @@ size_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits); size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +prof_tctx_t *arena_prof_tctx_get(const void *ptr); +void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); @@ -987,10 +987,10 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) return (regind); } -JEMALLOC_INLINE prof_ctx_t * -arena_prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +arena_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; size_t pageind, mapbits; @@ -1003,15 +1003,15 @@ arena_prof_ctx_get(const void *ptr) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) - ret = (prof_ctx_t *)(uintptr_t)1U; + ret = (prof_tctx_t *)(uintptr_t)1U; else - ret = arena_mapp_get(chunk, pageind)->prof_ctx; + ret = arena_mapp_get(chunk, pageind)->prof_tctx; return (ret); } JEMALLOC_INLINE void -arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; size_t pageind; @@ -1025,7 +1025,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); if (arena_mapbits_large_get(chunk, pageind) != 0) - arena_mapp_get(chunk, pageind)->prof_ctx = ctx; + arena_mapp_get(chunk, pageind)->prof_tctx = tctx; } JEMALLOC_ALWAYS_INLINE void * diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 000ef6d5..5b00076f 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -16,7 +16,7 @@ struct extent_node_s { rb_node(extent_node_t) link_ad; /* Profile counters, used for huge objects. */ - prof_ctx_t *prof_ctx; + prof_tctx_t *prof_tctx; /* Pointer to the extent that this tree node is responsible for. 
*/ void *addr; diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 1e545367..2ec77520 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -21,8 +21,8 @@ extern huge_dalloc_junk_t *huge_dalloc_junk; #endif void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); -prof_ctx_t *huge_prof_ctx_get(const void *ptr); -void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +prof_tctx_t *huge_prof_tctx_get(const void *ptr); +void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); bool huge_boot(void); void huge_prefork(void); void huge_postfork_parent(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 3401301c..13505455 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -48,9 +48,9 @@ arena_prefork arena_prof_accum arena_prof_accum_impl arena_prof_accum_locked -arena_prof_ctx_get -arena_prof_ctx_set arena_prof_promoted +arena_prof_tctx_get +arena_prof_tctx_set arena_ptr_small_binind_get arena_purge_all arena_quarantine_junk_small @@ -208,8 +208,8 @@ huge_palloc huge_postfork_child huge_postfork_parent huge_prefork -huge_prof_ctx_get -huge_prof_ctx_set +huge_prof_tctx_get +huge_prof_tctx_set huge_ralloc huge_ralloc_no_move huge_salloc @@ -287,28 +287,31 @@ opt_zero p2rz pages_purge pow2_ceil +prof_alloc_prep prof_backtrace prof_boot0 prof_boot1 prof_boot2 prof_bt_count -prof_ctx_get -prof_ctx_set prof_dump_open prof_free +prof_free_sampled_object prof_gdump prof_idump prof_interval prof_lookup prof_malloc -prof_malloc_record_object +prof_malloc_sample_object prof_mdump prof_postfork_child prof_postfork_parent prof_prefork prof_realloc +prof_reset prof_sample_accum_update prof_sample_threshold_update +prof_tctx_get +prof_tctx_set prof_tdata_booted prof_tdata_cleanup prof_tdata_get @@ -322,6 +325,10 @@ prof_tdata_tsd_get prof_tdata_tsd_get_wrapper prof_tdata_tsd_init_head prof_tdata_tsd_set +prof_thread_active_get +prof_thread_active_set +prof_thread_name_get +prof_thread_name_set quarantine quarantine_alloc_hook quarantine_boot diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 9398ad91..104bfade 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -1,11 +1,10 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES -typedef uint64_t prof_thr_uid_t; typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; -typedef struct prof_thr_cnt_s prof_thr_cnt_t; -typedef struct prof_ctx_s prof_ctx_t; +typedef struct prof_tctx_s prof_tctx_t; +typedef struct prof_gctx_s prof_gctx_t; typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ @@ -34,11 +33,17 @@ typedef struct prof_tdata_s prof_tdata_t; #define PROF_PRINTF_BUFSIZE 128 /* - * Number of mutexes shared among all ctx's. No space is allocated for these + * Number of mutexes shared among all gctx's. No space is allocated for these * unless profiling is enabled, so it's okay to over-provision. */ #define PROF_NCTX_LOCKS 1024 +/* + * Number of mutexes shared among all tdata's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NTDATA_LOCKS 256 + /* * prof_tdata pointers close to NULL are used to encode state information that * is used for cleaning up during thread shutdown. 
@@ -66,87 +71,70 @@ typedef struct { #endif struct prof_cnt_s { - /* - * Profiling counters. An allocation/deallocation pair can operate on - * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go - * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require something - * like 128-bit counters; this implementation doesn't bother to solve - * that problem. - */ - int64_t curobjs; - int64_t curbytes; + /* Profiling counters. */ + uint64_t curobjs; + uint64_t curbytes; uint64_t accumobjs; uint64_t accumbytes; }; -struct prof_thr_cnt_s { - prof_thr_uid_t thr_uid; +typedef enum { + prof_tctx_state_nominal, + prof_tctx_state_dumping, + prof_tctx_state_purgatory /* Dumper must finish destroying. */ +} prof_tctx_state_t; - /* Linkage into prof_ctx_t's thr_cnts. */ - rb_node(prof_thr_cnt_t) thr_cnt_link; +struct prof_tctx_s { + /* Thread data for thread that performed the allocation. */ + prof_tdata_t *tdata; - /* - * Associated context. If a thread frees an object that it did not - * allocate, it is possible that the context is not present in the - * thread's hash table, in which case it must be able to look up the - * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's thr_cnts. - */ - prof_ctx_t *ctx; - - /* - * Threads use memory barriers to update the counters. Since there is - * only ever one writer, the only challenge is for the reader to get a - * consistent read of the counters. - * - * The writer uses this series of operations: - * - * 1) Increment epoch to an odd number. - * 2) Update counters. - * 3) Increment epoch to an even number. - * - * The reader must assure 1) that the epoch is even while it reads the - * counters, and 2) that the epoch doesn't change between the time it - * starts and finishes reading the counters. - */ - unsigned epoch; - - /* Profiling counters. */ + /* Profiling counters, protected by tdata->lock. */ prof_cnt_t cnts; -}; -typedef rb_tree(prof_thr_cnt_t) prof_thr_cnt_tree_t; -struct prof_ctx_s { - /* Protects nlimbo, cnt_merged, and thr_cnts. */ + /* Associated global context. */ + prof_gctx_t *gctx; + + /* Linkage into gctx's tctxs. */ + rb_node(prof_tctx_t) tctx_link; + + /* Current dump-related state, protected by gctx->lock. */ + prof_tctx_state_t state; + + /* + * Copy of cnts snapshotted during early dump phase, protected by + * dump_mtx. + */ + prof_cnt_t dump_cnts; +}; +typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; + +struct prof_gctx_s { + /* Protects nlimbo, cnt_summed, and tctxs. */ malloc_mutex_t *lock; /* - * Number of threads that currently cause this ctx to be in a state of + * Number of threads that currently cause this gctx to be in a state of * limbo due to one of: - * - Initializing per thread counters associated with this ctx. - * - Preparing to destroy this ctx. - * - Dumping a heap profile that includes this ctx. + * - Initializing this gctx. + * - Initializing per thread counters associated with this gctx. + * - Preparing to destroy this gctx. + * - Dumping a heap profile that includes this gctx. * nlimbo must be 1 (single destroyer) in order to safely destroy the - * ctx. + * gctx. */ unsigned nlimbo; - /* Temporary storage for summation during dump. */ - prof_cnt_t cnt_summed; - - /* When threads exit, they merge their stats into cnt_merged. 
*/ - prof_cnt_t cnt_merged; - /* * Tree of profile counters, one for each thread that has allocated in * this context. */ - prof_thr_cnt_tree_t thr_cnts; + prof_tctx_tree_t tctxs; /* Linkage for tree of contexts to be dumped. */ - rb_node(prof_ctx_t) dump_link; + rb_node(prof_gctx_t) dump_link; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* Associated backtrace. */ prof_bt_t bt; @@ -154,21 +142,34 @@ struct prof_ctx_s { /* Backtrace vector, variable size, referred to by bt. */ void *vec[1]; }; -typedef rb_tree(prof_ctx_t) prof_ctx_tree_t; +typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; + +typedef enum { + prof_tdata_state_attached, /* Active thread attached, data valid. */ + prof_tdata_state_detached, /* Defunct thread, data remain valid. */ + prof_tdata_state_expired /* Predates reset, omit data from dump. */ +} prof_tdata_state_t; struct prof_tdata_s { + malloc_mutex_t *lock; + + /* Monotonically increasing unique thread identifier. */ + uint64_t thr_uid; + + /* Included in heap profile dumps if non-NULL. */ + char *thread_name; + + prof_tdata_state_t state; + + rb_node(prof_tdata_t) tdata_link; + /* - * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread tracks + * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks * backtraces for which it has non-zero allocation/deallocation counters - * associated with thread-specific prof_thr_cnt_t objects. Other - * threads may read the prof_thr_cnt_t contents, but no others will ever - * write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t - * counter data into the associated prof_ctx_t objects, and unlink/free - * the prof_thr_cnt_t objects. + * associated with thread-specific prof_tctx_t objects. Other threads + * may write to prof_tctx_t contents when freeing associated objects. */ - ckh_t bt2cnt; + ckh_t bt2tctx; /* Sampling state. */ uint64_t prng_state; @@ -179,9 +180,27 @@ struct prof_tdata_s { bool enq_idump; bool enq_gdump; + /* + * Set to true during an early dump phase for tdata's which are + * currently being dumped. New threads' tdata's have this initialized + * to false so that they aren't accidentally included in later dump + * phases. + */ + bool dumping; + + /* + * True if profiling is active for this tdata's thread + * (thread.prof.active mallctl). + */ + bool active; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + /* Backtrace vector, used for calls to prof_backtrace(). */ void *vec[PROF_BT_MAX]; }; +typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ @@ -217,9 +236,18 @@ extern char opt_prof_prefix[ */ extern uint64_t prof_interval; +/* + * Initialized as opt_lg_prof_sample, and potentially modified during profiling + * resets. 
+ */ +extern size_t lg_prof_sample; + +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_free_sampled_object(size_t usize, prof_tctx_t *tctx); void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt); -prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); +prof_tctx_t *prof_lookup(prof_bt_t *bt); #ifdef JEMALLOC_JET size_t prof_bt_count(void); typedef int (prof_dump_open_t)(bool, const char *); @@ -229,53 +257,44 @@ void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); prof_tdata_t *prof_tdata_init(void); +prof_tdata_t *prof_tdata_reinit(prof_tdata_t *tdata); +void prof_reset(size_t lg_sample); void prof_tdata_cleanup(void *arg); +const char *prof_thread_name_get(void); +bool prof_thread_name_set(const char *thread_name); +bool prof_thread_active_get(void); +bool prof_thread_active_set(bool active); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); void prof_prefork(void); void prof_postfork_parent(void); void prof_postfork_child(void); -void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +void prof_sample_threshold_update(prof_tdata_t *tdata); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES -#define PROF_ALLOC_PREP(size, ret) do { \ - prof_tdata_t *prof_tdata; \ - prof_bt_t bt; \ - \ - assert(size == s2u(size)); \ - \ - if (!opt_prof_active || \ - prof_sample_accum_update(size, false, &prof_tdata)) { \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - } else { \ - bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt); \ - ret = prof_lookup(&bt); \ - } \ -} while (0) - #ifndef JEMALLOC_ENABLE_INLINE malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); -bool prof_sample_accum_update(size_t size, bool commit, - prof_tdata_t **prof_tdata_out); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); -void prof_malloc_record_object(const void *ptr, size_t usize, - prof_thr_cnt_t *cnt); -void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx); -void prof_free(const void *ptr, size_t size); +bool prof_sample_accum_update(size_t usize, bool commit, + prof_tdata_t **tdata_out); +prof_tctx_t *prof_alloc_prep(size_t usize); +prof_tctx_t *prof_tctx_get(const void *ptr); +void prof_tctx_set(const void *ptr, prof_tctx_t *tctx); +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx); +void prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, + size_t old_usize, prof_tctx_t *old_tctx); +void prof_free(const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +/* Thread-specific backtrace cache, used to reduce bt2gctx contention. 
*/ malloc_tsd_externs(prof_tdata, prof_tdata_t *) malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, prof_tdata_cleanup) @@ -283,21 +302,27 @@ malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, JEMALLOC_INLINE prof_tdata_t * prof_tdata_get(bool create) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = *prof_tdata_tsd_get(); - if (create && prof_tdata == NULL) - prof_tdata = prof_tdata_init(); + tdata = *prof_tdata_tsd_get(); + if (create) { + if (tdata == NULL) + tdata = prof_tdata_init(); + else if (tdata->state == prof_tdata_state_expired) + tdata = prof_tdata_reinit(tdata); + assert(tdata == NULL || tdata->state == + prof_tdata_state_attached); + } - return (prof_tdata); + return (tdata); } -JEMALLOC_INLINE prof_ctx_t * -prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; cassert(config_prof); @@ -306,15 +331,15 @@ prof_ctx_get(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - ret = arena_prof_ctx_get(ptr); + ret = arena_prof_tctx_get(ptr); } else - ret = huge_prof_ctx_get(ptr); + ret = huge_prof_tctx_get(ptr); return (ret); } JEMALLOC_INLINE void -prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; @@ -324,66 +349,62 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - arena_prof_ctx_set(ptr, ctx); + arena_prof_tctx_set(ptr, tctx); } else - huge_prof_ctx_set(ptr, ctx); + huge_prof_tctx_set(ptr, tctx); } JEMALLOC_INLINE bool -prof_sample_accum_update(size_t size, bool commit, - prof_tdata_t **prof_tdata_out) +prof_sample_accum_update(size_t usize, bool commit, prof_tdata_t **tdata_out) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = prof_tdata_get(true); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) - prof_tdata = NULL; + tdata = prof_tdata_get(true); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = NULL; - if (prof_tdata_out != NULL) - *prof_tdata_out = prof_tdata; + if (tdata_out != NULL) + *tdata_out = tdata; - if (prof_tdata == NULL) + if (tdata == NULL) return (true); - if (prof_tdata->bytes_until_sample >= size) { + if (tdata->bytes_until_sample >= usize) { if (commit) - prof_tdata->bytes_until_sample -= size; + tdata->bytes_until_sample -= usize; return (true); } else { /* Compute new sample threshold. 
*/ if (commit) - prof_sample_threshold_update(prof_tdata); - return (false); + prof_sample_threshold_update(tdata); + return (tdata->active == false); } } -JEMALLOC_INLINE void -prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) { - prof_ctx_set(ptr, cnt->ctx); +JEMALLOC_INLINE prof_tctx_t * +prof_alloc_prep(size_t usize) +{ + prof_tctx_t *ret; + prof_tdata_t *tdata; + prof_bt_t bt; - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; + assert(usize == s2u(usize)); + + if (!opt_prof_active || prof_sample_accum_update(usize, false, &tdata)) + ret = (prof_tctx_t *)(uintptr_t)1U; + else { + bt_init(&bt, tdata->vec); + prof_backtrace(&bt); + ret = prof_lookup(&bt); } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ + + return (ret); } JEMALLOC_INLINE void -prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) +prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) { cassert(config_prof); @@ -392,131 +413,60 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) if (prof_sample_accum_update(usize, true, NULL)) { /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the usize passed to - * PROF_ALLOC_PREP() and prof_malloc(). + * Don't sample. For malloc()-like allocation, it is always + * possible to tell in advance how large an object's usable size + * will be, so there should never be a difference between the + * usize passed to PROF_ALLOC_PREP() and prof_malloc(). */ - assert((uintptr_t)cnt == (uintptr_t)1U); + assert((uintptr_t)tctx == (uintptr_t)1U); } - if ((uintptr_t)cnt > (uintptr_t)1U) - prof_malloc_record_object(ptr, usize, cnt); + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_malloc_sample_object(ptr, usize, tctx); else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void -prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx) +prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, size_t old_usize, + prof_tctx_t *old_tctx) { - prof_thr_cnt_t *told_cnt; cassert(config_prof); - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U); if (ptr != NULL) { assert(usize == isalloc(ptr, true)); if (prof_sample_accum_update(usize, true, NULL)) { /* - * Don't sample. The usize passed to - * PROF_ALLOC_PREP() was larger than what - * actually got allocated, so a backtrace was - * captured for this allocation, even though - * its actual usize was insufficient to cross - * the sample threshold. + * Don't sample. The usize passed to PROF_ALLOC_PREP() + * was larger than what actually got allocated, so a + * backtrace was captured for this allocation, even + * though its actual usize was insufficient to cross the + * sample threshold. */ - cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + tctx = (prof_tctx_t *)(uintptr_t)1U; } } - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(&old_ctx->bt); - if (told_cnt == NULL) { - /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_usize; - malloc_mutex_unlock(old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - cnt->epoch++; - } else if (ptr != NULL) - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_usize; - } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; - } - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. */ + if ((uintptr_t)old_tctx > (uintptr_t)1U) + prof_free_sampled_object(old_usize, old_tctx); + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_malloc_sample_object(ptr, usize, tctx); + else + prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void -prof_free(const void *ptr, size_t size) +prof_free(const void *ptr, size_t usize) { - prof_ctx_t *ctx = prof_ctx_get(ptr); + prof_tctx_t *tctx = prof_tctx_get(ptr); cassert(config_prof); + assert(usize == isalloc(ptr, true)); - if ((uintptr_t)ctx > (uintptr_t)1) { - prof_thr_cnt_t *tcnt; - assert(size == isalloc(ptr, true)); - tcnt = prof_lookup(&ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(ctx->lock); - } - } + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_free_sampled_object(usize, tctx); } #endif diff --git a/src/ctl.c b/src/ctl.c index fa52a6cc..b816c845 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -68,6 +68,8 @@ CTL_PROTO(version) CTL_PROTO(epoch) CTL_PROTO(thread_tcache_enabled) CTL_PROTO(thread_tcache_flush) +CTL_PROTO(thread_prof_name) +CTL_PROTO(thread_prof_active) CTL_PROTO(thread_arena) CTL_PROTO(thread_allocated) CTL_PROTO(thread_allocatedp) @@ -132,7 +134,9 @@ CTL_PROTO(arenas_nlruns) CTL_PROTO(arenas_extend) CTL_PROTO(prof_active) CTL_PROTO(prof_dump) +CTL_PROTO(prof_reset) CTL_PROTO(prof_interval) +CTL_PROTO(lg_prof_sample) CTL_PROTO(stats_chunks_current) CTL_PROTO(stats_chunks_total) CTL_PROTO(stats_chunks_high) @@ -196,18 +200,24 @@ CTL_PROTO(stats_mapped) */ #define INDEX(i) {false}, i##_index -static const ctl_named_node_t tcache_node[] = { +static const ctl_named_node_t thread_tcache_node[] = { {NAME("enabled"), CTL(thread_tcache_enabled)}, {NAME("flush"), CTL(thread_tcache_flush)} }; +static const ctl_named_node_t thread_prof_node[] = { + {NAME("name"), CTL(thread_prof_name)}, + {NAME("active"), CTL(thread_prof_active)} +}; + static const ctl_named_node_t thread_node[] = { {NAME("arena"), CTL(thread_arena)}, {NAME("allocated"), CTL(thread_allocated)}, {NAME("allocatedp"), CTL(thread_allocatedp)}, {NAME("deallocated"), CTL(thread_deallocated)}, {NAME("deallocatedp"), CTL(thread_deallocatedp)}, - {NAME("tcache"), CHILD(named, tcache)} + {NAME("tcache"), CHILD(named, thread_tcache)}, + {NAME("prof"), CHILD(named, thread_prof)} }; static const ctl_named_node_t config_node[] = { @@ -311,7 +321,9 @@ static const ctl_named_node_t arenas_node[] = { static const ctl_named_node_t prof_node[] = { {NAME("active"), CTL(prof_active)}, {NAME("dump"), CTL(prof_dump)}, - {NAME("interval"), CTL(prof_interval)} + {NAME("reset"), CTL(prof_reset)}, + {NAME("interval"), CTL(prof_interval)}, + {NAME("lg_sample"), CTL(lg_prof_sample)} }; static const ctl_named_node_t stats_chunks_node[] = { @@ -1281,6 +1293,62 @@ label_return: return (ret); } +static int +thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + const char *oldname; + + if (config_prof == false) + return (ENOENT); + + oldname = prof_thread_name_get(); + if (newp != NULL) { + if (newlen != sizeof(const char *)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_name_set(*(const char **)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldname, const char *); + + ret = 0; +label_return: + return (ret); +} + +static int +thread_prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + bool oldval; + + if (config_prof == false) + return (ENOENT); + + oldval = prof_thread_active_get(); + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_active_set(*(bool *)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldval, bool); + + ret = 0; +label_return: + return (ret); +} + /******************************************************************************/ /* ctl_mutex must be held during execution of this function. 
*/ @@ -1601,7 +1669,30 @@ label_return: return (ret); } +static int +prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + size_t lg_sample = lg_prof_sample; + + if (config_prof == false) + return (ENOENT); + + WRITEONLY(); + WRITE(lg_sample, size_t); + if (lg_sample >= (sizeof(uint64_t) << 3)) + lg_sample = (sizeof(uint64_t) << 3) - 1; + + prof_reset(lg_sample); + + ret = 0; +label_return: + return (ret); +} + CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) +CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) /******************************************************************************/ diff --git a/src/huge.c b/src/huge.c index d08ed4a9..5f0c6980 100644 --- a/src/huge.c +++ b/src/huge.c @@ -197,10 +197,10 @@ huge_salloc(const void *ptr) return (ret); } -prof_ctx_t * -huge_prof_ctx_get(const void *ptr) +prof_tctx_t * +huge_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; extent_node_t *node, key; malloc_mutex_lock(&huge_mtx); @@ -210,7 +210,7 @@ huge_prof_ctx_get(const void *ptr) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); - ret = node->prof_ctx; + ret = node->prof_tctx; malloc_mutex_unlock(&huge_mtx); @@ -218,7 +218,7 @@ huge_prof_ctx_get(const void *ptr) } void -huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { extent_node_t *node, key; @@ -229,7 +229,7 @@ huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); - node->prof_ctx = ctx; + node->prof_tctx = tctx; malloc_mutex_unlock(&huge_mtx); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 0983c00d..2d01272e 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -636,9 +636,9 @@ malloc_conf_init(void) "prof_prefix", "jeprof") CONF_HANDLE_BOOL(opt_prof_active, "prof_active", true) - CONF_HANDLE_SSIZE_T(opt_lg_prof_sample, + CONF_HANDLE_SIZE_T(opt_lg_prof_sample, "lg_prof_sample", 0, - (sizeof(uint64_t) << 3) - 1) + (sizeof(uint64_t) << 3) - 1, true) CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum", true) CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, @@ -863,11 +863,11 @@ malloc_init_hard(void) */ static void * -imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +imalloc_prof_sample(size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = imalloc(SMALL_MAXCLASS+1); @@ -884,16 +884,16 @@ JEMALLOC_ALWAYS_INLINE_C void * imalloc_prof(size_t usize) { void *p; - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); - if ((uintptr_t)cnt != (uintptr_t)1U) - p = imalloc_prof_sample(usize, cnt); + tctx = prof_alloc_prep(usize); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = imalloc_prof_sample(usize, tctx); else p = imalloc(usize); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -943,11 +943,11 @@ je_malloc(size_t size) } static void * -imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) +imemalign_prof_sample(size_t alignment, size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { assert(sa2u(SMALL_MAXCLASS+1, alignment) != 0); @@ -963,17 +963,17 @@ imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -imemalign_prof(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) 
+imemalign_prof(size_t alignment, size_t usize, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) - p = imemalign_prof_sample(alignment, usize, cnt); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = imemalign_prof_sample(alignment, usize, tctx); else p = ipalloc(usize, alignment, false); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1015,10 +1015,10 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) } if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); - result = imemalign_prof(alignment, usize, cnt); + tctx = prof_alloc_prep(usize); + result = imemalign_prof(alignment, usize, tctx); } else result = ipalloc(usize, alignment, false); if (result == NULL) @@ -1070,11 +1070,11 @@ je_aligned_alloc(size_t alignment, size_t size) } static void * -icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +icalloc_prof_sample(size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = icalloc(SMALL_MAXCLASS+1); @@ -1088,17 +1088,17 @@ icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -icalloc_prof(size_t usize, prof_thr_cnt_t *cnt) +icalloc_prof(size_t usize, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) - p = icalloc_prof_sample(usize, cnt); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = icalloc_prof_sample(usize, tctx); else p = icalloc(usize); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1137,11 +1137,11 @@ je_calloc(size_t num, size_t size) } if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = s2u(num_size); - PROF_ALLOC_PREP(usize, cnt); - ret = icalloc_prof(usize, cnt); + tctx = prof_alloc_prep(usize); + ret = icalloc_prof(usize, tctx); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(num_size); @@ -1167,11 +1167,11 @@ label_return: } static void * -irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) +irealloc_prof_sample(void *oldptr, size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloc(oldptr, SMALL_MAXCLASS+1, 0, 0, false); @@ -1185,19 +1185,19 @@ irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_thr_cnt_t *cnt) +irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_tctx_t *tctx) { void *p; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(oldptr); - if ((uintptr_t)cnt != (uintptr_t)1U) - p = irealloc_prof_sample(oldptr, usize, cnt); + old_tctx = prof_tctx_get(oldptr); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = irealloc_prof_sample(oldptr, usize, tctx); else p = iralloc(oldptr, usize, 0, 0, false); if (p == NULL) return (NULL); - prof_realloc(p, usize, cnt, old_usize, old_ctx); + prof_realloc(p, usize, tctx, old_usize, old_tctx); return (p); } @@ -1253,11 +1253,11 @@ je_realloc(void *ptr, size_t size) old_rzsize = config_prof ? 
p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = s2u(size); - PROF_ALLOC_PREP(usize, cnt); - ret = irealloc_prof(ptr, old_usize, usize, cnt); + tctx = prof_alloc_prep(usize); + ret = irealloc_prof(ptr, old_usize, usize, tctx); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(size); @@ -1379,11 +1379,11 @@ imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache, static void * imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { size_t usize_promoted = (alignment == 0) ? @@ -1402,18 +1402,18 @@ imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, JEMALLOC_ALWAYS_INLINE_C void * imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) { + if ((uintptr_t)tctx != (uintptr_t)1U) { p = imallocx_prof_sample(usize, alignment, zero, try_tcache, - arena, cnt); + arena, tctx); } else p = imallocx(usize, alignment, zero, try_tcache, arena); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1447,11 +1447,11 @@ je_mallocx(size_t size, int flags) assert(usize != 0); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); + tctx = prof_alloc_prep(usize); p = imallocx_prof(usize, alignment, zero, try_tcache, arena, - cnt); + tctx); } else p = imallocx(usize, alignment, zero, try_tcache, arena); if (p == NULL) @@ -1476,11 +1476,11 @@ label_oom: static void * irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= @@ -1500,15 +1500,15 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, JEMALLOC_ALWAYS_INLINE_C void * irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(oldptr); - if ((uintptr_t)cnt != (uintptr_t)1U) + old_tctx = prof_tctx_get(oldptr); + if ((uintptr_t)tctx != (uintptr_t)1U) p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, cnt); + try_tcache_alloc, try_tcache_dalloc, arena, tctx); else { p = iralloct(oldptr, size, 0, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); @@ -1527,7 +1527,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, */ *usize = isalloc(p, config_prof); } - prof_realloc(p, *usize, cnt, old_usize, old_ctx); + prof_realloc(p, *usize, tctx, old_usize, old_tctx); return (p); } @@ -1570,13 +1570,13 @@ je_rallocx(void *ptr, size_t size, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = (alignment == 0) ? 
s2u(size) : sa2u(size, alignment); assert(usize != 0); - PROF_ALLOC_PREP(usize, cnt); + tctx = prof_alloc_prep(usize); p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, cnt); + try_tcache_alloc, try_tcache_dalloc, arena, tctx); if (p == NULL) goto label_oom; } else { @@ -1623,11 +1623,11 @@ ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra, static size_t ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, size_t max_usize, bool zero, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { size_t usize; - if (cnt == NULL) + if (tctx == NULL) return (old_usize); /* Use minimum usize to determine whether promotion may happen. */ if (((alignment == 0) ? s2u(size) : sa2u(size, alignment)) <= @@ -1650,22 +1650,22 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, JEMALLOC_ALWAYS_INLINE_C size_t ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, size_t max_usize, bool zero, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { size_t usize; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(ptr); - if ((uintptr_t)cnt != (uintptr_t)1U) { + old_tctx = prof_tctx_get(ptr); + if ((uintptr_t)tctx != (uintptr_t)1U) { usize = ixallocx_prof_sample(ptr, old_usize, size, extra, - alignment, zero, max_usize, arena, cnt); + alignment, zero, max_usize, arena, tctx); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); } if (usize == old_usize) return (usize); - prof_realloc(ptr, usize, cnt, old_usize, old_ctx); + prof_realloc(ptr, usize, tctx, old_usize, old_tctx); return (usize); } @@ -1697,19 +1697,19 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; /* * usize isn't knowable before ixalloc() returns when extra is * non-zero. Therefore, compute its maximum possible value and - * use that in PROF_ALLOC_PREP() to decide whether to capture a + * use that in prof_alloc_prep() to decide whether to capture a * backtrace. prof_realloc() will use the actual usize to * decide whether to sample. */ size_t max_usize = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra, alignment); - PROF_ALLOC_PREP(max_usize, cnt); + tctx = prof_alloc_prep(max_usize); usize = ixallocx_prof(ptr, old_usize, size, extra, alignment, - max_usize, zero, arena, cnt); + max_usize, zero, arena, tctx); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); diff --git a/src/prof.c b/src/prof.c index 497ccf42..044acd8b 100644 --- a/src/prof.c +++ b/src/prof.c @@ -33,22 +33,41 @@ char opt_prof_prefix[ uint64_t prof_interval = 0; +size_t lg_prof_sample; + /* - * Table of mutexes that are shared among ctx's. These are leaf locks, so - * there is no problem with using them for more than one ctx at the same time. - * The primary motivation for this sharing though is that ctx's are ephemeral, + * Table of mutexes that are shared among gctx's. These are leaf locks, so + * there is no problem with using them for more than one gctx at the same time. + * The primary motivation for this sharing though is that gctx's are ephemeral, * and destroying mutexes causes complications for systems that allocate when * creating/destroying mutexes. */ -static malloc_mutex_t *ctx_locks; -static unsigned cum_ctxs; /* Atomic counter. 
*/ +static malloc_mutex_t *gctx_locks; +static unsigned cum_gctxs; /* Atomic counter. */ /* - * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data + * Table of mutexes that are shared among tdata's. No operations require + * holding multiple tdata locks, so there is no problem with using them for more + * than one tdata at the same time, even though a gctx lock may be acquired + * while holding a tdata lock. + */ +static malloc_mutex_t *tdata_locks; + +/* + * Global hash of (prof_bt_t *)-->(prof_gctx_t *). This is the master data * structure that knows about all backtraces currently captured. */ -static ckh_t bt2ctx; -static malloc_mutex_t bt2ctx_mtx; +static ckh_t bt2gctx; +static malloc_mutex_t bt2gctx_mtx; + +/* + * Tree of all extant prof_tdata_t structures, regardless of state, + * {attached,detached,expired}. + */ +static prof_tdata_tree_t tdatas; +static malloc_mutex_t tdatas_mtx; + +static uint64_t next_thr_uid; static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; @@ -76,21 +95,33 @@ static int prof_dump_fd; static bool prof_booted = false; /******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool prof_tctx_should_destroy(prof_tctx_t *tctx); +static void prof_tctx_destroy(prof_tctx_t *tctx); +static bool prof_tdata_should_destroy(prof_tdata_t *tdata); +static void prof_tdata_destroy(prof_tdata_t *tdata); + +/******************************************************************************/ +/* Red-black trees. */ JEMALLOC_INLINE_C int -prof_thr_cnt_comp(const prof_thr_cnt_t *a, const prof_thr_cnt_t *b) +prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { - prof_thr_uid_t a_uid = a->thr_uid; - prof_thr_uid_t b_uid = b->thr_uid; + uint64_t a_uid = a->tdata->thr_uid; + uint64_t b_uid = b->tdata->thr_uid; return ((a_uid > b_uid) - (a_uid < b_uid)); } -rb_gen(static UNUSED, thr_cnt_tree_, prof_thr_cnt_tree_t, prof_thr_cnt_t, - thr_cnt_link, prof_thr_cnt_comp) +rb_gen(static UNUSED, tctx_tree_, prof_tctx_tree_t, prof_tctx_t, + tctx_link, prof_tctx_comp) JEMALLOC_INLINE_C int -prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) +prof_gctx_comp(const prof_gctx_t *a, const prof_gctx_t *b) { unsigned a_len = a->bt.len; unsigned b_len = b->bt.len; @@ -101,8 +132,52 @@ prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) return (ret); } -rb_gen(static UNUSED, ctx_tree_, prof_ctx_tree_t, prof_ctx_t, dump_link, - prof_ctx_comp) +rb_gen(static UNUSED, gctx_tree_, prof_gctx_tree_t, prof_gctx_t, dump_link, + prof_gctx_comp) + +JEMALLOC_INLINE_C int +prof_tdata_comp(const prof_tdata_t *a, const prof_tdata_t *b) +{ + uint64_t a_uid = a->thr_uid; + uint64_t b_uid = b->thr_uid; + + return ((a_uid > b_uid) - (a_uid < b_uid)); +} + +rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, + prof_tdata_comp) + +/******************************************************************************/ + +void +prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { + prof_tctx_set(ptr, tctx); + + malloc_mutex_lock(tctx->tdata->lock); + tctx->cnts.curobjs++; + tctx->cnts.curbytes += usize; + if (opt_prof_accum) { + tctx->cnts.accumobjs++; + tctx->cnts.accumbytes += usize; + } + malloc_mutex_unlock(tctx->tdata->lock); +} + +void +prof_free_sampled_object(size_t usize, prof_tctx_t *tctx) +{ + + malloc_mutex_lock(tctx->tdata->lock); + assert(tctx->cnts.curobjs > 0); + 
assert(tctx->cnts.curbytes >= usize); + tctx->cnts.curobjs--; + tctx->cnts.curbytes -= usize; + + if (prof_tctx_should_destroy(tctx)) + prof_tctx_destroy(tctx); + else + malloc_mutex_unlock(tctx->tdata->lock); +} void bt_init(prof_bt_t *bt, void **vec) @@ -115,32 +190,32 @@ bt_init(prof_bt_t *bt, void **vec) } static inline void -prof_enter(prof_tdata_t *prof_tdata) +prof_enter(prof_tdata_t *tdata) { cassert(config_prof); - assert(prof_tdata->enq == false); - prof_tdata->enq = true; + assert(tdata->enq == false); + tdata->enq = true; - malloc_mutex_lock(&bt2ctx_mtx); + malloc_mutex_lock(&bt2gctx_mtx); } static inline void -prof_leave(prof_tdata_t *prof_tdata) +prof_leave(prof_tdata_t *tdata) { bool idump, gdump; cassert(config_prof); - malloc_mutex_unlock(&bt2ctx_mtx); + malloc_mutex_unlock(&bt2gctx_mtx); - assert(prof_tdata->enq); - prof_tdata->enq = false; - idump = prof_tdata->enq_idump; - prof_tdata->enq_idump = false; - gdump = prof_tdata->enq_gdump; - prof_tdata->enq_gdump = false; + assert(tdata->enq); + tdata->enq = false; + idump = tdata->enq_idump; + tdata->enq_idump = false; + gdump = tdata->enq_gdump; + tdata->enq_gdump = false; if (idump) prof_idump(); @@ -373,220 +448,268 @@ prof_backtrace(prof_bt_t *bt) #endif static malloc_mutex_t * -prof_ctx_mutex_choose(void) +prof_gctx_mutex_choose(void) { - unsigned nctxs = atomic_add_u(&cum_ctxs, 1); + unsigned ngctxs = atomic_add_u(&cum_gctxs, 1); - return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]); + return (&gctx_locks[(ngctxs - 1) % PROF_NCTX_LOCKS]); } -static prof_ctx_t * -prof_ctx_create(prof_bt_t *bt) +static malloc_mutex_t * +prof_tdata_mutex_choose(uint64_t thr_uid) +{ + + return (&tdata_locks[thr_uid % PROF_NTDATA_LOCKS]); +} + +static prof_gctx_t * +prof_gctx_create(prof_bt_t *bt) { /* * Create a single allocation that has space for vec of length bt->len. */ - prof_ctx_t *ctx = (prof_ctx_t *)imalloc(offsetof(prof_ctx_t, vec) + + prof_gctx_t *gctx = (prof_gctx_t *)imalloc(offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *))); - if (ctx == NULL) + if (gctx == NULL) return (NULL); - ctx->lock = prof_ctx_mutex_choose(); + gctx->lock = prof_gctx_mutex_choose(); /* * Set nlimbo to 1, in order to avoid a race condition with - * prof_ctx_merge()/prof_ctx_destroy(). + * prof_tctx_destroy()/prof_gctx_maybe_destroy(). */ - ctx->nlimbo = 1; - memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); - thr_cnt_tree_new(&ctx->thr_cnts); + gctx->nlimbo = 1; + tctx_tree_new(&gctx->tctxs); /* Duplicate bt. */ - memcpy(ctx->vec, bt->vec, bt->len * sizeof(void *)); - ctx->bt.vec = ctx->vec; - ctx->bt.len = bt->len; - return (ctx); + memcpy(gctx->vec, bt->vec, bt->len * sizeof(void *)); + gctx->bt.vec = gctx->vec; + gctx->bt.len = bt->len; + return (gctx); } static void -prof_ctx_destroy(prof_ctx_t *ctx) +prof_gctx_maybe_destroy(prof_gctx_t *gctx) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); /* - * Check that ctx is still unused by any thread cache before destroying - * it. prof_lookup() increments ctx->nlimbo in order to avoid a race - * condition with this function, as does prof_ctx_merge() in order to - * avoid a race between the main body of prof_ctx_merge() and entry + * Check that gctx is still unused by any thread cache before destroying + * it. prof_lookup() increments gctx->nlimbo in order to avoid a race + * condition with this function, as does prof_tctx_destroy() in order to + * avoid a race between the main body of prof_tctx_destroy() and entry * into this function. 
*/ - prof_tdata = prof_tdata_get(false); - assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX); - prof_enter(prof_tdata); - malloc_mutex_lock(ctx->lock); - if (thr_cnt_tree_first(&ctx->thr_cnts) == NULL && - ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 1) { - assert(ctx->cnt_merged.curbytes == 0); - assert(ctx->cnt_merged.accumobjs == 0); - assert(ctx->cnt_merged.accumbytes == 0); - /* Remove ctx from bt2ctx. */ - if (ckh_remove(&bt2ctx, &ctx->bt, NULL, NULL)) + tdata = prof_tdata_get(false); + assert((uintptr_t)tdata > (uintptr_t)PROF_TDATA_STATE_MAX); + prof_enter(tdata); + malloc_mutex_lock(gctx->lock); + if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { + /* Remove gctx from bt2gctx. */ + if (ckh_remove(&bt2gctx, &gctx->bt, NULL, NULL)) not_reached(); - prof_leave(prof_tdata); - /* Destroy ctx. */ - malloc_mutex_unlock(ctx->lock); - idalloc(ctx); + prof_leave(tdata); + /* Destroy gctx. */ + malloc_mutex_unlock(gctx->lock); + idalloc(gctx); } else { /* - * Compensate for increment in prof_ctx_merge() or + * Compensate for increment in prof_tctx_destroy() or * prof_lookup(). */ - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); - prof_leave(prof_tdata); + gctx->nlimbo--; + malloc_mutex_unlock(gctx->lock); + prof_leave(tdata); } } -static void -prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +/* tctx->tdata->lock must be held. */ +static bool +prof_tctx_should_destroy(prof_tctx_t *tctx) { - bool destroy; - cassert(config_prof); - - /* Merge cnt stats and detach from ctx. */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs += cnt->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - thr_cnt_tree_remove(&ctx->thr_cnts, cnt); - if (opt_prof_accum == false && thr_cnt_tree_first(&ctx->thr_cnts) == - NULL && ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { - /* - * Increment ctx->nlimbo in order to keep another thread from - * winning the race to destroy ctx while this one has ctx->lock - * dropped. Without this, it would be possible for another - * thread to: - * - * 1) Sample an allocation associated with ctx. - * 2) Deallocate the sampled object. - * 3) Successfully prof_ctx_destroy(ctx). - * - * The result would be that ctx no longer exists by the time - * this thread accesses it in prof_ctx_destroy(). - */ - ctx->nlimbo++; - destroy = true; - } else - destroy = false; - malloc_mutex_unlock(ctx->lock); - if (destroy) - prof_ctx_destroy(ctx); + if (opt_prof_accum) + return (false); + if (tctx->cnts.curobjs != 0) + return (false); + return (true); } static bool -prof_lookup_global(prof_bt_t *bt, prof_tdata_t *prof_tdata, void **p_btkey, - prof_ctx_t **p_ctx, bool *p_new_ctx) +prof_gctx_should_destroy(prof_gctx_t *gctx) +{ + + if (opt_prof_accum) + return (false); + if (tctx_tree_empty(&gctx->tctxs) == false) + return (false); + if (gctx->nlimbo != 0) + return (false); + return (true); +} + +/* tctx->tdata->lock is held upon entry, and released before return. 
*/ +static void +prof_tctx_destroy(prof_tctx_t *tctx) +{ + prof_gctx_t *gctx = tctx->gctx; + bool destroy_gctx; + + assert(tctx->cnts.curobjs == 0); + assert(tctx->cnts.curbytes == 0); + assert(opt_prof_accum == false); + assert(tctx->cnts.accumobjs == 0); + assert(tctx->cnts.accumbytes == 0); + + { + prof_tdata_t *tdata = tctx->tdata; + bool tdata_destroy; + + ckh_remove(&tdata->bt2tctx, &gctx->bt, NULL, NULL); + tdata_destroy = prof_tdata_should_destroy(tdata); + malloc_mutex_unlock(tdata->lock); + if (tdata_destroy) + prof_tdata_destroy(tdata); + } + + malloc_mutex_lock(gctx->lock); + tctx_tree_remove(&gctx->tctxs, tctx); + if (prof_gctx_should_destroy(gctx)) { + /* + * Increment gctx->nlimbo in order to keep another thread from + * winning the race to destroy gctx while this one has + * gctx->lock dropped. Without this, it would be possible for + * another thread to: + * + * 1) Sample an allocation associated with gctx. + * 2) Deallocate the sampled object. + * 3) Successfully prof_gctx_maybe_destroy(gctx). + * + * The result would be that gctx no longer exists by the time + * this thread accesses it in prof_gctx_maybe_destroy(). + */ + gctx->nlimbo++; + destroy_gctx = true; + } else + destroy_gctx = false; + malloc_mutex_unlock(gctx->lock); + if (destroy_gctx) + prof_gctx_maybe_destroy(gctx); + + idalloc(tctx); +} + +static bool +prof_lookup_global(prof_bt_t *bt, prof_tdata_t *tdata, void **p_btkey, + prof_gctx_t **p_gctx, bool *p_new_gctx) { union { - prof_ctx_t *p; + prof_gctx_t *p; void *v; - } ctx; + } gctx; union { prof_bt_t *p; void *v; } btkey; - bool new_ctx; + bool new_gctx; - prof_enter(prof_tdata); - if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { + prof_enter(tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { /* bt has never been seen before. Insert it. */ - ctx.p = prof_ctx_create(bt); - if (ctx.v == NULL) { - prof_leave(prof_tdata); + gctx.p = prof_gctx_create(bt); + if (gctx.v == NULL) { + prof_leave(tdata); return (true); } - btkey.p = &ctx.p->bt; - if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { + btkey.p = &gctx.p->bt; + if (ckh_insert(&bt2gctx, btkey.v, gctx.v)) { /* OOM. */ - prof_leave(prof_tdata); - idalloc(ctx.v); + prof_leave(tdata); + idalloc(gctx.v); return (true); } - new_ctx = true; + new_gctx = true; } else { /* * Increment nlimbo, in order to avoid a race condition with - * prof_ctx_merge()/prof_ctx_destroy(). + * prof_tctx_destroy()/prof_gctx_maybe_destroy(). 
*/ - malloc_mutex_lock(ctx.p->lock); - ctx.p->nlimbo++; - malloc_mutex_unlock(ctx.p->lock); - new_ctx = false; + malloc_mutex_lock(gctx.p->lock); + gctx.p->nlimbo++; + malloc_mutex_unlock(gctx.p->lock); + new_gctx = false; } - prof_leave(prof_tdata); + prof_leave(tdata); *p_btkey = btkey.v; - *p_ctx = ctx.p; - *p_new_ctx = new_ctx; + *p_gctx = gctx.p; + *p_new_gctx = new_gctx; return (false); } -prof_thr_cnt_t * +prof_tctx_t * prof_lookup(prof_bt_t *bt) { union { - prof_thr_cnt_t *p; + prof_tctx_t *p; void *v; } ret; - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; + bool not_found; cassert(config_prof); - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (NULL); - if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { + malloc_mutex_lock(tdata->lock); + not_found = ckh_search(&tdata->bt2tctx, bt, NULL, &ret.v); + malloc_mutex_unlock(tdata->lock); + if (not_found) { void *btkey; - prof_ctx_t *ctx; - bool new_ctx; + prof_gctx_t *gctx; + bool new_gctx, error; /* * This thread's cache lacks bt. Look for it in the global * cache. */ - if (prof_lookup_global(bt, prof_tdata, &btkey, &ctx, &new_ctx)) + if (prof_lookup_global(bt, tdata, &btkey, &gctx, + &new_gctx)) return (NULL); - /* Link a prof_thd_cnt_t into ctx for this thread. */ - ret.v = imalloc(sizeof(prof_thr_cnt_t)); + /* Link a prof_tctx_t into gctx for this thread. */ + ret.v = imalloc(sizeof(prof_tctx_t)); if (ret.p == NULL) { - if (new_ctx) - prof_ctx_destroy(ctx); + if (new_gctx) + prof_gctx_maybe_destroy(gctx); return (NULL); } - ret.p->ctx = ctx; - ret.p->epoch = 0; + ret.p->tdata = tdata; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(&prof_tdata->bt2cnt, btkey, ret.v)) { - if (new_ctx) - prof_ctx_destroy(ctx); + ret.p->gctx = gctx; + ret.p->state = prof_tctx_state_nominal; + malloc_mutex_lock(tdata->lock); + error = ckh_insert(&tdata->bt2tctx, btkey, ret.v); + malloc_mutex_unlock(tdata->lock); + if (error) { + if (new_gctx) + prof_gctx_maybe_destroy(gctx); idalloc(ret.v); return (NULL); } - malloc_mutex_lock(ctx->lock); - thr_cnt_tree_insert(&ctx->thr_cnts, ret.p); - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(gctx->lock); + tctx_tree_insert(&gctx->tctxs, ret.p); + gctx->nlimbo--; + malloc_mutex_unlock(gctx->lock); } return (ret.p); } - void -prof_sample_threshold_update(prof_tdata_t *prof_tdata) +prof_sample_threshold_update(prof_tdata_t *tdata) { /* * The body of this function is compiled out unless heap profiling is @@ -608,23 +731,20 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) if (!config_prof) return; - if (prof_tdata == NULL) - prof_tdata = prof_tdata_get(false); - - if (opt_lg_prof_sample == 0) { - prof_tdata->bytes_until_sample = 0; + if (lg_prof_sample == 0) { + tdata->bytes_until_sample = 0; return; } /* - * Compute sample threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). + * Compute sample interval as a geometrically distributed random + * variable with mean (2^lg_prof_sample). 
* - * __ __ - * | log(u) | 1 - * prof_tdata->threshold = | -------- |, where p = ------------------- - * | log(1-p) | opt_lg_prof_sample - * 2 + * __ __ + * | log(u) | 1 + * tdata->bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 * * For more information on the math, see: * @@ -634,30 +754,29 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) * pp 500 * (http://luc.devroye.org/rnbookindex.html) */ - prng64(r, 53, prof_tdata->prng_state, - UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); + prng64(r, 53, tdata->prng_state, UINT64_C(6364136223846793005), + UINT64_C(1442695040888963407)); u = (double)r * (1.0/9007199254740992.0L); - prof_tdata->bytes_until_sample = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + tdata->bytes_until_sample = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; #endif } - #ifdef JEMALLOC_JET size_t prof_bt_count(void) { size_t bt_count; - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (0); - prof_enter(prof_tdata); - bt_count = ckh_count(&bt2ctx); - prof_leave(prof_tdata); + prof_enter(tdata); + bt_count = ckh_count(&bt2gctx); + prof_leave(tdata); return (bt_count); } @@ -770,146 +889,249 @@ prof_dump_printf(bool propagate_err, const char *format, ...) return (ret); } -static prof_thr_cnt_t * -ctx_sum_iter(prof_thr_cnt_tree_t *thr_cnts, prof_thr_cnt_t *thr_cnt, void *arg) +/* tctx->tdata->lock is held. */ +static void +prof_tctx_merge_tdata(prof_tctx_t *tctx, prof_tdata_t *tdata) { - prof_ctx_t *ctx = (prof_ctx_t *)arg; - volatile unsigned *epoch = &thr_cnt->epoch; - prof_cnt_t tcnt; - while (true) { - unsigned epoch0 = *epoch; + assert(tctx->state == prof_tctx_state_nominal); + tctx->state = prof_tctx_state_dumping; + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); - /* Make sure epoch is even. */ - if (epoch0 & 1U) - continue; - - memcpy(&tcnt, &thr_cnt->cnts, sizeof(prof_cnt_t)); - - /* Terminate if epoch didn't change while reading. */ - if (*epoch == epoch0) - break; - } - - ctx->cnt_summed.curobjs += tcnt.curobjs; - ctx->cnt_summed.curbytes += tcnt.curbytes; + tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; if (opt_prof_accum) { - ctx->cnt_summed.accumobjs += tcnt.accumobjs; - ctx->cnt_summed.accumbytes += tcnt.accumbytes; + tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + } +} + +/* gctx->lock is held. */ +static void +prof_tctx_merge_gctx(prof_tctx_t *tctx, prof_gctx_t *gctx) +{ + + gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + if (opt_prof_accum) { + gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + } +} + +/* tctx->gctx is held. */ +static prof_tctx_t * +prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. 
*/ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_tctx_merge_gctx(tctx, tctx->gctx); + break; + default: + not_reached(); } return (NULL); } +/* gctx->lock is held. */ +static prof_tctx_t * +prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (prof_dump_printf(propagate_err, + " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + tctx->tdata->thr_uid, tctx->dump_cnts.curobjs, + tctx->dump_cnts.curbytes, tctx->dump_cnts.accumobjs, + tctx->dump_cnts.accumbytes)) + return (tctx); + return (NULL); +} + +/* tctx->gctx is held. */ +static prof_tctx_t * +prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + prof_tctx_t *ret; + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + tctx->state = prof_tctx_state_nominal; + break; + case prof_tctx_state_purgatory: + ret = tctx_tree_next(tctxs, tctx); + tctx_tree_remove(tctxs, tctx); + idalloc(tctx); + goto label_return; + default: + not_reached(); + } + + ret = NULL; +label_return: + return (ret); +} + static void -prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx, - prof_ctx_tree_t *ctxs) +prof_dump_gctx_prep(prof_gctx_t *gctx, prof_gctx_tree_t *gctxs) { cassert(config_prof); - malloc_mutex_lock(ctx->lock); + malloc_mutex_lock(gctx->lock); /* - * Increment nlimbo so that ctx won't go away before dump. - * Additionally, link ctx into the dump list so that it is included in + * Increment nlimbo so that gctx won't go away before dump. + * Additionally, link gctx into the dump list so that it is included in * prof_dump()'s second pass. */ - ctx->nlimbo++; - ctx_tree_insert(ctxs, ctx); + gctx->nlimbo++; + gctx_tree_insert(gctxs, gctx); - memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); - thr_cnt_tree_iter(&ctx->thr_cnts, NULL, ctx_sum_iter, (void *)ctx); + memset(&gctx->cnt_summed, 0, sizeof(prof_cnt_t)); - if (ctx->cnt_summed.curobjs != 0) - (*leak_nctx)++; + malloc_mutex_unlock(gctx->lock); +} - /* Add to cnt_all. 
*/ - cnt_all->curobjs += ctx->cnt_summed.curobjs; - cnt_all->curbytes += ctx->cnt_summed.curbytes; - if (opt_prof_accum) { - cnt_all->accumobjs += ctx->cnt_summed.accumobjs; - cnt_all->accumbytes += ctx->cnt_summed.accumbytes; - } +static prof_gctx_t * +prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) +{ + size_t *leak_ngctx = (size_t *)arg; - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(gctx->lock); + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_merge_iter, NULL); + if (gctx->cnt_summed.curobjs != 0) + (*leak_ngctx)++; + malloc_mutex_unlock(gctx->lock); + + return (NULL); +} + +static prof_gctx_t * +prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) +{ + prof_tctx_t *next; + bool destroy_gctx; + + malloc_mutex_lock(gctx->lock); + next = NULL; + do { + next = tctx_tree_iter(&gctx->tctxs, next, prof_tctx_finish_iter, + NULL); + } while (next != NULL); + gctx->nlimbo--; + destroy_gctx = prof_gctx_should_destroy(gctx); + malloc_mutex_unlock(gctx->lock); + if (destroy_gctx) + prof_gctx_maybe_destroy(gctx); + + return (NULL); +} + +static prof_tdata_t * +prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + prof_cnt_t *cnt_all = (prof_cnt_t *)arg; + + malloc_mutex_lock(tdata->lock); + if (tdata->state != prof_tdata_state_expired) { + size_t tabind; + union { + prof_tctx_t *p; + void *v; + } tctx; + + tdata->dumping = true; + memset(&tdata->cnt_summed, 0, sizeof(prof_cnt_t)); + for (tabind = 0; ckh_iter(&tdata->bt2tctx, &tabind, NULL, + &tctx.v) == false;) + prof_tctx_merge_tdata(tctx.p, tdata); + + cnt_all->curobjs += tdata->cnt_summed.curobjs; + cnt_all->curbytes += tdata->cnt_summed.curbytes; + if (opt_prof_accum) { + cnt_all->accumobjs += tdata->cnt_summed.accumobjs; + cnt_all->accumbytes += tdata->cnt_summed.accumbytes; + } + } else + tdata->dumping = false; + malloc_mutex_unlock(tdata->lock); + + return (NULL); +} + +static prof_tdata_t * +prof_tdata_dump_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (tdata->dumping == false) + return (NULL); + + if (prof_dump_printf(propagate_err, + " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]%s%s\n", + tdata->thr_uid, tdata->cnt_summed.curobjs, + tdata->cnt_summed.curbytes, tdata->cnt_summed.accumobjs, + tdata->cnt_summed.accumbytes, + (tdata->thread_name != NULL) ? " " : "", + (tdata->thread_name != NULL) ? 
tdata->thread_name : "")) + return (tdata); + return (NULL); } static bool prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) { + bool ret; - if (opt_lg_prof_sample == 0) { - if (prof_dump_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heapprofile\n", - cnt_all->curobjs, cnt_all->curbytes, - cnt_all->accumobjs, cnt_all->accumbytes)) - return (true); - } else { - if (prof_dump_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n", - cnt_all->curobjs, cnt_all->curbytes, - cnt_all->accumobjs, cnt_all->accumbytes, - ((uint64_t)1U << opt_lg_prof_sample))) - return (true); - } + if (prof_dump_printf(propagate_err, + "heap_v2/%"PRIu64"\n" + " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + ((uint64_t)1U << lg_prof_sample), cnt_all->curobjs, + cnt_all->curbytes, cnt_all->accumobjs, cnt_all->accumbytes)) + return (true); - return (false); -} - -static void -prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) -{ - - ctx->nlimbo--; -} - -static void -prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) -{ - - malloc_mutex_lock(ctx->lock); - prof_dump_ctx_cleanup_locked(ctx, ctxs); - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(&tdatas_mtx); + ret = (tdata_tree_iter(&tdatas, NULL, prof_tdata_dump_iter, + (void *)&propagate_err) != NULL); + malloc_mutex_unlock(&tdatas_mtx); + return (ret); } +/* gctx->lock is held. */ static bool -prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, - prof_ctx_tree_t *ctxs) +prof_dump_gctx(bool propagate_err, prof_gctx_t *gctx, const prof_bt_t *bt, + prof_gctx_tree_t *gctxs) { bool ret; unsigned i; cassert(config_prof); - /* - * Current statistics can sum to 0 as a result of unmerged per thread - * statistics. Additionally, interval- and growth-triggered dumps can - * occur between the time a ctx is created and when its statistics are - * filled in. Avoid dumping any ctx that is an artifact of either - * implementation detail. - */ - malloc_mutex_lock(ctx->lock); - if ((opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) || - (opt_prof_accum && ctx->cnt_summed.accumobjs == 0)) { - assert(ctx->cnt_summed.curobjs == 0); - assert(ctx->cnt_summed.curbytes == 0); - assert(ctx->cnt_summed.accumobjs == 0); - assert(ctx->cnt_summed.accumbytes == 0); + /* Avoid dumping such gctx's that have no useful data. 
*/ + if ((opt_prof_accum == false && gctx->cnt_summed.curobjs == 0) || + (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) { + assert(gctx->cnt_summed.curobjs == 0); + assert(gctx->cnt_summed.curbytes == 0); + assert(gctx->cnt_summed.accumobjs == 0); + assert(gctx->cnt_summed.accumbytes == 0); ret = false; goto label_return; } - if (prof_dump_printf(propagate_err, "%"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @", - ctx->cnt_summed.curobjs, ctx->cnt_summed.curbytes, - ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes)) { + if (prof_dump_printf(propagate_err, "@")) { ret = true; goto label_return; } - for (i = 0; i < bt->len; i++) { if (prof_dump_printf(propagate_err, " %#"PRIxPTR, (uintptr_t)bt->vec[i])) { @@ -918,15 +1140,23 @@ prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, } } - if (prof_dump_write(propagate_err, "\n")) { + if (prof_dump_printf(propagate_err, + "\n" + " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + gctx->cnt_summed.curobjs, gctx->cnt_summed.curbytes, + gctx->cnt_summed.accumobjs, gctx->cnt_summed.accumbytes)) { + ret = true; + goto label_return; + } + + if (tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_dump_iter, + (void *)&propagate_err) != NULL) { ret = true; goto label_return; } ret = false; label_return: - prof_dump_ctx_cleanup_locked(ctx, ctxs); - malloc_mutex_unlock(ctx->lock); return (ret); } @@ -980,72 +1210,85 @@ label_return: } static void -prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_nctx, +prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx, const char *filename) { if (cnt_all->curbytes != 0) { - malloc_printf(": Leak summary: %"PRId64" byte%s, %" - PRId64" object%s, %zu context%s\n", + malloc_printf(": Leak summary: %"PRIu64" byte%s, %" + PRIu64" object%s, %zu context%s\n", cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "", cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "", - leak_nctx, (leak_nctx != 1) ? "s" : ""); + leak_ngctx, (leak_ngctx != 1) ? "s" : ""); malloc_printf( ": Run pprof on \"%s\" for leak detail\n", filename); } } -static prof_ctx_t * -prof_ctx_dump_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg) +static prof_gctx_t * +prof_gctx_dump_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) { + prof_gctx_t *ret; bool propagate_err = *(bool *)arg; - if (prof_dump_ctx(propagate_err, ctx, &ctx->bt, ctxs)) - return (ctx_tree_next(ctxs, ctx)); + malloc_mutex_lock(gctx->lock); - return (NULL); -} + if (prof_dump_gctx(propagate_err, gctx, &gctx->bt, gctxs)) { + ret = gctx_tree_next(gctxs, gctx); + goto label_return; + } -static prof_ctx_t * -prof_ctx_cleanup_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg) -{ - - prof_dump_ctx_cleanup(ctx, ctxs); - - return (NULL); + ret = NULL; +label_return: + malloc_mutex_unlock(gctx->lock); + return (ret); } static bool prof_dump(bool propagate_err, const char *filename, bool leakcheck) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; prof_cnt_t cnt_all; size_t tabind; union { - prof_ctx_t *p; + prof_gctx_t *p; void *v; - } ctx; - size_t leak_nctx; - prof_ctx_tree_t ctxs; - prof_ctx_t *cleanup_start = NULL; + } gctx; + size_t leak_ngctx; + prof_gctx_tree_t gctxs; cassert(config_prof); - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (true); malloc_mutex_lock(&prof_dump_mtx); + prof_enter(tdata); - /* Merge per thread profile stats, and sum them in cnt_all. 
*/ + /* + * Put gctx's in limbo and clear their counters in preparation for + * summing. + */ + gctx_tree_new(&gctxs); + for (tabind = 0; ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v) == false;) + prof_dump_gctx_prep(gctx.p, &gctxs); + + /* + * Iterate over tdatas, and for the non-expired ones snapshot their tctx + * stats and merge them into the associated gctx's. + */ memset(&cnt_all, 0, sizeof(prof_cnt_t)); - leak_nctx = 0; - ctx_tree_new(&ctxs); - prof_enter(prof_tdata); - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) - prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctxs); - prof_leave(prof_tdata); + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_merge_iter, (void *)&cnt_all); + malloc_mutex_unlock(&tdatas_mtx); + + /* Merge tctx stats into gctx's. */ + leak_ngctx = 0; + gctx_tree_iter(&gctxs, NULL, prof_gctx_merge_iter, (void *)&leak_ngctx); + + prof_leave(tdata); /* Create dump file. */ if ((prof_dump_fd = prof_dump_open(propagate_err, filename)) == -1) @@ -1055,10 +1298,9 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_header(propagate_err, &cnt_all)) goto label_write_error; - /* Dump per ctx profile stats. */ - cleanup_start = ctx_tree_iter(&ctxs, NULL, prof_ctx_dump_iter, - (void *)&propagate_err); - if (cleanup_start != NULL) + /* Dump per gctx profile stats. */ + if (gctx_tree_iter(&gctxs, NULL, prof_gctx_dump_iter, + (void *)&propagate_err) != NULL) goto label_write_error; /* Dump /proc//maps if possible. */ @@ -1068,19 +1310,17 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_close(propagate_err)) goto label_open_close_error; + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL); malloc_mutex_unlock(&prof_dump_mtx); if (leakcheck) - prof_leakcheck(&cnt_all, leak_nctx, filename); + prof_leakcheck(&cnt_all, leak_ngctx, filename); return (false); label_write_error: prof_dump_close(propagate_err); label_open_close_error: - if (cleanup_start != NULL) { - ctx_tree_iter(&ctxs, cleanup_start, prof_ctx_cleanup_iter, - NULL); - } + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL); malloc_mutex_unlock(&prof_dump_mtx); return (true); } @@ -1128,18 +1368,18 @@ prof_fdump(void) void prof_idump(void) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; char filename[PATH_MAX + 1]; cassert(config_prof); if (prof_booted == false) return; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return; - if (prof_tdata->enq) { - prof_tdata->enq_idump = true; + if (tdata->enq) { + tdata->enq_idump = true; return; } @@ -1178,18 +1418,18 @@ prof_mdump(const char *filename) void prof_gdump(void) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; char filename[DUMP_FILENAME_BUFSIZE]; cassert(config_prof); if (prof_booted == false) return; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return; - if (prof_tdata->enq) { - prof_tdata->enq_gdump = true; + if (tdata->enq) { + tdata->enq_gdump = true; return; } @@ -1225,81 +1465,233 @@ prof_bt_keycomp(const void *k1, const void *k2) return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } -prof_tdata_t * -prof_tdata_init(void) +JEMALLOC_INLINE_C uint64_t +prof_thr_uid_alloc(void) { - prof_tdata_t *prof_tdata; + + return 
(atomic_add_uint64(&next_thr_uid, 1) - 1); +} + +static prof_tdata_t * +prof_tdata_init_impl(uint64_t thr_uid) +{ + prof_tdata_t *tdata; cassert(config_prof); /* Initialize an empty cache for this thread. */ - prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); - if (prof_tdata == NULL) + tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + if (tdata == NULL) return (NULL); - if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS, + tdata->lock = prof_tdata_mutex_choose(thr_uid); + tdata->thr_uid = thr_uid; + tdata->thread_name = NULL; + tdata->state = prof_tdata_state_attached; + + if (ckh_new(&tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloc(prof_tdata); + idalloc(tdata); return (NULL); } - prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata; - prof_sample_threshold_update(prof_tdata); + tdata->prng_state = (uint64_t)(uintptr_t)tdata; + prof_sample_threshold_update(tdata); - prof_tdata->enq = false; - prof_tdata->enq_idump = false; - prof_tdata->enq_gdump = false; + tdata->enq = false; + tdata->enq_idump = false; + tdata->enq_gdump = false; - prof_tdata_tsd_set(&prof_tdata); + tdata->dumping = false; + tdata->active = true; - return (prof_tdata); + prof_tdata_tsd_set(&tdata); + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_insert(&tdatas, tdata); + malloc_mutex_unlock(&tdatas_mtx); + + return (tdata); +} + +prof_tdata_t * +prof_tdata_init(void) +{ + + return (prof_tdata_init_impl(prof_thr_uid_alloc())); +} + +prof_tdata_t * +prof_tdata_reinit(prof_tdata_t *tdata) +{ + + return (prof_tdata_init_impl(tdata->thr_uid)); +} + +/* tdata->lock must be held. */ +static bool +prof_tdata_should_destroy(prof_tdata_t *tdata) +{ + + if (tdata->state == prof_tdata_state_attached) + return (false); + if (ckh_count(&tdata->bt2tctx) != 0) + return (false); + return (true); +} + +static void +prof_tdata_destroy(prof_tdata_t *tdata) +{ + + assert(prof_tdata_should_destroy(tdata)); + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_remove(&tdatas, tdata); + malloc_mutex_unlock(&tdatas_mtx); + + if (tdata->thread_name != NULL) + idalloc(tdata->thread_name); + ckh_delete(&tdata->bt2tctx); + idalloc(tdata); +} + +static void +prof_tdata_state_transition(prof_tdata_t *tdata, prof_tdata_state_t state) +{ + bool destroy_tdata; + + malloc_mutex_lock(tdata->lock); + if (tdata->state != state) { + tdata->state = state; + destroy_tdata = prof_tdata_should_destroy(tdata); + } else + destroy_tdata = false; + malloc_mutex_unlock(tdata->lock); + if (destroy_tdata) + prof_tdata_destroy(tdata); +} + +static void +prof_tdata_detach(prof_tdata_t *tdata) +{ + + prof_tdata_state_transition(tdata, prof_tdata_state_detached); +} + +static void +prof_tdata_expire(prof_tdata_t *tdata) +{ + + prof_tdata_state_transition(tdata, prof_tdata_state_expired); +} + +static prof_tdata_t * +prof_tdata_reset_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + + prof_tdata_expire(tdata); + return (NULL); +} + +void +prof_reset(size_t lg_sample) +{ + + assert(lg_sample < (sizeof(uint64_t) << 3)); + + malloc_mutex_lock(&prof_dump_mtx); + malloc_mutex_lock(&tdatas_mtx); + + lg_prof_sample = lg_sample; + tdata_tree_iter(&tdatas, NULL, prof_tdata_reset_iter, NULL); + + malloc_mutex_unlock(&tdatas_mtx); + malloc_mutex_unlock(&prof_dump_mtx); } void prof_tdata_cleanup(void *arg) { - prof_tdata_t *prof_tdata = *(prof_tdata_t **)arg; + prof_tdata_t *tdata = *(prof_tdata_t **)arg; cassert(config_prof); - if (prof_tdata == PROF_TDATA_STATE_REINCARNATED) { + if (tdata == 
PROF_TDATA_STATE_REINCARNATED) { /* * Another destructor deallocated memory after this destructor - * was called. Reset prof_tdata to PROF_TDATA_STATE_PURGATORY - * in order to receive another callback. + * was called. Reset tdata to PROF_TDATA_STATE_PURGATORY in + * order to receive another callback. */ - prof_tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&prof_tdata); - } else if (prof_tdata == PROF_TDATA_STATE_PURGATORY) { + tdata = PROF_TDATA_STATE_PURGATORY; + prof_tdata_tsd_set(&tdata); + } else if (tdata == PROF_TDATA_STATE_PURGATORY) { /* * The previous time this destructor was called, we set the key * to PROF_TDATA_STATE_PURGATORY so that other destructors - * wouldn't cause re-creation of the prof_tdata. This time, do + * wouldn't cause re-creation of the tdata. This time, do * nothing, so that the destructor will not be called again. */ - } else if (prof_tdata != NULL) { - union { - prof_thr_cnt_t *p; - void *v; - } cnt; - size_t tabind; - - /* - * Iteratively merge cnt's into the global stats and delete - * them. - */ - for (tabind = 0; ckh_iter(&prof_tdata->bt2cnt, &tabind, NULL, - &cnt.v);) { - prof_ctx_merge(cnt.p->ctx, cnt.p); - idalloc(cnt.v); - } - ckh_delete(&prof_tdata->bt2cnt); - idalloc(prof_tdata); - prof_tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&prof_tdata); + } else if (tdata != NULL) { + prof_tdata_detach(tdata); + tdata = PROF_TDATA_STATE_PURGATORY; + prof_tdata_tsd_set(&tdata); } } +const char * +prof_thread_name_get(void) +{ + prof_tdata_t *tdata = prof_tdata_get(true); + if (tdata == NULL) + return (NULL); + return (tdata->thread_name); +} + +bool +prof_thread_name_set(const char *thread_name) +{ + prof_tdata_t *tdata; + size_t size; + char *s; + + tdata = prof_tdata_get(true); + if (tdata == NULL) + return (true); + + size = strlen(thread_name) + 1; + s = imalloc(size); + if (s == NULL) + return (true); + + memcpy(s, thread_name, size); + if (tdata->thread_name != NULL) + idalloc(tdata->thread_name); + tdata->thread_name = s; + return (false); +} + +bool +prof_thread_active_get(void) +{ + prof_tdata_t *tdata = prof_tdata_get(true); + if (tdata == NULL) + return (false); + return (tdata->active); +} + +bool +prof_thread_active_set(bool active) +{ + prof_tdata_t *tdata; + + tdata = prof_tdata_get(true); + if (tdata == NULL) + return (true); + tdata->active = active; + return (false); +} + void prof_boot0(void) { @@ -1345,10 +1737,12 @@ prof_boot2(void) if (opt_prof) { unsigned i; - if (ckh_new(&bt2ctx, PROF_CKH_MINITEMS, prof_bt_hash, + lg_prof_sample = opt_lg_prof_sample; + + if (ckh_new(&bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) return (true); - if (malloc_mutex_init(&bt2ctx_mtx)) + if (malloc_mutex_init(&bt2gctx_mtx)) return (true); if (prof_tdata_tsd_boot()) { malloc_write( @@ -1356,6 +1750,12 @@ prof_boot2(void) abort(); } + tdata_tree_new(&tdatas); + if (malloc_mutex_init(&tdatas_mtx)) + return (true); + + next_thr_uid = 0; + if (malloc_mutex_init(&prof_dump_seq_mtx)) return (true); if (malloc_mutex_init(&prof_dump_mtx)) @@ -1367,12 +1767,21 @@ prof_boot2(void) abort(); } - ctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS * + gctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS * sizeof(malloc_mutex_t)); - if (ctx_locks == NULL) + if (gctx_locks == NULL) return (true); for (i = 0; i < PROF_NCTX_LOCKS; i++) { - if (malloc_mutex_init(&ctx_locks[i])) + if (malloc_mutex_init(&gctx_locks[i])) + return (true); + } + + tdata_locks = (malloc_mutex_t *)base_alloc(PROF_NTDATA_LOCKS * + 
sizeof(malloc_mutex_t)); + if (tdata_locks == NULL) + return (true); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + if (malloc_mutex_init(&tdata_locks[i])) return (true); } } @@ -1397,10 +1806,10 @@ prof_prefork(void) if (opt_prof) { unsigned i; - malloc_mutex_prefork(&bt2ctx_mtx); + malloc_mutex_prefork(&bt2gctx_mtx); malloc_mutex_prefork(&prof_dump_seq_mtx); for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_prefork(&ctx_locks[i]); + malloc_mutex_prefork(&gctx_locks[i]); } } @@ -1412,9 +1821,9 @@ prof_postfork_parent(void) unsigned i; for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_postfork_parent(&ctx_locks[i]); + malloc_mutex_postfork_parent(&gctx_locks[i]); malloc_mutex_postfork_parent(&prof_dump_seq_mtx); - malloc_mutex_postfork_parent(&bt2ctx_mtx); + malloc_mutex_postfork_parent(&bt2gctx_mtx); } } @@ -1426,9 +1835,9 @@ prof_postfork_child(void) unsigned i; for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_postfork_child(&ctx_locks[i]); + malloc_mutex_postfork_child(&gctx_locks[i]); malloc_mutex_postfork_child(&prof_dump_seq_mtx); - malloc_mutex_postfork_child(&bt2ctx_mtx); + malloc_mutex_postfork_child(&bt2gctx_mtx); } } diff --git a/src/stats.c b/src/stats.c index a0eb2971..db34275e 100644 --- a/src/stats.c +++ b/src/stats.c @@ -441,7 +441,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } if ((err = je_mallctl("opt.prof", &bv, &bsz, NULL, 0)) == 0 && bv) { - CTL_GET("opt.lg_prof_sample", &sv, size_t); + CTL_GET("prof.lg_sample", &sv, size_t); malloc_cprintf(write_cb, cbopaque, "Average profile sample interval: %"PRIu64 " (2^%zu)\n", (((uint64_t)1U) << sv), sv);
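
Note on the sampling math above: the interval computed in prof_sample_threshold_update() can be reproduced in isolation. The standalone C sketch below is not part of the patch; it draws bytes_until_sample as a geometrically distributed random variable with mean 2^lg_prof_sample via ceil(log(u)/log(1-p)) with p = 1/2^lg_prof_sample, as the prof.c comment describes. The PRNG is a plain 64-bit LCG stand-in for jemalloc's internal prng64() macro (only the multiplier/increment constants come from the patch); the seed, the helper names, and the r == 0 guard are illustrative assumptions.

/*
 * Illustrative sketch, not part of the patch: compute a sampling interval
 * as a geometrically distributed random variable with mean
 * 2^lg_prof_sample, mirroring the arithmetic in
 * prof_sample_threshold_update().
 */
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t prng_state = 42;	/* Arbitrary nonzero seed (assumed). */

static uint64_t
prng_draw53(void)
{

	/* Advance a 64-bit LCG and keep 53 bits for the double mantissa. */
	prng_state = prng_state * UINT64_C(6364136223846793005) +
	    UINT64_C(1442695040888963407);
	return (prng_state >> 11);
}

static uint64_t
sample_interval(unsigned lg_prof_sample)
{
	uint64_t r;
	double u, p;

	if (lg_prof_sample == 0)
		return (0);	/* Sample every allocation. */

	/* u uniform in (0, 1); redraw on the (vanishingly rare) r == 0. */
	do {
		r = prng_draw53();
	} while (r == 0);
	u = (double)r * (1.0 / 9007199254740992.0);	/* 2^53 */

	/* bytes_until_sample = ceil(log(u) / log(1-p)), p = 1/2^lg_prof_sample. */
	p = 1.0 / (double)((uint64_t)1U << lg_prof_sample);
	return ((uint64_t)(log(u) / log(1.0 - p)) + (uint64_t)1U);
}

int
main(void)
{
	unsigned i;

	/* With lg_prof_sample == 19, intervals average about 512 KiB. */
	for (i = 0; i < 5; i++) {
		printf("%"PRIu64" bytes until next sample\n",
		    sample_interval(19));
	}
	return (0);
}

Averaged over many draws, this yields roughly one sampled allocation per 2^lg_prof_sample bytes allocated, which is why the stats.c hunk above reports prof.lg_sample as the "Average profile sample interval".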