From 9a60cf54ec4b825a692330a1c56932fa1b121e27 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Wed, 18 Dec 2019 13:38:14 -0800 Subject: [PATCH] Last-N profiling mode --- Makefile.in | 2 + include/jemalloc/internal/arena_inlines_b.h | 16 +- include/jemalloc/internal/edata.h | 53 +- include/jemalloc/internal/large_externs.h | 3 +- include/jemalloc/internal/nstime.h | 10 +- include/jemalloc/internal/prof_externs.h | 9 + include/jemalloc/internal/prof_inlines_b.h | 20 +- include/jemalloc/internal/prof_recent.h | 16 + include/jemalloc/internal/prof_structs.h | 22 +- include/jemalloc/internal/prof_types.h | 4 + include/jemalloc/internal/witness.h | 1 + .../projects/vc2015/jemalloc/jemalloc.vcxproj | 1 + .../vc2015/jemalloc/jemalloc.vcxproj.filters | 3 + .../projects/vc2017/jemalloc/jemalloc.vcxproj | 1 + .../vc2017/jemalloc/jemalloc.vcxproj.filters | 3 + src/ctl.c | 45 +- src/extent.c | 2 + src/jemalloc.c | 48 +- src/large.c | 20 +- src/prof.c | 16 +- src/prof_data.c | 15 +- src/prof_recent.c | 553 ++++++++++++++++++ test/unit/mallctl.c | 1 + test/unit/prof_recent.c | 391 +++++++++++++ test/unit/prof_recent.sh | 5 + test/unit/prof_reset.sh | 2 +- 26 files changed, 1218 insertions(+), 44 deletions(-) create mode 100644 include/jemalloc/internal/prof_recent.h create mode 100644 src/prof_recent.c create mode 100644 test/unit/prof_recent.c create mode 100644 test/unit/prof_recent.sh diff --git a/Makefile.in b/Makefile.in index 40ba7f26..ad54720e 100644 --- a/Makefile.in +++ b/Makefile.in @@ -126,6 +126,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/prof.c \ $(srcroot)src/prof_data.c \ $(srcroot)src/prof_log.c \ + $(srcroot)src/prof_recent.c \ $(srcroot)src/rtree.c \ $(srcroot)src/safety_check.c \ $(srcroot)src/sc.c \ @@ -216,6 +217,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/prof_gdump.c \ $(srcroot)test/unit/prof_idump.c \ $(srcroot)test/unit/prof_log.c \ + $(srcroot)test/unit/prof_recent.c \ $(srcroot)test/unit/prof_reset.c \ $(srcroot)test/unit/prof_tctx.c \ $(srcroot)test/unit/prof_thread_name.c \ diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 28f2e97f..a310eb29 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -37,12 +37,12 @@ arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { JEMALLOC_ALWAYS_INLINE void arena_prof_info_get(tsd_t *tsd, const void *ptr, alloc_ctx_t *alloc_ctx, - prof_info_t *prof_info) { + prof_info_t *prof_info, bool reset_recent) { cassert(config_prof); assert(ptr != NULL); assert(prof_info != NULL); - const edata_t *edata; + edata_t *edata = NULL; bool is_slab; /* Static check. */ @@ -55,10 +55,14 @@ arena_prof_info_get(tsd_t *tsd, const void *ptr, alloc_ctx_t *alloc_ctx, if (unlikely(!is_slab)) { /* edata must have been initialized at this point. */ - large_prof_info_get(edata, prof_info); + assert(edata != NULL); + large_prof_info_get(tsd, edata, prof_info, reset_recent); } else { - memset(prof_info, 0, sizeof(prof_info_t)); prof_info->alloc_tctx = (prof_tctx_t *)(uintptr_t)1U; + /* + * No need to set other fields in prof_info; they will never be + * accessed if (uintptr_t)alloc_tctx == (uintptr_t)1U. 
+ */ } } @@ -92,11 +96,9 @@ arena_prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) { } JEMALLOC_ALWAYS_INLINE void -arena_prof_info_set(tsd_t *tsd, const void *ptr, prof_tctx_t *tctx) { +arena_prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx) { cassert(config_prof); - assert(ptr != NULL); - edata_t *edata = iealloc(tsd_tsdn(tsd), ptr); assert(!edata_slab_get(edata)); large_prof_info_set(edata, tctx); } diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h index 86f5ac57..2a81bdc6 100644 --- a/include/jemalloc/internal/edata.h +++ b/include/jemalloc/internal/edata.h @@ -25,6 +25,20 @@ enum extent_head_state_e { }; typedef enum extent_head_state_e extent_head_state_t; +struct e_prof_info_s { + /* Time when this was allocated. */ + nstime_t e_prof_alloc_time; + /* Points to a prof_tctx_t. */ + atomic_p_t e_prof_tctx; + /* + * Points to a prof_recent_t for the allocation; NULL + * means the recent allocation record no longer exists. + * Protected by prof_recent_alloc_mtx. + */ + atomic_p_t e_prof_recent_alloc; +}; +typedef struct e_prof_info_s e_prof_info_t; + /* Extent (span of pages). Use accessor functions for e_* fields. */ typedef struct edata_s edata_t; typedef ql_head(edata_t) edata_list_t; @@ -186,12 +200,7 @@ struct edata_s { slab_data_t e_slab_data; /* Profiling data, used for large objects. */ - struct { - /* Time when this was allocated. */ - nstime_t e_alloc_time; - /* Points to a prof_tctx_t. */ - atomic_p_t e_prof_tctx; - }; + e_prof_info_t e_prof_info; }; }; @@ -333,12 +342,21 @@ edata_slab_data_get_const(const edata_t *edata) { return &edata->e_slab_data; } -static inline void -edata_prof_info_get(const edata_t *edata, prof_info_t *prof_info) { - assert(prof_info != NULL); - prof_info->alloc_tctx = (prof_tctx_t *)atomic_load_p( - &edata->e_prof_tctx, ATOMIC_ACQUIRE); - prof_info->alloc_time = edata->e_alloc_time; +static inline prof_tctx_t * +edata_prof_tctx_get(const edata_t *edata) { + return (prof_tctx_t *)atomic_load_p(&edata->e_prof_info.e_prof_tctx, + ATOMIC_ACQUIRE); +} + +static inline const nstime_t * +edata_prof_alloc_time_get(const edata_t *edata) { + return &edata->e_prof_info.e_prof_alloc_time; +} + +static inline prof_recent_t * +edata_prof_recent_alloc_get_dont_call_directly(const edata_t *edata) { + return (prof_recent_t *)atomic_load_p( + &edata->e_prof_info.e_prof_recent_alloc, ATOMIC_RELAXED); } static inline void @@ -457,12 +475,19 @@ edata_slab_set(edata_t *edata, bool slab) { static inline void edata_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) { - atomic_store_p(&edata->e_prof_tctx, tctx, ATOMIC_RELEASE); + atomic_store_p(&edata->e_prof_info.e_prof_tctx, tctx, ATOMIC_RELEASE); } static inline void edata_prof_alloc_time_set(edata_t *edata, nstime_t *t) { - nstime_copy(&edata->e_alloc_time, t); + nstime_copy(&edata->e_prof_info.e_prof_alloc_time, t); +} + +static inline void +edata_prof_recent_alloc_set_dont_call_directly(edata_t *edata, + prof_recent_t *recent_alloc) { + atomic_store_p(&edata->e_prof_info.e_prof_recent_alloc, recent_alloc, + ATOMIC_RELAXED); } static inline bool diff --git a/include/jemalloc/internal/large_externs.h b/include/jemalloc/internal/large_externs.h index fe5e606b..05e6c442 100644 --- a/include/jemalloc/internal/large_externs.h +++ b/include/jemalloc/internal/large_externs.h @@ -22,7 +22,8 @@ void large_dalloc_prep_junked_locked(tsdn_t *tsdn, edata_t *edata); void large_dalloc_finish(tsdn_t *tsdn, edata_t *edata); void large_dalloc(tsdn_t *tsdn, edata_t *edata); size_t 
large_salloc(tsdn_t *tsdn, const edata_t *edata); -void large_prof_info_get(const edata_t *edata, prof_info_t *prof_info); +void large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, + bool reset_recent); void large_prof_tctx_reset(edata_t *edata); void large_prof_info_set(edata_t *edata, prof_tctx_t *tctx); diff --git a/include/jemalloc/internal/nstime.h b/include/jemalloc/internal/nstime.h index a3766ff2..c4bee24d 100644 --- a/include/jemalloc/internal/nstime.h +++ b/include/jemalloc/internal/nstime.h @@ -9,6 +9,8 @@ typedef struct { uint64_t ns; } nstime_t; +static const nstime_t zero = NSTIME_ZERO_INITIALIZER; + void nstime_init(nstime_t *time, uint64_t ns); void nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec); uint64_t nstime_ns(const nstime_t *time); @@ -35,8 +37,14 @@ bool nstime_init_update(nstime_t *time); JEMALLOC_ALWAYS_INLINE void nstime_init_zero(nstime_t *time) { - static const nstime_t zero = NSTIME_ZERO_INITIALIZER; nstime_copy(time, &zero); } +JEMALLOC_ALWAYS_INLINE bool +nstime_equals_zero(nstime_t *time) { + int diff = nstime_compare(time, &zero); + assert(diff >= 0); + return diff == 0; +} + #endif /* JEMALLOC_INTERNAL_NSTIME_H */ diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h index 9ba363bf..a07fd22b 100644 --- a/include/jemalloc/internal/prof_externs.h +++ b/include/jemalloc/internal/prof_externs.h @@ -24,6 +24,10 @@ extern char opt_prof_prefix[ #endif 1]; +/* For recording recent allocations */ +extern ssize_t opt_prof_recent_alloc_max; +extern malloc_mutex_t prof_recent_alloc_mtx; + /* Accessed via prof_active_[gs]et{_unlocked,}(). */ extern bool prof_active; @@ -99,4 +103,9 @@ void prof_sample_threshold_update(tsd_t *tsd); bool prof_log_start(tsdn_t *tsdn, const char *filename); bool prof_log_stop(tsdn_t *tsdn); +ssize_t prof_recent_alloc_max_ctl_read(); +ssize_t prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max); +void prof_recent_alloc_dump(tsd_t *tsd, void (*write_cb)(void *, const char *), + void *cbopaque); + #endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */ diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 186446bb..9ea0454c 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -46,7 +46,17 @@ prof_info_get(tsd_t *tsd, const void *ptr, alloc_ctx_t *alloc_ctx, assert(ptr != NULL); assert(prof_info != NULL); - arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info); + arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, false); +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_get_and_reset_recent(tsd_t *tsd, const void *ptr, + alloc_ctx_t *alloc_ctx, prof_info_t *prof_info) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, true); } JEMALLOC_ALWAYS_INLINE void @@ -66,12 +76,12 @@ prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) { } JEMALLOC_ALWAYS_INLINE void -prof_info_set(tsd_t *tsd, const void *ptr, prof_tctx_t *tctx) { +prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx) { cassert(config_prof); - assert(ptr != NULL); + assert(edata != NULL); assert((uintptr_t)tctx > (uintptr_t)1U); - arena_prof_info_set(tsd, ptr, tctx); + arena_prof_info_set(tsd, edata, tctx); } JEMALLOC_ALWAYS_INLINE bool @@ -190,7 +200,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx, JEMALLOC_ALWAYS_INLINE void prof_free(tsd_t *tsd, const void *ptr, size_t usize, alloc_ctx_t 
*alloc_ctx) { prof_info_t prof_info; - prof_info_get(tsd, ptr, alloc_ctx, &prof_info); + prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); cassert(config_prof); assert(usize == isalloc(tsd_tsdn(tsd), ptr)); diff --git a/include/jemalloc/internal/prof_recent.h b/include/jemalloc/internal/prof_recent.h new file mode 100644 index 00000000..d0e9e1e1 --- /dev/null +++ b/include/jemalloc/internal/prof_recent.h @@ -0,0 +1,16 @@ +#ifndef JEMALLOC_INTERNAL_PROF_RECENT_EXTERNS_H +#define JEMALLOC_INTERNAL_PROF_RECENT_EXTERNS_H + +bool prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx); +void prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t usize); +void prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata); +bool prof_recent_init(); +void edata_prof_recent_alloc_init(edata_t *edata); +#ifdef JEMALLOC_JET +prof_recent_t *prof_recent_alloc_begin(tsd_t *tsd); +prof_recent_t *prof_recent_alloc_end(tsd_t *tsd); +prof_recent_t *prof_recent_alloc_next(tsd_t *tsd, prof_recent_t *node); +prof_recent_t *edata_prof_recent_alloc_get(tsd_t *tsd, const edata_t *edata); +#endif + +#endif /* JEMALLOC_INTERNAL_PROF_RECENT_EXTERNS_H */ diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h index 6223adc8..59c0f4ff 100644 --- a/include/jemalloc/internal/prof_structs.h +++ b/include/jemalloc/internal/prof_structs.h @@ -2,6 +2,7 @@ #define JEMALLOC_INTERNAL_PROF_STRUCTS_H #include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/edata.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/prng.h" #include "jemalloc/internal/rb.h" @@ -55,6 +56,12 @@ struct prof_tctx_s { uint64_t thr_uid; uint64_t thr_discrim; + /* + * Reference count of how many times this tctx object is referenced in + * recent allocation / deallocation records, protected by tdata->lock. + */ + uint64_t recent_count; + /* Profiling counters, protected by tdata->lock. */ prof_cnt_t cnts; @@ -97,10 +104,10 @@ struct prof_tctx_s { typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; struct prof_info_s { - /* Points to the prof_tctx_t corresponding to the allocation. */ - prof_tctx_t *alloc_tctx; /* Time when the allocation was made. */ nstime_t alloc_time; + /* Points to the prof_tctx_t corresponding to the allocation. */ + prof_tctx_t *alloc_tctx; }; struct prof_gctx_s { @@ -201,4 +208,15 @@ struct prof_tdata_s { }; typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; +struct prof_recent_s { + nstime_t alloc_time; + nstime_t dalloc_time; + + prof_recent_t *next; + size_t usize; + prof_tctx_t *alloc_tctx; + edata_t *alloc_edata; /* NULL means allocation has been freed. */ + prof_tctx_t *dalloc_tctx; +}; + #endif /* JEMALLOC_INTERNAL_PROF_STRUCTS_H */ diff --git a/include/jemalloc/internal/prof_types.h b/include/jemalloc/internal/prof_types.h index ad095da3..498962db 100644 --- a/include/jemalloc/internal/prof_types.h +++ b/include/jemalloc/internal/prof_types.h @@ -8,6 +8,7 @@ typedef struct prof_tctx_s prof_tctx_t; typedef struct prof_info_s prof_info_t; typedef struct prof_gctx_s prof_gctx_t; typedef struct prof_tdata_s prof_tdata_t; +typedef struct prof_recent_s prof_recent_t; /* Option defaults. */ #ifdef JEMALLOC_PROF @@ -53,4 +54,7 @@ typedef struct prof_tdata_s prof_tdata_t; #define PROF_DUMP_FILENAME_LEN 1 #endif +/* Default number of recent allocations to record. 
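+ * -1 means unlimited and 0 (the default) disables recording; the limit is
+ * set at startup via the prof_recent_alloc_max option and can be adjusted
+ * at run time through the experimental.prof_recent.alloc_max mallctl.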
*/ +#define PROF_RECENT_ALLOC_MAX_DEFAULT 0 + #endif /* JEMALLOC_INTERNAL_PROF_TYPES_H */ diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h index 985e0a33..4ed787a2 100644 --- a/include/jemalloc/internal/witness.h +++ b/include/jemalloc/internal/witness.h @@ -61,6 +61,7 @@ #define WITNESS_RANK_PROF_GDUMP WITNESS_RANK_LEAF #define WITNESS_RANK_PROF_NEXT_THR_UID WITNESS_RANK_LEAF #define WITNESS_RANK_PROF_THREAD_ACTIVE_INIT WITNESS_RANK_LEAF +#define WITNESS_RANK_PROF_RECENT_ALLOC WITNESS_RANK_LEAF /******************************************************************************/ /* PER-WITNESS DATA */ diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj index 58790903..f9af3ddd 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj @@ -67,6 +67,7 @@ + diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters index 3551ba5e..90f8831d 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters @@ -82,6 +82,9 @@ Source Files + + Source Files + Source Files diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj index 631de575..4ca484ac 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj @@ -67,6 +67,7 @@ + diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters index 3551ba5e..90f8831d 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters @@ -82,6 +82,9 @@ Source Files + + Source Files + Source Files diff --git a/src/ctl.c b/src/ctl.c index eee12770..5a467d5a 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -113,6 +113,7 @@ CTL_PROTO(opt_prof_gdump) CTL_PROTO(opt_prof_final) CTL_PROTO(opt_prof_leak) CTL_PROTO(opt_prof_accum) +CTL_PROTO(opt_prof_recent_alloc_max) CTL_PROTO(opt_zero_realloc) CTL_PROTO(tcache_create) CTL_PROTO(tcache_flush) @@ -232,6 +233,7 @@ CTL_PROTO(experimental_utilization_query) CTL_PROTO(experimental_utilization_batch_query) CTL_PROTO(experimental_arenas_i_pactivep) INDEX_PROTO(experimental_arenas_i) +CTL_PROTO(experimental_prof_recent_alloc_max) #define MUTEX_STATS_CTL_PROTO_GEN(n) \ CTL_PROTO(stats_##n##_num_ops) \ @@ -343,6 +345,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("prof_final"), CTL(opt_prof_final)}, {NAME("prof_leak"), CTL(opt_prof_leak)}, {NAME("prof_accum"), CTL(opt_prof_accum)}, + {NAME("prof_recent_alloc_max"), CTL(opt_prof_recent_alloc_max)}, {NAME("zero_realloc"), CTL(opt_zero_realloc)} }; @@ -620,10 +623,15 @@ static const ctl_indexed_node_t experimental_arenas_node[] = { {INDEX(experimental_arenas_i)} }; +static const ctl_named_node_t experimental_prof_recent_node[] = { + {NAME("alloc_max"), CTL(experimental_prof_recent_alloc_max)}, +}; + static const ctl_named_node_t experimental_node[] = { {NAME("hooks"), CHILD(named, experimental_hooks)}, {NAME("utilization"), CHILD(named, experimental_utilization)}, - {NAME("arenas"), CHILD(indexed, experimental_arenas)} + {NAME("arenas"), CHILD(indexed, experimental_arenas)}, + {NAME("prof_recent"), CHILD(named, experimental_prof_recent)} }; static const ctl_named_node_t root_node[] = { @@ -1791,6 +1799,8 @@ CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, 
opt_lg_prof_interval, ssize_t) CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool) CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool) CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_recent_alloc_max, + opt_prof_recent_alloc_max, ssize_t) CTL_RO_NL_GEN(opt_zero_realloc, zero_realloc_mode_names[opt_zero_realloc_action], const char *) @@ -3461,3 +3471,36 @@ label_return: malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); return ret; } + +static int +experimental_prof_recent_alloc_max_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!(config_prof && opt_prof)) { + ret = ENOENT; + goto label_return; + } + + ssize_t old_max; + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + ssize_t max = *(ssize_t *)newp; + if (max < -1) { + ret = EINVAL; + goto label_return; + } + old_max = prof_recent_alloc_max_ctl_write(tsd, max); + } else { + old_max = prof_recent_alloc_max_ctl_read(); + } + READ(old_max, ssize_t); + + ret = 0; + +label_return: + return ret; +} diff --git a/src/extent.c b/src/extent.c index 8d78f95f..54f14995 100644 --- a/src/extent.c +++ b/src/extent.c @@ -1562,6 +1562,8 @@ extent_merge_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_cache_t *edata_cache, bool extent_boot(void) { + assert(sizeof(slab_data_t) >= sizeof(e_prof_info_t)); + if (rtree_new(&extents_rtree, true)) { return true; } diff --git a/src/jemalloc.c b/src/jemalloc.c index 825a8ed0..7184cbb0 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1402,6 +1402,8 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], CONF_HANDLE_BOOL(opt_prof_final, "prof_final") CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") CONF_HANDLE_BOOL(opt_prof_log, "prof_log") + CONF_HANDLE_SSIZE_T(opt_prof_recent_alloc_max, + "prof_recent_alloc_max", -1, SSIZE_MAX) } if (config_log) { if (CONF_MATCH("log")) { @@ -3015,7 +3017,7 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, size_t alignment, size_t *usize, bool zero, tcache_t *tcache, arena_t *arena, alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) { prof_info_t old_prof_info; - prof_info_get(tsd, old_ptr, alloc_ctx, &old_prof_info); + prof_info_get_and_reset_recent(tsd, old_ptr, alloc_ctx, &old_prof_info); bool prof_active = prof_active_get_unlocked(); prof_tctx_t *tctx = prof_alloc_prep(tsd, *usize, prof_active, false); void *p; @@ -3265,8 +3267,13 @@ ixallocx_prof_sample(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, JEMALLOC_ALWAYS_INLINE size_t ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, bool zero, alloc_ctx_t *alloc_ctx) { + /* + * old_prof_info is only used for asserting that the profiling info + * isn't changed by the ixalloc() call. + */ prof_info_t old_prof_info; prof_info_get(tsd, ptr, alloc_ctx, &old_prof_info); + /* * usize isn't knowable before ixalloc() returns when extra is non-zero. 
* Therefore, compute its maximum possible value and use that in @@ -3315,13 +3322,26 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, */ thread_event(tsd, usize - usize_max); } - if (usize == old_usize) { - prof_alloc_rollback(tsd, tctx, false); - return usize; - } - prof_realloc(tsd, ptr, usize, tctx, prof_active, ptr, old_usize, - &old_prof_info); + /* + * At this point we can still safely get the original profiling + * information associated with the ptr, because (a) the edata_t object + * associated with the ptr still lives and (b) the profiling info + * fields are not touched. "(a)" is asserted in the outer je_xallocx() + * function, and "(b)" is indirectly verified below by checking that + * the alloc_tctx field is unchanged. + */ + prof_info_t prof_info; + if (usize == old_usize) { + prof_info_get(tsd, ptr, alloc_ctx, &prof_info); + prof_alloc_rollback(tsd, tctx, false); + } else { + prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); + prof_realloc(tsd, ptr, usize, tctx, prof_active, ptr, + old_usize, &prof_info); + } + + assert(old_prof_info.alloc_tctx == prof_info.alloc_tctx); return usize; } @@ -3342,6 +3362,13 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); + /* + * old_edata is only for verifying that xallocx() keeps the edata_t + * object associated with the ptr (though the content of the edata_t + * object can be changed). + */ + edata_t *old_edata = iealloc(tsd_tsdn(tsd), ptr); + alloc_ctx_t alloc_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, @@ -3374,6 +3401,13 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { extra, alignment, zero); thread_event(tsd, usize); } + + /* + * xallocx() should keep using the same edata_t object (though its + * content can be changed). + */ + assert(iealloc(tsd_tsdn(tsd), ptr) == old_edata); + if (unlikely(usize == old_usize)) { thread_event_rollback(tsd, usize); goto label_not_resized; diff --git a/src/large.c b/src/large.c index 5ca09f68..ca35fc54 100644 --- a/src/large.c +++ b/src/large.c @@ -5,6 +5,7 @@ #include "jemalloc/internal/assert.h" #include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_recent.h" #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/util.h" @@ -368,8 +369,22 @@ large_salloc(tsdn_t *tsdn, const edata_t *edata) { } void -large_prof_info_get(const edata_t *edata, prof_info_t *prof_info) { - edata_prof_info_get(edata, prof_info); +large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, + bool reset_recent) { + assert(prof_info != NULL); + nstime_copy(&prof_info->alloc_time, edata_prof_alloc_time_get(edata)); + + prof_tctx_t *alloc_tctx = edata_prof_tctx_get(edata); + prof_info->alloc_tctx = alloc_tctx; + + if (reset_recent && (uintptr_t)alloc_tctx > (uintptr_t)1U) { + /* + * This allocation was a prof sample. Reset the pointer on the + * recent allocation record, so that this allocation is + * recorded as released. 
+ */ + prof_recent_alloc_reset(tsd, edata); + } } static void @@ -388,4 +403,5 @@ large_prof_info_set(edata_t *edata, prof_tctx_t *tctx) { nstime_t t; nstime_init_update(&t); edata_prof_alloc_time_set(edata, &t); + edata_prof_recent_alloc_init(edata); } diff --git a/src/prof.c b/src/prof.c index 33b68198..159600e7 100644 --- a/src/prof.c +++ b/src/prof.c @@ -7,6 +7,7 @@ #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/prof_data.h" #include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_recent.h" #include "jemalloc/internal/thread_event.h" /* @@ -146,7 +147,8 @@ prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated) { void prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx) { - prof_info_set(tsd, ptr, tctx); + edata_t *edata = iealloc(tsd_tsdn(tsd), ptr); + prof_info_set(tsd, edata, tctx); malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); tctx->cnts.curobjs++; @@ -155,8 +157,13 @@ prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t usize, tctx->cnts.accumobjs++; tctx->cnts.accumbytes += usize; } + bool record_recent = prof_recent_alloc_prepare(tsd, tctx); tctx->prepared = false; malloc_mutex_unlock(tsd_tsdn(tsd), tctx->tdata->lock); + if (record_recent) { + assert(tctx == edata_prof_tctx_get(edata)); + prof_recent_alloc(tsd, edata, usize); + } } void @@ -1068,6 +1075,10 @@ prof_boot2(tsd_t *tsd) { return true; } + if (prof_recent_init()) { + return true; + } + gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), b0get(), PROF_NCTX_LOCKS * sizeof(malloc_mutex_t), CACHELINE); @@ -1134,6 +1145,7 @@ prof_prefork1(tsdn_t *tsdn) { malloc_mutex_prefork(tsdn, &prof_gdump_mtx); malloc_mutex_prefork(tsdn, &next_thr_uid_mtx); malloc_mutex_prefork(tsdn, &prof_thread_active_init_mtx); + malloc_mutex_prefork(tsdn, &prof_recent_alloc_mtx); } } @@ -1142,6 +1154,7 @@ prof_postfork_parent(tsdn_t *tsdn) { if (config_prof && opt_prof) { unsigned i; + malloc_mutex_postfork_parent(tsdn, &prof_recent_alloc_mtx); malloc_mutex_postfork_parent(tsdn, &prof_thread_active_init_mtx); malloc_mutex_postfork_parent(tsdn, &next_thr_uid_mtx); @@ -1166,6 +1179,7 @@ prof_postfork_child(tsdn_t *tsdn) { if (config_prof && opt_prof) { unsigned i; + malloc_mutex_postfork_child(tsdn, &prof_recent_alloc_mtx); malloc_mutex_postfork_child(tsdn, &prof_thread_active_init_mtx); malloc_mutex_postfork_child(tsdn, &next_thr_uid_mtx); malloc_mutex_postfork_child(tsdn, &prof_gdump_mtx); diff --git a/src/prof_data.c b/src/prof_data.c index 690070e6..dfc507f9 100644 --- a/src/prof_data.c +++ b/src/prof_data.c @@ -378,6 +378,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) { ret.p->tdata = tdata; ret.p->thr_uid = tdata->thr_uid; ret.p->thr_discrim = tdata->thr_discrim; + ret.p->recent_count = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); ret.p->gctx = gctx; ret.p->tctx_uid = tdata->tctx_uid_next++; @@ -405,8 +406,15 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) { prof_tctx_t * prof_tctx_create(tsd_t *tsd) { - prof_tdata_t *tdata = prof_tdata_get(tsd, false); - assert(tdata != NULL); + if (tsd_reentrancy_level_get(tsd) > 0) { + return NULL; + } + + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return NULL; + } + prof_bt_t bt; bt_init(&bt, tdata->vec); prof_backtrace(tsd, &bt); @@ -1417,6 +1425,9 @@ prof_tctx_should_destroy(tsd_t *tsd, prof_tctx_t *tctx) { if (tctx->prepared) { return false; } + if (tctx->recent_count != 0) { + return false; + } return true; } diff --git a/src/prof_recent.c b/src/prof_recent.c new file mode 100644 
index 00000000..98349aca --- /dev/null +++ b/src/prof_recent.c @@ -0,0 +1,553 @@ +#define JEMALLOC_PROF_RECENT_C_ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/emitter.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_recent.h" + +#ifndef JEMALLOC_JET +# define STATIC_INLINE_IF_NOT_TEST static inline +#else +# define STATIC_INLINE_IF_NOT_TEST +#endif + +ssize_t opt_prof_recent_alloc_max = PROF_RECENT_ALLOC_MAX_DEFAULT; +malloc_mutex_t prof_recent_alloc_mtx; /* Protects the fields below */ +static atomic_zd_t prof_recent_alloc_max; +static ssize_t prof_recent_alloc_count = 0; +static prof_recent_t *prof_recent_alloc_dummy = NULL; + +static void +prof_recent_alloc_max_init() { + atomic_store_zd(&prof_recent_alloc_max, opt_prof_recent_alloc_max, + ATOMIC_RELAXED); +} + +static inline ssize_t +prof_recent_alloc_max_get_no_lock() { + return atomic_load_zd(&prof_recent_alloc_max, ATOMIC_RELAXED); +} + +static inline ssize_t +prof_recent_alloc_max_get(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + return prof_recent_alloc_max_get_no_lock(); +} + +static inline ssize_t +prof_recent_alloc_max_update(tsd_t *tsd, ssize_t max) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + ssize_t old_max = prof_recent_alloc_max_get(tsd); + atomic_store_zd(&prof_recent_alloc_max, max, ATOMIC_RELAXED); + return old_max; +} + +static inline void +increment_recent_count(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + ++tctx->recent_count; + assert(tctx->recent_count > 0); +} + +bool +prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx) { + assert(opt_prof && prof_booted); + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + /* + * Check whether last-N mode is turned on without trying to acquire the + * lock, so as to optimize for the following two scenarios: + * (1) Last-N mode is switched off; + * (2) Dumping, during which last-N mode is temporarily turned off so + * as not to block sampled allocations. + */ + if (prof_recent_alloc_max_get_no_lock() == 0) { + return false; + } + + /* + * Increment recent_count to hold the tctx so that it won't be gone + * even after tctx->tdata->lock is released. This acts as a + * "placeholder"; the real recording of the allocation requires a lock + * on prof_recent_alloc_mtx and is done in prof_recent_alloc (when + * tctx->tdata->lock has been released). 
+ */ + increment_recent_count(tsd, tctx); + return true; +} + +static void +decrement_recent_count(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(tctx != NULL); + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + assert(tctx->recent_count > 0); + --tctx->recent_count; + prof_tctx_try_destroy(tsd, tctx); +} + +void +edata_prof_recent_alloc_init(edata_t *edata) { + edata_prof_recent_alloc_set_dont_call_directly(edata, NULL); +} + +static inline prof_recent_t * +edata_prof_recent_alloc_get_no_lock(const edata_t *edata) { + return edata_prof_recent_alloc_get_dont_call_directly(edata); +} + +STATIC_INLINE_IF_NOT_TEST prof_recent_t * +edata_prof_recent_alloc_get(tsd_t *tsd, const edata_t *edata) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_t *recent_alloc = + edata_prof_recent_alloc_get_no_lock(edata); + assert(recent_alloc == NULL || recent_alloc->alloc_edata == edata); + return recent_alloc; +} + +static prof_recent_t * +edata_prof_recent_alloc_update_internal(tsd_t *tsd, edata_t *edata, + prof_recent_t *recent_alloc) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_t *old_recent_alloc = + edata_prof_recent_alloc_get(tsd, edata); + edata_prof_recent_alloc_set_dont_call_directly(edata, recent_alloc); + return old_recent_alloc; +} + +static void +edata_prof_recent_alloc_set(tsd_t *tsd, edata_t *edata, + prof_recent_t *recent_alloc) { + assert(recent_alloc != NULL); + prof_recent_t *old_recent_alloc = + edata_prof_recent_alloc_update_internal(tsd, edata, recent_alloc); + assert(old_recent_alloc == NULL); + recent_alloc->alloc_edata = edata; +} + +static void +edata_prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata, + prof_recent_t *recent_alloc) { + assert(recent_alloc != NULL); + prof_recent_t *old_recent_alloc = + edata_prof_recent_alloc_update_internal(tsd, edata, NULL); + assert(old_recent_alloc == recent_alloc); + assert(edata == recent_alloc->alloc_edata); + recent_alloc->alloc_edata = NULL; +} + +/* + * This function should be called right before an allocation is released, so + * that the associated recent allocation record can contain the following + * information: + * (1) The allocation is released; + * (2) The time of the deallocation; and + * (3) The prof_tctx associated with the deallocation. + */ +void +prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata) { + /* + * Check whether the recent allocation record still exists without + * trying to acquire the lock. + */ + if (edata_prof_recent_alloc_get_no_lock(edata) == NULL) { + return; + } + + prof_tctx_t *dalloc_tctx = prof_tctx_create(tsd); + /* + * In case dalloc_tctx is NULL, e.g. due to OOM, we will not record the + * deallocation time / tctx, which is handled later, after we check + * again when holding the lock. + */ + + if (dalloc_tctx != NULL) { + malloc_mutex_lock(tsd_tsdn(tsd), dalloc_tctx->tdata->lock); + increment_recent_count(tsd, dalloc_tctx); + dalloc_tctx->prepared = false; + malloc_mutex_unlock(tsd_tsdn(tsd), dalloc_tctx->tdata->lock); + } + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + /* Check again after acquiring the lock. 
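+	 * The record may already be gone by now: a concurrent sampled
+	 * allocation may have recycled it as the oldest record, or it may
+	 * have been evicted because prof_recent_alloc_max was lowered.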
*/ + prof_recent_t *recent = edata_prof_recent_alloc_get(tsd, edata); + if (recent != NULL) { + edata_prof_recent_alloc_reset(tsd, edata, recent); + assert(nstime_equals_zero(&recent->dalloc_time)); + assert(recent->dalloc_tctx == NULL); + if (dalloc_tctx != NULL) { + nstime_update(&recent->dalloc_time); + recent->dalloc_tctx = dalloc_tctx; + } + } else if (dalloc_tctx != NULL) { + /* We lost the rase - the allocation record was just gone. */ + decrement_recent_count(tsd, dalloc_tctx); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); +} + +static void +prof_recent_alloc_evict_edata(tsd_t *tsd, prof_recent_t *recent) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (recent->alloc_edata != NULL) { + edata_prof_recent_alloc_reset(tsd, recent->alloc_edata, recent); + } +} + +STATIC_INLINE_IF_NOT_TEST prof_recent_t * +prof_recent_alloc_begin(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(prof_recent_alloc_dummy != NULL); + return prof_recent_alloc_dummy->next; +} + +STATIC_INLINE_IF_NOT_TEST prof_recent_t * +prof_recent_alloc_end(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(prof_recent_alloc_dummy != NULL); + return prof_recent_alloc_dummy; +} + +STATIC_INLINE_IF_NOT_TEST prof_recent_t * +prof_recent_alloc_next(tsd_t *tsd, prof_recent_t *node) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(prof_recent_alloc_dummy != NULL); + assert(node != NULL && node != prof_recent_alloc_dummy); + return node->next; +} + +static bool +prof_recent_alloc_is_empty(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (prof_recent_alloc_begin(tsd) == prof_recent_alloc_end(tsd)) { + assert(prof_recent_alloc_count == 0); + return true; + } else { + assert(prof_recent_alloc_count > 0); + return false; + } +} + +static void +prof_recent_alloc_assert_count(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (config_debug) { + ssize_t count = 0; + prof_recent_t *n = prof_recent_alloc_begin(tsd); + while (n != prof_recent_alloc_end(tsd)) { + ++count; + n = prof_recent_alloc_next(tsd, n); + } + assert(count == prof_recent_alloc_count); + assert(prof_recent_alloc_max_get(tsd) == -1 || + count <= prof_recent_alloc_max_get(tsd)); + } +} + +void +prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t usize) { + assert(edata != NULL); + prof_tctx_t *tctx = edata_prof_tctx_get(edata); + + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), tctx->tdata->lock); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + + /* + * Reserve a new prof_recent_t node if needed. If needed, we release + * the prof_recent_alloc_mtx lock and allocate. Then, rather than + * immediately checking for OOM, we regain the lock and try to make use + * of the reserve node if needed. There are six scenarios: + * + * \ now | no need | need but OOMed | need and allocated + * later \ | | | + * ------------------------------------------------------------ + * no need | (1) | (2) | (3) + * ------------------------------------------------------------ + * need | (4) | (5) | (6) + * + * First, "(4)" never happens, because we don't release the lock in the + * middle if there's no need for a new node; in such cases "(1)" always + * takes place, which is trivial. + * + * Out of the remaining four scenarios, "(6)" is the common case and is + * trivial. 
"(5)" is also trivial, in which case we'll rollback the + * effect of prof_recent_alloc_prepare() as expected. + * + * "(2)" / "(3)" occurs when the need for a new node is gone after we + * regain the lock. If the new node is successfully allocated, i.e. in + * the case of "(3)", we'll release it in the end; otherwise, i.e. in + * the case of "(2)", we do nothing - we're lucky that the OOM ends up + * doing no harm at all. + * + * Therefore, the only performance cost of the "release lock" -> + * "allocate" -> "regain lock" design is the "(3)" case, but it happens + * very rarely, so the cost is relatively small compared to the gain of + * not having to have the lock order of prof_recent_alloc_mtx above all + * the allocation locks. + */ + prof_recent_t *reserve = NULL; + if (prof_recent_alloc_max_get(tsd) == -1 || + prof_recent_alloc_count < prof_recent_alloc_max_get(tsd)) { + assert(prof_recent_alloc_max_get(tsd) != 0); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + reserve = (prof_recent_t *)iallocztm(tsd_tsdn(tsd), + sizeof(prof_recent_t), sz_size2index(sizeof(prof_recent_t)), + false, NULL, true, arena_get(tsd_tsdn(tsd), 0, false), + true); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + } + + if (prof_recent_alloc_max_get(tsd) == 0) { + assert(prof_recent_alloc_is_empty(tsd)); + goto label_rollback; + } + + assert(prof_recent_alloc_dummy != NULL); + { + /* Fill content into the dummy node. */ + prof_recent_t *node = prof_recent_alloc_dummy; + node->usize = usize; + nstime_copy(&node->alloc_time, + edata_prof_alloc_time_get(edata)); + node->alloc_tctx = tctx; + edata_prof_recent_alloc_set(tsd, edata, node); + nstime_init_zero(&node->dalloc_time); + node->dalloc_tctx = NULL; + } + + prof_tctx_t *old_alloc_tctx, *old_dalloc_tctx; + if (prof_recent_alloc_count == prof_recent_alloc_max_get(tsd)) { + /* If upper limit is reached, simply shift the dummy. */ + assert(prof_recent_alloc_max_get(tsd) != -1); + assert(!prof_recent_alloc_is_empty(tsd)); + prof_recent_alloc_dummy = prof_recent_alloc_dummy->next; + old_alloc_tctx = prof_recent_alloc_dummy->alloc_tctx; + assert(old_alloc_tctx != NULL); + old_dalloc_tctx = prof_recent_alloc_dummy->dalloc_tctx; + prof_recent_alloc_evict_edata(tsd, prof_recent_alloc_dummy); + } else { + /* Otherwise use the new node as the dummy. */ + assert(prof_recent_alloc_max_get(tsd) == -1 || + prof_recent_alloc_count < prof_recent_alloc_max_get(tsd)); + if (reserve == NULL) { + goto label_rollback; + } + reserve->next = prof_recent_alloc_dummy->next; + prof_recent_alloc_dummy->next = reserve; + prof_recent_alloc_dummy = reserve; + reserve = NULL; + old_alloc_tctx = NULL; + old_dalloc_tctx = NULL; + ++prof_recent_alloc_count; + } + + assert(!prof_recent_alloc_is_empty(tsd)); + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + if (reserve != NULL) { + idalloctm(tsd_tsdn(tsd), reserve, NULL, NULL, true, true); + } + + /* + * Asynchronously handle the tctx of the old node, so that there's no + * simultaneous holdings of prof_recent_alloc_mtx and tdata->lock. + * In the worst case this may delay the tctx release but it's better + * than holding prof_recent_alloc_mtx for longer. 
+ */ + if (old_alloc_tctx != NULL) { + decrement_recent_count(tsd, old_alloc_tctx); + } + if (old_dalloc_tctx != NULL) { + decrement_recent_count(tsd, old_dalloc_tctx); + } + return; + +label_rollback: + assert(edata_prof_recent_alloc_get(tsd, edata) == NULL); + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (reserve != NULL) { + idalloctm(tsd_tsdn(tsd), reserve, NULL, NULL, true, true); + } + decrement_recent_count(tsd, tctx); +} + +ssize_t +prof_recent_alloc_max_ctl_read() { + /* Don't bother to acquire the lock. */ + return prof_recent_alloc_max_get_no_lock(); +} + +ssize_t +prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max) { + assert(max >= -1); + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + + const ssize_t old_max = prof_recent_alloc_max_update(tsd, max); + + if (max == -1 || prof_recent_alloc_count <= max) { + /* Easy case - no need to alter the list. */ + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + return old_max; + } + + prof_recent_t *begin = prof_recent_alloc_dummy->next; + /* For verification purpose only. */ + ssize_t count = prof_recent_alloc_count - max; + do { + assert(!prof_recent_alloc_is_empty(tsd)); + prof_recent_t *node = prof_recent_alloc_dummy->next; + assert(node != prof_recent_alloc_dummy); + prof_recent_alloc_evict_edata(tsd, node); + prof_recent_alloc_dummy->next = node->next; + --prof_recent_alloc_count; + } while (prof_recent_alloc_count > max); + prof_recent_t *end = prof_recent_alloc_dummy->next; + assert(begin != end); + + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + /* + * Asynchronously handle the tctx of the to-be-deleted nodes, so that + * there's no simultaneous holdings of prof_recent_alloc_mtx and + * tdata->lock. In the worst case there can be slightly extra space + * overhead taken by these nodes, but the total number of nodes at any + * time is bounded by (max + sum(decreases)), where "max" means the + * most recent prof_recent_alloc_max and "sum(decreases)" means the + * sum of the deltas of all decreases in prof_recent_alloc_max in the + * past. This (max + sum(decreases)) value is completely transparent + * to and controlled by application. 
+ */ + do { + prof_recent_t *node = begin; + decrement_recent_count(tsd, node->alloc_tctx); + if (node->dalloc_tctx != NULL) { + decrement_recent_count(tsd, node->dalloc_tctx); + } + begin = node->next; + idalloctm(tsd_tsdn(tsd), node, NULL, NULL, true, true); + --count; + } while (begin != end); + assert(count == 0); + + return old_max; +} + +static void +dump_bt(emitter_t *emitter, prof_tctx_t *tctx) { + char bt_buf[2 * sizeof(intptr_t) + 3]; + char *s = bt_buf; + assert(tctx != NULL); + prof_bt_t *bt = &tctx->gctx->bt; + for (size_t i = 0; i < bt->len; ++i) { + malloc_snprintf(bt_buf, sizeof(bt_buf), "%p", bt->vec[i]); + emitter_json_value(emitter, emitter_type_string, &s); + } +} + +#define PROF_RECENT_PRINT_BUFSIZE 4096 +void +prof_recent_alloc_dump(tsd_t *tsd, void (*write_cb)(void *, const char *), + void *cbopaque) { + char *buf = (char *)iallocztm(tsd_tsdn(tsd), PROF_RECENT_PRINT_BUFSIZE, + sz_size2index(PROF_RECENT_PRINT_BUFSIZE), false, NULL, true, + arena_get(tsd_tsdn(tsd), 0, false), true); + buf_writer_arg_t buf_arg = {write_cb, cbopaque, buf, + PROF_RECENT_PRINT_BUFSIZE - 1, 0}; + emitter_t emitter; + emitter_init(&emitter, emitter_output_json_compact, buffered_write_cb, + &buf_arg); + emitter_begin(&emitter); + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + + /* + * Set prof_recent_alloc_max to 0 so that dumping won't block sampled + * allocations: the allocations can complete but will not be recorded. + */ + ssize_t max = prof_recent_alloc_max_update(tsd, 0); + + emitter_json_kv(&emitter, "recent_alloc_max", emitter_type_ssize, &max); + + emitter_json_array_kv_begin(&emitter, "recent_alloc"); + for (prof_recent_t *n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + emitter_json_object_begin(&emitter); + + emitter_json_kv(&emitter, "usize", emitter_type_size, + &n->usize); + bool released = n->alloc_edata == NULL; + emitter_json_kv(&emitter, "released", emitter_type_bool, + &released); + + emitter_json_kv(&emitter, "alloc_thread_uid", + emitter_type_uint64, &n->alloc_tctx->thr_uid); + uint64_t alloc_time_ns = nstime_ns(&n->alloc_time); + emitter_json_kv(&emitter, "alloc_time", emitter_type_uint64, + &alloc_time_ns); + emitter_json_array_kv_begin(&emitter, "alloc_trace"); + dump_bt(&emitter, n->alloc_tctx); + emitter_json_array_end(&emitter); + + if (n->dalloc_tctx != NULL) { + assert(released); + emitter_json_kv(&emitter, "dalloc_thread_uid", + emitter_type_uint64, &n->dalloc_tctx->thr_uid); + assert(!nstime_equals_zero(&n->dalloc_time)); + uint64_t dalloc_time_ns = nstime_ns(&n->dalloc_time); + emitter_json_kv(&emitter, "dalloc_time", + emitter_type_uint64, &dalloc_time_ns); + emitter_json_array_kv_begin(&emitter, "dalloc_trace"); + dump_bt(&emitter, n->dalloc_tctx); + emitter_json_array_end(&emitter); + } else { + assert(nstime_equals_zero(&n->dalloc_time)); + } + + emitter_json_object_end(&emitter); + } + emitter_json_array_end(&emitter); + + max = prof_recent_alloc_max_update(tsd, max); + assert(max == 0); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + emitter_end(&emitter); + buf_writer_flush(&buf_arg); + idalloctm(tsd_tsdn(tsd), buf, NULL, NULL, true, true); +} +#undef PROF_RECENT_PRINT_BUFSIZE + +bool +prof_recent_init() { + prof_recent_alloc_max_init(); + + if (malloc_mutex_init(&prof_recent_alloc_mtx, + "prof_recent_alloc", WITNESS_RANK_PROF_RECENT_ALLOC, + malloc_mutex_rank_exclusive)) { + return true; + } + + 
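+	/*
+	 * The recent allocation list is kept as a circular, singly linked
+	 * list headed by the dummy sentinel node allocated below: an empty
+	 * list is the dummy pointing to itself, and iteration runs from
+	 * dummy->next back around to the dummy.
+	 */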
assert(prof_recent_alloc_dummy == NULL); + prof_recent_alloc_dummy = (prof_recent_t *)iallocztm( + TSDN_NULL, sizeof(prof_recent_t), + sz_size2index(sizeof(prof_recent_t)), false, NULL, true, + arena_get(TSDN_NULL, 0, true), true); + if (prof_recent_alloc_dummy == NULL) { + return true; + } + prof_recent_alloc_dummy->next = prof_recent_alloc_dummy; + + return false; +} diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index ebbaed7d..d317b4af 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -188,6 +188,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(bool, prof_gdump, prof); TEST_MALLCTL_OPT(bool, prof_final, prof); TEST_MALLCTL_OPT(bool, prof_leak, prof); + TEST_MALLCTL_OPT(ssize_t, prof_recent_alloc_max, prof); #undef TEST_MALLCTL_OPT } diff --git a/test/unit/prof_recent.c b/test/unit/prof_recent.c new file mode 100644 index 00000000..e10ac3fe --- /dev/null +++ b/test/unit/prof_recent.c @@ -0,0 +1,391 @@ +#include "test/jemalloc_test.h" + +#include "jemalloc/internal/prof_recent.h" + +/* As specified in the shell script */ +#define OPT_ALLOC_MAX 3 + +/* Invariant before and after every test (when config_prof is on) */ +static void confirm_prof_setup(tsd_t *tsd) { + /* Options */ + assert_true(opt_prof, "opt_prof not on"); + assert_true(opt_prof_active, "opt_prof_active not on"); + assert_zd_eq(opt_prof_recent_alloc_max, OPT_ALLOC_MAX, + "opt_prof_recent_alloc_max not set correctly"); + + /* Dynamics */ + assert_true(prof_active, "prof_active not on"); + assert_zd_eq(prof_recent_alloc_max_ctl_read(tsd), OPT_ALLOC_MAX, + "prof_recent_alloc_max not set correctly"); +} + +TEST_BEGIN(test_confirm_setup) { + test_skip_if(!config_prof); + confirm_prof_setup(tsd_fetch()); +} +TEST_END + +TEST_BEGIN(test_prof_recent_off) { + test_skip_if(config_prof); + + const ssize_t past_ref = 0, future_ref = 0; + const size_t len_ref = sizeof(ssize_t); + + ssize_t past = past_ref, future = future_ref; + size_t len = len_ref; + +#define ASSERT_SHOULD_FAIL(opt, a, b, c, d) do { \ + assert_d_eq(mallctl("experimental.prof_recent." 
opt, a, b, c, \ + d), ENOENT, "Should return ENOENT when config_prof is off");\ + assert_zd_eq(past, past_ref, "output was touched"); \ + assert_zu_eq(len, len_ref, "output length was touched"); \ + assert_zd_eq(future, future_ref, "input was touched"); \ +} while (0) + + ASSERT_SHOULD_FAIL("alloc_max", NULL, NULL, NULL, 0); + ASSERT_SHOULD_FAIL("alloc_max", &past, &len, NULL, 0); + ASSERT_SHOULD_FAIL("alloc_max", NULL, NULL, &future, len); + ASSERT_SHOULD_FAIL("alloc_max", &past, &len, &future, len); + +#undef ASSERT_SHOULD_FAIL +} +TEST_END + +TEST_BEGIN(test_prof_recent_on) { + test_skip_if(!config_prof); + + ssize_t past, future; + size_t len = sizeof(ssize_t); + + tsd_t *tsd = tsd_fetch(); + + confirm_prof_setup(tsd); + + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, NULL, 0), 0, "no-op mallctl should be allowed"); + confirm_prof_setup(tsd); + + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + &past, &len, NULL, 0), 0, "Read error"); + assert_zd_eq(past, OPT_ALLOC_MAX, "Wrong read result"); + future = OPT_ALLOC_MAX + 1; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, len), 0, "Write error"); + future = -1; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + &past, &len, &future, len), 0, "Read/write error"); + assert_zd_eq(past, OPT_ALLOC_MAX + 1, "Wrong read result"); + future = -2; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + &past, &len, &future, len), EINVAL, + "Invalid write should return EINVAL"); + assert_zd_eq(past, OPT_ALLOC_MAX + 1, + "Output should not be touched given invalid write"); + future = OPT_ALLOC_MAX; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + &past, &len, &future, len), 0, "Read/write error"); + assert_zd_eq(past, -1, "Wrong read result"); + future = OPT_ALLOC_MAX + 2; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + &past, &len, &future, len * 2), EINVAL, + "Invalid write should return EINVAL"); + assert_zd_eq(past, -1, + "Output should not be touched given invalid write"); + + confirm_prof_setup(tsd); +} +TEST_END + +/* Reproducible sequence of request sizes */ +#define NTH_REQ_SIZE(n) ((n) * 97 + 101) + +static void confirm_malloc(tsd_t *tsd, void *p) { + assert_ptr_not_null(p, "malloc failed unexpectedly"); + edata_t *e = iealloc(TSDN_NULL, p); + assert_ptr_not_null(e, "NULL edata for living pointer"); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_t *n = edata_prof_recent_alloc_get(tsd, e); + assert_ptr_not_null(n, "Record in edata should not be NULL"); + assert_ptr_not_null(n->alloc_tctx, + "alloc_tctx in record should not be NULL"); + assert_ptr_eq(e, n->alloc_edata, + "edata pointer in record is not correct"); + assert_ptr_null(n->dalloc_tctx, "dalloc_tctx in record should be NULL"); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); +} + +static void confirm_record_size(tsd_t *tsd, prof_recent_t *n, unsigned kth) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_zu_eq(n->usize, sz_s2u(NTH_REQ_SIZE(kth)), + "Recorded allocation usize is wrong"); +} + +static void confirm_record_living(tsd_t *tsd, prof_recent_t *n) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_ptr_not_null(n->alloc_tctx, + "alloc_tctx in record should not be NULL"); + assert_ptr_not_null(n->alloc_edata, + "Recorded edata should not be NULL for living pointer"); + assert_ptr_eq(n, edata_prof_recent_alloc_get(tsd, n->alloc_edata), + "Record in edata is not 
correct"); + assert_ptr_null(n->dalloc_tctx, "dalloc_tctx in record should be NULL"); +} + +static void confirm_record_released(tsd_t *tsd, prof_recent_t *n) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_ptr_not_null(n->alloc_tctx, + "alloc_tctx in record should not be NULL"); + assert_ptr_null(n->alloc_edata, + "Recorded edata should be NULL for released pointer"); + assert_ptr_not_null(n->dalloc_tctx, + "dalloc_tctx in record should not be NULL for released pointer"); +} + +TEST_BEGIN(test_prof_recent_alloc) { + test_skip_if(!config_prof); + + bool b; + unsigned i, c; + size_t req_size; + void *p; + prof_recent_t *n; + ssize_t future; + + tsd_t *tsd = tsd_fetch(); + + confirm_prof_setup(tsd); + + /* + * First batch of 2 * OPT_ALLOC_MAX allocations. After the + * (OPT_ALLOC_MAX - 1)'th allocation the recorded allocations should + * always be the last OPT_ALLOC_MAX allocations coming from here. + */ + for (i = 0; i < 2 * OPT_ALLOC_MAX; ++i) { + req_size = NTH_REQ_SIZE(i); + p = malloc(req_size); + confirm_malloc(tsd, p); + if (i < OPT_ALLOC_MAX - 1) { + malloc_mutex_lock(tsd_tsdn(tsd), + &prof_recent_alloc_mtx); + assert_ptr_ne(prof_recent_alloc_begin(tsd), + prof_recent_alloc_end(tsd), + "Empty recent allocation"); + malloc_mutex_unlock(tsd_tsdn(tsd), + &prof_recent_alloc_mtx); + free(p); + /* + * The recorded allocations may still include some + * other allocations before the test run started, + * so keep allocating without checking anything. + */ + continue; + } + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + ++c; + confirm_record_size(tsd, n, i + c - OPT_ALLOC_MAX); + if (c == OPT_ALLOC_MAX) { + confirm_record_living(tsd, n); + } else { + confirm_record_released(tsd, n); + } + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX, + "Incorrect total number of allocations"); + free(p); + } + + confirm_prof_setup(tsd); + + b = false; + assert_d_eq(mallctl("prof.active", NULL, NULL, &b, sizeof(bool)), 0, + "mallctl for turning off prof_active failed"); + + /* + * Second batch of OPT_ALLOC_MAX allocations. Since prof_active is + * turned off, this batch shouldn't be recorded. + */ + for (; i < 3 * OPT_ALLOC_MAX; ++i) { + req_size = NTH_REQ_SIZE(i); + p = malloc(req_size); + assert_ptr_not_null(p, "malloc failed unexpectedly"); + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + confirm_record_size(tsd, n, c + OPT_ALLOC_MAX); + confirm_record_released(tsd, n); + ++c; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX, + "Incorrect total number of allocations"); + free(p); + } + + b = true; + assert_d_eq(mallctl("prof.active", NULL, NULL, &b, sizeof(bool)), 0, + "mallctl for turning on prof_active failed"); + + confirm_prof_setup(tsd); + + /* + * Third batch of OPT_ALLOC_MAX allocations. Since prof_active is + * turned back on, they should be recorded, and in the list of recorded + * allocations they should follow the first batch rather than the + * second batch. 
+ */ + for (; i < 4 * OPT_ALLOC_MAX; ++i) { + req_size = NTH_REQ_SIZE(i); + p = malloc(req_size); + confirm_malloc(tsd, p); + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + ++c; + confirm_record_size(tsd, n, + /* Is the allocation from the third batch? */ + i + c - OPT_ALLOC_MAX >= 3 * OPT_ALLOC_MAX ? + /* If yes, then it's just recorded. */ + i + c - OPT_ALLOC_MAX : + /* + * Otherwise, it should come from the first batch + * instead of the second batch. + */ + i + c - 2 * OPT_ALLOC_MAX); + if (c == OPT_ALLOC_MAX) { + confirm_record_living(tsd, n); + } else { + confirm_record_released(tsd, n); + } + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX, + "Incorrect total number of allocations"); + free(p); + } + + /* Increasing the limit shouldn't alter the list of records. */ + future = OPT_ALLOC_MAX + 1; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + confirm_record_size(tsd, n, c + 3 * OPT_ALLOC_MAX); + confirm_record_released(tsd, n); + ++c; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX, + "Incorrect total number of allocations"); + + /* + * Decreasing the limit shouldn't alter the list of records as long as + * the new limit is still no less than the length of the list. + */ + future = OPT_ALLOC_MAX; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + confirm_record_size(tsd, n, c + 3 * OPT_ALLOC_MAX); + confirm_record_released(tsd, n); + ++c; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX, + "Incorrect total number of allocations"); + + /* + * Decreasing the limit should shorten the list of records if the new + * limit is less than the length of the list. + */ + future = OPT_ALLOC_MAX - 1; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + ++c; + confirm_record_size(tsd, n, c + 3 * OPT_ALLOC_MAX); + confirm_record_released(tsd, n); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX - 1, + "Incorrect total number of allocations"); + + /* Setting to unlimited shouldn't alter the list of records. 
*/ + future = -1; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + c = 0; + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + for (n = prof_recent_alloc_begin(tsd); + n != prof_recent_alloc_end(tsd); + n = prof_recent_alloc_next(tsd, n)) { + ++c; + confirm_record_size(tsd, n, c + 3 * OPT_ALLOC_MAX); + confirm_record_released(tsd, n); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert_u_eq(c, OPT_ALLOC_MAX - 1, + "Incorrect total number of allocations"); + + /* Downshift to only one record. */ + future = 1; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + n = prof_recent_alloc_begin(tsd); + assert(n != prof_recent_alloc_end(tsd)); + confirm_record_size(tsd, n, 4 * OPT_ALLOC_MAX - 1); + confirm_record_released(tsd, n); + n = prof_recent_alloc_next(tsd, n); + assert(n == prof_recent_alloc_end(tsd)); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + /* Completely turn off. */ + future = 0; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(prof_recent_alloc_begin(tsd) == prof_recent_alloc_end(tsd)); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + /* Restore the settings. */ + future = OPT_ALLOC_MAX; + assert_d_eq(mallctl("experimental.prof_recent.alloc_max", + NULL, NULL, &future, sizeof(ssize_t)), 0, "Write error"); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(prof_recent_alloc_begin(tsd) == prof_recent_alloc_end(tsd)); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + confirm_prof_setup(tsd); +} +TEST_END + +#undef NTH_REQ_SIZE + +int +main(void) { + return test( + test_confirm_setup, + test_prof_recent_off, + test_prof_recent_on, + test_prof_recent_alloc); +} diff --git a/test/unit/prof_recent.sh b/test/unit/prof_recent.sh new file mode 100644 index 00000000..59759a6a --- /dev/null +++ b/test/unit/prof_recent.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ "x${enable_prof}" = "x1" ] ; then + export MALLOC_CONF="prof:true,lg_prof_sample:0,prof_recent_alloc_max:3" +fi diff --git a/test/unit/prof_reset.sh b/test/unit/prof_reset.sh index 43c516a0..daefeb70 100644 --- a/test/unit/prof_reset.sh +++ b/test/unit/prof_reset.sh @@ -1,5 +1,5 @@ #!/bin/sh if [ "x${enable_prof}" = "x1" ] ; then - export MALLOC_CONF="prof:true,prof_active:false,lg_prof_sample:0" + export MALLOC_CONF="prof:true,prof_active:false,lg_prof_sample:0,prof_recent_alloc_max:0" fi
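Usage sketch (not part of the patch): the snippet below shows how an application built against a profiling-enabled jemalloc could read and adjust the last-N limit through the new experimental.prof_recent.alloc_max mallctl. The helper name and the chosen limit are illustrative assumptions; the control only succeeds when jemalloc is built and run with profiling enabled, e.g. MALLOC_CONF="prof:true,lg_prof_sample:0,prof_recent_alloc_max:3" as used in test/unit/prof_recent.sh.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Illustrative helper (hypothetical name); not part of the patch. */
static void
adjust_prof_recent_alloc_max(ssize_t new_max) {
	ssize_t old_max;
	size_t len = sizeof(old_max);

	/* Read the current limit: -1 means unlimited, 0 means off. */
	if (mallctl("experimental.prof_recent.alloc_max", &old_max, &len,
	    NULL, 0) != 0) {
		/* ENOENT unless built and run with profiling enabled. */
		return;
	}
	printf("prof_recent.alloc_max was %zd\n", old_max);

	/*
	 * Write a new limit; newlen must be sizeof(ssize_t), and values
	 * below -1 are rejected with EINVAL.  The previous limit is
	 * returned through oldp in the same call.
	 */
	if (mallctl("experimental.prof_recent.alloc_max", &old_max, &len,
	    &new_max, sizeof(new_max)) == 0) {
		printf("changed from %zd to %zd\n", old_max, new_max);
	}
}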