From 6599651aee2b1b1ab0c52fdb03f23394bd683c47 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Fri, 16 Oct 2020 13:14:59 -0700
Subject: [PATCH] PA: Use an SEC in front of the HPA shard.

This places a small extent cache (SEC) in front of the HPA shard. We
intend HPA-enabled configurations to use many fewer arenas, and therefore
to have a higher risk of hot locks; the SEC lets cached extents be
recycled without taking the shard mutex.
---
 include/jemalloc/internal/arena_externs.h |  3 +-
 include/jemalloc/internal/ctl.h           |  1 +
 include/jemalloc/internal/hpa.h           |  6 ++--
 .../internal/jemalloc_internal_externs.h  |  5 ++++
 include/jemalloc/internal/mutex_prof.h    |  3 +-
 include/jemalloc/internal/pa.h            | 20 +++++++++----
 src/arena.c                               | 28 ++++++++++++-----
 src/ctl.c                                 | 19 +++++++++++-
 src/hpa.c                                 |  6 ++--
 src/jemalloc.c                            | 30 ++++++++++++++-----
 src/pa.c                                  | 19 ++++++++----
 src/pa_extra.c                            | 25 ++++++++++++----
 src/stats.c                               |  8 +++++
 src/tcache.c                              |  6 ++--
 test/unit/mallctl.c                       |  3 ++
 15 files changed, 141 insertions(+), 41 deletions(-)

diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index c8e1e38d..40223b58 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -28,7 +28,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
     bin_stats_data_t *bstats, arena_stats_large_t *lstats,
-    pac_estats_t *estats, hpa_shard_stats_t *hpastats);
+    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
 void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena);
 #ifdef JEMALLOC_JET
 size_t arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr);
@@ -99,6 +99,7 @@ void arena_prefork4(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork5(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork6(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork7(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork8(tsdn_t *tsdn, arena_t *arena);
 void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
 void arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
 
diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h
index 305d3655..a6ae05c1 100644
--- a/include/jemalloc/internal/ctl.h
+++ b/include/jemalloc/internal/ctl.h
@@ -46,6 +46,7 @@ typedef struct ctl_arena_stats_s {
     arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
     pac_estats_t estats[SC_NPSIZES];
     hpa_shard_stats_t hpastats;
+    sec_stats_t secstats;
 } ctl_arena_stats_t;
 
 typedef struct ctl_stats_s {
diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h
index 3fe9fc48..24c68560 100644
--- a/include/jemalloc/internal/hpa.h
+++ b/include/jemalloc/internal/hpa.h
@@ -90,10 +90,10 @@ void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
 
 /*
  * We share the fork ordering with the PA and arena prefork handling; that's why
- * these are 2 and 3 rather than 0 or 1.
+ * these are 3 and 4 rather than 0 and 1.
  */
-void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
 
 void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
 
@@ -103,7 +103,7 @@ void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
  * so it needs to be lower in the witness ordering, but it's also logically
  * global and not tied to any particular arena.
  */
-void hpa_prefork3(tsdn_t *tsdn, hpa_t *hpa);
+void hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa);
 void hpa_postfork_parent(tsdn_t *tsdn, hpa_t *hpa);
 void hpa_postfork_child(tsdn_t *tsdn, hpa_t *hpa);
 
diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h
index 8faadaa1..814a7a1b 100644
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -17,6 +17,11 @@ extern size_t opt_hpa_slab_goal;
 extern size_t opt_hpa_slab_max_alloc;
 extern size_t opt_hpa_small_max;
 extern size_t opt_hpa_large_min;
+
+extern size_t opt_hpa_sec_max_alloc;
+extern size_t opt_hpa_sec_max_bytes;
+extern size_t opt_hpa_sec_nshards;
+
 extern const char *opt_junk;
 extern bool opt_junk_alloc;
 extern bool opt_junk_free;
diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h
index 970f469b..ef0bf0d3 100644
--- a/include/jemalloc/internal/mutex_prof.h
+++ b/include/jemalloc/internal/mutex_prof.h
@@ -33,7 +33,8 @@ typedef enum {
     OP(base) \
     OP(tcache_list) \
     OP(hpa_shard) \
-    OP(hpa_shard_grow)
+    OP(hpa_shard_grow) \
+    OP(hpa_sec)
 
 typedef enum {
 #define OP(mtx) arena_prof_mutex_##mtx,
diff --git a/include/jemalloc/internal/pa.h b/include/jemalloc/internal/pa.h
index d138f2f0..5e97d0b0 100644
--- a/include/jemalloc/internal/pa.h
+++ b/include/jemalloc/internal/pa.h
@@ -10,6 +10,7 @@
 #include "jemalloc/internal/lockedint.h"
 #include "jemalloc/internal/pac.h"
 #include "jemalloc/internal/pai.h"
+#include "jemalloc/internal/sec.h"
 
 /*
  * The page allocator; responsible for acquiring pages of memory for
@@ -85,7 +86,12 @@ struct pa_shard_s {
     /* Allocates from a PAC. */
     pac_t pac;
 
-    /* Allocates from a HPA. */
+    /*
+     * We place a small extent cache in front of the HPA, since we intend
+     * these configurations to use many fewer arenas, and therefore have a
+     * higher risk of hot locks.
+     */
+    sec_t hpa_sec;
     hpa_shard_t hpa_shard;
 
     /* The source of edata_t objects. */
@@ -124,18 +130,20 @@ bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base,
  * that we can boot without worrying about the HPA, then turn it on in a0.
  */
 bool pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
-    size_t ps_alloc_max, size_t small_max, size_t large_min);
+    size_t ps_alloc_max, size_t small_max, size_t large_min, size_t sec_nshards,
+    size_t sec_alloc_max, size_t sec_bytes_max);
 
 /*
  * We stop using the HPA when custom extent hooks are installed, but still
  * redirect deallocations to it.
  */
-void pa_shard_disable_hpa(pa_shard_t *shard);
+void pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard);
 
 /*
  * This does the PA-specific parts of arena reset (i.e. freeing all active
  * allocations).
  */
-void pa_shard_reset(pa_shard_t *shard);
+void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
+
 /*
  * Destroy all the remaining retained extents. Should only be called after
  * decaying all active, dirty, and muzzy extents to the retained state, as the
@@ -184,6 +192,7 @@ void pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard);
 
@@ -192,7 +201,8 @@ void pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive,
 
 void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
     pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
-    hpa_shard_stats_t *hpa_stats_out, size_t *resident);
+    hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
+    size_t *resident);
 
 /*
  * Reads the PA-owned mutex stats into the output stats array, at the
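
A note on the design, since the diff only shows the integration points: the
SEC plugs into the page allocator's generic pai_t interface, with the HPA
shard's pai as its fallback; that is what the sec_init() call in src/pa.c
below wires up. A rough sketch of the resulting allocation path follows.
It is illustrative only: sec_shard_pick() and sec_shard_cache_pop() are
hypothetical stand-ins for sec.c internals, and the field names are assumed
from the sec_init() parameters.

/*
 * Sketch of the SEC's pai_alloc implementation; not the literal sec.c code.
 */
static edata_t *
sec_alloc_sketch(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment,
    bool zero) {
    sec_t *sec = (sec_t *)self;
    if (size > sec->alloc_max) {
        /* Too large to cache; go straight to the HPA shard. */
        return pai_alloc(tsdn, sec->fallback, size, alignment, zero);
    }
    /* Try the per-shard cache of freed extents first... */
    edata_t *edata = sec_shard_cache_pop(tsdn, sec_shard_pick(sec), size);
    if (edata == NULL) {
        /* ...and fall back to the wrapped allocator on a miss. */
        edata = pai_alloc(tsdn, sec->fallback, size, alignment, zero);
    }
    return edata;
}

Deallocation is symmetric: pa_get_pai() in src/pa.c below now returns
&shard->hpa_sec.pai for HPA-owned extents, so frees refill the cache (up to
sec_bytes_max) instead of immediately taking the shard mutex.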
diff --git a/src/arena.c b/src/arena.c
index dc58a287..360827ef 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -81,7 +81,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
     bin_stats_data_t *bstats, arena_stats_large_t *lstats,
-    pac_estats_t *estats, hpa_shard_stats_t *hpastats) {
+    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats) {
     cassert(config_stats);
 
     arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms,
@@ -139,7 +139,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     }
 
     pa_shard_stats_merge(tsdn, &arena->pa_shard, &astats->pa_shard_stats,
-        estats, hpastats, &astats->resident);
+        estats, hpastats, secstats, &astats->resident);
 
     LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx);
 
@@ -483,6 +483,14 @@ arena_decay_muzzy(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
 
 void
 arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) {
+    if (all) {
+        /*
+         * We should take a purge of "all" to mean "save as much memory
+         * as possible", including flushing any caches (for situations
+         * like thread death, or manual purge calls).
+         */
+        sec_flush(tsdn, &arena->pa_shard.hpa_sec);
+    }
     if (arena_decay_dirty(tsdn, arena, is_background_thread, all)) {
         return;
     }
@@ -631,7 +639,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
                 &arena->bins[i].bin_shards[j]);
         }
     }
-    pa_shard_reset(&arena->pa_shard);
+    pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard);
 }
 
 void
@@ -1362,7 +1370,7 @@ arena_set_extent_hooks(tsd_t *tsd, arena_t *arena,
         malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
     }
     /* No using the HPA now that we have the custom hooks. */
-    pa_shard_disable_hpa(&arena->pa_shard);
+    pa_shard_disable_hpa(tsd_tsdn(tsd), &arena->pa_shard);
     extent_hooks_t *ret = base_extent_hooks_set(arena->base, extent_hooks);
     if (have_background_thread) {
         malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
@@ -1529,7 +1537,8 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
     if (opt_hpa && ehooks_are_default(base_ehooks_get(base)) && ind != 0) {
         if (pa_shard_enable_hpa(&arena->pa_shard, &arena_hpa_global,
             opt_hpa_slab_goal, opt_hpa_slab_max_alloc,
-            opt_hpa_small_max, opt_hpa_large_min)) {
+            opt_hpa_small_max, opt_hpa_large_min, opt_hpa_sec_nshards,
+            opt_hpa_sec_max_alloc, opt_hpa_sec_max_bytes)) {
             goto label_error;
         }
     }
@@ -1658,16 +1667,21 @@ arena_prefork4(tsdn_t *tsdn, arena_t *arena) {
 
 void
 arena_prefork5(tsdn_t *tsdn, arena_t *arena) {
-    base_prefork(tsdn, arena->base);
+    pa_shard_prefork5(tsdn, &arena->pa_shard);
 }
 
 void
 arena_prefork6(tsdn_t *tsdn, arena_t *arena) {
-    malloc_mutex_prefork(tsdn, &arena->large_mtx);
+    base_prefork(tsdn, arena->base);
 }
 
 void
 arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
+    malloc_mutex_prefork(tsdn, &arena->large_mtx);
+}
+
+void
+arena_prefork8(tsdn_t *tsdn, arena_t *arena) {
     for (unsigned i = 0; i < SC_NBINS; i++) {
         for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
             bin_prefork(tsdn, &arena->bins[i].bin_shards[j]);
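
One consequence of the arena_decay() hunk above is that a full purge now
also empties the SEC, not just the dirty and muzzy page caches. A manual
purge therefore becomes a way to drop the cache on demand. A usage sketch
(error handling elided; "arena.<i>.purge" is the pre-existing mallctl that
requests an all=true decay):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

/*
 * Ask arena 0 to give back as much memory as possible; with this patch
 * that includes flushing its SEC back to the HPA shard.
 */
static void
purge_arena0(void) {
    char cmd[64];
    snprintf(cmd, sizeof(cmd), "arena.%u.purge", 0u);
    mallctl(cmd, NULL, NULL, NULL, 0);
}

Thread death takes the same path: the tcache_destroy() hunk near the end of
this patch calls arena_decay() with all=true once the last thread detaches
from an arena and no background thread is running.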
diff --git a/src/ctl.c b/src/ctl.c
index b4e65172..874aaac2 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -95,6 +95,9 @@ CTL_PROTO(opt_hpa_slab_goal)
 CTL_PROTO(opt_hpa_slab_max_alloc)
 CTL_PROTO(opt_hpa_small_max)
 CTL_PROTO(opt_hpa_large_min)
+CTL_PROTO(opt_hpa_sec_max_alloc)
+CTL_PROTO(opt_hpa_sec_max_bytes)
+CTL_PROTO(opt_hpa_sec_nshards)
 CTL_PROTO(opt_metadata_thp)
 CTL_PROTO(opt_retain)
 CTL_PROTO(opt_dss)
@@ -246,6 +249,7 @@ CTL_PROTO(stats_arenas_i_metadata_thp)
 CTL_PROTO(stats_arenas_i_tcache_bytes)
 CTL_PROTO(stats_arenas_i_resident)
 CTL_PROTO(stats_arenas_i_abandoned_vm)
+CTL_PROTO(stats_arenas_i_hpa_sec_bytes)
 INDEX_PROTO(stats_arenas_i)
 CTL_PROTO(stats_allocated)
 CTL_PROTO(stats_active)
@@ -360,6 +364,9 @@ static const ctl_named_node_t opt_node[] = {
     {NAME("hpa_slab_max_alloc"), CTL(opt_hpa_slab_max_alloc)},
     {NAME("hpa_small_max"), CTL(opt_hpa_small_max)},
     {NAME("hpa_large_min"), CTL(opt_hpa_large_min)},
+    {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)},
+    {NAME("hpa_sec_max_bytes"), CTL(opt_hpa_sec_max_bytes)},
+    {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)},
     {NAME("metadata_thp"), CTL(opt_metadata_thp)},
     {NAME("retain"), CTL(opt_retain)},
     {NAME("dss"), CTL(opt_dss)},
@@ -650,6 +657,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
     {NAME("tcache_bytes"), CTL(stats_arenas_i_tcache_bytes)},
     {NAME("resident"), CTL(stats_arenas_i_resident)},
     {NAME("abandoned_vm"), CTL(stats_arenas_i_abandoned_vm)},
+    {NAME("hpa_sec_bytes"), CTL(stats_arenas_i_hpa_sec_bytes)},
     {NAME("small"), CHILD(named, stats_arenas_i_small)},
     {NAME("large"), CHILD(named, stats_arenas_i_large)},
     {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)},
@@ -889,6 +897,8 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) {
             sizeof(pac_estats_t));
         memset(&ctl_arena->astats->hpastats, 0,
             sizeof(hpa_shard_stats_t));
+        memset(&ctl_arena->astats->secstats, 0,
+            sizeof(sec_stats_t));
     }
 }
 
@@ -903,7 +913,7 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) {
             &ctl_arena->pdirty, &ctl_arena->pmuzzy,
             &ctl_arena->astats->astats, ctl_arena->astats->bstats,
             ctl_arena->astats->lstats, ctl_arena->astats->estats,
-            &ctl_arena->astats->hpastats);
+            &ctl_arena->astats->hpastats, &ctl_arena->astats->secstats);
 
         for (i = 0; i < SC_NBINS; i++) {
             bin_stats_t *bstats =
@@ -1089,6 +1099,7 @@ MUTEX_PROF_ARENA_MUTEXES
                 &astats->hpastats.psset_slab_stats[i]);
         }
 
+        sec_stats_accum(&sdstats->secstats, &astats->secstats);
     }
 }
 
@@ -1895,6 +1906,9 @@ CTL_RO_NL_GEN(opt_hpa_slab_goal, opt_hpa_slab_goal, size_t)
 CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_slab_max_alloc, size_t)
 CTL_RO_NL_GEN(opt_hpa_small_max, opt_hpa_small_max, size_t)
 CTL_RO_NL_GEN(opt_hpa_large_min, opt_hpa_large_min, size_t)
+CTL_RO_NL_GEN(opt_hpa_sec_max_alloc, opt_hpa_sec_max_alloc, size_t)
+CTL_RO_NL_GEN(opt_hpa_sec_max_bytes, opt_hpa_sec_max_bytes, size_t)
+CTL_RO_NL_GEN(opt_hpa_sec_nshards, opt_hpa_sec_nshards, size_t)
 CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp],
     const char *)
 CTL_RO_NL_GEN(opt_retain, opt_retain, bool)
@@ -3114,6 +3128,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_abandoned_vm,
     &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.abandoned_vm,
     ATOMIC_RELAXED), size_t)
 
+CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_sec_bytes,
+    arenas_i(mib[2])->astats->secstats.bytes, size_t)
+
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated,
     arenas_i(mib[2])->astats->allocated_small, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc,
diff --git a/src/hpa.c b/src/hpa.c
index 08992bda..f49aa2b0 100644
--- a/src/hpa.c
+++ b/src/hpa.c
@@ -411,12 +411,12 @@ hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
 }
 
 void
-hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard) {
+hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
     malloc_mutex_prefork(tsdn, &shard->grow_mtx);
 }
 
 void
-hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
+hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
     malloc_mutex_prefork(tsdn, &shard->mtx);
 }
 
@@ -433,7 +433,7 @@ hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
 }
 
 void
-hpa_prefork3(tsdn_t *tsdn, hpa_t *hpa) {
+hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa) {
     malloc_mutex_prefork(tsdn, &hpa->grow_mtx);
     malloc_mutex_prefork(tsdn, &hpa->mtx);
 }
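
These renames (together with the matching ones in src/arena.c above and
src/pa_extra.c below) are one mechanical shift: the SEC's mutexes take
prefork stage 2, pushing everything that must be acquired after them down a
stage. As a review aid (a summary, not text from the source), the resulting
ordering is:

/*
 * Stage 2: the SEC shard mutexes (sec_prefork2, new).
 * Stage 3: pac.grow_mtx and the HPA shard's grow_mtx.
 * Stage 4: the PAC ecaches, the HPA shard mutex, and the global hpa
 *          locks (hpa_prefork4).
 * Stage 5: the edata_cache (pa_shard_prefork5, a new stage).
 * Stages 6/7/8: base, large_mtx, and the bins, each shifted down one;
 * _malloc_prefork now loops over 9 stages instead of 8.
 */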
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 8ce9ca1e..09b168ca 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -141,6 +141,11 @@ size_t opt_hpa_slab_max_alloc = 256 * 1024;
 size_t opt_hpa_small_max = 32 * 1024;
 size_t opt_hpa_large_min = 4 * 1024 * 1024;
 
+size_t opt_hpa_sec_max_alloc = 32 * 1024;
+/* These settings correspond to a maximum of 1MB cached per arena. */
+size_t opt_hpa_sec_max_bytes = 256 * 1024;
+size_t opt_hpa_sec_nshards = 4;
+
 /*
  * Arenas that are used to service external requests. Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
@@ -1494,11 +1499,18 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
                 true)
             CONF_HANDLE_SIZE_T(opt_hpa_slab_max_alloc,
                 "hpa_slab_max_alloc", PAGE, 512 * PAGE,
-                CONF_CHECK_MIN, CONF_CHECK_MAX, true)
+                CONF_CHECK_MIN, CONF_CHECK_MAX, true);
             CONF_HANDLE_SIZE_T(opt_hpa_small_max, "hpa_small_max",
-                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true)
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
             CONF_HANDLE_SIZE_T(opt_hpa_large_min, "hpa_large_min",
-                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true)
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+
+            CONF_HANDLE_SIZE_T(opt_hpa_sec_max_alloc, "hpa_sec_max_alloc",
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+            CONF_HANDLE_SIZE_T(opt_hpa_sec_max_bytes, "hpa_sec_max_bytes",
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+            CONF_HANDLE_SIZE_T(opt_hpa_sec_nshards, "hpa_sec_nshards",
+                0, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
 
             if (CONF_MATCH("slab_sizes")) {
                 if (CONF_MATCH_VALUE("default")) {
@@ -1808,7 +1820,8 @@ malloc_init_hard_a0_locked() {
         }
         if (pa_shard_enable_hpa(&a0->pa_shard, &arena_hpa_global,
             opt_hpa_slab_goal, opt_hpa_slab_max_alloc,
-            opt_hpa_small_max, opt_hpa_large_min)) {
+            opt_hpa_small_max, opt_hpa_large_min, opt_hpa_sec_nshards,
+            opt_hpa_sec_max_alloc, opt_hpa_sec_max_bytes)) {
             return true;
         }
     }
@@ -4226,7 +4239,7 @@ _malloc_prefork(void)
         background_thread_prefork1(tsd_tsdn(tsd));
     }
     /* Break arena prefork into stages to preserve lock order. */
-    for (i = 0; i < 8; i++) {
+    for (i = 0; i < 9; i++) {
         for (j = 0; j < narenas; j++) {
             if ((arena = arena_get(tsd_tsdn(tsd), j, false)) !=
                 NULL) {
@@ -4255,12 +4268,15 @@ _malloc_prefork(void)
                 case 7:
                     arena_prefork7(tsd_tsdn(tsd), arena);
                     break;
+                case 8:
+                    arena_prefork8(tsd_tsdn(tsd), arena);
+                    break;
                 default:
                     not_reached();
                 }
             }
         }
-        if (i == 3 && opt_hpa) {
-            hpa_prefork3(tsd_tsdn(tsd), &arena_hpa_global);
+        if (i == 4 && opt_hpa) {
+            hpa_prefork4(tsd_tsdn(tsd), &arena_hpa_global);
         }
     }
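
With the conf plumbing above, the new knobs are set like any other opt.*
option, via the MALLOC_CONF environment variable or the compiled-in config
string. For example (illustrative values that happen to match the new
defaults; "hpa" is the pre-existing boolean that enables the subsystem in
the first place):

/*
 * Link-time configuration: 4 SEC shards, each caching at most 256k of
 * extents, with individual allocations up to 32k eligible for caching.
 */
const char *malloc_conf = "hpa:true,hpa_sec_nshards:4,"
    "hpa_sec_max_alloc:32768,hpa_sec_max_bytes:262144";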
diff --git a/src/pa.c b/src/pa.c
index 8e1ec842..825b10ab 100644
--- a/src/pa.c
+++ b/src/pa.c
@@ -49,7 +49,8 @@ pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base,
 
 bool
 pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
-    size_t ps_alloc_max, size_t small_max, size_t large_min) {
+    size_t ps_alloc_max, size_t small_max, size_t large_min,
+    size_t sec_nshards, size_t sec_alloc_max, size_t sec_bytes_max) {
     ps_goal &= ~PAGE_MASK;
     ps_alloc_max &= ~PAGE_MASK;
 
@@ -60,6 +61,10 @@ pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
         shard->ind, ps_goal, ps_alloc_max, small_max, large_min)) {
         return true;
     }
+    if (sec_init(&shard->hpa_sec, &shard->hpa_shard.pai, sec_nshards,
+        sec_alloc_max, sec_bytes_max)) {
+        return true;
+    }
     shard->ever_used_hpa = true;
     atomic_store_b(&shard->use_hpa, true, ATOMIC_RELAXED);
 
@@ -67,24 +72,27 @@ pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
 }
 
 void
-pa_shard_disable_hpa(pa_shard_t *shard) {
+pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard) {
     atomic_store_b(&shard->use_hpa, false, ATOMIC_RELAXED);
+    sec_disable(tsdn, &shard->hpa_sec);
 }
 
 void
-pa_shard_reset(pa_shard_t *shard) {
+pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) {
     atomic_store_zu(&shard->nactive, 0, ATOMIC_RELAXED);
+    sec_flush(tsdn, &shard->hpa_sec);
 }
 
 void
 pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard) {
+    sec_flush(tsdn, &shard->hpa_sec);
     pac_destroy(tsdn, &shard->pac);
 }
 
 static pai_t *
 pa_get_pai(pa_shard_t *shard, edata_t *edata) {
     return (edata_pai_get(edata) == EXTENT_PAI_PAC
-        ? &shard->pac.pai : &shard->hpa_shard.pai);
+        ? &shard->pac.pai : &shard->hpa_sec.pai);
 }
 
 edata_t *
@@ -95,7 +103,7 @@ pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, size_t alignment,
 
     edata_t *edata = NULL;
     if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
-        edata = pai_alloc(tsdn, &shard->hpa_shard.pai, size, alignment,
+        edata = pai_alloc(tsdn, &shard->hpa_sec.pai, size, alignment,
             zero);
     }
     /*
@@ -173,6 +181,7 @@ pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata,
         emap_deregister_interior(tsdn, shard->emap, edata);
         edata_slab_set(edata, false);
     }
+    edata_addr_set(edata, edata_base_get(edata));
     edata_szind_set(edata, SC_NSIZES);
     pa_nactive_sub(shard, edata_size_get(edata) >> LG_PAGE);
     pai_t *pai = pa_get_pai(shard, edata);
diff --git a/src/pa_extra.c b/src/pa_extra.c
index db236ad8..24cb6537 100644
--- a/src/pa_extra.c
+++ b/src/pa_extra.c
@@ -16,17 +16,14 @@ pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard) {
 
 void
 pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard) {
-    malloc_mutex_prefork(tsdn, &shard->pac.grow_mtx);
     if (shard->ever_used_hpa) {
-        hpa_shard_prefork2(tsdn, &shard->hpa_shard);
+        sec_prefork2(tsdn, &shard->hpa_sec);
     }
 }
 
 void
 pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard) {
-    ecache_prefork(tsdn, &shard->pac.ecache_dirty);
-    ecache_prefork(tsdn, &shard->pac.ecache_muzzy);
-    ecache_prefork(tsdn, &shard->pac.ecache_retained);
+    malloc_mutex_prefork(tsdn, &shard->pac.grow_mtx);
     if (shard->ever_used_hpa) {
         hpa_shard_prefork3(tsdn, &shard->hpa_shard);
     }
@@ -34,6 +31,16 @@ pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard) {
 
 void
 pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard) {
+    ecache_prefork(tsdn, &shard->pac.ecache_dirty);
+    ecache_prefork(tsdn, &shard->pac.ecache_muzzy);
+    ecache_prefork(tsdn, &shard->pac.ecache_retained);
+    if (shard->ever_used_hpa) {
+        hpa_shard_prefork4(tsdn, &shard->hpa_shard);
+    }
+}
+
+void
+pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard) {
     edata_cache_prefork(tsdn, &shard->edata_cache);
 }
 
@@ -47,6 +54,7 @@ pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) {
     malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx);
     malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx);
     if (shard->ever_used_hpa) {
+        sec_postfork_parent(tsdn, &shard->hpa_sec);
         hpa_shard_postfork_parent(tsdn, &shard->hpa_shard);
     }
 }
@@ -61,6 +69,7 @@ pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) {
     malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx);
    malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx);
     if (shard->ever_used_hpa) {
+        sec_postfork_child(tsdn, &shard->hpa_sec);
         hpa_shard_postfork_child(tsdn, &shard->hpa_shard);
     }
 }
@@ -76,7 +85,8 @@ pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive, size_t *ndirty,
 void
 pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
     pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
-    hpa_shard_stats_t *hpa_stats_out, size_t *resident) {
+    hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
+    size_t *resident) {
     cassert(config_stats);
 
     pa_shard_stats_out->pac_stats.retained +=
@@ -149,6 +159,7 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
                 &shard->hpa_shard.psset.slab_stats[i]);
         }
         malloc_mutex_unlock(tsdn, &shard->hpa_shard.mtx);
+        sec_stats_merge(tsdn, &shard->hpa_sec, sec_stats_out);
     }
 }
 
@@ -182,5 +193,7 @@ pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard,
         pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
             &shard->hpa_shard.grow_mtx,
             arena_prof_mutex_hpa_shard_grow);
+        sec_mutex_stats_read(tsdn, &shard->hpa_sec,
+            &mutex_prof_data[arena_prof_mutex_hpa_sec]);
     }
 }
diff --git a/src/stats.c b/src/stats.c
index f03e5e44..4b40721a 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -678,6 +678,11 @@ stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i) {
     CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ninactive", i,
         &ninactive, size_t);
 
+    size_t sec_bytes;
+    CTL_M2_GET("stats.arenas.0.hpa_sec_bytes", i, &sec_bytes, size_t);
+    emitter_kv(emitter, "sec_bytes", "Bytes in small extent cache",
+        emitter_type_size, &sec_bytes);
+
     emitter_table_printf(emitter,
         "HPA shard stats:\n"
         "  In full slabs:\n"
@@ -1194,6 +1199,9 @@ stats_general_print(emitter_t *emitter) {
     OPT_WRITE_SIZE_T("hpa_slab_max_alloc")
     OPT_WRITE_SIZE_T("hpa_small_max")
     OPT_WRITE_SIZE_T("hpa_large_min")
+    OPT_WRITE_SIZE_T("hpa_sec_max_alloc")
+    OPT_WRITE_SIZE_T("hpa_sec_max_bytes")
+    OPT_WRITE_SIZE_T("hpa_sec_nshards")
     OPT_WRITE_CHAR_P("metadata_thp")
     OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")
     OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms")
diff --git a/src/tcache.c b/src/tcache.c
index 6bf1d309..edbedf79 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -716,9 +716,11 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
     if (arena_nthreads_get(arena, false) == 0 &&
         !background_thread_enabled()) {
         /* Force purging when no threads assigned to the arena anymore. */
-        arena_decay(tsd_tsdn(tsd), arena, false, true);
+        arena_decay(tsd_tsdn(tsd), arena,
+            /* is_background_thread */ false, /* all */ true);
     } else {
-        arena_decay(tsd_tsdn(tsd), arena, false, false);
+        arena_decay(tsd_tsdn(tsd), arena,
+            /* is_background_thread */ false, /* all */ false);
     }
 }
 
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index ecbcda9e..278bd09d 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -168,6 +168,9 @@ TEST_BEGIN(test_mallctl_opt) {
     TEST_MALLCTL_OPT(size_t, hpa_slab_max_alloc, always);
     TEST_MALLCTL_OPT(size_t, hpa_small_max, always);
     TEST_MALLCTL_OPT(size_t, hpa_large_min, always);
+    TEST_MALLCTL_OPT(size_t, hpa_sec_max_alloc, always);
+    TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);
+    TEST_MALLCTL_OPT(size_t, hpa_sec_nshards, always);
     TEST_MALLCTL_OPT(unsigned, narenas, always);
     TEST_MALLCTL_OPT(const char *, percpu_arena, always);
     TEST_MALLCTL_OPT(size_t, oversize_threshold, always);
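
The new per-arena statistic is readable as stats.arenas.<i>.hpa_sec_bytes,
mirroring the ctl wiring above. A usage sketch (error handling and
build-time stats caveats elided):

#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Print how many bytes arena 0's SEC is currently caching. */
static void
print_arena0_sec_bytes(void) {
    /* Refresh the stats snapshot first. */
    uint64_t epoch = 1;
    size_t sz = sizeof(epoch);
    mallctl("epoch", &epoch, &sz, &epoch, sz);

    size_t sec_bytes;
    sz = sizeof(sec_bytes);
    if (mallctl("stats.arenas.0.hpa_sec_bytes", &sec_bytes, &sz,
        NULL, 0) == 0) {
        printf("hpa_sec_bytes: %zu\n", sec_bytes);
    }
}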