From 43af63fff496967bf2173c92737aea1cca4ca025 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Mon, 9 Nov 2020 13:49:30 -0800 Subject: [PATCH] HPA: Manage whole hugepages at a time. This redesigns the HPA implementation to allow us to manage hugepages all at once, locally, without relying on a global fallback. --- include/jemalloc/internal/arena_externs.h | 1 - include/jemalloc/internal/edata.h | 18 +- include/jemalloc/internal/hpa.h | 93 ++-- include/jemalloc/internal/mutex_prof.h | 4 +- include/jemalloc/internal/pa.h | 5 +- include/jemalloc/internal/psset.h | 11 +- src/arena.c | 6 +- src/ctl.c | 115 ++-- src/hpa.c | 613 +++++++++++++--------- src/jemalloc.c | 37 +- src/pa.c | 15 +- src/pa_extra.c | 2 +- src/psset.c | 120 +++-- src/stats.c | 113 ++-- test/unit/hpa.c | 76 +-- test/unit/psset.c | 21 +- 16 files changed, 700 insertions(+), 550 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 40223b58..e3cfcee2 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -16,7 +16,6 @@ extern const char *percpu_arena_mode_names[]; extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS]; extern malloc_mutex_t arenas_lock; extern emap_t arena_emap_global; -extern hpa_t arena_hpa_global; extern size_t opt_oversize_threshold; extern size_t oversize_threshold; diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h index 5ec12beb..465c962f 100644 --- a/include/jemalloc/internal/edata.h +++ b/include/jemalloc/internal/edata.h @@ -208,9 +208,9 @@ struct edata_s { */ /* - * If this edata is from an HPA, it may be part of some larger - * pageslab. Track it if so. Otherwise (either because it's - * not part of a pageslab, or not from the HPA at all), NULL. + * If this edata is a user allocation from an HPA, it comes out + * of some pageslab (we don't yet support huegpage allocations + * that don't fit into pageslabs). This tracks it. */ edata_t *ps; /* @@ -225,6 +225,8 @@ struct edata_s { * between heaps. */ uint32_t longest_free_range; + /* Whether or not the slab is backed by a hugepage. */ + bool hugeified; }; }; @@ -328,6 +330,11 @@ edata_pai_get(const edata_t *edata) { EDATA_BITS_PAI_SHIFT); } +static inline bool +edata_hugeified_get(const edata_t *edata) { + return edata->hugeified; +} + static inline bool edata_slab_get(const edata_t *edata) { return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK) >> @@ -559,6 +566,11 @@ edata_pai_set(edata_t *edata, extent_pai_t pai) { ((uint64_t)pai << EDATA_BITS_PAI_SHIFT); } +static inline void +edata_hugeified_set(edata_t *edata, bool hugeified) { + edata->hugeified = hugeified; +} + static inline void edata_slab_set(edata_t *edata, bool slab) { edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK) | diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h index 12a7a17d..1c4585df 100644 --- a/include/jemalloc/internal/hpa.h +++ b/include/jemalloc/internal/hpa.h @@ -6,32 +6,6 @@ #include "jemalloc/internal/pai.h" #include "jemalloc/internal/psset.h" -typedef struct hpa_s hpa_t; -struct hpa_s { - /* - * We have two mutexes for the central allocator; mtx protects its - * state, while grow_mtx protects controls the ability to grow the - * backing store. This prevents race conditions in which the central - * allocator has exhausted its memory while mutiple threads are trying - * to allocate. 
If they all reserved more address space from the OS - * without synchronization, we'd end consuming much more than necessary. - */ - malloc_mutex_t grow_mtx; - malloc_mutex_t mtx; - hpa_central_t central; - /* The arena ind we're associated with. */ - unsigned ind; - /* - * This edata cache is the global one that we use for new allocations in - * growing; practically, it comes from a0. - * - * We don't use an edata_cache_small in front of this, since we expect a - * small finite number of allocations from it. - */ - edata_cache_t *edata_cache; - exp_grow_t exp_grow; -}; - /* Used only by CTL; not actually stored here (i.e., all derived). */ typedef struct hpa_shard_stats_s hpa_shard_stats_t; struct hpa_shard_stats_s { @@ -53,44 +27,53 @@ struct hpa_shard_s { * allocator, and so will use its edata_cache. */ edata_cache_small_t ecs; - hpa_t *hpa; + psset_t psset; /* - * When we're grabbing a new ps from the central allocator, how big - * would we like it to be? This is mostly about the level of batching - * we use in our requests to the centralized allocator. + * The largest size we'll allocate out of the shard. For those + * allocations refused, the caller (in practice, the PA module) will + * fall back to the more general (for now) PAC, which can always handle + * any allocation request. */ - size_t ps_goal; + size_t alloc_max; + /* - * What's the maximum size we'll try to allocate out of the psset? We - * don't want this to be too large relative to ps_goal, as a - * fragmentation avoidance measure. + * Slabs currently purged away. They are hugepage-sized and + * hugepage-aligned, but have had pages_nohuge and pages_purge_forced + * called on them. + * + * Guarded by grow_mtx. */ - size_t ps_alloc_max; + edata_list_inactive_t unused_slabs; + /* - * What's the maximum size we'll try to allocate out of the shard at - * all? + * Either NULL (if empty), or some integer multiple of a + * hugepage-aligned number of hugepages. We carve them off one at a + * time to satisfy new pageslab requests. + * + * Guarded by grow_mtx. */ - size_t small_max; - /* - * What's the minimum size for which we'll go straight to the global - * arena? - */ - size_t large_min; + edata_t *eden; /* The arena ind we're associated with. */ unsigned ind; + emap_t *emap; }; -bool hpa_init(hpa_t *hpa, base_t *base, emap_t *emap, - edata_cache_t *edata_cache); -bool hpa_shard_init(hpa_shard_t *shard, hpa_t *hpa, - edata_cache_t *edata_cache, unsigned ind, size_t ps_goal, - size_t ps_alloc_max, size_t small_max, size_t large_min); +/* + * Whether or not the HPA can be used given the current configuration. This is + * is not necessarily a guarantee that it backs its allocations by hugepages, + * just that it can function properly given the system it's running on. + */ +bool hpa_supported(); +bool hpa_shard_init(hpa_shard_t *shard, emap_t *emap, + edata_cache_t *edata_cache, unsigned ind, size_t alloc_max); + +void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src); +void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_shard_stats_t *dst); -void hpa_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src); -void hpa_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, hpa_shard_stats_t *dst); /* * Notify the shard that we won't use it for allocations much longer. 
Due to * the possibility of races, we don't actually prevent allocations; just flush @@ -108,14 +91,4 @@ void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard); void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard); void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard); -/* - * These should be acquired after all the shard locks in phase 4, but before any - * locks in phase 4. The central HPA may acquire an edata cache mutex (of a0), - * so it needs to be lower in the witness ordering, but it's also logically - * global and not tied to any particular arena. - */ -void hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa); -void hpa_postfork_parent(tsdn_t *tsdn, hpa_t *hpa); -void hpa_postfork_child(tsdn_t *tsdn, hpa_t *hpa); - #endif /* JEMALLOC_INTERNAL_HPA_H */ diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h index ef0bf0d3..3759daaf 100644 --- a/include/jemalloc/internal/mutex_prof.h +++ b/include/jemalloc/internal/mutex_prof.h @@ -11,9 +11,7 @@ OP(ctl) \ OP(prof) \ OP(prof_thds_data) \ - OP(prof_dump) \ - OP(hpa_central) \ - OP(hpa_central_grow) + OP(prof_dump) typedef enum { #define OP(mtx) global_prof_mutex_##mtx, diff --git a/include/jemalloc/internal/pa.h b/include/jemalloc/internal/pa.h index f1823e6b..b9030226 100644 --- a/include/jemalloc/internal/pa.h +++ b/include/jemalloc/internal/pa.h @@ -130,9 +130,8 @@ bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base, * This isn't exposed to users; we allow late enablement of the HPA shard so * that we can boot without worrying about the HPA, then turn it on in a0. */ -bool pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal, - size_t ps_alloc_max, size_t small_max, size_t large_min, size_t sec_nshards, - size_t sec_alloc_max, size_t sec_bytes_max); +bool pa_shard_enable_hpa(pa_shard_t *shard, size_t alloc_max, + size_t sec_nshards, size_t sec_alloc_max, size_t sec_bytes_max); /* * We stop using the HPA when custom extent hooks are installed, but still * redirect deallocations to it. diff --git a/include/jemalloc/internal/psset.h b/include/jemalloc/internal/psset.h index 4529827a..3c9f23bb 100644 --- a/include/jemalloc/internal/psset.h +++ b/include/jemalloc/internal/psset.h @@ -24,11 +24,14 @@ typedef struct psset_bin_stats_s psset_bin_stats_t; struct psset_bin_stats_s { /* How many pageslabs are in this bin? */ - size_t npageslabs; + size_t npageslabs_huge; + size_t npageslabs_nonhuge; /* Of them, how many pages are active? */ - size_t nactive; + size_t nactive_huge; + size_t nactive_nonhuge; /* How many are inactive? */ - size_t ninactive; + size_t ninactive_huge; + size_t ninactive_nonhuge; }; /* Used only by CTL; not actually stored here (i.e., all derived). */ @@ -62,6 +65,8 @@ void psset_stats_accum(psset_stats_t *dst, psset_stats_t *src); void psset_insert(psset_t *psset, edata_t *ps); void psset_remove(psset_t *psset, edata_t *ps); +void psset_hugify(psset_t *psset, edata_t *ps); + /* * Tries to obtain a chunk from an existing pageslab already in the set. * Returns true on failure. 
diff --git a/src/arena.c b/src/arena.c index 7099713a..209eb347 100644 --- a/src/arena.c +++ b/src/arena.c @@ -37,7 +37,6 @@ static atomic_zd_t dirty_decay_ms_default; static atomic_zd_t muzzy_decay_ms_default; emap_t arena_emap_global; -hpa_t arena_hpa_global; const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { #define STEP(step, h, x, y) \ @@ -1535,9 +1534,8 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { * so arena_hpa_global is not yet initialized. */ if (opt_hpa && ehooks_are_default(base_ehooks_get(base)) && ind != 0) { - if (pa_shard_enable_hpa(&arena->pa_shard, &arena_hpa_global, - opt_hpa_slab_goal, opt_hpa_slab_max_alloc, - opt_hpa_small_max, opt_hpa_large_min, opt_hpa_sec_nshards, + if (pa_shard_enable_hpa(&arena->pa_shard, + opt_hpa_slab_max_alloc, opt_hpa_sec_nshards, opt_hpa_sec_max_alloc, opt_hpa_sec_max_bytes)) { goto label_error; } diff --git a/src/ctl.c b/src/ctl.c index f0df73b7..88cee666 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -220,13 +220,19 @@ CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes) CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes) CTL_PROTO(stats_arenas_i_extents_j_retained_bytes) INDEX_PROTO(stats_arenas_i_extents_j) -CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs) -CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive) -CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ninactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ninactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive_nonhuge) INDEX_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j) -CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs) -CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive) -CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ninactive) CTL_PROTO(stats_arenas_i_nthreads) CTL_PROTO(stats_arenas_i_uptime) CTL_PROTO(stats_arenas_i_dss) @@ -606,21 +612,33 @@ MUTEX_PROF_ARENA_MUTEXES }; static const ctl_named_node_t stats_arenas_i_hpa_shard_full_slabs_node[] = { - {NAME("npageslabs"), - CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs)}, - {NAME("nactive"), - CTL(stats_arenas_i_hpa_shard_full_slabs_nactive)}, - {NAME("ninactive"), - CTL(stats_arenas_i_hpa_shard_full_slabs_ninactive)} + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_nactive_huge)}, + {NAME("ninactive_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_ninactive_huge)}, + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge)}, + {NAME("ninactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_ninactive_nonhuge)}, }; static const ctl_named_node_t stats_arenas_i_hpa_shard_nonfull_slabs_j_node[] = { - {NAME("npageslabs"), - 
CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs)}, - {NAME("nactive"), - CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive)}, - {NAME("ninactive"), - CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive)} + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge)}, + {NAME("ninactive_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive_huge)}, + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge)}, + {NAME("ninactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive_nonhuge)} }; static const ctl_named_node_t super_stats_arenas_i_hpa_shard_nonfull_slabs_j_node[] = { @@ -1104,7 +1122,7 @@ MUTEX_PROF_ARENA_MUTEXES } /* Merge HPA stats. */ - hpa_stats_accum(&sdstats->hpastats, &astats->hpastats); + hpa_shard_stats_accum(&sdstats->hpastats, &astats->hpastats); sec_stats_accum(&sdstats->secstats, &astats->secstats); } } @@ -1219,14 +1237,6 @@ ctl_refresh(tsdn_t *tsdn) { READ_GLOBAL_MUTEX_PROF_DATA( global_prof_mutex_prof_dump, prof_dump_mtx); } - if (opt_hpa) { - READ_GLOBAL_MUTEX_PROF_DATA( - global_prof_mutex_hpa_central, - arena_hpa_global.mtx); - READ_GLOBAL_MUTEX_PROF_DATA( - global_prof_mutex_hpa_central_grow, - arena_hpa_global.grow_mtx); - } if (have_background_thread) { READ_GLOBAL_MUTEX_PROF_DATA( global_prof_mutex_background_thread, @@ -3259,11 +3269,6 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, MUTEX_PROF_RESET(tdatas_mtx); MUTEX_PROF_RESET(prof_dump_mtx); } - if (opt_hpa) { - MUTEX_PROF_RESET(arena_hpa_global.mtx); - MUTEX_PROF_RESET(arena_hpa_global.grow_mtx); - } - /* Per arena mutexes. 
*/ unsigned n = narenas_total_get(); @@ -3367,22 +3372,44 @@ stats_arenas_i_extents_j_index(tsdn_t *tsdn, const size_t *mib, return super_stats_arenas_i_extents_j_node; } -CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs, - arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.npageslabs, +/* Full, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.npageslabs_huge, size_t); -CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive, - arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.nactive, size_t); -CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ninactive, - arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.ninactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.nactive_huge, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ninactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.ninactive_huge, size_t); -CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs, - arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].npageslabs, +/* Full, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.npageslabs_nonhuge, size_t); -CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive, - arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].nactive, +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.nactive_nonhuge, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ninactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs.ninactive_nonhuge, size_t); + +/* Nonfull, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].npageslabs_huge, size_t); -CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive, - arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].ninactive, +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].nactive_huge, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].ninactive_huge, + size_t); + +/* Nonfull, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].npageslabs_nonhuge, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].nactive_nonhuge, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ninactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]].ninactive_nonhuge, size_t); static const ctl_named_node_t * diff --git a/src/hpa.c b/src/hpa.c index e7548adb..ca75628c 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -6,6 +6,8 @@ #include "jemalloc/internal/flat_bitmap.h" #include "jemalloc/internal/witness.h" +#define HPA_EDEN_SIZE (128 * 
HUGEPAGE) + static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero); static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, @@ -15,43 +17,40 @@ static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata); bool -hpa_init(hpa_t *hpa, base_t *base, emap_t *emap, edata_cache_t *edata_cache) { - bool err; - +hpa_supported() { +#ifdef _WIN32 + /* + * At least until the API and implementation is somewhat settled, we + * don't want to try to debug the VM subsystem on the hardest-to-test + * platform. + */ + return false; +#endif + if (!pages_can_hugify) { + return false; + } /* * We fundamentally rely on a address-space-hungry growth strategy for - * hugepages. This may change in the future, but for now we should have - * refused to turn on any HPA at a higher level of the stack. + * hugepages. */ - assert(LG_SIZEOF_PTR == 3); - - err = malloc_mutex_init(&hpa->grow_mtx, "hpa_grow", WITNESS_RANK_HPA_GROW, - malloc_mutex_rank_exclusive); - if (err) { - return true; + if (LG_SIZEOF_PTR == 2) { + return false; } - err = malloc_mutex_init(&hpa->mtx, "hpa", WITNESS_RANK_HPA, - malloc_mutex_rank_exclusive); - if (err) { - return true; + /* + * We use the edata bitmap; it needs to have at least as many bits as a + * hugepage has pages. + */ + if (HUGEPAGE / PAGE > BITMAP_GROUPS_MAX * sizeof(bitmap_t) * 8) { + return false; } - - hpa_central_init(&hpa->central, edata_cache, emap); - if (err) { - return true; - } - hpa->ind = base_ind_get(base); - hpa->edata_cache = edata_cache; - - exp_grow_init(&hpa->exp_grow); - - return false; + return true; } bool -hpa_shard_init(hpa_shard_t *shard, hpa_t *hpa, edata_cache_t *edata_cache, - unsigned ind, size_t ps_goal, size_t ps_alloc_max, size_t small_max, - size_t large_min) { +hpa_shard_init(hpa_shard_t *shard, emap_t *emap, edata_cache_t *edata_cache, + unsigned ind, size_t alloc_max) { + /* malloc_conf processing should have filtered out these cases. */ + assert(hpa_supported()); bool err; err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow", WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive); @@ -66,12 +65,12 @@ hpa_shard_init(hpa_shard_t *shard, hpa_t *hpa, edata_cache_t *edata_cache, assert(edata_cache != NULL); edata_cache_small_init(&shard->ecs, edata_cache); - shard->hpa = hpa; psset_init(&shard->psset); - shard->ps_goal = ps_goal; - shard->ps_alloc_max = ps_alloc_max; - shard->small_max = small_max; - shard->large_min = large_min; + shard->alloc_max = alloc_max; + edata_list_inactive_init(&shard->unused_slabs); + shard->eden = NULL; + shard->ind = ind; + shard->emap = emap; /* * Fill these in last, so that if an hpa_shard gets used despite @@ -83,9 +82,6 @@ hpa_shard_init(hpa_shard_t *shard, hpa_t *hpa, edata_cache_t *edata_cache, shard->pai.shrink = &hpa_shrink; shard->pai.dalloc = &hpa_dalloc; - shard->ind = ind; - assert(ind == base_ind_get(edata_cache->base)); - return false; } @@ -96,176 +92,333 @@ hpa_shard_init(hpa_shard_t *shard, hpa_t *hpa, edata_cache_t *edata_cache, * locking here. 
*/ void -hpa_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) { +hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) { psset_stats_accum(&dst->psset_stats, &src->psset_stats); } void -hpa_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, hpa_shard_stats_t *dst) { +hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_shard_stats_t *dst) { malloc_mutex_lock(tsdn, &shard->mtx); psset_stats_accum(&dst->psset_stats, &shard->psset.stats); malloc_mutex_unlock(tsdn, &shard->mtx); } -static edata_t * -hpa_alloc_central(tsdn_t *tsdn, hpa_shard_t *shard, size_t size_min, - size_t size_goal) { - bool err; - edata_t *edata; - - hpa_t *hpa = shard->hpa; - - malloc_mutex_lock(tsdn, &hpa->mtx); - edata = hpa_central_alloc_reuse(tsdn, &hpa->central, size_min, - size_goal); - malloc_mutex_unlock(tsdn, &hpa->mtx); - if (edata != NULL) { - edata_arena_ind_set(edata, shard->ind); - return edata; - } - /* No existing range can satisfy the request; try to grow. */ - malloc_mutex_lock(tsdn, &hpa->grow_mtx); - +static bool +hpa_should_hugify(hpa_shard_t *shard, edata_t *ps) { /* - * We could have raced with other grow attempts; re-check to see if we - * did, and are now able to satisfy the request. + * For now, just use a static check; hugify a page if it's <= 5% + * inactive. Eventually, this should be a malloc conf option. */ - malloc_mutex_lock(tsdn, &hpa->mtx); - edata = hpa_central_alloc_reuse(tsdn, &hpa->central, size_min, - size_goal); - malloc_mutex_unlock(tsdn, &hpa->mtx); - if (edata != NULL) { - malloc_mutex_unlock(tsdn, &hpa->grow_mtx); - edata_arena_ind_set(edata, shard->ind); - return edata; - } + return !edata_hugeified_get(ps) + && edata_nfree_get(ps) < (HUGEPAGE / PAGE) * 5 / 100; +} +/* Returns true on error. */ +static void +hpa_hugify(edata_t *ps) { + assert(edata_size_get(ps) == HUGEPAGE); + assert(edata_hugeified_get(ps)); + bool err = pages_huge(edata_base_get(ps), HUGEPAGE); /* - * No such luck. We've dropped mtx, so other allocations can proceed - * while we allocate the new extent. We know no one else will grow in - * the meantime, though, since we still hold grow_mtx. - */ - size_t alloc_size; - pszind_t skip; - - size_t hugepage_goal_min = HUGEPAGE_CEILING(size_goal); - - err = exp_grow_size_prepare(&hpa->exp_grow, hugepage_goal_min, - &alloc_size, &skip); - if (err) { - malloc_mutex_unlock(tsdn, &hpa->grow_mtx); - return NULL; - } - alloc_size = HUGEPAGE_CEILING(alloc_size); - - /* - * Eventually, we need to think about this more systematically, and in - * terms of extent hooks. For now, though, we know we only care about - * overcommitting systems, and we're not going to purge much. - */ - bool commit = true; - void *addr = pages_map(NULL, alloc_size, HUGEPAGE, &commit); - if (addr == NULL) { - malloc_mutex_unlock(tsdn, &hpa->grow_mtx); - return NULL; - } - err = pages_huge(addr, alloc_size); - /* - * Ignore this for now; even if the allocation fails, the address space - * should still be usable. + * Eat the error; even if the hugeification failed, it's still safe to + * pretend it didn't (and would require extraordinary measures to + * unhugify). */ (void)err; +} - edata = edata_cache_get(tsdn, hpa->edata_cache); - if (edata == NULL) { - malloc_mutex_unlock(tsdn, &hpa->grow_mtx); - pages_unmap(addr, alloc_size); - return NULL; +static void +hpa_dehugify(edata_t *ps) { + /* Purge, then dehugify while unbacked. 
*/ + pages_purge_forced(edata_addr_get(ps), HUGEPAGE); + pages_nohuge(edata_addr_get(ps), HUGEPAGE); + edata_hugeified_set(ps, false); +} + +static edata_t * +hpa_grow(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->grow_mtx); + edata_t *ps = NULL; + + /* Is there address space waiting for reuse? */ + malloc_mutex_assert_owner(tsdn, &shard->grow_mtx); + ps = edata_list_inactive_first(&shard->unused_slabs); + if (ps != NULL) { + edata_list_inactive_remove(&shard->unused_slabs, ps); + return ps; + } + + /* Is eden a perfect fit? */ + if (shard->eden != NULL && edata_size_get(shard->eden) == HUGEPAGE) { + ps = shard->eden; + shard->eden = NULL; + return ps; } /* - * The serial number here is just a placeholder; the hpa_central gets to - * decide how it wants to fill it in. - * - * The grow edata is associated with the hpa_central_t arena ind; the - * subsequent allocation we get (in the hpa_central_alloc_grow call - * below) will be filled in with the shard ind. + * We're about to try to allocate from eden by splitting. If eden is + * NULL, we have to allocate it too. Otherwise, we just have to + * allocate an edata_t for the new psset. */ - edata_init(edata, hpa->ind, addr, alloc_size, /* slab */ false, - SC_NSIZES, /* sn */ 0, extent_state_active, /* zeroed */ true, - /* comitted */ true, EXTENT_PAI_HPA, /* is_head */ true); + if (shard->eden == NULL) { + /* + * During development, we're primarily concerned with systems + * with overcommit. Eventually, we should be more careful here. + */ + bool commit = true; + /* Allocate address space, bailing if we fail. */ + void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE, + &commit); + if (new_eden == NULL) { + return NULL; + } + malloc_mutex_lock(tsdn, &shard->mtx); + /* Allocate ps edata, bailing if we fail. */ + ps = edata_cache_small_get(tsdn, &shard->ecs); + if (ps == NULL) { + malloc_mutex_unlock(tsdn, &shard->mtx); + pages_unmap(new_eden, HPA_EDEN_SIZE); + return NULL; + } + /* Allocate eden edata, bailing if we fail. */ + shard->eden = edata_cache_small_get(tsdn, &shard->ecs); + if (shard->eden == NULL) { + edata_cache_small_put(tsdn, &shard->ecs, ps); + malloc_mutex_unlock(tsdn, &shard->mtx); + pages_unmap(new_eden, HPA_EDEN_SIZE); + return NULL; + } + /* Success. */ + malloc_mutex_unlock(tsdn, &shard->mtx); - malloc_mutex_lock(tsdn, &hpa->mtx); - /* Note that this replace edata with the allocation to return. */ - err = hpa_central_alloc_grow(tsdn, &hpa->central, size_goal, edata); - malloc_mutex_unlock(tsdn, &hpa->mtx); - - if (!err) { - exp_grow_size_commit(&hpa->exp_grow, skip); + /* + * Note that the values here don't really make sense (e.g. eden + * is actually zeroed). But we don't use the slab metadata in + * determining subsequent allocation metadata (e.g. zero + * tracking should be done at the per-page level, not at the + * level of the hugepage). It's just a convenient data + * structure that contains much of the helpers we need (defined + * lists, a bitmap, an address field, etc.). Eventually, we'll + * have a "real" representation of a hugepage that's unconnected + * to the edata_ts it will serve allocations into. + */ + edata_init(shard->eden, shard->ind, new_eden, HPA_EDEN_SIZE, + /* slab */ false, SC_NSIZES, /* sn */ 0, extent_state_dirty, + /* zeroed */ false, /* comitted */ true, EXTENT_PAI_HPA, + /* is_head */ true); + edata_hugeified_set(shard->eden, false); + } else { + /* Eden is already nonempty; only need an edata for ps. 
*/ + malloc_mutex_lock(tsdn, &shard->mtx); + ps = edata_cache_small_get(tsdn, &shard->ecs); + malloc_mutex_unlock(tsdn, &shard->mtx); + if (ps == NULL) { + return NULL; + } } - malloc_mutex_unlock(tsdn, &hpa->grow_mtx); - edata_arena_ind_set(edata, shard->ind); + /* + * We should have dropped mtx since we're not touching ecs any more, but + * we should continue to hold the grow mutex, since we're about to touch + * eden. + */ + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + malloc_mutex_assert_owner(tsdn, &shard->grow_mtx); + assert(shard->eden != NULL); + assert(edata_size_get(shard->eden) > HUGEPAGE); + assert(edata_size_get(shard->eden) % HUGEPAGE == 0); + assert(edata_addr_get(shard->eden) + == HUGEPAGE_ADDR2BASE(edata_addr_get(shard->eden))); + malloc_mutex_lock(tsdn, &shard->mtx); + ps = edata_cache_small_get(tsdn, &shard->ecs); + malloc_mutex_unlock(tsdn, &shard->mtx); + if (ps == NULL) { + return NULL; + } + edata_init(ps, edata_arena_ind_get(shard->eden), + edata_addr_get(shard->eden), HUGEPAGE, /* slab */ false, + /* szind */ SC_NSIZES, /* sn */ 0, extent_state_dirty, + /* zeroed */ false, /* comitted */ true, EXTENT_PAI_HPA, + /* is_head */ true); + edata_hugeified_set(ps, false); + edata_addr_set(shard->eden, edata_past_get(ps)); + edata_size_set(shard->eden, + edata_size_get(shard->eden) - HUGEPAGE); + + return ps; +} + +/* + * The psset does not hold empty slabs. Upon becoming empty, then, we need to + * put them somewhere. We take this as an opportunity to purge, and retain + * their address space in a list outside the psset. + */ +static void +hpa_handle_ps_eviction(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *ps) { + /* + * We do relatively expensive system calls. The ps was evicted, so no + * one should touch it while we're also touching it. + */ + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + malloc_mutex_assert_not_owner(tsdn, &shard->grow_mtx); + + assert(edata_size_get(ps) == HUGEPAGE); + assert(HUGEPAGE_ADDR2BASE(edata_addr_get(ps)) == edata_addr_get(ps)); + + /* + * We do this unconditionally, even for pages which were not originally + * hugeified; it has the same effect. + */ + hpa_dehugify(ps); + + malloc_mutex_lock(tsdn, &shard->grow_mtx); + edata_list_inactive_prepend(&shard->unused_slabs, ps); + malloc_mutex_unlock(tsdn, &shard->grow_mtx); +} + +static edata_t * +hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom) { + bool err; + malloc_mutex_lock(tsdn, &shard->mtx); + edata_t *edata = edata_cache_small_get(tsdn, &shard->ecs); + *oom = false; + if (edata == NULL) { + malloc_mutex_unlock(tsdn, &shard->mtx); + *oom = true; + return NULL; + } + assert(edata_arena_ind_get(edata) == shard->ind); + + err = psset_alloc_reuse(&shard->psset, edata, size); if (err) { - pages_unmap(addr, alloc_size); - edata_cache_put(tsdn, hpa->edata_cache, edata); + edata_cache_small_put(tsdn, &shard->ecs, edata); + malloc_mutex_unlock(tsdn, &shard->mtx); + return NULL; + } + /* + * This could theoretically be moved outside of the critical section, + * but that introduces the potential for a race. Without the lock, the + * (initially nonempty, since this is the reuse pathway) pageslab we + * allocated out of could become otherwise empty while the lock is + * dropped. This would force us to deal with a pageslab eviction down + * the error pathway, which is a pain. 
+ */ + err = emap_register_boundary(tsdn, shard->emap, edata, + SC_NSIZES, /* slab */ false); + if (err) { + edata_t *ps = psset_dalloc(&shard->psset, edata); + /* + * The pageslab was nonempty before we started; it + * should still be nonempty now, and so shouldn't get + * evicted. + */ + assert(ps == NULL); + edata_cache_small_put(tsdn, &shard->ecs, edata); + malloc_mutex_unlock(tsdn, &shard->mtx); + *oom = true; return NULL; } + edata_t *ps = edata_ps_get(edata); + assert(ps != NULL); + bool hugify = hpa_should_hugify(shard, ps); + if (hugify) { + /* + * Do the metadata modification while holding the lock; we'll + * actually change state with the lock dropped. + */ + psset_hugify(&shard->psset, ps); + } + malloc_mutex_unlock(tsdn, &shard->mtx); + if (hugify) { + /* + * Hugifying with the lock dropped is safe, even with + * concurrent modifications to the ps. This relies on + * the fact that the current implementation will never + * dehugify a non-empty pageslab, and ps will never + * become empty before we return edata to the user to be + * freed. + * + * Note that holding the lock would prevent not just operations + * on this page slab, but also operations any other alloc/dalloc + * operations in this hpa shard. + */ + hpa_hugify(ps); + } return edata; } static edata_t * hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) { - assert(size <= shard->ps_alloc_max); - + assert(size <= shard->alloc_max); bool err; - malloc_mutex_lock(tsdn, &shard->mtx); - edata_t *edata = edata_cache_small_get(tsdn, &shard->ecs); - if (edata == NULL) { - malloc_mutex_unlock(tsdn, &shard->mtx); - return NULL; - } - edata_arena_ind_set(edata, shard->ind); + bool oom; + edata_t *edata; - err = psset_alloc_reuse(&shard->psset, edata, size); - malloc_mutex_unlock(tsdn, &shard->mtx); - if (!err) { + edata = hpa_try_alloc_no_grow(tsdn, shard, size, &oom); + if (edata != NULL) { return edata; } + /* Nothing in the psset works; we have to grow it. */ malloc_mutex_lock(tsdn, &shard->grow_mtx); - - /* As above; check for grow races. */ - malloc_mutex_lock(tsdn, &shard->mtx); - err = psset_alloc_reuse(&shard->psset, edata, size); - malloc_mutex_unlock(tsdn, &shard->mtx); - if (!err) { + /* + * Check for grow races; maybe some earlier thread expanded the psset + * in between when we dropped the main mutex and grabbed the grow mutex. + */ + edata = hpa_try_alloc_no_grow(tsdn, shard, size, &oom); + if (edata != NULL || oom) { malloc_mutex_unlock(tsdn, &shard->grow_mtx); return edata; } - edata_t *grow_edata = hpa_alloc_central(tsdn, shard, size, - shard->ps_goal); + /* + * Note that we don't hold shard->mtx here (while growing); + * deallocations (and allocations of smaller sizes) may still succeed + * while we're doing this potentially expensive system call. + */ + edata_t *grow_edata = hpa_grow(tsdn, shard); if (grow_edata == NULL) { malloc_mutex_unlock(tsdn, &shard->grow_mtx); - - malloc_mutex_lock(tsdn, &shard->mtx); - edata_cache_small_put(tsdn, &shard->ecs, edata); - malloc_mutex_unlock(tsdn, &shard->mtx); - return NULL; } - edata_arena_ind_set(grow_edata, shard->ind); + assert(edata_arena_ind_get(grow_edata) == shard->ind); + edata_slab_set(grow_edata, true); fb_group_t *fb = edata_slab_data_get(grow_edata)->bitmap; - fb_init(fb, shard->ps_goal / PAGE); + fb_init(fb, HUGEPAGE / PAGE); /* We got the new edata; allocate from it. 
*/ malloc_mutex_lock(tsdn, &shard->mtx); + edata = edata_cache_small_get(tsdn, &shard->ecs); + if (edata == NULL) { + malloc_mutex_unlock(tsdn, &shard->mtx); + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + return NULL; + } psset_alloc_new(&shard->psset, grow_edata, edata, size); + err = emap_register_boundary(tsdn, shard->emap, edata, + SC_NSIZES, /* slab */ false); + if (err) { + edata_t *ps = psset_dalloc(&shard->psset, edata); + /* + * The pageslab was empty except for the new allocation; it + * should get evicted. + */ + assert(ps == grow_edata); + edata_cache_small_put(tsdn, &shard->ecs, edata); + /* + * Technically the same as fallthrough at the time of this + * writing, but consistent with the error handling in the rest + * of the function. + */ + malloc_mutex_unlock(tsdn, &shard->mtx); + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + hpa_handle_ps_eviction(tsdn, shard, ps); + return NULL; + } malloc_mutex_unlock(tsdn, &shard->mtx); - malloc_mutex_unlock(tsdn, &shard->grow_mtx); return edata; } @@ -283,33 +436,25 @@ static edata_t * hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) { assert((size & PAGE_MASK) == 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + hpa_shard_t *shard = hpa_from_pai(self); /* We don't handle alignment or zeroing for now. */ if (alignment > PAGE || zero) { return NULL; } - if (size > shard->small_max && size < shard->large_min) { + if (size > shard->alloc_max) { return NULL; } - witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), - WITNESS_RANK_CORE, 0); - - edata_t *edata; - if (size <= shard->ps_alloc_max) { - edata = hpa_alloc_psset(tsdn, shard, size); - if (edata != NULL) { - emap_register_boundary(tsdn, shard->hpa->central.emap, - edata, SC_NSIZES, /* slab */ false); - } - } else { - edata = hpa_alloc_central(tsdn, shard, size, size); - } + edata_t *edata = hpa_alloc_psset(tsdn, shard, size); witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); + if (edata != NULL) { - emap_assert_mapped(tsdn, shard->hpa->central.emap, edata); + emap_assert_mapped(tsdn, shard->emap, edata); assert(edata_pai_get(edata) == EXTENT_PAI_HPA); assert(edata_state_get(edata) == extent_state_active); assert(edata_arena_ind_get(edata) == shard->ind); @@ -336,16 +481,6 @@ hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, return true; } -static void -hpa_dalloc_central(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { - hpa_t *hpa = shard->hpa; - - edata_arena_ind_set(edata, hpa->ind); - malloc_mutex_lock(tsdn, &hpa->mtx); - hpa_central_dalloc(tsdn, &hpa->central, edata); - malloc_mutex_unlock(tsdn, &hpa->mtx); -} - static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) { hpa_shard_t *shard = hpa_from_pai(self); @@ -361,56 +496,29 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) { assert(edata_committed_get(edata)); assert(edata_base_get(edata) != NULL); + edata_t *ps = edata_ps_get(edata); + /* Currently, all edatas come from pageslabs. */ + assert(ps != NULL); + emap_deregister_boundary(tsdn, shard->emap, edata); + malloc_mutex_lock(tsdn, &shard->mtx); /* - * There are two cases: - * - The psset field is NULL. In this case, the edata comes directly - * from the hpa_central_t and should be returned to it. - * - THe psset field is not NULL, in which case we return the edata to - * the appropriate slab (which may in turn cause it to become empty, - * triggering an eviction of the whole slab, which should then be - * returned to the hpa_central_t). 
+ * Note that the shard mutex protects the edata hugeified field, too. + * Page slabs can move between pssets (and have their hugeified status + * change) in racy ways. */ - if (edata_ps_get(edata) != NULL) { - emap_deregister_boundary(tsdn, shard->hpa->central.emap, edata); - - malloc_mutex_lock(tsdn, &shard->mtx); - edata_t *evicted_ps = psset_dalloc(&shard->psset, edata); - edata_cache_small_put(tsdn, &shard->ecs, edata); - malloc_mutex_unlock(tsdn, &shard->mtx); - - - if (evicted_ps != NULL) { - /* - * The deallocation caused a pageslab to become empty. - * Free it back to the centralized allocator. - */ - bool err = emap_register_boundary(tsdn, - shard->hpa->central.emap, evicted_ps, SC_NSIZES, - /* slab */ false); - /* - * Registration can only fail on OOM, but the boundary - * mappings should have been initialized during - * allocation. - */ - assert(!err); - edata_slab_set(evicted_ps, false); - edata_ps_set(evicted_ps, NULL); - - assert(edata_arena_ind_get(evicted_ps) == shard->ind); - hpa_dalloc_central(tsdn, shard, evicted_ps); - } - } else { - hpa_dalloc_central(tsdn, shard, edata); + edata_t *evicted_ps = psset_dalloc(&shard->psset, edata); + /* + * If a pageslab became empty because of the dalloc, it better have been + * the one we expected. + */ + assert(evicted_ps == NULL || evicted_ps == ps); + edata_cache_small_put(tsdn, &shard->ecs, edata); + malloc_mutex_unlock(tsdn, &shard->mtx); + if (evicted_ps != NULL) { + hpa_handle_ps_eviction(tsdn, shard, evicted_ps); } } -static void -hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) { - assert(bin_stats->npageslabs == 0); - assert(bin_stats->nactive == 0); - assert(bin_stats->ninactive == 0); -} - void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_lock(tsdn, &shard->mtx); @@ -418,6 +526,29 @@ hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_unlock(tsdn, &shard->mtx); } +static void +hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) { + assert(bin_stats->npageslabs_huge == 0); + assert(bin_stats->nactive_huge == 0); + assert(bin_stats->ninactive_huge == 0); + assert(bin_stats->npageslabs_nonhuge == 0); + assert(bin_stats->nactive_nonhuge == 0); + assert(bin_stats->ninactive_nonhuge == 0); +} + +static void +hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) { + edata_t edata = {0}; + malloc_mutex_assert_owner(tsdn, &shard->mtx); + bool psset_empty = psset_alloc_reuse(psset, &edata, PAGE); + assert(psset_empty); + hpa_shard_assert_stats_empty(&psset->stats.full_slabs); + for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { + hpa_shard_assert_stats_empty( + &psset->stats.nonfull_slabs[i]); + } +} + void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) { /* @@ -427,17 +558,15 @@ hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) { * 1-page allocation. 
*/ if (config_debug) { - edata_t edata = {0}; malloc_mutex_lock(tsdn, &shard->mtx); - bool psset_empty = psset_alloc_reuse(&shard->psset, &edata, - PAGE); + hpa_assert_empty(tsdn, shard, &shard->psset); malloc_mutex_unlock(tsdn, &shard->mtx); - assert(psset_empty); - hpa_shard_assert_stats_empty(&shard->psset.stats.full_slabs); - for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { - hpa_shard_assert_stats_empty( - &shard->psset.stats.nonfull_slabs[i]); - } + } + edata_t *ps; + while ((ps = edata_list_inactive_first(&shard->unused_slabs)) != NULL) { + assert(edata_size_get(ps) == HUGEPAGE); + edata_list_inactive_remove(&shard->unused_slabs, ps); + pages_unmap(edata_base_get(ps), HUGEPAGE); } } @@ -462,21 +591,3 @@ hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_postfork_child(tsdn, &shard->grow_mtx); malloc_mutex_postfork_child(tsdn, &shard->mtx); } - -void -hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa) { - malloc_mutex_prefork(tsdn, &hpa->grow_mtx); - malloc_mutex_prefork(tsdn, &hpa->mtx); -} - -void -hpa_postfork_parent(tsdn_t *tsdn, hpa_t *hpa) { - malloc_mutex_postfork_parent(tsdn, &hpa->grow_mtx); - malloc_mutex_postfork_parent(tsdn, &hpa->mtx); -} - -void -hpa_postfork_child(tsdn_t *tsdn, hpa_t *hpa) { - malloc_mutex_postfork_child(tsdn, &hpa->grow_mtx); - malloc_mutex_postfork_child(tsdn, &hpa->mtx); -} diff --git a/src/jemalloc.c b/src/jemalloc.c index 74240c0a..277b9e72 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1802,31 +1802,19 @@ malloc_init_hard_a0_locked() { } a0 = arena_get(TSDN_NULL, 0, false); - if (opt_hpa && LG_SIZEOF_PTR == 2) { + if (opt_hpa && !hpa_supported()) { + malloc_printf(": HPA not supported in the current " + "configuration; %s.", + opt_abort_conf ? "aborting" : "disabling"); if (opt_abort_conf) { - malloc_printf(": Hugepages not currently " - "supported on 32-bit architectures; aborting."); + malloc_abort_invalid_conf(); } else { - malloc_printf(": Hugepages not currently " - "supported on 32-bit architectures; disabling."); opt_hpa = false; } } else if (opt_hpa) { - /* - * The global HPA uses the edata cache from a0, and so needs to - * be initialized specially, after a0 is. The arena init code - * handles this case specially, and does not turn on the HPA for - * a0 when opt_hpa is true. This lets us do global HPA - * initialization against a valid a0. 
- */ - if (hpa_init(&arena_hpa_global, b0get(), &arena_emap_global, - &a0->pa_shard.edata_cache)) { - return true; - } - if (pa_shard_enable_hpa(&a0->pa_shard, &arena_hpa_global, - opt_hpa_slab_goal, opt_hpa_slab_max_alloc, - opt_hpa_small_max, opt_hpa_large_min, opt_hpa_sec_nshards, - opt_hpa_sec_max_alloc, opt_hpa_sec_max_bytes)) { + if (pa_shard_enable_hpa(&a0->pa_shard, opt_hpa_slab_max_alloc, + opt_hpa_sec_nshards, opt_hpa_sec_max_alloc, + opt_hpa_sec_max_bytes)) { return true; } } @@ -4346,9 +4334,6 @@ _malloc_prefork(void) } } } - if (i == 4 && opt_hpa) { - hpa_prefork4(tsd_tsdn(tsd), &arena_hpa_global); - } } prof_prefork1(tsd_tsdn(tsd)); @@ -4388,9 +4373,6 @@ _malloc_postfork(void) arena_postfork_parent(tsd_tsdn(tsd), arena); } } - if (opt_hpa) { - hpa_postfork_parent(tsd_tsdn(tsd), &arena_hpa_global); - } prof_postfork_parent(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_postfork_parent(tsd_tsdn(tsd)); @@ -4421,9 +4403,6 @@ jemalloc_postfork_child(void) { arena_postfork_child(tsd_tsdn(tsd), arena); } } - if (opt_hpa) { - hpa_postfork_child(tsd_tsdn(tsd), &arena_hpa_global); - } prof_postfork_child(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_postfork_child(tsd_tsdn(tsd)); diff --git a/src/pa.c b/src/pa.c index e5fcbb7b..bc52ff43 100644 --- a/src/pa.c +++ b/src/pa.c @@ -49,17 +49,10 @@ pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base, } bool -pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal, - size_t ps_alloc_max, size_t small_max, size_t large_min, - size_t sec_nshards, size_t sec_alloc_max, size_t sec_bytes_max) { - ps_goal &= ~PAGE_MASK; - ps_alloc_max &= ~PAGE_MASK; - - if (ps_alloc_max > ps_goal) { - ps_alloc_max = ps_goal; - } - if (hpa_shard_init(&shard->hpa_shard, hpa, &shard->edata_cache, - shard->ind, ps_goal, ps_alloc_max, small_max, large_min)) { +pa_shard_enable_hpa(pa_shard_t *shard, size_t alloc_max, size_t sec_nshards, + size_t sec_alloc_max, size_t sec_bytes_max) { + if (hpa_shard_init(&shard->hpa_shard, shard->emap, &shard->edata_cache, + shard->ind, alloc_max)) { return true; } if (sec_init(&shard->hpa_sec, &shard->hpa_shard.pai, sec_nshards, diff --git a/src/pa_extra.c b/src/pa_extra.c index 2002418a..0f488be6 100644 --- a/src/pa_extra.c +++ b/src/pa_extra.c @@ -150,7 +150,7 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, } if (shard->ever_used_hpa) { - hpa_stats_merge(tsdn, &shard->hpa_shard, hpa_stats_out); + hpa_shard_stats_merge(tsdn, &shard->hpa_shard, hpa_stats_out); sec_stats_merge(tsdn, &shard->hpa_sec, sec_stats_out); } } diff --git a/src/psset.c b/src/psset.c index c24266ce..2ee683b6 100644 --- a/src/psset.c +++ b/src/psset.c @@ -20,9 +20,13 @@ psset_init(psset_t *psset) { static void psset_bin_stats_accum(psset_bin_stats_t *dst, psset_bin_stats_t *src) { - dst->npageslabs += src->npageslabs; - dst->nactive += src->nactive; - dst->ninactive += src->ninactive; + dst->npageslabs_huge += src->npageslabs_huge; + dst->nactive_huge += src->nactive_huge; + dst->ninactive_huge += src->ninactive_huge; + + dst->npageslabs_nonhuge += src->npageslabs_nonhuge; + dst->nactive_nonhuge += src->nactive_nonhuge; + dst->ninactive_nonhuge += src->ninactive_nonhuge; } void @@ -45,29 +49,62 @@ psset_stats_accum(psset_stats_t *dst, psset_stats_t *src) { * ensure we don't miss any heap modification operations. */ JEMALLOC_ALWAYS_INLINE void -psset_bin_stats_adjust(psset_bin_stats_t *binstats, edata_t *ps, bool inc) { - size_t mul = inc ? 
(size_t)1 : (size_t)-1; +psset_bin_stats_insert_remove(psset_bin_stats_t *binstats, edata_t *ps, + bool insert) { + size_t *npageslabs_dst = edata_hugeified_get(ps) + ? &binstats->npageslabs_huge : &binstats->npageslabs_nonhuge; + size_t *nactive_dst = edata_hugeified_get(ps) + ? &binstats->nactive_huge : &binstats->nactive_nonhuge; + size_t *ninactive_dst = edata_hugeified_get(ps) + ? &binstats->ninactive_huge : &binstats->ninactive_nonhuge; size_t npages = edata_size_get(ps) >> LG_PAGE; size_t ninactive = edata_nfree_get(ps); size_t nactive = npages - ninactive; - binstats->npageslabs += mul * 1; - binstats->nactive += mul * nactive; - binstats->ninactive += mul * ninactive; + + size_t mul = insert ? (size_t)1 : (size_t)-1; + *npageslabs_dst += mul * 1; + *nactive_dst += mul * nactive; + *ninactive_dst += mul * ninactive; +} + +static void +psset_bin_stats_insert(psset_bin_stats_t *binstats, edata_t *ps) { + psset_bin_stats_insert_remove(binstats, ps, /* insert */ true); +} + +static void +psset_bin_stats_remove(psset_bin_stats_t *binstats, edata_t *ps) { + psset_bin_stats_insert_remove(binstats, ps, /* insert */ false); +} + +/* + * We don't currently need an "activate" equivalent to this, since down the + * allocation pathways we don't do the optimization in which we change a slab + * without first removing it from a bin. + */ +static void +psset_bin_stats_deactivate(psset_bin_stats_t *binstats, bool huge, size_t num) { + size_t *nactive_dst = huge + ? &binstats->nactive_huge : &binstats->nactive_nonhuge; + size_t *ninactive_dst = huge + ? &binstats->ninactive_huge : &binstats->ninactive_nonhuge; + + assert(*nactive_dst >= num); + *nactive_dst -= num; + *ninactive_dst += num; } static void psset_edata_heap_remove(psset_t *psset, pszind_t pind, edata_t *ps) { edata_age_heap_remove(&psset->pageslabs[pind], ps); - psset_bin_stats_adjust(&psset->stats.nonfull_slabs[pind], ps, - /* inc */ false); + psset_bin_stats_remove(&psset->stats.nonfull_slabs[pind], ps); } static void psset_edata_heap_insert(psset_t *psset, pszind_t pind, edata_t *ps) { edata_age_heap_insert(&psset->pageslabs[pind], ps); - psset_bin_stats_adjust(&psset->stats.nonfull_slabs[pind], ps, - /* inc */ true); + psset_bin_stats_insert(&psset->stats.nonfull_slabs[pind], ps); } JEMALLOC_ALWAYS_INLINE void @@ -86,8 +123,7 @@ psset_insert(psset_t *psset, edata_t *ps) { * We don't ned to track full slabs; just pretend to for stats * purposes. See the comment at psset_bin_stats_adjust. 
*/ - psset_bin_stats_adjust(&psset->stats.full_slabs, ps, - /* inc */ true); + psset_bin_stats_insert(&psset->stats.full_slabs, ps); return; } @@ -107,8 +143,7 @@ psset_remove(psset_t *psset, edata_t *ps) { size_t longest_free_range = edata_longest_free_range_get(ps); if (longest_free_range == 0) { - psset_bin_stats_adjust(&psset->stats.full_slabs, ps, - /* inc */ true); + psset_bin_stats_remove(&psset->stats.full_slabs, ps); return; } @@ -121,6 +156,26 @@ psset_remove(psset_t *psset, edata_t *ps) { } } +void +psset_hugify(psset_t *psset, edata_t *ps) { + assert(!edata_hugeified_get(ps)); + psset_assert_ps_consistent(ps); + + size_t longest_free_range = edata_longest_free_range_get(ps); + psset_bin_stats_t *bin_stats; + if (longest_free_range == 0) { + bin_stats = &psset->stats.full_slabs; + } else { + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor( + longest_free_range << LG_PAGE)); + assert(pind < PSSET_NPSIZES); + bin_stats = &psset->stats.nonfull_slabs[pind]; + } + psset_bin_stats_remove(bin_stats, ps); + edata_hugeified_set(ps, true); + psset_bin_stats_insert(bin_stats, ps); +} + /* * Similar to PAC's extent_recycle_extract. Out of all the pageslabs in the * set, picks one that can satisfy the allocation and remove it from the set. @@ -225,8 +280,7 @@ psset_ps_alloc_insert(psset_t *psset, edata_t *ps, edata_t *r_edata, } edata_longest_free_range_set(ps, (uint32_t)largest_unchosen_range); if (largest_unchosen_range == 0) { - psset_bin_stats_adjust(&psset->stats.full_slabs, ps, - /* inc */ true); + psset_bin_stats_insert(&psset->stats.full_slabs, ps); } else { psset_insert(psset, ps); } @@ -258,8 +312,8 @@ edata_t * psset_dalloc(psset_t *psset, edata_t *edata) { assert(edata_pai_get(edata) == EXTENT_PAI_HPA); assert(edata_ps_get(edata) != NULL); - edata_t *ps = edata_ps_get(edata); + fb_group_t *ps_fb = edata_slab_data_get(ps)->bitmap; size_t ps_old_longest_free_range = edata_longest_free_range_get(ps); pszind_t old_pind = SC_NPSIZES; @@ -274,22 +328,12 @@ psset_dalloc(psset_t *psset, edata_t *edata) { >> LG_PAGE; size_t len = edata_size_get(edata) >> LG_PAGE; fb_unset_range(ps_fb, ps_npages, begin, len); - if (ps_old_longest_free_range == 0) { - /* We were in the (imaginary) full bin; update stats for it. */ - psset_bin_stats_adjust(&psset->stats.full_slabs, ps, - /* inc */ false); - } else { - /* - * The edata is still in the bin, need to update its - * contribution. - */ - psset->stats.nonfull_slabs[old_pind].nactive -= len; - psset->stats.nonfull_slabs[old_pind].ninactive += len; - } - /* - * Note that we want to do this after the stats updates, since if it was - * full it psset_bin_stats_adjust would have looked at the old version. - */ + + /* The pageslab is still in the bin; adjust its stats first. */ + psset_bin_stats_t *bin_stats = (ps_old_longest_free_range == 0 + ? &psset->stats.full_slabs : &psset->stats.nonfull_slabs[old_pind]); + psset_bin_stats_deactivate(bin_stats, edata_hugeified_get(ps), len); + edata_nfree_set(ps, (uint32_t)(edata_nfree_get(ps) + len)); /* We might have just created a new, larger range. */ @@ -327,6 +371,12 @@ psset_dalloc(psset_t *psset, edata_t *edata) { bitmap_set(psset->bitmap, &psset_bitmap_info, (size_t)old_pind); } + } else { + /* + * Otherwise, the bin was full, and we need to adjust the full + * bin stats. + */ + psset_bin_stats_remove(&psset->stats.full_slabs, ps); } /* If the pageslab is empty, it gets evicted from the set. 
*/ if (new_range_len == ps_npages) { diff --git a/src/stats.c b/src/stats.c index 4b40721a..abe3ab16 100644 --- a/src/stats.c +++ b/src/stats.c @@ -667,16 +667,27 @@ stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i) { emitter_row_t row; emitter_row_init(&row); - size_t npageslabs; - size_t nactive; - size_t ninactive; + size_t npageslabs_huge; + size_t nactive_huge; + size_t ninactive_huge; - CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs", - i, &npageslabs, size_t); - CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive", - i, &nactive, size_t); - CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ninactive", - i, &ninactive, size_t); + size_t npageslabs_nonhuge; + size_t nactive_nonhuge; + size_t ninactive_nonhuge; + + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs_huge", + i, &npageslabs_huge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive_huge", + i, &nactive_huge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ninactive_huge", + i, &ninactive_huge, size_t); + + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs_nonhuge", + i, &npageslabs_nonhuge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive_nonhuge", + i, &nactive_nonhuge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ninactive_nonhuge", + i, &ninactive_nonhuge, size_t); size_t sec_bytes; CTL_M2_GET("stats.arenas.0.hpa_sec_bytes", i, &sec_bytes, size_t); @@ -686,39 +697,62 @@ stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i) { emitter_table_printf(emitter, "HPA shard stats:\n" " In full slabs:\n" - " npageslabs: %zu\n" - " nactive: %zu\n" - " ninactive: %zu\n", - npageslabs, nactive, ninactive); + " npageslabs: %zu huge, %zu nonhuge\n" + " nactive: %zu huge, %zu nonhuge \n" + " ninactive: %zu huge, %zu nonhuge \n", + npageslabs_huge, npageslabs_nonhuge, nactive_huge, nactive_nonhuge, + ninactive_huge, ninactive_nonhuge); emitter_json_object_kv_begin(emitter, "hpa_shard"); emitter_json_object_kv_begin(emitter, "full_slabs"); - emitter_json_kv(emitter, "npageslabs", emitter_type_size, &npageslabs); - emitter_json_kv(emitter, "nactive", emitter_type_size, &nactive); - emitter_json_kv(emitter, "ninactive", emitter_type_size, &ninactive); + emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size, + &npageslabs_huge); + emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size, + &npageslabs_nonhuge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size, + &nactive_nonhuge); + emitter_json_kv(emitter, "ninactive_huge", emitter_type_size, + &ninactive_huge); + emitter_json_kv(emitter, "ninactive_nonhuge", emitter_type_size, + &ninactive_nonhuge); emitter_json_object_end(emitter); /* End "full_slabs" */ COL_HDR(row, size, NULL, right, 20, size) COL_HDR(row, ind, NULL, right, 4, unsigned) - COL_HDR(row, npageslabs, NULL, right, 13, size) - COL_HDR(row, nactive, NULL, right, 13, size) - COL_HDR(row, ninactive, NULL, right, 13, size) + COL_HDR(row, npageslabs_huge, NULL, right, 16, size) + COL_HDR(row, nactive_huge, NULL, right, 16, size) + COL_HDR(row, ninactive_huge, NULL, right, 16, size) + COL_HDR(row, npageslabs_nonhuge, NULL, right, 20, size) + COL_HDR(row, nactive_nonhuge, NULL, right, 20, size) + COL_HDR(row, ninactive_nonhuge, NULL, right, 20, size) emitter_table_row(emitter, &header_row); emitter_json_array_kv_begin(emitter, "nonfull_slabs"); bool in_gap = false; for (pszind_t j = 0; j < PSSET_NPSIZES; j++) { 
 		CTL_M2_M5_GET(
-		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.npageslabs",
-		    i, j, &npageslabs, size_t);
+		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.npageslabs_huge",
+		    i, j, &npageslabs_huge, size_t);
 		CTL_M2_M5_GET(
-		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.nactive",
-		    i, j, &nactive, size_t);
+		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.nactive_huge",
+		    i, j, &nactive_huge, size_t);
 		CTL_M2_M5_GET(
-		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.ninactive",
-		    i, j, &ninactive, size_t);
+		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.ninactive_huge",
+		    i, j, &ninactive_huge, size_t);
+
+		CTL_M2_M5_GET(
+		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.npageslabs_nonhuge",
+		    i, j, &npageslabs_nonhuge, size_t);
+		CTL_M2_M5_GET(
+		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.nactive_nonhuge",
+		    i, j, &nactive_nonhuge, size_t);
+		CTL_M2_M5_GET(
+		    "stats.arenas.0.hpa_shard.nonfull_slabs.0.ninactive_nonhuge",
+		    i, j, &ninactive_nonhuge, size_t);
 
 		bool in_gap_prev = in_gap;
-		in_gap = (npageslabs == 0);
+		in_gap = (npageslabs_huge == 0 && npageslabs_nonhuge == 0);
 		if (in_gap_prev && !in_gap) {
 			emitter_table_printf(emitter,
 			    "                     ---\n");
@@ -726,20 +760,29 @@ stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i) {
 		col_size.size_val = sz_pind2sz(j);
 		col_ind.size_val = j;
-		col_npageslabs.size_val = npageslabs;
-		col_nactive.size_val = nactive;
-		col_ninactive.size_val = ninactive;
+		col_npageslabs_huge.size_val = npageslabs_huge;
+		col_nactive_huge.size_val = nactive_huge;
+		col_ninactive_huge.size_val = ninactive_huge;
+		col_npageslabs_nonhuge.size_val = npageslabs_nonhuge;
+		col_nactive_nonhuge.size_val = nactive_nonhuge;
+		col_ninactive_nonhuge.size_val = ninactive_nonhuge;
 		if (!in_gap) {
 			emitter_table_row(emitter, &row);
 		}
 		emitter_json_object_begin(emitter);
-		emitter_json_kv(emitter, "npageslabs", emitter_type_size,
-		    &npageslabs);
-		emitter_json_kv(emitter, "nactive", emitter_type_size,
-		    &nactive);
-		emitter_json_kv(emitter, "ninactive", emitter_type_size,
-		    &ninactive);
+		emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size,
+		    &npageslabs_huge);
+		emitter_json_kv(emitter, "nactive_huge", emitter_type_size,
+		    &nactive_huge);
+		emitter_json_kv(emitter, "ninactive_huge", emitter_type_size,
+		    &ninactive_huge);
+		emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size,
+		    &npageslabs_nonhuge);
+		emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size,
+		    &nactive_nonhuge);
+		emitter_json_kv(emitter, "ninactive_nonhuge", emitter_type_size,
+		    &ninactive_nonhuge);
 		emitter_json_object_end(emitter);
 	}
 	emitter_json_array_end(emitter); /* End "nonfull_slabs" */
diff --git a/test/unit/hpa.c b/test/unit/hpa.c
index b58dcede..72a20c32 100644
--- a/test/unit/hpa.c
+++ b/test/unit/hpa.c
@@ -2,14 +2,9 @@
 
 #include "jemalloc/internal/hpa.h"
 
-#define HPA_IND 111
-#define SHARD_IND 222
+#define SHARD_IND 111
 
-#define PS_GOAL (128 * PAGE)
-#define PS_ALLOC_MAX (64 * PAGE)
-
-#define HPA_SMALL_MAX (200 * PAGE)
-#define HPA_LARGE_MIN (300 * PAGE)
+#define ALLOC_MAX (HUGEPAGE / 4)
 
 typedef struct test_data_s test_data_t;
 struct test_data_s {
@@ -18,50 +13,32 @@ struct test_data_s {
 	 * test_data_t and the hpa_shard_t;
 	 */
 	hpa_shard_t shard;
-	base_t *shard_base;
+	base_t *base;
 	edata_cache_t shard_edata_cache;
 
-	hpa_t hpa;
-	base_t *hpa_base;
-	edata_cache_t hpa_edata_cache;
-
 	emap_t emap;
 };
 
 static hpa_shard_t *
 create_test_data() {
 	bool err;
-	base_t *shard_base = base_new(TSDN_NULL, /* ind */ SHARD_IND,
+	base_t *base = base_new(TSDN_NULL, /* ind */ SHARD_IND,
 	    &ehooks_default_extent_hooks);
-	
assert_ptr_not_null(shard_base, ""); - - base_t *hpa_base = base_new(TSDN_NULL, /* ind */ HPA_IND, - &ehooks_default_extent_hooks); - assert_ptr_not_null(hpa_base, ""); + assert_ptr_not_null(base, ""); test_data_t *test_data = malloc(sizeof(test_data_t)); assert_ptr_not_null(test_data, ""); - test_data->shard_base = shard_base; - test_data->hpa_base = hpa_base; + test_data->base = base; - err = edata_cache_init(&test_data->shard_edata_cache, shard_base); + err = edata_cache_init(&test_data->shard_edata_cache, base); assert_false(err, ""); - err = edata_cache_init(&test_data->hpa_edata_cache, hpa_base); + err = emap_init(&test_data->emap, test_data->base, /* zeroed */ false); assert_false(err, ""); - err = emap_init(&test_data->emap, test_data->hpa_base, - /* zeroed */ false); - assert_false(err, ""); - - err = hpa_init(&test_data->hpa, hpa_base, &test_data->emap, - &test_data->hpa_edata_cache); - assert_false(err, ""); - - err = hpa_shard_init(&test_data->shard, &test_data->hpa, - &test_data->shard_edata_cache, SHARD_IND, PS_GOAL, PS_ALLOC_MAX, - HPA_SMALL_MAX, HPA_LARGE_MIN); + err = hpa_shard_init(&test_data->shard, &test_data->emap, + &test_data->shard_edata_cache, SHARD_IND, ALLOC_MAX); assert_false(err, ""); return (hpa_shard_t *)test_data; @@ -70,12 +47,11 @@ create_test_data() { static void destroy_test_data(hpa_shard_t *shard) { test_data_t *test_data = (test_data_t *)shard; - base_delete(TSDN_NULL, test_data->shard_base); - base_delete(TSDN_NULL, test_data->hpa_base); + base_delete(TSDN_NULL, test_data->base); free(test_data); } -TEST_BEGIN(test_small_max_large_min) { +TEST_BEGIN(test_alloc_max) { test_skip_if(LG_SIZEOF_PTR != 3); hpa_shard_t *shard = create_test_data(); @@ -84,18 +60,11 @@ TEST_BEGIN(test_small_max_large_min) { edata_t *edata; /* Small max */ - edata = pai_alloc(tsdn, &shard->pai, HPA_SMALL_MAX, PAGE, false); + edata = pai_alloc(tsdn, &shard->pai, ALLOC_MAX, PAGE, false); expect_ptr_not_null(edata, "Allocation of small max failed"); - edata = pai_alloc(tsdn, &shard->pai, HPA_SMALL_MAX + PAGE, PAGE, false); + edata = pai_alloc(tsdn, &shard->pai, ALLOC_MAX + PAGE, PAGE, false); expect_ptr_null(edata, "Allocation of larger than small max succeeded"); - /* Large min */ - edata = pai_alloc(tsdn, &shard->pai, HPA_LARGE_MIN, PAGE, false); - expect_ptr_not_null(edata, "Allocation of large min failed"); - edata = pai_alloc(tsdn, &shard->pai, HPA_LARGE_MIN - PAGE, PAGE, false); - expect_ptr_null(edata, - "Allocation of smaller than large min succeeded"); - destroy_test_data(shard); } TEST_END @@ -178,26 +147,19 @@ TEST_BEGIN(test_stress) { mem_tree_new(&tree); for (size_t i = 0; i < 100 * 1000; i++) { - size_t operation = prng_range_zu(&prng_state, 4); - if (operation < 2) { + size_t operation = prng_range_zu(&prng_state, 2); + if (operation == 0) { /* Alloc */ if (nlive_edatas == nlive_edatas_max) { continue; } - size_t npages_min; - size_t npages_max; /* * We make sure to get an even balance of small and * large allocations. 
*/ - if (operation == 0) { - npages_min = 1; - npages_max = HPA_SMALL_MAX / PAGE; - } else { - npages_min = HPA_LARGE_MIN / PAGE; - npages_max = HPA_LARGE_MIN / PAGE + 20; - } + size_t npages_min = 1; + size_t npages_max = ALLOC_MAX / PAGE; size_t npages = npages_min + prng_range_zu(&prng_state, npages_max - npages_min); edata_t *edata = pai_alloc(tsdn, &shard->pai, @@ -260,6 +222,6 @@ main(void) { (void)mem_tree_reverse_iter; (void)mem_tree_destroy; return test_no_reentrancy( - test_small_max_large_min, + test_alloc_max, test_stress); } diff --git a/test/unit/psset.c b/test/unit/psset.c index e07bdc46..ea61ab92 100644 --- a/test/unit/psset.c +++ b/test/unit/psset.c @@ -2,7 +2,7 @@ #include "jemalloc/internal/psset.h" -#define PAGESLAB_PAGES 64 +#define PAGESLAB_PAGES (HUGEPAGE / PAGE) #define PAGESLAB_SIZE (PAGESLAB_PAGES << LG_PAGE) #define PAGESLAB_SN 123 #define PAGESLAB_ADDR ((void *)(1234 << LG_PAGE)) @@ -296,22 +296,23 @@ TEST_END static void stats_expect_empty(psset_bin_stats_t *stats) { - assert_zu_eq(0, stats->npageslabs, + assert_zu_eq(0, stats->npageslabs_nonhuge, "Supposedly empty bin had positive npageslabs"); - expect_zu_eq(0, stats->nactive, "Unexpected nonempty bin" + expect_zu_eq(0, stats->nactive_nonhuge, "Unexpected nonempty bin" "Supposedly empty bin had positive nactive"); - expect_zu_eq(0, stats->ninactive, "Unexpected nonempty bin" + expect_zu_eq(0, stats->ninactive_nonhuge, "Unexpected nonempty bin" "Supposedly empty bin had positive ninactive"); } static void stats_expect(psset_t *psset, size_t nactive) { if (nactive == PAGESLAB_PAGES) { - expect_zu_eq(1, psset->stats.full_slabs.npageslabs, + expect_zu_eq(1, psset->stats.full_slabs.npageslabs_nonhuge, "Expected a full slab"); - expect_zu_eq(PAGESLAB_PAGES, psset->stats.full_slabs.nactive, + expect_zu_eq(PAGESLAB_PAGES, + psset->stats.full_slabs.nactive_nonhuge, "Should have exactly filled the bin"); - expect_zu_eq(0, psset->stats.full_slabs.ninactive, + expect_zu_eq(0, psset->stats.full_slabs.ninactive_nonhuge, "Should never have inactive pages in a full slab"); } else { stats_expect_empty(&psset->stats.full_slabs); @@ -325,13 +326,13 @@ stats_expect(psset_t *psset, size_t nactive) { for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { if (i == nonempty_pind) { assert_zu_eq(1, - psset->stats.nonfull_slabs[i].npageslabs, + psset->stats.nonfull_slabs[i].npageslabs_nonhuge, "Should have found a slab"); expect_zu_eq(nactive, - psset->stats.nonfull_slabs[i].nactive, + psset->stats.nonfull_slabs[i].nactive_nonhuge, "Mismatch in active pages"); expect_zu_eq(ninactive, - psset->stats.nonfull_slabs[i].ninactive, + psset->stats.nonfull_slabs[i].ninactive_nonhuge, "Mismatch in inactive pages"); } else { stats_expect_empty(&psset->stats.nonfull_slabs[i]);
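
Reviewer note on the psset stats plumbing above: the psset_bin_stats_insert()/psset_bin_stats_remove()/psset_bin_stats_deactivate() helpers that replace the old psset_bin_stats_adjust() are defined in src/psset.c hunks earlier in this patch and are not visible in this excerpt. As a rough sketch only (the "_sketch" names below are invented for illustration; the authoritative definitions are the ones in src/psset.c), the bookkeeping they need to do over the npageslabs_/nactive_/ninactive_{huge,nonhuge} fields exercised by the ctl, stats, and test changes above looks roughly like this:

/*
 * Illustrative sketch, not part of the patch: a pageslab contributes its
 * page counts to either the *_huge or the *_nonhuge column of its bin,
 * keyed off edata_hugeified_get().
 */
static void
psset_bin_stats_insert_remove_sketch(psset_bin_stats_t *binstats, edata_t *ps,
    bool insert) {
	size_t npages = edata_size_get(ps) >> LG_PAGE;
	size_t ninactive = edata_nfree_get(ps);
	size_t nactive = npages - ninactive;
	/* Unsigned wraparound turns "+= mul * x" into a subtraction. */
	size_t mul = insert ? (size_t)1 : (size_t)-1;
	if (edata_hugeified_get(ps)) {
		binstats->npageslabs_huge += mul * 1;
		binstats->nactive_huge += mul * nactive;
		binstats->ninactive_huge += mul * ninactive;
	} else {
		binstats->npageslabs_nonhuge += mul * 1;
		binstats->nactive_nonhuge += mul * nactive;
		binstats->ninactive_nonhuge += mul * ninactive;
	}
}

/*
 * Illustrative sketch: move npages of a slab's pages from the active to the
 * inactive counter of whichever column (huge or nonhuge) the slab lives in,
 * as psset_dalloc does when it frees pages without moving the slab.
 */
static void
psset_bin_stats_deactivate_sketch(psset_bin_stats_t *binstats, bool huge,
    size_t npages) {
	if (huge) {
		binstats->nactive_huge -= npages;
		binstats->ninactive_huge += npages;
	} else {
		binstats->nactive_nonhuge -= npages;
		binstats->ninactive_nonhuge += npages;
	}
}

The property psset_hugify() relies on is that remove and insert are symmetric: removing the slab's contribution under its old hugeified flag, flipping the flag, and re-inserting moves its npageslabs/nactive/ninactive from the *_nonhuge counters to the *_huge counters of the same bin without changing the totals, which is exactly the huge/nonhuge split that the new stats and ctl output report.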