HPA: Track pending purges/hugifies in the psset.

This finishes the refactoring of the HPA/psset interactions that the past few
commits have been building towards.

Rather than the HPA removing and then reinserting hpdatas, it simply begins
updates and ends them.  These updates can set flags on the hpdata that prevent
it from being returned for certain types of requests.  For example, it can call
hpdata_alloc_allowed_set(hpdata, false) during an update, at which point the
given hpdata will no longer be returned for psset_pick_alloc requests.

This has various benefits:
- It maintains stats correctness during purges and hugifies.
- It allows simpler and more explicit concurrency control for the various
  special cases (e.g. allocations are disallowed during purge, but not during
  hugify).
- It lets allocations and deallocations avoid disturbing the purging and
  hugification orderings.  If an hpdata "loses its place" in one of the queues
  just due to an alloc / dalloc, it can result in pathological edge cases where
  very hot, very full hugepages never get hugified (and cold extents on the
  same hugepage as hot ones never get purged).

The key benefit, though, is that tracking hpdatas to be purged / hugified in a
principled way will let us do delayed purging and hugification.  Eventually this
will let us move these operations to background threads, but in the short term
the benefit is that it will let us have global purging policies (e.g. purge when
the entire arena has too many dirty pages, rather than when any particular
hugepage does).
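To make the protocol concrete, here is a minimal sketch of the new interaction
(the names are the ones introduced in this commit; locking and the surrounding
policy checks are elided):

    /*
     * Rather than psset_remove / psset_insert, a thread brackets its
     * metadata changes with an update, and uses the allowed-flags to
     * steer which requests the psset may hand the hpdata out for.
     */
    psset_update_begin(&shard->psset, ps);  /* ps stays logically in the psset */
    hpdata_alloc_allowed_set(ps, false);    /* hide ps from psset_pick_alloc */
    /* ... modify ps's metadata ... */
    psset_update_end(&shard->psset, ps);    /* containers and stats re-synced */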
David Goldblatt, 2020-12-06 09:49:26 -08:00
commit da63f23e68 (parent 0ea3d6307c)
6 changed files with 436 additions and 240 deletions

include/jemalloc/internal/hpdata.h

@@ -36,11 +36,30 @@ struct hpdata_s {
bool h_huge;
/*
- * Whether or not some thread is purging this hpdata (i.e. has called
- * hpdata_purge_begin but not yet called hpdata_purge_end), or
- * hugifying it. Only one thread at a time is allowed to change a
- * hugepage's state.
* For some properties, we keep parallel sets of bools; h_foo_allowed
* and h_in_psset_foo_container. This is a decoupling mechanism to
* avoid bothering the hpa (which manages policies) with the details of
* the psset (which is the mechanism used to enforce those policies).
* This allows all the container management logic to live in one place,
* without the HPA needing to know or care how that happens.
*/
/*
* Whether or not the hpdata is allowed to be used to serve allocations,
* and whether or not the psset is currently tracking it as such.
*/
bool h_alloc_allowed;
bool h_in_psset_alloc_container;
/* The same, but with purging. */
bool h_purge_allowed;
bool h_in_psset_purge_container;
/* And with hugifying. */
bool h_hugify_allowed;
bool h_in_psset_hugify_container;
/* Whether or not a purge or hugify is currently happening. */
bool h_mid_purge;
bool h_mid_hugify;
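To make the decoupling concrete, a small sketch of how one pair of these bools
interacts (it mirrors the container-sync logic in psset_update_end later in
this diff; the HPA side only ever touches the _allowed flag):

    /* HPA side: a pure policy decision. */
    hpdata_purge_allowed_set(ps, true);

    /* psset side (inside psset_update_end): the mechanism catches up. */
    if (hpdata_purge_allowed_get(ps)
        && !hpdata_in_psset_purge_container_get(ps)) {
            hpdata_in_psset_purge_container_set(ps, true);
            hpdata_purge_list_append(&psset->to_purge, ps);
    }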
@@ -65,6 +84,12 @@ struct hpdata_s {
ql_elm(hpdata_t) ql_link_empty;
};
/*
* Linkage for the psset to track candidates for purging and hugifying.
*/
ql_elm(hpdata_t) ql_link_purge;
ql_elm(hpdata_t) ql_link_hugify;
/* The length of the largest contiguous sequence of inactive pages. */
size_t h_longest_free_range;
@@ -86,6 +111,9 @@ struct hpdata_s {
};
TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty)
TYPED_LIST(hpdata_purge_list, hpdata_t, ql_link_purge)
TYPED_LIST(hpdata_hugify_list, hpdata_t, ql_link_hugify)
typedef ph(hpdata_t) hpdata_age_heap_t;
ph_proto(, hpdata_age_heap_, hpdata_age_heap_t, hpdata_t);
@@ -116,8 +144,66 @@ hpdata_huge_get(const hpdata_t *hpdata) {
}
static inline bool
-hpdata_changing_state_get(const hpdata_t *hpdata) {
-return hpdata->h_mid_purge || hpdata->h_mid_hugify;
hpdata_alloc_allowed_get(const hpdata_t *hpdata) {
return hpdata->h_alloc_allowed;
}
static inline void
hpdata_alloc_allowed_set(hpdata_t *hpdata, bool alloc_allowed) {
hpdata->h_alloc_allowed = alloc_allowed;
}
static inline bool
hpdata_in_psset_alloc_container_get(const hpdata_t *hpdata) {
return hpdata->h_in_psset_alloc_container;
}
static inline void
hpdata_in_psset_alloc_container_set(hpdata_t *hpdata, bool in_container) {
assert(in_container != hpdata->h_in_psset_alloc_container);
hpdata->h_in_psset_alloc_container = in_container;
}
static inline bool
hpdata_purge_allowed_get(const hpdata_t *hpdata) {
return hpdata->h_purge_allowed;
}
static inline void
hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) {
hpdata->h_purge_allowed = purge_allowed;
}
static inline bool
hpdata_in_psset_purge_container_get(const hpdata_t *hpdata) {
return hpdata->h_in_psset_purge_container;
}
static inline void
hpdata_in_psset_purge_container_set(hpdata_t *hpdata, bool in_container) {
assert(in_container != hpdata->h_in_psset_purge_container);
hpdata->h_in_psset_purge_container = in_container;
}
static inline bool
hpdata_hugify_allowed_get(const hpdata_t *hpdata) {
return hpdata->h_hugify_allowed;
}
static inline void
hpdata_hugify_allowed_set(hpdata_t *hpdata, bool hugify_allowed) {
hpdata->h_hugify_allowed = hugify_allowed;
}
static inline bool
hpdata_in_psset_hugify_container_get(const hpdata_t *hpdata) {
return hpdata->h_in_psset_hugify_container;
}
static inline void
hpdata_in_psset_hugify_container_set(hpdata_t *hpdata, bool in_container) {
assert(in_container != hpdata->h_in_psset_hugify_container);
hpdata->h_in_psset_hugify_container = in_container;
}
static inline bool
@@ -125,11 +211,29 @@ hpdata_mid_purge_get(const hpdata_t *hpdata) {
return hpdata->h_mid_purge;
}
static inline void
hpdata_mid_purge_set(hpdata_t *hpdata, bool mid_purge) {
assert(mid_purge != hpdata->h_mid_purge);
hpdata->h_mid_purge = mid_purge;
}
static inline bool
hpdata_mid_hugify_get(const hpdata_t *hpdata) {
return hpdata->h_mid_hugify;
}
static inline void
hpdata_mid_hugify_set(hpdata_t *hpdata, bool mid_hugify) {
assert(mid_hugify != hpdata->h_mid_hugify);
hpdata->h_mid_hugify = mid_hugify;
}
static inline bool
hpdata_changing_state_get(const hpdata_t *hpdata) {
return hpdata->h_mid_purge || hpdata->h_mid_hugify;
}
static inline bool
hpdata_updating_get(const hpdata_t *hpdata) {
return hpdata->h_updating;
@@ -278,26 +382,7 @@ bool hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
*/
void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);
-/*
- * Similarly, when hugifying, callers can do the metadata modifications while
- * holding a lock (thereby setting the change_state field), but actually do the
- * operation without blocking other threads.
- *
- * Unlike most metadata operations, hugification ending should happen while an
- * hpdata is in the psset (or upcoming hugepage collections). This is because
- * while purge/use races are unsafe, purge/hugepageify races are perfectly
- * reasonable.
- */
-void hpdata_hugify_begin(hpdata_t *hpdata);
-void hpdata_hugify_end(hpdata_t *hpdata);
/*
* Tell the hpdata that it's no longer a hugepage (all its pages are still
* counted as dirty, though; an explicit purge call is required to change that).
*
* This should only be done after starting to purge, and before actually purging
* any contents.
*/
void hpdata_hugify(hpdata_t *hpdata);
void hpdata_dehugify(hpdata_t *hpdata);
#endif /* JEMALLOC_INTERNAL_HPDATA_H */
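For orientation, this is the shape a caller of the purging API is expected to
take (a condensed version of hpa_try_purge from src/hpa.c later in this diff;
the lock handoffs are elided):

    hpdata_purge_state_t purge_state;
    hpdata_purge_begin(ps, &purge_state);   /* plan the purge, lock held */

    void *purge_addr;
    size_t purge_size;
    while (hpdata_purge_next(ps, &purge_state, &purge_addr, &purge_size)) {
        /* OS call; done with the shard lock dropped. */
        pages_purge_forced(purge_addr, purge_size);
    }

    hpdata_purge_end(ps, &purge_state);     /* settle metadata, lock retaken */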

include/jemalloc/internal/psset.h

@@ -61,8 +61,15 @@ struct psset_s {
hpdata_age_heap_t pageslabs[PSSET_NPSIZES];
bitmap_t bitmap[BITMAP_GROUPS(PSSET_NPSIZES)];
psset_stats_t stats;
-/* Slabs with no active allocations. */
-hpdata_empty_list_t empty_slabs;
/*
* Slabs with no active allocations, but which are allowed to serve new
* allocations.
*/
hpdata_empty_list_t empty;
/* Slabs which are available to be purged. */
hpdata_purge_list_t to_purge;
/* Slabs which are available to be hugified. */
hpdata_hugify_list_t to_hugify;
};
void psset_init(psset_t *psset);
@@ -77,6 +84,10 @@ void psset_update_end(psset_t *psset, hpdata_t *ps);
/* Analogous to the eset_fit; pick a hpdata to serve the request. */
hpdata_t *psset_pick_alloc(psset_t *psset, size_t size);
/* Pick one to purge. */
hpdata_t *psset_pick_purge(psset_t *psset);
/* Pick one to hugify. */
hpdata_t *psset_pick_hugify(psset_t *psset);
void psset_insert(psset_t *psset, hpdata_t *ps);
void psset_remove(psset_t *psset, hpdata_t *ps);
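A sketch of how the pick functions are meant to be consumed (this is the
skeleton of hpa_try_purge in src/hpa.c below; it assumes the caller already
holds the shard mutex):

    hpdata_t *to_purge = psset_pick_purge(psset);
    if (to_purge != NULL) {
        psset_update_begin(psset, to_purge);
        /* Flip flags so no one else grabs it for a conflicting operation. */
        hpdata_purge_allowed_set(to_purge, false);
        hpdata_alloc_allowed_set(to_purge, false);
        psset_update_end(psset, to_purge);
        /* ... do the actual purge, then re-enable via another update ... */
    }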

src/hpa.c (303 lines changed)

@@ -227,65 +227,150 @@ hpa_grow(tsdn_t *tsdn, hpa_shard_t *shard) {
return ps;
}
-/*
- * As a precondition, ps should not be in the psset (we can handle deallocation
- * races, but not allocation ones), and we should hold the shard mutex.
- */
-static void
-hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
malloc_mutex_assert_owner(tsdn, &shard->mtx);
-while (hpa_should_purge(shard, ps)) {
-/* Do the metadata update bit while holding the lock. */
-hpdata_purge_state_t purge_state;
-hpdata_purge_begin(ps, &purge_state);
-shard->stats.npurge_passes++;
-/*
- * Dehugifying can only happen on the first loop iteration,
- * since no other threads can allocate out of this ps while
- * we're purging (and thus, can't hugify it), but there's not a
- * natural way to express that in the control flow.
- */
-bool needs_dehugify = false;
-if (hpdata_huge_get(ps)) {
-needs_dehugify = true;
-shard->stats.ndehugifies++;
-hpdata_dehugify(ps);
-}
-/* Drop the lock to do the OS calls. */
-malloc_mutex_unlock(tsdn, &shard->mtx);
-if (needs_dehugify) {
-pages_nohuge(hpdata_addr_get(ps), HUGEPAGE);
-}
-size_t total_purged = 0;
-uint64_t purges_this_pass = 0;
-void *purge_addr;
-size_t purge_size;
-while (hpdata_purge_next(ps, &purge_state, &purge_addr,
-&purge_size)) {
-purges_this_pass++;
-pages_purge_forced(purge_addr, purge_size);
-total_purged += purge_size;
-}
-/* Reacquire to finish our metadata update. */
-malloc_mutex_lock(tsdn, &shard->mtx);
-shard->stats.npurges += purges_this_pass;
-hpdata_purge_end(ps, &purge_state);
-assert(total_purged <= HUGEPAGE);
-/*
- * We're not done here; other threads can't allocate out of ps
- * while purging, but they can still deallocate. Those
- * deallocations could have meant more purging than what we
- * planned ought to happen. We have to re-check now that we've
- * reacquired the mutex again.
- */
hpdata_t *to_purge = psset_pick_purge(&shard->psset);
if (to_purge == NULL) {
return false;
}
assert(hpdata_purge_allowed_get(to_purge));
assert(!hpdata_changing_state_get(to_purge));
/*
* Don't let anyone else purge or hugify this page while
* we're purging it (deallocations are OK, but, as explained
* below, allocations are not).
*/
psset_update_begin(&shard->psset, to_purge);
assert(hpdata_alloc_allowed_get(to_purge));
hpdata_mid_purge_set(to_purge, true);
hpdata_purge_allowed_set(to_purge, false);
hpdata_hugify_allowed_set(to_purge, false);
/*
* Unlike with hugification (where concurrent
* allocations are allowed), concurrent allocation out
* of a hugepage being purged is unsafe; we might hand
* out an extent for an allocation and then purge it
* (clearing out user data).
*/
hpdata_alloc_allowed_set(to_purge, false);
psset_update_end(&shard->psset, to_purge);
/* Gather all the metadata we'll need during the purge. */
bool dehugify = hpdata_huge_get(to_purge);
hpdata_purge_state_t purge_state;
hpdata_purge_begin(to_purge, &purge_state);
malloc_mutex_unlock(tsdn, &shard->mtx);
/* Actually do the purging, now that the lock is dropped. */
if (dehugify) {
pages_nohuge(hpdata_addr_get(to_purge), HUGEPAGE);
}
size_t total_purged = 0;
uint64_t purges_this_pass = 0;
void *purge_addr;
size_t purge_size;
while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
&purge_size)) {
total_purged += purge_size;
assert(total_purged <= HUGEPAGE);
purges_this_pass++;
pages_purge_forced(purge_addr, purge_size);
}
malloc_mutex_lock(tsdn, &shard->mtx);
/* The shard updates. */
shard->stats.npurge_passes++;
shard->stats.npurges += purges_this_pass;
if (dehugify) {
shard->stats.ndehugifies++;
}
/* The hpdata updates. */
psset_update_begin(&shard->psset, to_purge);
if (dehugify) {
hpdata_dehugify(to_purge);
}
hpdata_purge_end(to_purge, &purge_state);
hpdata_mid_purge_set(to_purge, false);
hpdata_alloc_allowed_set(to_purge, true);
hpdata_purge_allowed_set(to_purge, hpa_should_purge(shard, to_purge));
hpdata_hugify_allowed_set(to_purge, hpa_should_hugify(shard, to_purge));
psset_update_end(&shard->psset, to_purge);
return true;
}
/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
malloc_mutex_assert_owner(tsdn, &shard->mtx);
hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
if (to_hugify == NULL) {
return false;
}
assert(hpdata_hugify_allowed_get(to_hugify));
assert(!hpdata_changing_state_get(to_hugify));
/*
* Don't let anyone else purge or hugify this page while
* we're hugifying it (allocations and deallocations are
* OK).
*/
psset_update_begin(&shard->psset, to_hugify);
hpdata_mid_hugify_set(to_hugify, true);
hpdata_purge_allowed_set(to_hugify, false);
hpdata_hugify_allowed_set(to_hugify, false);
assert(hpdata_alloc_allowed_get(to_hugify));
psset_update_end(&shard->psset, to_hugify);
malloc_mutex_unlock(tsdn, &shard->mtx);
bool err = pages_huge(hpdata_addr_get(to_hugify),
HUGEPAGE);
/*
* It's not clear what we could do in case of error; we
* might get into situations where we loop trying to
* hugify some page and failing over and over again.
* Just eat the error and pretend we were successful.
*/
(void)err;
malloc_mutex_lock(tsdn, &shard->mtx);
shard->stats.nhugifies++;
psset_update_begin(&shard->psset, to_hugify);
hpdata_hugify(to_hugify);
hpdata_mid_hugify_set(to_hugify, false);
hpdata_purge_allowed_set(to_hugify,
hpa_should_purge(shard, to_hugify));
hpdata_hugify_allowed_set(to_hugify, false);
psset_update_end(&shard->psset, to_hugify);
return true;
}
static void
hpa_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
bool hugified;
bool purged;
size_t nloop = 0;
/* Just *some* cap, to impose a worst-case latency bound. */
size_t maxloops = 100;
do {
malloc_mutex_assert_owner(tsdn, &shard->mtx);
hugified = hpa_try_hugify(tsdn, shard);
purged = hpa_try_purge(tsdn, shard);
malloc_mutex_assert_owner(tsdn, &shard->mtx);
} while ((hugified || purged) && nloop++ < maxloops);
}
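The intended call pattern, visible in hpa_dalloc later in this diff: flip
policy flags inside a psset update, then run the deferred-work loop before
dropping the shard mutex. Condensed:

    malloc_mutex_lock(tsdn, &shard->mtx);
    psset_update_begin(&shard->psset, ps);
    hpdata_unreserve(ps, unreserve_addr, unreserve_size);
    if (hpa_should_purge(shard, ps)) {
        hpdata_purge_allowed_set(ps, true);  /* policy: mark it purgeable */
    }
    psset_update_end(&shard->psset, ps);     /* mechanism: ps joins to_purge */
    hpa_do_deferred_work(tsdn, shard);       /* may purge/hugify other slabs */
    malloc_mutex_unlock(tsdn, &shard->mtx);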
static edata_t *
@@ -344,6 +429,10 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom)
* We should arguably reset dirty state here, but this would
* require some sort of prepare + commit functionality that's a
* little much to deal with for now.
*
* We don't have a do_deferred_work down this pathway, on the
* principle that we didn't *really* affect shard state (we
* tweaked the stats, but our tweaks weren't really accurate).
*/
psset_update_end(&shard->psset, ps);
edata_cache_small_put(tsdn, &shard->ecs, edata);
@@ -352,49 +441,14 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom)
return NULL;
}
-bool hugify = hpa_should_hugify(shard, ps);
-if (hugify) {
-hpdata_hugify_begin(ps);
-shard->stats.nhugifies++;
if (hpa_should_hugify(shard, ps)) {
hpdata_hugify_allowed_set(ps, true);
}
psset_update_end(&shard->psset, ps);
hpa_do_deferred_work(tsdn, shard);
malloc_mutex_unlock(tsdn, &shard->mtx);
-if (hugify) {
-/*
- * Hugifying with the lock dropped is safe, even with
- * concurrent modifications to the ps. This relies on
- * the fact that the current implementation will never
- * dehugify a non-empty pageslab, and ps will never
- * become empty before we return edata to the user to be
- * freed.
- *
- * Note that holding the lock would prevent not just operations
- * on this page slab, but also any other alloc/dalloc operations
- * in this hpa shard.
- */
-bool err = pages_huge(hpdata_addr_get(ps), HUGEPAGE);
-/*
- * Pretending we succeed when we actually failed is safe; trying
- * to roll back would be tricky, though. Eat the error.
- */
-(void)err;
-malloc_mutex_lock(tsdn, &shard->mtx);
-hpdata_hugify_end(ps);
-if (hpa_should_purge(shard, ps)) {
-/*
- * There was a race in which the ps went from being
- * almost full to having lots of free space while we
- * hugified. Undo our operation, taking care to meet
- * the precondition that the ps isn't in the psset.
- */
-psset_update_begin(&shard->psset, ps);
-hpa_purge(tsdn, shard, ps);
-psset_update_end(&shard->psset, ps);
-}
-malloc_mutex_unlock(tsdn, &shard->mtx);
-}
return edata;
}
@@ -445,6 +499,14 @@ hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) {
return NULL;
}
/*
* TODO: the tail of this function is quite similar to the tail of
* hpa_try_alloc_no_grow (both, broadly, do the metadata management of
* initializing an edata_t from an hpdata_t once both have been
* allocated). The only differences are in error case handling and lock
* management (we hold grow_mtx, but should drop it before doing any
* deferred work). With a little refactoring, we could unify the paths.
*/
psset_update_begin(&shard->psset, ps);
void *addr = hpdata_reserve_alloc(ps, size);
@@ -481,10 +543,20 @@ hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) {
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
return NULL;
}
if (hpa_should_hugify(shard, ps)) {
hpdata_hugify_allowed_set(ps, true);
}
psset_update_end(&shard->psset, ps);
-malloc_mutex_unlock(tsdn, &shard->mtx);
/*
* Drop grow_mtx before doing deferred work; other threads blocked on it
* should be allowed to proceed while we're working.
*/
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
hpa_do_deferred_work(tsdn, shard);
malloc_mutex_unlock(tsdn, &shard->mtx);
return edata;
}
@@ -579,48 +651,15 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
size_t unreserve_size = edata_size_get(edata);
edata_cache_small_put(tsdn, &shard->ecs, edata);
-/*
- * We have three rules interacting here:
- * - You can't update ps metadata while it's still in the psset. We
- * enforce this because it's necessary for stats tracking and metadata
- * management.
- * - The ps must not be in the psset while purging. This is because we
- * can't handle purge/alloc races.
- * - Whoever removes the ps from the psset is the one to reinsert it.
- */
-if (hpdata_mid_purge_get(ps)) {
-/*
- * Another thread started purging, and so the ps is not in the
- * psset and we can do our metadata update. The other thread is
- * in charge of reinserting the ps, so we're done.
- */
-assert(hpdata_updating_get(ps));
-hpdata_unreserve(ps, unreserve_addr, unreserve_size);
-malloc_mutex_unlock(tsdn, &shard->mtx);
-return;
-}
-/*
- * No other thread is purging, and the ps is non-empty, so it should be
- * in the psset.
- */
-assert(!hpdata_updating_get(ps));
psset_update_begin(&shard->psset, ps);
hpdata_unreserve(ps, unreserve_addr, unreserve_size);
-if (!hpa_should_purge(shard, ps)) {
-/*
- * This should be the common case; no other thread is purging,
- * and we won't purge either.
- */
-psset_update_end(&shard->psset, ps);
-malloc_mutex_unlock(tsdn, &shard->mtx);
-return;
if (hpa_should_purge(shard, ps)) {
hpdata_purge_allowed_set(ps, true);
}
-/* It's our job to purge. */
-hpa_purge(tsdn, shard, ps);
psset_update_end(&shard->psset, ps);
hpa_do_deferred_work(tsdn, shard);
malloc_mutex_unlock(tsdn, &shard->mtx);
}
}

src/hpdata.c

@@ -22,6 +22,12 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) {
hpdata_addr_set(hpdata, addr);
hpdata_age_set(hpdata, age);
hpdata->h_huge = false;
hpdata->h_alloc_allowed = true;
hpdata->h_in_psset_alloc_container = false;
hpdata->h_purge_allowed = false;
hpdata->h_in_psset_purge_container = false;
hpdata->h_hugify_allowed = false;
hpdata->h_in_psset_hugify_container = false;
hpdata->h_mid_purge = false;
hpdata->h_mid_hugify = false;
hpdata->h_updating = false;
@@ -44,6 +50,7 @@ hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) {
* mid-update.
*/
assert(!hpdata->h_in_psset || hpdata->h_updating);
assert(hpdata->h_alloc_allowed);
assert((sz & PAGE_MASK) == 0);
size_t npages = sz >> LG_PAGE;
assert(npages <= hpdata_longest_free_range_get(hpdata));
@@ -155,10 +162,6 @@ void
hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
hpdata_assert_consistent(hpdata);
/* See the comment in reserve. */
assert(!hpdata->h_in_psset || hpdata->h_updating);
-assert(!hpdata->h_mid_purge);
-assert(!hpdata->h_mid_hugify);
-hpdata->h_mid_purge = true;
purge_state->npurged = 0;
purge_state->next_purge_search_begin = 0;
@@ -192,12 +195,6 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
* hpdata without synchronization, and therefore have no right to expect
* a consistent state.
*/
-assert(hpdata->h_mid_purge);
-/* See the comment in reserve. */
-assert(!hpdata->h_in_psset || hpdata->h_updating);
-/* Should have dehugified already (if necessary). */
-assert(!hpdata->h_huge);
-assert(!hpdata->h_mid_hugify);
if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) {
return false;
@@ -226,9 +223,6 @@ hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
hpdata_assert_consistent(hpdata);
/* See the comment in reserve. */
assert(!hpdata->h_in_psset || hpdata->h_updating);
-assert(hpdata->h_mid_purge);
-assert(!hpdata->h_mid_hugify);
-hpdata->h_mid_purge = false;
assert(purge_state->npurged == fb_scount(purge_state->to_purge,
HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
@@ -244,40 +238,17 @@ hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
}
void
-hpdata_hugify_begin(hpdata_t *hpdata) {
hpdata_hugify(hpdata_t *hpdata) {
hpdata_assert_consistent(hpdata);
-/* See the comment in reserve. */
-assert(!hpdata->h_in_psset || hpdata->h_updating);
-assert(!hpdata->h_mid_purge);
-assert(!hpdata->h_mid_hugify);
-hpdata->h_mid_hugify = true;
hpdata->h_huge = true;
fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
hpdata->h_ntouched = HUGEPAGE_PAGES;
hpdata_assert_consistent(hpdata);
}
-void
-hpdata_hugify_end(hpdata_t *hpdata) {
-hpdata_assert_consistent(hpdata);
-/*
- * This is the exception to the "no-metadata updates without informing
- * the psset first" rule; this assert would be incorrect.
- */
-/* assert(!hpdata->h_in_psset || hpdata->h_updating); */
-assert(!hpdata->h_mid_purge);
-assert(hpdata->h_mid_hugify);
-hpdata->h_mid_hugify = false;
-hpdata_assert_consistent(hpdata);
-}
void
hpdata_dehugify(hpdata_t *hpdata) {
hpdata_assert_consistent(hpdata);
assert(hpdata->h_updating);
-assert(hpdata->h_mid_purge);
-assert(!hpdata->h_mid_hugify);
hpdata->h_huge = false;
hpdata_assert_consistent(hpdata);
}

src/psset.c

@@ -15,7 +15,9 @@ psset_init(psset_t *psset) {
}
bitmap_init(psset->bitmap, &psset_bitmap_info, /* fill */ true);
memset(&psset->stats, 0, sizeof(psset->stats));
-hpdata_empty_list_init(&psset->empty_slabs);
hpdata_empty_list_init(&psset->empty);
hpdata_purge_list_init(&psset->to_purge);
hpdata_hugify_list_init(&psset->to_hugify);
}
static void
@@ -85,25 +87,56 @@ psset_hpdata_heap_insert(psset_t *psset, pszind_t pind, hpdata_t *ps) {
hpdata_age_heap_insert(&psset->pageslabs[pind], ps);
}
/*
* Insert ps into the data structures we use to track allocation stats and pick
* the pageslabs for new allocations.
*
* In particular, this does *not* remove ps from any hugification / purging
* queues it may be in.
*/
static void
-psset_do_alloc_tracking_insert(psset_t *psset, hpdata_t *ps) {
psset_stats_insert(psset_t *psset, hpdata_t *ps) {
if (hpdata_empty(ps)) {
psset_bin_stats_insert(psset->stats.empty_slabs, ps);
} else if (hpdata_full(ps)) {
psset_bin_stats_insert(psset->stats.full_slabs, ps);
} else {
size_t longest_free_range = hpdata_longest_free_range_get(ps);
pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(
longest_free_range << LG_PAGE));
assert(pind < PSSET_NPSIZES);
psset_bin_stats_insert(psset->stats.nonfull_slabs[pind], ps);
}
}
static void
psset_stats_remove(psset_t *psset, hpdata_t *ps) {
if (hpdata_empty(ps)) {
psset_bin_stats_remove(psset->stats.empty_slabs, ps);
} else if (hpdata_full(ps)) {
psset_bin_stats_remove(psset->stats.full_slabs, ps);
} else {
size_t longest_free_range = hpdata_longest_free_range_get(ps);
pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(
longest_free_range << LG_PAGE));
assert(pind < PSSET_NPSIZES);
psset_bin_stats_remove(psset->stats.nonfull_slabs[pind], ps);
}
}
/*
* Put ps into some container so that it can be found during future allocation
* requests.
*/
static void
psset_alloc_container_insert(psset_t *psset, hpdata_t *ps) {
assert(!hpdata_in_psset_alloc_container_get(ps));
hpdata_in_psset_alloc_container_set(ps, true);
if (hpdata_empty(ps)) {
/*
* This prepend, paired with popping the head in psset_fit,
* means we implement LIFO ordering for the empty slabs set,
* which seems reasonable.
*/
-hpdata_empty_list_prepend(&psset->empty_slabs, ps);
hpdata_empty_list_prepend(&psset->empty, ps);
} else if (hpdata_full(ps)) {
-psset_bin_stats_insert(psset->stats.full_slabs, ps);
/*
* We don't need to keep track of the full slabs; we're never
* going to return them from a psset_pick_alloc call.
@@ -115,23 +148,20 @@ psset_do_alloc_tracking_insert(psset_t *psset, hpdata_t *ps) {
longest_free_range << LG_PAGE));
assert(pind < PSSET_NPSIZES);
-psset_bin_stats_insert(psset->stats.nonfull_slabs[pind], ps);
psset_hpdata_heap_insert(psset, pind, ps);
}
}
/* Remove ps from those collections. */
static void
-psset_do_alloc_tracking_remove(psset_t *psset, hpdata_t *ps) {
psset_alloc_container_remove(psset_t *psset, hpdata_t *ps) {
assert(hpdata_in_psset_alloc_container_get(ps));
hpdata_in_psset_alloc_container_set(ps, false);
if (hpdata_empty(ps)) {
-psset_bin_stats_remove(psset->stats.empty_slabs, ps);
-hpdata_empty_list_remove(&psset->empty_slabs, ps);
hpdata_empty_list_remove(&psset->empty, ps);
} else if (hpdata_full(ps)) {
-/*
- * We don't need to maintain an explicit container of full
- * pageslabs anywhere, but we do have to update stats.
- */
-psset_bin_stats_remove(psset->stats.full_slabs, ps);
/* Same as above -- do nothing in this case. */
} else {
size_t longest_free_range = hpdata_longest_free_range_get(ps);
@@ -139,7 +169,6 @@ psset_do_alloc_tracking_remove(psset_t *psset, hpdata_t *ps) {
longest_free_range << LG_PAGE));
assert(pind < PSSET_NPSIZES);
-psset_bin_stats_remove(psset->stats.nonfull_slabs[pind], ps);
psset_hpdata_heap_remove(psset, pind, ps);
}
}
@@ -149,7 +178,21 @@ psset_update_begin(psset_t *psset, hpdata_t *ps) {
hpdata_assert_consistent(ps);
assert(hpdata_in_psset_get(ps));
hpdata_updating_set(ps, true);
-psset_do_alloc_tracking_remove(psset, ps);
psset_stats_remove(psset, ps);
if (hpdata_in_psset_alloc_container_get(ps)) {
/*
* Some metadata updates can break alloc container invariants
* (e.g. the longest free range determines the hpdata_heap_t the
* pageslab lives in).
*/
assert(hpdata_alloc_allowed_get(ps));
psset_alloc_container_remove(psset, ps);
}
/*
* We don't update presence in the purge list or hugify list; we try to
* keep those FIFO, even in the presence of other metadata updates.
* We'll update presence at the end of the metadata update if necessary.
*/
}
void
@@ -157,7 +200,36 @@ psset_update_end(psset_t *psset, hpdata_t *ps) {
hpdata_assert_consistent(ps);
assert(hpdata_in_psset_get(ps));
hpdata_updating_set(ps, false);
-psset_do_alloc_tracking_insert(psset, ps);
psset_stats_insert(psset, ps);
/*
* The update begin should have removed ps from whatever alloc container
* it was in.
*/
assert(!hpdata_in_psset_alloc_container_get(ps));
if (hpdata_alloc_allowed_get(ps)) {
psset_alloc_container_insert(psset, ps);
}
if (hpdata_purge_allowed_get(ps)
&& !hpdata_in_psset_purge_container_get(ps)) {
hpdata_in_psset_purge_container_set(ps, true);
hpdata_purge_list_append(&psset->to_purge, ps);
} else if (!hpdata_purge_allowed_get(ps)
&& hpdata_in_psset_purge_container_get(ps)) {
hpdata_in_psset_purge_container_set(ps, false);
hpdata_purge_list_remove(&psset->to_purge, ps);
}
if (hpdata_hugify_allowed_get(ps)
&& !hpdata_in_psset_hugify_container_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, true);
hpdata_hugify_list_append(&psset->to_hugify, ps);
} else if (!hpdata_hugify_allowed_get(ps)
&& hpdata_in_psset_hugify_container_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, false);
hpdata_hugify_list_remove(&psset->to_hugify, ps);
}
}
hpdata_t *
@@ -169,7 +241,7 @@ psset_pick_alloc(psset_t *psset, size_t size) {
pszind_t pind = (pszind_t)bitmap_ffu(psset->bitmap, &psset_bitmap_info,
(size_t)min_pind);
if (pind == PSSET_NPSIZES) {
-return hpdata_empty_list_first(&psset->empty_slabs);
return hpdata_empty_list_first(&psset->empty);
}
hpdata_t *ps = hpdata_age_heap_first(&psset->pageslabs[pind]);
if (ps == NULL) {
@@ -181,16 +253,48 @@ psset_pick_alloc(psset_t *psset, size_t size) {
return ps;
}
hpdata_t *
psset_pick_purge(psset_t *psset) {
return hpdata_purge_list_first(&psset->to_purge);
}
hpdata_t *
psset_pick_hugify(psset_t *psset) {
return hpdata_hugify_list_first(&psset->to_hugify);
}
void
psset_insert(psset_t *psset, hpdata_t *ps) {
/* We only support inserting empty pageslabs, for now. */
assert(hpdata_empty(ps));
hpdata_in_psset_set(ps, true);
-psset_do_alloc_tracking_insert(psset, ps);
psset_stats_insert(psset, ps);
if (hpdata_alloc_allowed_get(ps)) {
psset_alloc_container_insert(psset, ps);
}
if (hpdata_purge_allowed_get(ps)) {
hpdata_in_psset_purge_container_set(ps, true);
hpdata_purge_list_append(&psset->to_purge, ps);
}
if (hpdata_hugify_allowed_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, true);
hpdata_hugify_list_append(&psset->to_hugify, ps);
}
}
void
psset_remove(psset_t *psset, hpdata_t *ps) {
hpdata_in_psset_set(ps, false);
-psset_do_alloc_tracking_remove(psset, ps);
psset_stats_remove(psset, ps);
if (hpdata_in_psset_alloc_container_get(ps)) {
psset_alloc_container_remove(psset, ps);
}
if (hpdata_in_psset_purge_container_get(ps)) {
hpdata_in_psset_purge_container_set(ps, false);
hpdata_purge_list_remove(&psset->to_purge, ps);
}
if (hpdata_in_psset_hugify_container_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, false);
hpdata_hugify_list_remove(&psset->to_hugify, ps);
}
}

test/unit/hpdata.c

@@ -67,13 +67,9 @@ TEST_BEGIN(test_purge_simple) {
expect_zu_eq(hpdata_ntouched_get(&hpdata), HUGEPAGE_PAGES / 2, "");
expect_false(hpdata_changing_state_get(&hpdata), "");
hpdata_purge_state_t purge_state;
hpdata_purge_begin(&hpdata, &purge_state);
-expect_true(hpdata_changing_state_get(&hpdata), "");
void *purge_addr;
size_t purge_size;
bool got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
@@ -82,17 +78,12 @@ TEST_BEGIN(test_purge_simple) {
expect_ptr_eq(HPDATA_ADDR, purge_addr, "");
expect_zu_eq(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
-expect_true(hpdata_changing_state_get(&hpdata), "");
got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
&purge_size);
expect_false(got_result, "Unexpected additional purge range: "
"extent at %p of size %zu", purge_addr, purge_size);
-expect_true(hpdata_changing_state_get(&hpdata), "");
hpdata_purge_end(&hpdata, &purge_state);
-expect_false(hpdata_changing_state_get(&hpdata), "");
expect_zu_eq(hpdata_ntouched_get(&hpdata), HUGEPAGE_PAGES / 4, "");
}
TEST_END
@@ -166,12 +157,7 @@ TEST_BEGIN(test_hugify) {
expect_zu_eq(HUGEPAGE_PAGES / 2, hpdata_ntouched_get(&hpdata), "");
expect_false(hpdata_changing_state_get(&hpdata), "");
-hpdata_hugify_begin(&hpdata);
-expect_true(hpdata_changing_state_get(&hpdata), "");
-hpdata_hugify_end(&hpdata);
-expect_false(hpdata_changing_state_get(&hpdata), "");
hpdata_hugify(&hpdata);
/* Hugifying should have increased the dirty page count. */
expect_zu_eq(HUGEPAGE_PAGES, hpdata_ntouched_get(&hpdata), "");