Allow PAI to calculate time until deferred work

Previously, the calculation of the sleep time between wakeups was implemented within background_thread. As a result, parts of the decay- and HPA-specific logic were mixed into the background thread implementation. With this change, the background thread delegates this calculation to the arena, which in turn delegates it to the PAI. The next step is to implement the actual calculation of the time until deferred work in HPA.
2021-08-06 14:53:05 -07:00 · 2021-08-06 14:53:05 -07:00 · b8b8027f19
commit b8b8027f19
parent 26140dd246
14 changed files with 298 additions and 169 deletions
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@ -8,6 +8,12 @@
 #include "jemalloc/internal/pages.h"
 #include "jemalloc/internal/stats.h"

+/*
+ * When the number of pages to be purged exceeds this threshold, the deferred
+ * purge should happen.
+ */
+#define ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD UINT64_C(1024)
+
 extern ssize_t opt_dirty_decay_ms;
 extern ssize_t opt_muzzy_decay_ms;

@ -16,7 +22,6 @@ extern const char *percpu_arena_mode_names[];

 extern div_info_t arena_binind_div_info[SC_NBINS];

-extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
 extern malloc_mutex_t arenas_lock;
 extern emap_t arena_emap_global;

@ -51,6 +56,7 @@ bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state,
 ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
 void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
    bool all);
+uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
 void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
 void arena_reset(tsd_t *tsd, arena_t *arena);
 void arena_destroy(tsd_t *tsd, arena_t *arena);
--- a/include/jemalloc/internal/background_thread_externs.h
+++ b/include/jemalloc/internal/background_thread_externs.h
@ -13,6 +13,9 @@ extern background_thread_info_t *background_thread_info;
 bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
 bool background_threads_enable(tsd_t *tsd);
 bool background_threads_disable(tsd_t *tsd);
+/* Returns true if info->state is background_thread_started. */
+bool background_thread_running(background_thread_info_t *info);
+/*
+ * Signals info->cond unless 'remaining_sleep' is already shorter than the
+ * minimal background thread sleep interval.
+ */
+void background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep);
 void background_thread_interval_check(tsdn_t *tsdn, arena_t *arena,
    decay_t *decay, size_t npages_new);
 void background_thread_prefork0(tsdn_t *tsdn);
--- a/include/jemalloc/internal/background_thread_inlines.h
+++ b/include/jemalloc/internal/background_thread_inlines.h
@ -45,18 +45,4 @@ background_thread_indefinite_sleep(background_thread_info_t *info) {
 	return atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE);
 }

-JEMALLOC_ALWAYS_INLINE void
-arena_background_thread_inactivity_check(tsdn_t *tsdn, arena_t *arena,
-    bool is_background_thread) {
-	if (!background_thread_enabled() || is_background_thread) {
-		return;
-	}
-	background_thread_info_t *info =
-	    arena_background_thread_info_get(arena);
-	if (background_thread_indefinite_sleep(info)) {
-		background_thread_interval_check(tsdn, arena,
-		    &arena->pa_shard.pac.decay_dirty, 0);
-	}
-}
-
 #endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H */
--- a/include/jemalloc/internal/background_thread_structs.h
+++ b/include/jemalloc/internal/background_thread_structs.h
@ -19,6 +19,9 @@
 #define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2)
 #define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000

+/*
+ * Bounds for "time until deferred work" values (uint64_t).  MIN is zero; MAX
+ * is the largest representable value.  UINT64_MAX is used rather than
+ * UINT64_C(-1) because UINT64_C expects an unsuffixed integer constant.
+ */
+#define BACKGROUND_THREAD_DEFERRED_MIN UINT64_C(0)
+#define BACKGROUND_THREAD_DEFERRED_MAX UINT64_MAX
+
 typedef enum {
 	background_thread_stopped,
 	background_thread_started,
--- a/include/jemalloc/internal/decay.h
+++ b/include/jemalloc/internal/decay.h
@ -118,6 +118,25 @@ decay_epoch_duration_ns(const decay_t *decay) {
 	return nstime_ns(&decay->interval);
 }

+/* Returns true if the decay time read from 'decay' is exactly zero. */
+static inline bool
+decay_immediately(const decay_t *decay) {
+	return decay_ms_read(decay) == 0;
+}
+
+/* Returns true if the decay time read from 'decay' is negative. */
+static inline bool
+decay_disabled(const decay_t *decay) {
+	return decay_ms_read(decay) < 0;
+}
+
+/*
+ * Returns true if the decay time read from 'decay' is strictly positive, i.e.
+ * decay is enabled and performed gradually.
+ */
+static inline bool
+decay_gradually(const decay_t *decay) {
+	return decay_ms_read(decay) > 0;
+}
+
 /*
 * Returns true if the passed in decay time setting is valid.
 * < -1 : invalid
@ -144,6 +163,12 @@ bool decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
 */
 void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);

+/*
+ * Compute how many of 'npages_new' pages we would need to purge in 'time'.
+ */
+uint64_t decay_npages_purge_in(decay_t *decay, nstime_t *time,
+    size_t npages_new);
+
 /* Returns true if the epoch advanced and there are pages to purge. */
 bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
    size_t current_npages);
--- a/include/jemalloc/internal/pa.h
+++ b/include/jemalloc/internal/pa.h
@ -200,6 +200,8 @@ ssize_t pa_decay_ms_get(pa_shard_t *shard, extent_state_t state);
 void pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
    bool deferral_allowed);
 void pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_try_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+uint64_t pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);

 /******************************************************************************/
 /*
--- a/include/jemalloc/internal/pai.h
+++ b/include/jemalloc/internal/pai.h
@ -24,6 +24,7 @@ struct pai_s {
 	/* This function empties out list as a side-effect of being called. */
 	void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
 	    edata_list_active_t *list);
+	uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
 };

 /*
@ -64,6 +65,11 @@ pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list) {
 	self->dalloc_batch(tsdn, self, list);
 }

+/*
+ * Dispatches to the implementation's time_until_deferred_work hook and returns
+ * its result unchanged.
+ */
+static inline uint64_t
+pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	uint64_t result = self->time_until_deferred_work(tsdn, self);
+	return result;
+}
+
 /*
 * An implementation of batch allocation that simply calls alloc once for
 * each item in the list.
--- a/src/arena.c
+++ b/src/arena.c
@ -38,13 +38,6 @@ static atomic_zd_t muzzy_decay_ms_default;
 emap_t arena_emap_global;
 pa_central_t arena_pa_central_global;

-const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
-#define STEP(step, h, x, y)			\
-		h,
-		SMOOTHSTEP
-#undef STEP
-};
-
 div_info_t arena_binind_div_info[SC_NBINS];

 size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
@ -65,6 +58,9 @@ static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena,
    bool is_background_thread, bool all);
 static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, edata_t *slab,
    bin_t *bin);
+static void
+arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
+    size_t npages_new);

 /******************************************************************************/

@ -189,6 +185,20 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	}
 }

+/*
+ * When background threads are enabled and the caller is not itself a
+ * background thread, checks whether this arena's background thread is in
+ * indefinite sleep and, if so, runs the deferred-work check against the dirty
+ * decay state with no new pages.
+ */
+static void
+arena_background_thread_inactivity_check(tsdn_t *tsdn, arena_t *arena,
+    bool is_background_thread) {
+	if (!background_thread_enabled()) {
+		return;
+	}
+	if (is_background_thread) {
+		return;
+	}
+	background_thread_info_t *bg_info =
+	    arena_background_thread_info_get(arena);
+	if (!background_thread_indefinite_sleep(bg_info)) {
+		return;
+	}
+	decay_t *dirty = &arena->pa_shard.pac.decay_dirty;
+	arena_maybe_do_deferred_work(tsdn, arena, dirty, 0);
+}
+
 void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
@ -420,8 +430,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, decay_t *decay,

 	if (have_background_thread && background_thread_enabled() &&
 	    epoch_advanced && !is_background_thread) {
-		background_thread_interval_check(tsdn, arena, decay,
-		    npages_new);
+		arena_maybe_do_deferred_work(tsdn, arena, decay, npages_new);
 	}

 	return false;
@ -462,6 +471,65 @@ arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) {
 	arena_decay_muzzy(tsdn, arena, is_background_thread, all);
 }

+/*
+ * Decides whether the arena's background thread should be woken up early.
+ *
+ * Both info->mtx and decay->mtx are only try-locked; if either is contended
+ * the function gives up without doing anything.  While holding both, it
+ * accumulates into info->npages_to_purge_new the share of 'npages_new' that
+ * would need purging by the thread's scheduled wakeup, and requests an early
+ * wakeup when that count exceeds ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD, or
+ * when the thread sleeps indefinitely while there is anything to purge.
+ */
+static void
+arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
+    size_t npages_new) {
+	background_thread_info_t *info =
+	    arena_background_thread_info_get(arena);
+	if (malloc_mutex_trylock(tsdn, &info->mtx)) {
+		/*
+		 * The background thread can hold this mutex for a long time.
+		 * Stay non-blocking to avoid latency variance on application
+		 * threads; a future epoch will pick up the work.
+		 */
+		return;
+	}
+	if (!background_thread_running(info)) {
+		goto label_done;
+	}
+	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
+		goto label_done;
+	}
+	if (!decay_gradually(decay)) {
+		goto label_done_unlock2;
+	}
+
+	/* Time left between the current decay epoch and the next wakeup. */
+	nstime_t remaining;
+	nstime_init(&remaining, background_thread_wakeup_time_get(info));
+	if (nstime_compare(&remaining, &decay->epoch) <= 0) {
+		goto label_done_unlock2;
+	}
+	nstime_subtract(&remaining, &decay->epoch);
+
+	if (npages_new > 0) {
+		info->npages_to_purge_new += decay_npages_purge_in(decay,
+		    &remaining, npages_new);
+	}
+
+	bool should_signal;
+	should_signal = info->npages_to_purge_new >
+	    ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD;
+	if (!should_signal &&
+	    unlikely(background_thread_indefinite_sleep(info))) {
+		/* A thread sleeping indefinitely needs waking for any work. */
+		should_signal =
+		    ecache_npages_get(&arena->pa_shard.pac.ecache_dirty) > 0 ||
+		    ecache_npages_get(&arena->pa_shard.pac.ecache_muzzy) > 0 ||
+		    info->npages_to_purge_new > 0;
+	}
+
+	if (should_signal) {
+		info->npages_to_purge_new = 0;
+		background_thread_wakeup_early(info, &remaining);
+	}
+label_done_unlock2:
+	malloc_mutex_unlock(tsdn, &decay->mtx);
+label_done:
+	malloc_mutex_unlock(tsdn, &info->mtx);
+}
+
 /* Called from background threads. */
 void
 arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena) {
--- a/src/background_thread.c
+++ b/src/background_thread.c
@ -60,8 +60,9 @@ pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr,
 bool background_thread_create(tsd_t *tsd, unsigned arena_ind) NOT_REACHED
 bool background_threads_enable(tsd_t *tsd) NOT_REACHED
 bool background_threads_disable(tsd_t *tsd) NOT_REACHED
-void background_thread_interval_check(tsdn_t *tsdn, arena_t *arena,
-    decay_t *decay, size_t npages_new) NOT_REACHED
+bool background_thread_running(background_thread_info_t *info) NOT_REACHED
+void background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep) NOT_REACHED
 void background_thread_prefork0(tsdn_t *tsdn) NOT_REACHED
 void background_thread_prefork1(tsdn_t *tsdn) NOT_REACHED
 void background_thread_postfork_parent(tsdn_t *tsdn) NOT_REACHED
@ -98,8 +99,6 @@ set_current_thread_affinity(int cpu) {
 #endif
 }

-/* Threshold for determining when to wake up the background thread. */
-#define BACKGROUND_THREAD_NPAGES_THRESHOLD UINT64_C(1024)
 #define BILLION UINT64_C(1000000000)
 /* Minimal sleep interval 100 ms. */
 #define BACKGROUND_THREAD_MIN_INTERVAL_NS (BILLION / 10)
@ -173,55 +172,10 @@ background_thread_pause_check(tsdn_t *tsdn, background_thread_info_t *info) {
 	return false;
 }

-static inline uint64_t
-arena_decay_compute_purge_interval(tsdn_t *tsdn, decay_t *decay,
-    size_t npages) {
-	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
-		/* Use minimal interval if decay is contended. */
-		return BACKGROUND_THREAD_MIN_INTERVAL_NS;
-	}
-	uint64_t decay_ns = decay_ns_until_purge(decay, npages,
-	    BACKGROUND_THREAD_NPAGES_THRESHOLD);
-	malloc_mutex_unlock(tsdn, &decay->mtx);
-
-	return decay_ns < BACKGROUND_THREAD_MIN_INTERVAL_NS ?
-	    BACKGROUND_THREAD_MIN_INTERVAL_NS :
-	    decay_ns;
-}
-
-
-static inline uint64_t
-arena_decay_compute_min_purge_interval(tsdn_t *tsdn, arena_t *arena) {
-	uint64_t dirty, muzzy;
-	dirty = arena_decay_compute_purge_interval(tsdn,
-	    &arena->pa_shard.pac.decay_dirty,
-	    ecache_npages_get(&arena->pa_shard.pac.ecache_dirty));
-	if (dirty == BACKGROUND_THREAD_MIN_INTERVAL_NS) {
-		return dirty;
-	}
-	muzzy = arena_decay_compute_purge_interval(tsdn,
-	    &arena->pa_shard.pac.decay_muzzy,
-	    ecache_npages_get(&arena->pa_shard.pac.ecache_muzzy));
-
-	uint64_t min_so_far = dirty < muzzy ? dirty : muzzy;
-	if (opt_background_thread_hpa_interval_max_ms >= 0) {
-		uint64_t hpa_interval = 1000 * 1000 *
-		    (uint64_t)opt_background_thread_hpa_interval_max_ms;
-		if (hpa_interval < min_so_far) {
-			if (hpa_interval < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
-				min_so_far = BACKGROUND_THREAD_MIN_INTERVAL_NS;
-			} else {
-				min_so_far = hpa_interval;
-			}
-		}
-	}
-
-	return min_so_far;
-}
-
 static inline void
-background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, unsigned ind) {
-	uint64_t min_interval = BACKGROUND_THREAD_INDEFINITE_SLEEP;
+background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info,
+    unsigned ind) {
+	uint64_t ns_until_deferred = BACKGROUND_THREAD_DEFERRED_MAX;
 	unsigned narenas = narenas_total_get();

 	for (unsigned i = ind; i < narenas; i += max_background_threads) {
@ -230,19 +184,29 @@ background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, unsigne
 			continue;
 		}
 		arena_do_deferred_work(tsdn, arena);
-		if (min_interval == BACKGROUND_THREAD_MIN_INTERVAL_NS) {
+		if (ns_until_deferred <= BACKGROUND_THREAD_MIN_INTERVAL_NS) {
 			/* Min interval will be used. */
 			continue;
 		}
-		uint64_t interval = arena_decay_compute_min_purge_interval(tsdn,
-		    arena);
-		assert(interval >= BACKGROUND_THREAD_MIN_INTERVAL_NS);
-		if (interval != DECAY_UNBOUNDED_TIME_TO_PURGE &&
-		    min_interval > interval) {
-			min_interval = interval;
+		uint64_t ns_arena_deferred = pa_shard_time_until_deferred_work(
+		    tsdn, &arena->pa_shard);
+		if (ns_arena_deferred < ns_until_deferred) {
+			ns_until_deferred = ns_arena_deferred;
 		}
 	}
-	background_thread_sleep(tsdn, info, min_interval);
+
+	uint64_t sleep_ns;
+	if (ns_until_deferred == BACKGROUND_THREAD_DEFERRED_MAX) {
+		sleep_ns = BACKGROUND_THREAD_INDEFINITE_SLEEP;
+	} else {
+		sleep_ns =
+		    (ns_until_deferred < BACKGROUND_THREAD_MIN_INTERVAL_NS)
+		    ? BACKGROUND_THREAD_MIN_INTERVAL_NS
+		    : ns_until_deferred;
+
+	}
+
+	background_thread_sleep(tsdn, info, sleep_ns);
 }

 static bool
@ -609,88 +573,23 @@ background_threads_disable(tsd_t *tsd) {
 	return false;
 }

-/* Check if we need to signal the background thread early. */
+/* Reports whether info->state equals background_thread_started. */
+bool
+background_thread_running(background_thread_info_t *info) {
+	bool started = (info->state == background_thread_started);
+	return started;
+}
+
 void
-background_thread_interval_check(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
-    size_t npages_new) {
-	background_thread_info_t *info = arena_background_thread_info_get(
-	    arena);
-	if (malloc_mutex_trylock(tsdn, &info->mtx)) {
-		/*
-		 * Background thread may hold the mutex for a long period of
-		 * time.  We'd like to avoid the variance on application
-		 * threads.  So keep this non-blocking, and leave the work to a
-		 * future epoch.
-		 */
+background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep) {
+	/*
+	 * This is an optimization to increase batching. At this point
+	 * we know that background thread wakes up soon, so the time to cache
+	 * the just freed memory is bounded and low.
+	 */
+	if (nstime_ns(remaining_sleep) < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
 		return;
 	}
-
-	if (info->state != background_thread_started) {
-		goto label_done;
-	}
-	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
-		goto label_done;
-	}
-
-	ssize_t decay_time = decay_ms_read(decay);
-	if (decay_time <= 0) {
-		/* Purging is eagerly done or disabled currently. */
-		goto label_done_unlock2;
-	}
-	uint64_t decay_interval_ns = decay_epoch_duration_ns(decay);
-	assert(decay_interval_ns > 0);
-
-	nstime_t diff;
-	nstime_init(&diff, background_thread_wakeup_time_get(info));
-	if (nstime_compare(&diff, &decay->epoch) <= 0) {
-		goto label_done_unlock2;
-	}
-	nstime_subtract(&diff, &decay->epoch);
-	if (nstime_ns(&diff) < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
-		goto label_done_unlock2;
-	}
-
-	if (npages_new > 0) {
-		size_t n_epoch = (size_t)(nstime_ns(&diff) / decay_interval_ns);
-		/*
-		 * Compute how many new pages we would need to purge by the next
-		 * wakeup, which is used to determine if we should signal the
-		 * background thread.
-		 */
-		uint64_t npurge_new;
-		if (n_epoch >= SMOOTHSTEP_NSTEPS) {
-			npurge_new = npages_new;
-		} else {
-			uint64_t h_steps_max = h_steps[SMOOTHSTEP_NSTEPS - 1];
-			assert(h_steps_max >=
-			    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
-			npurge_new = npages_new * (h_steps_max -
-			    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
-			npurge_new >>= SMOOTHSTEP_BFP;
-		}
-		info->npages_to_purge_new += npurge_new;
-	}
-
-	bool should_signal;
-	if (info->npages_to_purge_new > BACKGROUND_THREAD_NPAGES_THRESHOLD) {
-		should_signal = true;
-	} else if (unlikely(background_thread_indefinite_sleep(info)) &&
-	    (ecache_npages_get(&arena->pa_shard.pac.ecache_dirty) > 0 ||
-	    ecache_npages_get(&arena->pa_shard.pac.ecache_muzzy) > 0 ||
-	    info->npages_to_purge_new > 0)) {
-		should_signal = true;
-	} else {
-		should_signal = false;
-	}
-
-	if (should_signal) {
-		info->npages_to_purge_new = 0;
-		pthread_cond_signal(&info->cond);
-	}
-label_done_unlock2:
-	malloc_mutex_unlock(tsdn, &decay->mtx);
-label_done:
-	malloc_mutex_unlock(tsdn, &info->mtx);
+	pthread_cond_signal(&info->cond);
 }

 void
--- a/src/decay.c
+++ b/src/decay.c
@ -3,6 +3,13 @@

 #include "jemalloc/internal/decay.h"

+/*
+ * One entry per SMOOTHSTEP step, holding that step's 'h' value.  Given
+ * internal linkage because no header declares it.
+ */
+static const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
+#define STEP(step, h, x, y)			\
+		h,
+		SMOOTHSTEP
+#undef STEP
+};
+
 /*
 * Generate a new deadline that is uniformly random within the next epoch after
 * the current one.
@ -147,6 +154,25 @@ decay_deadline_reached(const decay_t *decay, const nstime_t *time) {
 	return (nstime_compare(&decay->deadline, time) <= 0);
 }

+/*
+ * Computes how many of 'npages_new' pages would need to be purged once 'time'
+ * has elapsed.
+ *
+ * 'time' is converted to a whole number of decay epochs.  If that covers at
+ * least SMOOTHSTEP_NSTEPS epochs, all of 'npages_new' is returned.  Otherwise
+ * the count is scaled by the fixed-point difference between the last
+ * smoothstep value and the value n_epoch steps before it, then shifted down by
+ * SMOOTHSTEP_BFP.
+ *
+ * The epoch duration must be non-zero.
+ */
+uint64_t
+decay_npages_purge_in(decay_t *decay, nstime_t *time, size_t npages_new) {
+	uint64_t decay_interval_ns = decay_epoch_duration_ns(decay);
+	/* Guards the division below against a zero epoch duration. */
+	assert(decay_interval_ns > 0);
+	size_t n_epoch = (size_t)(nstime_ns(time) / decay_interval_ns);
+
+	uint64_t npages_purge;
+	if (n_epoch >= SMOOTHSTEP_NSTEPS) {
+		npages_purge = npages_new;
+	} else {
+		uint64_t h_steps_max = h_steps[SMOOTHSTEP_NSTEPS - 1];
+		assert(h_steps_max >=
+		    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
+		npages_purge = npages_new * (h_steps_max -
+		    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
+		npages_purge >>= SMOOTHSTEP_BFP;
+	}
+	return npages_purge;
+}
+
 bool
 decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
    size_t npages_current) {
@ -214,9 +240,7 @@ decay_npurge_after_interval(decay_t *decay, size_t interval) {

 uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current,
    uint64_t npages_threshold) {
-	ssize_t decay_time = decay_ms_read(decay);
-	if (decay_time <= 0) {
-		/* Purging is eagerly done or disabled currently. */
+	if (!decay_gradually(decay)) {
 		return DECAY_UNBOUNDED_TIME_TO_PURGE;
 	}
 	uint64_t decay_interval_ns = decay_epoch_duration_ns(decay);
--- a/src/hpa.c
+++ b/src/hpa.c
@ -19,6 +19,7 @@ static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
 static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list);
+static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

 bool
 hpa_supported() {
@ -218,6 +219,7 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
 	shard->pai.shrink = &hpa_shrink;
 	shard->pai.dalloc = &hpa_dalloc;
 	shard->pai.dalloc_batch = &hpa_dalloc_batch;
+	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

 	hpa_do_consistency_checks(shard);

@ -850,6 +852,11 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	hpa_dalloc_batch(tsdn, self, &dalloc_list);
 }

+/*
+ * Returns the time until HPA deferred work is due, in nanoseconds.
+ *
+ * opt_background_thread_hpa_interval_max_ms is a signed value in
+ * milliseconds.  A negative setting yields BACKGROUND_THREAD_DEFERRED_MAX;
+ * otherwise the value is converted to nanoseconds, the unit the caller
+ * compares against.
+ */
+static uint64_t
+hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	if (opt_background_thread_hpa_interval_max_ms < 0) {
+		return BACKGROUND_THREAD_DEFERRED_MAX;
+	}
+	/* Milliseconds to nanoseconds. */
+	return (uint64_t)opt_background_thread_hpa_interval_max_ms *
+	    1000 * 1000;
+}
+
 void
 hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
 	hpa_do_consistency_checks(shard);
--- a/src/pa.c
+++ b/src/pa.c
@ -96,6 +96,11 @@ pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) {
 	}
 }

+/* Relaxed atomic read of the shard's use_hpa flag. */
+static bool
+pa_shard_uses_hpa(pa_shard_t *shard) {
+	bool use_hpa = atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED);
+	return use_hpa;
+}
+
 void
 pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard) {
 	pac_destroy(tsdn, &shard->pac);
@ -118,7 +123,7 @@ pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, size_t alignment,
 	    WITNESS_RANK_CORE, 0);

 	edata_t *edata = NULL;
-	if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
+	if (pa_shard_uses_hpa(shard)) {
 		edata = pai_alloc(tsdn, &shard->hpa_sec.pai, size, alignment,
 		    zero);
 	}
@ -226,7 +231,7 @@ pa_decay_ms_get(pa_shard_t *shard, extent_state_t state) {
 void
 pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
    bool deferral_allowed) {
-	if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
+	if (pa_shard_uses_hpa(shard)) {
 		hpa_shard_set_deferral_allowed(tsdn, &shard->hpa_shard,
 		    deferral_allowed);
 	}
@ -234,7 +239,63 @@ pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,

 void
 pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) {
-	if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
+	if (pa_shard_uses_hpa(shard)) {
 		hpa_shard_do_deferred_work(tsdn, &shard->hpa_shard);
 	}
 }
+
+/*
+ * Non-blocking query of decay_ns_until_purge() for 'decay' with 'npages'
+ * pages and the ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD threshold.  Returns
+ * BACKGROUND_THREAD_DEFERRED_MIN if decay->mtx cannot be taken immediately.
+ */
+static inline uint64_t
+pa_shard_ns_until_purge(tsdn_t *tsdn, decay_t *decay, size_t npages) {
+	bool contended = malloc_mutex_trylock(tsdn, &decay->mtx);
+	if (contended) {
+		/* Decay is contended: report the minimal interval. */
+		return BACKGROUND_THREAD_DEFERRED_MIN;
+	}
+
+	uint64_t ns = decay_ns_until_purge(decay, npages,
+	    ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD);
+	malloc_mutex_unlock(tsdn, &decay->mtx);
+
+	return ns;
+}
+
+/*
+ * Returns the time until the soonest piece of deferred work for this shard:
+ * the minimum over dirty decay, muzzy decay, the PAC hook and, when the shard
+ * uses HPA, the HPA hook.  Sources are queried in that order, and the search
+ * stops as soon as the running minimum reaches
+ * BACKGROUND_THREAD_DEFERRED_MIN.
+ */
+uint64_t
+pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) {
+	uint64_t soonest = pa_shard_ns_until_purge(tsdn,
+	    &shard->pac.decay_dirty,
+	    ecache_npages_get(&shard->pac.ecache_dirty));
+	if (soonest == BACKGROUND_THREAD_DEFERRED_MIN) {
+		return soonest;
+	}
+
+	uint64_t muzzy_ns = pa_shard_ns_until_purge(tsdn,
+	    &shard->pac.decay_muzzy,
+	    ecache_npages_get(&shard->pac.ecache_muzzy));
+	if (muzzy_ns < soonest) {
+		soonest = muzzy_ns;
+	}
+	if (soonest == BACKGROUND_THREAD_DEFERRED_MIN) {
+		return soonest;
+	}
+
+	uint64_t pac_ns = pai_time_until_deferred_work(tsdn, &shard->pac.pai);
+	if (pac_ns < soonest) {
+		soonest = pac_ns;
+	}
+	if (soonest == BACKGROUND_THREAD_DEFERRED_MIN) {
+		return soonest;
+	}
+
+	if (!pa_shard_uses_hpa(shard)) {
+		return soonest;
+	}
+	uint64_t hpa_ns =
+	    pai_time_until_deferred_work(tsdn, &shard->hpa_shard.pai);
+	return (hpa_ns < soonest) ? hpa_ns : soonest;
+}
--- a/src/pac.c
+++ b/src/pac.c
@ -10,6 +10,7 @@ static bool pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 static bool pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size);
 static void pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata);
+static uint64_t pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

 static ehooks_t *
 pac_ehooks_get(pac_t *pac) {
@ -96,6 +97,7 @@ pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
 	pac->pai.shrink = &pac_shrink_impl;
 	pac->pai.dalloc = &pac_dalloc_impl;
 	pac->pai.dalloc_batch = &pai_dalloc_batch_default;
+	pac->pai.time_until_deferred_work = &pac_time_until_deferred_work;

 	return false;
 }
@ -196,6 +198,11 @@ pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, edata);
 }

+/*
+ * PAC's time_until_deferred_work hook.  It ignores its arguments and always
+ * returns BACKGROUND_THREAD_DEFERRED_MAX.
+ */
+static uint64_t
+pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	const uint64_t deferred_max = BACKGROUND_THREAD_DEFERRED_MAX;
+	return deferred_max;
+}
+
 bool
 pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit,
    size_t *new_limit) {
--- a/test/unit/decay.c
+++ b/test/unit/decay.c
@ -36,6 +36,37 @@ TEST_BEGIN(test_decay_ms_valid) {
 }
 TEST_END

+/*
+ * Checks decay_npages_purge_in() at three points of a 1000 ms decay period:
+ * the full period (all pages), zero elapsed time (no pages) and half the
+ * period (half of the pages).
+ */
+TEST_BEGIN(test_decay_npages_purge_in) {
+	const uint64_t decay_ms = 1000;
+	const uint64_t decay_ns = decay_ms * 1000 * 1000;
+	const size_t new_pages = 100;
+
+	nstime_t curtime;
+	nstime_init(&curtime, 0);
+
+	decay_t decay;
+	memset(&decay, 0, sizeof(decay));
+	expect_false(decay_init(&decay, &curtime, (ssize_t)decay_ms),
+	    "Failed to initialize decay");
+
+	/* The whole decay period has elapsed. */
+	nstime_t time;
+	nstime_init(&time, decay_ns);
+	expect_u64_eq(decay_npages_purge_in(&decay, &time, new_pages),
+	    new_pages, "Not all pages are expected to decay in decay_ms");
+
+	/* No time has elapsed. */
+	nstime_init(&time, 0);
+	expect_u64_eq(decay_npages_purge_in(&decay, &time, new_pages), 0,
+	    "More than zero pages are expected to instantly decay");
+
+	/* Half of the decay period has elapsed. */
+	nstime_init(&time, decay_ns / 2);
+	expect_u64_eq(decay_npages_purge_in(&decay, &time, new_pages),
+	    new_pages / 2, "Not half of pages decay in half the decay period");
+}
+TEST_END
+
 TEST_BEGIN(test_decay_maybe_advance_epoch) {
 	decay_t decay;
 	memset(&decay, 0, sizeof(decay));
@ -244,6 +275,7 @@ main(void) {
 	return test(
 	    test_decay_init,
 	    test_decay_ms_valid,
+	    test_decay_npages_purge_in,
 	    test_decay_maybe_advance_epoch,
 	    test_decay_empty,
 	    test_decay,