diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index 557e49f1..02e7c1cc 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -8,6 +8,12 @@
 #include "jemalloc/internal/pages.h"
 #include "jemalloc/internal/stats.h"
 
+/*
+ * When the number of pages to be purged exceeds this threshold, deferred
+ * purging should happen.
+ */
+#define ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD UINT64_C(1024)
+
 extern ssize_t opt_dirty_decay_ms;
 extern ssize_t opt_muzzy_decay_ms;
 
@@ -16,7 +22,6 @@ extern const char *percpu_arena_mode_names[];
 
 extern div_info_t arena_binind_div_info[SC_NBINS];
 
-extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
 extern malloc_mutex_t arenas_lock;
 extern emap_t arena_emap_global;
 
@@ -51,6 +56,7 @@ bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state,
 ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
 void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
     bool all);
+uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
 void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
 void arena_reset(tsd_t *tsd, arena_t *arena);
 void arena_destroy(tsd_t *tsd, arena_t *arena);
diff --git a/include/jemalloc/internal/background_thread_externs.h b/include/jemalloc/internal/background_thread_externs.h
index bc49beaf..3d1ea6ce 100644
--- a/include/jemalloc/internal/background_thread_externs.h
+++ b/include/jemalloc/internal/background_thread_externs.h
@@ -13,6 +13,9 @@ extern background_thread_info_t *background_thread_info;
 bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
 bool background_threads_enable(tsd_t *tsd);
 bool background_threads_disable(tsd_t *tsd);
+bool background_thread_running(background_thread_info_t *info);
+void background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep);
 void background_thread_interval_check(tsdn_t *tsdn, arena_t *arena,
     decay_t *decay, size_t npages_new);
 void background_thread_prefork0(tsdn_t *tsdn);
diff --git a/include/jemalloc/internal/background_thread_inlines.h b/include/jemalloc/internal/background_thread_inlines.h
index 71b433cb..92c5febe 100644
--- a/include/jemalloc/internal/background_thread_inlines.h
+++ b/include/jemalloc/internal/background_thread_inlines.h
@@ -45,18 +45,4 @@ background_thread_indefinite_sleep(background_thread_info_t *info) {
 	return atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE);
 }
 
-JEMALLOC_ALWAYS_INLINE void
-arena_background_thread_inactivity_check(tsdn_t *tsdn, arena_t *arena,
-    bool is_background_thread) {
-	if (!background_thread_enabled() || is_background_thread) {
-		return;
-	}
-	background_thread_info_t *info =
-	    arena_background_thread_info_get(arena);
-	if (background_thread_indefinite_sleep(info)) {
-		background_thread_interval_check(tsdn, arena,
-		    &arena->pa_shard.pac.decay_dirty, 0);
-	}
-}
-
 #endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H */
diff --git a/include/jemalloc/internal/background_thread_structs.h b/include/jemalloc/internal/background_thread_structs.h
index cc14dde3..b884b682 100644
--- a/include/jemalloc/internal/background_thread_structs.h
+++ b/include/jemalloc/internal/background_thread_structs.h
@@ -19,6 +19,9 @@
 #define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2)
 #define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000
 
+#define BACKGROUND_THREAD_DEFERRED_MIN UINT64_C(0)
+#define BACKGROUND_THREAD_DEFERRED_MAX UINT64_C(-1)
+
 typedef enum {
 	background_thread_stopped,
 	background_thread_started,
diff --git a/include/jemalloc/internal/decay.h b/include/jemalloc/internal/decay.h
index 8e517458..cf6a9d22 100644
--- a/include/jemalloc/internal/decay.h
+++ b/include/jemalloc/internal/decay.h
@@ -118,6 +118,25 @@ decay_epoch_duration_ns(const decay_t *decay) {
 	return nstime_ns(&decay->interval);
 }
 
+static inline bool
+decay_immediately(const decay_t *decay) {
+	ssize_t decay_ms = decay_ms_read(decay);
+	return decay_ms == 0;
+}
+
+static inline bool
+decay_disabled(const decay_t *decay) {
+	ssize_t decay_ms = decay_ms_read(decay);
+	return decay_ms < 0;
+}
+
+/* Returns true if decay is enabled and done gradually. */
+static inline bool
+decay_gradually(const decay_t *decay) {
+	ssize_t decay_ms = decay_ms_read(decay);
+	return decay_ms > 0;
+}
+
 /*
  * Returns true if the passed in decay time setting is valid.
  * < -1 : invalid
@@ -144,6 +163,12 @@ bool decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
  */
 void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
 
+/*
+ * Compute how many of 'npages_new' pages we would need to purge in 'time'.
+ */
+uint64_t decay_npages_purge_in(decay_t *decay, nstime_t *time,
+    size_t npages_new);
+
 /* Returns true if the epoch advanced and there are pages to purge. */
 bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
     size_t current_npages);
diff --git a/include/jemalloc/internal/pa.h b/include/jemalloc/internal/pa.h
index 2e5b9ef0..b2fed594 100644
--- a/include/jemalloc/internal/pa.h
+++ b/include/jemalloc/internal/pa.h
@@ -200,6 +200,8 @@ ssize_t pa_decay_ms_get(pa_shard_t *shard, extent_state_t state);
 void pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
     bool deferral_allowed);
 void pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_try_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+uint64_t pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
 
 /******************************************************************************/
 /*
diff --git a/include/jemalloc/internal/pai.h b/include/jemalloc/internal/pai.h
index 4d3a9e01..7179fd36 100644
--- a/include/jemalloc/internal/pai.h
+++ b/include/jemalloc/internal/pai.h
@@ -24,6 +24,7 @@ struct pai_s {
 	/* This function empties out list as a side-effect of being called. */
 	void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
 	    edata_list_active_t *list);
+	uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
 };
 
 /*
@@ -64,6 +65,11 @@ pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list) {
 	self->dalloc_batch(tsdn, self, list);
 }
 
+static inline uint64_t
+pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	return self->time_until_deferred_work(tsdn, self);
+}
+
 /*
  * An implementation of batch allocation that simply calls alloc once for
  * each item in the list.
diff --git a/src/arena.c b/src/arena.c
index a495ef64..3dd77824 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -38,13 +38,6 @@ static atomic_zd_t muzzy_decay_ms_default;
 emap_t arena_emap_global;
 pa_central_t arena_pa_central_global;
 
-const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
-#define STEP(step, h, x, y)			\
-		h,
-		SMOOTHSTEP
-#undef STEP
-};
-
 div_info_t arena_binind_div_info[SC_NBINS];
 
 size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
@@ -65,6 +58,9 @@ static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena,
     bool is_background_thread, bool all);
 static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, edata_t *slab,
     bin_t *bin);
+static void
+arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
+    size_t npages_new);
 
 /******************************************************************************/
 
@@ -189,6 +185,20 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	}
 }
 
+static void
+arena_background_thread_inactivity_check(tsdn_t *tsdn, arena_t *arena,
+    bool is_background_thread) {
+	if (!background_thread_enabled() || is_background_thread) {
+		return;
+	}
+	background_thread_info_t *info =
+	    arena_background_thread_info_get(arena);
+	if (background_thread_indefinite_sleep(info)) {
+		arena_maybe_do_deferred_work(tsdn, arena,
+		    &arena->pa_shard.pac.decay_dirty, 0);
+	}
+}
+
 void
 arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
@@ -420,8 +430,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
 
 	if (have_background_thread && background_thread_enabled() &&
 	    epoch_advanced && !is_background_thread) {
-		background_thread_interval_check(tsdn, arena, decay,
-		    npages_new);
+		arena_maybe_do_deferred_work(tsdn, arena, decay, npages_new);
 	}
 
 	return false;
@@ -462,6 +471,65 @@ arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) {
 	arena_decay_muzzy(tsdn, arena, is_background_thread, all);
 }
 
+static void
+arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
+    size_t npages_new) {
+	background_thread_info_t *info = arena_background_thread_info_get(
+	    arena);
+	if (malloc_mutex_trylock(tsdn, &info->mtx)) {
+		/*
+		 * Background thread may hold the mutex for a long period of
+		 * time. We'd like to avoid the variance on application
+		 * threads. So keep this non-blocking, and leave the work to a
+		 * future epoch.
+		 */
+		return;
+	}
+	if (!background_thread_running(info)) {
+		goto label_done;
+	}
+	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
+		goto label_done;
+	}
+	if (!decay_gradually(decay)) {
+		goto label_done_unlock2;
+	}
+
+	nstime_t diff;
+	nstime_init(&diff, background_thread_wakeup_time_get(info));
+	if (nstime_compare(&diff, &decay->epoch) <= 0) {
+		goto label_done_unlock2;
+	}
+	nstime_subtract(&diff, &decay->epoch);
+
+	if (npages_new > 0) {
+		uint64_t npurge_new = decay_npages_purge_in(decay, &diff,
+		    npages_new);
+		info->npages_to_purge_new += npurge_new;
+	}
+
+	bool should_signal;
+	if (info->npages_to_purge_new > ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD) {
+		should_signal = true;
+	} else if (unlikely(background_thread_indefinite_sleep(info)) &&
+	    (ecache_npages_get(&arena->pa_shard.pac.ecache_dirty) > 0 ||
+	    ecache_npages_get(&arena->pa_shard.pac.ecache_muzzy) > 0 ||
+	    info->npages_to_purge_new > 0)) {
+		should_signal = true;
+	} else {
+		should_signal = false;
+	}
+
+	if (should_signal) {
+		info->npages_to_purge_new = 0;
+		background_thread_wakeup_early(info, &diff);
+	}
+label_done_unlock2:
+	malloc_mutex_unlock(tsdn, &decay->mtx);
+label_done:
+	malloc_mutex_unlock(tsdn, &info->mtx);
+}
+
 /* Called from background threads. */
 void
 arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena) {
diff --git a/src/background_thread.c b/src/background_thread.c
index 4951cd1a..9e577cb3 100644
--- a/src/background_thread.c
+++ b/src/background_thread.c
@@ -60,8 +60,9 @@ pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr,
 bool background_thread_create(tsd_t *tsd, unsigned arena_ind) NOT_REACHED
 bool background_threads_enable(tsd_t *tsd) NOT_REACHED
 bool background_threads_disable(tsd_t *tsd) NOT_REACHED
-void background_thread_interval_check(tsdn_t *tsdn, arena_t *arena,
-    decay_t *decay, size_t npages_new) NOT_REACHED
+bool background_thread_running(background_thread_info_t *info) NOT_REACHED
+void background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep) NOT_REACHED
 void background_thread_prefork0(tsdn_t *tsdn) NOT_REACHED
 void background_thread_prefork1(tsdn_t *tsdn) NOT_REACHED
 void background_thread_postfork_parent(tsdn_t *tsdn) NOT_REACHED
@@ -98,8 +99,6 @@ set_current_thread_affinity(int cpu) {
 #endif
 }
 
-/* Threshold for determining when to wake up the background thread. */
-#define BACKGROUND_THREAD_NPAGES_THRESHOLD UINT64_C(1024)
 #define BILLION UINT64_C(1000000000)
 /* Minimal sleep interval 100 ms. */
 #define BACKGROUND_THREAD_MIN_INTERVAL_NS (BILLION / 10)
@@ -173,55 +172,10 @@ background_thread_pause_check(tsdn_t *tsdn, background_thread_info_t *info) {
 	return false;
 }
 
-static inline uint64_t
-arena_decay_compute_purge_interval(tsdn_t *tsdn, decay_t *decay,
-    size_t npages) {
-	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
-		/* Use minimal interval if decay is contended. */
-		return BACKGROUND_THREAD_MIN_INTERVAL_NS;
-	}
-	uint64_t decay_ns = decay_ns_until_purge(decay, npages,
-	    BACKGROUND_THREAD_NPAGES_THRESHOLD);
-	malloc_mutex_unlock(tsdn, &decay->mtx);
-
-	return decay_ns < BACKGROUND_THREAD_MIN_INTERVAL_NS ?
-	    BACKGROUND_THREAD_MIN_INTERVAL_NS :
-	    decay_ns;
-}
-
-
-static inline uint64_t
-arena_decay_compute_min_purge_interval(tsdn_t *tsdn, arena_t *arena) {
-	uint64_t dirty, muzzy;
-	dirty = arena_decay_compute_purge_interval(tsdn,
-	    &arena->pa_shard.pac.decay_dirty,
-	    ecache_npages_get(&arena->pa_shard.pac.ecache_dirty));
-	if (dirty == BACKGROUND_THREAD_MIN_INTERVAL_NS) {
-		return dirty;
-	}
-	muzzy = arena_decay_compute_purge_interval(tsdn,
-	    &arena->pa_shard.pac.decay_muzzy,
-	    ecache_npages_get(&arena->pa_shard.pac.ecache_muzzy));
-
-	uint64_t min_so_far = dirty < muzzy ? dirty : muzzy;
-	if (opt_background_thread_hpa_interval_max_ms >= 0) {
-		uint64_t hpa_interval = 1000 * 1000 *
-		    (uint64_t)opt_background_thread_hpa_interval_max_ms;
-		if (hpa_interval < min_so_far) {
-			if (hpa_interval < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
-				min_so_far = BACKGROUND_THREAD_MIN_INTERVAL_NS;
-			} else {
-				min_so_far = hpa_interval;
-			}
-		}
-	}
-
-	return min_so_far;
-}
-
 static inline void
-background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, unsigned ind) {
-	uint64_t min_interval = BACKGROUND_THREAD_INDEFINITE_SLEEP;
+background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info,
+    unsigned ind) {
+	uint64_t ns_until_deferred = BACKGROUND_THREAD_DEFERRED_MAX;
 	unsigned narenas = narenas_total_get();
 
 	for (unsigned i = ind; i < narenas; i += max_background_threads) {
@@ -230,19 +184,29 @@ background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, unsigne
 			continue;
 		}
 		arena_do_deferred_work(tsdn, arena);
-		if (min_interval == BACKGROUND_THREAD_MIN_INTERVAL_NS) {
+		if (ns_until_deferred <= BACKGROUND_THREAD_MIN_INTERVAL_NS) {
 			/* Min interval will be used. */
 			continue;
 		}
-		uint64_t interval = arena_decay_compute_min_purge_interval(tsdn,
-		    arena);
-		assert(interval >= BACKGROUND_THREAD_MIN_INTERVAL_NS);
-		if (interval != DECAY_UNBOUNDED_TIME_TO_PURGE &&
-		    min_interval > interval) {
-			min_interval = interval;
+		uint64_t ns_arena_deferred = pa_shard_time_until_deferred_work(
+		    tsdn, &arena->pa_shard);
+		if (ns_arena_deferred < ns_until_deferred) {
+			ns_until_deferred = ns_arena_deferred;
 		}
 	}
-	background_thread_sleep(tsdn, info, min_interval);
+
+	uint64_t sleep_ns;
+	if (ns_until_deferred == BACKGROUND_THREAD_DEFERRED_MAX) {
+		sleep_ns = BACKGROUND_THREAD_INDEFINITE_SLEEP;
+	} else {
+		sleep_ns =
+		    (ns_until_deferred < BACKGROUND_THREAD_MIN_INTERVAL_NS)
+		    ? BACKGROUND_THREAD_MIN_INTERVAL_NS
+		    : ns_until_deferred;
+
+	}
+
+	background_thread_sleep(tsdn, info, sleep_ns);
 }
 
 static bool
@@ -609,88 +573,23 @@ background_threads_disable(tsd_t *tsd) {
 	return false;
 }
 
-/* Check if we need to signal the background thread early. */
+bool
+background_thread_running(background_thread_info_t *info) {
+	return info->state == background_thread_started;
+}
+
 void
-background_thread_interval_check(tsdn_t *tsdn, arena_t *arena, decay_t *decay,
-    size_t npages_new) {
-	background_thread_info_t *info = arena_background_thread_info_get(
-	    arena);
-	if (malloc_mutex_trylock(tsdn, &info->mtx)) {
-		/*
-		 * Background thread may hold the mutex for a long period of
-		 * time. We'd like to avoid the variance on application
-		 * threads. So keep this non-blocking, and leave the work to a
-		 * future epoch.
-		 */
+background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep) {
+	/*
+	 * This is an optimization to increase batching. At this point we
+	 * know the background thread will wake up soon, so the time the
+	 * just-freed memory stays cached is bounded and low.
+	 */
+	if (nstime_ns(remaining_sleep) < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
 		return;
 	}
-
-	if (info->state != background_thread_started) {
-		goto label_done;
-	}
-	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
-		goto label_done;
-	}
-
-	ssize_t decay_time = decay_ms_read(decay);
-	if (decay_time <= 0) {
-		/* Purging is eagerly done or disabled currently. */
-		goto label_done_unlock2;
-	}
-	uint64_t decay_interval_ns = decay_epoch_duration_ns(decay);
-	assert(decay_interval_ns > 0);
-
-	nstime_t diff;
-	nstime_init(&diff, background_thread_wakeup_time_get(info));
-	if (nstime_compare(&diff, &decay->epoch) <= 0) {
-		goto label_done_unlock2;
-	}
-	nstime_subtract(&diff, &decay->epoch);
-	if (nstime_ns(&diff) < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
-		goto label_done_unlock2;
-	}
-
-	if (npages_new > 0) {
-		size_t n_epoch = (size_t)(nstime_ns(&diff) / decay_interval_ns);
-		/*
-		 * Compute how many new pages we would need to purge by the next
-		 * wakeup, which is used to determine if we should signal the
-		 * background thread.
-		 */
-		uint64_t npurge_new;
-		if (n_epoch >= SMOOTHSTEP_NSTEPS) {
-			npurge_new = npages_new;
-		} else {
-			uint64_t h_steps_max = h_steps[SMOOTHSTEP_NSTEPS - 1];
-			assert(h_steps_max >=
-			    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
-			npurge_new = npages_new * (h_steps_max -
-			    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
-			npurge_new >>= SMOOTHSTEP_BFP;
-		}
-		info->npages_to_purge_new += npurge_new;
-	}
-
-	bool should_signal;
-	if (info->npages_to_purge_new > BACKGROUND_THREAD_NPAGES_THRESHOLD) {
-		should_signal = true;
-	} else if (unlikely(background_thread_indefinite_sleep(info)) &&
-	    (ecache_npages_get(&arena->pa_shard.pac.ecache_dirty) > 0 ||
-	    ecache_npages_get(&arena->pa_shard.pac.ecache_muzzy) > 0 ||
-	    info->npages_to_purge_new > 0)) {
-		should_signal = true;
-	} else {
-		should_signal = false;
-	}
-
-	if (should_signal) {
-		info->npages_to_purge_new = 0;
-		pthread_cond_signal(&info->cond);
-	}
-label_done_unlock2:
-	malloc_mutex_unlock(tsdn, &decay->mtx);
-label_done:
-	malloc_mutex_unlock(tsdn, &info->mtx);
+	pthread_cond_signal(&info->cond);
 }
 
 void
diff --git a/src/decay.c b/src/decay.c
index fdbd63d8..cdb8487b 100644
--- a/src/decay.c
+++ b/src/decay.c
@@ -3,6 +3,13 @@
 
 #include "jemalloc/internal/decay.h"
 
+const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
+#define STEP(step, h, x, y)			\
+		h,
+		SMOOTHSTEP
+#undef STEP
+};
+
 /*
  * Generate a new deadline that is uniformly random within the next epoch after
 * the current one.
@@ -147,6 +154,25 @@ decay_deadline_reached(const decay_t *decay, const nstime_t *time) {
 	return (nstime_compare(&decay->deadline, time) <= 0);
 }
 
+uint64_t
+decay_npages_purge_in(decay_t *decay, nstime_t *time, size_t npages_new) {
+	uint64_t decay_interval_ns = decay_epoch_duration_ns(decay);
+	size_t n_epoch = (size_t)(nstime_ns(time) / decay_interval_ns);
+
+	uint64_t npages_purge;
+	if (n_epoch >= SMOOTHSTEP_NSTEPS) {
+		npages_purge = npages_new;
+	} else {
+		uint64_t h_steps_max = h_steps[SMOOTHSTEP_NSTEPS - 1];
+		assert(h_steps_max >=
+		    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
+		npages_purge = npages_new * (h_steps_max -
+		    h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]);
+		npages_purge >>= SMOOTHSTEP_BFP;
+	}
+	return npages_purge;
+}
+
 bool
 decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
     size_t npages_current) {
@@ -214,9 +240,7 @@ decay_npurge_after_interval(decay_t *decay, size_t interval) {
 uint64_t
 decay_ns_until_purge(decay_t *decay, size_t npages_current,
     uint64_t npages_threshold) {
-	ssize_t decay_time = decay_ms_read(decay);
-	if (decay_time <= 0) {
-		/* Purging is eagerly done or disabled currently. */
+	if (!decay_gradually(decay)) {
 		return DECAY_UNBOUNDED_TIME_TO_PURGE;
 	}
 	uint64_t decay_interval_ns = decay_epoch_duration_ns(decay);
diff --git a/src/hpa.c b/src/hpa.c
index 6b7517d8..d45a3bd0 100644
--- a/src/hpa.c
+++ b/src/hpa.c
@@ -19,6 +19,7 @@ static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
 static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
     edata_list_active_t *list);
+static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);
 
 bool
 hpa_supported() {
@@ -218,6 +219,7 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
 	shard->pai.shrink = &hpa_shrink;
 	shard->pai.dalloc = &hpa_dalloc;
 	shard->pai.dalloc_batch = &hpa_dalloc_batch;
+	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;
 
 	hpa_do_consistency_checks(shard);
 
@@ -850,6 +852,11 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	hpa_dalloc_batch(tsdn, self, &dalloc_list);
 }
 
+static uint64_t
+hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	return opt_background_thread_hpa_interval_max_ms;
+}
+
 void
 hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
 	hpa_do_consistency_checks(shard);
diff --git a/src/pa.c b/src/pa.c
index 93da02e0..c5b8daa7 100644
--- a/src/pa.c
+++ b/src/pa.c
@@ -96,6 +96,11 @@ pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) {
 	}
 }
 
+static bool
+pa_shard_uses_hpa(pa_shard_t *shard) {
+	return atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED);
+}
+
 void
 pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard) {
 	pac_destroy(tsdn, &shard->pac);
@@ -118,7 +123,7 @@ pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, size_t alignment,
 	    WITNESS_RANK_CORE, 0);
 
 	edata_t *edata = NULL;
-	if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
+	if (pa_shard_uses_hpa(shard)) {
 		edata = pai_alloc(tsdn, &shard->hpa_sec.pai, size, alignment,
 		    zero);
 	}
@@ -226,7 +231,7 @@ pa_decay_ms_get(pa_shard_t *shard, extent_state_t state) {
 void
 pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
     bool deferral_allowed) {
-	if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
+	if (pa_shard_uses_hpa(shard)) {
 		hpa_shard_set_deferral_allowed(tsdn, &shard->hpa_shard,
 		    deferral_allowed);
 	}
@@ -234,7 +239,63 @@ pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
 
 void
 pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) {
-	if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
+	if (pa_shard_uses_hpa(shard)) {
 		hpa_shard_do_deferred_work(tsdn, &shard->hpa_shard);
 	}
 }
+
+static inline uint64_t
+pa_shard_ns_until_purge(tsdn_t *tsdn, decay_t *decay, size_t npages) {
+	if (malloc_mutex_trylock(tsdn, &decay->mtx)) {
+		/* Use minimal interval if decay is contended. */
+		return BACKGROUND_THREAD_DEFERRED_MIN;
+	}
+	uint64_t result = decay_ns_until_purge(decay, npages,
+	    ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD);
+
+	malloc_mutex_unlock(tsdn, &decay->mtx);
+	return result;
+}
+
+/*
+ * Get the time until the next deferred work ought to happen. If multiple
+ * things have been deferred, this function calculates the time until the
+ * soonest of those things.
+ */
+uint64_t
+pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) {
+	uint64_t time;
+	time = pa_shard_ns_until_purge(tsdn,
+	    &shard->pac.decay_dirty,
+	    ecache_npages_get(&shard->pac.ecache_dirty));
+	if (time == BACKGROUND_THREAD_DEFERRED_MIN) {
+		return time;
+	}
+
+	uint64_t muzzy = pa_shard_ns_until_purge(tsdn,
+	    &shard->pac.decay_muzzy,
+	    ecache_npages_get(&shard->pac.ecache_muzzy));
+	if (muzzy < time) {
+		time = muzzy;
+		if (time == BACKGROUND_THREAD_DEFERRED_MIN) {
+			return time;
+		}
+	}
+
+	uint64_t pac = pai_time_until_deferred_work(tsdn, &shard->pac.pai);
+	if (pac < time) {
+		time = pac;
+		if (time == BACKGROUND_THREAD_DEFERRED_MIN) {
+			return time;
+		}
+	}
+
+	if (pa_shard_uses_hpa(shard)) {
+		uint64_t hpa =
+		    pai_time_until_deferred_work(tsdn, &shard->hpa_shard.pai);
+		if (hpa < time) {
+			time = hpa;
+		}
+	}
+	return time;
+}
diff --git a/src/pac.c b/src/pac.c
index 0737e68c..c611d919 100644
--- a/src/pac.c
+++ b/src/pac.c
@@ -10,6 +10,7 @@ static bool pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata,
 static bool pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata,
     size_t old_size, size_t new_size);
 static void pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata);
+static uint64_t pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);
 
 static ehooks_t *
 pac_ehooks_get(pac_t *pac) {
@@ -96,6 +97,7 @@ pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
 	pac->pai.shrink = &pac_shrink_impl;
 	pac->pai.dalloc = &pac_dalloc_impl;
 	pac->pai.dalloc_batch = &pai_dalloc_batch_default;
+	pac->pai.time_until_deferred_work = &pac_time_until_deferred_work;
 
 	return false;
 }
@@ -196,6 +198,11 @@ pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, edata);
 }
 
+static uint64_t
+pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	return BACKGROUND_THREAD_DEFERRED_MAX;
+}
+
 bool
 pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit,
     size_t *new_limit) {
diff --git a/test/unit/decay.c b/test/unit/decay.c
index 72484c80..67722199 100644
--- a/test/unit/decay.c
+++ b/test/unit/decay.c
@@ -36,6 +36,37 @@ TEST_BEGIN(test_decay_ms_valid) {
 }
 TEST_END
 
+TEST_BEGIN(test_decay_npages_purge_in) {
+	decay_t decay;
+	memset(&decay, 0, sizeof(decay));
+
+	nstime_t curtime;
+	nstime_init(&curtime, 0);
+
+	uint64_t decay_ms = 1000;
+	nstime_t decay_nstime;
+	nstime_init(&decay_nstime, decay_ms * 1000 * 1000);
+	expect_false(decay_init(&decay, &curtime, (ssize_t)decay_ms),
+	    "Failed to initialize decay");
+
+	const size_t new_pages = 100;
+
+	nstime_t time;
+	nstime_copy(&time, &decay_nstime);
+	expect_u64_eq(decay_npages_purge_in(&decay, &time, new_pages),
+	    new_pages, "Not all pages are expected to decay in decay_ms");
+
+	nstime_init(&time, 0);
+	expect_u64_eq(decay_npages_purge_in(&decay, &time, new_pages), 0,
+	    "More than zero pages are expected to instantly decay");
+
+	nstime_copy(&time, &decay_nstime);
+	nstime_idivide(&time, 2);
+	expect_u64_eq(decay_npages_purge_in(&decay, &time, new_pages),
+	    new_pages / 2, "Not half of pages decay in half the decay period");
+}
+TEST_END
+
 TEST_BEGIN(test_decay_maybe_advance_epoch) {
 	decay_t decay;
 	memset(&decay, 0, sizeof(decay));
@@ -244,6 +275,7 @@ main(void) {
 	return test(
 	    test_decay_init,
 	    test_decay_ms_valid,
+	    test_decay_npages_purge_in,
 	    test_decay_maybe_advance_epoch,
 	    test_decay_empty,
 	    test_decay,