From 1d4a7666d558b2c21e8cfc2b3e8981020db072fa Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Mon, 7 Jun 2021 11:45:57 -0700
Subject: [PATCH] HPA: Do deferred operations on background threads.

---
 Makefile.in                                |   1 +
 .../internal/background_thread_externs.h   |   1 +
 .../internal/background_thread_structs.h   |   8 +
 include/jemalloc/internal/pa.h             |  12 +-
 src/arena.c                                |   1 +
 src/background_thread.c                    |  51 +++++-
 src/ctl.c                                  |   5 +
 src/jemalloc.c                             |   9 +
 src/pa.c                                   |  12 ++
 src/stats.c                                |   1 +
 test/unit/hpa_background_thread.c          | 158 ++++++++++++++++++
 test/unit/hpa_background_thread.sh         |   4 +
 12 files changed, 256 insertions(+), 7 deletions(-)
 create mode 100644 test/unit/hpa_background_thread.c
 create mode 100644 test/unit/hpa_background_thread.sh

diff --git a/Makefile.in b/Makefile.in
index ed03d4e2..3e7d122b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -221,6 +221,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/hash.c \
 	$(srcroot)test/unit/hook.c \
 	$(srcroot)test/unit/hpa.c \
+	$(srcroot)test/unit/hpa_background_thread.c \
 	$(srcroot)test/unit/hpa_central.c \
 	$(srcroot)test/unit/hpdata.c \
 	$(srcroot)test/unit/huge.c \
diff --git a/include/jemalloc/internal/background_thread_externs.h b/include/jemalloc/internal/background_thread_externs.h
index d5c13695..bc49beaf 100644
--- a/include/jemalloc/internal/background_thread_externs.h
+++ b/include/jemalloc/internal/background_thread_externs.h
@@ -2,6 +2,7 @@
 #define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
 
 extern bool opt_background_thread;
+extern ssize_t opt_background_thread_hpa_interval_max_ms;
 extern size_t opt_max_background_threads;
 extern malloc_mutex_t background_thread_lock;
 extern atomic_b_t background_thread_enabled_state;
diff --git a/include/jemalloc/internal/background_thread_structs.h b/include/jemalloc/internal/background_thread_structs.h
index 249115c3..cc14dde3 100644
--- a/include/jemalloc/internal/background_thread_structs.h
+++ b/include/jemalloc/internal/background_thread_structs.h
@@ -11,6 +11,14 @@
 #define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT
 #define DEFAULT_NUM_BACKGROUND_THREAD 4
 
+/*
+ * These exist only as a transitional state. Eventually, deferral should be
+ * part of the PAI, and each implementation can indicate wait times with more
+ * specificity.
+ */
+#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2)
+#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000
+
 typedef enum {
 	background_thread_stopped,
 	background_thread_started,
diff --git a/include/jemalloc/internal/pa.h b/include/jemalloc/internal/pa.h
index cb9f8cff..0fb77250 100644
--- a/include/jemalloc/internal/pa.h
+++ b/include/jemalloc/internal/pa.h
@@ -172,11 +172,21 @@ bool pa_shrink(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size,
  */
 void pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata,
     bool *generated_dirty);
-
 bool pa_decay_ms_set(tsdn_t *tsdn, pa_shard_t *shard, extent_state_t state,
     ssize_t decay_ms, pac_purge_eagerness_t eagerness);
 ssize_t pa_decay_ms_get(pa_shard_t *shard, extent_state_t state);
 
+/*
+ * Do deferred work on this PA shard.
+ *
+ * Morally, this should do both PAC decay and the HPA deferred work. For now,
+ * though, the arena, background thread, and PAC modules are tightly interwoven
+ * in a way that's tricky to extricate, so we only do the HPA-specific parts.
+ */
+void pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
+    bool deferral_allowed);
+void pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+
 /******************************************************************************/
 /*
  * Various bits of "boring" functionality that are still part of this module,
diff --git a/src/arena.c b/src/arena.c
index bdc120fa..d6a1f674 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -465,6 +465,7 @@ arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) {
 void
 arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena) {
 	arena_decay(tsdn, arena, true, false);
+	pa_shard_do_deferred_work(tsdn, &arena->pa_shard);
 }
 
 void
diff --git a/src/background_thread.c b/src/background_thread.c
index edcf786e..1fb24fe6 100644
--- a/src/background_thread.c
+++ b/src/background_thread.c
@@ -13,6 +13,13 @@ JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
 /* Read-only after initialization. */
 bool opt_background_thread = BACKGROUND_THREAD_DEFAULT;
 size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT + 1;
+/*
+ * This is disabled (and set to -1) if the HPA is. If the HPA is enabled,
+ * malloc_conf initialization sets it to
+ * BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED.
+ */
+ssize_t opt_background_thread_hpa_interval_max_ms =
+    BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED;
 
 /* Used for thread creation, termination and stats. */
 malloc_mutex_t background_thread_lock;
@@ -209,7 +216,20 @@ arena_decay_compute_purge_interval(tsdn_t *tsdn, arena_t *arena) {
 	i2 = arena_decay_compute_purge_interval_impl(tsdn,
 	    &arena->pa_shard.pac.decay_muzzy,
 	    &arena->pa_shard.pac.ecache_muzzy);
-	return i1 < i2 ? i1 : i2;
+	uint64_t min_so_far = i1 < i2 ? i1 : i2;
+	if (opt_background_thread_hpa_interval_max_ms >= 0) {
+		uint64_t hpa_interval = 1000 * 1000 *
+		    (uint64_t)opt_background_thread_hpa_interval_max_ms;
+		if (hpa_interval < min_so_far) {
+			if (hpa_interval < BACKGROUND_THREAD_MIN_INTERVAL_NS) {
+				min_so_far = BACKGROUND_THREAD_MIN_INTERVAL_NS;
+			} else {
+				min_so_far = hpa_interval;
+			}
+		}
+	}
+
+	return min_so_far;
 }
 
 static void
@@ -607,16 +627,16 @@ background_threads_enable(tsd_t *tsd) {
 	malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock);
 	VARIABLE_ARRAY(bool, marked, max_background_threads);
-	unsigned i, nmarked;
-	for (i = 0; i < max_background_threads; i++) {
+	unsigned nmarked;
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		marked[i] = false;
 	}
 	nmarked = 0;
 	/* Thread 0 is required and created at the end. */
 	marked[0] = true;
 	/* Mark the threads we need to create for thread 0. */
-	unsigned n = narenas_total_get();
-	for (i = 1; i < n; i++) {
+	unsigned narenas = narenas_total_get();
+	for (unsigned i = 1; i < narenas; i++) {
 		if (marked[i % max_background_threads] ||
 		    arena_get(tsd_tsdn(tsd), i, false) == NULL) {
 			continue;
 		}
@@ -633,7 +653,18 @@ background_threads_enable(tsd_t *tsd) {
 		}
 	}
 
-	return background_thread_create_locked(tsd, 0);
+	bool err = background_thread_create_locked(tsd, 0);
+	if (err) {
+		return true;
+	}
+	for (unsigned i = 0; i < narenas; i++) {
+		arena_t *arena = arena_get(tsd_tsdn(tsd), i, false);
+		if (arena != NULL) {
+			pa_shard_set_deferral_allowed(tsd_tsdn(tsd),
+			    &arena->pa_shard, true);
+		}
+	}
+	return false;
 }
 
 bool
@@ -647,6 +678,14 @@ background_threads_disable(tsd_t *tsd) {
 		return true;
 	}
 	assert(n_background_threads == 0);
+	unsigned narenas = narenas_total_get();
+	for (unsigned i = 0; i < narenas; i++) {
+		arena_t *arena = arena_get(tsd_tsdn(tsd), i, false);
+		if (arena != NULL) {
+			pa_shard_set_deferral_allowed(tsd_tsdn(tsd),
+			    &arena->pa_shard, false);
+		}
+	}
 
 	return false;
 }
diff --git a/src/ctl.c b/src/ctl.c
index c713f0e2..c66b4d8c 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -111,6 +111,7 @@ CTL_PROTO(opt_percpu_arena)
 CTL_PROTO(opt_oversize_threshold)
 CTL_PROTO(opt_background_thread)
 CTL_PROTO(opt_max_background_threads)
+CTL_PROTO(opt_background_thread_hpa_interval_max_ms)
 CTL_PROTO(opt_dirty_decay_ms)
 CTL_PROTO(opt_muzzy_decay_ms)
 CTL_PROTO(opt_stats_print)
@@ -423,6 +424,8 @@ static const ctl_named_node_t opt_node[] = {
 	{NAME("oversize_threshold"), CTL(opt_oversize_threshold)},
 	{NAME("background_thread"), CTL(opt_background_thread)},
 	{NAME("max_background_threads"), CTL(opt_max_background_threads)},
+	{NAME("background_thread_hpa_interval_max_ms"),
+	    CTL(opt_background_thread_hpa_interval_max_ms)},
 	{NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)},
 	{NAME("muzzy_decay_ms"), CTL(opt_muzzy_decay_ms)},
 	{NAME("stats_print"), CTL(opt_stats_print)},
@@ -2139,6 +2142,8 @@ CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena],
 CTL_RO_NL_GEN(opt_oversize_threshold, opt_oversize_threshold, size_t)
 CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool)
 CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t)
+CTL_RO_NL_GEN(opt_background_thread_hpa_interval_max_ms,
+    opt_background_thread_hpa_interval_max_ms, ssize_t)
 CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t)
 CTL_RO_NL_GEN(opt_muzzy_decay_ms, opt_muzzy_decay_ms, ssize_t)
 CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 85d68639..28c7fdc0 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1410,6 +1410,10 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
 			    CONF_CHECK_MIN, CONF_CHECK_MAX, true);
 			CONF_HANDLE_BOOL(opt_hpa, "hpa")
+			CONF_HANDLE_SSIZE_T(
+			    opt_background_thread_hpa_interval_max_ms,
+			    "background_thread_hpa_interval_max_ms", -1,
+			    SSIZE_MAX)
 			CONF_HANDLE_SIZE_T(opt_hpa_opts.slab_max_alloc,
 			    "hpa_slab_max_alloc", PAGE, HUGEPAGE,
 			    CONF_CHECK_MIN, CONF_CHECK_MAX, true);
@@ -1659,6 +1663,11 @@ malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) {
 	malloc_conf_init_helper(NULL, NULL, true, opts_cache, buf);
 	malloc_conf_init_helper(sc_data, bin_shard_sizes, false, opts_cache,
 	    NULL);
+	if (opt_hpa && opt_background_thread_hpa_interval_max_ms
+	    == BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED) {
+		opt_background_thread_hpa_interval_max_ms =
+		    BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED;
+	}
 }
 
 #undef MALLOC_CONF_NSOURCES
diff --git a/src/pa.c b/src/pa.c
index cb3b3df5..cbc8f760 100644
--- a/src/pa.c
+++ b/src/pa.c
@@ -208,3 +208,15 @@ ssize_t
 pa_decay_ms_get(pa_shard_t *shard, extent_state_t state) {
 	return pac_decay_ms_get(&shard->pac, state);
 }
+
+void
+pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
+    bool deferral_allowed) {
+	hpa_shard_set_deferral_allowed(tsdn, &shard->hpa_shard,
+	    deferral_allowed);
+}
+
+void
+pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) {
+	hpa_shard_do_deferred_work(tsdn, &shard->hpa_shard);
+}
diff --git a/src/stats.c b/src/stats.c
index 34cae0ab..4e6c3922 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -1494,6 +1494,7 @@ stats_general_print(emitter_t *emitter) {
 	OPT_WRITE_SIZE_T("hpa_sec_batch_fill_extra")
 	OPT_WRITE_CHAR_P("metadata_thp")
 	OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")
+	OPT_WRITE_SSIZE_T("background_thread_hpa_interval_max_ms")
 	OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms")
 	OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms")
 	OPT_WRITE_SIZE_T("lg_extent_max_active_fit")
diff --git a/test/unit/hpa_background_thread.c b/test/unit/hpa_background_thread.c
new file mode 100644
index 00000000..1907a6dd
--- /dev/null
+++ b/test/unit/hpa_background_thread.c
@@ -0,0 +1,158 @@
+#include "test/jemalloc_test.h"
+#include "test/sleep.h"
+
+static void
+sleep_for_background_thread_interval() {
+	/*
+	 * The sleep interval set in our .sh file is 50ms. So it should
+	 * definitely run if we sleep for four times that.
+	 */
+	sleep_ns(200 * 1000 * 1000);
+}
+
+static unsigned
+create_arena() {
+	unsigned arena_ind;
+	size_t sz;
+
+	sz = sizeof(unsigned);
+	expect_d_eq(mallctl("arenas.create", (void *)&arena_ind, &sz, NULL, 2),
+	    0, "Unexpected mallctl() failure");
+	return arena_ind;
+}
+
+static size_t
+get_empty_ndirty(unsigned arena_ind) {
+	int err;
+	size_t ndirty_huge;
+	size_t ndirty_nonhuge;
+	uint64_t epoch = 1;
+	size_t sz = sizeof(epoch);
+	err = je_mallctl("epoch", (void *)&epoch, &sz, (void *)&epoch,
+	    sizeof(epoch));
+	expect_d_eq(0, err, "Unexpected mallctl() failure");
+
+	size_t mib[6];
+	size_t miblen = sizeof(mib)/sizeof(mib[0]);
+	err = mallctlnametomib(
+	    "stats.arenas.0.hpa_shard.empty_slabs.ndirty_nonhuge", mib,
+	    &miblen);
+	expect_d_eq(0, err, "Unexpected mallctlnametomib() failure");
+
+	sz = sizeof(ndirty_nonhuge);
+	mib[2] = arena_ind;
+	err = mallctlbymib(mib, miblen, &ndirty_nonhuge, &sz, NULL, 0);
+	expect_d_eq(0, err, "Unexpected mallctlbymib() failure");
+
+	err = mallctlnametomib(
+	    "stats.arenas.0.hpa_shard.empty_slabs.ndirty_huge", mib,
+	    &miblen);
+	expect_d_eq(0, err, "Unexpected mallctlnametomib() failure");
+
+	sz = sizeof(ndirty_huge);
+	mib[2] = arena_ind;
+	err = mallctlbymib(mib, miblen, &ndirty_huge, &sz, NULL, 0);
+	expect_d_eq(0, err, "Unexpected mallctlbymib() failure");
+
+	return ndirty_huge + ndirty_nonhuge;
+}
+
+static void
+set_background_thread_enabled(bool enabled) {
+	int err;
+	err = je_mallctl("background_thread", NULL, NULL, &enabled,
+	    sizeof(enabled));
+	expect_d_eq(0, err, "Unexpected mallctl failure");
+}
+
+static void
+expect_purging(unsigned arena_ind, bool expect_deferred) {
+	size_t empty_ndirty;
+
+	empty_ndirty = get_empty_ndirty(arena_ind);
+	expect_zu_eq(0, empty_ndirty, "Expected arena to start unused.");
+
+	/*
+	 * It's possible that we get unlucky with our stats collection timing,
+	 * and the background thread runs in between the deallocation and the
+	 * stats collection. So we retry 10 times, and see if we *ever* see
+	 * deferred reclamation.
+	 */
+	bool observed_dirty_page = false;
+	for (int i = 0; i < 10; i++) {
+		void *ptr = mallocx(PAGE,
+		    MALLOCX_TCACHE_NONE | MALLOCX_ARENA(arena_ind));
+		empty_ndirty = get_empty_ndirty(arena_ind);
+		expect_zu_eq(0, empty_ndirty, "All pages should be active");
+		dallocx(ptr, MALLOCX_TCACHE_NONE);
+		empty_ndirty = get_empty_ndirty(arena_ind);
+		if (expect_deferred) {
+			expect_true(empty_ndirty == 0 || empty_ndirty == 1,
+			    "Unexpected extra dirty page count: %zu",
+			    empty_ndirty);
+		} else {
+			assert_zu_eq(0, empty_ndirty,
+			    "Saw dirty pages without deferred purging");
+		}
+		if (empty_ndirty > 0) {
+			observed_dirty_page = true;
+			break;
+		}
+	}
+	expect_b_eq(expect_deferred, observed_dirty_page, "");
+	if (expect_deferred) {
+		sleep_for_background_thread_interval();
+	}
+	empty_ndirty = get_empty_ndirty(arena_ind);
+	expect_zu_eq(0, empty_ndirty, "Should have seen a background purge");
+}
+
+TEST_BEGIN(test_hpa_background_thread_purges) {
+	test_skip_if(!config_stats);
+	test_skip_if(!hpa_supported());
+	test_skip_if(!have_background_thread);
+
+	unsigned arena_ind = create_arena();
+	/*
+	 * Our .sh sets dirty mult to 0, so all dirty pages should get purged
+	 * any time any thread frees.
+	 */
+	expect_purging(arena_ind, /* expect_deferred */ true);
+}
+TEST_END
+
+TEST_BEGIN(test_hpa_background_thread_enable_disable) {
+	test_skip_if(!config_stats);
+	test_skip_if(!hpa_supported());
+	test_skip_if(!have_background_thread);
+
+	unsigned arena_ind = create_arena();
+
+	set_background_thread_enabled(false);
+	expect_purging(arena_ind, false);
+
+	set_background_thread_enabled(true);
+	expect_purging(arena_ind, true);
+}
+TEST_END
+
+int
+main(void) {
+	/*
+	 * OK, this is a sort of nasty hack. We don't want to add *another*
+	 * config option for HPA (the intent is that it becomes available on
+	 * more platforms over time, and we're trying to prune back config
+	 * options generally). But we'll get initialization errors on other
+	 * platforms if we set hpa:true in the MALLOC_CONF (even if we set
+	 * abort_conf:false as well). So we reach into the internals and set
+	 * them directly, but only if we know that we're actually going to do
+	 * something nontrivial in the tests.
+	 */
+	if (config_stats && hpa_supported() && have_background_thread) {
+		opt_hpa = true;
+		opt_background_thread = true;
+	}
+	return test_no_reentrancy(
+	    test_hpa_background_thread_purges,
+	    test_hpa_background_thread_enable_disable);
+}
diff --git a/test/unit/hpa_background_thread.sh b/test/unit/hpa_background_thread.sh
new file mode 100644
index 00000000..811da8bd
--- /dev/null
+++ b/test/unit/hpa_background_thread.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+export MALLOC_CONF="hpa_dirty_mult:0,background_thread_hpa_interval_max_ms:50,hpa_sec_nshards:0"
+