From 6599651aee2b1b1ab0c52fdb03f23394bd683c47 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Fri, 16 Oct 2020 13:14:59 -0700
Subject: [PATCH] PA: Use an SEC in front of the HPA shard.

This places a small extent cache (SEC) in front of the HPA shard. We
intend HPA-enabled configurations to use many fewer arenas, and therefore
to have a higher risk of hot locks; the SEC lets cached extents be
recycled without taking the shard mutex.
---
 include/jemalloc/internal/arena_externs.h |  3 +-
 include/jemalloc/internal/ctl.h           |  1 +
 include/jemalloc/internal/hpa.h           |  6 ++--
 .../internal/jemalloc_internal_externs.h  |  5 ++++
 include/jemalloc/internal/mutex_prof.h    |  3 +-
 include/jemalloc/internal/pa.h            | 20 +++++++++----
 src/arena.c                               | 28 ++++++++++++-----
 src/ctl.c                                 | 19 +++++++++++-
 src/hpa.c                                 |  6 ++--
 src/jemalloc.c                            | 30 ++++++++++++++-----
 src/pa.c                                  | 19 ++++++++----
 src/pa_extra.c                            | 25 ++++++++++++----
 src/stats.c                               |  8 +++++
 src/tcache.c                              |  6 ++--
 test/unit/mallctl.c                       |  3 ++
 15 files changed, 141 insertions(+), 41 deletions(-)

diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index c8e1e38d..40223b58 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -28,7 +28,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
     bin_stats_data_t *bstats, arena_stats_large_t *lstats,
-    pac_estats_t *estats, hpa_shard_stats_t *hpastats);
+    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
 void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena);
 #ifdef JEMALLOC_JET
 size_t arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr);
@@ -99,6 +99,7 @@ void arena_prefork4(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork5(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork6(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork7(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork8(tsdn_t *tsdn, arena_t *arena);
 void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
 void arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
 
diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h
index 305d3655..a6ae05c1 100644
--- a/include/jemalloc/internal/ctl.h
+++ b/include/jemalloc/internal/ctl.h
@@ -46,6 +46,7 @@ typedef struct ctl_arena_stats_s {
     arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
     pac_estats_t estats[SC_NPSIZES];
     hpa_shard_stats_t hpastats;
+    sec_stats_t secstats;
 } ctl_arena_stats_t;
 
 typedef struct ctl_stats_s {
diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h
index 3fe9fc48..24c68560 100644
--- a/include/jemalloc/internal/hpa.h
+++ b/include/jemalloc/internal/hpa.h
@@ -90,10 +90,10 @@ void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
 
 /*
  * We share the fork ordering with the PA and arena prefork handling; that's why
- * these are 2 and 3 rather than 0 or 1.
+ * these are 3 and 4 rather than 0 and 1.
  */
-void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
 
 void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
 void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
 
@@ -103,7 +103,7 @@ void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
  * so it needs to be lower in the witness ordering, but it's also logically
  * global and not tied to any particular arena.
  */
-void hpa_prefork3(tsdn_t *tsdn, hpa_t *hpa);
+void hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa);
 void hpa_postfork_parent(tsdn_t *tsdn, hpa_t *hpa);
 void hpa_postfork_child(tsdn_t *tsdn, hpa_t *hpa);
 
diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h
index 8faadaa1..814a7a1b 100644
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -17,6 +17,11 @@ extern size_t opt_hpa_slab_goal;
 extern size_t opt_hpa_slab_max_alloc;
 extern size_t opt_hpa_small_max;
 extern size_t opt_hpa_large_min;
+
+extern size_t opt_hpa_sec_max_alloc;
+extern size_t opt_hpa_sec_max_bytes;
+extern size_t opt_hpa_sec_nshards;
+
 extern const char *opt_junk;
 extern bool opt_junk_alloc;
 extern bool opt_junk_free;
diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h
index 970f469b..ef0bf0d3 100644
--- a/include/jemalloc/internal/mutex_prof.h
+++ b/include/jemalloc/internal/mutex_prof.h
@@ -33,7 +33,8 @@ typedef enum {
     OP(base) \
     OP(tcache_list) \
     OP(hpa_shard) \
-    OP(hpa_shard_grow)
+    OP(hpa_shard_grow) \
+    OP(hpa_sec)
 
 typedef enum {
 #define OP(mtx) arena_prof_mutex_##mtx,
diff --git a/include/jemalloc/internal/pa.h b/include/jemalloc/internal/pa.h
index d138f2f0..5e97d0b0 100644
--- a/include/jemalloc/internal/pa.h
+++ b/include/jemalloc/internal/pa.h
@@ -10,6 +10,7 @@
 #include "jemalloc/internal/lockedint.h"
 #include "jemalloc/internal/pac.h"
 #include "jemalloc/internal/pai.h"
+#include "jemalloc/internal/sec.h"
 
 /*
  * The page allocator; responsible for acquiring pages of memory for
@@ -85,7 +86,12 @@ struct pa_shard_s {
     /* Allocates from a PAC. */
     pac_t pac;
 
-    /* Allocates from a HPA. */
+    /*
+     * We place a small extent cache in front of the HPA, since we intend
+     * these configurations to use many fewer arenas, and therefore have a
+     * higher risk of hot locks.
+     */
+    sec_t hpa_sec;
     hpa_shard_t hpa_shard;
 
     /* The source of edata_t objects. */
@@ -124,18 +130,20 @@ bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base,
  * that we can boot without worrying about the HPA, then turn it on in a0.
  */
 bool pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
-    size_t ps_alloc_max, size_t small_max, size_t large_min);
+    size_t ps_alloc_max, size_t small_max, size_t large_min, size_t sec_nshards,
+    size_t sec_alloc_max, size_t sec_bytes_max);
 
 /*
  * We stop using the HPA when custom extent hooks are installed, but still
  * redirect deallocations to it.
  */
-void pa_shard_disable_hpa(pa_shard_t *shard);
+void pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard);
 
 /*
  * This does the PA-specific parts of arena reset (i.e. freeing all active
  * allocations).
  */
-void pa_shard_reset(pa_shard_t *shard);
+void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
+
 /*
  * Destroy all the remaining retained extents. Should only be called after
  * decaying all active, dirty, and muzzy extents to the retained state, as the
@@ -184,6 +192,7 @@ void pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard);
 void pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard);
 
@@ -192,7 +201,8 @@ void pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive,
 
 void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
     pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
-    hpa_shard_stats_t *hpa_stats_out, size_t *resident);
+    hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
+    size_t *resident);
 
 /*
  * Reads the PA-owned mutex stats into the output stats array, at the
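
A note on the design, since the diff only shows the integration points: the
SEC plugs into the page allocator's generic pai_t interface, with the HPA
shard's pai as its fallback; that is what the sec_init() call in src/pa.c
below wires up. A rough sketch of the resulting allocation path follows.
It is illustrative only: sec_shard_pick() and sec_shard_cache_pop() are
hypothetical stand-ins for sec.c internals, and the field names are assumed
from the sec_init() parameters.

/*
 * Sketch of the SEC's pai_alloc implementation; not the literal sec.c code.
 */
static edata_t *
sec_alloc_sketch(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment,
    bool zero) {
    sec_t *sec = (sec_t *)self;
    if (size > sec->alloc_max) {
        /* Too large to cache; go straight to the HPA shard. */
        return pai_alloc(tsdn, sec->fallback, size, alignment, zero);
    }
    /* Try the per-shard cache of freed extents first... */
    edata_t *edata = sec_shard_cache_pop(tsdn, sec_shard_pick(sec), size);
    if (edata == NULL) {
        /* ...and fall back to the wrapped allocator on a miss. */
        edata = pai_alloc(tsdn, sec->fallback, size, alignment, zero);
    }
    return edata;
}

Deallocation is symmetric: pa_get_pai() in src/pa.c below now returns
&shard->hpa_sec.pai for HPA-owned extents, so frees refill the cache (up to
sec_bytes_max) instead of immediately taking the shard mutex.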
diff --git a/src/arena.c b/src/arena.c
index dc58a287..360827ef 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -81,7 +81,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
     bin_stats_data_t *bstats, arena_stats_large_t *lstats,
-    pac_estats_t *estats, hpa_shard_stats_t *hpastats) {
+    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats) {
     cassert(config_stats);
 
     arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms,
@@ -139,7 +139,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     }
 
     pa_shard_stats_merge(tsdn, &arena->pa_shard, &astats->pa_shard_stats,
-        estats, hpastats, &astats->resident);
+        estats, hpastats, secstats, &astats->resident);
 
     LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx);
 
@@ -483,6 +483,14 @@ arena_decay_muzzy(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
 
 void
 arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) {
+    if (all) {
+        /*
+         * We should take a purge of "all" to mean "save as much memory
+         * as possible", including flushing any caches (for situations
+         * like thread death, or manual purge calls).
+         */
+        sec_flush(tsdn, &arena->pa_shard.hpa_sec);
+    }
     if (arena_decay_dirty(tsdn, arena, is_background_thread, all)) {
         return;
     }
@@ -631,7 +639,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
                 &arena->bins[i].bin_shards[j]);
         }
     }
-    pa_shard_reset(&arena->pa_shard);
+    pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard);
 }
 
 void
@@ -1362,7 +1370,7 @@ arena_set_extent_hooks(tsd_t *tsd, arena_t *arena,
         malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
     }
     /* No using the HPA now that we have the custom hooks. */
-    pa_shard_disable_hpa(&arena->pa_shard);
+    pa_shard_disable_hpa(tsd_tsdn(tsd), &arena->pa_shard);
     extent_hooks_t *ret = base_extent_hooks_set(arena->base, extent_hooks);
     if (have_background_thread) {
         malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
@@ -1529,7 +1537,8 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
     if (opt_hpa && ehooks_are_default(base_ehooks_get(base)) && ind != 0) {
         if (pa_shard_enable_hpa(&arena->pa_shard, &arena_hpa_global,
             opt_hpa_slab_goal, opt_hpa_slab_max_alloc,
-            opt_hpa_small_max, opt_hpa_large_min)) {
+            opt_hpa_small_max, opt_hpa_large_min, opt_hpa_sec_nshards,
+            opt_hpa_sec_max_alloc, opt_hpa_sec_max_bytes)) {
             goto label_error;
         }
     }
@@ -1658,16 +1667,21 @@ arena_prefork4(tsdn_t *tsdn, arena_t *arena) {
 
 void
 arena_prefork5(tsdn_t *tsdn, arena_t *arena) {
-    base_prefork(tsdn, arena->base);
+    pa_shard_prefork5(tsdn, &arena->pa_shard);
 }
 
 void
 arena_prefork6(tsdn_t *tsdn, arena_t *arena) {
-    malloc_mutex_prefork(tsdn, &arena->large_mtx);
+    base_prefork(tsdn, arena->base);
 }
 
 void
 arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
+    malloc_mutex_prefork(tsdn, &arena->large_mtx);
+}
+
+void
+arena_prefork8(tsdn_t *tsdn, arena_t *arena) {
     for (unsigned i = 0; i < SC_NBINS; i++) {
         for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
             bin_prefork(tsdn, &arena->bins[i].bin_shards[j]);
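
One consequence of the arena_decay() hunk above is that a full purge now
also empties the SEC, not just the dirty and muzzy page caches. A manual
purge therefore becomes a way to drop the cache on demand. A usage sketch
(error handling elided; "arena.<i>.purge" is the pre-existing mallctl that
requests an all=true decay):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

/*
 * Ask arena 0 to give back as much memory as possible; with this patch
 * that includes flushing its SEC back to the HPA shard.
 */
static void
purge_arena0(void) {
    char cmd[64];
    snprintf(cmd, sizeof(cmd), "arena.%u.purge", 0u);
    mallctl(cmd, NULL, NULL, NULL, 0);
}

Thread death takes the same path: the tcache_destroy() hunk near the end of
this patch calls arena_decay() with all=true once the last thread detaches
from an arena and no background thread is running.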
diff --git a/src/ctl.c b/src/ctl.c
index b4e65172..874aaac2 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -95,6 +95,9 @@ CTL_PROTO(opt_hpa_slab_goal)
 CTL_PROTO(opt_hpa_slab_max_alloc)
 CTL_PROTO(opt_hpa_small_max)
 CTL_PROTO(opt_hpa_large_min)
+CTL_PROTO(opt_hpa_sec_max_alloc)
+CTL_PROTO(opt_hpa_sec_max_bytes)
+CTL_PROTO(opt_hpa_sec_nshards)
 CTL_PROTO(opt_metadata_thp)
 CTL_PROTO(opt_retain)
 CTL_PROTO(opt_dss)
@@ -246,6 +249,7 @@ CTL_PROTO(stats_arenas_i_metadata_thp)
 CTL_PROTO(stats_arenas_i_tcache_bytes)
 CTL_PROTO(stats_arenas_i_resident)
 CTL_PROTO(stats_arenas_i_abandoned_vm)
+CTL_PROTO(stats_arenas_i_hpa_sec_bytes)
 INDEX_PROTO(stats_arenas_i)
 CTL_PROTO(stats_allocated)
 CTL_PROTO(stats_active)
@@ -360,6 +364,9 @@ static const ctl_named_node_t opt_node[] = {
     {NAME("hpa_slab_max_alloc"), CTL(opt_hpa_slab_max_alloc)},
     {NAME("hpa_small_max"), CTL(opt_hpa_small_max)},
     {NAME("hpa_large_min"), CTL(opt_hpa_large_min)},
+    {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)},
+    {NAME("hpa_sec_max_bytes"), CTL(opt_hpa_sec_max_bytes)},
+    {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)},
     {NAME("metadata_thp"), CTL(opt_metadata_thp)},
     {NAME("retain"), CTL(opt_retain)},
     {NAME("dss"), CTL(opt_dss)},
@@ -650,6 +657,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
     {NAME("tcache_bytes"), CTL(stats_arenas_i_tcache_bytes)},
     {NAME("resident"), CTL(stats_arenas_i_resident)},
     {NAME("abandoned_vm"), CTL(stats_arenas_i_abandoned_vm)},
+    {NAME("hpa_sec_bytes"), CTL(stats_arenas_i_hpa_sec_bytes)},
     {NAME("small"), CHILD(named, stats_arenas_i_small)},
     {NAME("large"), CHILD(named, stats_arenas_i_large)},
     {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)},
@@ -889,6 +897,8 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) {
             sizeof(pac_estats_t));
         memset(&ctl_arena->astats->hpastats, 0,
             sizeof(hpa_shard_stats_t));
+        memset(&ctl_arena->astats->secstats, 0,
+            sizeof(sec_stats_t));
     }
 }
 
@@ -903,7 +913,7 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) {
             &ctl_arena->pdirty, &ctl_arena->pmuzzy,
             &ctl_arena->astats->astats, ctl_arena->astats->bstats,
             ctl_arena->astats->lstats, ctl_arena->astats->estats,
-            &ctl_arena->astats->hpastats);
+            &ctl_arena->astats->hpastats, &ctl_arena->astats->secstats);
 
         for (i = 0; i < SC_NBINS; i++) {
             bin_stats_t *bstats =
@@ -1089,6 +1099,7 @@ MUTEX_PROF_ARENA_MUTEXES
                 &astats->hpastats.psset_slab_stats[i]);
         }
 
+        sec_stats_accum(&sdstats->secstats, &astats->secstats);
     }
 }
 
@@ -1895,6 +1906,9 @@ CTL_RO_NL_GEN(opt_hpa_slab_goal, opt_hpa_slab_goal, size_t)
 CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_slab_max_alloc, size_t)
 CTL_RO_NL_GEN(opt_hpa_small_max, opt_hpa_small_max, size_t)
 CTL_RO_NL_GEN(opt_hpa_large_min, opt_hpa_large_min, size_t)
+CTL_RO_NL_GEN(opt_hpa_sec_max_alloc, opt_hpa_sec_max_alloc, size_t)
+CTL_RO_NL_GEN(opt_hpa_sec_max_bytes, opt_hpa_sec_max_bytes, size_t)
+CTL_RO_NL_GEN(opt_hpa_sec_nshards, opt_hpa_sec_nshards, size_t)
 CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp],
     const char *)
 CTL_RO_NL_GEN(opt_retain, opt_retain, bool)
@@ -3114,6 +3128,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_abandoned_vm,
     &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.abandoned_vm,
     ATOMIC_RELAXED), size_t)
 
+CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_sec_bytes,
+    arenas_i(mib[2])->astats->secstats.bytes, size_t)
+
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated,
     arenas_i(mib[2])->astats->allocated_small, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc,
diff --git a/src/hpa.c b/src/hpa.c
index 08992bda..f49aa2b0 100644
--- a/src/hpa.c
+++ b/src/hpa.c
@@ -411,12 +411,12 @@ hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
 }
 
 void
-hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard) {
+hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
     malloc_mutex_prefork(tsdn, &shard->grow_mtx);
 }
 
 void
-hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
+hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
     malloc_mutex_prefork(tsdn, &shard->mtx);
 }
 
@@ -433,7 +433,7 @@ hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
 }
 
 void
-hpa_prefork3(tsdn_t *tsdn, hpa_t *hpa) {
+hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa) {
     malloc_mutex_prefork(tsdn, &hpa->grow_mtx);
     malloc_mutex_prefork(tsdn, &hpa->mtx);
 }
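
These renames (together with the matching ones in src/arena.c above and
src/pa_extra.c below) are one mechanical shift: the SEC's mutexes take
prefork stage 2, pushing everything that must be acquired after them down a
stage. As a review aid (a summary, not text from the source), the resulting
ordering is:

/*
 * Stage 2: the SEC shard mutexes (sec_prefork2, new).
 * Stage 3: pac.grow_mtx and the HPA shard's grow_mtx.
 * Stage 4: the PAC ecaches, the HPA shard mutex, and the global hpa
 *          locks (hpa_prefork4).
 * Stage 5: the edata_cache (pa_shard_prefork5, a new stage).
 * Stages 6/7/8: base, large_mtx, and the bins, each shifted down one;
 * _malloc_prefork now loops over 9 stages instead of 8.
 */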
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 8ce9ca1e..09b168ca 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -141,6 +141,11 @@ size_t opt_hpa_slab_max_alloc = 256 * 1024;
 size_t opt_hpa_small_max = 32 * 1024;
 size_t opt_hpa_large_min = 4 * 1024 * 1024;
 
+size_t opt_hpa_sec_max_alloc = 32 * 1024;
+/* These settings correspond to a maximum of 1MB cached per arena. */
+size_t opt_hpa_sec_max_bytes = 256 * 1024;
+size_t opt_hpa_sec_nshards = 4;
+
 /*
  * Arenas that are used to service external requests. Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
@@ -1494,11 +1499,18 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
                 true)
             CONF_HANDLE_SIZE_T(opt_hpa_slab_max_alloc,
                 "hpa_slab_max_alloc", PAGE, 512 * PAGE,
-                CONF_CHECK_MIN, CONF_CHECK_MAX, true)
+                CONF_CHECK_MIN, CONF_CHECK_MAX, true);
             CONF_HANDLE_SIZE_T(opt_hpa_small_max, "hpa_small_max",
-                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true)
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
             CONF_HANDLE_SIZE_T(opt_hpa_large_min, "hpa_large_min",
-                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true)
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+
+            CONF_HANDLE_SIZE_T(opt_hpa_sec_max_alloc, "hpa_sec_max_alloc",
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+            CONF_HANDLE_SIZE_T(opt_hpa_sec_max_bytes, "hpa_sec_max_bytes",
+                PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+            CONF_HANDLE_SIZE_T(opt_hpa_sec_nshards, "hpa_sec_nshards",
+                0, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
 
             if (CONF_MATCH("slab_sizes")) {
                 if (CONF_MATCH_VALUE("default")) {
@@ -1808,7 +1820,8 @@ malloc_init_hard_a0_locked() {
         }
         if (pa_shard_enable_hpa(&a0->pa_shard, &arena_hpa_global,
             opt_hpa_slab_goal, opt_hpa_slab_max_alloc,
-            opt_hpa_small_max, opt_hpa_large_min)) {
+            opt_hpa_small_max, opt_hpa_large_min, opt_hpa_sec_nshards,
+            opt_hpa_sec_max_alloc, opt_hpa_sec_max_bytes)) {
             return true;
         }
     }
@@ -4226,7 +4239,7 @@ _malloc_prefork(void)
         background_thread_prefork1(tsd_tsdn(tsd));
     }
     /* Break arena prefork into stages to preserve lock order. */
-    for (i = 0; i < 8; i++) {
+    for (i = 0; i < 9; i++) {
         for (j = 0; j < narenas; j++) {
             if ((arena = arena_get(tsd_tsdn(tsd), j, false)) !=
                 NULL) {
@@ -4255,12 +4268,15 @@ _malloc_prefork(void)
                 case 7:
                     arena_prefork7(tsd_tsdn(tsd), arena);
                     break;
+                case 8:
+                    arena_prefork8(tsd_tsdn(tsd), arena);
+                    break;
                 default:
                     not_reached();
                 }
             }
         }
-        if (i == 3 && opt_hpa) {
-            hpa_prefork3(tsd_tsdn(tsd), &arena_hpa_global);
+        if (i == 4 && opt_hpa) {
+            hpa_prefork4(tsd_tsdn(tsd), &arena_hpa_global);
         }
     }
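
With the conf plumbing above, the new knobs are set like any other opt.*
option, via the MALLOC_CONF environment variable or the compiled-in config
string. For example (illustrative values that happen to match the new
defaults; "hpa" is the pre-existing boolean that enables the subsystem in
the first place):

/*
 * Link-time configuration: 4 SEC shards, each caching at most 256k of
 * extents, with individual allocations up to 32k eligible for caching.
 */
const char *malloc_conf = "hpa:true,hpa_sec_nshards:4,"
    "hpa_sec_max_alloc:32768,hpa_sec_max_bytes:262144";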
diff --git a/src/pa.c b/src/pa.c
index 8e1ec842..825b10ab 100644
--- a/src/pa.c
+++ b/src/pa.c
@@ -49,7 +49,8 @@ pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base,
 
 bool
 pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
-    size_t ps_alloc_max, size_t small_max, size_t large_min) {
+    size_t ps_alloc_max, size_t small_max, size_t large_min,
+    size_t sec_nshards, size_t sec_alloc_max, size_t sec_bytes_max) {
     ps_goal &= ~PAGE_MASK;
     ps_alloc_max &= ~PAGE_MASK;
 
@@ -60,6 +61,10 @@ pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
         shard->ind, ps_goal, ps_alloc_max, small_max, large_min)) {
         return true;
     }
+    if (sec_init(&shard->hpa_sec, &shard->hpa_shard.pai, sec_nshards,
+        sec_alloc_max, sec_bytes_max)) {
+        return true;
+    }
     shard->ever_used_hpa = true;
     atomic_store_b(&shard->use_hpa, true, ATOMIC_RELAXED);
 
@@ -67,24 +72,27 @@ pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
 }
 
 void
-pa_shard_disable_hpa(pa_shard_t *shard) {
+pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard) {
     atomic_store_b(&shard->use_hpa, false, ATOMIC_RELAXED);
+    sec_disable(tsdn, &shard->hpa_sec);
 }
 
 void
-pa_shard_reset(pa_shard_t *shard) {
+pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) {
     atomic_store_zu(&shard->nactive, 0, ATOMIC_RELAXED);
+    sec_flush(tsdn, &shard->hpa_sec);
 }
 
 void
 pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard) {
+    sec_flush(tsdn, &shard->hpa_sec);
     pac_destroy(tsdn, &shard->pac);
 }
 
 static pai_t *
 pa_get_pai(pa_shard_t *shard, edata_t *edata) {
     return (edata_pai_get(edata) == EXTENT_PAI_PAC
-        ? &shard->pac.pai : &shard->hpa_shard.pai);
+        ? &shard->pac.pai : &shard->hpa_sec.pai);
 }
 
 edata_t *
@@ -95,7 +103,7 @@ pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, size_t alignment,
 
     edata_t *edata = NULL;
     if (atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED)) {
-        edata = pai_alloc(tsdn, &shard->hpa_shard.pai, size, alignment,
+        edata = pai_alloc(tsdn, &shard->hpa_sec.pai, size, alignment,
             zero);
     }
     /*
@@ -173,6 +181,7 @@ pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata,
         emap_deregister_interior(tsdn, shard->emap, edata);
         edata_slab_set(edata, false);
     }
+    edata_addr_set(edata, edata_base_get(edata));
     edata_szind_set(edata, SC_NSIZES);
     pa_nactive_sub(shard, edata_size_get(edata) >> LG_PAGE);
     pai_t *pai = pa_get_pai(shard, edata);
diff --git a/src/pa_extra.c b/src/pa_extra.c
index db236ad8..24cb6537 100644
--- a/src/pa_extra.c
+++ b/src/pa_extra.c
@@ -16,17 +16,14 @@ pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard) {
 
 void
 pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard) {
-    malloc_mutex_prefork(tsdn, &shard->pac.grow_mtx);
     if (shard->ever_used_hpa) {
-        hpa_shard_prefork2(tsdn, &shard->hpa_shard);
+        sec_prefork2(tsdn, &shard->hpa_sec);
     }
 }
 
 void
 pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard) {
-    ecache_prefork(tsdn, &shard->pac.ecache_dirty);
-    ecache_prefork(tsdn, &shard->pac.ecache_muzzy);
-    ecache_prefork(tsdn, &shard->pac.ecache_retained);
+    malloc_mutex_prefork(tsdn, &shard->pac.grow_mtx);
     if (shard->ever_used_hpa) {
         hpa_shard_prefork3(tsdn, &shard->hpa_shard);
     }
@@ -34,6 +31,16 @@ pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard) {
 
 void
 pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard) {
+    ecache_prefork(tsdn, &shard->pac.ecache_dirty);
+    ecache_prefork(tsdn, &shard->pac.ecache_muzzy);
+    ecache_prefork(tsdn, &shard->pac.ecache_retained);
+    if (shard->ever_used_hpa) {
+        hpa_shard_prefork4(tsdn, &shard->hpa_shard);
+    }
+}
+
+void
+pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard) {
     edata_cache_prefork(tsdn, &shard->edata_cache);
 }
 
@@ -47,6 +54,7 @@ pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) {
     malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx);
     malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx);
     if (shard->ever_used_hpa) {
+        sec_postfork_parent(tsdn, &shard->hpa_sec);
         hpa_shard_postfork_parent(tsdn, &shard->hpa_shard);
     }
 }
@@ -61,6 +69,7 @@ pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) {
     malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx);
    malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx);
     if (shard->ever_used_hpa) {
+        sec_postfork_child(tsdn, &shard->hpa_sec);
         hpa_shard_postfork_child(tsdn, &shard->hpa_shard);
     }
 }
@@ -76,7 +85,8 @@ pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive, size_t *ndirty,
 void
 pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
     pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
-    hpa_shard_stats_t *hpa_stats_out, size_t *resident) {
+    hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
+    size_t *resident) {
     cassert(config_stats);
 
     pa_shard_stats_out->pac_stats.retained +=
@@ -149,6 +159,7 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
                 &shard->hpa_shard.psset.slab_stats[i]);
         }
         malloc_mutex_unlock(tsdn, &shard->hpa_shard.mtx);
+        sec_stats_merge(tsdn, &shard->hpa_sec, sec_stats_out);
     }
 }
 
@@ -182,5 +193,7 @@ pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard,
         pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
             &shard->hpa_shard.grow_mtx,
             arena_prof_mutex_hpa_shard_grow);
+        sec_mutex_stats_read(tsdn, &shard->hpa_sec,
+            &mutex_prof_data[arena_prof_mutex_hpa_sec]);
     }
 }
diff --git a/src/stats.c b/src/stats.c
index f03e5e44..4b40721a 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -678,6 +678,11 @@ stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i) {
     CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ninactive", i,
         &ninactive, size_t);
 
+    size_t sec_bytes;
+    CTL_M2_GET("stats.arenas.0.hpa_sec_bytes", i, &sec_bytes, size_t);
+    emitter_kv(emitter, "sec_bytes", "Bytes in small extent cache",
+        emitter_type_size, &sec_bytes);
+
     emitter_table_printf(emitter,
         "HPA shard stats:\n"
         "  In full slabs:\n"
@@ -1194,6 +1199,9 @@ stats_general_print(emitter_t *emitter) {
     OPT_WRITE_SIZE_T("hpa_slab_max_alloc")
     OPT_WRITE_SIZE_T("hpa_small_max")
     OPT_WRITE_SIZE_T("hpa_large_min")
+    OPT_WRITE_SIZE_T("hpa_sec_max_alloc")
+    OPT_WRITE_SIZE_T("hpa_sec_max_bytes")
+    OPT_WRITE_SIZE_T("hpa_sec_nshards")
     OPT_WRITE_CHAR_P("metadata_thp")
     OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")
     OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms")
diff --git a/src/tcache.c b/src/tcache.c
index 6bf1d309..edbedf79 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -716,9 +716,11 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
     if (arena_nthreads_get(arena, false) == 0 &&
         !background_thread_enabled()) {
         /* Force purging when no threads assigned to the arena anymore. */
-        arena_decay(tsd_tsdn(tsd), arena, false, true);
+        arena_decay(tsd_tsdn(tsd), arena,
+            /* is_background_thread */ false, /* all */ true);
     } else {
-        arena_decay(tsd_tsdn(tsd), arena, false, false);
+        arena_decay(tsd_tsdn(tsd), arena,
+            /* is_background_thread */ false, /* all */ false);
     }
 }
 
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index ecbcda9e..278bd09d 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -168,6 +168,9 @@ TEST_BEGIN(test_mallctl_opt) {
     TEST_MALLCTL_OPT(size_t, hpa_slab_max_alloc, always);
     TEST_MALLCTL_OPT(size_t, hpa_small_max, always);
     TEST_MALLCTL_OPT(size_t, hpa_large_min, always);
+    TEST_MALLCTL_OPT(size_t, hpa_sec_max_alloc, always);
+    TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);
+    TEST_MALLCTL_OPT(size_t, hpa_sec_nshards, always);
     TEST_MALLCTL_OPT(unsigned, narenas, always);
     TEST_MALLCTL_OPT(const char *, percpu_arena, always);
     TEST_MALLCTL_OPT(size_t, oversize_threshold, always);
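
The new per-arena statistic is readable as stats.arenas.<i>.hpa_sec_bytes,
mirroring the ctl wiring above. A usage sketch (error handling and
build-time stats caveats elided):

#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Print how many bytes arena 0's SEC is currently caching. */
static void
print_arena0_sec_bytes(void) {
    /* Refresh the stats snapshot first. */
    uint64_t epoch = 1;
    size_t sz = sizeof(epoch);
    mallctl("epoch", &epoch, &sz, &epoch, sz);

    size_t sec_bytes;
    sz = sizeof(sec_bytes);
    if (mallctl("stats.arenas.0.hpa_sec_bytes", &sec_bytes, &sz,
        NULL, 0) == 0) {
        printf("hpa_sec_bytes: %zu\n", sec_bytes);
    }
}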