From 0f6c420f83a52c3927cc1c78d155622de05e3ba5 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 5 Feb 2021 10:46:17 -0800 Subject: [PATCH] HPA: Make purging/hugifying more principled. Before this change, purge/hugify decisions had several sharp edges that could lead to pathological behavior if tuning parameters weren't carefully chosen. It's the first of a series; this introduces basic "make every hugepage with dirty pages purgeable" functionality, and the next commit expands that functionality to have a smarter policy for picking hugepages to purge. Previously, the dehugify logic would *never* dehugify a hugepage unless it was dirtier than the dehugification threshold. This can lead to situations in which these pages (which themselves could never be purged) would push us above the maximum allowed dirty pages in the shard. This forces immediate purging of any pages deallocated in non-hugified hugepages, which in turn places nonobvious practical limitations on the relationships between various config settings. Instead, we make our preference not to dehugify to purge a soft one rather than a hard one. We'll avoid purging them, but only so long as we can do so by purging non-hugified pages. If we need to purge them to satisfy our dirty page limits, or to hugify other, more worthy candidates, we'll still do so. --- include/jemalloc/internal/hpdata.h | 76 ++++++++++++++++----- include/jemalloc/internal/psset.h | 4 +- src/hpa.c | 105 +++++++++++++++++++---------- src/hpdata.c | 4 +- src/psset.c | 69 +++++++++++++------ 5 files changed, 183 insertions(+), 75 deletions(-) diff --git a/include/jemalloc/internal/hpdata.h b/include/jemalloc/internal/hpdata.h index e489e624..3bbb7cc8 100644 --- a/include/jemalloc/internal/hpdata.h +++ b/include/jemalloc/internal/hpdata.h @@ -6,6 +6,42 @@ #include "jemalloc/internal/ql.h" #include "jemalloc/internal/typed_list.h" +/* + * How badly we want to purge some region of memory. 
This is a temporary + * definition; it gets deleted in the next commit (where we adopt a more + * explicit dirtiest-first policy that only considers hugification status). + */ +enum hpdata_purge_level_e { + /* + * The level number is important -- we use it as indices into an array + * of size 3 (one for each purge level). + */ + + /* "Regular" candidates for purging. */ + hpdata_purge_level_default = 0, + + /* + * Candidates for purging, but as a last resort. Practically, + * nonpreferred corresponds to hugified regions that are below the + * hugification threshold but have not yet reached the dehugification + * threshold, while strongly nonpreferred candidates are those which are + * above the hugification threshold. + */ + hpdata_purge_level_nonpreferred = 1, + hpdata_purge_level_strongly_nonpreferred = 2, + + /* Don't purge, no matter what. */ + hpdata_purge_level_never = 3, + + /* + * How big an array has to be to accommodate all purge levels. This + * relies on the fact that we don't actually keep unpurgable hpdatas in + * a container. + */ + hpdata_purge_level_count = hpdata_purge_level_never +}; +typedef enum hpdata_purge_level_e hpdata_purge_level_t; + /* * The metadata representation we use for extents in hugepages. While the PAC * uses the edata_t to represent both active and inactive extents, the HP only @@ -52,8 +88,8 @@ struct hpdata_s { bool h_in_psset_alloc_container; /* The same, but with purging. */ - bool h_purge_allowed; - bool h_in_psset_purge_container; + uint8_t h_purge_level; + uint8_t h_purge_container_level; /* And with hugifying. 
*/ bool h_hugify_allowed; @@ -164,26 +200,26 @@ hpdata_in_psset_alloc_container_set(hpdata_t *hpdata, bool in_container) { hpdata->h_in_psset_alloc_container = in_container; } -static inline bool -hpdata_purge_allowed_get(const hpdata_t *hpdata) { - return hpdata->h_purge_allowed; +static inline hpdata_purge_level_t +hpdata_purge_level_get(const hpdata_t *hpdata) { + return (hpdata_purge_level_t)hpdata->h_purge_level; } static inline void -hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) { - assert(purge_allowed == false || !hpdata->h_mid_purge); - hpdata->h_purge_allowed = purge_allowed; +hpdata_purge_level_set(hpdata_t *hpdata, hpdata_purge_level_t level) { + assert(level == hpdata_purge_level_never || !hpdata->h_mid_purge); + hpdata->h_purge_level = (uint8_t)level; } -static inline bool -hpdata_in_psset_purge_container_get(const hpdata_t *hpdata) { - return hpdata->h_in_psset_purge_container; +static inline hpdata_purge_level_t +hpdata_purge_container_level_get(const hpdata_t *hpdata) { + return (hpdata_purge_level_t)hpdata->h_purge_container_level; } static inline void -hpdata_in_psset_purge_container_set(hpdata_t *hpdata, bool in_container) { - assert(in_container != hpdata->h_in_psset_purge_container); - hpdata->h_in_psset_purge_container = in_container; +hpdata_purge_container_level_set(hpdata_t *hpdata, hpdata_purge_level_t level) { + assert(level != hpdata->h_purge_container_level); + hpdata->h_purge_container_level = level; } static inline bool @@ -284,6 +320,11 @@ hpdata_ndirty_get(hpdata_t *hpdata) { return hpdata->h_ntouched - hpdata->h_nactive; } +static inline size_t +hpdata_nretained_get(hpdata_t *hpdata) { + return HUGEPAGE_PAGES - hpdata->h_ntouched; +} + static inline void hpdata_assert_empty(hpdata_t *hpdata) { assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES)); @@ -316,11 +357,12 @@ hpdata_consistent(hpdata_t *hpdata) { return false; } if (hpdata_changing_state_get(hpdata) - && (hpdata->h_purge_allowed || 
hpdata->h_hugify_allowed)) { + && ((hpdata->h_purge_level != hpdata_purge_level_never) + || hpdata->h_hugify_allowed)) { return false; } - if (hpdata_purge_allowed_get(hpdata) - != hpdata_in_psset_purge_container_get(hpdata)) { + if (hpdata_purge_level_get(hpdata) + != hpdata_purge_container_level_get(hpdata)) { return false; } if (hpdata_hugify_allowed_get(hpdata) diff --git a/include/jemalloc/internal/psset.h b/include/jemalloc/internal/psset.h index 271d1443..285bf6da 100644 --- a/include/jemalloc/internal/psset.h +++ b/include/jemalloc/internal/psset.h @@ -70,8 +70,8 @@ struct psset_s { * allocations. */ hpdata_empty_list_t empty; - /* Slabs which are available to be purged. */ - hpdata_purge_list_t to_purge; + /* Slabs which are available to be purged, ordered by purge level. */ + hpdata_purge_list_t to_purge[hpdata_purge_level_count]; /* Slabs which are available to be hugified. */ hpdata_hugify_list_t to_hugify; }; diff --git a/src/hpa.c b/src/hpa.c index d078f180..90fec354 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -151,34 +151,59 @@ hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) { >= shard->opts.hugification_threshold; } -static bool -hpa_should_purge(hpa_shard_t *shard) { +static size_t +hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + return psset_ndirty(&shard->psset) - shard->npending_purge; +} + +static size_t +hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); if (shard->opts.dirty_mult == (fxp_t)-1) { + return (size_t)-1; + } + return fxp_mul_frac(psset_nactive(&shard->psset), + shard->opts.dirty_mult); +} + +static bool +hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify == NULL) { return false; } - size_t adjusted_ndirty = psset_ndirty(&shard->psset) - - shard->npending_purge; - /* - * Another 
simple static check; purge whenever dirty exceeds 25% of - * active. - */ - size_t max_ndirty = fxp_mul_frac(psset_nactive(&shard->psset), - shard->opts.dirty_mult); - return adjusted_ndirty > max_ndirty; + return hpa_adjusted_ndirty(tsdn, shard) + + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard); +} + +static bool +hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) { + return true; + } + if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) { + return true; + } + return false; } static void -hpa_update_purge_hugify_eligibility(hpa_shard_t *shard, hpdata_t *ps) { +hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard, + hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); if (hpdata_changing_state_get(ps)) { - hpdata_purge_allowed_set(ps, false); + hpdata_purge_level_set(ps, hpdata_purge_level_never); hpdata_hugify_allowed_set(ps, false); return; } /* - * Hugepages are distinctly costly to purge, so do it only if they're - * *particularly* full of dirty pages. Eventually, we should use a - * smarter / more dynamic heuristic for situations where we have to - * manually hugify. + * Hugepages are distinctly costly to purge, so try to avoid it unless + * they're *particularly* full of dirty pages. Eventually, we should + * use a smarter / more dynamic heuristic for situations where we have + * to manually hugify. * * In situations where we don't manually hugify, this problem is * reduced. The "bad" situation we're trying to avoid is one's that's @@ -195,17 +220,23 @@ hpa_update_purge_hugify_eligibility(hpa_shard_t *shard, hpdata_t *ps) { * deferred; in that case we don't need any explicit calls on the * allocator's end at all; we just try to pack allocations in a * hugepage-friendly manner and let the OS hugify in the background. 
- * - * Anyways, our strategy to delay dehugification is to only consider - * purging a hugified hugepage if it's individually dirtier than the - * overall max dirty pages setting. That setting is 1 dirty page per 4 - * active pages; i.e. 4/5s of hugepage pages must be active. */ - if ((!hpdata_huge_get(ps) && hpdata_ndirty_get(ps) > 0) - || (hpdata_ndirty_get(ps) != 0 - && hpdata_ndirty_get(ps) * PAGE - >= shard->opts.dehugification_threshold)) { - hpdata_purge_allowed_set(ps, true); + if (hpdata_ndirty_get(ps) > 0) { + if (hpdata_huge_get(ps)) { + if (hpa_good_hugification_candidate(shard, ps)) { + hpdata_purge_level_set(ps, + hpdata_purge_level_strongly_nonpreferred); + } else if (hpdata_ndirty_get(ps) * PAGE + >= shard->opts.dehugification_threshold) { + hpdata_purge_level_set(ps, + hpdata_purge_level_default); + } else { + hpdata_purge_level_set(ps, + hpdata_purge_level_nonpreferred); + } + } else { + hpdata_purge_level_set(ps, hpdata_purge_level_default); + } } if (hpa_good_hugification_candidate(shard, ps) && !hpdata_huge_get(ps)) { @@ -286,7 +317,7 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { if (to_purge == NULL) { return false; } - assert(hpdata_purge_allowed_get(to_purge)); + assert(hpdata_purge_level_get(to_purge) != hpdata_purge_level_never); assert(!hpdata_changing_state_get(to_purge)); /* @@ -297,7 +328,7 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { psset_update_begin(&shard->psset, to_purge); assert(hpdata_alloc_allowed_get(to_purge)); hpdata_mid_purge_set(to_purge, true); - hpdata_purge_allowed_set(to_purge, false); + hpdata_purge_level_set(to_purge, hpdata_purge_level_never); hpdata_hugify_allowed_set(to_purge, false); /* * Unlike with hugification (where concurrent @@ -352,7 +383,7 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { hpdata_mid_purge_set(to_purge, false); hpdata_alloc_allowed_set(to_purge, true); - hpa_update_purge_hugify_eligibility(shard, to_purge); + hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge); 
psset_update_end(&shard->psset, to_purge); @@ -364,6 +395,10 @@ static bool hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) { + return false; + } + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); if (to_hugify == NULL) { return false; @@ -378,7 +413,7 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { */ psset_update_begin(&shard->psset, to_hugify); hpdata_mid_hugify_set(to_hugify, true); - hpdata_purge_allowed_set(to_hugify, false); + hpdata_purge_level_set(to_hugify, hpdata_purge_level_never); hpdata_hugify_allowed_set(to_hugify, false); assert(hpdata_alloc_allowed_get(to_hugify)); psset_update_end(&shard->psset, to_hugify); @@ -401,7 +436,7 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { psset_update_begin(&shard->psset, to_hugify); hpdata_hugify(to_hugify); hpdata_mid_hugify_set(to_hugify, false); - hpa_update_purge_hugify_eligibility(shard, to_hugify); + hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify); psset_update_end(&shard->psset, to_hugify); return true; @@ -419,7 +454,7 @@ hpa_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { hugified = hpa_try_hugify(tsdn, shard); purged = false; - if (hpa_should_purge(shard)) { + if (hpa_should_purge(tsdn, shard)) { purged = hpa_try_purge(tsdn, shard); } malloc_mutex_assert_owner(tsdn, &shard->mtx); @@ -491,7 +526,7 @@ hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, return NULL; } - hpa_update_purge_hugify_eligibility(shard, ps); + hpa_update_purge_hugify_eligibility(tsdn, shard, ps); psset_update_end(&shard->psset, ps); return edata; } @@ -703,7 +738,7 @@ hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { psset_update_begin(&shard->psset, ps); hpdata_unreserve(ps, unreserve_addr, unreserve_size); - hpa_update_purge_hugify_eligibility(shard, ps); + hpa_update_purge_hugify_eligibility(tsdn, shard, ps); psset_update_end(&shard->psset, ps); 
hpa_do_deferred_work(tsdn, shard); } diff --git a/src/hpdata.c b/src/hpdata.c index 0fc7b7dc..6aee4f61 100644 --- a/src/hpdata.c +++ b/src/hpdata.c @@ -24,8 +24,8 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { hpdata->h_huge = false; hpdata->h_alloc_allowed = true; hpdata->h_in_psset_alloc_container = false; - hpdata->h_purge_allowed = false; - hpdata->h_in_psset_purge_container = false; + hpdata->h_purge_level = hpdata_purge_level_never; + hpdata->h_purge_container_level = hpdata_purge_level_never; hpdata->h_hugify_allowed = false; hpdata->h_in_psset_hugify_container = false; hpdata->h_mid_purge = false; diff --git a/src/psset.c b/src/psset.c index 66fd0c49..6de82605 100644 --- a/src/psset.c +++ b/src/psset.c @@ -14,7 +14,9 @@ psset_init(psset_t *psset) { memset(&psset->merged_stats, 0, sizeof(psset->merged_stats)); memset(&psset->stats, 0, sizeof(psset->stats)); hpdata_empty_list_init(&psset->empty); - hpdata_purge_list_init(&psset->to_purge); + for (int i = 0; i < hpdata_purge_level_count; i++) { + hpdata_purge_list_init(&psset->to_purge[i]); + } hpdata_hugify_list_init(&psset->to_hugify); } @@ -230,14 +232,31 @@ psset_update_end(psset_t *psset, hpdata_t *ps) { psset_alloc_container_insert(psset, ps); } - if (hpdata_purge_allowed_get(ps) - && !hpdata_in_psset_purge_container_get(ps)) { - hpdata_in_psset_purge_container_set(ps, true); - hpdata_purge_list_append(&psset->to_purge, ps); - } else if (!hpdata_purge_allowed_get(ps) - && hpdata_in_psset_purge_container_get(ps)) { - hpdata_in_psset_purge_container_set(ps, false); - hpdata_purge_list_remove(&psset->to_purge, ps); + if (hpdata_purge_level_get(ps) == hpdata_purge_level_never + && hpdata_purge_container_level_get(ps) + != hpdata_purge_level_never) { + /* In some purge container, but shouldn't be in any. 
*/ + hpdata_purge_list_remove( + &psset->to_purge[hpdata_purge_container_level_get(ps)], + ps); + hpdata_purge_container_level_set(ps, hpdata_purge_level_never); + } else if (hpdata_purge_level_get(ps) != hpdata_purge_level_never + && hpdata_purge_container_level_get(ps) + == hpdata_purge_level_never) { + /* Not in any purge container, but should be in one. */ + hpdata_purge_list_append( + &psset->to_purge[hpdata_purge_level_get(ps)], ps); + hpdata_purge_container_level_set(ps, + hpdata_purge_level_get(ps)); + } else if (hpdata_purge_level_get(ps) + != hpdata_purge_container_level_get(ps)) { + /* Should switch containers. */ + hpdata_purge_list_remove( + &psset->to_purge[hpdata_purge_container_level_get(ps)], ps); + hpdata_purge_list_append( + &psset->to_purge[hpdata_purge_level_get(ps)], ps); + hpdata_purge_container_level_set(ps, + hpdata_purge_level_get(ps)); } if (hpdata_hugify_allowed_get(ps) @@ -275,7 +294,13 @@ psset_pick_alloc(psset_t *psset, size_t size) { hpdata_t * psset_pick_purge(psset_t *psset) { - return hpdata_purge_list_first(&psset->to_purge); + for (int i = 0; i < hpdata_purge_level_count; i++) { + hpdata_t *ps = hpdata_purge_list_first(&psset->to_purge[i]); + if (ps != NULL) { + return ps; + } + } + return NULL; } hpdata_t * @@ -291,10 +316,15 @@ psset_insert(psset_t *psset, hpdata_t *ps) { if (hpdata_alloc_allowed_get(ps)) { psset_alloc_container_insert(psset, ps); } - if (hpdata_purge_allowed_get(ps)) { - hpdata_in_psset_purge_container_set(ps, true); - hpdata_purge_list_append(&psset->to_purge, ps); + assert( + hpdata_purge_container_level_get(ps) == hpdata_purge_level_never); + if (hpdata_purge_level_get(ps) != hpdata_purge_level_never) { + hpdata_purge_container_level_set(ps, + hpdata_purge_level_get(ps)); + hpdata_purge_list_append( + &psset->to_purge[hpdata_purge_level_get(ps)], ps); } + if (hpdata_hugify_allowed_get(ps)) { hpdata_in_psset_hugify_container_set(ps, true); hpdata_hugify_list_append(&psset->to_hugify, ps); @@ -309,12 +339,13 
@@ psset_remove(psset_t *psset, hpdata_t *ps) { if (hpdata_in_psset_alloc_container_get(ps)) { psset_alloc_container_remove(psset, ps); } - if (hpdata_in_psset_purge_container_get(ps)) { - hpdata_in_psset_purge_container_set(ps, false); - hpdata_purge_list_remove(&psset->to_purge, ps); + if (hpdata_purge_container_level_get(ps) != hpdata_purge_level_never) { + hpdata_purge_list_remove( + &psset->to_purge[hpdata_purge_container_level_get(ps)], ps); + hpdata_purge_container_level_set(ps, hpdata_purge_level_never); } if (hpdata_in_psset_hugify_container_get(ps)) { hpdata_in_psset_hugify_container_set(ps, false); hpdata_hugify_list_remove(&psset->to_hugify, ps); } }