From 30b9e8162b9127d5c352fc312dfdea5e07d51e56 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Wed, 2 Dec 2020 22:24:15 -0800 Subject: [PATCH] HPA: Generalize purging. Previously, we would purge a hugepage only when it's completely empty. With this change, we can purge even when only partially empty. Although the heuristic here is still fairly primitive, this infrastructure can scale to become more advanced. --- include/jemalloc/internal/hpdata.h | 23 +++- src/hpa.c | 208 ++++++++++++++++++++++++----- src/hpdata.c | 35 ++--- src/psset.c | 5 + test/unit/hpdata.c | 1 + 5 files changed, 208 insertions(+), 64 deletions(-) diff --git a/include/jemalloc/internal/hpdata.h b/include/jemalloc/internal/hpdata.h index faa62434..66473d2e 100644 --- a/include/jemalloc/internal/hpdata.h +++ b/include/jemalloc/internal/hpdata.h @@ -44,6 +44,9 @@ struct hpdata_s { bool h_mid_purge; bool h_mid_hugify; + /* Whether or not the hpdata is in the psset. */ + bool h_in_psset; + union { /* When nonempty, used by the psset bins. */ phn(hpdata_t) ph_link; @@ -115,6 +118,15 @@ hpdata_mid_hugify_get(const hpdata_t *hpdata) { return hpdata->h_mid_hugify; } +static inline bool +hpdata_in_psset_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset; +} + +static inline void +hpdata_in_psset_set(hpdata_t *hpdata, bool in_psset) { + hpdata->h_in_psset = in_psset; +} static inline size_t hpdata_longest_free_range_get(const hpdata_t *hpdata) { @@ -195,12 +207,6 @@ void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age); void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz); void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz); -/* - * Tell the hpdata (which should be empty) that all dirty pages in it have been - * purged. 
- */ -void hpdata_purge(hpdata_t *hpdata); - /* * The hpdata_purge_prepare_t allows grabbing the metadata required to purge * subranges of a hugepage while holding a lock, drop the lock during the actual @@ -247,6 +253,11 @@ void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state); * Similarly, when hugifying , callers can do the metadata modifications while * holding a lock (thereby setting the change_state field), but actually do the * operation without blocking other threads. + * + * Unlike most metadata operations, hugification ending should happen while an + * hpdata is in the psset (or upcoming hugepage collections). This is because + * while purge/use races are unsafe, purge/hugepageify races are perfectly + * reasonable. */ void hpdata_hugify_begin(hpdata_t *hpdata); void hpdata_hugify_end(hpdata_t *hpdata); diff --git a/src/hpa.c b/src/hpa.c index a36eee4e..99594549 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -124,32 +124,26 @@ hpa_should_hugify(hpa_shard_t *shard, hpdata_t *ps) { * For now, just use a static check; hugify a page if it's <= 5% * inactive. Eventually, this should be a malloc conf option. */ + if (hpdata_changing_state_get(ps)) { + return false; + } return !hpdata_huge_get(ps) && hpdata_nactive_get(ps) >= (HUGEPAGE_PAGES) * 95 / 100; } -/* Returns true on error. */ -static void -hpa_hugify(hpdata_t *ps) { - assert(hpdata_huge_get(ps)); - bool err = pages_huge(hpdata_addr_get(ps), HUGEPAGE); - /* - * Eat the error; even if the hugification failed, it's still safe to - * pretend it didn't (and would require extraordinary measures to - * unhugify). - */ - (void)err; -} - -static void -hpa_dehugify(hpdata_t *ps) { - /* Purge, then dehugify while unbacked. */ - pages_purge_forced(hpdata_addr_get(ps), HUGEPAGE); - pages_nohuge(hpdata_addr_get(ps), HUGEPAGE); - - /* Update metadata. 
*/ - hpdata_dehugify(ps); - hpdata_purge(ps); +/* + * Whether or not the given pageslab meets the criteria for being purged (and, + * if necessary, dehugified). + */ +static bool +hpa_should_purge(hpa_shard_t *shard, hpdata_t *ps) { + /* Ditto. */ + if (hpdata_changing_state_get(ps)) { + return false; + } + size_t purgeable = hpdata_ndirty_get(ps) - hpdata_nactive_get(ps); + return purgeable > HUGEPAGE_PAGES * 25 / 100 + || (purgeable > 0 && hpdata_empty(ps)); } static hpdata_t * @@ -226,9 +220,65 @@ hpa_grow(tsdn_t *tsdn, hpa_shard_t *shard) { } /* - * The psset does not hold empty slabs. Upon becoming empty, then, we need to - * put them somewhere. We take this as an opportunity to purge, and retain - * their address space in a list outside the psset. + * As a precondition, ps should not be in the psset (we can handle deallocation + * races, but not allocation ones), and we should hold the shard mutex. + */ +static void +hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + while (hpa_should_purge(shard, ps)) { + /* Do the metadata update bit while holding the lock. */ + hpdata_purge_state_t purge_state; + hpdata_purge_begin(ps, &purge_state); + + /* + * Dehugifying can only happen on the first loop iteration, + * since no other threads can allocate out of this ps while + * we're purging (and thus, can't hugify it), but there's not a + * natural way to express that in the control flow. + */ + bool needs_dehugify = false; + if (hpdata_huge_get(ps)) { + needs_dehugify = true; + hpdata_dehugify(ps); + } + + /* Drop the lock to do the OS calls. 
*/ + malloc_mutex_unlock(tsdn, &shard->mtx); + + if (needs_dehugify) { + pages_nohuge(hpdata_addr_get(ps), HUGEPAGE); + } + + size_t total_purged = 0; + void *purge_addr; + size_t purge_size; + while (hpdata_purge_next(ps, &purge_state, &purge_addr, + &purge_size)) { + pages_purge_forced(purge_addr, purge_size); + total_purged += purge_size; + } + + /* Reacquire to finish our metadata update. */ + malloc_mutex_lock(tsdn, &shard->mtx); + hpdata_purge_end(ps, &purge_state); + + assert(total_purged <= HUGEPAGE); + + /* + * We're not done here; other threads can't allocate out of ps + * while purging, but they can still deallocate. Those + * deallocations could have meant more purging than what we + * planned ought to happen. We have to re-check now that we've + * reacquired the mutex again. + */ + } +} + +/* + * Does the metadata tracking associated with a page slab becoming empty. The + * psset doesn't hold empty pageslabs, but we do want address space reuse, so we + * track these pages outside the psset. */ static void hpa_handle_ps_eviction(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { @@ -239,12 +289,6 @@ hpa_handle_ps_eviction(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { malloc_mutex_assert_not_owner(tsdn, &shard->mtx); malloc_mutex_assert_not_owner(tsdn, &shard->grow_mtx); - /* - * We do this unconditionally, even for pages which were not originally - * hugified; it has the same effect. - */ - hpa_dehugify(ps); - malloc_mutex_lock(tsdn, &shard->grow_mtx); shard->nevictions++; hpdata_list_prepend(&shard->unused_slabs, ps); @@ -291,6 +335,11 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom) if (err) { hpdata_unreserve(ps, edata_addr_get(edata), edata_size_get(edata)); + /* + * We should arguably reset dirty state here, but this would + * require some sort of prepare + commit functionality that's a + * little much to deal with for now. 
+ */ psset_insert(&shard->psset, ps); edata_cache_small_put(tsdn, &shard->ecs, edata); malloc_mutex_unlock(tsdn, &shard->mtx); @@ -318,9 +367,26 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom) * on this page slab, but also operations any other alloc/dalloc * operations in this hpa shard. */ - hpa_hugify(ps); + bool err = pages_huge(hpdata_addr_get(ps), HUGEPAGE); + /* + * Pretending we succeed when we actually failed is safe; trying + * to rollback would be tricky, though. Eat the error. + */ + (void)err; + malloc_mutex_lock(tsdn, &shard->mtx); hpdata_hugify_end(ps); + if (hpa_should_purge(shard, ps)) { + /* + * There was a race in which the ps went from being + * almost full to having lots of free space while we + * hugified. Undo our operation, taking care to meet + * the precondition that the ps isn't in the psset. + */ + psset_remove(&shard->psset, ps); + hpa_purge(tsdn, shard, ps); + psset_insert(&shard->psset, ps); + } malloc_mutex_unlock(tsdn, &shard->mtx); } return edata; } @@ -383,11 +449,28 @@ hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) { if (err) { hpdata_unreserve(ps, edata_addr_get(edata), edata_size_get(edata)); + edata_cache_small_put(tsdn, &shard->ecs, edata); shard->nevictions++; malloc_mutex_unlock(tsdn, &shard->mtx); malloc_mutex_unlock(tsdn, &shard->grow_mtx); + + /* We'll do a fake purge; the pages weren't really touched. 
*/ + hpdata_purge_state_t purge_state; + void *purge_addr; + size_t purge_size; + hpdata_purge_begin(ps, &purge_state); + bool found_extent = hpdata_purge_next(ps, &purge_state, + &purge_addr, &purge_size); + assert(found_extent); + assert(purge_addr == addr); + assert(purge_size == size); + found_extent = hpdata_purge_next(ps, &purge_state, + &purge_addr, &purge_size); + assert(!found_extent); + hpdata_purge_end(ps, &purge_state); + hpa_handle_ps_eviction(tsdn, shard, ps); return NULL; } @@ -475,13 +558,66 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) { /* Currently, all edatas come from pageslabs. */ assert(ps != NULL); emap_deregister_boundary(tsdn, shard->emap, edata); + /* + * Note that the shard mutex protects ps's metadata too; it wouldn't be + * correct to try to read most information out of it without the lock. + */ malloc_mutex_lock(tsdn, &shard->mtx); - /* Note that the shard mutex protects ps's metadata too. */ - psset_remove(&shard->psset, ps); - hpdata_unreserve(ps, edata_addr_get(edata), edata_size_get(edata)); - + /* + * Release the metadata early, to avoid having to remember to do it + * while we're also doing tricky purging logic. + */ + void *unreserve_addr = edata_addr_get(edata); + size_t unreserve_size = edata_size_get(edata); edata_cache_small_put(tsdn, &shard->ecs, edata); + + /* + * We have three rules interacting here: + * - You can't update ps metadata while it's still in the psset. We + * enforce this because it's necessary for stats tracking and metadata + * management. + * - The ps must not be in the psset while purging. This is because we + * can't handle purge/alloc races. + * - Whoever removes the ps from the psset is the one to reinsert it (or + * to pass it to hpa_handle_ps_eviction upon emptying). This keeps + * responsibility tracking simple. + */ + if (hpdata_mid_purge_get(ps)) { + /* + * Another thread started purging, and so the ps is not in the + * psset and we can do our metadata update. 
The other thread is + * in charge of reinserting the ps, so we're done. + */ + assert(!hpdata_in_psset_get(ps)); + hpdata_unreserve(ps, unreserve_addr, unreserve_size); + malloc_mutex_unlock(tsdn, &shard->mtx); + return; + } + /* + * No other thread is purging, and the ps is non-empty, so it should be + * in the psset. + */ + assert(hpdata_in_psset_get(ps)); + psset_remove(&shard->psset, ps); + hpdata_unreserve(ps, unreserve_addr, unreserve_size); + if (!hpa_should_purge(shard, ps)) { + /* + * This should be the common case; no other thread is purging, + * and we won't purge either. + */ + psset_insert(&shard->psset, ps); + malloc_mutex_unlock(tsdn, &shard->mtx); + return; + } + + /* It's our job to purge. */ + hpa_purge(tsdn, shard, ps); + + /* + * OK, the hpdata is as purged as we want it to be, and it's going back + * into the psset (if nonempty) or getting evicted (if empty). + */ if (hpdata_empty(ps)) { malloc_mutex_unlock(tsdn, &shard->mtx); hpa_handle_ps_eviction(tsdn, shard, ps); diff --git a/src/hpdata.c b/src/hpdata.c index 29aecff5..78816196 100644 --- a/src/hpdata.c +++ b/src/hpdata.c @@ -24,6 +24,7 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { hpdata->h_huge = false; hpdata->h_mid_purge = false; hpdata->h_mid_hugify = false; + hpdata->h_in_psset = false; hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES); hpdata->h_nactive = 0; fb_init(hpdata->active_pages, HUGEPAGE_PAGES); @@ -36,6 +37,7 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { void * hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) { hpdata_assert_consistent(hpdata); + assert(!hpdata_in_psset_get(hpdata)); assert((sz & PAGE_MASK) == 0); size_t npages = sz >> LG_PAGE; assert(npages <= hpdata_longest_free_range_get(hpdata)); @@ -116,6 +118,7 @@ hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) { void hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) { hpdata_assert_consistent(hpdata); + assert(!hpdata->h_in_psset); assert(((uintptr_t)addr & PAGE_MASK) == 
0); assert((sz & PAGE_MASK) == 0); size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata)) @@ -144,6 +147,7 @@ hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) { void hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { hpdata_assert_consistent(hpdata); + assert(!hpdata->h_in_psset); assert(!hpdata->h_mid_purge); assert(!hpdata->h_mid_hugify); hpdata->h_mid_purge = true; @@ -181,6 +185,7 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, * a consistent state. */ assert(hpdata->h_mid_purge); + assert(!hpdata->h_in_psset); /* Should have dehugified already (if necessary). */ assert(!hpdata->h_huge); assert(!hpdata->h_mid_hugify); @@ -210,6 +215,7 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { hpdata_assert_consistent(hpdata); + assert(!hpdata->h_in_psset); assert(hpdata->h_mid_purge); assert(!hpdata->h_mid_hugify); hpdata->h_mid_purge = false; @@ -230,6 +236,7 @@ hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { void hpdata_hugify_begin(hpdata_t *hpdata) { hpdata_assert_consistent(hpdata); + assert(!hpdata_in_psset_get(hpdata)); assert(!hpdata->h_mid_purge); assert(!hpdata->h_mid_hugify); hpdata->h_mid_hugify = true; @@ -242,6 +249,11 @@ hpdata_hugify_begin(hpdata_t *hpdata) { void hpdata_hugify_end(hpdata_t *hpdata) { hpdata_assert_consistent(hpdata); + /* + * This is the exception to the "no metadata tweaks while in the psset" + * rule. 
+ */ + /* assert(!hpdata_in_psset_get(hpdata)); */ assert(!hpdata->h_mid_purge); assert(hpdata->h_mid_hugify); hpdata->h_mid_hugify = false; @@ -251,30 +263,9 @@ hpdata_hugify_end(hpdata_t *hpdata) { void hpdata_dehugify(hpdata_t *hpdata) { hpdata_assert_consistent(hpdata); - /* - * These asserts are morally right; for now, though, we have the "purge a - * hugepage only in its entirety, when it becomes empty", path sharing - * hpdata_dehugify with the new purge pathway coming in the next - * commit. - */ - /* + assert(!hpdata_in_psset_get(hpdata)); assert(hpdata->h_mid_purge); assert(!hpdata->h_mid_hugify); - */ hpdata->h_huge = false; hpdata_assert_consistent(hpdata); } - -void -hpdata_purge(hpdata_t *hpdata) { - hpdata_assert_consistent(hpdata); - /* - * The hpdata must be empty; we don't (yet) support partial purges of - * hugepages. - */ - assert(hpdata->h_nactive == 0); - fb_unset_range(hpdata->dirty_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES); - fb_init(hpdata->dirty_pages, HUGEPAGE_PAGES); - hpdata->h_ndirty = 0; - hpdata_assert_consistent(hpdata); -} diff --git a/src/psset.c b/src/psset.c index 9fcdac22..688cd620 100644 --- a/src/psset.c +++ b/src/psset.c @@ -92,6 +92,8 @@ void psset_insert(psset_t *psset, hpdata_t *ps) { assert(!hpdata_empty(ps)); hpdata_assert_consistent(ps); + assert(!hpdata_in_psset_get(ps)); + hpdata_in_psset_set(ps, true); size_t longest_free_range = hpdata_longest_free_range_get(ps); if (longest_free_range == 0) { @@ -116,6 +118,9 @@ psset_insert(psset_t *psset, hpdata_t *ps) { void psset_remove(psset_t *psset, hpdata_t *ps) { hpdata_assert_consistent(ps); + assert(hpdata_in_psset_get(ps)); + hpdata_in_psset_set(ps, false); + size_t longest_free_range = hpdata_longest_free_range_get(ps); if (longest_free_range == 0) { diff --git a/test/unit/hpdata.c b/test/unit/hpdata.c index 2fd9a367..aa4506f7 100644 --- a/test/unit/hpdata.c +++ b/test/unit/hpdata.c @@ -169,6 +169,7 @@ TEST_BEGIN(test_hugify) { 
expect_false(hpdata_changing_state_get(&hpdata), ""); hpdata_hugify_begin(&hpdata); expect_true(hpdata_changing_state_get(&hpdata), ""); + hpdata_hugify_end(&hpdata); expect_false(hpdata_changing_state_get(&hpdata), "");