HPA: Generalize purging.

Previously, we would purge a hugepage only when it's completely empty.  With
this change, we can purge even when only partially empty.  Although the
heuristic here is still fairly primitive, this infrastructure can scale to
become more advanced.
This commit is contained in:
David Goldblatt 2020-12-02 22:24:15 -08:00 committed by David Goldblatt
parent 70692cfb13
commit 30b9e8162b
5 changed files with 208 additions and 64 deletions

View File

@ -44,6 +44,9 @@ struct hpdata_s {
bool h_mid_purge;
bool h_mid_hugify;
/* Whether or not the hpdata is in the psset. */
bool h_in_psset;
union {
/* When nonempty, used by the psset bins. */
phn(hpdata_t) ph_link;
@ -115,6 +118,15 @@ hpdata_mid_hugify_get(const hpdata_t *hpdata) {
return hpdata->h_mid_hugify;
}
/* Whether the hpdata is currently in the psset. */
static inline bool
hpdata_in_psset_get(const hpdata_t *hpdata) {
	bool in_psset = hpdata->h_in_psset;
	return in_psset;
}
/*
 * Record whether the hpdata is in the psset; called by the psset on
 * insertion/removal so that membership can be asserted elsewhere.
 */
static inline void
hpdata_in_psset_set(hpdata_t *hpdata, bool in_psset) {
	hpdata->h_in_psset = in_psset;
}
static inline size_t
hpdata_longest_free_range_get(const hpdata_t *hpdata) {
@ -195,12 +207,6 @@ void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age);
void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz);
void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz);
/*
* Tell the hpdata (which should be empty) that all dirty pages in it have been
* purged.
*/
void hpdata_purge(hpdata_t *hpdata);
/*
* The hpdata_purge_prepare_t allows grabbing the metadata required to purge
* subranges of a hugepage while holding a lock, drop the lock during the actual
@ -247,6 +253,11 @@ void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);
 * Similarly, when hugifying, callers can do the metadata modifications while
* holding a lock (thereby setting the change_state field), but actually do the
* operation without blocking other threads.
*
* Unlike most metadata operations, hugification ending should happen while an
* hpdata is in the psset (or upcoming hugepage collections). This is because
* while purge/use races are unsafe, purge/hugepageify races are perfectly
* reasonable.
*/
void hpdata_hugify_begin(hpdata_t *hpdata);
void hpdata_hugify_end(hpdata_t *hpdata);

208
src/hpa.c
View File

@ -124,32 +124,26 @@ hpa_should_hugify(hpa_shard_t *shard, hpdata_t *ps) {
* For now, just use a static check; hugify a page if it's <= 5%
* inactive. Eventually, this should be a malloc conf option.
*/
if (hpdata_changing_state_get(ps)) {
return false;
}
return !hpdata_huge_get(ps)
&& hpdata_nactive_get(ps) >= (HUGEPAGE_PAGES) * 95 / 100;
}
/* Returns true on error. */
static void
hpa_hugify(hpdata_t *ps) {
assert(hpdata_huge_get(ps));
bool err = pages_huge(hpdata_addr_get(ps), HUGEPAGE);
/*
* Eat the error; even if the hugification failed, it's still safe to
* pretend it didn't (and would require extraordinary measures to
* unhugify).
*/
(void)err;
}
static void
hpa_dehugify(hpdata_t *ps) {
/* Purge, then dehugify while unbacked. */
pages_purge_forced(hpdata_addr_get(ps), HUGEPAGE);
pages_nohuge(hpdata_addr_get(ps), HUGEPAGE);
/* Update metadata. */
hpdata_dehugify(ps);
hpdata_purge(ps);
/*
 * Whether or not the given pageslab meets the criteria for being purged (and,
 * if necessary, dehugified).
 */
static bool
hpa_should_purge(hpa_shard_t *shard, hpdata_t *ps) {
	/* As with hugification, skip a pageslab that's mid state-change. */
	if (hpdata_changing_state_get(ps)) {
		return false;
	}
	/*
	 * NOTE(review): this subtraction assumes ndirty always counts the
	 * active pages too (so it can't underflow) -- confirm against hpdata.
	 */
	size_t purgeable = hpdata_ndirty_get(ps) - hpdata_nactive_get(ps);
	/* Purge once more than a quarter of the hugepage is reclaimable... */
	if (purgeable > HUGEPAGE_PAGES * 25 / 100) {
		return true;
	}
	/* ...or when the slab is empty but still has anything dirty at all. */
	return hpdata_empty(ps) && purgeable > 0;
}
static hpdata_t *
@ -226,9 +220,65 @@ hpa_grow(tsdn_t *tsdn, hpa_shard_t *shard) {
}
/*
* The psset does not hold empty slabs. Upon becoming empty, then, we need to
* put them somewhere. We take this as an opportunity to purge, and retain
* their address space in a list outside the psset.
* As a precondition, ps should not be in the psset (we can handle deallocation
* races, but not allocation ones), and we should hold the shard mutex.
*/
/*
 * Repeatedly purges (and, on the first pass, dehugifies) ps until
 * hpa_should_purge() is no longer satisfied.  The shard mutex is dropped
 * around the actual OS calls and reacquired for the metadata updates.
 */
static void
hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	while (hpa_should_purge(shard, ps)) {
		/* Do the metadata update bit while holding the lock. */
		hpdata_purge_state_t purge_state;
		hpdata_purge_begin(ps, &purge_state);
		/*
		 * Dehugifying can only happen on the first loop iteration,
		 * since no other threads can allocate out of this ps while
		 * we're purging (and thus, can't hugify it), but there's not a
		 * natural way to express that in the control flow.
		 */
		bool needs_dehugify = false;
		if (hpdata_huge_get(ps)) {
			needs_dehugify = true;
			hpdata_dehugify(ps);
		}
		/* Drop the lock to do the OS calls. */
		malloc_mutex_unlock(tsdn, &shard->mtx);
		if (needs_dehugify) {
			pages_nohuge(hpdata_addr_get(ps), HUGEPAGE);
		}
		size_t total_purged = 0;
		void *purge_addr;
		size_t purge_size;
		/* Walk the dirty extents captured in purge_state, unlocked. */
		while (hpdata_purge_next(ps, &purge_state, &purge_addr,
		    &purge_size)) {
			pages_purge_forced(purge_addr, purge_size);
			total_purged += purge_size;
		}
		/* Reacquire to finish our metadata update. */
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpdata_purge_end(ps, &purge_state);
		/* A single pass can never purge more than one hugepage. */
		assert(total_purged <= HUGEPAGE);
		/*
		 * We're not done here; other threads can't allocate out of ps
		 * while purging, but they can still deallocate.  Those
		 * deallocations could have meant more purging than what we
		 * planned ought to happen.  We have to re-check now that we've
		 * reacquired the mutex again.
		 */
	}
}
/*
* Does the metadata tracking associated with a page slab becoming empty. The
* psset doesn't hold empty pageslabs, but we do want address space reuse, so we
* track these pages outside the psset.
*/
static void
hpa_handle_ps_eviction(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
@ -239,12 +289,6 @@ hpa_handle_ps_eviction(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
malloc_mutex_assert_not_owner(tsdn, &shard->grow_mtx);
/*
* We do this unconditionally, even for pages which were not originally
* hugified; it has the same effect.
*/
hpa_dehugify(ps);
malloc_mutex_lock(tsdn, &shard->grow_mtx);
shard->nevictions++;
hpdata_list_prepend(&shard->unused_slabs, ps);
@ -291,6 +335,11 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom)
if (err) {
hpdata_unreserve(ps, edata_addr_get(edata),
edata_size_get(edata));
/*
* We should arguably reset dirty state here, but this would
* require some sort of prepare + commit functionality that's a
* little much to deal with for now.
*/
psset_insert(&shard->psset, ps);
edata_cache_small_put(tsdn, &shard->ecs, edata);
malloc_mutex_unlock(tsdn, &shard->mtx);
@ -318,9 +367,26 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom)
* on this page slab, but also operations any other alloc/dalloc
* operations in this hpa shard.
*/
hpa_hugify(ps);
bool err = pages_huge(hpdata_addr_get(ps), HUGEPAGE);
/*
* Pretending we succeed when we actually failed is safe; trying
 * to roll back would be tricky, though. Eat the error.
*/
(void)err;
malloc_mutex_lock(tsdn, &shard->mtx);
hpdata_hugify_end(ps);
if (hpa_should_purge(shard, ps)) {
/*
* There was a race in which the ps went from being
* almost full to having lots of free space while we
* hugified. Undo our operation, taking care to meet
* the precondition that the ps isn't in the psset.
*/
psset_remove(&shard->psset, ps);
hpa_purge(tsdn, shard, ps);
psset_insert(&shard->psset, ps);
}
malloc_mutex_unlock(tsdn, &shard->mtx);
}
return edata;
@ -383,11 +449,28 @@ hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) {
if (err) {
hpdata_unreserve(ps, edata_addr_get(edata),
edata_size_get(edata));
edata_cache_small_put(tsdn, &shard->ecs, edata);
shard->nevictions++;
malloc_mutex_unlock(tsdn, &shard->mtx);
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
/* We'll do a fake purge; the pages weren't really touched. */
hpdata_purge_state_t purge_state;
void *purge_addr;
size_t purge_size;
hpdata_purge_begin(ps, &purge_state);
bool found_extent = hpdata_purge_next(ps, &purge_state,
&purge_addr, &purge_size);
assert(found_extent);
assert(purge_addr == addr);
assert(purge_size == size);
found_extent = hpdata_purge_next(ps, &purge_state,
&purge_addr, &purge_size);
assert(!found_extent);
hpdata_purge_end(ps, &purge_state);
hpa_handle_ps_eviction(tsdn, shard, ps);
return NULL;
}
@ -475,13 +558,66 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
/* Currently, all edatas come from pageslabs. */
assert(ps != NULL);
emap_deregister_boundary(tsdn, shard->emap, edata);
/*
* Note that the shard mutex protects ps's metadata too; it wouldn't be
* correct to try to read most information out of it without the lock.
*/
malloc_mutex_lock(tsdn, &shard->mtx);
/* Note that the shard mutex protects ps's metadata too. */
psset_remove(&shard->psset, ps);
hpdata_unreserve(ps, edata_addr_get(edata), edata_size_get(edata));
/*
* Release the metadata early, to avoid having to remember to do it
* while we're also doing tricky purging logic.
*/
void *unreserve_addr = edata_addr_get(edata);
size_t unreserve_size = edata_size_get(edata);
edata_cache_small_put(tsdn, &shard->ecs, edata);
/*
* We have three rules interacting here:
* - You can't update ps metadata while it's still in the psset. We
* enforce this because it's necessary for stats tracking and metadata
* management.
* - The ps must not be in the psset while purging. This is because we
* can't handle purge/alloc races.
* - Whoever removes the ps from the psset is the one to reinsert it (or
* to pass it to hpa_handle_ps_eviction upon emptying). This keeps
* responsibility tracking simple.
*/
if (hpdata_mid_purge_get(ps)) {
/*
* Another thread started purging, and so the ps is not in the
* psset and we can do our metadata update. The other thread is
* in charge of reinserting the ps, so we're done.
*/
assert(!hpdata_in_psset_get(ps));
hpdata_unreserve(ps, unreserve_addr, unreserve_size);
malloc_mutex_unlock(tsdn, &shard->mtx);
return;
}
/*
* No other thread is purging, and the ps is non-empty, so it should be
* in the psset.
*/
assert(hpdata_in_psset_get(ps));
psset_remove(&shard->psset, ps);
hpdata_unreserve(ps, unreserve_addr, unreserve_size);
if (!hpa_should_purge(shard, ps)) {
/*
* This should be the common case; no other thread is purging,
* and we won't purge either.
*/
psset_insert(&shard->psset, ps);
malloc_mutex_unlock(tsdn, &shard->mtx);
return;
}
/* It's our job to purge. */
hpa_purge(tsdn, shard, ps);
/*
* OK, the hpdata is as purged as we want it to be, and it's going back
* into the psset (if nonempty) or getting evicted (if empty).
*/
if (hpdata_empty(ps)) {
malloc_mutex_unlock(tsdn, &shard->mtx);
hpa_handle_ps_eviction(tsdn, shard, ps);

View File

@ -24,6 +24,7 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) {
hpdata->h_huge = false;
hpdata->h_mid_purge = false;
hpdata->h_mid_hugify = false;
hpdata->h_in_psset = false;
hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES);
hpdata->h_nactive = 0;
fb_init(hpdata->active_pages, HUGEPAGE_PAGES);
@ -36,6 +37,7 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) {
void *
hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) {
hpdata_assert_consistent(hpdata);
assert(!hpdata_in_psset_get(hpdata));
assert((sz & PAGE_MASK) == 0);
size_t npages = sz >> LG_PAGE;
assert(npages <= hpdata_longest_free_range_get(hpdata));
@ -116,6 +118,7 @@ hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) {
void
hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
hpdata_assert_consistent(hpdata);
assert(!hpdata->h_in_psset);
assert(((uintptr_t)addr & PAGE_MASK) == 0);
assert((sz & PAGE_MASK) == 0);
size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata))
@ -144,6 +147,7 @@ hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
void
hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
hpdata_assert_consistent(hpdata);
assert(!hpdata->h_in_psset);
assert(!hpdata->h_mid_purge);
assert(!hpdata->h_mid_hugify);
hpdata->h_mid_purge = true;
@ -181,6 +185,7 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
* a consistent state.
*/
assert(hpdata->h_mid_purge);
assert(!hpdata->h_in_psset);
/* Should have dehugified already (if necessary). */
assert(!hpdata->h_huge);
assert(!hpdata->h_mid_hugify);
@ -210,6 +215,7 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
void
hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
hpdata_assert_consistent(hpdata);
assert(!hpdata->h_in_psset);
assert(hpdata->h_mid_purge);
assert(!hpdata->h_mid_hugify);
hpdata->h_mid_purge = false;
@ -230,6 +236,7 @@ hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
void
hpdata_hugify_begin(hpdata_t *hpdata) {
hpdata_assert_consistent(hpdata);
assert(!hpdata_in_psset_get(hpdata));
assert(!hpdata->h_mid_purge);
assert(!hpdata->h_mid_hugify);
hpdata->h_mid_hugify = true;
@ -242,6 +249,11 @@ hpdata_hugify_begin(hpdata_t *hpdata) {
void
hpdata_hugify_end(hpdata_t *hpdata) {
hpdata_assert_consistent(hpdata);
/*
* This is the exception to the "no metadata tweaks while in the psset"
* rule.
*/
/* assert(!hpdata_in_psset_get(hpdata)); */
assert(!hpdata->h_mid_purge);
assert(hpdata->h_mid_hugify);
hpdata->h_mid_hugify = false;
@ -251,30 +263,9 @@ hpdata_hugify_end(hpdata_t *hpdata) {
/*
 * Drops the hpdata's claim to being hugepage-backed.  This only flips the
 * metadata flag; the corresponding OS call (pages_nohuge) is the caller's
 * responsibility.
 */
void
hpdata_dehugify(hpdata_t *hpdata) {
	hpdata_assert_consistent(hpdata);
	/*
	 * These asserts are morally right; for now, though, the "purge a
	 * hugepage only in its entirety, when it becomes empty" path shares
	 * hpdata_dehugify with the new purge pathway coming in the next
	 * commit, so they have to stay disabled:
	 */
	/*
	assert(!hpdata_in_psset_get(hpdata));
	assert(hpdata->h_mid_purge);
	assert(!hpdata->h_mid_hugify);
	*/
	hpdata->h_huge = false;
	hpdata_assert_consistent(hpdata);
}
/*
 * Tells the hpdata (which must be completely empty) that all of its dirty
 * pages have been purged: the dirty-page bitmap and count are reset.
 */
void
hpdata_purge(hpdata_t *hpdata) {
	hpdata_assert_consistent(hpdata);
	/*
	 * The hpdata must be empty; we don't (yet) support partial purges of
	 * hugepages.
	 */
	assert(hpdata->h_nactive == 0);
	/*
	 * NOTE(review): fb_unset_range over the whole bitmap followed by
	 * fb_init looks redundant -- presumably fb_init already clears it.
	 * This may be a diff artifact; confirm which of the two is intended.
	 */
	fb_unset_range(hpdata->dirty_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
	fb_init(hpdata->dirty_pages, HUGEPAGE_PAGES);
	hpdata->h_ndirty = 0;
	hpdata_assert_consistent(hpdata);
}

View File

@ -92,6 +92,8 @@ void
psset_insert(psset_t *psset, hpdata_t *ps) {
assert(!hpdata_empty(ps));
hpdata_assert_consistent(ps);
assert(!hpdata_in_psset_get(ps));
hpdata_in_psset_set(ps, true);
size_t longest_free_range = hpdata_longest_free_range_get(ps);
if (longest_free_range == 0) {
@ -116,6 +118,9 @@ psset_insert(psset_t *psset, hpdata_t *ps) {
void
psset_remove(psset_t *psset, hpdata_t *ps) {
hpdata_assert_consistent(ps);
assert(hpdata_in_psset_get(ps));
hpdata_in_psset_set(ps, false);
size_t longest_free_range = hpdata_longest_free_range_get(ps);
if (longest_free_range == 0) {

View File

@ -169,6 +169,7 @@ TEST_BEGIN(test_hugify) {
expect_false(hpdata_changing_state_get(&hpdata), "");
hpdata_hugify_begin(&hpdata);
expect_true(hpdata_changing_state_get(&hpdata), "");
hpdata_hugify_end(&hpdata);
expect_false(hpdata_changing_state_get(&hpdata), "");