HPA: Purge across retained extents.

This lets us cut down on the number of expensive system calls we perform.
Author: David Goldblatt
Date:   2021-06-03 13:29:02 -07:00
Committer: David Goldblatt
Commit: 41fd56605e (parent 347523517b)
4 changed files with 160 additions and 25 deletions
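
The saving comes from coalescing purge ranges: two dirty runs separated only by already-purged (retained) pages can be handed to the kernel as a single range. A minimal standalone illustration, not jemalloc code; madvise() with MADV_DONTNEED stands in here for whichever purge primitive the build actually uses:

/*
 * Illustration only (not part of this commit): three hugepage-internal
 * pages, where page 1 was purged earlier (retained) and pages 0 and 2
 * are dirty.
 */
#include <stddef.h>
#include <sys/mman.h>

/* Before: one syscall per dirty run. */
static void
purge_per_run(char *base, size_t pagesize) {
    madvise(base, pagesize, MADV_DONTNEED);
    madvise(base + 2 * pagesize, pagesize, MADV_DONTNEED);
}

/*
 * After: a single syscall spanning the retained gap. Re-purging page 1
 * is harmless; it is already clean, so the kernel only updates its
 * bookkeeping.
 */
static void
purge_coalesced(char *base, size_t pagesize) {
    madvise(base, 3 * pagesize, MADV_DONTNEED);
}

Re-purging the retained page costs only kernel state tracking, which is cheap next to the per-call TLB-shootdown overhead that the merged call avoids.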

--- a/include/jemalloc/internal/hpdata.h
+++ b/include/jemalloc/internal/hpdata.h

@@ -110,7 +110,7 @@ struct hpdata_s {
 	 */
 	size_t h_ntouched;
-	/* The dirty pages (using the same definition as above). */
+	/* The touched pages (using the same definition as above). */
 	fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
 };
@@ -356,6 +356,7 @@ void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz);
 typedef struct hpdata_purge_state_s hpdata_purge_state_t;
 struct hpdata_purge_state_s {
 	size_t npurged;
+	size_t ndirty_to_purge;
 	fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)];
 	size_t next_purge_search_begin;
 };
@@ -372,7 +373,7 @@ struct hpdata_purge_state_s {
  * until you're done, and then end. Allocating out of an hpdata undergoing
  * purging is not allowed.
  *
- * Returns the number of pages that will be purged.
+ * Returns the number of dirty pages that will be purged.
  */
 size_t hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);

--- a/src/hpdata.c
+++ b/src/hpdata.c

@@ -166,33 +166,93 @@ hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
 size_t
 hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
 	hpdata_assert_consistent(hpdata);
-	/* See the comment in reserve. */
+	/*
+	 * See the comment below; we might purge any inactive extent, so it's
+	 * unsafe for any other thread to turn any inactive extent active while
+	 * we're operating on it.
+	 */
+	assert(!hpdata_alloc_allowed_get(hpdata));
 	purge_state->npurged = 0;
 	purge_state->next_purge_search_begin = 0;
 	/*
-	 * Initialize to_purge with everything that's not active but that is
-	 * dirty.
+	 * Initialize to_purge.
 	 *
-	 * As an optimization, we could note that in practice we never allocate
-	 * out of a hugepage while purging within it, and so could try to
-	 * combine dirty extents separated by a non-dirty but non-active extent
-	 * to avoid purge calls. This does nontrivially complicate metadata
-	 * tracking though, so let's hold off for now.
+	 * It's possible to end up in situations where two dirty extents are
+	 * separated by a retained extent:
+	 * - 1 page allocated.
+	 * - 1 page allocated.
+	 * - 1 page allocated.
+	 *
+	 * If the middle page is freed and purged, and then the first and third
+	 * pages are freed, and then another purge pass happens, the hpdata
+	 * looks like this:
+	 * - 1 page dirty.
+	 * - 1 page retained.
+	 * - 1 page dirty.
+	 *
+	 * But it's safe to do a single 3-page purge.
+	 *
+	 * We do this by first computing the dirty pages, and then filling in
+	 * any gaps by extending each range in the dirty bitmap until the next
+	 * active page. This purges more pages, but the expensive part of
+	 * purging is the TLB shootdowns, rather than the kernel state
+	 * tracking; doing a little bit more of the latter is fine if it saves
+	 * us from doing some of the former.
 	 */
-	fb_bit_not(purge_state->to_purge, hpdata->active_pages, HUGEPAGE_PAGES);
-	fb_bit_and(purge_state->to_purge, purge_state->to_purge,
-	    hpdata->touched_pages, HUGEPAGE_PAGES);
-	/* We purge everything we can. */
-	size_t to_purge = hpdata->h_ntouched - hpdata->h_nactive;
-	assert(to_purge == fb_scount(
-	    purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
+	/*
+	 * The dirty pages are those that are touched but not active. Note that
+	 * in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the
+	 * fb_group_t is 64 bits, so this is 64 bytes, spread across 8
+	 * fb_group_ts.
+	 */
+	fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
+	fb_init(dirty_pages, HUGEPAGE_PAGES);
+	fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES);
+	fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages,
+	    HUGEPAGE_PAGES);
+	fb_init(purge_state->to_purge, HUGEPAGE_PAGES);
+	size_t next_bit = 0;
+	while (next_bit < HUGEPAGE_PAGES) {
+		size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES,
+		    next_bit);
+		/* Recall that fb_ffs returns nbits if no set bit is found. */
+		if (next_dirty == HUGEPAGE_PAGES) {
+			break;
+		}
+		size_t next_active = fb_ffs(hpdata->active_pages,
+		    HUGEPAGE_PAGES, next_dirty);
+		/*
+		 * Don't purge past the end of the dirty extent, into retained
+		 * pages. This helps the kernel a tiny bit, but honestly it's
+		 * mostly helpful for testing (where we tend to write test
+		 * cases that think in terms of the dirty ranges).
+		 */
+		ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES,
+		    next_active - 1);
+		assert(last_dirty >= 0);
+		assert((size_t)last_dirty >= next_dirty);
+		assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES);
+		fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty,
+		    last_dirty - next_dirty + 1);
+		next_bit = next_active + 1;
+	}
+	/* We should purge, at least, everything dirty. */
+	size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive;
+	purge_state->ndirty_to_purge = ndirty;
+	assert(ndirty <= fb_scount(
+	    purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
+	assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0,
+	    HUGEPAGE_PAGES));
 	hpdata_assert_consistent(hpdata);
-	return to_purge;
+	return ndirty;
 }
 
 bool
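
The gap-filling loop in hpdata_purge_begin is easier to follow outside the fb_* machinery. Here is a standalone sketch of the same pass over a single 64-bit word standing in for the fb_group_t array; first_set and last_set are hypothetical stand-ins for fb_ffs and fb_fls:

/*
 * Standalone sketch (not jemalloc code): the coalescing pass from
 * hpdata_purge_begin, using one uint64_t in place of an fb_group_t array.
 */
#include <assert.h>
#include <stdint.h>

#define NPAGES 64

/* Index of the first set bit at or after 'start'; NPAGES if none. */
static unsigned
first_set(uint64_t bits, unsigned start) {
    for (unsigned i = start; i < NPAGES; i++) {
        if (bits & (1ULL << i)) {
            return i;
        }
    }
    return NPAGES; /* mirrors fb_ffs returning nbits when no bit is set */
}

/* Index of the last set bit at or before 'end'; caller guarantees one. */
static unsigned
last_set(uint64_t bits, unsigned end) {
    for (unsigned i = end + 1; i > 0; i--) {
        if (bits & (1ULL << (i - 1))) {
            return i - 1;
        }
    }
    assert(0 && "caller guarantees a set bit at or before end");
    return 0;
}

/*
 * Compute the purge mask: each maximal dirty run, extended over retained
 * pages up to (but not past) the last dirty page before the next active
 * page.
 */
static uint64_t
purge_mask(uint64_t active, uint64_t touched) {
    uint64_t dirty = touched & ~active; /* touched but not active */
    uint64_t to_purge = 0;
    unsigned next_bit = 0;
    while (next_bit < NPAGES) {
        unsigned next_dirty = first_set(dirty, next_bit);
        if (next_dirty == NPAGES) {
            break;
        }
        unsigned next_active = first_set(active, next_dirty);
        unsigned last_dirty = last_set(dirty, next_active - 1);
        for (unsigned i = next_dirty; i <= last_dirty; i++) {
            to_purge |= 1ULL << i;
        }
        next_bit = next_active + 1;
    }
    return to_purge;
}

int
main(void) {
    /* The scenario from the comment above: page 1 retained, pages 0 and
     * 2 dirty, nothing active: one 3-page purge, not two 1-page purges. */
    assert(purge_mask(0, (1ULL << 0) | (1ULL << 2)) == 0x7);
    /* A trailing retained run is not dragged in: only page 0 is dirty. */
    assert(purge_mask(0, 1ULL << 0) == 0x1);
    /* An active page in the gap splits the purge ranges again. */
    assert(purge_mask(1ULL << 1, 0x7) == 0x5);
    return 0;
}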
@@ -203,6 +263,7 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
 	 * hpdata without synchronization, and therefore have no right to expect
 	 * a consistent state.
 	 */
+	assert(!hpdata_alloc_allowed_get(hpdata));
 
 	if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) {
 		return false;
@@ -228,19 +289,21 @@
 void
 hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
+	assert(!hpdata_alloc_allowed_get(hpdata));
 	hpdata_assert_consistent(hpdata);
 	/* See the comment in reserve. */
 	assert(!hpdata->h_in_psset || hpdata->h_updating);
 	assert(purge_state->npurged == fb_scount(purge_state->to_purge,
 	    HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
+	assert(purge_state->npurged >= purge_state->ndirty_to_purge);
 	fb_bit_not(purge_state->to_purge, purge_state->to_purge,
 	    HUGEPAGE_PAGES);
 	fb_bit_and(hpdata->touched_pages, hpdata->touched_pages,
 	    purge_state->to_purge, HUGEPAGE_PAGES);
-	assert(hpdata->h_ntouched >= purge_state->npurged);
-	hpdata->h_ntouched -= purge_state->npurged;
+	assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge);
+	hpdata->h_ntouched -= purge_state->ndirty_to_purge;
 	hpdata_assert_consistent(hpdata);
 }

--- a/test/unit/hpdata.c
+++ b/test/unit/hpdata.c

@@ -67,6 +67,7 @@ TEST_BEGIN(test_purge_simple) {
 	expect_zu_eq(hpdata_ntouched_get(&hpdata), HUGEPAGE_PAGES / 2, "");
 
+	hpdata_alloc_allowed_set(&hpdata, false);
 	hpdata_purge_state_t purge_state;
 	size_t to_purge = hpdata_purge_begin(&hpdata, &purge_state);
 	expect_zu_eq(HUGEPAGE_PAGES / 4, to_purge, "");
@@ -90,11 +91,9 @@ TEST_BEGIN(test_purge_simple) {
 TEST_END
 
 /*
- * We only test intervening dalloc's not intervening allocs; we don't need
- * intervening allocs, and foreseeable optimizations will make them not just
- * unnecessary but incorrect. In particular, if there are two dirty extents
- * separated only by a retained extent, we can just purge the entire range,
- * saving a purge call.
+ * We only test intervening dalloc's, not intervening allocs; the latter are
+ * disallowed as a purging precondition (they would interfere with purging
+ * across a retained extent, which is what saves purge calls).
  */
 TEST_BEGIN(test_purge_intervening_dalloc) {
 	hpdata_t hpdata;
@@ -112,6 +111,7 @@ TEST_BEGIN(test_purge_intervening_dalloc) {
 	expect_zu_eq(hpdata_ntouched_get(&hpdata), 3 * HUGEPAGE_PAGES / 4, "");
 
+	hpdata_alloc_allowed_set(&hpdata, false);
 	hpdata_purge_state_t purge_state;
 	size_t to_purge = hpdata_purge_begin(&hpdata, &purge_state);
 	expect_zu_eq(HUGEPAGE_PAGES / 2, to_purge, "");
@@ -137,7 +137,7 @@ TEST_BEGIN(test_purge_intervening_dalloc) {
 	expect_ptr_eq(
 	    (void *)((uintptr_t)alloc + 2 * HUGEPAGE_PAGES / 4 * PAGE),
 	    purge_addr, "");
-	expect_zu_eq(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
+	expect_zu_ge(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
 	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
 	    &purge_size);
@@ -150,6 +150,74 @@ TEST_BEGIN(test_purge_intervening_dalloc) {
 }
 TEST_END
 
+TEST_BEGIN(test_purge_over_retained) {
+	void *purge_addr;
+	size_t purge_size;
+
+	hpdata_t hpdata;
+	hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE);
+
+	/* Allocate the first 3/4 of the pages. */
+	void *alloc = hpdata_reserve_alloc(&hpdata,
+	    3 * HUGEPAGE_PAGES / 4 * PAGE);
+	expect_ptr_eq(alloc, HPDATA_ADDR, "");
+
+	/* Free the second quarter. */
+	void *second_quarter =
+	    (void *)((uintptr_t)alloc + HUGEPAGE_PAGES / 4 * PAGE);
+	hpdata_unreserve(&hpdata, second_quarter, HUGEPAGE_PAGES / 4 * PAGE);
+
+	expect_zu_eq(hpdata_ntouched_get(&hpdata), 3 * HUGEPAGE_PAGES / 4, "");
+
+	/* Purge the second quarter. */
+	hpdata_alloc_allowed_set(&hpdata, false);
+	hpdata_purge_state_t purge_state;
+	size_t to_purge_dirty = hpdata_purge_begin(&hpdata, &purge_state);
+	expect_zu_eq(HUGEPAGE_PAGES / 4, to_purge_dirty, "");
+
+	bool got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_true(got_result, "");
+	expect_ptr_eq(second_quarter, purge_addr, "");
+	expect_zu_eq(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
+
+	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_false(got_result, "Unexpected additional purge range: "
+	    "extent at %p of size %zu", purge_addr, purge_size);
+	hpdata_purge_end(&hpdata, &purge_state);
+
+	expect_zu_eq(hpdata_ntouched_get(&hpdata), HUGEPAGE_PAGES / 2, "");
+
+	/* Free the first and third quarters. */
+	hpdata_unreserve(&hpdata, HPDATA_ADDR, HUGEPAGE_PAGES / 4 * PAGE);
+	hpdata_unreserve(&hpdata,
+	    (void *)((uintptr_t)alloc + 2 * HUGEPAGE_PAGES / 4 * PAGE),
+	    HUGEPAGE_PAGES / 4 * PAGE);
+
+	/*
+	 * Purge again. The second quarter is retained, so we can safely
+	 * re-purge it. We expect a single purge of 3/4 of the hugepage,
+	 * purging half its pages.
+	 */
+	to_purge_dirty = hpdata_purge_begin(&hpdata, &purge_state);
+	expect_zu_eq(HUGEPAGE_PAGES / 2, to_purge_dirty, "");
+
+	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_true(got_result, "");
+	expect_ptr_eq(HPDATA_ADDR, purge_addr, "");
+	expect_zu_eq(3 * HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
+
+	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_false(got_result, "Unexpected additional purge range: "
+	    "extent at %p of size %zu", purge_addr, purge_size);
+	hpdata_purge_end(&hpdata, &purge_state);
+
+	expect_zu_eq(hpdata_ntouched_get(&hpdata), 0, "");
+}
+TEST_END
+
 TEST_BEGIN(test_hugify) {
 	hpdata_t hpdata;
 	hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE);
@@ -171,5 +239,6 @@ int main(void) {
 	    test_reserve_alloc,
 	    test_purge_simple,
 	    test_purge_intervening_dalloc,
+	    test_purge_over_retained,
 	    test_hugify);
 }

--- a/test/unit/psset.c
+++ b/test/unit/psset.c

@@ -18,12 +18,14 @@ edata_init_test(edata_t *edata) {
 static void
 test_psset_fake_purge(hpdata_t *ps) {
 	hpdata_purge_state_t purge_state;
+	hpdata_alloc_allowed_set(ps, false);
 	hpdata_purge_begin(ps, &purge_state);
 	void *addr;
 	size_t size;
 	while (hpdata_purge_next(ps, &purge_state, &addr, &size)) {
 	}
 	hpdata_purge_end(ps, &purge_state);
+	hpdata_alloc_allowed_set(ps, true);
 }
 
 static void