diff --git a/include/jemalloc/internal/hpdata.h b/include/jemalloc/internal/hpdata.h
index 4ff2e575..32e26248 100644
--- a/include/jemalloc/internal/hpdata.h
+++ b/include/jemalloc/internal/hpdata.h
@@ -110,7 +110,7 @@ struct hpdata_s {
 	 */
 	size_t h_ntouched;
 
-	/* The dirty pages (using the same definition as above). */
+	/* The touched pages (using the same definition as above). */
 	fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
 };
 
@@ -356,6 +356,7 @@ void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz);
 typedef struct hpdata_purge_state_s hpdata_purge_state_t;
 struct hpdata_purge_state_s {
 	size_t npurged;
+	size_t ndirty_to_purge;
 	fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)];
 	size_t next_purge_search_begin;
 };
@@ -372,7 +373,7 @@ struct hpdata_purge_state_s {
  * until you're done, and then end.  Allocating out of an hpdata undergoing
  * purging is not allowed.
  *
- * Returns the number of pages that will be purged.
+ * Returns the number of dirty pages that will be purged.
  */
 size_t hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);
 
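For orientation before the implementation below: the protocol these declarations document is "forbid allocation, begin, drain next until it returns false, end". Here is a minimal caller-side sketch of that sequence (it mirrors test_psset_fake_purge at the end of this patch); the purge_all_dirty() wrapper and its direct madvise() call are illustrative assumptions, not part of the hpdata API:

#include <sys/mman.h>

#include "jemalloc/internal/hpdata.h"

/* Illustrative only: hand every purgeable range in `hpdata` back to the OS. */
static void
purge_all_dirty(hpdata_t *hpdata) {
	/* Purging requires that no allocation can race with us. */
	hpdata_alloc_allowed_set(hpdata, false);

	hpdata_purge_state_t purge_state;
	hpdata_purge_begin(hpdata, &purge_state);

	void *addr;
	size_t size;
	/* Each iteration yields one contiguous range to release. */
	while (hpdata_purge_next(hpdata, &purge_state, &addr, &size)) {
		madvise(addr, size, MADV_DONTNEED);
	}
	hpdata_purge_end(hpdata, &purge_state);

	hpdata_alloc_allowed_set(hpdata, true);
}

Note that hpdata_purge_begin returns the number of dirty pages, not the full sweep size; after this patch, a pass may release more pages than that once it coalesces across retained gaps.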
diff --git a/src/hpdata.c b/src/hpdata.c
index b861e9e4..18519be3 100644
--- a/src/hpdata.c
+++ b/src/hpdata.c
@@ -166,33 +166,93 @@ hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
 size_t
 hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
 	hpdata_assert_consistent(hpdata);
-	/* See the comment in reserve. */
+	/*
+	 * See the comment below; we might purge any inactive extent, so it's
+	 * unsafe for any other thread to turn any inactive extent active while
+	 * we're operating on it.
+	 */
+	assert(!hpdata_alloc_allowed_get(hpdata));
 
 	purge_state->npurged = 0;
 	purge_state->next_purge_search_begin = 0;
 
 	/*
-	 * Initialize to_purge with everything that's not active but that is
-	 * dirty.
+	 * Initialize to_purge.
 	 *
-	 * As an optimization, we could note that in practice we never allocate
-	 * out of a hugepage while purging within it, and so could try to
-	 * combine dirty extents separated by a non-dirty but non-active extent
-	 * to avoid purge calls.  This does nontrivially complicate metadata
-	 * tracking though, so let's hold off for now.
+	 * It's possible to end up in situations where two dirty extents are
+	 * separated by a retained extent:
+	 * - 1 page allocated.
+	 * - 1 page allocated.
+	 * - 1 page allocated.
+	 *
+	 * If the middle page is freed and purged, and then the first and third
+	 * pages are freed, and then another purge pass happens, the hpdata
+	 * looks like this:
+	 * - 1 page dirty.
+	 * - 1 page retained.
+	 * - 1 page dirty.
+	 *
+	 * But it's safe to do a single 3-page purge.
+	 *
+	 * We do this by first computing the dirty pages, and then filling in
+	 * any gaps by extending each range in the dirty bitmap until
+	 * the next active page.  This purges more pages, but the expensive part
+	 * of purging is the TLB shootdowns, rather than the kernel state
+	 * tracking; doing a little bit more of the latter is fine if it saves
+	 * us from doing some of the former.
 	 */
-	fb_bit_not(purge_state->to_purge, hpdata->active_pages, HUGEPAGE_PAGES);
-	fb_bit_and(purge_state->to_purge, purge_state->to_purge,
-	    hpdata->touched_pages, HUGEPAGE_PAGES);
-
-	/* We purge everything we can. */
-	size_t to_purge = hpdata->h_ntouched - hpdata->h_nactive;
-	assert(to_purge == fb_scount(
+	/*
+	 * The dirty pages are those that are touched but not active.  Note that
+	 * in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the
+	 * fb_group_t is 64 bits, so this is 64 bytes, spread across 8
+	 * fb_group_ts.
+	 */
+	fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
+	fb_init(dirty_pages, HUGEPAGE_PAGES);
+	fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES);
+	fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages,
+	    HUGEPAGE_PAGES);
+
+	fb_init(purge_state->to_purge, HUGEPAGE_PAGES);
+	size_t next_bit = 0;
+	while (next_bit < HUGEPAGE_PAGES) {
+		size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES,
+		    next_bit);
+		/* Recall that fb_ffs returns nbits if no set bit is found. */
+		if (next_dirty == HUGEPAGE_PAGES) {
+			break;
+		}
+		size_t next_active = fb_ffs(hpdata->active_pages,
+		    HUGEPAGE_PAGES, next_dirty);
+		/*
+		 * Don't purge past the end of the dirty extent, into retained
+		 * pages.  This helps the kernel a tiny bit, but honestly it's
+		 * mostly helpful for testing (where we tend to write test cases
+		 * that think in terms of the dirty ranges).
+		 */
+		ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES,
+		    next_active - 1);
+		assert(last_dirty >= 0);
+		assert((size_t)last_dirty >= next_dirty);
+		assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES);
+
+		fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty,
+		    last_dirty - next_dirty + 1);
+		next_bit = next_active + 1;
+	}
+
+	/* We should purge, at least, everything dirty. */
+	size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive;
+	purge_state->ndirty_to_purge = ndirty;
+	assert(ndirty <= fb_scount(
 	    purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
+	assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0,
+	    HUGEPAGE_PAGES));
 
 	hpdata_assert_consistent(hpdata);
 
-	return to_purge;
+	return ndirty;
 }
 
 bool
@@ -203,6 +263,7 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
 	 * hpdata without synchronization, and therefore have no right to expect
 	 * a consistent state.
 	 */
+	assert(!hpdata_alloc_allowed_get(hpdata));
 
 	if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) {
 		return false;
@@ -228,19 +289,21 @@ hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
 
 void
 hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
+	assert(!hpdata_alloc_allowed_get(hpdata));
 	hpdata_assert_consistent(hpdata);
 	/* See the comment in reserve. */
 	assert(!hpdata->h_in_psset || hpdata->h_updating);
 
 	assert(purge_state->npurged == fb_scount(purge_state->to_purge,
 	    HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
+	assert(purge_state->npurged >= purge_state->ndirty_to_purge);
 
 	fb_bit_not(purge_state->to_purge, purge_state->to_purge,
 	    HUGEPAGE_PAGES);
 	fb_bit_and(hpdata->touched_pages, hpdata->touched_pages,
 	    purge_state->to_purge, HUGEPAGE_PAGES);
-	assert(hpdata->h_ntouched >= purge_state->npurged);
-	hpdata->h_ntouched -= purge_state->npurged;
+	assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge);
+	hpdata->h_ntouched -= purge_state->ndirty_to_purge;
 
 	hpdata_assert_consistent(hpdata);
 }
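The gap-filling loop in hpdata_purge_begin above is the heart of the change: compute dirty = touched & ~active, then widen each dirty run rightward, stopping at the next active page and trimming any retained tail. Below is a self-contained sketch of the same idea on a single 64-bit bitmap; ffs_from() and fls_upto() are made-up stand-ins for jemalloc's fb_ffs/fb_fls, and the 64-page "hugepage" is a toy size chosen for illustration.

#include <stdint.h>
#include <stdio.h>

/* First set bit at or after `start`; returns 64 if none (like fb_ffs). */
static unsigned
ffs_from(uint64_t bits, unsigned start) {
	uint64_t masked = bits >> start << start;
	return masked == 0 ? 64 : (unsigned)__builtin_ctzll(masked);
}

/* Last set bit at or before `end`; returns -1 if none (like fb_fls). */
static int
fls_upto(uint64_t bits, unsigned end) {
	uint64_t masked = bits & (end >= 63 ? ~0ULL : (1ULL << (end + 1)) - 1);
	return masked == 0 ? -1 : 63 - __builtin_clzll(masked);
}

int
main(void) {
	/* Page 0 dirty, 1 active, 2 dirty, 3 retained, 4 dirty. */
	uint64_t active = 1ULL << 1;
	uint64_t dirty = 1ULL << 0 | 1ULL << 2 | 1ULL << 4;

	uint64_t to_purge = 0;
	unsigned next_bit = 0;
	while (next_bit < 64) {
		unsigned next_dirty = ffs_from(dirty, next_bit);
		if (next_dirty == 64) {
			break;
		}
		/*
		 * Extend the run up to the next active page, then trim any
		 * trailing retained pages off the end of the run.
		 */
		unsigned next_active = ffs_from(active, next_dirty);
		int last_dirty = fls_upto(dirty, next_active - 1);
		for (int i = (int)next_dirty; i <= last_dirty; i++) {
			to_purge |= 1ULL << i;
		}
		next_bit = next_active + 1;
	}
	printf("to_purge = %#llx\n", (unsigned long long)to_purge);
	return 0;
}

Run it and to_purge comes out 0x1d: page 0 is purged alone (the active page 1 blocks extension), while pages 2-4 coalesce into a single range even though page 3 is only retained; one purge call instead of two.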
diff --git a/test/unit/hpdata.c b/test/unit/hpdata.c
index 11bccc58..288e71d4 100644
--- a/test/unit/hpdata.c
+++ b/test/unit/hpdata.c
@@ -67,6 +67,7 @@ TEST_BEGIN(test_purge_simple) {
 
 	expect_zu_eq(hpdata_ntouched_get(&hpdata), HUGEPAGE_PAGES / 2, "");
 
+	hpdata_alloc_allowed_set(&hpdata, false);
 	hpdata_purge_state_t purge_state;
 	size_t to_purge = hpdata_purge_begin(&hpdata, &purge_state);
 	expect_zu_eq(HUGEPAGE_PAGES / 4, to_purge, "");
@@ -90,11 +91,9 @@ TEST_BEGIN(test_purge_simple) {
 TEST_END
 
 /*
- * We only test intervening dalloc's not intervening allocs; we don't need
- * intervening allocs, and foreseeable optimizations will make them not just
- * unnecessary but incorrect.  In particular, if there are two dirty extents
- * separated only by a retained extent, we can just purge the entire range,
- * saving a purge call.
+ * We only test intervening dallocs, not intervening allocs; the latter are
+ * disallowed as a purging precondition (they would interfere with purging
+ * across a retained extent, which is what saves us a purge call).
  */
 TEST_BEGIN(test_purge_intervening_dalloc) {
 	hpdata_t hpdata;
@@ -112,6 +111,7 @@ TEST_BEGIN(test_purge_intervening_dalloc) {
 
 	expect_zu_eq(hpdata_ntouched_get(&hpdata), 3 * HUGEPAGE_PAGES / 4, "");
 
+	hpdata_alloc_allowed_set(&hpdata, false);
 	hpdata_purge_state_t purge_state;
 	size_t to_purge = hpdata_purge_begin(&hpdata, &purge_state);
 	expect_zu_eq(HUGEPAGE_PAGES / 2, to_purge, "");
@@ -137,7 +137,7 @@ TEST_BEGIN(test_purge_intervening_dalloc) {
 	expect_ptr_eq(
 	    (void *)((uintptr_t)alloc + 2 * HUGEPAGE_PAGES / 4 * PAGE),
 	    purge_addr, "");
-	expect_zu_eq(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
+	expect_zu_ge(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
 
 	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
 	    &purge_size);
@@ -150,6 +150,74 @@ TEST_BEGIN(test_purge_intervening_dalloc) {
 }
 TEST_END
 
+TEST_BEGIN(test_purge_over_retained) {
+	void *purge_addr;
+	size_t purge_size;
+
+	hpdata_t hpdata;
+	hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE);
+
+	/* Allocate the first 3/4 of the pages. */
+	void *alloc = hpdata_reserve_alloc(&hpdata, 3 * HUGEPAGE_PAGES / 4 * PAGE);
+	expect_ptr_eq(alloc, HPDATA_ADDR, "");
+
+	/* Free the second quarter. */
+	void *second_quarter =
+	    (void *)((uintptr_t)alloc + HUGEPAGE_PAGES / 4 * PAGE);
+	hpdata_unreserve(&hpdata, second_quarter, HUGEPAGE_PAGES / 4 * PAGE);
+
+	expect_zu_eq(hpdata_ntouched_get(&hpdata), 3 * HUGEPAGE_PAGES / 4, "");
+
+	/* Purge the second quarter. */
+	hpdata_alloc_allowed_set(&hpdata, false);
+	hpdata_purge_state_t purge_state;
+	size_t to_purge_dirty = hpdata_purge_begin(&hpdata, &purge_state);
+	expect_zu_eq(HUGEPAGE_PAGES / 4, to_purge_dirty, "");
+
+	bool got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_true(got_result, "");
+	expect_ptr_eq(second_quarter, purge_addr, "");
+	expect_zu_eq(HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
+
+	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_false(got_result, "Unexpected additional purge range: "
+	    "extent at %p of size %zu", purge_addr, purge_size);
+	hpdata_purge_end(&hpdata, &purge_state);
+
+	expect_zu_eq(hpdata_ntouched_get(&hpdata), HUGEPAGE_PAGES / 2, "");
+
+	/* Free the first and third quarters. */
+	hpdata_unreserve(&hpdata, HPDATA_ADDR, HUGEPAGE_PAGES / 4 * PAGE);
+	hpdata_unreserve(&hpdata,
+	    (void *)((uintptr_t)alloc + 2 * HUGEPAGE_PAGES / 4 * PAGE),
+	    HUGEPAGE_PAGES / 4 * PAGE);
+
+	/*
+	 * Purge again.  The second quarter is retained, so we can safely
+	 * re-purge it.  We expect a single purge of 3/4 of the hugepage,
+	 * purging half its pages.
+	 */
+	to_purge_dirty = hpdata_purge_begin(&hpdata, &purge_state);
+	expect_zu_eq(HUGEPAGE_PAGES / 2, to_purge_dirty, "");
+
+	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_true(got_result, "");
+	expect_ptr_eq(HPDATA_ADDR, purge_addr, "");
+	expect_zu_eq(3 * HUGEPAGE_PAGES / 4 * PAGE, purge_size, "");
+
+	got_result = hpdata_purge_next(&hpdata, &purge_state, &purge_addr,
+	    &purge_size);
+	expect_false(got_result, "Unexpected additional purge range: "
+	    "extent at %p of size %zu", purge_addr, purge_size);
+	hpdata_purge_end(&hpdata, &purge_state);
+
+	expect_zu_eq(hpdata_ntouched_get(&hpdata), 0, "");
+}
+TEST_END
+
 TEST_BEGIN(test_hugify) {
 	hpdata_t hpdata;
 	hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE);
@@ -171,5 +239,6 @@ int main(void) {
 	    test_reserve_alloc,
 	    test_purge_simple,
 	    test_purge_intervening_dalloc,
+	    test_purge_over_retained,
 	    test_hugify);
 }
diff --git a/test/unit/psset.c b/test/unit/psset.c
index fde403e1..7bce7c1b 100644
--- a/test/unit/psset.c
+++ b/test/unit/psset.c
@@ -18,12 +18,14 @@ edata_init_test(edata_t *edata) {
 static void
 test_psset_fake_purge(hpdata_t *ps) {
 	hpdata_purge_state_t purge_state;
+	hpdata_alloc_allowed_set(ps, false);
 	hpdata_purge_begin(ps, &purge_state);
 	void *addr;
 	size_t size;
 	while (hpdata_purge_next(ps, &purge_state, &addr, &size)) {
 	}
 	hpdata_purge_end(ps, &purge_state);
+	hpdata_alloc_allowed_set(ps, true);
 }
 
 static void
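One subtlety worth calling out in the hpdata_purge_end changes: npurged counts every page handed back to the OS, while the new ndirty_to_purge counts only the formerly dirty ones, and h_ntouched must shrink by the latter, since the retained pages swept over were not touched to begin with. A toy consistency check of that bookkeeping (the counts are made up for illustration):

#include <assert.h>
#include <stddef.h>

int
main(void) {
	/*
	 * Toy hpdata counters for a 512-page hugepage: 128 pages active,
	 * 256 touched (so 128 dirty), with a 64-page retained gap sitting
	 * between two dirty runs.
	 */
	size_t h_nactive = 128;
	size_t h_ntouched = 256;

	size_t ndirty_to_purge = h_ntouched - h_nactive; /* 128 */
	size_t npurged = ndirty_to_purge + 64; /* swept the retained gap too */

	/* The invariants asserted in hpdata_purge_end. */
	assert(npurged >= ndirty_to_purge);
	assert(h_ntouched >= ndirty_to_purge);

	/* Only formerly-dirty pages leave the touched count. */
	h_ntouched -= ndirty_to_purge;
	assert(h_ntouched == h_nactive);
	return 0;
}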