From ce9386370ad67d4b12dc167600080fe17fcf3113 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Wed, 20 Jan 2021 14:55:42 -0800
Subject: [PATCH] HPA: Implement batch allocation.

---
 src/hpa.c       | 191 ++++++++++++++++++++++++------------------------
 test/unit/hpa.c |  74 ++++++++++++++++++-
 2 files changed, 168 insertions(+), 97 deletions(-)

diff --git a/src/hpa.c b/src/hpa.c
index 338d5759..0e9b152a 100644
--- a/src/hpa.c
+++ b/src/hpa.c
@@ -10,6 +10,8 @@
 
 static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
     size_t alignment, bool zero);
+static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
+    size_t nallocs, edata_list_active_t *results);
 static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
     size_t old_size, size_t new_size, bool zero);
 static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
@@ -425,13 +427,11 @@ hpa_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
 }
 
 static edata_t *
-hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom) {
+hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
+    bool *oom) {
 	bool err;
-	malloc_mutex_lock(tsdn, &shard->mtx);
 	edata_t *edata = edata_cache_small_get(tsdn, &shard->ecs);
-	*oom = false;
 	if (edata == NULL) {
-		malloc_mutex_unlock(tsdn, &shard->mtx);
 		*oom = true;
 		return NULL;
 	}
@@ -440,7 +440,6 @@ hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom)
 
 	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
 	if (ps == NULL) {
 		edata_cache_small_put(tsdn, &shard->ecs, edata);
-		malloc_mutex_unlock(tsdn, &shard->mtx);
 		return NULL;
 	}
@@ -487,42 +486,61 @@
 		 */
 		psset_update_end(&shard->psset, ps);
 		edata_cache_small_put(tsdn, &shard->ecs, edata);
-		malloc_mutex_unlock(tsdn, &shard->mtx);
 		*oom = true;
 		return NULL;
 	}
 
 	hpa_update_purge_hugify_eligibility(shard, ps);
 	psset_update_end(&shard->psset, ps);
-
-	hpa_do_deferred_work(tsdn, shard);
-	malloc_mutex_unlock(tsdn, &shard->mtx);
-
 	return edata;
 }
 
-static edata_t *
-hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) {
-	assert(size <= shard->opts.slab_max_alloc);
-	bool err;
-	bool oom;
-	edata_t *edata;
-
-	edata = hpa_try_alloc_no_grow(tsdn, shard, size, &oom);
-	if (edata != NULL) {
-		return edata;
+static size_t
+hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
+    bool *oom, size_t nallocs, edata_list_active_t *results) {
+	malloc_mutex_lock(tsdn, &shard->mtx);
+	size_t nsuccess = 0;
+	for (; nsuccess < nallocs; nsuccess++) {
+		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
+		    oom);
+		if (edata == NULL) {
+			break;
+		}
+		edata_list_active_append(results, edata);
 	}
 
-	/* Nothing in the psset works; we have to grow it. */
+	hpa_do_deferred_work(tsdn, shard);
+	malloc_mutex_unlock(tsdn, &shard->mtx);
+	return nsuccess;
+}
+
+static size_t
+hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
+    size_t nallocs, edata_list_active_t *results) {
+	assert(size <= shard->opts.slab_max_alloc);
+	bool oom = false;
+
+	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
+	    nallocs, results);
+
+	if (nsuccess == nallocs || oom) {
+		return nsuccess;
+	}
+
+	/*
+	 * We didn't OOM, but weren't able to fill everything requested of us;
+	 * try to grow.
+	 */
 	malloc_mutex_lock(tsdn, &shard->grow_mtx);
 	/*
 	 * Check for grow races; maybe some earlier thread expanded the psset
 	 * in between when we dropped the main mutex and grabbed the grow mutex.
 	 */
-	edata = hpa_try_alloc_no_grow(tsdn, shard, size, &oom);
-	if (edata != NULL || oom) {
+	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
+	    nallocs - nsuccess, results);
+	if (nsuccess == nallocs || oom) {
 		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
-		return edata;
+		return nsuccess;
 	}
 
 	/*
@@ -533,78 +551,28 @@ hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) {
 	hpdata_t *ps = hpa_grow(tsdn, shard);
 	if (ps == NULL) {
 		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
-		return NULL;
-	}
-
-	/* We got the pageslab; allocate from it. */
-	malloc_mutex_lock(tsdn, &shard->mtx);
-
-	psset_insert(&shard->psset, ps);
-
-	edata = edata_cache_small_get(tsdn, &shard->ecs);
-	if (edata == NULL) {
-		malloc_mutex_unlock(tsdn, &shard->mtx);
-		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
-		return NULL;
+		return nsuccess;
 	}
 
 	/*
-	 * TODO: the tail of this function is quite similar to the tail of
-	 * hpa_try_alloc_no_grow (both, broadly, do the metadata management of
-	 * initializing an edata_t from an hpdata_t once both have been
-	 * allocated). The only differences are in error case handling and lock
-	 * management (we hold grow_mtx, but should drop it before doing any
-	 * deferred work). With a little refactoring, we could unify the paths.
+	 * We got the pageslab; allocate from it. This does an unlock followed
+	 * by a lock on the same mutex, and holds the grow mutex while doing
+	 * deferred work, but this is an uncommon path; the simplicity is worth
+	 * it.
 	 */
-	psset_update_begin(&shard->psset, ps);
-
-	void *addr = hpdata_reserve_alloc(ps, size);
-	edata_init(edata, shard->ind, addr, size, /* slab */ false,
-	    SC_NSIZES, /* sn */ 0, extent_state_active, /* zeroed */ false,
-	    /* committed */ true, EXTENT_PAI_HPA, EXTENT_NOT_HEAD);
-	edata_ps_set(edata, ps);
-
-	err = emap_register_boundary(tsdn, shard->emap, edata,
-	    SC_NSIZES, /* slab */ false);
-	if (err) {
-		hpdata_unreserve(ps, edata_addr_get(edata),
-		    edata_size_get(edata));
-
-		edata_cache_small_put(tsdn, &shard->ecs, edata);
-
-		/* We'll do a fake purge; the pages weren't really touched. */
-		hpdata_purge_state_t purge_state;
-		void *purge_addr;
-		size_t purge_size;
-		hpdata_purge_begin(ps, &purge_state);
-		bool found_extent = hpdata_purge_next(ps, &purge_state,
-		    &purge_addr, &purge_size);
-		assert(found_extent);
-		assert(purge_addr == addr);
-		assert(purge_size == size);
-		found_extent = hpdata_purge_next(ps, &purge_state,
-		    &purge_addr, &purge_size);
-		assert(!found_extent);
-		hpdata_purge_end(ps, &purge_state);
-
-		psset_update_end(&shard->psset, ps);
-		malloc_mutex_unlock(tsdn, &shard->mtx);
-		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
-		return NULL;
-	}
-	hpa_update_purge_hugify_eligibility(shard, ps);
-	psset_update_end(&shard->psset, ps);
+	malloc_mutex_lock(tsdn, &shard->mtx);
+	psset_insert(&shard->psset, ps);
+	malloc_mutex_unlock(tsdn, &shard->mtx);
+	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
+	    nallocs - nsuccess, results);
 
 	/*
 	 * Drop grow_mtx before doing deferred work; other threads blocked on it
 	 * should be allowed to proceed while we're working.
 	 */
 	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
-	hpa_do_deferred_work(tsdn, shard);
-
-	malloc_mutex_unlock(tsdn, &shard->mtx);
-	return edata;
+	return nsuccess;
 }
 
 static hpa_shard_t *
@@ -616,28 +584,27 @@ hpa_from_pai(pai_t *self) {
 	return (hpa_shard_t *)self;
 }
 
-static edata_t *
-hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
-    size_t alignment, bool zero) {
+static size_t
+hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
+    edata_list_active_t *results) {
+	assert(nallocs > 0);
 	assert((size & PAGE_MASK) == 0);
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
-
 	hpa_shard_t *shard = hpa_from_pai(self);
-	/* We don't handle alignment or zeroing for now. */
-	if (alignment > PAGE || zero) {
-		return NULL;
-	}
+
 	if (size > shard->opts.slab_max_alloc) {
-		return NULL;
+		return 0;
 	}
 
-	edata_t *edata = hpa_alloc_psset(tsdn, shard, size);
+	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
+	    results);
 
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
-	if (edata != NULL) {
+	edata_t *edata;
+	ql_foreach(edata, &results->head, ql_link_active) {
 		emap_assert_mapped(tsdn, shard->emap, edata);
 		assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
 		assert(edata_state_get(edata) == extent_state_active);
@@ -648,6 +615,29 @@ hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
 		assert(edata_base_get(edata) == edata_addr_get(edata));
 		assert(edata_base_get(edata) != NULL);
 	}
+	return nsuccess;
+}
+
+static edata_t *
+hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) {
+	assert((size & PAGE_MASK) == 0);
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+
+	/* We don't handle alignment or zeroing for now. */
+	if (alignment > PAGE || zero) {
+		return NULL;
+	}
+	/*
+	 * An alloc with alignment == PAGE and zero == false is equivalent to a
+	 * batch alloc of 1. Just do that, so we can share code.
+	 */
+	edata_list_active_t results;
+	edata_list_active_init(&results);
+	size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
+	    &results);
+	assert(nallocs == 0 || nallocs == 1);
+	edata_t *edata = edata_list_active_first(&results);
 	return edata;
 }
 
@@ -677,6 +667,15 @@ hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
 	assert(edata_committed_get(edata));
 	assert(edata_base_get(edata) != NULL);
 
+	/*
+	 * Another thread shouldn't be trying to touch the metadata of an
+	 * allocation being freed. The one exception is a merge attempt from a
+	 * lower-addressed PAC extent; in this case we have a nominal race on
+	 * the edata metadata bits, but in practice the fact that the PAI bits
+	 * are different will prevent any further access. The race is bad, but
+	 * benign in practice, and the long term plan is to track enough state
+	 * in the rtree to prevent these merge attempts in the first place.
+	 */
 	edata_addr_set(edata, edata_base_get(edata));
 	edata_zeroed_set(edata, false);
 	emap_deregister_boundary(tsdn, shard->emap, edata);
diff --git a/test/unit/hpa.c b/test/unit/hpa.c
index 924795f6..46009835 100644
--- a/test/unit/hpa.c
+++ b/test/unit/hpa.c
@@ -211,6 +211,77 @@ TEST_BEGIN(test_stress) {
 }
 TEST_END
 
+static void
+expect_contiguous(edata_t **edatas, size_t nedatas) {
+	for (size_t i = 0; i < nedatas; i++) {
+		size_t expected = (size_t)edata_base_get(edatas[0])
+		    + i * PAGE;
+		expect_zu_eq(expected, (size_t)edata_base_get(edatas[i]),
+		    "Mismatch at index %zu", i);
+	}
+}
+
+TEST_BEGIN(test_alloc_dalloc_batch) {
+	test_skip_if(!hpa_supported());
+
+	hpa_shard_t *shard = create_test_data();
+	tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
+
+	enum {NALLOCS = 8};
+
+	edata_t *allocs[NALLOCS];
+	/*
+	 * Allocate a mix of ways; first half from regular alloc, second half
+	 * from alloc_batch.
+	 */
+	for (size_t i = 0; i < NALLOCS / 2; i++) {
+		allocs[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE,
+		    /* zero */ false);
+		expect_ptr_not_null(allocs[i], "Unexpected alloc failure");
+	}
+	edata_list_active_t allocs_list;
+	edata_list_active_init(&allocs_list);
+	size_t nsuccess = pai_alloc_batch(tsdn, &shard->pai, PAGE, NALLOCS / 2,
+	    &allocs_list);
+	expect_zu_eq(NALLOCS / 2, nsuccess, "Unexpected oom");
+	for (size_t i = NALLOCS / 2; i < NALLOCS; i++) {
+		allocs[i] = edata_list_active_first(&allocs_list);
+		edata_list_active_remove(&allocs_list, allocs[i]);
+	}
+
+	/*
+	 * Should have allocated them contiguously, despite the differing
+	 * methods used.
+	 */
+	void *orig_base = edata_base_get(allocs[0]);
+	expect_contiguous(allocs, NALLOCS);
+
+	/*
+	 * Batch dalloc the first half, individually deallocate the second half.
+	 */
+	for (size_t i = 0; i < NALLOCS / 2; i++) {
+		edata_list_active_append(&allocs_list, allocs[i]);
+	}
+	pai_dalloc_batch(tsdn, &shard->pai, &allocs_list);
+	for (size_t i = NALLOCS / 2; i < NALLOCS; i++) {
+		pai_dalloc(tsdn, &shard->pai, allocs[i]);
+	}
+
+	/* Reallocate (individually), and ensure reuse and contiguity. */
+	for (size_t i = 0; i < NALLOCS; i++) {
+		allocs[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE,
+		    /* zero */ false);
+		expect_ptr_not_null(allocs[i], "Unexpected alloc failure.");
+	}
+	void *new_base = edata_base_get(allocs[0]);
+	expect_ptr_eq(orig_base, new_base,
+	    "Failed to reuse the allocated memory.");
+	expect_contiguous(allocs, NALLOCS);
+
+	destroy_test_data(shard);
+}
+TEST_END
+
 int
 main(void) {
 	/*
@@ -227,5 +298,6 @@ main(void) {
 	(void)mem_tree_destroy;
 	return test_no_reentrancy(
 	    test_alloc_max,
-	    test_stress);
+	    test_stress,
+	    test_alloc_dalloc_batch);
 }
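
Usage sketch (editorial addition, not part of the patch): the new batch entry
points are exercised end-to-end by test_alloc_dalloc_batch above. A minimal
caller looks like the following; it assumes the same shard/tsdn setup as the
unit test (create_test_data() and tsd_tsdn(tsd_fetch()) are test-only helpers,
not public API):

	/* Ask for up to 4 one-page extents in a single call. */
	edata_list_active_t results;
	edata_list_active_init(&results);
	size_t nsuccess = pai_alloc_batch(tsdn, &shard->pai, PAGE,
	    /* nallocs */ 4, &results);
	/*
	 * Exactly nsuccess extents (possibly fewer than requested, on OOM or
	 * if the shard can't grow) are appended to results, in allocation
	 * order.
	 */

	/* The whole list can later be freed in a single call as well. */
	pai_dalloc_batch(tsdn, &shard->pai, &results);

The design point of the change is visible in hpa_try_alloc_batch_no_grow:
shard->mtx is acquired once per batch rather than once per extent, and a
single hpa_do_deferred_work() pass covers the whole batch.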