From cdae6706a6dbe6ab75688ea24a82ef4165c3b0b1 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Tue, 19 Jan 2021 13:06:43 -0800
Subject: [PATCH] SEC: Use batch fills.

Currently, this doesn't help much, since no PAI implementation supports
flushing. This will change in subsequent commits.
---
 include/jemalloc/internal/sec.h |  28 ++++++
 src/sec.c                       | 147 ++++++++++++++++++++++----------
 test/unit/sec.c                 |  63 ++++++++------
 3 files changed, 167 insertions(+), 71 deletions(-)

diff --git a/include/jemalloc/internal/sec.h b/include/jemalloc/internal/sec.h
index 815b4bbc..fadf4b61 100644
--- a/include/jemalloc/internal/sec.h
+++ b/include/jemalloc/internal/sec.h
@@ -45,6 +45,24 @@ sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
 /* A collections of free extents, all of the same size. */
 typedef struct sec_bin_s sec_bin_t;
 struct sec_bin_s {
+	/*
+	 * When we fail to fulfill an allocation, we do a batch-alloc on the
+	 * underlying allocator to fill extra items, as well. We drop the SEC
+	 * lock while doing so, to allow operations on other bins to succeed.
+	 * That introduces the possibility of other threads also trying to
+	 * allocate out of this bin, failing, and also going to the backing
+	 * allocator. To avoid a thundering herd problem in which lots of
+	 * threads do batch allocs and overfill this bin as a result, we only
+	 * allow one batch allocation at a time for a bin. This bool tracks
+	 * whether or not some thread is already batch allocating.
+	 *
+	 * Eventually, the right answer may be a smarter sharding policy for the
+	 * bins (e.g. a mutex per bin, which would also be more scalable
+	 * generally; the batch-allocating thread could hold it while
+	 * batch-allocating).
+	 */
+	bool being_batch_filled;
+
 	/*
 	 * Number of bytes in this particular bin (as opposed to the
 	 * sec_shard_t's bytes_cur. This isn't user visible or reported in
@@ -108,6 +126,16 @@ struct sec_s {
 	 */
 	size_t bytes_after_flush;
 
+	/*
+	 * When we can't satisfy an allocation out of the SEC because there are
+	 * no available ones cached, we allocate multiple of that size out of
+	 * the fallback allocator. Eventually we might want to do something
+	 * cleverer, but for now we just grab a fixed number.
+	 *
+	 * For now, just the constant 4. Eventually, it should be configurable.
+	 */
+	size_t batch_fill_extra;
+
 	/*
 	 * We don't necessarily always use all the shards; requests are
 	 * distributed across shards [0, nshards - 1).
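The two fields added to the header cooperate on a cache miss: a thread that finds its bin empty claims the fill by setting being_batch_filled under the shard lock, drops the lock, asks the fallback for 1 + batch_fill_extra extents, and then retakes the lock to publish the leftovers and clear the flag. A rough illustration of that claim/fill/publish protocol (a sketch only, not part of the diff; it uses plain pthreads and hypothetical toy_* names in place of jemalloc's malloc_mutex_t and edata lists):

/* Illustrative sketch only: simplified stand-ins for the jemalloc types. */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct toy_bin_s {
	bool being_batch_filled;	/* Some thread is already batch filling. */
	size_t bytes_cur;		/* Bytes currently cached in this bin. */
} toy_bin_t;

typedef struct toy_shard_s {
	pthread_mutex_t mtx;
	toy_bin_t bin;
} toy_shard_t;

/*
 * On a miss, at most one thread claims the right to batch-fill; the others
 * fall back to a single allocation so they don't overfill the same bin.
 */
static bool
toy_claim_fill(toy_shard_t *shard) {
	bool do_fill = false;
	pthread_mutex_lock(&shard->mtx);
	if (!shard->bin.being_batch_filled) {
		shard->bin.being_batch_filled = true;
		do_fill = true;
	}
	pthread_mutex_unlock(&shard->mtx);
	return do_fill;
}

/* After the batch allocation (done without the lock held), publish the leftovers. */
static void
toy_publish_fill(toy_shard_t *shard, size_t cached_bytes) {
	pthread_mutex_lock(&shard->mtx);
	shard->bin.being_batch_filled = false;	/* Let the next miss fill again. */
	shard->bin.bytes_cur += cached_bytes;
	pthread_mutex_unlock(&shard->mtx);
}

In the patch itself the same roles are played by sec_alloc (claim) and sec_batch_fill_and_alloc (publish), with the shard mutex standing in for the toy lock.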
diff --git a/src/sec.c b/src/sec.c
index af7c2910..f177bbee 100644
--- a/src/sec.c
+++ b/src/sec.c
@@ -13,6 +13,7 @@ static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
 
 static void
 sec_bin_init(sec_bin_t *bin) {
+	bin->being_batch_filled = false;
 	bin->bytes_cur = 0;
 	edata_list_active_init(&bin->freelist);
 }
@@ -45,6 +46,7 @@ sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 
 	sec->bytes_max = bytes_max;
 	sec->bytes_after_flush = bytes_max / 2;
+	sec->batch_fill_extra = 4;
 	sec->nshards = nshards;
 
 	/*
@@ -88,14 +90,52 @@ sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
 	return &sec->shards[*idxp];
 }
 
+/*
+ * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an
+ * empty cache, we'll try to fill it, which can push the shard over its limit.
+ */
+static void
+sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+	malloc_mutex_assert_owner(tsdn, &shard->mtx);
+	edata_list_active_t to_flush;
+	edata_list_active_init(&to_flush);
+	while (shard->bytes_cur > sec->bytes_after_flush) {
+		/* Pick a victim. */
+		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
+
+		/* Update our victim-picking state. */
+		shard->to_flush_next++;
+		if (shard->to_flush_next == SEC_NPSIZES) {
+			shard->to_flush_next = 0;
+		}
+
+		assert(shard->bytes_cur >= bin->bytes_cur);
+		if (bin->bytes_cur != 0) {
+			shard->bytes_cur -= bin->bytes_cur;
+			bin->bytes_cur = 0;
+			edata_list_active_concat(&to_flush, &bin->freelist);
+		}
+		/*
+		 * Either bin->bytes_cur was 0, in which case we didn't touch
+		 * the bin list but it should be empty anyways (or else we
+		 * missed a bytes_cur update on a list modification), or it
+		 * *wasn't* 0 and we emptied it ourselves. Either way, it should
+		 * be empty now.
+		 */
+		assert(edata_list_active_empty(&bin->freelist));
+	}
+
+	malloc_mutex_unlock(tsdn, &shard->mtx);
+	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
+}
+
 static edata_t *
 sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
-    pszind_t pszind) {
+    sec_bin_t *bin) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	if (!shard->enabled) {
 		return NULL;
 	}
-	sec_bin_t *bin = &shard->bins[pszind];
 	edata_t *edata = edata_list_active_first(&bin->freelist);
 	if (edata != NULL) {
 		edata_list_active_remove(&bin->freelist, edata);
@@ -107,6 +147,50 @@ sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	return edata;
 }
 
+static edata_t *
+sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
+    sec_bin_t *bin, size_t size) {
+	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
+
+	edata_list_active_t result;
+	edata_list_active_init(&result);
+	size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size,
+	    1 + sec->batch_fill_extra, &result);
+
+	edata_t *ret = edata_list_active_first(&result);
+	if (ret != NULL) {
+		edata_list_active_remove(&result, ret);
+	}
+
+	malloc_mutex_lock(tsdn, &shard->mtx);
+	bin->being_batch_filled = false;
+	/*
+	 * Handle the easy case first: nothing to cache. Note that this can
+	 * only happen in case of OOM, since sec_alloc checks the expected
+	 * number of allocs, and doesn't bother going down the batch_fill
+	 * pathway if there won't be anything left to cache. So to be in this
+	 * code path, we must have asked for > 1 alloc, but only gotten 1 back.
+	 */
+	if (nalloc <= 1) {
+		malloc_mutex_unlock(tsdn, &shard->mtx);
+		return ret;
+	}
+
+	size_t new_cached_bytes = (nalloc - 1) * size;
+
+	edata_list_active_concat(&bin->freelist, &result);
+	bin->bytes_cur += new_cached_bytes;
+	shard->bytes_cur += new_cached_bytes;
+
+	if (shard->bytes_cur > sec->bytes_max) {
+		sec_flush_some_and_unlock(tsdn, sec, shard);
+	} else {
+		malloc_mutex_unlock(tsdn, &shard->mtx);
+	}
+
+	return ret;
+}
+
 static edata_t *
 sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) {
 	assert((size & PAGE_MASK) == 0);
@@ -119,16 +203,26 @@ sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) {
 	}
 	pszind_t pszind = sz_psz2ind(size);
 	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
+	sec_bin_t *bin = &shard->bins[pszind];
+	bool do_batch_fill = false;
+
 	malloc_mutex_lock(tsdn, &shard->mtx);
-	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, pszind);
+	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin);
+	if (edata == NULL) {
+		if (!bin->being_batch_filled && sec->batch_fill_extra > 0) {
+			bin->being_batch_filled = true;
+			do_batch_fill = true;
+		}
+	}
 	malloc_mutex_unlock(tsdn, &shard->mtx);
 	if (edata == NULL) {
-		/*
-		 * See the note in dalloc, below; really, we should add a
-		 * batch_alloc method to the PAI and get more than one extent at
-		 * a time.
-		 */
-		edata = pai_alloc(tsdn, sec->fallback, size, alignment, zero);
+		if (do_batch_fill) {
+			edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin,
+			    size);
+		} else {
+			edata = pai_alloc(tsdn, sec->fallback, size, alignment,
+			    zero);
+		}
 	}
 	return edata;
 }
@@ -168,41 +262,6 @@ sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
 	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
 }
 
-static void
-sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
-	malloc_mutex_assert_owner(tsdn, &shard->mtx);
-	edata_list_active_t to_flush;
-	edata_list_active_init(&to_flush);
-	while (shard->bytes_cur > sec->bytes_after_flush) {
-		/* Pick a victim. */
-		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
-
-		/* Update our victim-picking state. */
-		shard->to_flush_next++;
-		if (shard->to_flush_next == SEC_NPSIZES) {
-			shard->to_flush_next = 0;
-		}
-
-		assert(shard->bytes_cur >= bin->bytes_cur);
-		if (bin->bytes_cur != 0) {
-			shard->bytes_cur -= bin->bytes_cur;
-			bin->bytes_cur = 0;
-			edata_list_active_concat(&to_flush, &bin->freelist);
-		}
-		/*
-		 * Either bin->bytes_cur was 0, in which case we didn't touch
-		 * the bin list but it should be empty anyways (or else we
-		 * missed a bytes_cur update on a list modification), or it
-		 * *was* 0 and we emptied it ourselves. Either way, it should
-		 * be empty now.
-		 */
-		assert(edata_list_active_empty(&bin->freelist));
-	}
-
-	malloc_mutex_unlock(tsdn, &shard->mtx);
-	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
-}
-
 static void
 sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
     edata_t *edata) {
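The interaction between the fill and flush limits above is easiest to see with concrete numbers: a batch fill can push a shard past bytes_max, and sec_flush_some_and_unlock then trims it back down to bytes_after_flush (half of bytes_max). A small self-contained model of that accounting (illustrative only; the constants are assumed, and the real values come from sec_init):

#include <stddef.h>
#include <stdio.h>

int main(void) {
	const size_t page = 4096;
	const size_t batch_fill_extra = 4;	/* Mirrors sec->batch_fill_extra. */
	const size_t bytes_max = 16 * page;	/* Assumed shard limit. */
	const size_t bytes_after_flush = bytes_max / 2;

	size_t bytes_cur = 14 * page;		/* Shard is already nearly full. */

	/* A miss on a one-page bin requests 1 + batch_fill_extra extents... */
	size_t nalloc = 1 + batch_fill_extra;	/* Assume the fallback returns all 5. */
	size_t new_cached_bytes = (nalloc - 1) * page;	/* One goes to the caller. */
	bytes_cur += new_cached_bytes;		/* 18 pages: now over bytes_max. */

	/* ...so the fill path flushes victims until we are back under target. */
	while (bytes_cur > bytes_after_flush) {
		bytes_cur -= page;		/* Pretend each victim bin held one page. */
	}

	printf("cached after flush: %zu pages\n", bytes_cur / page);
	return 0;
}

The model ends exactly at the 8-page target; the real code flushes whole bins at a time, so it may undershoot bytes_after_flush.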
diff --git a/test/unit/sec.c b/test/unit/sec.c
index 69132c1f..ff39453c 100644
--- a/test/unit/sec.c
+++ b/test/unit/sec.c
@@ -134,14 +134,17 @@ TEST_BEGIN(test_reuse) {
 	 */
 	tsdn_t *tsdn = TSDN_NULL;
 	/*
-	 * 10-allocs apiece of 1-PAGE and 2-PAGE objects means that we should be
-	 * able to get to 30 pages in the cache before triggering a flush.
+	 * 11 allocs apiece of 1-PAGE and 2-PAGE objects means that we should be
+	 * able to get to 33 pages in the cache before triggering a flush. We
+	 * set the flush limit to twice this amount, to avoid accidentally
+	 * triggering a flush caused by the batch-allocation down the cache fill
+	 * pathway disrupting ordering.
 	 */
-	enum { NALLOCS = 10 };
+	enum { NALLOCS = 11 };
 	edata_t *one_page[NALLOCS];
 	edata_t *two_page[NALLOCS];
 	sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ 2 * PAGE,
-	    /* bytes_max */ NALLOCS * PAGE + NALLOCS * 2 * PAGE);
+	    /* bytes_max */ 2 * (NALLOCS * PAGE + NALLOCS * 2 * PAGE));
 	for (int i = 0; i < NALLOCS; i++) {
 		one_page[i] = pai_alloc(tsdn, &sec.pai, PAGE, PAGE,
 		    /* zero */ false);
@@ -150,7 +153,9 @@ TEST_BEGIN(test_reuse) {
 		    /* zero */ false);
 		expect_ptr_not_null(one_page[i], "Unexpected alloc failure");
 	}
-	expect_zu_eq(2 * NALLOCS, ta.alloc_count,
+	expect_zu_eq(0, ta.alloc_count, "Should be using batch allocs");
+	size_t max_allocs = ta.alloc_count + ta.alloc_batch_count;
+	expect_zu_le(2 * NALLOCS, max_allocs,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
@@ -164,7 +169,7 @@ TEST_BEGIN(test_reuse) {
 	for (int i = NALLOCS - 1; i >= 0; i--) {
 		pai_dalloc(tsdn, &sec.pai, two_page[i]);
 	}
-	expect_zu_eq(2 * NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
@@ -182,7 +187,7 @@ TEST_BEGIN(test_reuse) {
 		expect_ptr_eq(two_page[i], alloc2,
 		    "Got unexpected allocation");
 	}
-	expect_zu_eq(2 * NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
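The doubled bytes_max in test_reuse leaves headroom for extents that batch fills park in the bins beyond what the test explicitly frees back. With the default batch_fill_extra of 4, the worst case is easy to bound; the arithmetic below (an illustrative sketch using the test's own constants, not code from the test) shows why a 33-page limit would no longer be safe while twice that is:

#include <stddef.h>
#include <stdio.h>

int main(void) {
	const size_t nallocs = 11;		/* NALLOCS in test_reuse. */
	const size_t batch_fill_extra = 4;	/* SEC default. */

	/* Pages the test itself frees back into the cache: 11 * 1 + 11 * 2. */
	size_t explicit_pages = nallocs * 1 + nallocs * 2;	/* 33 */

	/*
	 * Each bin can additionally hold up to batch_fill_extra leftover
	 * extents from its last batch fill: 4 one-page plus 4 two-page.
	 */
	size_t fill_leftover_pages = batch_fill_extra * 1 + batch_fill_extra * 2;	/* 12 */

	size_t worst_case = explicit_pages + fill_leftover_pages;	/* 45 */
	printf("worst case %zu pages: more than a %zu-page bytes_max, "
	    "but within the doubled %zu\n",
	    worst_case, explicit_pages, 2 * explicit_pages);
	return 0;
}

With NALLOCS * PAGE + NALLOCS * 2 * PAGE as the limit, the leftover fill extents alone could push the shard over it and trigger exactly the flush the test is trying to avoid.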
We + * set the flush liimt to twice this amount, to avoid accidentally + * triggering a flush caused by the batch-allocation down the cache fill + * pathway disrupting ordering. */ - enum { NALLOCS = 10 }; + enum { NALLOCS = 11 }; edata_t *one_page[NALLOCS]; edata_t *two_page[NALLOCS]; sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ 2 * PAGE, - /* bytes_max */ NALLOCS * PAGE + NALLOCS * 2 * PAGE); + /* bytes_max */ 2 * (NALLOCS * PAGE + NALLOCS * 2 * PAGE)); for (int i = 0; i < NALLOCS; i++) { one_page[i] = pai_alloc(tsdn, &sec.pai, PAGE, PAGE, /* zero */ false); @@ -150,7 +153,9 @@ TEST_BEGIN(test_reuse) { /* zero */ false); expect_ptr_not_null(one_page[i], "Unexpected alloc failure"); } - expect_zu_eq(2 * NALLOCS, ta.alloc_count, + expect_zu_eq(0, ta.alloc_count, "Should be using batch allocs"); + size_t max_allocs = ta.alloc_count + ta.alloc_batch_count; + expect_zu_le(2 * NALLOCS, max_allocs, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -164,7 +169,7 @@ TEST_BEGIN(test_reuse) { for (int i = NALLOCS - 1; i >= 0; i--) { pai_dalloc(tsdn, &sec.pai, two_page[i]); } - expect_zu_eq(2 * NALLOCS, ta.alloc_count, + expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -182,7 +187,7 @@ TEST_BEGIN(test_reuse) { expect_ptr_eq(two_page[i], alloc2, "Got unexpected allocation"); } - expect_zu_eq(2 * NALLOCS, ta.alloc_count, + expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -198,7 +203,12 @@ TEST_BEGIN(test_auto_flush) { tsdn_t *tsdn = TSDN_NULL; /* * 10-allocs apiece of 1-PAGE and 2-PAGE objects means that we should be - * able to get to 30 pages in the cache before triggering a flush. + * able to get to 30 pages in the cache before triggering a flush. The + * choice of NALLOCS here is chosen to match the batch allocation + * default (4 extra + 1 == 5; so 10 allocations leaves the cache exactly + * empty, even in the presence of batch allocation on fill). + * Eventually, once our allocation batching strategies become smarter, + * this should change. */ enum { NALLOCS = 10 }; edata_t *extra_alloc; @@ -212,7 +222,8 @@ TEST_BEGIN(test_auto_flush) { } extra_alloc = pai_alloc(tsdn, &sec.pai, PAGE, PAGE, /* zero */ false); expect_ptr_not_null(extra_alloc, "Unexpected alloc failure"); - expect_zu_eq(NALLOCS + 1, ta.alloc_count, + size_t max_allocs = ta.alloc_count + ta.alloc_batch_count; + expect_zu_le(NALLOCS + 1, max_allocs, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -220,7 +231,7 @@ TEST_BEGIN(test_auto_flush) { for (int i = 0; i < NALLOCS; i++) { pai_dalloc(tsdn, &sec.pai, allocs[i]); } - expect_zu_eq(NALLOCS + 1, ta.alloc_count, + expect_zu_le(NALLOCS + 1, max_allocs, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -232,7 +243,7 @@ TEST_BEGIN(test_auto_flush) { * right now. */ pai_dalloc(tsdn, &sec.pai, extra_alloc); - expect_zu_eq(NALLOCS + 1, ta.alloc_count, + expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of (non-batch) deallocations"); @@ -253,7 +264,7 @@ do_disable_flush_test(bool is_disable) { /* See the note above -- we can't use the real tsd. 
@@ -253,7 +264,7 @@ do_disable_flush_test(bool is_disable) {
 	/* See the note above -- we can't use the real tsd. */
 	tsdn_t *tsdn = TSDN_NULL;
 
-	enum { NALLOCS = 10 };
+	enum { NALLOCS = 11 };
 	edata_t *allocs[NALLOCS];
 	sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ PAGE,
 	    /* bytes_max */ NALLOCS * PAGE);
@@ -266,8 +277,9 @@ do_disable_flush_test(bool is_disable) {
 	for (int i = 0; i < NALLOCS - 1; i++) {
 		pai_dalloc(tsdn, &sec.pai, allocs[i]);
 	}
-	expect_zu_eq(NALLOCS, ta.alloc_count,
-	    "Incorrect number of allocations");
+	size_t max_allocs = ta.alloc_count + ta.alloc_batch_count;
+
+	expect_zu_le(NALLOCS, max_allocs, "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
 
@@ -277,12 +289,13 @@ do_disable_flush_test(bool is_disable) {
 		sec_flush(tsdn, &sec);
 	}
 
-	expect_zu_eq(NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of (non-batch) deallocations");
-	expect_zu_eq(NALLOCS - 1, ta.dalloc_batch_count,
+	expect_zu_le(NALLOCS - 1, ta.dalloc_batch_count,
 	    "Incorrect number of batch deallocations");
+	size_t old_dalloc_batch_count = ta.dalloc_batch_count;
 
 	/*
 	 * If we free into a disabled SEC, it should forward to the fallback.
 	 */
 	pai_dalloc(tsdn, &sec.pai, allocs[NALLOCS - 1]);
 
-	expect_zu_eq(NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(is_disable ? 1 : 0, ta.dalloc_count,
 	    "Incorrect number of (non-batch) deallocations");
-	expect_zu_eq(NALLOCS - 1, ta.dalloc_batch_count,
+	expect_zu_eq(old_dalloc_batch_count, ta.dalloc_batch_count,
 	    "Incorrect number of batch deallocations");
 }
 
@@ -404,7 +417,7 @@ expect_stats_pages(tsdn_t *tsdn, sec_t *sec, size_t npages) {
 	 */
 	stats.bytes = 123;
 	sec_stats_merge(tsdn, sec, &stats);
-	assert_zu_eq(npages * PAGE + 123, stats.bytes, "");
+	assert_zu_le(npages * PAGE + 123, stats.bytes, "");
 }
 
 TEST_BEGIN(test_stats_simple) {
@@ -417,7 +430,7 @@ TEST_BEGIN(test_stats_simple) {
 
 	enum {
 		NITERS = 100,
-		FLUSH_PAGES = 10,
+		FLUSH_PAGES = 20,
 	};
 
 	sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ PAGE,
@@ -470,26 +483,22 @@ TEST_BEGIN(test_stats_auto_flush) {
 	for (size_t i = 0; i < 2 * FLUSH_PAGES; i++) {
 		allocs[i] = pai_alloc(tsdn, &sec.pai, PAGE, PAGE,
 		    /* zero */ false);
-		expect_stats_pages(tsdn, &sec, 0);
 	}
 	for (size_t i = 0; i < FLUSH_PAGES; i++) {
 		pai_dalloc(tsdn, &sec.pai, allocs[i]);
-		expect_stats_pages(tsdn, &sec, i + 1);
 	}
 	pai_dalloc(tsdn, &sec.pai, extra_alloc0);
-	/* The last dalloc should have triggered a flush. */
-	expect_stats_pages(tsdn, &sec, 0);
 
 	/* Flush the remaining pages; stats should still work. */
 	for (size_t i = 0; i < FLUSH_PAGES; i++) {
 		pai_dalloc(tsdn, &sec.pai, allocs[FLUSH_PAGES + i]);
-		expect_stats_pages(tsdn, &sec, i + 1);
 	}
 	pai_dalloc(tsdn, &sec.pai, extra_alloc1);
-	/* The last dalloc should have triggered a flush, again. */
-	expect_stats_pages(tsdn, &sec, 0);
+
+	expect_stats_pages(tsdn, &sec, ta.alloc_count + ta.alloc_batch_count
+	    - ta.dalloc_count - ta.dalloc_batch_count);
 }
 TEST_END
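With both fills and flushes batched, test_stats_auto_flush no longer asserts intermediate page counts; the final check instead derives the expected residency from the mock allocator's counters. The identity it relies on, restated as a hypothetical helper (the test inlines the expression directly; every allocation in this test is a single page, so extents and pages coincide):

#include <stddef.h>

/*
 * Pages still cached in the SEC = extents obtained from the fallback
 * (singleton + batch) minus extents returned to it (singleton + batch).
 * Hypothetical helper; not part of the test harness.
 */
static inline size_t
expected_cached_pages(size_t alloc_count, size_t alloc_batch_count,
    size_t dalloc_count, size_t dalloc_batch_count) {
	return alloc_count + alloc_batch_count
	    - dalloc_count - dalloc_batch_count;
}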