From bf448d7a5a4c2aecbda7ef11767a75829d9aaf77 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Tue, 5 Jan 2021 15:52:25 -0800
Subject: [PATCH] SEC: Reduce lock hold times.

Only flush a subset of extents during flushing, and drop the lock while
doing so.
---
 include/jemalloc/internal/sec.h | 40 ++++++++++++---
 src/sec.c                       | 87 +++++++++++++++++++++++++++------
 test/unit/sec.c                 |  7 ++-
 3 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/include/jemalloc/internal/sec.h b/include/jemalloc/internal/sec.h
index 6bf5687d..815b4bbc 100644
--- a/include/jemalloc/internal/sec.h
+++ b/include/jemalloc/internal/sec.h
@@ -8,13 +8,9 @@
  * Small extent cache.
  *
  * This includes some utilities to cache small extents. We have a per-pszind
- * bin with its own lock and edata heap (including only extents of that size).
- * We don't try to do any coalescing of extents (since it would require
- * cross-bin locks). As a result, we need to be careful about fragmentation.
- * As a gesture in that direction, we limit the size of caches, apply first-fit
- * within the bins, and, when flushing a bin, flush all of its extents rather
- * than just those up to some threshold. When we allocate again, we'll get a
- * chance to move to better ones.
+ * bin with its own list of extents of that size. We don't try to do any
+ * coalescing of extents (since it would in general require cross-shard locks
+ * or knowledge of the underlying PAI implementation).
  */
 
 /*
@@ -46,6 +42,19 @@ sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
 	dst->bytes += src->bytes;
 }
 
+/* A collection of free extents, all of the same size. */
+typedef struct sec_bin_s sec_bin_t;
+struct sec_bin_s {
+	/*
+	 * Number of bytes in this particular bin (as opposed to the
+	 * sec_shard_t's bytes_cur). This isn't user visible or reported in
+	 * stats; rather, it allows us to quickly determine the change in the
+	 * centralized counter when flushing.
+	 */
+	size_t bytes_cur;
+	edata_list_active_t freelist;
+};
+
 typedef struct sec_shard_s sec_shard_t;
 struct sec_shard_s {
 	/*
@@ -64,8 +73,11 @@ struct sec_shard_s {
 	 * hooks are installed.
	 */
 	bool enabled;
-	edata_list_active_t freelist[SEC_NPSIZES];
+	sec_bin_t bins[SEC_NPSIZES];
+	/* Number of bytes in all bins in the shard. */
 	size_t bytes_cur;
+	/* The next pszind to flush in the flush-some pathways. */
+	pszind_t to_flush_next;
 };
 
 typedef struct sec_s sec_t;
@@ -83,6 +95,18 @@ struct sec_s {
 	 * the bins in that shard to be flushed.
 	 */
 	size_t bytes_max;
+	/*
+	 * The number of bytes (in all bins) we flush down to when we exceed
+	 * bytes_max. We want this to be less than bytes_max, because
+	 * otherwise we could get into situations where a shard undergoing
+	 * net-deallocation keeps bytes_cur very near to bytes_max, so that
+	 * most deallocations get immediately forwarded to the underlying PAI
+	 * implementation, defeating the point of the SEC.
+	 *
+	 * Currently this is just set to bytes_max / 2, but it could
+	 * eventually be made configurable.
+	 */
+	size_t bytes_after_flush;
 
 	/*
 	 * We don't necessarily always use all the shards; requests are
diff --git a/src/sec.c b/src/sec.c
index 3a3a0b90..49b41047 100644
--- a/src/sec.c
+++ b/src/sec.c
@@ -11,7 +11,14 @@ static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
     size_t old_size, size_t new_size);
 static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
 
-bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
+static void
+sec_bin_init(sec_bin_t *bin) {
+	bin->bytes_cur = 0;
+	edata_list_active_init(&bin->freelist);
+}
+
+bool
+sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
     size_t bytes_max) {
 	if (nshards > SEC_NSHARDS_MAX) {
 		nshards = SEC_NSHARDS_MAX;
@@ -25,9 +32,10 @@ bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 		}
 		shard->enabled = true;
 		for (pszind_t j = 0; j < SEC_NPSIZES; j++) {
-			edata_list_active_init(&shard->freelist[j]);
+			sec_bin_init(&shard->bins[j]);
 		}
 		shard->bytes_cur = 0;
+		shard->to_flush_next = 0;
 	}
 	sec->fallback = fallback;
 	sec->alloc_max = alloc_max;
@@ -36,6 +44,7 @@ bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 	}
 
 	sec->bytes_max = bytes_max;
+	sec->bytes_after_flush = bytes_max / 2;
 	sec->nshards = nshards;
 
 	/*
@@ -85,9 +94,12 @@ sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	if (!shard->enabled) {
 		return NULL;
 	}
-	edata_t *edata = edata_list_active_first(&shard->freelist[pszind]);
+	sec_bin_t *bin = &shard->bins[pszind];
+	edata_t *edata = edata_list_active_first(&bin->freelist);
 	if (edata != NULL) {
-		edata_list_active_remove(&shard->freelist[pszind], edata);
+		edata_list_active_remove(&bin->freelist, edata);
+		assert(edata_size_get(edata) <= bin->bytes_cur);
+		bin->bytes_cur -= edata_size_get(edata);
 		assert(edata_size_get(edata) <= shard->bytes_cur);
 		shard->bytes_cur -= edata_size_get(edata);
 	}
@@ -135,30 +147,75 @@ sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
 }
 
 static void
-sec_do_flush_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	shard->bytes_cur = 0;
 	edata_list_active_t to_flush;
 	edata_list_active_init(&to_flush);
 	for (pszind_t i = 0; i < SEC_NPSIZES; i++) {
-		edata_list_active_concat(&to_flush, &shard->freelist[i]);
+		sec_bin_t *bin = &shard->bins[i];
+		bin->bytes_cur = 0;
+		edata_list_active_concat(&to_flush, &bin->freelist);
 	}
+	/*
+	 * Ordinarily we would try to avoid doing the batch deallocation while
+	 * holding the shard mutex, but the flush_all pathways only happen when
+	 * we're disabling the HPA or resetting the arena, both of which are
+	 * rare pathways.
+	 */
 	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
 }
 
 static void
-sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
+sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+	malloc_mutex_assert_owner(tsdn, &shard->mtx);
+	edata_list_active_t to_flush;
+	edata_list_active_init(&to_flush);
+	while (shard->bytes_cur > sec->bytes_after_flush) {
+		/* Pick a victim. */
+		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
+
+		/* Update our victim-picking state. */
+		shard->to_flush_next++;
+		if (shard->to_flush_next == SEC_NPSIZES) {
+			shard->to_flush_next = 0;
+		}
+
+		assert(shard->bytes_cur >= bin->bytes_cur);
+		if (bin->bytes_cur != 0) {
+			shard->bytes_cur -= bin->bytes_cur;
+			bin->bytes_cur = 0;
+			edata_list_active_concat(&to_flush, &bin->freelist);
+		}
+		/*
+		 * Either bin->bytes_cur was 0, in which case we didn't touch
+		 * the bin list but it should be empty anyways (or else we
+		 * missed a bytes_cur update on a list modification), or it
+		 * *wasn't* 0 and we emptied it ourselves. Either way, it
+		 * should be empty now.
+		 */
+		assert(edata_list_active_empty(&bin->freelist));
+	}
+
+	malloc_mutex_unlock(tsdn, &shard->mtx);
+	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
+}
+
+static void
+sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	edata_t *edata) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	assert(shard->bytes_cur <= sec->bytes_max);
 	size_t size = edata_size_get(edata);
 	pszind_t pszind = sz_psz2ind(size);
 	/*
-	 * Prepending here results in FIFO allocation per bin, which seems
+	 * Prepending here results in LIFO allocation per bin, which seems
 	 * reasonable.
 	 */
-	edata_list_active_prepend(&shard->freelist[pszind], edata);
+	sec_bin_t *bin = &shard->bins[pszind];
+	edata_list_active_prepend(&bin->freelist, edata);
+	bin->bytes_cur += size;
 	shard->bytes_cur += size;
 	if (shard->bytes_cur > sec->bytes_max) {
 		/*
@@ -170,7 +227,10 @@ sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 		 * in the backing allocator). This has the extra advantage of
 		 * not requiring advanced cache balancing strategies.
 		 */
-		sec_do_flush_locked(tsdn, sec, shard);
+		sec_flush_some_and_unlock(tsdn, sec, shard);
+		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
+	} else {
+		malloc_mutex_unlock(tsdn, &shard->mtx);
 	}
 }
 
@@ -184,8 +244,7 @@ sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
 	malloc_mutex_lock(tsdn, &shard->mtx);
 	if (shard->enabled) {
-		sec_shard_dalloc_locked(tsdn, sec, shard, edata);
-		malloc_mutex_unlock(tsdn, &shard->mtx);
+		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
 	} else {
 		malloc_mutex_unlock(tsdn, &shard->mtx);
 		pai_dalloc(tsdn, sec->fallback, edata);
@@ -196,7 +255,7 @@ void
 sec_flush(tsdn_t *tsdn, sec_t *sec) {
 	for (size_t i = 0; i < sec->nshards; i++) {
 		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
-		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
+		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
 		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
 	}
 }
@@ -206,7 +265,7 @@ sec_disable(tsdn_t *tsdn, sec_t *sec) {
 	for (size_t i = 0; i < sec->nshards; i++) {
 		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
 		sec->shards[i].enabled = false;
-		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
+		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
 		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
 	}
 }
diff --git a/test/unit/sec.c b/test/unit/sec.c
index 7657537b..5fe3550c 100644
--- a/test/unit/sec.c
+++ b/test/unit/sec.c
@@ -200,8 +200,11 @@ TEST_BEGIN(test_auto_flush) {
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
 	/*
-	 * Free the extra allocation; this should trigger a flush of all
-	 * extents in the cache.
+	 * Free the extra allocation; this should trigger a flush. The internal
+	 * flushing logic is allowed to get complicated; for now, we rely on our
+	 * whitebox knowledge of the fact that the SEC flushes bins in their
+	 * entirety when it decides to do so, and it has only one bin active
+	 * right now.
 	 */
 	pai_dalloc(tsdn, &sec.pai, extra_alloc);
 	expect_zu_eq(NALLOCS + 1, ta.alloc_count,
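
For readers skimming the patch, here is a minimal standalone sketch of the policy that the new sec_flush_some_and_unlock() implements. It is not jemalloc code: the toy_* names, NBINS, and the byte counts are invented for illustration, and the real locking, freelists, and pai_dalloc_batch() call are omitted. The idea it shows is the hysteresis plus round-robin victim picking: once a shard's cached bytes exceed bytes_max, whole bins are flushed starting from to_flush_next until the total drops to bytes_after_flush (bytes_max / 2), so a shard undergoing net deallocation does not flush on every free.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define NBINS 4

/* Toy stand-ins for the per-bin and per-shard byte counters. */
typedef struct {
	size_t bytes_cur;	/* bytes cached in this bin */
} toy_bin_t;

typedef struct {
	toy_bin_t bins[NBINS];
	size_t bytes_cur;	/* sum over all bins */
	size_t to_flush_next;	/* round-robin victim cursor */
} toy_shard_t;

/*
 * Flush whole bins, round-robin, until the shard is at or below the
 * post-flush watermark.  The trigger for calling this is exceeding
 * bytes_max, but we flush down to bytes_after_flush (< bytes_max).
 */
static void
toy_flush_some(toy_shard_t *shard, size_t bytes_after_flush) {
	while (shard->bytes_cur > bytes_after_flush) {
		toy_bin_t *bin = &shard->bins[shard->to_flush_next];
		shard->to_flush_next = (shard->to_flush_next + 1) % NBINS;

		assert(shard->bytes_cur >= bin->bytes_cur);
		shard->bytes_cur -= bin->bytes_cur;
		bin->bytes_cur = 0;	/* "flush" the whole bin */
	}
}

int
main(void) {
	toy_shard_t shard = {
		.bins = {{10}, {40}, {30}, {30}},
		.bytes_cur = 110,
		.to_flush_next = 0,
	};
	size_t bytes_max = 100, bytes_after_flush = bytes_max / 2;

	if (shard.bytes_cur > bytes_max) {	/* the dalloc path's trigger */
		toy_flush_some(&shard, bytes_after_flush);
	}
	printf("bytes_cur=%zu next_victim=%zu\n",
	    shard.bytes_cur, shard.to_flush_next);	/* bytes_cur=30 next_victim=3 */
	return 0;
}

In the patch itself, the shard mutex is released before the collected extents are handed to pai_dalloc_batch(), which is where the reduction in lock hold times comes from; the flush_all pathways keep the simpler flush-everything-while-locked behavior because they only run on rare operations like disabling the HPA or resetting the arena.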