From dcb7b83facf4f7641cefc0fc7c11c3d88310dae0 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Tue, 20 Jul 2021 10:20:44 -0700
Subject: [PATCH] Eset: Cache summary information for heap edatas.

This lets us do a single array scan to find first fits, instead of
taking a cache miss per examined size class.
---
 include/jemalloc/internal/eset.h |  8 +++++
 src/eset.c                       | 60 +++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/include/jemalloc/internal/eset.h b/include/jemalloc/internal/eset.h
index 708ef997..4f689b47 100644
--- a/include/jemalloc/internal/eset.h
+++ b/include/jemalloc/internal/eset.h
@@ -18,6 +18,14 @@ typedef struct eset_bin_s eset_bin_t;
 struct eset_bin_s {
 	edata_heap_t heap;
+	/*
+	 * We do first-fit across multiple size classes.  If we compared against
+	 * the min element in each heap directly, we'd take a cache miss per
+	 * extent we looked at.  If we co-locate the edata summaries, we only
+	 * take a miss on the edata we're actually going to return (which is
+	 * inevitable anyway).
+	 */
+	edata_cmp_summary_t heap_min;
 };

 typedef struct eset_bin_stats_s eset_bin_stats_t;
diff --git a/src/eset.c b/src/eset.c
index 01af422c..6f8f335e 100644
--- a/src/eset.c
+++ b/src/eset.c
@@ -8,6 +8,10 @@
 static void
 eset_bin_init(eset_bin_t *bin) {
 	edata_heap_new(&bin->heap);
+	/*
+	 * heap_min doesn't need initialization; it gets filled in when the bin
+	 * goes from empty to non-empty.
+	 */
 }

 static void
@@ -71,8 +75,21 @@ eset_insert(eset_t *eset, edata_t *edata) {
 	size_t size = edata_size_get(edata);
 	size_t psz = sz_psz_quantize_floor(size);
 	pszind_t pind = sz_psz2ind(psz);
+
+	edata_cmp_summary_t edata_cmp_summary = edata_cmp_summary_get(edata);
 	if (edata_heap_empty(&eset->bins[pind].heap)) {
 		fb_set(eset->bitmap, ESET_NPSIZES, (size_t)pind);
+		/* Only element is automatically the min element. */
+		eset->bins[pind].heap_min = edata_cmp_summary;
+	} else {
+		/*
+		 * There's already a min element; update the summary if we're
+		 * about to insert a lower one.
+		 */
+		if (edata_cmp_summary_comp(edata_cmp_summary,
+		    eset->bins[pind].heap_min) < 0) {
+			eset->bins[pind].heap_min = edata_cmp_summary;
+		}
 	}
 	edata_heap_insert(&eset->bins[pind].heap, edata);

@@ -101,14 +118,29 @@ eset_remove(eset_t *eset, edata_t *edata) {
 	size_t size = edata_size_get(edata);
 	size_t psz = sz_psz_quantize_floor(size);
 	pszind_t pind = sz_psz2ind(psz);
-	edata_heap_remove(&eset->bins[pind].heap, edata);
-
 	if (config_stats) {
 		eset_stats_sub(eset, pind, size);
 	}

+	edata_cmp_summary_t edata_cmp_summary = edata_cmp_summary_get(edata);
+	edata_heap_remove(&eset->bins[pind].heap, edata);
 	if (edata_heap_empty(&eset->bins[pind].heap)) {
 		fb_unset(eset->bitmap, ESET_NPSIZES, (size_t)pind);
+	} else {
+		/*
+		 * This is a little weird; we compare if the summaries are
+		 * equal, rather than if the edata we removed was the heap
+		 * minimum.  The reason why is that getting the heap minimum
+		 * can cause a pairing heap merge operation.  We can avoid
+		 * that merge by only updating the min when it has actually
+		 * changed, in which case the summaries of the removed element
+		 * and the min element compare equal.
+		 */
+		if (edata_cmp_summary_comp(edata_cmp_summary,
+		    eset->bins[pind].heap_min) == 0) {
+			eset->bins[pind].heap_min = edata_cmp_summary_get(
+			    edata_heap_first(&eset->bins[pind].heap));
+		}
 	}
 	edata_list_inactive_remove(&eset->lru, edata);
 	size_t npages = size >> LG_PAGE;
@@ -116,10 +148,6 @@ eset_remove(eset_t *eset, edata_t *edata) {
 	 * As in eset_insert, we hold eset->mtx and so don't need atomic
 	 * operations for updating eset->npages.
 	 */
-	/*
-	 * This class is not thread-safe in general; we rely on external
-	 * synchronization for all mutating operations.
-	 */
 	size_t cur_extents_npages =
 	    atomic_load_zu(&eset->npages, ATOMIC_RELAXED);
 	assert(cur_extents_npages >= npages);
@@ -178,6 +206,7 @@ static edata_t *
 eset_first_fit(eset_t *eset, size_t size, bool exact_only,
     unsigned lg_max_fit) {
 	edata_t *ret = NULL;
+	edata_cmp_summary_t ret_summ JEMALLOC_CC_SILENCE_INIT({0});

 	pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(size));

@@ -191,8 +220,6 @@ eset_first_fit(eset_t *eset, size_t size, bool exact_only,
 	    i < ESET_NPSIZES;
 	    i = (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)i + 1)) {
 		assert(!edata_heap_empty(&eset->bins[i].heap));
-		edata_t *edata = edata_heap_first(&eset->bins[i].heap);
-		assert(edata_size_get(edata) >= size);
 		if (lg_max_fit == SC_PTR_BITS) {
 			/*
 			 * We'll shift by this below, and shifting out all the
@@ -204,8 +231,23 @@ eset_first_fit(eset_t *eset, size_t size, bool exact_only,
 		if ((sz_pind2sz(i) >> lg_max_fit) > size) {
 			break;
 		}
-		if (ret == NULL || edata_snad_comp(edata, ret) < 0) {
+		if (ret == NULL || edata_cmp_summary_comp(
+		    eset->bins[i].heap_min, ret_summ) < 0) {
+			/*
+			 * We grab the edata as early as possible, even though
+			 * we might change it later.  Practically, a large
+			 * portion of eset_fit calls succeed at the first valid
+			 * index, so this doesn't cost much, and we get the
+			 * effect of prefetching the edata as early as possible.
+			 */
+			edata_t *edata = edata_heap_first(&eset->bins[i].heap);
+			assert(edata_size_get(edata) >= size);
+			assert(ret == NULL || edata_snad_comp(edata, ret) < 0);
+			assert(ret == NULL || edata_cmp_summary_comp(
+			    eset->bins[i].heap_min,
+			    edata_cmp_summary_get(edata)) == 0);
 			ret = edata;
+			ret_summ = eset->bins[i].heap_min;
 		}
 		if (i == SC_NPSIZES) {
 			break;
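
The caching idea, as a rough standalone C sketch (the names summary_t, bin_t, first_fit, and NBINS are illustrative placeholders, not jemalloc's API): each bin keeps a copy of its minimum element's comparison summary next to the bin itself, so the first-fit scan reads one contiguous array of summaries and only dereferences the heap root of the bin it ends up selecting.

#include <stddef.h>
#include <stdint.h>

typedef struct {
	uint64_t sn;      /* serial number; lower orders first */
	uintptr_t addr;   /* base address; lower breaks ties */
} summary_t;

/* Negative if a orders before b (serial number, then address). */
static int
summary_comp(summary_t a, summary_t b) {
	if (a.sn != b.sn) {
		return a.sn < b.sn ? -1 : 1;
	}
	if (a.addr != b.addr) {
		return a.addr < b.addr ? -1 : 1;
	}
	return 0;
}

typedef struct extent_s {
	summary_t summary;
	size_t size;
} extent_t;

typedef struct {
	extent_t *root;      /* min element of this size class's heap */
	summary_t root_min;  /* cached copy of root->summary, kept in sync */
} bin_t;

#define NBINS 64

/*
 * First fit over bins[start..NBINS): among the non-empty bins, pick the one
 * whose cached min summary orders first.  The scan touches only the bins
 * array; the winner's root is the single extent pointer that gets chased.
 */
static extent_t *
first_fit(bin_t *bins, size_t start) {
	size_t best = NBINS;
	for (size_t i = start; i < NBINS; i++) {
		if (bins[i].root == NULL) {
			continue;
		}
		if (best == NBINS ||
		    summary_comp(bins[i].root_min, bins[best].root_min) < 0) {
			best = i;
		}
	}
	return best == NBINS ? NULL : bins[best].root;
}

The patch itself additionally grabs the candidate edata eagerly inside the loop, which acts as a prefetch for the common case where the first valid index wins; the sketch omits that and simply returns the winning bin's root.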