From 9f9247a62ed5ac1157519cd2b1f966cacf772aaa Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Wed, 27 Jan 2021 17:14:38 -0800
Subject: [PATCH] Tcache flushing: increase cache miss parallelism.

In practice, many rtree_leaf_elm accesses are cache misses. By restructuring,
we can make it more likely that these misses occur without blocking us from
starting later lookups, taking more of those misses in parallel.
---
 include/jemalloc/internal/emap.h | 38 ++++++++++++++++++++++++++------
 src/tcache.c                     | 12 +++++-----
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/include/jemalloc/internal/emap.h b/include/jemalloc/internal/emap.h
index f0d7e768..ac0050b5 100644
--- a/include/jemalloc/internal/emap.h
+++ b/include/jemalloc/internal/emap.h
@@ -226,23 +226,47 @@ typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind);
  */
 typedef void (*emap_metadata_visitor)(void *ctx, emap_full_alloc_ctx_t *alloc_ctx);
 
+typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t;
+union emap_batch_lookup_result_u {
+	edata_t *edata;
+	rtree_leaf_elm_t *rtree_leaf;
+};
+
 JEMALLOC_ALWAYS_INLINE void
 emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs,
     emap_ptr_getter ptr_getter, void *ptr_getter_ctx,
     emap_metadata_visitor metadata_visitor, void *metadata_visitor_ctx,
-    edata_t **r_edatas) {
-
+    emap_batch_lookup_result_t *result) {
 	/* Avoids null-checking tsdn in the loop below. */
 	util_assume(tsd != NULL);
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get(tsd);
 
 	for (size_t i = 0; i < nptrs; i++) {
-		emap_full_alloc_ctx_t full_alloc_ctx;
 		const void *ptr = ptr_getter(ptr_getter_ctx, i);
+		/*
+		 * Reuse the edatas array as a temp buffer, lying a little about
+		 * the types.
+		 */
+		result[i].rtree_leaf = rtree_leaf_elm_lookup(tsd_tsdn(tsd),
+		    &emap->rtree, rtree_ctx, (uintptr_t)ptr,
+		    /* dependent */ true, /* init_missing */ false);
+	}
 
-		emap_full_alloc_ctx_lookup(tsd_tsdn(tsd), emap, ptr,
-		    &full_alloc_ctx);
-		r_edatas[i] = full_alloc_ctx.edata;
-		metadata_visitor(metadata_visitor_ctx, &full_alloc_ctx);
+	for (size_t i = 0; i < nptrs; i++) {
+		rtree_leaf_elm_t *elm = result[i].rtree_leaf;
+		rtree_contents_t contents = rtree_leaf_elm_read(tsd_tsdn(tsd),
+		    &emap->rtree, elm, /* dependent */ true);
+		result[i].edata = contents.edata;
+		emap_full_alloc_ctx_t alloc_ctx;
+		/*
+		 * Not all these fields are read in practice by the metadata
+		 * visitor. But the compiler can easily optimize away the ones
+		 * that aren't, so no sense in being incomplete.
+		 */
+		alloc_ctx.szind = contents.metadata.szind;
+		alloc_ctx.slab = contents.metadata.slab;
+		alloc_ctx.edata = contents.edata;
+		metadata_visitor(metadata_visitor_ctx, &alloc_ctx);
 	}
 }
 
diff --git a/src/tcache.c b/src/tcache.c
index 602823d9..635ba0b1 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -251,7 +251,7 @@ tcache_bin_flush_metadata_visitor(void *szind_sum_ctx,
 
 static void
 tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr,
-    szind_t binind, size_t nflush, edata_t **edatas) {
+    szind_t binind, size_t nflush, emap_batch_lookup_result_t *edatas) {
 
 	/*
 	 * This gets compiled away when config_opt_safety_checks is false.
@@ -305,7 +305,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
	 * Variable length array must have > 0 length; the last element is never
	 * touched (it's just included to satisfy the no-zero-length rule).
	 */
-	VARIABLE_ARRAY(edata_t *, item_edata, nflush + 1);
+	VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
 	CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nflush);
 
 	cache_bin_init_ptr_array_for_flush(cache_bin, &tcache_bin_info[binind],
@@ -329,7 +329,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 	bool merged_stats = false;
 	while (nflush > 0) {
 		/* Lock the arena, or bin, associated with the first object. */
-		edata_t *edata = item_edata[0];
+		edata_t *edata = item_edata[0].edata;
 		unsigned cur_arena_ind = edata_arena_ind_get(edata);
 		arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
 
@@ -382,7 +382,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		if (!small) {
 			for (unsigned i = 0; i < nflush; i++) {
 				void *ptr = cache_bin_ptr_array_get(&ptrs, i);
-				edata = item_edata[i];
+				edata = item_edata[i].edata;
 				assert(ptr != NULL && edata != NULL);
 
 				if (tcache_bin_flush_match(edata, cur_arena_ind,
@@ -400,7 +400,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
 			void *ptr = cache_bin_ptr_array_get(&ptrs, i);
-			edata = item_edata[i];
+			edata = item_edata[i].edata;
 			assert(ptr != NULL && edata != NULL);
 			if (!tcache_bin_flush_match(edata, cur_arena_ind,
 			    cur_binshard, small)) {
@@ -411,7 +411,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 				 * it can be handled in a future pass.
 				 */
 				cache_bin_ptr_array_set(&ptrs, ndeferred, ptr);
-				item_edata[ndeferred] = edata;
+				item_edata[ndeferred].edata = edata;
 				ndeferred++;
 				continue;
 			}
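
The two-pass shape above is not specific to jemalloc's rtree. A minimal sketch
of the same before/after transformation, using hypothetical names (node_t,
level1, lookup, consume, flush_*) rather than jemalloc's types: issue every
independent lookup load first, then do the dependent reads, so that more of
the lookup misses can be outstanding at once. As in the commit message, this
only makes overlap more likely; it does not guarantee it.

/*
 * Sketch only; not jemalloc code. Hypothetical table + dependent read.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct node_s {
	uint64_t payload;
} node_t;

#define NSLOTS (1u << 16)
static node_t *level1[NSLOTS];	/* pointer table, analogous to rtree leaves */

/* The lookup is itself a (likely cache-missing) load of a pointer. */
static node_t *
lookup(const void *ptr) {
	return level1[((uintptr_t)ptr >> 12) & (NSLOTS - 1)];
}

/* Reading the node is a second, dependent cache miss. */
static uint64_t
consume(const node_t *node) {
	return node->payload;
}

/*
 * Fused form: the dependent read sits between consecutive lookups, so the
 * next lookup's miss is less likely to issue while the current one is still
 * outstanding.
 */
static uint64_t
flush_fused(const void **ptrs, size_t n) {
	uint64_t sum = 0;
	for (size_t i = 0; i < n; i++) {
		sum += consume(lookup(ptrs[i]));
	}
	return sum;
}

/*
 * Batched form (the shape of emap_edata_lookup_batch above): issue all the
 * independent lookup loads first, then do the dependent reads, so more of
 * the lookup misses can be in flight at the same time.
 */
static uint64_t
flush_batched(const void **ptrs, node_t **scratch, size_t n) {
	uint64_t sum = 0;
	for (size_t i = 0; i < n; i++) {
		scratch[i] = lookup(ptrs[i]);
	}
	for (size_t i = 0; i < n; i++) {
		sum += consume(scratch[i]);
	}
	return sum;
}

The patch goes one step further than this sketch: rather than a separate
scratch array, it reuses the result array for the intermediate leaf pointers
via the emap_batch_lookup_result_t union, so no extra buffer is needed.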