Tcache flushing: increase cache miss parallelism.

In practice, many rtree_leaf_elm accesses are cache misses.  By restructuring,
we can make it more likely that these misses occur without blocking us from
starting later lookups, taking more of those misses in parallel.
David Goldblatt, 2021-01-27 17:14:38 -08:00 (committed by David Goldblatt)
parent 181ba7fd4d
commit 9f9247a62e
2 changed files with 37 additions and 13 deletions
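
The change is an instance of a general pattern: instead of doing a dependent lookup-then-use in a single loop, do all of the address lookups in one pass (buffering the intermediate pointers), then all of the dependent reads in a second pass.  Each loop body then has a shorter dependency chain, so more iterations, and therefore more cache misses, can be in flight at once.  A minimal, self-contained sketch of that pattern (illustrative only; this is not jemalloc code, and the structs are made up):

#include <stddef.h>

struct leaf { int value; };
struct node { struct leaf *leaf; };

/* Fused: each iteration chains two likely-missing loads before moving on. */
int
sum_fused(struct node **nodes, size_t n) {
	int sum = 0;
	for (size_t i = 0; i < n; i++) {
		struct leaf *leaf = nodes[i]->leaf;	/* miss #1 */
		sum += leaf->value;			/* miss #2, depends on #1 */
	}
	return sum;
}

/*
 * Split: the first loop issues all of the first-level loads, the second all
 * of the dependent ones.  More independent misses fit in the out-of-order
 * window, so more of them overlap.
 */
int
sum_split(struct node **nodes, struct leaf **tmp, size_t n) {
	for (size_t i = 0; i < n; i++) {
		tmp[i] = nodes[i]->leaf;
	}
	int sum = 0;
	for (size_t i = 0; i < n; i++) {
		sum += tmp[i]->value;
	}
	return sum;
}

In the commit, item_edata plays the role of tmp: the new emap_batch_lookup_result_t union lets the same buffer hold the intermediate rtree_leaf_elm_t * during the first pass and the final edata_t * after the second.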


@@ -226,23 +226,47 @@ typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind);
  */
 typedef void (*emap_metadata_visitor)(void *ctx, emap_full_alloc_ctx_t *alloc_ctx);
 
+typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t;
+union emap_batch_lookup_result_u {
+	edata_t *edata;
+	rtree_leaf_elm_t *rtree_leaf;
+};
+
 JEMALLOC_ALWAYS_INLINE void
 emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs,
     emap_ptr_getter ptr_getter, void *ptr_getter_ctx,
     emap_metadata_visitor metadata_visitor, void *metadata_visitor_ctx,
-    edata_t **r_edatas) {
+    emap_batch_lookup_result_t *result) {
 	/* Avoids null-checking tsdn in the loop below. */
 	util_assume(tsd != NULL);
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get(tsd);
 
 	for (size_t i = 0; i < nptrs; i++) {
-		emap_full_alloc_ctx_t full_alloc_ctx;
 		const void *ptr = ptr_getter(ptr_getter_ctx, i);
+		/*
+		 * Reuse the edatas array as a temp buffer, lying a little about
+		 * the types.
+		 */
+		result[i].rtree_leaf = rtree_leaf_elm_lookup(tsd_tsdn(tsd),
+		    &emap->rtree, rtree_ctx, (uintptr_t)ptr,
+		    /* dependent */ true, /* init_missing */ false);
+	}
 
-		emap_full_alloc_ctx_lookup(tsd_tsdn(tsd), emap, ptr,
-		    &full_alloc_ctx);
-		r_edatas[i] = full_alloc_ctx.edata;
-		metadata_visitor(metadata_visitor_ctx, &full_alloc_ctx);
+	for (size_t i = 0; i < nptrs; i++) {
+		rtree_leaf_elm_t *elm = result[i].rtree_leaf;
+		rtree_contents_t contents = rtree_leaf_elm_read(tsd_tsdn(tsd),
+		    &emap->rtree, elm, /* dependent */ true);
+		result[i].edata = contents.edata;
+		emap_full_alloc_ctx_t alloc_ctx;
+		/*
+		 * Not all these fields are read in practice by the metadata
+		 * visitor.  But the compiler can easily optimize away the ones
+		 * that aren't, so no sense in being incomplete.
+		 */
+		alloc_ctx.szind = contents.metadata.szind;
+		alloc_ctx.slab = contents.metadata.slab;
+		alloc_ctx.edata = contents.edata;
+		metadata_visitor(metadata_visitor_ctx, &alloc_ctx);
 	}
 }
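
To show how the batched interface is meant to be driven, here is a hypothetical caller; only the emap_* types and the emap_edata_lookup_batch signature come from the diff above, while the callback names, the szind accumulation, and the surrounding function are invented for illustration (the real caller is tcache_bin_flush_edatas_lookup, in the tcache changes below):

/* Hypothetical caller; not part of the commit. */
static const void *
flush_ptr_getter(void *ctx, size_t ind) {
	void **ptrs = (void **)ctx;
	return ptrs[ind];
}

static void
flush_metadata_visitor(void *ctx, emap_full_alloc_ctx_t *alloc_ctx) {
	size_t *szind_sum = (size_t *)ctx;	/* e.g. accumulate size classes */
	*szind_sum += alloc_ctx->szind;
}

static void
flush_lookup_example(tsd_t *tsd, emap_t *emap, void **ptrs, size_t nptrs,
    emap_batch_lookup_result_t *results) {
	size_t szind_sum = 0;
	emap_edata_lookup_batch(tsd, emap, nptrs,
	    flush_ptr_getter, (void *)ptrs,
	    flush_metadata_visitor, &szind_sum, results);
	for (size_t i = 0; i < nptrs; i++) {
		edata_t *edata = results[i].edata;	/* union holds edata now */
		(void)edata;	/* ... flush logic would use it here ... */
	}
}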


@@ -251,7 +251,7 @@ tcache_bin_flush_metadata_visitor(void *szind_sum_ctx,
 
 static void
 tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr,
-    szind_t binind, size_t nflush, edata_t **edatas) {
+    szind_t binind, size_t nflush, emap_batch_lookup_result_t *edatas) {
 	/*
 	 * This gets compiled away when config_opt_safety_checks is false.
@@ -305,7 +305,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 	 * Variable length array must have > 0 length; the last element is never
 	 * touched (it's just included to satisfy the no-zero-length rule).
 	 */
-	VARIABLE_ARRAY(edata_t *, item_edata, nflush + 1);
+	VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
 	CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nflush);
 	cache_bin_init_ptr_array_for_flush(cache_bin, &tcache_bin_info[binind],
@@ -329,7 +329,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 	bool merged_stats = false;
 	while (nflush > 0) {
 		/* Lock the arena, or bin, associated with the first object. */
-		edata_t *edata = item_edata[0];
+		edata_t *edata = item_edata[0].edata;
 		unsigned cur_arena_ind = edata_arena_ind_get(edata);
 		arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
@@ -382,7 +382,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		if (!small) {
 			for (unsigned i = 0; i < nflush; i++) {
 				void *ptr = cache_bin_ptr_array_get(&ptrs, i);
-				edata = item_edata[i];
+				edata = item_edata[i].edata;
 				assert(ptr != NULL && edata != NULL);
 
 				if (tcache_bin_flush_match(edata, cur_arena_ind,
@@ -400,7 +400,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
 			void *ptr = cache_bin_ptr_array_get(&ptrs, i);
-			edata = item_edata[i];
+			edata = item_edata[i].edata;
 			assert(ptr != NULL && edata != NULL);
 			if (!tcache_bin_flush_match(edata, cur_arena_ind,
 			    cur_binshard, small)) {
@@ -411,7 +411,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 				 * it can be handled in a future pass.
 				 */
 				cache_bin_ptr_array_set(&ptrs, ndeferred, ptr);
-				item_edata[ndeferred] = edata;
+				item_edata[ndeferred].edata = edata;
 				ndeferred++;
 				continue;
 			}