diff --git a/include/jemalloc/internal/rtree_inlines.h b/include/jemalloc/internal/rtree_inlines.h
index ce03c578..6791f50c 100644
--- a/include/jemalloc/internal/rtree_inlines.h
+++ b/include/jemalloc/internal/rtree_inlines.h
@@ -64,6 +64,15 @@ rtree_leafkey(uintptr_t key) {
 	return (key & mask);
 }
 
+JEMALLOC_ALWAYS_INLINE size_t
+rtree_cache_direct_map(uintptr_t key) {
+	unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
+	unsigned cumbits = (rtree_levels[RTREE_HEIGHT-1].cumbits -
+	    rtree_levels[RTREE_HEIGHT-1].bits);
+	unsigned maskbits = ptrbits - cumbits;
+	return (size_t)((key >> maskbits) & (RTREE_CTX_NCACHE - 1));
+}
+
 JEMALLOC_ALWAYS_INLINE uintptr_t
 rtree_subkey(uintptr_t key, unsigned level) {
 	unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
@@ -320,36 +329,54 @@ rtree_leaf_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 	assert(key != 0);
 	assert(!dependent || !init_missing);
 
+	size_t slot = rtree_cache_direct_map(key);
 	uintptr_t leafkey = rtree_leafkey(key);
 	assert(leafkey != RTREE_LEAFKEY_INVALID);
 
-#define RTREE_CACHE_CHECK(i) do { \
-	if (likely(rtree_ctx->cache[i].leafkey == leafkey)) { \
-		rtree_leaf_elm_t *leaf = rtree_ctx->cache[i].leaf; \
+	/* Fast path: L1 direct mapped cache. */
+	if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) {
+		rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf;
+		assert(leaf != NULL);
+		uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1);
+		return &leaf[subkey];
+	}
+	/*
+	 * Search the L2 LRU cache.  On hit, swap the matching element into the
+	 * L1 slot, and move the entry evicted from L1 up by one position in L2.
+	 */
+#define RTREE_CACHE_CHECK_L2(i) do { \
+	if (likely(rtree_ctx->l2_cache[i].leafkey == leafkey)) { \
+		rtree_leaf_elm_t *leaf = rtree_ctx->l2_cache[i].leaf; \
 		assert(leaf != NULL); \
 		if (i > 0) { \
 			/* Bubble up by one. */ \
-			rtree_ctx->cache[i].leafkey = \
-			    rtree_ctx->cache[i - 1].leafkey; \
-			rtree_ctx->cache[i].leaf = \
-			    rtree_ctx->cache[i - 1].leaf; \
-			rtree_ctx->cache[i - 1].leafkey = leafkey; \
-			rtree_ctx->cache[i - 1].leaf = leaf; \
+			rtree_ctx->l2_cache[i].leafkey = \
+			    rtree_ctx->l2_cache[i - 1].leafkey; \
+			rtree_ctx->l2_cache[i].leaf = \
+			    rtree_ctx->l2_cache[i - 1].leaf; \
+			rtree_ctx->l2_cache[i - 1].leafkey = \
+			    rtree_ctx->cache[slot].leafkey; \
+			rtree_ctx->l2_cache[i - 1].leaf = \
+			    rtree_ctx->cache[slot].leaf; \
+		} else { \
+			rtree_ctx->l2_cache[0].leafkey = \
+			    rtree_ctx->cache[slot].leafkey; \
+			rtree_ctx->l2_cache[0].leaf = \
+			    rtree_ctx->cache[slot].leaf; \
 		} \
+		rtree_ctx->cache[slot].leafkey = leafkey; \
+		rtree_ctx->cache[slot].leaf = leaf; \
 		uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); \
 		return &leaf[subkey]; \
 	} \
 } while (0)
 	/* Check the first cache entry. */
-	RTREE_CACHE_CHECK(0);
-	/*
-	 * Search the remaining cache elements, and on success move the
-	 * matching element up by one slot.
-	 */
-	for (unsigned i = 1; i < RTREE_CTX_NCACHE; i++) {
-		RTREE_CACHE_CHECK(i);
+	RTREE_CACHE_CHECK_L2(0);
+	/* Search the remaining cache elements. */
+	for (unsigned i = 1; i < RTREE_CTX_NCACHE_L2; i++) {
+		RTREE_CACHE_CHECK_L2(i);
 	}
-#undef RTREE_CACHE_CHECK
+#undef RTREE_CACHE_CHECK_L2
 
 	return rtree_leaf_elm_lookup_hard(tsdn, rtree, rtree_ctx, key,
 	    dependent, init_missing);
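[Review sketch, not part of the patch] rtree_cache_direct_map() above hashes a key to an L1 slot by shifting off the low bits that every key within one leaf shares, then keeping the low lg(RTREE_CTX_NCACHE) bits of the resulting leaf index, so adjacent leaves occupy consecutive slots. A minimal standalone illustration of that arithmetic follows, assuming a 64-bit target; MASKBITS and the example keys are made-up illustrative values (the real mask width comes from rtree_levels[]), while NCACHE == 16 matches the RTREE_CTX_NCACHE defined later in this patch.

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MASKBITS	30	/* Assumed: page-offset bits + leaf subkey bits. */
#define NCACHE		16	/* 1 << RTREE_CTX_LG_NCACHE from this patch. */

/* Mirror of rtree_cache_direct_map(), with the constants hard-coded. */
static size_t
cache_direct_map(uintptr_t key) {
	return (size_t)((key >> MASKBITS) & (NCACHE - 1));
}

int
main(void) {
	/*
	 * Keys one leaf apart map to consecutive L1 slots; keys exactly
	 * NCACHE leaves apart wrap around and collide in the same slot.
	 */
	uintptr_t leaf_span = (uintptr_t)1 << MASKBITS;
	for (unsigned i = 0; i < 4; i++) {
		uintptr_t key = (uintptr_t)0x7f0000000000 + i * leaf_span;
		printf("key 0x%" PRIxPTR " -> slot %zu\n", key,
		    cache_direct_map(key));
	}
	return 0;
}

With these illustrative constants the four keys print slots 0 through 3; a key NCACHE * leaf_span away would land back in slot 0, which is exactly the collision case the L2 cache below exists to absorb.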
diff --git a/include/jemalloc/internal/rtree_structs.h b/include/jemalloc/internal/rtree_structs.h
index 175a013c..7ff92e61 100644
--- a/include/jemalloc/internal/rtree_structs.h
+++ b/include/jemalloc/internal/rtree_structs.h
@@ -55,7 +55,10 @@ struct rtree_ctx_cache_elm_s {
 };
 
 struct rtree_ctx_s {
+	/* Direct mapped cache. */
 	rtree_ctx_cache_elm_t	cache[RTREE_CTX_NCACHE];
+	/* L2 LRU cache. */
+	rtree_ctx_cache_elm_t	l2_cache[RTREE_CTX_NCACHE_L2];
 };
 
 struct rtree_s {
diff --git a/include/jemalloc/internal/rtree_types.h b/include/jemalloc/internal/rtree_types.h
index e480542d..d9a4cf4d 100644
--- a/include/jemalloc/internal/rtree_types.h
+++ b/include/jemalloc/internal/rtree_types.h
@@ -42,19 +42,25 @@ typedef struct rtree_s rtree_t;
 #define RTREE_LEAFKEY_INVALID ((uintptr_t)1)
 
 /*
- * Number of leafkey/leaf pairs to cache.  Each entry supports an entire leaf,
- * so the cache hit rate is typically high even with a small number of entries.
- * In rare cases extent activity will straddle the boundary between two leaf
- * nodes.  Furthermore, an arena may use a combination of dss and mmap.  Four
- * entries covers both of these considerations as long as locality of reference
- * is high, and/or total memory usage doesn't exceed the range supported by
- * those entries.  Note that as memory usage grows past the amount that this
- * cache can directly cover, the cache will become less effective if locality of
- * reference is low, but the consequence is merely cache misses while traversing
- * the tree nodes, and the cache will itself suffer cache misses if made overly
- * large, not to mention the cost of linear search.
+ * Number of leafkey/leaf pairs to cache in the L1 and L2 levels respectively.
+ * Each entry supports an entire leaf, so the cache hit rate is typically high
+ * even with a small number of entries.  In rare cases extent activity will
+ * straddle the boundary between two leaf nodes.  Furthermore, an arena may use
+ * a combination of dss and mmap.  Note that as memory usage grows past the
+ * amount that this cache can directly cover, the cache will become less
+ * effective if locality of reference is low, but the consequence is merely
+ * cache misses while traversing the tree nodes.
+ *
+ * The L1 direct mapped cache offers consistent and low cost on a cache hit.
+ * However, collisions can hurt the hit rate; this is mitigated by pairing it
+ * with an L2 LRU cache, which requires linear search and re-ordering on access
+ * but suffers no collisions.  Note that the cache itself suffers CPU cache
+ * misses if made overly large, on top of the cost of linear search in the LRU
+ * cache.
 */
-#define RTREE_CTX_NCACHE 8
+#define RTREE_CTX_LG_NCACHE 4
+#define RTREE_CTX_NCACHE (1 << RTREE_CTX_LG_NCACHE)
+#define RTREE_CTX_NCACHE_L2 8
 
 /*
  * Zero initializer required for tsd initialization only.  Proper initialization
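[Review sketch, not part of the patch] To make the L1/L2 interplay described in the comment above concrete, here is a toy model of the hit path: the direct-mapped L1 slot is probed first, then the L2 LRU array is scanned linearly, with the promotion/demotion dance that RTREE_CACHE_CHECK_L2() implements via field-by-field copies. The names ctx_t, cache_elm_t, and cached_leaf_lookup are invented for the sketch; the sizes match the defines above.

#include <stddef.h>
#include <stdint.h>

#define NCACHE		16	/* L1: direct mapped (RTREE_CTX_NCACHE). */
#define NCACHE_L2	8	/* L2: LRU (RTREE_CTX_NCACHE_L2). */

typedef struct {
	uintptr_t	leafkey;	/* Key identifying the cached leaf. */
	void		*leaf;		/* Leaf node (opaque in this model). */
} cache_elm_t;

typedef struct {
	cache_elm_t	cache[NCACHE];
	cache_elm_t	l2_cache[NCACHE_L2];
} ctx_t;

/*
 * Two-level lookup: check the direct-mapped L1 slot first; on a miss,
 * linearly search L2.  An L2 hit at position i is promoted into the L1
 * slot, the entry displaced from L1 drops into L2 at position i - 1 (or
 * position 0 when i == 0), and the former occupant of i - 1 slides down
 * to i ("bubble up by one").
 */
static void *
cached_leaf_lookup(ctx_t *ctx, size_t slot, uintptr_t leafkey) {
	if (ctx->cache[slot].leafkey == leafkey) {
		return ctx->cache[slot].leaf;		/* L1 hit. */
	}
	for (unsigned i = 0; i < NCACHE_L2; i++) {
		if (ctx->l2_cache[i].leafkey != leafkey) {
			continue;
		}
		void *leaf = ctx->l2_cache[i].leaf;	/* L2 hit. */
		unsigned dst = (i > 0) ? i - 1 : 0;
		if (i > 0) {
			/* Slide the i - 1 entry down by one. */
			ctx->l2_cache[i] = ctx->l2_cache[i - 1];
		}
		ctx->l2_cache[dst] = ctx->cache[slot];	/* Demote L1 entry. */
		ctx->cache[slot].leafkey = leafkey;	/* Promote to L1. */
		ctx->cache[slot].leaf = leaf;
		return leaf;
	}
	return NULL;	/* Both levels missed; fall back to the hard path. */
}

The design point is that an L1 collision victim is not discarded but demoted into L2, so two hot leaves that collide in the same L1 slot ping-pong between L1 and the front of L2 instead of forcing repeated full tree walks.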
 * done via rtree_ctx_data_init().
diff --git a/src/prof.c b/src/prof.c
index 334466b1..b33e9397 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -310,6 +310,7 @@ prof_leave(tsd_t *tsd, prof_tdata_t *tdata) {
 }
 
 #ifdef JEMALLOC_PROF_LIBUNWIND
+JEMALLOC_ALIGNED(CACHELINE)
 void
 prof_backtrace(prof_bt_t *bt) {
 	int nframes;
diff --git a/src/rtree.c b/src/rtree.c
index 051428f1..8d11d99f 100644
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -256,6 +256,16 @@ rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 	leaf = rtree->root;
 #endif
 
+	if (config_debug) {
+		uintptr_t leafkey = rtree_leafkey(key);
+		for (unsigned i = 0; i < RTREE_CTX_NCACHE; i++) {
+			assert(rtree_ctx->cache[i].leafkey != leafkey);
+		}
+		for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) {
+			assert(rtree_ctx->l2_cache[i].leafkey != leafkey);
+		}
+	}
+
 #define RTREE_GET_CHILD(level) { \
 	assert(level < RTREE_HEIGHT-1); \
 	if (level != 0 && !dependent && \
@@ -277,20 +287,30 @@ rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 			    dependent); \
 	} \
 }
+	/*
+	 * Cache replacement upon hard lookup (i.e. L1 & L2 rtree cache miss):
+	 * (1) evict the last entry in the L2 cache; (2) move the colliding L1
+	 * slot down to the front of L2; and (3) fill the L1 slot.
+	 */
 #define RTREE_GET_LEAF(level) { \
 	assert(level == RTREE_HEIGHT-1); \
 	if (!dependent && unlikely(!rtree_leaf_valid(leaf))) { \
 		return NULL; \
 	} \
-	if (RTREE_CTX_NCACHE > 1) { \
-		memmove(&rtree_ctx->cache[1], \
-		    &rtree_ctx->cache[0], \
+	if (RTREE_CTX_NCACHE_L2 > 1) { \
+		memmove(&rtree_ctx->l2_cache[1], \
+		    &rtree_ctx->l2_cache[0], \
 		    sizeof(rtree_ctx_cache_elm_t) * \
-		    (RTREE_CTX_NCACHE-1)); \
+		    (RTREE_CTX_NCACHE_L2 - 1)); \
 	} \
+	size_t slot = rtree_cache_direct_map(key); \
+	rtree_ctx->l2_cache[0].leafkey = \
+	    rtree_ctx->cache[slot].leafkey; \
+	rtree_ctx->l2_cache[0].leaf = \
+	    rtree_ctx->cache[slot].leaf; \
 	uintptr_t leafkey = rtree_leafkey(key); \
-	rtree_ctx->cache[0].leafkey = leafkey; \
-	rtree_ctx->cache[0].leaf = leaf; \
+	rtree_ctx->cache[slot].leafkey = leafkey; \
+	rtree_ctx->cache[slot].leaf = leaf; \
 	uintptr_t subkey = rtree_subkey(key, level); \
 	return &leaf[subkey]; \
 }
@@ -433,6 +453,11 @@ rtree_ctx_data_init(rtree_ctx_t *ctx) {
 		cache->leafkey = RTREE_LEAFKEY_INVALID;
 		cache->leaf = NULL;
 	}
+	for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) {
+		rtree_ctx_cache_elm_t *cache = &ctx->l2_cache[i];
+		cache->leafkey = RTREE_LEAFKEY_INVALID;
+		cache->leaf = NULL;
+	}
 }
 
 bool
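[Review sketch, not part of the patch] The replacement policy in RTREE_GET_LEAF() on a hard lookup (both cache levels missed) follows the three numbered steps in the comment above. Sketched below against the toy ctx_t model from the previous sketch; cache_replace_on_miss is an invented name.

#include <string.h>

/* Reuses cache_elm_t, ctx_t, and the sizes from the previous sketch. */
static void
cache_replace_on_miss(ctx_t *ctx, size_t slot, uintptr_t leafkey, void *leaf) {
	/* (1) Evict the last (least recently used) L2 entry by shifting. */
	memmove(&ctx->l2_cache[1], &ctx->l2_cache[0],
	    sizeof(cache_elm_t) * (NCACHE_L2 - 1));
	/* (2) Demote the colliding L1 slot to the front of L2. */
	ctx->l2_cache[0] = ctx->cache[slot];
	/* (3) Fill the L1 slot with the leaf found by the hard lookup. */
	ctx->cache[slot].leafkey = leafkey;
	ctx->cache[slot].leaf = leaf;
}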