Improve rtree cache with a two-level cache design.
Two levels of rcache are implemented: a direct-mapped cache as L1, combined with an LRU cache as L2. The L1 cache offers low cost on cache hit, but can suffer collisions under certain circumstances. This is complemented by the L2 LRU cache, which is slower on cache access (overhead from linear search plus reordering) but resolves L1 collisions rather well.
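As a sketch of the scheme, here is a minimal standalone rendering in C (hypothetical cache_entry_t, sizes, and globals invented for this sketch, not jemalloc's actual code; the real logic is rtree_leaf_elm_lookup in the diff below):

#include <stddef.h>
#include <string.h>

#define NCACHE_L1 16    /* direct mapped; must be a power of two */
#define NCACHE_L2 8     /* LRU, searched linearly */

typedef struct {
        unsigned long key;      /* assumed nonzero for valid entries */
        void *val;
} cache_entry_t;

static cache_entry_t l1[NCACHE_L1];
static cache_entry_t l2[NCACHE_L2];

static void *
cache_lookup(unsigned long key) {
        /* L1: the slot is computed from the key, so a hit costs one
         * comparison -- but two hot keys mapping to the same slot would
         * keep evicting each other (collision). */
        size_t slot = key & (NCACHE_L1 - 1);
        if (l1[slot].key == key) {
                return l1[slot].val;
        }
        /* L2: linear search and reordering make access slower, but any
         * eight keys can reside simultaneously -- no collisions. */
        for (size_t i = 0; i < NCACHE_L2; i++) {
                if (l2[i].key != key) {
                        continue;
                }
                cache_entry_t hit = l2[i];
                /* Swap the hit into L1; demote the displaced L1 entry
                 * into L2, bubbled up one position toward the front. */
                if (i > 0) {
                        l2[i] = l2[i - 1];
                        l2[i - 1] = l1[slot];
                } else {
                        l2[0] = l1[slot];
                }
                l1[slot] = hit;
                return hit.val;
        }
        return NULL;    /* miss in both levels: fall back to tree walk */
}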
parent d16f1e53df
commit 3c9c41edb2
@@ -64,6 +64,15 @@ rtree_leafkey(uintptr_t key) {
         return (key & mask);
 }
 
+JEMALLOC_ALWAYS_INLINE size_t
+rtree_cache_direct_map(uintptr_t key) {
+        unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
+        unsigned cumbits = (rtree_levels[RTREE_HEIGHT-1].cumbits -
+            rtree_levels[RTREE_HEIGHT-1].bits);
+        unsigned maskbits = ptrbits - cumbits;
+        return (size_t)((key >> maskbits) & (RTREE_CTX_NCACHE - 1));
+}
+
 JEMALLOC_ALWAYS_INLINE uintptr_t
 rtree_subkey(uintptr_t key, unsigned level) {
         unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
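For intuition, a self-contained worked example of this slot computation, using illustrative constants in place of the real rtree_levels geometry (64-bit pointers assumed; the actual cumbits come from the tree configuration, which this sketch does not reproduce):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int
main(void) {
        unsigned ptrbits = 1U << (3 + 3);       /* LG_SIZEOF_PTR == 3 -> 64 key bits */
        unsigned cumbits = 34;                  /* assumed: key bits consumed above the leaf level */
        unsigned maskbits = ptrbits - cumbits;  /* 30: one leaf spans 2^30 bytes of address space */

        uintptr_t key = (uintptr_t)0x7f1234abcd00;
        size_t slot = (size_t)((key >> maskbits) & (16 - 1));   /* RTREE_CTX_NCACHE == 16 */
        assert(slot < 16);
        /* Any two keys within the same 2^30-byte leaf range map to the
         * same L1 slot, so one cached leaf serves the whole range. */
        assert(slot == (size_t)(((key + 4096) >> maskbits) & (16 - 1)));
        return 0;
}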
@@ -320,36 +329,54 @@ rtree_leaf_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
         assert(key != 0);
         assert(!dependent || !init_missing);
 
+        size_t slot = rtree_cache_direct_map(key);
         uintptr_t leafkey = rtree_leafkey(key);
         assert(leafkey != RTREE_LEAFKEY_INVALID);
 
-#define RTREE_CACHE_CHECK(i) do { \
-        if (likely(rtree_ctx->cache[i].leafkey == leafkey)) { \
-                rtree_leaf_elm_t *leaf = rtree_ctx->cache[i].leaf; \
+        /* Fast path: L1 direct mapped cache. */
+        if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) {
+                rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf;
+                assert(leaf != NULL);
+                uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1);
+                return &leaf[subkey];
+        }
+        /*
+         * Search the L2 LRU cache.  On hit, swap the matching element into the
+         * slot in L1 cache, and move the position in L2 up by 1.
+         */
+#define RTREE_CACHE_CHECK_L2(i) do { \
+        if (likely(rtree_ctx->l2_cache[i].leafkey == leafkey)) { \
+                rtree_leaf_elm_t *leaf = rtree_ctx->l2_cache[i].leaf; \
                 assert(leaf != NULL); \
                 if (i > 0) { \
                         /* Bubble up by one. */ \
-                        rtree_ctx->cache[i].leafkey = \
-                            rtree_ctx->cache[i - 1].leafkey; \
-                        rtree_ctx->cache[i].leaf = \
-                            rtree_ctx->cache[i - 1].leaf; \
-                        rtree_ctx->cache[i - 1].leafkey = leafkey; \
-                        rtree_ctx->cache[i - 1].leaf = leaf; \
+                        rtree_ctx->l2_cache[i].leafkey = \
+                            rtree_ctx->l2_cache[i - 1].leafkey; \
+                        rtree_ctx->l2_cache[i].leaf = \
+                            rtree_ctx->l2_cache[i - 1].leaf; \
+                        rtree_ctx->l2_cache[i - 1].leafkey = \
+                            rtree_ctx->cache[slot].leafkey; \
+                        rtree_ctx->l2_cache[i - 1].leaf = \
+                            rtree_ctx->cache[slot].leaf; \
+                } else { \
+                        rtree_ctx->l2_cache[0].leafkey = \
+                            rtree_ctx->cache[slot].leafkey; \
+                        rtree_ctx->l2_cache[0].leaf = \
+                            rtree_ctx->cache[slot].leaf; \
                 } \
+                rtree_ctx->cache[slot].leafkey = leafkey; \
+                rtree_ctx->cache[slot].leaf = leaf; \
                 uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); \
                 return &leaf[subkey]; \
         } \
 } while (0)
         /* Check the first cache entry. */
-        RTREE_CACHE_CHECK(0);
-        /*
-         * Search the remaining cache elements, and on success move the matching
-         * element up by one slot.
-         */
-        for (unsigned i = 1; i < RTREE_CTX_NCACHE; i++) {
-                RTREE_CACHE_CHECK(i);
+        RTREE_CACHE_CHECK_L2(0);
+        /* Search the remaining cache elements. */
+        for (unsigned i = 1; i < RTREE_CTX_NCACHE_L2; i++) {
+                RTREE_CACHE_CHECK_L2(i);
         }
-#undef RTREE_CACHE_CHECK
+#undef RTREE_CACHE_CHECK_L2
 
         return rtree_leaf_elm_lookup_hard(tsdn, rtree, rtree_ctx, key,
             dependent, init_missing);
@@ -55,7 +55,10 @@ struct rtree_ctx_cache_elm_s {
 };
 
 struct rtree_ctx_s {
+        /* Direct mapped cache. */
         rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE];
+        /* L2 LRU cache. */
+        rtree_ctx_cache_elm_t l2_cache[RTREE_CTX_NCACHE_L2];
 };
 
 struct rtree_s {
@@ -42,19 +42,25 @@ typedef struct rtree_s rtree_t;
 #define RTREE_LEAFKEY_INVALID ((uintptr_t)1)
 
 /*
- * Number of leafkey/leaf pairs to cache.  Each entry supports an entire leaf,
- * so the cache hit rate is typically high even with a small number of entries.
- * In rare cases extent activity will straddle the boundary between two leaf
- * nodes.  Furthermore, an arena may use a combination of dss and mmap.  Four
- * entries covers both of these considerations as long as locality of reference
- * is high, and/or total memory usage doesn't exceed the range supported by
- * those entries.  Note that as memory usage grows past the amount that this
- * cache can directly cover, the cache will become less effective if locality of
- * reference is low, but the consequence is merely cache misses while traversing
- * the tree nodes, and the cache will itself suffer cache misses if made overly
- * large, not to mention the cost of linear search.
+ * Number of leafkey/leaf pairs to cache in the L1 and L2 levels respectively.
+ * Each entry supports an entire leaf, so the cache hit rate is typically high
+ * even with a small number of entries.  In rare cases extent activity will
+ * straddle the boundary between two leaf nodes.  Furthermore, an arena may use
+ * a combination of dss and mmap.  Note that as memory usage grows past the
+ * amount that this cache can directly cover, the cache will become less
+ * effective if locality of reference is low, but the consequence is merely
+ * cache misses while traversing the tree nodes.
+ *
+ * The L1 direct mapped cache offers consistent and low cost on cache hit.
+ * However, collisions can affect the hit rate negatively.  This is resolved by
+ * combining with an L2 LRU cache, which requires linear search and re-ordering
+ * on access but suffers no collisions.  Note that the cache will itself suffer
+ * cache misses if made overly large, plus the cost of linear search in the LRU
+ * cache.
  */
-#define RTREE_CTX_NCACHE 8
+#define RTREE_CTX_LG_NCACHE 4
+#define RTREE_CTX_NCACHE (1 << RTREE_CTX_LG_NCACHE)
+#define RTREE_CTX_NCACHE_L2 8
 
 /*
  * Zero initializer required for tsd initialization only.  Proper initialization
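Back-of-envelope footprint for these sizes, assuming rtree_ctx_cache_elm_t holds a leafkey plus a leaf pointer (the elm_t below is a stand-in for that struct):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
        uintptr_t leafkey;
        void *leaf;
} elm_t;        /* stand-in for rtree_ctx_cache_elm_t */

int
main(void) {
        size_t l1 = (1u << 4) * sizeof(elm_t);  /* RTREE_CTX_NCACHE = 16 entries */
        size_t l2 = 8 * sizeof(elm_t);          /* RTREE_CTX_NCACHE_L2 = 8 entries */
        /* On LP64: 256 + 128 = 384 bytes per rtree_ctx, i.e. six
         * 64-byte cache lines for both levels combined. */
        printf("L1 %zu + L2 %zu = %zu bytes\n", l1, l2, l1 + l2);
        return 0;
}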
@@ -310,6 +310,7 @@ prof_leave(tsd_t *tsd, prof_tdata_t *tdata) {
 }
 
 #ifdef JEMALLOC_PROF_LIBUNWIND
+JEMALLOC_ALIGNED(CACHELINE)
 void
 prof_backtrace(prof_bt_t *bt) {
         int nframes;
src/rtree.c
@@ -256,6 +256,16 @@ rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
         leaf = rtree->root;
 #endif
 
+        if (config_debug) {
+                uintptr_t leafkey = rtree_leafkey(key);
+                for (unsigned i = 0; i < RTREE_CTX_NCACHE; i++) {
+                        assert(rtree_ctx->cache[i].leafkey != leafkey);
+                }
+                for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) {
+                        assert(rtree_ctx->l2_cache[i].leafkey != leafkey);
+                }
+        }
+
 #define RTREE_GET_CHILD(level) { \
         assert(level < RTREE_HEIGHT-1); \
         if (level != 0 && !dependent && \
@@ -277,20 +287,30 @@ rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
                     dependent); \
         } \
 }
+        /*
+         * Cache replacement upon hard lookup (i.e. L1 & L2 rtree cache miss):
+         * (1) evict the last entry in the L2 cache; (2) move the collision
+         * slot from L1 cache down to L2; and (3) fill L1.
+         */
 #define RTREE_GET_LEAF(level) { \
         assert(level == RTREE_HEIGHT-1); \
         if (!dependent && unlikely(!rtree_leaf_valid(leaf))) { \
                 return NULL; \
         } \
-        if (RTREE_CTX_NCACHE > 1) { \
-                memmove(&rtree_ctx->cache[1], \
-                    &rtree_ctx->cache[0], \
+        if (RTREE_CTX_NCACHE_L2 > 1) { \
+                memmove(&rtree_ctx->l2_cache[1], \
+                    &rtree_ctx->l2_cache[0], \
                     sizeof(rtree_ctx_cache_elm_t) * \
-                    (RTREE_CTX_NCACHE-1)); \
+                    (RTREE_CTX_NCACHE_L2 - 1)); \
         } \
+        size_t slot = rtree_cache_direct_map(key); \
+        rtree_ctx->l2_cache[0].leafkey = \
+            rtree_ctx->cache[slot].leafkey; \
+        rtree_ctx->l2_cache[0].leaf = \
+            rtree_ctx->cache[slot].leaf; \
         uintptr_t leafkey = rtree_leafkey(key); \
-        rtree_ctx->cache[0].leafkey = leafkey; \
-        rtree_ctx->cache[0].leaf = leaf; \
+        rtree_ctx->cache[slot].leafkey = leafkey; \
+        rtree_ctx->cache[slot].leaf = leaf; \
         uintptr_t subkey = rtree_subkey(key, level); \
         return &leaf[subkey]; \
 }
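The same replacement chain, continued in the hypothetical sketch from the commit message (reusing its cache_entry_t, l1, l2, and NCACHE_* definitions; not the actual jemalloc code):

/*
 * On a full miss, after the tree walk produces the value: (1) the
 * memmove drops the last L2 entry, (2) the displaced L1 slot becomes
 * the newest L2 entry, (3) the fresh mapping fills L1.
 */
static void
cache_fill(unsigned long key, void *val) {
        size_t slot = key & (NCACHE_L1 - 1);
        /* (1) Shift L2 down by one, evicting its last (oldest) entry. */
        memmove(&l2[1], &l2[0], sizeof(cache_entry_t) * (NCACHE_L2 - 1));
        /* (2) Demote whatever occupied the colliding L1 slot to the
         * front of L2. */
        l2[0] = l1[slot];
        /* (3) Fill L1 with the mapping produced by the hard lookup. */
        l1[slot].key = key;
        l1[slot].val = val;
}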
@@ -433,6 +453,11 @@ rtree_ctx_data_init(rtree_ctx_t *ctx) {
                 cache->leafkey = RTREE_LEAFKEY_INVALID;
                 cache->leaf = NULL;
         }
+        for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) {
+                rtree_ctx_cache_elm_t *cache = &ctx->l2_cache[i];
+                cache->leafkey = RTREE_LEAFKEY_INVALID;
+                cache->leaf = NULL;
+        }
 }
 
 bool