diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index d1166b20..8c8653f0 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -419,7 +419,6 @@ rtree_child_read
 rtree_child_read_hard
 rtree_child_tryread
 rtree_clear
-rtree_ctx_start_level
 rtree_delete
 rtree_elm_acquire
 rtree_elm_lookup
@@ -431,6 +430,7 @@ rtree_elm_witness_acquire
 rtree_elm_witness_release
 rtree_elm_write
 rtree_elm_write_acquired
+rtree_leafkey
 rtree_new
 rtree_node_alloc
 rtree_node_dalloc
diff --git a/include/jemalloc/internal/rtree_inlines.h b/include/jemalloc/internal/rtree_inlines.h
index 3316ea37..88d6ee00 100644
--- a/include/jemalloc/internal/rtree_inlines.h
+++ b/include/jemalloc/internal/rtree_inlines.h
@@ -3,8 +3,7 @@
 
 #ifndef JEMALLOC_ENABLE_INLINE
 unsigned rtree_start_level(const rtree_t *rtree, uintptr_t key);
-unsigned rtree_ctx_start_level(const rtree_t *rtree,
-    const rtree_ctx_t *rtree_ctx, uintptr_t key);
+uintptr_t rtree_leafkey(rtree_t *rtree, uintptr_t key);
 uintptr_t rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level);
 
 bool rtree_node_valid(rtree_elm_t *node);
@@ -50,31 +49,24 @@ rtree_start_level(const rtree_t *rtree, uintptr_t key) {
     return start_level;
 }
 
-JEMALLOC_ALWAYS_INLINE unsigned
-rtree_ctx_start_level(const rtree_t *rtree, const rtree_ctx_t *rtree_ctx,
-    uintptr_t key) {
-    unsigned start_level;
-    uintptr_t key_diff;
-
-    /* Compute the difference between old and new lookup keys. */
-    key_diff = key ^ rtree_ctx->key;
-    assert(key_diff != 0); /* Handled in rtree_elm_lookup(). */
-
-    /*
-     * Compute the last traversal path element at which the keys' paths
-     * are the same.
-     */
-    start_level = rtree->start_level[(lg_floor(key_diff) + 1) >>
-        LG_RTREE_BITS_PER_LEVEL];
-    assert(start_level < rtree->height);
-    return start_level;
+JEMALLOC_ALWAYS_INLINE uintptr_t
+rtree_leafkey(rtree_t *rtree, uintptr_t key) {
+    unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
+    unsigned cumbits = (rtree->levels[rtree->height-1].cumbits -
+        rtree->levels[rtree->height-1].bits);
+    unsigned maskbits = ptrbits - cumbits;
+    uintptr_t mask = ~((ZU(1) << maskbits) - 1);
+    return (key & mask);
 }
 
 JEMALLOC_ALWAYS_INLINE uintptr_t
 rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level) {
-    return ((key >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
-        rtree->levels[level].cumbits)) & ((ZU(1) <<
-        rtree->levels[level].bits) - 1));
+    unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
+    unsigned cumbits = rtree->levels[level].cumbits;
+    unsigned shiftbits = ptrbits - cumbits;
+    unsigned maskbits = rtree->levels[level].bits;
+    unsigned mask = (ZU(1) << maskbits) - 1;
+    return ((key >> shiftbits) & mask);
 }
 
 JEMALLOC_ALWAYS_INLINE bool
@@ -170,103 +162,89 @@ rtree_subtree_read(tsdn_t *tsdn, rtree_t *rtree, unsigned level,
 JEMALLOC_ALWAYS_INLINE rtree_elm_t *
 rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
     uintptr_t key, bool dependent, bool init_missing) {
-    uintptr_t subkey;
-    unsigned start_level;
-    rtree_elm_t *node;
-
     assert(!dependent || !init_missing);
 
-    if (dependent || init_missing) {
-        if (likely(rtree_ctx->valid)) {
-            if (key == rtree_ctx->key) {
-                return rtree_ctx->elms[rtree->height];
-            } else {
-                unsigned no_ctx_start_level =
-                    rtree_start_level(rtree, key);
-                unsigned ctx_start_level;
-
-                if (likely(no_ctx_start_level <=
-                    rtree_ctx->start_level && (ctx_start_level =
-                    rtree_ctx_start_level(rtree, rtree_ctx,
-                    key)) >= rtree_ctx->start_level)) {
-                    start_level = ctx_start_level;
-                    node = rtree_ctx->elms[ctx_start_level];
-                } else {
-                    start_level = no_ctx_start_level;
-                    node = init_missing ?
-                        rtree_subtree_read(tsdn, rtree,
-                        no_ctx_start_level, dependent) :
-                        rtree_subtree_tryread(rtree,
-                        no_ctx_start_level, dependent);
-                    rtree_ctx->start_level =
-                        no_ctx_start_level;
-                    rtree_ctx->elms[no_ctx_start_level] =
-                        node;
-                }
-            }
-        } else {
-            unsigned no_ctx_start_level = rtree_start_level(rtree,
-                key);
-
-            start_level = no_ctx_start_level;
-            node = init_missing ? rtree_subtree_read(tsdn, rtree,
-                no_ctx_start_level, dependent) :
-                rtree_subtree_tryread(rtree, no_ctx_start_level,
-                dependent);
-            rtree_ctx->valid = true;
-            rtree_ctx->start_level = no_ctx_start_level;
-            rtree_ctx->elms[no_ctx_start_level] = node;
+    /* Search the cache. */
+    uintptr_t leafkey = rtree_leafkey(rtree, key);
+    if (likely(key != 0)) {
+#define RTREE_CACHE_CHECK(i) do { \
+        if (likely(rtree_ctx->cache[i].leafkey == leafkey)) { \
+            rtree_elm_t *leaf = rtree_ctx->cache[i].leaf; \
+            if (likely(leaf != NULL)) { \
+                /* Reorder. */ \
+                memmove(&rtree_ctx->cache[1], \
+                    &rtree_ctx->cache[0], \
+                    sizeof(rtree_ctx_cache_elm_t) * i); \
+                rtree_ctx->cache[0].leafkey = leafkey; \
+                rtree_ctx->cache[0].leaf = leaf; \
+        \
+                uintptr_t subkey = rtree_subkey(rtree, \
+                    key, rtree->height-1); \
+                return &leaf[subkey]; \
+            } \
+        } \
+} while (0)
+        /* Check the MRU cache entry. */
+        RTREE_CACHE_CHECK(0);
+        /*
+         * Search the remaining cache elements, and on success move the
+         * matching element to the front.  Unroll the first iteration to
+         * avoid calling memmove() (the compiler typically optimizes it
+         * into raw moves).
+         */
+        if (RTREE_CTX_NCACHE > 1) {
+            RTREE_CACHE_CHECK(1);
         }
-        rtree_ctx->key = key;
-    } else {
-        start_level = rtree_start_level(rtree, key);
-        node = init_missing ? rtree_subtree_read(tsdn, rtree,
-            start_level, dependent) : rtree_subtree_tryread(rtree,
-            start_level, dependent);
+        for (unsigned i = 2; i < RTREE_CTX_NCACHE; i++) {
+            RTREE_CACHE_CHECK(i);
+        }
+#undef RTREE_CACHE_CHECK
     }
 
+    unsigned start_level = rtree_start_level(rtree, key);
+    rtree_elm_t *node = init_missing ? rtree_subtree_read(tsdn, rtree,
+        start_level, dependent) : rtree_subtree_tryread(rtree, start_level,
+        dependent);
+
 #define RTREE_GET_BIAS (RTREE_HEIGHT_MAX - rtree->height)
     switch (start_level + RTREE_GET_BIAS) {
 #define RTREE_GET_SUBTREE(level) \
-    case level: \
+    case level: { \
         assert(level < (RTREE_HEIGHT_MAX-1)); \
         if (!dependent && unlikely(!rtree_node_valid(node))) { \
-            if (init_missing) { \
-                rtree_ctx->valid = false; \
-            } \
             return NULL; \
         } \
-        subkey = rtree_subkey(rtree, key, level - \
+        uintptr_t subkey = rtree_subkey(rtree, key, level - \
            RTREE_GET_BIAS); \
        node = init_missing ? rtree_child_read(tsdn, rtree, \
            &node[subkey], level - RTREE_GET_BIAS, dependent) : \
            rtree_child_tryread(&node[subkey], dependent); \
-        if (dependent || init_missing) { \
-            rtree_ctx->elms[level - RTREE_GET_BIAS + 1] = \
-                node; \
-        } \
-        /* Fall through. */
+        /* Fall through. */ \
+    }
 #define RTREE_GET_LEAF(level) \
-    case level: \
+    case level: { \
         assert(level == (RTREE_HEIGHT_MAX-1)); \
         if (!dependent && unlikely(!rtree_node_valid(node))) { \
-            if (init_missing) { \
-                rtree_ctx->valid = false; \
-            } \
             return NULL; \
         } \
-        subkey = rtree_subkey(rtree, key, level - \
-            RTREE_GET_BIAS); \
        /* \
         * node is a leaf, so it contains values rather than \
         * child pointers. \
         */ \
-        node = &node[subkey]; \
-        if (dependent || init_missing) { \
-            rtree_ctx->elms[level - RTREE_GET_BIAS + 1] = \
-                node; \
+        if (likely(key != 0)) { \
+            if (RTREE_CTX_NCACHE > 1) { \
+                memmove(&rtree_ctx->cache[1], \
+                    &rtree_ctx->cache[0], \
+                    sizeof(rtree_ctx_cache_elm_t) * \
+                    (RTREE_CTX_NCACHE-1)); \
+            } \
+            rtree_ctx->cache[0].leafkey = leafkey; \
+            rtree_ctx->cache[0].leaf = node; \
        } \
-        return node;
+        uintptr_t subkey = rtree_subkey(rtree, key, level - \
+            RTREE_GET_BIAS); \
+        return &node[subkey]; \
+    }
 #if RTREE_HEIGHT_MAX > 1
     RTREE_GET_SUBTREE(0)
 #endif
@@ -365,16 +343,14 @@ rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
     if (!dependent && elm == NULL) {
         return NULL;
     }
-    {
-        extent_t *extent;
-        void *s;
-
-        do {
-            extent = rtree_elm_read(elm, false);
-            /* The least significant bit serves as a lock. */
-            s = (void *)((uintptr_t)extent | (uintptr_t)0x1);
-        } while (atomic_cas_p(&elm->pun, (void *)extent, s));
-    }
+    extent_t *extent;
+    void *s;
+
+    do {
+        extent = rtree_elm_read(elm, false);
+        /* The least significant bit serves as a lock. */
+        s = (void *)((uintptr_t)extent | (uintptr_t)0x1);
+    } while (atomic_cas_p(&elm->pun, (void *)extent, s));
 
     if (config_debug) {
         rtree_elm_witness_acquire(tsdn, rtree, key, elm);
diff --git a/include/jemalloc/internal/rtree_structs.h b/include/jemalloc/internal/rtree_structs.h
index 5a7a23c7..892156b1 100644
--- a/include/jemalloc/internal/rtree_structs.h
+++ b/include/jemalloc/internal/rtree_structs.h
@@ -54,22 +54,16 @@ struct rtree_level_s {
     unsigned cumbits;
 };
 
+struct rtree_ctx_cache_elm_s {
+    uintptr_t leafkey;
+    rtree_elm_t *leaf;
+};
+
 struct rtree_ctx_s {
-    /* If false, key/elms have not yet been initialized by a lookup. */
-    bool valid;
-    /* Key that corresponds to the tree path recorded in elms. */
-    uintptr_t key;
-    /* Memoized rtree_start_level(key). */
-    unsigned start_level;
-    /*
-     * A path through rtree, driven by key.  Only elements that could
-     * actually be used for subsequent lookups are initialized, i.e. if
-     * start_level = rtree_start_level(key) is non-zero, the first
-     * start_level elements are uninitialized.  The last element contains a
-     * pointer to the leaf node element that corresponds to key, so that
-     * exact matches require no tree node offset computation.
-     */
-    rtree_elm_t *elms[RTREE_HEIGHT_MAX + 1];
+#ifndef _MSC_VER
+    JEMALLOC_ALIGNED(CACHELINE)
+#endif
+    rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE];
 };
 
 struct rtree_s {
diff --git a/include/jemalloc/internal/rtree_types.h b/include/jemalloc/internal/rtree_types.h
index 122d5cef..b4ab018d 100644
--- a/include/jemalloc/internal/rtree_types.h
+++ b/include/jemalloc/internal/rtree_types.h
@@ -12,6 +12,7 @@ typedef struct rtree_elm_s rtree_elm_t;
 typedef struct rtree_elm_witness_s rtree_elm_witness_t;
 typedef struct rtree_elm_witness_tsd_s rtree_elm_witness_tsd_t;
 typedef struct rtree_level_s rtree_level_t;
+typedef struct rtree_ctx_cache_elm_s rtree_ctx_cache_elm_t;
 typedef struct rtree_ctx_s rtree_ctx_t;
 typedef struct rtree_s rtree_t;
 
@@ -25,11 +26,24 @@ typedef struct rtree_s rtree_t;
 #define RTREE_HEIGHT_MAX \
     ((1U << (LG_SIZEOF_PTR+3)) / RTREE_BITS_PER_LEVEL)
 
+/*
+ * Number of leafkey/leaf pairs to cache.  Each entry supports an entire leaf,
+ * so the cache hit rate is typically high even with a small number of entries.
+ * In rare cases extent activity will straddle the boundary between two leaf
+ * nodes.  Furthermore, an arena may use a combination of dss and mmap.  Four
+ * entries covers both of these considerations as long as locality of reference
+ * is high, and/or total memory usage doesn't exceed the range supported by
+ * those entries.  Note that as memory usage grows past the amount that this
+ * cache can directly cover, the cache will become less effective if locality of
+ * reference is low, but the consequence is merely cache misses while traversing
+ * the tree nodes, and the cache will itself suffer cache misses if made overly
+ * large, not to mention the cost of linear search.
+ */
+#define RTREE_CTX_NCACHE 8
+
+/* Static initializer for rtree_ctx_t. */
 #define RTREE_CTX_INITIALIZER { \
-    false, \
-    0, \
-    0, \
-    {NULL /* C initializes all trailing elements to NULL. */} \
+    {{0, NULL} /* C initializes all trailing elements to NULL. */} \
 }
 
 /*
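
A note on the bit arithmetic, for readers who want to see it in isolation. The standalone sketch below is not part of the patch; it hard-codes a hypothetical geometry (64-bit keys, three levels consuming 16 bits each) instead of reading rtree->levels[], but it computes the same quantities as rtree_leafkey() and rtree_subkey(). Keys that differ only in bits below the leaf level share a leafkey, and therefore hit the same cache entry, while rtree_subkey() picks out the slot within that leaf.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical geometry: 64-bit keys, 3 levels consuming 16 bits each. */
#define PTRBITS   64
#define LEVELS    3
#define LEVELBITS 16

/* Bits consumed by levels 0..level, i.e. the analogue of levels[level].cumbits. */
static unsigned
cumbits(unsigned level) {
    return (level + 1) * LEVELBITS;
}

/* Analogue of rtree_leafkey(): clear all bits indexed within the leaf. */
static uintptr_t
leafkey(uintptr_t key) {
    unsigned maskbits = PTRBITS - (cumbits(LEVELS - 1) - LEVELBITS);
    uintptr_t mask = ~((((uintptr_t)1) << maskbits) - 1);
    return key & mask;
}

/* Analogue of rtree_subkey(): extract one level's radix. */
static uintptr_t
subkey(uintptr_t key, unsigned level) {
    unsigned shiftbits = PTRBITS - cumbits(level);
    uintptr_t mask = (((uintptr_t)1) << LEVELBITS) - 1;
    return (key >> shiftbits) & mask;
}

int
main(void) {
    uintptr_t a = (uintptr_t)0x00007f1234560000ULL;
    uintptr_t b = a + ((uintptr_t)1 << 16);   /* Adjacent slot, same leaf. */

    /* Same leaf node, hence the same cache tag... */
    assert(leafkey(a) == leafkey(b));
    /* ...but different slots within that leaf. */
    assert(subkey(a, LEVELS - 1) != subkey(b, LEVELS - 1));

    printf("leafkey=%#lx slot_a=%#lx slot_b=%#lx\n",
        (unsigned long)leafkey(a), (unsigned long)subkey(a, LEVELS - 1),
        (unsigned long)subkey(b, LEVELS - 1));
    return 0;
}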
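The cache management added to rtree_elm_lookup() and RTREE_GET_LEAF() is a plain move-to-front array. The following self-contained sketch uses hypothetical names throughout (NCACHE stands in for RTREE_CTX_NCACHE, void * for the leaf pointer) and omits the zero-key special case, but it mirrors the same pattern: probe slot 0 first, otherwise scan the remaining slots; on a hit, memmove() the preceding entries down one position so the match becomes the MRU entry; on a miss, the caller walks the tree and installs the leaf at slot 0, evicting the oldest entry.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NCACHE 8    /* Placeholder for RTREE_CTX_NCACHE. */

typedef struct {
    uintptr_t leafkey;  /* Tag: key with the leaf-level bits masked off. */
    void *leaf;         /* Cached pointer to the leaf node. */
} cache_elm_t;

typedef struct {
    cache_elm_t cache[NCACHE];  /* cache[0] is the MRU entry. */
} cache_t;

/* Look up leafkey; on a hit, move the entry to the front and return its leaf. */
static void *
cache_lookup(cache_t *ctx, uintptr_t leafkey) {
    for (size_t i = 0; i < NCACHE; i++) {
        if (ctx->cache[i].leafkey == leafkey && ctx->cache[i].leaf != NULL) {
            cache_elm_t hit = ctx->cache[i];
            /* Shift entries 0..i-1 down one slot (reorder). */
            memmove(&ctx->cache[1], &ctx->cache[0], sizeof(cache_elm_t) * i);
            ctx->cache[0] = hit;
            return hit.leaf;
        }
    }
    return NULL;    /* Miss: the caller walks the tree, then inserts. */
}

/* Insert at the front, evicting the LRU entry, as RTREE_GET_LEAF does. */
static void
cache_insert(cache_t *ctx, uintptr_t leafkey, void *leaf) {
    memmove(&ctx->cache[1], &ctx->cache[0], sizeof(cache_elm_t) * (NCACHE - 1));
    ctx->cache[0].leafkey = leafkey;
    ctx->cache[0].leaf = leaf;
}

int
main(void) {
    cache_t ctx = {{{0, NULL}}};    /* Mirrors RTREE_CTX_INITIALIZER. */
    int leaf_a, leaf_b;

    cache_insert(&ctx, 0x1000, &leaf_a);
    cache_insert(&ctx, 0x2000, &leaf_b);    /* leaf_b is now the MRU entry. */

    /* Hitting 0x1000 moves it back to slot 0. */
    void *hit = cache_lookup(&ctx, 0x1000);
    printf("hit=%p mru=%#zx\n", hit, (size_t)ctx.cache[0].leafkey);
    return 0;
}

RTREE_CTX_NCACHE is kept small precisely so that this linear scan and the memmove() stay cheap, which is the trade-off the comment above RTREE_CTX_NCACHE describes.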
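The rtree_elm_acquire() hunk only removes a redundant block scope, but the loop it touches is worth spelling out: the least significant bit of the element's pointer word doubles as a spinlock, and the do/while retries the CAS until it observes an unlocked value. A rough standalone C11 rendering follows; it is an illustration under stated assumptions, not jemalloc's code (jemalloc uses its own atomic_cas_p() wrapper shown in the hunk above, and extent_t below is merely an opaque placeholder).

#include <stdatomic.h>
#include <stdint.h>

typedef struct extent_s extent_t;   /* Opaque placeholder for the payload. */

typedef struct {
    _Atomic uintptr_t pun;          /* Pointer bits; bit 0 is the lock. */
} elm_t;

/*
 * Spin until the slot is unlocked, then atomically set the lock bit.
 * Returns the payload pointer that was stored (lock bit stripped).
 */
static extent_t *
elm_acquire(elm_t *elm) {
    uintptr_t unlocked, locked;
    do {
        /* Expected value: current contents with the lock bit cleared. */
        unlocked = atomic_load_explicit(&elm->pun, memory_order_relaxed) &
            ~(uintptr_t)0x1;
        locked = unlocked | 0x1;
        /* Succeeds only if nobody else holds the lock right now. */
    } while (!atomic_compare_exchange_weak_explicit(&elm->pun, &unlocked,
        locked, memory_order_acquire, memory_order_relaxed));
    return (extent_t *)unlocked;
}

/* Store a new payload with the lock bit clear, releasing the lock. */
static void
elm_release(elm_t *elm, extent_t *extent) {
    atomic_store_explicit(&elm->pun, (uintptr_t)extent, memory_order_release);
}

int
main(void) {
    elm_t elm;
    atomic_init(&elm.pun, 0);
    extent_t *prev = elm_acquire(&elm);    /* Locks; prev is NULL here. */
    elm_release(&elm, prev);               /* Unlocks, restoring NULL. */
    return 0;
}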