From 4d56aaeca5883ae5f4b5550c528503fb51fdf479 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Tue, 19 Oct 2021 17:14:08 -0700
Subject: [PATCH] Optimize away the tsd_fast() check on free fastpath.

To ensure that the free fastpath can tolerate an uninitialized tsd, improve
the static initializer for rtree_ctx in tsd.
---
 include/jemalloc/internal/emap.h         |  6 ++++--
 include/jemalloc/internal/rtree.h        |  3 ---
 include/jemalloc/internal/rtree_tsd.h    | 26 +++++++++++++++++------
 include/jemalloc/internal/thread_event.h | 15 ++++---------
 include/jemalloc/internal/tsd.h          |  2 +-
 src/jemalloc.c                           | 27 ++++++++++++------------
 test/unit/tsd.c                          |  7 ++++++
 7 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/include/jemalloc/internal/emap.h b/include/jemalloc/internal/emap.h
index a40b504b..87ece63d 100644
--- a/include/jemalloc/internal/emap.h
+++ b/include/jemalloc/internal/emap.h
@@ -276,12 +276,14 @@ emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
 }
 
 /*
- * Returns true on error.
+ * Only used on the free fastpath.  Returns true when the lookup cannot be
+ * fulfilled by the fast path, e.g. when the metadata key is not cached.
  */
 JEMALLOC_ALWAYS_INLINE bool
 emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr,
     emap_alloc_ctx_t *alloc_ctx) {
-	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
+	/* Use the unsafe getter since this may get called during exit. */
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd);
 
 	rtree_metadata_t metadata;
 	bool err = rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree,
diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h
index c5f0d8c4..b4f44840 100644
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@@ -35,9 +35,6 @@
 # define RTREE_LEAF_COMPACT
 #endif
 
-/* Needed for initialization only. */
-#define RTREE_LEAFKEY_INVALID ((uintptr_t)1)
-
 typedef struct rtree_node_elm_s rtree_node_elm_t;
 struct rtree_node_elm_s {
 	atomic_p_t	child;	/* (rtree_{node,leaf}_elm_t *) */
diff --git a/include/jemalloc/internal/rtree_tsd.h b/include/jemalloc/internal/rtree_tsd.h
index 562e2929..e45525c5 100644
--- a/include/jemalloc/internal/rtree_tsd.h
+++ b/include/jemalloc/internal/rtree_tsd.h
@@ -18,16 +18,28 @@
  * cache misses if made overly large, plus the cost of linear search in the LRU
  * cache.
  */
-#define RTREE_CTX_LG_NCACHE 4
-#define RTREE_CTX_NCACHE (1 << RTREE_CTX_LG_NCACHE)
+#define RTREE_CTX_NCACHE 16
 #define RTREE_CTX_NCACHE_L2 8
 
-/*
- * Zero initializer required for tsd initialization only. Proper initialization
- * done via rtree_ctx_data_init().
- */
-#define RTREE_CTX_ZERO_INITIALIZER {{{0, 0}}, {{0, 0}}}
+/* Needed for initialization only. */
+#define RTREE_LEAFKEY_INVALID ((uintptr_t)1)
+#define RTREE_CTX_CACHE_ELM_INVALID {RTREE_LEAFKEY_INVALID, NULL}
+#define RTREE_CTX_INIT_ELM_1 RTREE_CTX_CACHE_ELM_INVALID
+#define RTREE_CTX_INIT_ELM_2 RTREE_CTX_INIT_ELM_1, RTREE_CTX_INIT_ELM_1
+#define RTREE_CTX_INIT_ELM_4 RTREE_CTX_INIT_ELM_2, RTREE_CTX_INIT_ELM_2
+#define RTREE_CTX_INIT_ELM_8 RTREE_CTX_INIT_ELM_4, RTREE_CTX_INIT_ELM_4
+#define RTREE_CTX_INIT_ELM_16 RTREE_CTX_INIT_ELM_8, RTREE_CTX_INIT_ELM_8
+
+#define _RTREE_CTX_INIT_ELM_DATA(n) RTREE_CTX_INIT_ELM_##n
+#define RTREE_CTX_INIT_ELM_DATA(n) _RTREE_CTX_INIT_ELM_DATA(n)
+
+/*
+ * A static initializer (to invalidate the cache entries) is required because
+ * the free fastpath may access the rtree cache before full tsd initialization.
+ */
+#define RTREE_CTX_INITIALIZER {{RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE)}, \
+    {RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE_L2)}}
 
 typedef struct rtree_leaf_elm_s rtree_leaf_elm_t;
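A note on the doubling macros introduced in rtree_tsd.h above: C token pasting
(##) happens before macro-argument expansion, which is why
RTREE_CTX_INIT_ELM_DATA goes through the extra indirection
_RTREE_CTX_INIT_ELM_DATA; the inner level lets RTREE_CTX_NCACHE expand to 16
before it is pasted onto RTREE_CTX_INIT_ELM_. Each RTREE_CTX_INIT_ELM_2n simply
repeats RTREE_CTX_INIT_ELM_n twice, so a 16-entry initializer list needs only
log2(16) helper definitions. Below is a minimal standalone sketch of the same
pattern; the names (cache_elm_t, CACHE_NELMS, etc.) are illustrative, not
jemalloc's actual types:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uintptr_t leafkey;
            void *leaf;
    } cache_elm_t;

    /* A key no valid lookup can produce; marks an entry as empty. */
    #define LEAFKEY_INVALID ((uintptr_t)1)
    #define ELM_INVALID {LEAFKEY_INVALID, NULL}
    #define INIT_ELM_1 ELM_INVALID
    #define INIT_ELM_2 INIT_ELM_1, INIT_ELM_1
    #define INIT_ELM_4 INIT_ELM_2, INIT_ELM_2
    #define INIT_ELM_8 INIT_ELM_4, INIT_ELM_4
    #define INIT_ELM_16 INIT_ELM_8, INIT_ELM_8

    /* Two levels so the argument expands (to 16) before token pasting. */
    #define INIT_ELM_DATA_IMPL(n) INIT_ELM_##n
    #define INIT_ELM_DATA(n) INIT_ELM_DATA_IMPL(n)

    #define CACHE_NELMS 16

    /* Every entry is already invalid at load time, before any runtime init. */
    static cache_elm_t cache[CACHE_NELMS] = {INIT_ELM_DATA(CACHE_NELMS)};

    int
    main(void) {
            for (int i = 0; i < CACHE_NELMS; i++) {
                    printf("entry %2d: key %#lx\n", i,
                        (unsigned long)cache[i].leafkey);
            }
            return 0;
    }

This is what lets the patch replace RTREE_CTX_ZERO_INITIALIZER: an
all-invalid cache is safe to probe even when rtree_ctx_data_init() has not
run yet, whereas an all-zero cache is not.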
diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h
index 525019b6..2f4e1b39 100644
--- a/include/jemalloc/internal/thread_event.h
+++ b/include/jemalloc/internal/thread_event.h
@@ -118,17 +118,10 @@ te_malloc_fastpath_ctx(tsd_t *tsd, uint64_t *allocated, uint64_t *threshold) {
 }
 
 JEMALLOC_ALWAYS_INLINE void
-te_free_fastpath_ctx(tsd_t *tsd, uint64_t *deallocated, uint64_t *threshold,
-    bool size_hint) {
-	if (!size_hint) {
-		*deallocated = tsd_thread_deallocated_get(tsd);
-		*threshold = tsd_thread_deallocated_next_event_fast_get(tsd);
-	} else {
-		/* Unsafe getters since this may happen before tsd_init. */
-		*deallocated = *tsd_thread_deallocatedp_get_unsafe(tsd);
-		*threshold =
-		    *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd);
-	}
+te_free_fastpath_ctx(tsd_t *tsd, uint64_t *deallocated, uint64_t *threshold) {
+	/* Unsafe getters since this may happen before tsd_init. */
+	*deallocated = *tsd_thread_deallocatedp_get_unsafe(tsd);
+	*threshold = *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd);
 	assert(*threshold <= TE_NEXT_EVENT_FAST_MAX);
 }
 
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 86d52778..0a46d448 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -119,7 +119,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
     /* activity_callback_thunk */	\
 	ACTIVITY_CALLBACK_THUNK_INITIALIZER,	\
     /* tcache_slow */	TCACHE_SLOW_ZERO_INITIALIZER,	\
-    /* rtree_ctx */	RTREE_CTX_ZERO_INITIALIZER,
+    /* rtree_ctx */	RTREE_CTX_INITIALIZER,
 
 /*  O(name,			type,			nullable type) */
 #define TSD_DATA_FAST	\
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 469a4910..0c798c87 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -2910,12 +2910,20 @@ free_default(void *ptr) {
 JEMALLOC_ALWAYS_INLINE bool
 free_fastpath(void *ptr, size_t size, bool size_hint) {
 	tsd_t *tsd = tsd_get(false);
+	/* The branch gets optimized away unless tsd_get_allocates(). */
+	if (unlikely(tsd == NULL)) {
+		return false;
+	}
+	/*
+	 * The tsd_fast() / initialized checks are folded into the branch
+	 * testing (deallocated_after >= threshold) later in this function.
+	 * The threshold will be set to 0 when !tsd_fast.
+	 */
+	assert(tsd_fast(tsd) ||
+	    *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0);
 
 	emap_alloc_ctx_t alloc_ctx;
 	if (!size_hint) {
-		if (unlikely(tsd == NULL || !tsd_fast(tsd))) {
-			return false;
-		}
 		bool err = emap_alloc_ctx_try_lookup_fast(tsd,
 		    &arena_emap_global, ptr, &alloc_ctx);
 
@@ -2925,15 +2933,6 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 		}
 		assert(alloc_ctx.szind != SC_NSIZES);
 	} else {
-		/*
-		 * The size hinted fastpath does not involve rtree lookup, thus
-		 * can tolerate an uninitialized tsd. This allows the tsd_fast
-		 * check to be folded into the branch testing fast_threshold
-		 * (set to 0 when !tsd_fast).
-		 */
-		if (unlikely(tsd == NULL)) {
-			return false;
-		}
 		/*
 		 * Check for both sizes that are too large, and for sampled
 		 * objects. Sampled objects are always page-aligned. The
@@ -2949,7 +2948,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	}
 
 	uint64_t deallocated, threshold;
-	te_free_fastpath_ctx(tsd, &deallocated, &threshold, size_hint);
+	te_free_fastpath_ctx(tsd, &deallocated, &threshold);
 
 	size_t usize = sz_index2size(alloc_ctx.szind);
 	uint64_t deallocated_after = deallocated + usize;
@@ -2963,7 +2962,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	if (unlikely(deallocated_after >= threshold)) {
 		return false;
 	}
-
+	assert(tsd_fast(tsd));
 	bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx);
 	if (fail) {
 		/* See the comment in isfree. */
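The heart of the change above is that free_fastpath no longer tests tsd_fast()
(or tsd initialization) on its own. The thread's deallocation event threshold
is kept at 0 whenever the tsd is not fast, including the statically
initialized, pre-init state, so the single unsigned comparison
deallocated_after >= threshold already routes all of those cases to the slow
path. A standalone sketch of that branch-folding idea follows; thread_state_t
and free_fast are illustrative names under this assumption, not jemalloc's
actual code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uint64_t deallocated;   /* bytes freed so far on this thread */
            uint64_t threshold;     /* 0 whenever the fast path must not run */
    } thread_state_t;

    /* Zero-initialized: threshold == 0 rejects the fast path by default. */
    static thread_state_t state;

    static bool
    free_fast(thread_state_t *s, uint64_t usize) {
            uint64_t after = s->deallocated + usize;
            /*
             * One branch covers both "not initialized" (threshold still 0)
             * and "an event is due"; no separate readiness flag is tested.
             */
            if (after >= s->threshold) {
                    return false;   /* fall back to the slow path */
            }
            s->deallocated = after;
            return true;
    }

    int
    main(void) {
            printf("before init: %d\n", free_fast(&state, 32));  /* 0 */
            state.threshold = 1024; /* "initialization" enables the path */
            printf("after init:  %d\n", free_fast(&state, 32));  /* 1 */
            return 0;
    }

The new assert at the end of free_fastpath documents the invariant this
relies on: once the threshold branch has been passed, the tsd must be fast.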
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 3f3ca73d..205d8708 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -48,6 +48,13 @@ thd_start(void *arg) {
 	int d = (int)(uintptr_t)arg;
 	void *p;
 
+	/*
+	 * Test free before tsd init -- the free fastpath (which does not
+	 * explicitly check for NULL) has to tolerate this case and fall back
+	 * to free_default.
+	 */
+	free(NULL);
+
 	tsd_t *tsd = tsd_fetch();
 	expect_x_eq(tsd_test_data_get(tsd), MALLOC_TSD_TEST_DATA_INIT,
 	    "Initial tsd get should return initialization value");
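For context, the situation the new test exercises can also be reproduced
outside the test harness: the very first allocator call a thread makes is
free(), which reaches the allocator before any thread-local state is set up.
C guarantees free(NULL) is a no-op, which makes it a safe probe. A
hypothetical minimal program (not part of the patch; it links against
whatever malloc is in use) illustrating the calling pattern:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void *
    thd_start(void *arg) {
            (void)arg;
            /*
             * First allocator call on this thread: free() before any
             * malloc() has initialized thread-local allocator state. The
             * fast path must bail out and defer to the slow path.
             */
            free(NULL);
            void *p = malloc(64);   /* normal initialization happens here */
            free(p);
            return NULL;
    }

    int
    main(void) {
            pthread_t thd;
            if (pthread_create(&thd, NULL, thd_start, NULL) != 0) {
                    return 1;
            }
            pthread_join(thd, NULL);
            puts("ok");
            return 0;
    }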