From 143e9c4a2f4eb8916e9802323485fd91260fd17c Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Fri, 21 Oct 2022 15:10:48 -0700
Subject: [PATCH] Enable fast thread locals for dealloc-only threads.

Previously, if a thread does only deallocations, it stays on the slow
path / minimal initialized state forever.  However, dealloc-only is a
valid pattern for dedicated reclamation threads -- this means the
thread cache is disabled (no batched flush) for them, which causes high
overhead and contention.

Added a condition to fully initialize TSD when a fair amount of dealloc
activity is observed.
---
 include/jemalloc/internal/tsd.h |  4 +++
 src/tsd.c                       | 18 ++++++++++
 test/unit/tsd.c                 | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 66d68822..c6bf28fc 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -59,6 +59,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
 #define TSD_DATA_SLOW \
 	O(tcache_enabled, bool, bool) \
 	O(reentrancy_level, int8_t, int8_t) \
+	O(min_init_state_nfetched, uint8_t, uint8_t) \
 	O(thread_allocated_last_event, uint64_t, uint64_t) \
 	O(thread_allocated_next_event, uint64_t, uint64_t) \
 	O(thread_deallocated_last_event, uint64_t, uint64_t) \
@@ -91,6 +92,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
 #define TSD_DATA_SLOW_INITIALIZER \
 	/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
 	/* reentrancy_level */ 0, \
+	/* min_init_state_nfetched */ 0, \
 	/* thread_allocated_last_event */ 0, \
 	/* thread_allocated_next_event */ 0, \
 	/* thread_deallocated_last_event */ 0, \
@@ -177,6 +179,8 @@ void tsd_global_slow_inc(tsdn_t *tsdn);
 void tsd_global_slow_dec(tsdn_t *tsdn);
 bool tsd_global_slow();
 
+#define TSD_MIN_INIT_STATE_MAX_FETCHED (128)
+
 enum {
 	/* Common case --> jnz. */
 	tsd_state_nominal = 0,
diff --git a/src/tsd.c b/src/tsd.c
index e8e4f3a3..cef7ba58 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -300,9 +300,25 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 			tsd_state_set(tsd, tsd_state_minimal_initialized);
 			tsd_set(tsd);
 			tsd_data_init_nocleanup(tsd);
+			*tsd_min_init_state_nfetchedp_get(tsd) = 1;
 		}
 	} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
-		if (!minimal) {
+		/*
+		 * If a thread only ever deallocates (e.g. dedicated
+		 * reclamation threads), we want to help it eventually escape
+		 * the slow path (caused by the minimal initialized state).
+		 * The nfetched counter tracks the number of times the tsd has
+		 * been accessed under the min init state, and triggers the
+		 * switch to nominal once the max allowed count is reached.
+		 *
+		 * This means at most 128 deallocations stay on the slow path.
+		 *
+		 * Also see comments in free_default().
+		 */
+		uint8_t *nfetched = tsd_min_init_state_nfetchedp_get(tsd);
+		assert(*nfetched >= 1);
+		(*nfetched)++;
+		if (!minimal || *nfetched == TSD_MIN_INIT_STATE_MAX_FETCHED) {
 			/* Switch to fully initialized. */
 			tsd_state_set(tsd, tsd_state_nominal);
 			assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 205d8708..bb5cd9f6 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -136,6 +136,61 @@ TEST_BEGIN(test_tsd_reincarnation) {
 }
 TEST_END
 
+static void *
+thd_start_dalloc_only(void *arg) {
+	void **ptrs = (void **)arg;
+
+	tsd_t *tsd = tsd_fetch_min();
+	if (tsd_state_get(tsd) != tsd_state_minimal_initialized) {
+		/* Allocation happened implicitly. */
+		expect_u_eq(tsd_state_get(tsd), tsd_state_nominal,
+		    "TSD state should be nominal");
+		return NULL;
+	}
+
+	void *ptr;
+	for (size_t i = 0; (ptr = ptrs[i]) != NULL; i++) {
+		/* Offset by 1 because of the manual tsd_fetch_min above. */
+		if (i + 1 < TSD_MIN_INIT_STATE_MAX_FETCHED) {
+			expect_u_eq(tsd_state_get(tsd),
+			    tsd_state_minimal_initialized,
+			    "TSD should be minimal initialized");
+		} else {
+			/* State may be nominal or nominal_slow. */
+			expect_true(tsd_nominal(tsd), "TSD should be nominal");
+		}
+		free(ptr);
+	}
+
+	return NULL;
+}
+
+static void
+test_sub_thread_n_dalloc(size_t nptrs) {
+	void **ptrs = (void **)malloc(sizeof(void *) * (nptrs + 1));
+	for (size_t i = 0; i < nptrs; i++) {
+		ptrs[i] = malloc(8);
+	}
+	ptrs[nptrs] = NULL;
+
+	thd_t thd;
+	thd_create(&thd, thd_start_dalloc_only, (void *)ptrs);
+	thd_join(thd, NULL);
+	free(ptrs);
+}
+
+TEST_BEGIN(test_tsd_sub_thread_dalloc_only) {
+	test_sub_thread_n_dalloc(1);
+	test_sub_thread_n_dalloc(16);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 2);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 1);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 1);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 2);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED * 2);
+}
+TEST_END
+
 typedef struct {
 	atomic_u32_t phase;
 	atomic_b_t error;
@@ -269,6 +324,7 @@ main(void) {
 	return test_no_reentrancy(
 	    test_tsd_main_thread,
 	    test_tsd_sub_thread,
+	    test_tsd_sub_thread_dalloc_only,
 	    test_tsd_reincarnation,
 	    test_tsd_global_slow);
 }
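
Illustration (not part of the patch): the dedicated reclamation-thread
pattern described in the commit message looks roughly like the
stand-alone sketch below -- the main thread allocates, a worker thread
only frees.  The names in the sketch (dealloc_only_worker, NPTRS, the
64-byte allocation size) are made up for the example; only malloc(),
free(), and the TSD behavior described above come from jemalloc.
Previously the worker's TSD would stay minimal initialized for its
whole lifetime, so every free() took the slow path with the thread
cache disabled; with this change, once its TSD has been fetched
TSD_MIN_INIT_STATE_MAX_FETCHED (128) times in the minimal initialized
state, it is promoted to tsd_state_nominal and the remaining frees can
use the thread cache (assuming tcache is enabled).

#include <pthread.h>
#include <stdlib.h>

#define NPTRS 4096

/* Dealloc-only worker: never allocates, only frees what it is handed. */
static void *
dealloc_only_worker(void *arg) {
	void **ptrs = (void **)arg;
	for (size_t i = 0; i < NPTRS; i++) {
		free(ptrs[i]);
	}
	return NULL;
}

int
main(void) {
	static void *ptrs[NPTRS];
	for (size_t i = 0; i < NPTRS; i++) {
		ptrs[i] = malloc(64);
	}
	pthread_t thd;
	pthread_create(&thd, NULL, dealloc_only_worker, (void *)ptrs);
	pthread_join(thd, NULL);
	return 0;
}

The new test_tsd_sub_thread_dalloc_only test above exercises the same
pattern directly against the TSD state machine.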