Enable fast thread locals for dealloc-only threads.
Previously, if a thread does only deallocations, it stays on the slow path / minimal initialized state forever. However, dealloc-only is a valid pattern for dedicated reclamation threads -- this means the thread cache is disabled (no batched flush) for them, which causes high overhead and contention. Added a condition to fully initialize the TSD when a fair amount of dealloc activity is observed.
This commit is contained in:
parent
be65438f20
commit
143e9c4a2f
@ -59,6 +59,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
|
|||||||
#define TSD_DATA_SLOW \
|
#define TSD_DATA_SLOW \
|
||||||
O(tcache_enabled, bool, bool) \
|
O(tcache_enabled, bool, bool) \
|
||||||
O(reentrancy_level, int8_t, int8_t) \
|
O(reentrancy_level, int8_t, int8_t) \
|
||||||
|
O(min_init_state_nfetched, uint8_t, uint8_t) \
|
||||||
O(thread_allocated_last_event, uint64_t, uint64_t) \
|
O(thread_allocated_last_event, uint64_t, uint64_t) \
|
||||||
O(thread_allocated_next_event, uint64_t, uint64_t) \
|
O(thread_allocated_next_event, uint64_t, uint64_t) \
|
||||||
O(thread_deallocated_last_event, uint64_t, uint64_t) \
|
O(thread_deallocated_last_event, uint64_t, uint64_t) \
|
||||||
@ -91,6 +92,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
|
|||||||
#define TSD_DATA_SLOW_INITIALIZER \
|
#define TSD_DATA_SLOW_INITIALIZER \
|
||||||
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
|
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
|
||||||
/* reentrancy_level */ 0, \
|
/* reentrancy_level */ 0, \
|
||||||
|
/* min_init_state_nfetched */ 0, \
|
||||||
/* thread_allocated_last_event */ 0, \
|
/* thread_allocated_last_event */ 0, \
|
||||||
/* thread_allocated_next_event */ 0, \
|
/* thread_allocated_next_event */ 0, \
|
||||||
/* thread_deallocated_last_event */ 0, \
|
/* thread_deallocated_last_event */ 0, \
|
||||||
@ -177,6 +179,8 @@ void tsd_global_slow_inc(tsdn_t *tsdn);
|
|||||||
void tsd_global_slow_dec(tsdn_t *tsdn);
|
void tsd_global_slow_dec(tsdn_t *tsdn);
|
||||||
bool tsd_global_slow();
|
bool tsd_global_slow();
|
||||||
|
|
||||||
|
#define TSD_MIN_INIT_STATE_MAX_FETCHED (128)
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
/* Common case --> jnz. */
|
/* Common case --> jnz. */
|
||||||
tsd_state_nominal = 0,
|
tsd_state_nominal = 0,
|
||||||
|
18
src/tsd.c
18
src/tsd.c
@ -300,9 +300,25 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
|
|||||||
tsd_state_set(tsd, tsd_state_minimal_initialized);
|
tsd_state_set(tsd, tsd_state_minimal_initialized);
|
||||||
tsd_set(tsd);
|
tsd_set(tsd);
|
||||||
tsd_data_init_nocleanup(tsd);
|
tsd_data_init_nocleanup(tsd);
|
||||||
|
*tsd_min_init_state_nfetchedp_get(tsd) = 1;
|
||||||
}
|
}
|
||||||
} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
|
} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
|
||||||
if (!minimal) {
|
/*
|
||||||
|
* If a thread only ever deallocates (e.g. dedicated reclamation
|
||||||
|
* threads), we want to help it to eventually escape the slow
|
||||||
|
* path (caused by the minimal initialized state). The nfetched
|
||||||
|
* counter tracks the number of times the tsd has been accessed
|
||||||
|
* under the min init state, and triggers the switch to nominal
|
||||||
|
* once reached the max allowed count.
|
||||||
|
*
|
||||||
|
* This means at most 128 deallocations stay on the slow path.
|
||||||
|
*
|
||||||
|
* Also see comments in free_default().
|
||||||
|
*/
|
||||||
|
uint8_t *nfetched = tsd_min_init_state_nfetchedp_get(tsd);
|
||||||
|
assert(*nfetched >= 1);
|
||||||
|
(*nfetched)++;
|
||||||
|
if (!minimal || *nfetched == TSD_MIN_INIT_STATE_MAX_FETCHED) {
|
||||||
/* Switch to fully initialized. */
|
/* Switch to fully initialized. */
|
||||||
tsd_state_set(tsd, tsd_state_nominal);
|
tsd_state_set(tsd, tsd_state_nominal);
|
||||||
assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
|
assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
|
||||||
|
@ -136,6 +136,61 @@ TEST_BEGIN(test_tsd_reincarnation) {
|
|||||||
}
|
}
|
||||||
TEST_END
|
TEST_END
|
||||||
|
|
||||||
|
static void *
|
||||||
|
thd_start_dalloc_only(void *arg) {
|
||||||
|
void **ptrs = (void **)arg;
|
||||||
|
|
||||||
|
tsd_t *tsd = tsd_fetch_min();
|
||||||
|
if (tsd_state_get(tsd) != tsd_state_minimal_initialized) {
|
||||||
|
/* Allocation happened implicitly. */
|
||||||
|
expect_u_eq(tsd_state_get(tsd), tsd_state_nominal,
|
||||||
|
"TSD state should be nominal");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *ptr;
|
||||||
|
for (size_t i = 0; (ptr = ptrs[i]) != NULL; i++) {
|
||||||
|
/* Offset by 1 because of the manual tsd_fetch_min above. */
|
||||||
|
if (i + 1 < TSD_MIN_INIT_STATE_MAX_FETCHED) {
|
||||||
|
expect_u_eq(tsd_state_get(tsd),
|
||||||
|
tsd_state_minimal_initialized,
|
||||||
|
"TSD should be minimal initialized");
|
||||||
|
} else {
|
||||||
|
/* State may be nominal or nominal_slow. */
|
||||||
|
expect_true(tsd_nominal(tsd), "TSD should be nominal");
|
||||||
|
}
|
||||||
|
free(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_sub_thread_n_dalloc(size_t nptrs) {
|
||||||
|
void **ptrs = (void **)malloc(sizeof(void *) * (nptrs + 1));
|
||||||
|
for (size_t i = 0; i < nptrs; i++) {
|
||||||
|
ptrs[i] = malloc(8);
|
||||||
|
}
|
||||||
|
ptrs[nptrs] = NULL;
|
||||||
|
|
||||||
|
thd_t thd;
|
||||||
|
thd_create(&thd, thd_start_dalloc_only, (void *)ptrs);
|
||||||
|
thd_join(thd, NULL);
|
||||||
|
free(ptrs);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_BEGIN(test_tsd_sub_thread_dalloc_only) {
|
||||||
|
test_sub_thread_n_dalloc(1);
|
||||||
|
test_sub_thread_n_dalloc(16);
|
||||||
|
test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 2);
|
||||||
|
test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 1);
|
||||||
|
test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED);
|
||||||
|
test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 1);
|
||||||
|
test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 2);
|
||||||
|
test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED * 2);
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
atomic_u32_t phase;
|
atomic_u32_t phase;
|
||||||
atomic_b_t error;
|
atomic_b_t error;
|
||||||
@ -269,6 +324,7 @@ main(void) {
|
|||||||
return test_no_reentrancy(
|
return test_no_reentrancy(
|
||||||
test_tsd_main_thread,
|
test_tsd_main_thread,
|
||||||
test_tsd_sub_thread,
|
test_tsd_sub_thread,
|
||||||
|
test_tsd_sub_thread_dalloc_only,
|
||||||
test_tsd_reincarnation,
|
test_tsd_reincarnation,
|
||||||
test_tsd_global_slow);
|
test_tsd_global_slow);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user