TSD: Put all fast-path data together.
This commit is contained in:
parent
3589571bfd
commit
58a00df238
@ -15,57 +15,30 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Thread-Specific-Data layout
|
* Thread-Specific-Data layout
|
||||||
* --- data accessed on tcache fast path: state, rtree_ctx, stats ---
|
|
||||||
* s: state
|
|
||||||
* m: thread_allocated
|
|
||||||
* k: thread_allocated_next_event_fast
|
|
||||||
* f: thread_deallocated
|
|
||||||
* h: thread_deallocated_next_event_fast
|
|
||||||
* c: rtree_ctx (rtree cache accessed on deallocation)
|
|
||||||
* t: tcache
|
|
||||||
* --- data not accessed on tcache fast path: arena-related fields ---
|
|
||||||
* e: tcache_enabled
|
|
||||||
* d: arenas_tdata_bypass
|
|
||||||
* r: reentrancy_level
|
|
||||||
* n: narenas_tdata
|
|
||||||
* l: thread_allocated_last_event
|
|
||||||
* j: thread_allocated_next_event
|
|
||||||
* q: thread_deallocated_last_event
|
|
||||||
* u: thread_deallocated_next_event
|
|
||||||
* g: tcache_gc_event_wait
|
|
||||||
* y: tcache_gc_dalloc_event_wait
|
|
||||||
* w: prof_sample_event_wait (config_prof)
|
|
||||||
* x: prof_sample_last_event (config_prof)
|
|
||||||
* z: stats_interval_event_wait
|
|
||||||
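* e: stats_interval_last_event (the 8-byte `eeeeeeee` run in the 4th cacheline; the lone `e` in the 1st cacheline is tcache_enabled)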
|
|
||||||
* p: prof_tdata (config_prof)
|
|
||||||
* v: prng_state
|
|
||||||
* i: iarena
|
|
||||||
* a: arena
|
|
||||||
* o: arenas_tdata
|
|
||||||
* b: binshards
|
|
||||||
* Loading TSD data is on the critical path of basically all malloc operations.
|
|
||||||
* In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
|
|
||||||
* Use a compact layout to reduce cache footprint.
|
|
||||||
* +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
|
|
||||||
* |---------------------------- 1st cacheline ----------------------------|
|
|
||||||
* | sedrnnnn mmmmmmmm kkkkkkkk ffffffff hhhhhhhh [c * 24 ........ ........]|
|
|
||||||
* |---------------------------- 2nd cacheline ----------------------------|
|
|
||||||
* | [c * 64 ........ ........ ........ ........ ........ ........ ........]|
|
|
||||||
* |---------------------------- 3rd cacheline ----------------------------|
|
|
||||||
* | [c * 40 ........ ........ ........ .......] llllllll jjjjjjjj qqqqqqqq |
|
|
||||||
* +---------------------------- 4th cacheline ----------------------------+
|
|
||||||
* | uuuuuuuu gggggggg yyyyyyyy wwwwwwww xxxxxxxx zzzzzzzz eeeeeeee pppppppp |
|
|
||||||
* +---------------------------- 5th and after ----------------------------+
|
|
||||||
* | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b * 40; then embedded tcache ..... |
|
|
||||||
* +-------------------------------------------------------------------------+
|
|
||||||
* Note: the entire tcache is embedded into TSD and spans multiple cachelines.
|
|
||||||
*
|
*
|
||||||
* The elements after rtree_ctx and before tcache aren't really needed on tcache
|
* At least some thread-local data gets touched on the fast-path of almost all
|
||||||
* fast path. However we have a number of unused tcache bins and witnesses
|
* malloc operations. But much of it is only necessary down slow-paths, or
|
||||||
* (never touched unless config_debug) at the end of tcache, so we place them
|
* testing. We want to colocate the fast-path data so that it can live on the
|
||||||
* there to avoid breaking the cachelines and possibly paging in an extra page.
|
* same cacheline if possible. So we define three tiers of hotness:
|
||||||
|
* TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
|
||||||
|
* TSD_DATA_SLOW: Touched down slow paths. "Slow" here is sort of general;
|
||||||
|
* there are "semi-slow" paths like "not a sized deallocation, but can still
|
||||||
|
* live in the tcache". We'll want to keep these closer to the fast-path
|
||||||
|
* data.
|
||||||
|
* TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
|
||||||
|
*
|
||||||
|
* An additional concern is that the larger tcache bins won't be used (we have a
|
||||||
|
* bin per size class, but by default only cache relatively small objects). So
|
||||||
|
* the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the
|
||||||
|
* TSD_DATA_SLOWER tier.
|
||||||
|
*
|
||||||
|
* As a result of all this, we put the slow data first, then the fast data, then
|
||||||
|
* the slower data, while keeping the tcache as the last element of the fast
|
||||||
|
* data (so that the fast -> slower transition happens midway through the
|
||||||
|
* tcache). While we don't yet play alignment tricks to guarantee it, this
|
||||||
|
* increases our odds of getting some cache/page locality on fast paths.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef JEMALLOC_JET
|
#ifdef JEMALLOC_JET
|
||||||
typedef void (*test_callback_t)(int *);
|
typedef void (*test_callback_t)(int *);
|
||||||
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
|
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
|
||||||
@ -79,16 +52,11 @@ typedef void (*test_callback_t)(int *);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* O(name, type, nullable type) */
|
/* O(name, type, nullable type) */
|
||||||
#define MALLOC_TSD \
|
#define TSD_DATA_SLOW \
|
||||||
O(tcache_enabled, bool, bool) \
|
O(tcache_enabled, bool, bool) \
|
||||||
O(arenas_tdata_bypass, bool, bool) \
|
O(arenas_tdata_bypass, bool, bool) \
|
||||||
O(reentrancy_level, int8_t, int8_t) \
|
O(reentrancy_level, int8_t, int8_t) \
|
||||||
O(narenas_tdata, uint32_t, uint32_t) \
|
O(narenas_tdata, uint32_t, uint32_t) \
|
||||||
O(thread_allocated, uint64_t, uint64_t) \
|
|
||||||
O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
|
|
||||||
O(thread_deallocated, uint64_t, uint64_t) \
|
|
||||||
O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
|
|
||||||
O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
|
|
||||||
O(thread_allocated_last_event, uint64_t, uint64_t) \
|
O(thread_allocated_last_event, uint64_t, uint64_t) \
|
||||||
O(thread_allocated_next_event, uint64_t, uint64_t) \
|
O(thread_allocated_next_event, uint64_t, uint64_t) \
|
||||||
O(thread_deallocated_last_event, uint64_t, uint64_t) \
|
O(thread_deallocated_last_event, uint64_t, uint64_t) \
|
||||||
@ -104,28 +72,13 @@ typedef void (*test_callback_t)(int *);
|
|||||||
O(iarena, arena_t *, arena_t *) \
|
O(iarena, arena_t *, arena_t *) \
|
||||||
O(arena, arena_t *, arena_t *) \
|
O(arena, arena_t *, arena_t *) \
|
||||||
O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
|
O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
|
||||||
O(binshards, tsd_binshards_t, tsd_binshards_t)\
|
O(binshards, tsd_binshards_t, tsd_binshards_t)
|
||||||
O(tcache, tcache_t, tcache_t) \
|
|
||||||
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
|
|
||||||
MALLOC_TEST_TSD
|
|
||||||
|
|
||||||
/*
|
#define TSD_DATA_SLOW_INITIALIZER \
|
||||||
* TE_MIN_START_WAIT should not exceed the minimal allocation usize.
|
|
||||||
*/
|
|
||||||
#define TE_MIN_START_WAIT ((uint64_t)1U)
|
|
||||||
#define TE_MAX_START_WAIT UINT64_MAX
|
|
||||||
|
|
||||||
#define TSD_INITIALIZER { \
|
|
||||||
/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
|
|
||||||
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
|
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
|
||||||
/* arenas_tdata_bypass */ false, \
|
/* arenas_tdata_bypass */ false, \
|
||||||
/* reentrancy_level */ 0, \
|
/* reentrancy_level */ 0, \
|
||||||
/* narenas_tdata */ 0, \
|
/* narenas_tdata */ 0, \
|
||||||
/* thread_allocated */ 0, \
|
|
||||||
/* thread_allocated_next_event_fast */ 0, \
|
|
||||||
/* thread_deallocated */ 0, \
|
|
||||||
/* thread_deallocated_next_event_fast */ 0, \
|
|
||||||
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
|
|
||||||
/* thread_allocated_last_event */ 0, \
|
/* thread_allocated_last_event */ 0, \
|
||||||
/* thread_allocated_next_event */ TE_MIN_START_WAIT, \
|
/* thread_allocated_next_event */ TE_MIN_START_WAIT, \
|
||||||
/* thread_deallocated_last_event */ 0, \
|
/* thread_deallocated_last_event */ 0, \
|
||||||
@ -141,10 +94,46 @@ typedef void (*test_callback_t)(int *);
|
|||||||
/* iarena */ NULL, \
|
/* iarena */ NULL, \
|
||||||
/* arena */ NULL, \
|
/* arena */ NULL, \
|
||||||
/* arenas_tdata */ NULL, \
|
/* arenas_tdata */ NULL, \
|
||||||
/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
|
/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER,
|
||||||
/* tcache */ TCACHE_ZERO_INITIALIZER, \
|
|
||||||
|
/* O(name, type, nullable type) */
|
||||||
|
#define TSD_DATA_FAST \
|
||||||
|
O(thread_allocated, uint64_t, uint64_t) \
|
||||||
|
O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
|
||||||
|
O(thread_deallocated, uint64_t, uint64_t) \
|
||||||
|
O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
|
||||||
|
O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
|
||||||
|
O(tcache, tcache_t, tcache_t)
|
||||||
|
|
||||||
|
#define TSD_DATA_FAST_INITIALIZER \
|
||||||
|
/* thread_allocated */ 0, \
|
||||||
|
/* thread_allocated_next_event_fast */ 0, \
|
||||||
|
/* thread_deallocated */ 0, \
|
||||||
|
/* thread_deallocated_next_event_fast */ 0, \
|
||||||
|
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
|
||||||
|
/* tcache */ TCACHE_ZERO_INITIALIZER,
|
||||||
|
|
||||||
|
/* O(name, type, nullable type) */
|
||||||
|
#define TSD_DATA_SLOWER \
|
||||||
|
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
|
||||||
|
MALLOC_TEST_TSD
|
||||||
|
|
||||||
|
#define TSD_DATA_SLOWER_INITIALIZER \
|
||||||
/* witness */ WITNESS_TSD_INITIALIZER \
|
/* witness */ WITNESS_TSD_INITIALIZER \
|
||||||
/* test data */ MALLOC_TEST_TSD_INITIALIZER \
|
/* test data */ MALLOC_TEST_TSD_INITIALIZER
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* TE_MIN_START_WAIT should not exceed the minimal allocation usize.
|
||||||
|
*/
|
||||||
|
#define TE_MIN_START_WAIT ((uint64_t)1U)
|
||||||
|
#define TE_MAX_START_WAIT UINT64_MAX
|
||||||
|
|
||||||
|
#define TSD_INITIALIZER { \
|
||||||
|
TSD_DATA_SLOW_INITIALIZER \
|
||||||
|
/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
|
||||||
|
TSD_DATA_FAST_INITIALIZER \
|
||||||
|
TSD_DATA_SLOWER_INITIALIZER \
|
||||||
}
|
}
|
||||||
|
|
||||||
void *malloc_tsd_malloc(size_t size);
|
void *malloc_tsd_malloc(size_t size);
|
||||||
@ -235,14 +224,17 @@ struct tsd_s {
|
|||||||
* setters below.
|
* setters below.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#define O(n, t, nt) \
|
||||||
|
t TSD_MANGLE(n);
|
||||||
|
|
||||||
|
TSD_DATA_SLOW
|
||||||
/*
|
/*
|
||||||
* We manually limit the state to just a single byte. Unless the 8-bit
|
* We manually limit the state to just a single byte. Unless the 8-bit
|
||||||
* atomics are unavailable (which is rare).
|
* atomics are unavailable (which is rare).
|
||||||
*/
|
*/
|
||||||
tsd_state_t state;
|
tsd_state_t state;
|
||||||
#define O(n, t, nt) \
|
TSD_DATA_FAST
|
||||||
t TSD_MANGLE(n);
|
TSD_DATA_SLOWER
|
||||||
MALLOC_TSD
|
|
||||||
#undef O
|
#undef O
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -308,7 +300,9 @@ JEMALLOC_ALWAYS_INLINE t * \
|
|||||||
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
|
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
|
||||||
return &tsd->TSD_MANGLE(n); \
|
return &tsd->TSD_MANGLE(n); \
|
||||||
}
|
}
|
||||||
MALLOC_TSD
|
TSD_DATA_SLOW
|
||||||
|
TSD_DATA_FAST
|
||||||
|
TSD_DATA_SLOWER
|
||||||
#undef O
|
#undef O
|
||||||
|
|
||||||
/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
|
/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
|
||||||
@ -327,7 +321,9 @@ tsd_##n##p_get(tsd_t *tsd) { \
|
|||||||
state == tsd_state_minimal_initialized); \
|
state == tsd_state_minimal_initialized); \
|
||||||
return tsd_##n##p_get_unsafe(tsd); \
|
return tsd_##n##p_get_unsafe(tsd); \
|
||||||
}
|
}
|
||||||
MALLOC_TSD
|
TSD_DATA_SLOW
|
||||||
|
TSD_DATA_FAST
|
||||||
|
TSD_DATA_SLOWER
|
||||||
#undef O
|
#undef O
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -343,7 +339,9 @@ tsdn_##n##p_get(tsdn_t *tsdn) { \
|
|||||||
tsd_t *tsd = tsdn_tsd(tsdn); \
|
tsd_t *tsd = tsdn_tsd(tsdn); \
|
||||||
return (nt *)tsd_##n##p_get(tsd); \
|
return (nt *)tsd_##n##p_get(tsd); \
|
||||||
}
|
}
|
||||||
MALLOC_TSD
|
TSD_DATA_SLOW
|
||||||
|
TSD_DATA_FAST
|
||||||
|
TSD_DATA_SLOWER
|
||||||
#undef O
|
#undef O
|
||||||
|
|
||||||
/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
|
/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
|
||||||
@ -352,7 +350,9 @@ JEMALLOC_ALWAYS_INLINE t \
|
|||||||
tsd_##n##_get(tsd_t *tsd) { \
|
tsd_##n##_get(tsd_t *tsd) { \
|
||||||
return *tsd_##n##p_get(tsd); \
|
return *tsd_##n##p_get(tsd); \
|
||||||
}
|
}
|
||||||
MALLOC_TSD
|
TSD_DATA_SLOW
|
||||||
|
TSD_DATA_FAST
|
||||||
|
TSD_DATA_SLOWER
|
||||||
#undef O
|
#undef O
|
||||||
|
|
||||||
/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
|
/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
|
||||||
@ -363,7 +363,9 @@ tsd_##n##_set(tsd_t *tsd, t val) { \
|
|||||||
tsd_state_get(tsd) != tsd_state_minimal_initialized); \
|
tsd_state_get(tsd) != tsd_state_minimal_initialized); \
|
||||||
*tsd_##n##p_get(tsd) = val; \
|
*tsd_##n##p_get(tsd) = val; \
|
||||||
}
|
}
|
||||||
MALLOC_TSD
|
TSD_DATA_SLOW
|
||||||
|
TSD_DATA_FAST
|
||||||
|
TSD_DATA_SLOWER
|
||||||
#undef O
|
#undef O
|
||||||
|
|
||||||
JEMALLOC_ALWAYS_INLINE void
|
JEMALLOC_ALWAYS_INLINE void
|
||||||
|
Loading…
Reference in New Issue
Block a user