497 lines
16 KiB
C
497 lines
16 KiB
C
#ifndef JEMALLOC_INTERNAL_TSD_H
|
|
#define JEMALLOC_INTERNAL_TSD_H
|
|
|
|
#include "jemalloc/internal/arena_types.h"
|
|
#include "jemalloc/internal/assert.h"
|
|
#include "jemalloc/internal/bin_types.h"
|
|
#include "jemalloc/internal/jemalloc_internal_externs.h"
|
|
#include "jemalloc/internal/prof_types.h"
|
|
#include "jemalloc/internal/ql.h"
|
|
#include "jemalloc/internal/rtree_tsd.h"
|
|
#include "jemalloc/internal/tcache_types.h"
|
|
#include "jemalloc/internal/tcache_structs.h"
|
|
#include "jemalloc/internal/util.h"
|
|
#include "jemalloc/internal/witness.h"
|
|
|
|
/*
|
|
* Thread-Specific-Data layout
|
|
* --- data accessed on tcache fast path: state, rtree_ctx, stats ---
|
|
* s: state
|
|
* m: thread_allocated
|
|
* k: thread_allocated_next_event_fast
|
|
* f: thread_deallocated
|
|
* h: thread_deallocated_next_event_fast
|
|
* c: rtree_ctx (rtree cache accessed on deallocation)
|
|
* t: tcache
|
|
* --- data not accessed on tcache fast path: arena-related fields ---
|
|
* e: tcache_enabled
|
|
* d: arenas_tdata_bypass
|
|
* r: reentrancy_level
|
|
* n: narenas_tdata
|
|
* l: thread_allocated_last_event
|
|
* j: thread_allocated_next_event
|
|
* q: thread_deallocated_last_event
|
|
* u: thread_deallocated_next_event
|
|
* g: tcache_gc_event_wait
|
|
* y: tcache_gc_dalloc_event_wait
|
|
* w: prof_sample_event_wait (config_prof)
|
|
* x: prof_sample_last_event (config_prof)
|
|
* z: stats_interval_event_wait
|
|
* e: stats_interval_last_event
|
|
* p: prof_tdata (config_prof)
|
|
* v: prng_state
|
|
* i: iarena
|
|
* a: arena
|
|
* o: arenas_tdata
|
|
* b: binshards
|
|
* Loading TSD data is on the critical path of basically all malloc operations.
|
|
* In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
|
|
* Use a compact layout to reduce cache footprint.
|
|
* +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
|
|
* |---------------------------- 1st cacheline ----------------------------|
|
|
* | sedrnnnn mmmmmmmm kkkkkkkk ffffffff hhhhhhhh [c * 24 ........ ........]|
|
|
* |---------------------------- 2nd cacheline ----------------------------|
|
|
* | [c * 64 ........ ........ ........ ........ ........ ........ ........]|
|
|
* |---------------------------- 3nd cacheline ----------------------------|
|
|
* | [c * 40 ........ ........ ........ .......] llllllll jjjjjjjj qqqqqqqq |
|
|
* +---------------------------- 4th cacheline ----------------------------+
|
|
* | uuuuuuuu gggggggg yyyyyyyy wwwwwwww xxxxxxxx zzzzzzzz eeeeeeee pppppppp |
|
|
* +---------------------------- 5th and after ----------------------------+
|
|
* | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b * 40; then embedded tcache ..... |
|
|
* +-------------------------------------------------------------------------+
|
|
* Note: the entire tcache is embedded into TSD and spans multiple cachelines.
|
|
*
|
|
* The elements after rtree_ctx and before tcache aren't really needed on tcache
|
|
* fast path. However we have a number of unused tcache bins and witnesses
|
|
* (never touched unless config_debug) at the end of tcache, so we place them
|
|
* there to avoid breaking the cachelines and possibly paging in an extra page.
|
|
*/
|
|
#ifdef JEMALLOC_JET
|
|
typedef void (*test_callback_t)(int *);
|
|
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
|
|
# define MALLOC_TEST_TSD \
|
|
O(test_data, int, int) \
|
|
O(test_callback, test_callback_t, int)
|
|
# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
|
|
#else
|
|
# define MALLOC_TEST_TSD
|
|
# define MALLOC_TEST_TSD_INITIALIZER
|
|
#endif
|
|
|
|
/* O(name, type, nullable type) */
|
|
#define MALLOC_TSD \
|
|
O(tcache_enabled, bool, bool) \
|
|
O(arenas_tdata_bypass, bool, bool) \
|
|
O(reentrancy_level, int8_t, int8_t) \
|
|
O(narenas_tdata, uint32_t, uint32_t) \
|
|
O(thread_allocated, uint64_t, uint64_t) \
|
|
O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
|
|
O(thread_deallocated, uint64_t, uint64_t) \
|
|
O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
|
|
O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
|
|
O(thread_allocated_last_event, uint64_t, uint64_t) \
|
|
O(thread_allocated_next_event, uint64_t, uint64_t) \
|
|
O(thread_deallocated_last_event, uint64_t, uint64_t) \
|
|
O(thread_deallocated_next_event, uint64_t, uint64_t) \
|
|
O(tcache_gc_event_wait, uint64_t, uint64_t) \
|
|
O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \
|
|
O(prof_sample_event_wait, uint64_t, uint64_t) \
|
|
O(prof_sample_last_event, uint64_t, uint64_t) \
|
|
O(stats_interval_event_wait, uint64_t, uint64_t) \
|
|
O(stats_interval_last_event, uint64_t, uint64_t) \
|
|
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
|
|
O(prng_state, uint64_t, uint64_t) \
|
|
O(iarena, arena_t *, arena_t *) \
|
|
O(arena, arena_t *, arena_t *) \
|
|
O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
|
|
O(binshards, tsd_binshards_t, tsd_binshards_t)\
|
|
O(tcache, tcache_t, tcache_t) \
|
|
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
|
|
MALLOC_TEST_TSD
|
|
|
|
/*
|
|
* TE_MIN_START_WAIT should not exceed the minimal allocation usize.
|
|
*/
|
|
#define TE_MIN_START_WAIT ((uint64_t)1U)
|
|
#define TE_MAX_START_WAIT UINT64_MAX
|
|
|
|
#define TSD_INITIALIZER { \
|
|
/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
|
|
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
|
|
/* arenas_tdata_bypass */ false, \
|
|
/* reentrancy_level */ 0, \
|
|
/* narenas_tdata */ 0, \
|
|
/* thread_allocated */ 0, \
|
|
/* thread_allocated_next_event_fast */ 0, \
|
|
/* thread_deallocated */ 0, \
|
|
/* thread_deallocated_next_event_fast */ 0, \
|
|
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
|
|
/* thread_allocated_last_event */ 0, \
|
|
/* thread_allocated_next_event */ TE_MIN_START_WAIT, \
|
|
/* thread_deallocated_last_event */ 0, \
|
|
/* thread_deallocated_next_event */ TE_MIN_START_WAIT, \
|
|
/* tcache_gc_event_wait */ TE_MIN_START_WAIT, \
|
|
/* tcache_gc_dalloc_event_wait */ TE_MIN_START_WAIT, \
|
|
/* prof_sample_event_wait */ TE_MIN_START_WAIT, \
|
|
/* prof_sample_last_event */ 0, \
|
|
/* stats_interval_event_wait */ TE_MIN_START_WAIT, \
|
|
/* stats_interval_last_event */ 0, \
|
|
/* prof_tdata */ NULL, \
|
|
/* prng_state */ 0, \
|
|
/* iarena */ NULL, \
|
|
/* arena */ NULL, \
|
|
/* arenas_tdata */ NULL, \
|
|
/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
|
|
/* tcache */ TCACHE_ZERO_INITIALIZER, \
|
|
/* witness */ WITNESS_TSD_INITIALIZER \
|
|
/* test data */ MALLOC_TEST_TSD_INITIALIZER \
|
|
}
|
|
|
|
void *malloc_tsd_malloc(size_t size);
|
|
void malloc_tsd_dalloc(void *wrapper);
|
|
void malloc_tsd_cleanup_register(bool (*f)(void));
|
|
tsd_t *malloc_tsd_boot0(void);
|
|
void malloc_tsd_boot1(void);
|
|
void tsd_cleanup(void *arg);
|
|
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
|
|
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
|
|
void tsd_slow_update(tsd_t *tsd);
|
|
void tsd_prefork(tsd_t *tsd);
|
|
void tsd_postfork_parent(tsd_t *tsd);
|
|
void tsd_postfork_child(tsd_t *tsd);
|
|
|
|
/*
|
|
* Call ..._inc when your module wants to take all threads down the slow paths,
|
|
* and ..._dec when it no longer needs to.
|
|
*/
|
|
void tsd_global_slow_inc(tsdn_t *tsdn);
|
|
void tsd_global_slow_dec(tsdn_t *tsdn);
|
|
bool tsd_global_slow();
|
|
|
|
enum {
|
|
/* Common case --> jnz. */
|
|
tsd_state_nominal = 0,
|
|
/* Initialized but on slow path. */
|
|
tsd_state_nominal_slow = 1,
|
|
/*
|
|
* Some thread has changed global state in such a way that all nominal
|
|
* threads need to recompute their fast / slow status the next time they
|
|
* get a chance.
|
|
*
|
|
* Any thread can change another thread's status *to* recompute, but
|
|
* threads are the only ones who can change their status *from*
|
|
* recompute.
|
|
*/
|
|
tsd_state_nominal_recompute = 2,
|
|
/*
|
|
* The above nominal states should be lower values. We use
|
|
* tsd_nominal_max to separate nominal states from threads in the
|
|
* process of being born / dying.
|
|
*/
|
|
tsd_state_nominal_max = 2,
|
|
|
|
/*
|
|
* A thread might free() during its death as its only allocator action;
|
|
* in such scenarios, we need tsd, but set up in such a way that no
|
|
* cleanup is necessary.
|
|
*/
|
|
tsd_state_minimal_initialized = 3,
|
|
/* States during which we know we're in thread death. */
|
|
tsd_state_purgatory = 4,
|
|
tsd_state_reincarnated = 5,
|
|
/*
|
|
* What it says on the tin; tsd that hasn't been initialized. Note
|
|
* that even when the tsd struct lives in TLS, when need to keep track
|
|
* of stuff like whether or not our pthread destructors have been
|
|
* scheduled, so this really truly is different than the nominal state.
|
|
*/
|
|
tsd_state_uninitialized = 6
|
|
};
|
|
|
|
/*
|
|
* Some TSD accesses can only be done in a nominal state. To enforce this, we
|
|
* wrap TSD member access in a function that asserts on TSD state, and mangle
|
|
* field names to prevent touching them accidentally.
|
|
*/
|
|
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
|
|
|
|
#ifdef JEMALLOC_U8_ATOMICS
|
|
# define tsd_state_t atomic_u8_t
|
|
# define tsd_atomic_load atomic_load_u8
|
|
# define tsd_atomic_store atomic_store_u8
|
|
# define tsd_atomic_exchange atomic_exchange_u8
|
|
#else
|
|
# define tsd_state_t atomic_u32_t
|
|
# define tsd_atomic_load atomic_load_u32
|
|
# define tsd_atomic_store atomic_store_u32
|
|
# define tsd_atomic_exchange atomic_exchange_u32
|
|
#endif
|
|
|
|
/* The actual tsd. */
|
|
struct tsd_s {
|
|
/*
|
|
* The contents should be treated as totally opaque outside the tsd
|
|
* module. Access any thread-local state through the getters and
|
|
* setters below.
|
|
*/
|
|
|
|
/*
|
|
* We manually limit the state to just a single byte. Unless the 8-bit
|
|
* atomics are unavailable (which is rare).
|
|
*/
|
|
tsd_state_t state;
|
|
#define O(n, t, nt) \
|
|
t TSD_MANGLE(n);
|
|
MALLOC_TSD
|
|
#undef O
|
|
};
|
|
|
|
JEMALLOC_ALWAYS_INLINE uint8_t
|
|
tsd_state_get(tsd_t *tsd) {
|
|
/*
|
|
* This should be atomic. Unfortunately, compilers right now can't tell
|
|
* that this can be done as a memory comparison, and forces a load into
|
|
* a register that hurts fast-path performance.
|
|
*/
|
|
/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
|
|
return *(uint8_t *)&tsd->state;
|
|
}
|
|
|
|
/*
|
|
* Wrapper around tsd_t that makes it possible to avoid implicit conversion
|
|
* between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
|
|
* explicitly converted to tsd_t, which is non-nullable.
|
|
*/
|
|
struct tsdn_s {
|
|
tsd_t tsd;
|
|
};
|
|
#define TSDN_NULL ((tsdn_t *)0)
|
|
JEMALLOC_ALWAYS_INLINE tsdn_t *
|
|
tsd_tsdn(tsd_t *tsd) {
|
|
return (tsdn_t *)tsd;
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE bool
|
|
tsdn_null(const tsdn_t *tsdn) {
|
|
return tsdn == NULL;
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE tsd_t *
|
|
tsdn_tsd(tsdn_t *tsdn) {
|
|
assert(!tsdn_null(tsdn));
|
|
|
|
return &tsdn->tsd;
|
|
}
|
|
|
|
/*
|
|
* We put the platform-specific data declarations and inlines into their own
|
|
* header files to avoid cluttering this file. They define tsd_boot0,
|
|
* tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set.
|
|
*/
|
|
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
|
|
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
|
|
#elif (defined(JEMALLOC_TLS))
|
|
#include "jemalloc/internal/tsd_tls.h"
|
|
#elif (defined(_WIN32))
|
|
#include "jemalloc/internal/tsd_win.h"
|
|
#else
|
|
#include "jemalloc/internal/tsd_generic.h"
|
|
#endif
|
|
|
|
/*
|
|
* tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
|
|
* foo. This omits some safety checks, and so can be used during tsd
|
|
* initialization and cleanup.
|
|
*/
|
|
#define O(n, t, nt) \
|
|
JEMALLOC_ALWAYS_INLINE t * \
|
|
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
|
|
return &tsd->TSD_MANGLE(n); \
|
|
}
|
|
MALLOC_TSD
|
|
#undef O
|
|
|
|
/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
|
|
#define O(n, t, nt) \
|
|
JEMALLOC_ALWAYS_INLINE t * \
|
|
tsd_##n##p_get(tsd_t *tsd) { \
|
|
/* \
|
|
* Because the state might change asynchronously if it's \
|
|
* nominal, we need to make sure that we only read it once. \
|
|
*/ \
|
|
uint8_t state = tsd_state_get(tsd); \
|
|
assert(state == tsd_state_nominal || \
|
|
state == tsd_state_nominal_slow || \
|
|
state == tsd_state_nominal_recompute || \
|
|
state == tsd_state_reincarnated || \
|
|
state == tsd_state_minimal_initialized); \
|
|
return tsd_##n##p_get_unsafe(tsd); \
|
|
}
|
|
MALLOC_TSD
|
|
#undef O
|
|
|
|
/*
|
|
* tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
|
|
* isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
|
|
*/
|
|
#define O(n, t, nt) \
|
|
JEMALLOC_ALWAYS_INLINE nt * \
|
|
tsdn_##n##p_get(tsdn_t *tsdn) { \
|
|
if (tsdn_null(tsdn)) { \
|
|
return NULL; \
|
|
} \
|
|
tsd_t *tsd = tsdn_tsd(tsdn); \
|
|
return (nt *)tsd_##n##p_get(tsd); \
|
|
}
|
|
MALLOC_TSD
|
|
#undef O
|
|
|
|
/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
|
|
#define O(n, t, nt) \
|
|
JEMALLOC_ALWAYS_INLINE t \
|
|
tsd_##n##_get(tsd_t *tsd) { \
|
|
return *tsd_##n##p_get(tsd); \
|
|
}
|
|
MALLOC_TSD
|
|
#undef O
|
|
|
|
/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
|
|
#define O(n, t, nt) \
|
|
JEMALLOC_ALWAYS_INLINE void \
|
|
tsd_##n##_set(tsd_t *tsd, t val) { \
|
|
assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
|
|
tsd_state_get(tsd) != tsd_state_minimal_initialized); \
|
|
*tsd_##n##p_get(tsd) = val; \
|
|
}
|
|
MALLOC_TSD
|
|
#undef O
|
|
|
|
JEMALLOC_ALWAYS_INLINE void
|
|
tsd_assert_fast(tsd_t *tsd) {
|
|
/*
|
|
* Note that our fastness assertion does *not* include global slowness
|
|
* counters; it's not in general possible to ensure that they won't
|
|
* change asynchronously from underneath us.
|
|
*/
|
|
assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
|
|
tsd_reentrancy_level_get(tsd) == 0);
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE bool
|
|
tsd_fast(tsd_t *tsd) {
|
|
bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
|
|
if (fast) {
|
|
tsd_assert_fast(tsd);
|
|
}
|
|
|
|
return fast;
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE tsd_t *
|
|
tsd_fetch_impl(bool init, bool minimal) {
|
|
tsd_t *tsd = tsd_get(init);
|
|
|
|
if (!init && tsd_get_allocates() && tsd == NULL) {
|
|
return NULL;
|
|
}
|
|
assert(tsd != NULL);
|
|
|
|
if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
|
|
return tsd_fetch_slow(tsd, minimal);
|
|
}
|
|
assert(tsd_fast(tsd));
|
|
tsd_assert_fast(tsd);
|
|
|
|
return tsd;
|
|
}
|
|
|
|
/* Get a minimal TSD that requires no cleanup. See comments in free(). */
|
|
JEMALLOC_ALWAYS_INLINE tsd_t *
|
|
tsd_fetch_min(void) {
|
|
return tsd_fetch_impl(true, true);
|
|
}
|
|
|
|
/* For internal background threads use only. */
|
|
JEMALLOC_ALWAYS_INLINE tsd_t *
|
|
tsd_internal_fetch(void) {
|
|
tsd_t *tsd = tsd_fetch_min();
|
|
/* Use reincarnated state to prevent full initialization. */
|
|
tsd_state_set(tsd, tsd_state_reincarnated);
|
|
|
|
return tsd;
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE tsd_t *
|
|
tsd_fetch(void) {
|
|
return tsd_fetch_impl(true, false);
|
|
}
|
|
|
|
static inline bool
|
|
tsd_nominal(tsd_t *tsd) {
|
|
bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
|
|
assert(nominal || tsd_reentrancy_level_get(tsd) > 0);
|
|
|
|
return nominal;
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE tsdn_t *
|
|
tsdn_fetch(void) {
|
|
if (!tsd_booted_get()) {
|
|
return NULL;
|
|
}
|
|
|
|
return tsd_tsdn(tsd_fetch_impl(false, false));
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
|
|
tsd_rtree_ctx(tsd_t *tsd) {
|
|
return tsd_rtree_ctxp_get(tsd);
|
|
}
|
|
|
|
JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
|
|
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
|
|
/*
|
|
* If tsd cannot be accessed, initialize the fallback rtree_ctx and
|
|
* return a pointer to it.
|
|
*/
|
|
if (unlikely(tsdn_null(tsdn))) {
|
|
rtree_ctx_data_init(fallback);
|
|
return fallback;
|
|
}
|
|
return tsd_rtree_ctx(tsdn_tsd(tsdn));
|
|
}
|
|
|
|
static inline bool
|
|
tsd_state_nocleanup(tsd_t *tsd) {
|
|
return tsd_state_get(tsd) == tsd_state_reincarnated ||
|
|
tsd_state_get(tsd) == tsd_state_minimal_initialized;
|
|
}
|
|
|
|
/*
|
|
* These "raw" tsd reentrancy functions don't have any debug checking to make
|
|
* sure that we're not touching arena 0. Better is to call pre_reentrancy and
|
|
* post_reentrancy if this is possible.
|
|
*/
|
|
static inline void
|
|
tsd_pre_reentrancy_raw(tsd_t *tsd) {
|
|
bool fast = tsd_fast(tsd);
|
|
assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
|
|
++*tsd_reentrancy_levelp_get(tsd);
|
|
if (fast) {
|
|
/* Prepare slow path for reentrancy. */
|
|
tsd_slow_update(tsd);
|
|
assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
tsd_post_reentrancy_raw(tsd_t *tsd) {
|
|
int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
|
|
assert(*reentrancy_level > 0);
|
|
if (--*reentrancy_level == 0) {
|
|
tsd_slow_update(tsd);
|
|
}
|
|
}
|
|
|
|
#endif /* JEMALLOC_INTERNAL_TSD_H */
|