Build a general purpose thread event handler
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "jemalloc/internal/safety_check.h"
|
||||
#include "jemalloc/internal/sz.h"
|
||||
#include "jemalloc/internal/thread_event.h"
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
prof_gdump_get_unlocked(void) {
|
||||
@@ -79,24 +80,6 @@ prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, nstime_t t) {
|
||||
arena_prof_alloc_time_set(tsdn, ptr, t);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
prof_sample_check(tsd_t *tsd, size_t usize, bool update) {
|
||||
ssize_t check = update ? 0 : usize;
|
||||
|
||||
int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd);
|
||||
if (update) {
|
||||
bytes_until_sample -= usize;
|
||||
if (tsd_nominal(tsd)) {
|
||||
tsd_bytes_until_sample_set(tsd, bytes_until_sample);
|
||||
}
|
||||
}
|
||||
if (likely(bytes_until_sample >= check)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
|
||||
prof_tdata_t **tdata_out) {
|
||||
@@ -105,7 +88,7 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
|
||||
cassert(config_prof);
|
||||
|
||||
/* Fastpath: no need to load tdata */
|
||||
if (likely(prof_sample_check(tsd, usize, update))) {
|
||||
if (likely(prof_sample_event_wait_get(tsd) > 0)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -127,13 +110,40 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this was the first creation of tdata, then
|
||||
* prof_tdata_get() reset bytes_until_sample, so decrement and
|
||||
* check it again
|
||||
*/
|
||||
if (!booted && prof_sample_check(tsd, usize, update)) {
|
||||
return true;
|
||||
if (!booted) {
|
||||
/*
|
||||
* If this was the first creation of tdata, then it means that
|
||||
* the previous thread_event() relied on the wrong prof_sample
|
||||
* wait time, and that it should have relied on the new
|
||||
* prof_sample wait time just set by prof_tdata_get(), so we
|
||||
* now manually check again.
|
||||
*
|
||||
* If the check fails, then even though we relied on the wrong
|
||||
* prof_sample wait time, we're now actually in perfect shape,
|
||||
* in the sense that we can pretend that we have used the right
|
||||
* prof_sample wait time.
|
||||
*
|
||||
* If the check succeeds, then we are now in a tougher
|
||||
* situation, in the sense that we cannot pretend that we have
|
||||
* used the right prof_sample wait time. A straightforward
|
||||
* solution would be to fully roll back thread_event(), set the
|
||||
* right prof_sample wait time, and then redo thread_event().
|
||||
* A simpler way, which is implemented below, is to just set a
|
||||
* new prof_sample wait time that is usize less, and do nothing
|
||||
* else. Strictly speaking, the thread event handler may end
|
||||
* up in a wrong state, since it has still recorded an event
|
||||
* whereas in reality there may be no event. However, the
|
||||
* difference in the wait time offsets the wrongly recorded
|
||||
* event, so that, functionally, the countdown to the next
|
||||
* event will behave exactly as if we have used the right
|
||||
* prof_sample wait time in the first place.
|
||||
*/
|
||||
uint64_t wait = prof_sample_event_wait_get(tsd);
|
||||
assert(wait > 0);
|
||||
if (usize < wait) {
|
||||
thread_prof_sample_event_update(tsd, wait - usize);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Compute new sample threshold. */
|
||||
|
139
include/jemalloc/internal/thread_event.h
Normal file
139
include/jemalloc/internal/thread_event.h
Normal file
@@ -0,0 +1,139 @@
|
||||
#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_H
|
||||
#define JEMALLOC_INTERNAL_THREAD_EVENT_H
|
||||
|
||||
#include "jemalloc/internal/tsd.h"
|
||||
|
||||
/*
|
||||
* Maximum threshold on thread_allocated_next_event_fast, so that there is no
|
||||
* need to check overflow in malloc fast path. (The allocation size in malloc
|
||||
* fast path never exceeds SC_LOOKUP_MAXCLASS.)
|
||||
*/
|
||||
#define THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX \
|
||||
(UINT64_MAX - SC_LOOKUP_MAXCLASS + 1U)
|
||||
|
||||
/*
|
||||
* The max interval helps make sure that malloc stays on the fast path in the
|
||||
* common case, i.e. thread_allocated < thread_allocated_next_event_fast.
|
||||
* When thread_allocated is within an event's distance to
|
||||
* THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX above, thread_allocated_next_event_fast
|
||||
* is wrapped around and we fall back to the medium-fast path. The max interval
|
||||
* makes sure that we're not staying on the fallback case for too long, even if
|
||||
* there's no active event or if all active events have long wait times.
|
||||
*/
|
||||
#define THREAD_EVENT_MAX_INTERVAL ((uint64_t)(4U << 20))
|
||||
|
||||
void thread_event_assert_invariants_debug(tsd_t *tsd);
|
||||
void thread_event_trigger(tsd_t *tsd, bool delay_event);
|
||||
void thread_event_rollback(tsd_t *tsd, size_t diff);
|
||||
void thread_event_update(tsd_t *tsd);
|
||||
/* Takes no arguments; (void) makes this a true prototype so calls are checked. */
void thread_event_boot(void);
|
||||
|
||||
/*
|
||||
* List of all events, in the following format:
|
||||
* E(event, (condition))
|
||||
*/
|
||||
#define ITERATE_OVER_ALL_EVENTS \
|
||||
E(prof_sample, (config_prof && opt_prof))
|
||||
|
||||
#define E(event, condition) \
|
||||
C(event##_event_wait)
|
||||
|
||||
/* List of all thread event counters. */
|
||||
#define ITERATE_OVER_ALL_COUNTERS \
|
||||
C(thread_allocated) \
|
||||
C(thread_allocated_next_event_fast) \
|
||||
C(thread_allocated_last_event) \
|
||||
C(thread_allocated_next_event) \
|
||||
ITERATE_OVER_ALL_EVENTS
|
||||
|
||||
/* Getters directly wrap TSD getters. */
|
||||
#define C(counter) \
|
||||
JEMALLOC_ALWAYS_INLINE uint64_t \
|
||||
counter##_get(tsd_t *tsd) { \
|
||||
return tsd_##counter##_get(tsd); \
|
||||
}
|
||||
|
||||
ITERATE_OVER_ALL_COUNTERS
|
||||
#undef C
|
||||
|
||||
/*
|
||||
* Setters call the TSD pointer getters rather than the TSD setters, so that
|
||||
* the counters can be modified even when TSD state is reincarnated or
|
||||
* minimal_initialized: if an event is triggered in such cases, we will
|
||||
* temporarily delay the event and let it be immediately triggered at the next
|
||||
* allocation call.
|
||||
*/
|
||||
#define C(counter) \
|
||||
JEMALLOC_ALWAYS_INLINE void \
|
||||
counter##_set(tsd_t *tsd, uint64_t v) { \
|
||||
*tsd_##counter##p_get(tsd) = v; \
|
||||
}
|
||||
|
||||
ITERATE_OVER_ALL_COUNTERS
|
||||
#undef C
|
||||
|
||||
/*
|
||||
* E (defined above) generated the _event_wait getter / setter functions for
|
||||
* each individual event via the counter list; it is no longer needed here.
|
||||
*/
|
||||
#undef E
|
||||
|
||||
/*
|
||||
* The function checks in debug mode whether the thread event counters are in
|
||||
* a consistent state, which forms the invariants before and after each round
|
||||
* of thread event handling that we can rely on and need to promise.
|
||||
* The invariants are only temporarily violated in the middle of:
|
||||
* (a) thread_event() if an event is triggered (the thread_event_trigger() call
|
||||
* at the end will restore the invariants),
|
||||
* (b) thread_##event##_event_update() (the thread_event_update() call at the
|
||||
* end will restore the invariants), or
|
||||
* (c) thread_event_rollback() if the rollback falls below the last_event (the
|
||||
* thread_event_update() call at the end will restore the invariants).
|
||||
*/
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
thread_event_assert_invariants(tsd_t *tsd) {
|
||||
if (config_debug) {
|
||||
thread_event_assert_invariants_debug(tsd);
|
||||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
thread_event(tsd_t *tsd, size_t usize) {
|
||||
thread_event_assert_invariants(tsd);
|
||||
|
||||
uint64_t thread_allocated_before = thread_allocated_get(tsd);
|
||||
thread_allocated_set(tsd, thread_allocated_before + usize);
|
||||
|
||||
/* The subtraction is intentionally susceptible to underflow. */
|
||||
if (likely(usize < thread_allocated_next_event_get(tsd) -
|
||||
thread_allocated_before)) {
|
||||
thread_event_assert_invariants(tsd);
|
||||
} else {
|
||||
thread_event_trigger(tsd, false);
|
||||
}
|
||||
}
|
||||
|
||||
#define E(event, condition) \
|
||||
JEMALLOC_ALWAYS_INLINE void \
|
||||
thread_##event##_event_update(tsd_t *tsd, uint64_t event_wait) { \
|
||||
thread_event_assert_invariants(tsd); \
|
||||
assert(condition); \
|
||||
assert(tsd_nominal(tsd)); \
|
||||
assert(tsd_reentrancy_level_get(tsd) == 0); \
|
||||
assert(event_wait > 0U); \
|
||||
if (THREAD_EVENT_MIN_START_WAIT > 1U && \
|
||||
unlikely(event_wait < THREAD_EVENT_MIN_START_WAIT)) { \
|
||||
event_wait = THREAD_EVENT_MIN_START_WAIT; \
|
||||
} \
|
||||
if (THREAD_EVENT_MAX_START_WAIT < UINT64_MAX && \
|
||||
unlikely(event_wait > THREAD_EVENT_MAX_START_WAIT)) { \
|
||||
event_wait = THREAD_EVENT_MAX_START_WAIT; \
|
||||
} \
|
||||
event##_event_wait_set(tsd, event_wait); \
|
||||
thread_event_update(tsd); \
|
||||
}
|
||||
|
||||
ITERATE_OVER_ALL_EVENTS
|
||||
#undef E
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_H */
|
@@ -15,39 +15,45 @@
|
||||
|
||||
/*
|
||||
* Thread-Specific-Data layout
|
||||
* --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
|
||||
* --- data accessed on tcache fast path: state, rtree_ctx, stats ---
|
||||
* s: state
|
||||
* e: tcache_enabled
|
||||
* m: thread_allocated
|
||||
* k: thread_allocated_next_event_fast
|
||||
* f: thread_deallocated
|
||||
* b: bytes_until_sample (config_prof)
|
||||
* p: prof_tdata (config_prof)
|
||||
* c: rtree_ctx (rtree cache accessed on deallocation)
|
||||
* t: tcache
|
||||
* --- data not accessed on tcache fast path: arena-related fields ---
|
||||
* d: arenas_tdata_bypass
|
||||
* r: reentrancy_level
|
||||
* x: narenas_tdata
|
||||
* l: thread_allocated_last_event
|
||||
* j: thread_allocated_next_event
|
||||
* w: prof_sample_event_wait (config_prof)
|
||||
* p: prof_tdata (config_prof)
|
||||
* v: offset_state
|
||||
* i: iarena
|
||||
* a: arena
|
||||
* o: arenas_tdata
|
||||
* b: binshards
|
||||
* Loading TSD data is on the critical path of basically all malloc operations.
|
||||
* In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
|
||||
* Use a compact layout to reduce cache footprint.
|
||||
* +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
|
||||
* |---------------------------- 1st cacheline ----------------------------|
|
||||
* | sedrxxxx vvvvvvvv mmmmmmmm ffffffff bbbbbbbb pppppppp [c * 16 .......] |
|
||||
* | sedrxxxx mmmmmmmm kkkkkkkk ffffffff [c * 32 ........ ........ .......] |
|
||||
* |---------------------------- 2nd cacheline ----------------------------|
|
||||
* | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
|
||||
* |---------------------------- 3rd cacheline ----------------------------|
|
||||
* | [c * 48 ........ ........ ........ ........ .......] iiiiiiii aaaaaaaa |
|
||||
* | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp |
|
||||
* +---------------------------- 4th cacheline ----------------------------+
|
||||
* | oooooooo [t...... ........ ........ ........ ........ ........ ........ |
|
||||
* | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ |
|
||||
* +---------------------------- 5th cacheline ----------------------------+
|
||||
* | ..b][t.. ........ ........ ........ ........ ........ ........ ........ |
|
||||
* +-------------------------------------------------------------------------+
|
||||
* Note: the entire tcache is embedded into TSD and spans multiple cachelines.
|
||||
*
|
||||
* The last 3 members (i, a and o) before tcache isn't really needed on tcache
|
||||
* The elements after rtree_ctx and before tcache aren't really needed on tcache
|
||||
* fast path. However we have a number of unused tcache bins and witnesses
|
||||
* (never touched unless config_debug) at the end of tcache, so we place them
|
||||
* there to avoid breaking the cachelines and possibly paging in an extra page.
|
||||
@@ -64,18 +70,21 @@ typedef void (*test_callback_t)(int *);
|
||||
# define MALLOC_TEST_TSD_INITIALIZER
|
||||
#endif
|
||||
|
||||
/* O(name, type, nullable type */
|
||||
/* O(name, type, nullable type) */
|
||||
#define MALLOC_TSD \
|
||||
O(tcache_enabled, bool, bool) \
|
||||
O(arenas_tdata_bypass, bool, bool) \
|
||||
O(reentrancy_level, int8_t, int8_t) \
|
||||
O(narenas_tdata, uint32_t, uint32_t) \
|
||||
O(offset_state, uint64_t, uint64_t) \
|
||||
O(thread_allocated, uint64_t, uint64_t) \
|
||||
O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
|
||||
O(thread_deallocated, uint64_t, uint64_t) \
|
||||
O(bytes_until_sample, int64_t, int64_t) \
|
||||
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
|
||||
O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
|
||||
O(thread_allocated_last_event, uint64_t, uint64_t) \
|
||||
O(thread_allocated_next_event, uint64_t, uint64_t) \
|
||||
O(prof_sample_event_wait, uint64_t, uint64_t) \
|
||||
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
|
||||
O(offset_state, uint64_t, uint64_t) \
|
||||
O(iarena, arena_t *, arena_t *) \
|
||||
O(arena, arena_t *, arena_t *) \
|
||||
O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
|
||||
@@ -84,25 +93,34 @@ typedef void (*test_callback_t)(int *);
|
||||
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
|
||||
MALLOC_TEST_TSD
|
||||
|
||||
/*
|
||||
* THREAD_EVENT_MIN_START_WAIT should not exceed the minimal allocation usize.
|
||||
*/
|
||||
#define THREAD_EVENT_MIN_START_WAIT ((uint64_t)1U)
|
||||
#define THREAD_EVENT_MAX_START_WAIT UINT64_MAX
|
||||
|
||||
#define TSD_INITIALIZER { \
|
||||
ATOMIC_INIT(tsd_state_uninitialized), \
|
||||
TCACHE_ENABLED_ZERO_INITIALIZER, \
|
||||
false, \
|
||||
0, \
|
||||
0, \
|
||||
0, \
|
||||
0, \
|
||||
0, \
|
||||
0, \
|
||||
NULL, \
|
||||
RTREE_CTX_ZERO_INITIALIZER, \
|
||||
NULL, \
|
||||
NULL, \
|
||||
NULL, \
|
||||
TSD_BINSHARDS_ZERO_INITIALIZER, \
|
||||
TCACHE_ZERO_INITIALIZER, \
|
||||
WITNESS_TSD_INITIALIZER \
|
||||
MALLOC_TEST_TSD_INITIALIZER \
|
||||
/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
|
||||
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
|
||||
/* arenas_tdata_bypass */ false, \
|
||||
/* reentrancy_level */ 0, \
|
||||
/* narenas_tdata */ 0, \
|
||||
/* thread_allocated */ 0, \
|
||||
/* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \
|
||||
/* thread_deallocated */ 0, \
|
||||
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \
|
||||
/* thread_allocated_last_event */ 0, \
|
||||
/* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \
|
||||
/* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \
|
||||
/* prof_tdata */ NULL, \
|
||||
/* offset_state */ 0, \
|
||||
/* iarena */ NULL, \
|
||||
/* arena */ NULL, \
|
||||
/* arenas_tdata */ NULL, \
|
||||
/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
|
||||
/* tcache */ TCACHE_ZERO_INITIALIZER, \
|
||||
/* witness */ WITNESS_TSD_INITIALIZER \
|
||||
/* test data */ MALLOC_TEST_TSD_INITIALIZER \
|
||||
}
|
||||
|
||||
void *malloc_tsd_malloc(size_t size);
|
||||
|
Reference in New Issue
Block a user