Optimize away the tsd_fast() check on fastpath.

Fold the tsd_state check onto the event threshold check.  The fast threshold is
set to 0 when tsd switches to non-nominal.

The fast_threshold can be reset by remote threads, to reflect a non-nominal tsd
state change.
Qi Wang, 2019-11-11 16:34:48 -08:00 (committed by Qi Wang)
commit dd649c9485 (parent 1decf958d1)
6 changed files with 144 additions and 21 deletions
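
A minimal sketch of the idea, before the per-file diffs (illustrative only -- the toy_* names below are not jemalloc's; the real change is in the diffs that follow): the per-thread fast threshold is forced to 0 whenever the thread's tsd is non-nominal, so one unsigned comparison on the malloc fast path covers both "an event is due" and "tsd is not nominal".

#include <stdbool.h>
#include <stdint.h>

/* Illustrative per-thread state; field names are not jemalloc's. */
typedef struct {
	uint64_t allocated;       /* bytes allocated so far by this thread */
	uint64_t fast_threshold;  /* next-event threshold; 0 when non-nominal */
} toy_thread_state;

/* Slow-path transition: zeroing the threshold poisons the fast path. */
static void
toy_make_non_nominal(toy_thread_state *ts) {
	ts->fast_threshold = 0;
}

/* Returns true if the caller must take the slow path. */
static bool
toy_fastpath_must_bail(const toy_thread_state *ts, uint64_t usize) {
	/*
	 * One branch covers both cases: if fast_threshold == 0, the
	 * comparison is always true and we bail to the slow path.
	 */
	return ts->allocated + usize >= ts->fast_threshold;
}

With the threshold at 0, allocated + usize >= fast_threshold holds for every request size, so a non-nominal thread always bails to the slow path; that is the property the je_malloc() change below relies on.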

@@ -27,6 +27,7 @@ void thread_event_trigger(tsd_t *tsd, bool delay_event);
 void thread_event_rollback(tsd_t *tsd, size_t diff);
 void thread_event_update(tsd_t *tsd);
 void thread_event_boot();
+void thread_event_recompute_fast_threshold(tsd_t *tsd);
 void tsd_thread_event_init(tsd_t *tsd);
 
 /*
@@ -43,9 +44,7 @@ void tsd_thread_event_init(tsd_t *tsd);
 /* List of all thread event counters. */
 #define ITERATE_OVER_ALL_COUNTERS \
 	C(thread_allocated) \
-	C(thread_allocated_next_event_fast) \
 	C(thread_allocated_last_event) \
-	C(thread_allocated_next_event) \
 	ITERATE_OVER_ALL_EVENTS \
 	C(prof_sample_last_event)
 
@@ -81,6 +80,60 @@ ITERATE_OVER_ALL_COUNTERS
  */
 #undef E
 
+/*
+ * Two malloc fastpath getters -- use the unsafe getters since tsd may be
+ * non-nominal, in which case the fast_threshold will be set to 0.  This allows
+ * checking for events and tsd non-nominal in a single branch.
+ *
+ * Note that these can only be used on the fastpath.
+ */
+JEMALLOC_ALWAYS_INLINE uint64_t
+thread_allocated_malloc_fastpath(tsd_t *tsd) {
+	return *tsd_thread_allocatedp_get_unsafe(tsd);
+}
+
+JEMALLOC_ALWAYS_INLINE uint64_t
+thread_allocated_next_event_malloc_fastpath(tsd_t *tsd) {
+	uint64_t v = *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd);
+	assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
+	return v;
+}
+
+/* Below 3 for next_event_fast. */
+JEMALLOC_ALWAYS_INLINE uint64_t
+thread_allocated_next_event_fast_get(tsd_t *tsd) {
+	uint64_t v = tsd_thread_allocated_next_event_fast_get(tsd);
+	assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
+	return v;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+thread_allocated_next_event_fast_set(tsd_t *tsd, uint64_t v) {
+	assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
+	*tsd_thread_allocated_next_event_fastp_get(tsd) = v;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+thread_allocated_next_event_fast_set_non_nominal(tsd_t *tsd) {
+	/*
+	 * Set the fast threshold to zero when tsd is non-nominal.  Use the
+	 * unsafe getter as this may get called during tsd init and clean up.
+	 */
+	*tsd_thread_allocated_next_event_fastp_get_unsafe(tsd) = 0;
+}
+
+/* For next_event.  Setter also updates the fast threshold. */
+JEMALLOC_ALWAYS_INLINE uint64_t
+thread_allocated_next_event_get(tsd_t *tsd) {
+	return tsd_thread_allocated_next_event_get(tsd);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+thread_allocated_next_event_set(tsd_t *tsd, uint64_t v) {
+	*tsd_thread_allocated_next_eventp_get(tsd) = v;
+	thread_event_recompute_fast_threshold(tsd);
+}
+
 /*
  * The function checks in debug mode whether the thread event counters are in
  * a consistent state, which forms the invariants before and after each round

@@ -110,7 +110,7 @@ typedef void (*test_callback_t)(int *);
 	/* reentrancy_level */	0, \
 	/* narenas_tdata */	0, \
 	/* thread_allocated */	0, \
-	/* thread_allocated_next_event_fast */	THREAD_EVENT_MIN_START_WAIT, \
+	/* thread_allocated_next_event_fast */	0, \
 	/* thread_deallocated */	0, \
 	/* rtree_ctx */	RTREE_CTX_ZERO_INITIALIZER, \
 	/* thread_allocated_last_event */	0, \

@@ -2354,7 +2354,7 @@ je_malloc(size_t size) {
 	}
 
 	tsd_t *tsd = tsd_get(false);
-	if (unlikely((size > SC_LOOKUP_MAXCLASS) || !tsd || !tsd_fast(tsd))) {
+	if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
 		return malloc_default(size);
 	}
 
@@ -2373,13 +2373,17 @@ je_malloc(size_t size) {
 	assert(ind < SC_NBINS);
 	assert(size <= SC_SMALL_MAXCLASS);
 
-	uint64_t thread_allocated_after = thread_allocated_get(tsd) + usize;
-	assert(thread_allocated_next_event_fast_get(tsd) <=
-	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
-	if (unlikely(thread_allocated_after >=
-	    thread_allocated_next_event_fast_get(tsd))) {
+	uint64_t allocated = thread_allocated_malloc_fastpath(tsd);
+	uint64_t threshold = thread_allocated_next_event_malloc_fastpath(tsd);
+	/*
+	 * Check for events and tsd non-nominal (fast_threshold will be set to
+	 * 0) in a single branch.
+	 */
+	uint64_t allocated_after = allocated + usize;
+	if (unlikely(allocated_after >= threshold)) {
 		return malloc_default(size);
 	}
+	assert(tsd_fast(tsd));
 
 	tcache_t *tcache = tsd_tcachep_get(tsd);
 	cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
@@ -2387,7 +2391,7 @@ je_malloc(size_t size) {
 	void *ret = cache_bin_alloc_easy_reduced(bin, &tcache_success);
 
 	if (tcache_success) {
-		thread_allocated_set(tsd, thread_allocated_after);
+		thread_allocated_set(tsd, allocated_after);
 		if (config_stats) {
 			bin->tstats.nrequests++;
 		}

@@ -103,10 +103,11 @@ thread_event_assert_invariants_debug(tsd_t *tsd) {
 	uint64_t next_event_fast = thread_allocated_next_event_fast_get(tsd);
 
 	assert(last_event != next_event);
-	if (next_event <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) {
-		assert(next_event_fast == next_event);
-	} else {
+	if (next_event > THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX ||
+	    !tsd_fast(tsd)) {
 		assert(next_event_fast == 0U);
+	} else {
+		assert(next_event_fast == next_event);
 	}
 
 	/* The subtraction is intentionally susceptible to underflow. */
@@ -128,15 +129,77 @@ thread_event_assert_invariants_debug(tsd_t *tsd) {
 	    (interval < min_wait && interval == THREAD_EVENT_MAX_INTERVAL));
 }
 
+/*
+ * Synchronization around the fast threshold in tsd --
+ * There are two threads to consider in the synchronization here:
+ * - The owner of the tsd being updated by a slow path change
+ * - The remote thread, doing that slow path change.
+ *
+ * As a design constraint, we want to ensure that a slow-path transition cannot
+ * be ignored for arbitrarily long, and that if the remote thread causes a
+ * slow-path transition and then communicates with the owner thread that it has
+ * occurred, then the owner will go down the slow path on the next allocator
+ * operation (so that we don't want to just wait until the owner hits its slow
+ * path reset condition on its own).
+ *
+ * Here's our strategy to do that:
+ *
+ * The remote thread will update the slow-path stores to TSD variables, issue a
+ * SEQ_CST fence, and then update the TSD next_event_fast counter.  The owner
+ * thread will update next_event_fast, issue an SEQ_CST fence, and then check
+ * its TSD to see if it's on the slow path.
+ *
+ * This is fairly straightforward when 64-bit atomics are supported.  Assume
+ * that the remote fence is sandwiched between two owner fences in the reset
+ * pathway.  The case where there is no preceding or trailing owner fence
+ * (i.e. because the owner thread is near the beginning or end of its life)
+ * can be analyzed similarly.  The owner store to next_event_fast preceding
+ * the earlier owner fence will be earlier in coherence order than the remote
+ * store to it, so that the owner thread will go down the slow path once the
+ * store becomes visible to it, which is no later than the time of the second
+ * fence.
+ *
+ * The case where we don't support 64-bit atomics is trickier, since word
+ * tearing is possible.  We'll repeat the same analysis, and look at the two
+ * owner fences sandwiching the remote fence.  The next_event_fast stores done
+ * alongside the earlier owner fence cannot overwrite any of the remote stores
+ * (since they precede the earlier owner fence in sb, which precedes the
+ * remote fence in sc, which precedes the remote stores in sb).  After the
+ * second owner fence there will be a re-check of the slow-path variables
+ * anyways, so the "owner will notice that it's on the slow path eventually"
+ * guarantee is satisfied.  To make sure that the out-of-band-messaging
+ * constraint is as well, note that either the message passing is sequenced
+ * before the second owner fence (in which case the remote stores happen
+ * before the second set of owner stores, so malloc sees a value of zero for
+ * next_event_fast and goes down the slow path), or it is not (in which case
+ * the owner sees the tsd slow-path writes on its previous update).  This
+ * leaves open the possibility that the remote thread will (at some arbitrary
+ * point in the future) zero out one half of the owner thread's
+ * next_event_fast, but that's always safe (it just sends it down the slow
+ * path earlier).
+ */
+void
+thread_event_recompute_fast_threshold(tsd_t *tsd) {
+	if (tsd_state_get(tsd) != tsd_state_nominal) {
+		/* Check first because this is also called on purgatory. */
+		thread_allocated_next_event_fast_set_non_nominal(tsd);
+		return;
+	}
+
+	uint64_t next_event = thread_allocated_next_event_get(tsd);
+	uint64_t next_event_fast = (next_event <=
+	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? next_event : 0U;
+	thread_allocated_next_event_fast_set(tsd, next_event_fast);
+
+	atomic_fence(ATOMIC_SEQ_CST);
+	if (tsd_state_get(tsd) != tsd_state_nominal) {
+		thread_allocated_next_event_fast_set_non_nominal(tsd);
+	}
+}
+
 static void
 thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) {
 	assert(wait <= THREAD_EVENT_MAX_START_WAIT);
 	uint64_t next_event = thread_allocated_last_event_get(tsd) + (wait <=
 	    THREAD_EVENT_MAX_INTERVAL ? wait : THREAD_EVENT_MAX_INTERVAL);
 	thread_allocated_next_event_set(tsd, next_event);
-	uint64_t next_event_fast = (next_event <=
-	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? next_event : 0U;
-	thread_allocated_next_event_fast_set(tsd, next_event_fast);
 }
 
 static uint64_t
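
The synchronization comment above, restated as a hedged C11 sketch of the same ordering. This is not jemalloc's code: the toy_tsd type and function names are invented for illustration, and standard <stdatomic.h> calls stand in for jemalloc's atomic_fence() / tsd_atomic_* wrappers.

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-in for the relevant tsd fields (not jemalloc's layout). */
typedef struct {
	_Atomic uint8_t state;             /* nominal vs. slow-path states */
	_Atomic uint64_t next_event_fast;  /* fast-path threshold; 0 => slow path */
} toy_tsd;

enum { TOY_NOMINAL = 0, TOY_RECOMPUTE = 1 };

/* Remote thread: request a slow-path transition for another thread's tsd. */
static void
remote_force_slow_path(toy_tsd *t) {
	atomic_store_explicit(&t->state, TOY_RECOMPUTE, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	/* Zero the threshold so the owner's next fast-path check fails. */
	atomic_store_explicit(&t->next_event_fast, 0, memory_order_relaxed);
}

/* Owner thread: recompute the fast threshold, then re-check the state. */
static void
owner_recompute(toy_tsd *t, uint64_t next_event) {
	atomic_store_explicit(&t->next_event_fast, next_event,
	    memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&t->state, memory_order_relaxed) !=
	    TOY_NOMINAL) {
		/* A remote transition raced with us; force the slow path. */
		atomic_store_explicit(&t->next_event_fast, 0,
		    memory_order_relaxed);
	}
}

Either the owner's post-fence re-check observes the remote state change, or the remote zeroing of next_event_fast is ordered after the owner's store and the next fast-path allocation sees a zero threshold; both outcomes push the owner onto the slow path, which is the guarantee argued for above.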

@@ -115,8 +115,11 @@ tsd_force_recompute(tsdn_t *tsdn) {
 	ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) {
 		assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED)
 		    <= tsd_state_nominal_max);
-		tsd_atomic_store(&remote_tsd->state, tsd_state_nominal_recompute,
-		    ATOMIC_RELAXED);
+		tsd_atomic_store(&remote_tsd->state,
+		    tsd_state_nominal_recompute, ATOMIC_RELAXED);
+		/* See comments in thread_event_recompute_fast_threshold(). */
+		atomic_fence(ATOMIC_SEQ_CST);
+		thread_allocated_next_event_fast_set_non_nominal(remote_tsd);
 	}
 	malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock);
 }
@@ -175,6 +178,8 @@ tsd_slow_update(tsd_t *tsd) {
 		old_state = tsd_atomic_exchange(&tsd->state, new_state,
 		    ATOMIC_ACQUIRE);
 	} while (old_state == tsd_state_nominal_recompute);
+
+	thread_event_recompute_fast_threshold(tsd);
 }
 
 void
@@ -213,6 +218,7 @@ tsd_state_set(tsd_t *tsd, uint8_t new_state) {
 			tsd_slow_update(tsd);
 		}
 	}
+	thread_event_recompute_fast_threshold(tsd);
 }
 
 static bool

@@ -7,8 +7,6 @@ TEST_BEGIN(test_next_event_fast_roll_back) {
 	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX - 8U);
 	thread_allocated_next_event_set(tsd,
 	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
-	thread_allocated_next_event_fast_set(tsd,
-	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
 #define E(event, condition) \
 	event##_event_wait_set(tsd, \
 	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX);
@@ -27,7 +25,6 @@ TEST_BEGIN(test_next_event_fast_resume) {
 	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 8U);
 	thread_allocated_next_event_set(tsd,
 	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U);
-	thread_allocated_next_event_fast_set(tsd, 0);
 #define E(event, condition) \
 	event##_event_wait_set(tsd, \
 	    THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U);