diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h index 8a05eaed..3ceb4702 100644 --- a/include/jemalloc/internal/thread_event.h +++ b/include/jemalloc/internal/thread_event.h @@ -27,6 +27,7 @@ void thread_event_trigger(tsd_t *tsd, bool delay_event); void thread_event_rollback(tsd_t *tsd, size_t diff); void thread_event_update(tsd_t *tsd); void thread_event_boot(); +void thread_event_recompute_fast_threshold(tsd_t *tsd); void tsd_thread_event_init(tsd_t *tsd); /* @@ -43,9 +44,7 @@ void tsd_thread_event_init(tsd_t *tsd); /* List of all thread event counters. */ #define ITERATE_OVER_ALL_COUNTERS \ C(thread_allocated) \ - C(thread_allocated_next_event_fast) \ C(thread_allocated_last_event) \ - C(thread_allocated_next_event) \ ITERATE_OVER_ALL_EVENTS \ C(prof_sample_last_event) @@ -81,6 +80,60 @@ ITERATE_OVER_ALL_COUNTERS */ #undef E +/* + * Two malloc fastpath getters -- use the unsafe getters since tsd may be + * non-nominal, in which case the fast_threshold will be set to 0. This allows + * checking for events and tsd non-nominal in a single branch. + * + * Note that these can only be used on the fastpath. + */ +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_malloc_fastpath(tsd_t *tsd) { + return *tsd_thread_allocatedp_get_unsafe(tsd); +} + +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_next_event_malloc_fastpath(tsd_t *tsd) { + uint64_t v = *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd); + assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + return v; +} + +/* Below 3 for next_event_fast. */ +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_next_event_fast_get(tsd_t *tsd) { + uint64_t v = tsd_thread_allocated_next_event_fast_get(tsd); + assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + return v; +} + +JEMALLOC_ALWAYS_INLINE void +thread_allocated_next_event_fast_set(tsd_t *tsd, uint64_t v) { + assert(v <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + *tsd_thread_allocated_next_event_fastp_get(tsd) = v; +} + +JEMALLOC_ALWAYS_INLINE void +thread_allocated_next_event_fast_set_non_nominal(tsd_t *tsd) { + /* + * Set the fast threshold to zero when tsd is non-nominal. Use the + * unsafe getter as this may get called during tsd init and clean up. + */ + *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd) = 0; +} + +/* For next_event. Setter also updates the fast threshold. 
*/ +JEMALLOC_ALWAYS_INLINE uint64_t +thread_allocated_next_event_get(tsd_t *tsd) { + return tsd_thread_allocated_next_event_get(tsd); +} + +JEMALLOC_ALWAYS_INLINE void +thread_allocated_next_event_set(tsd_t *tsd, uint64_t v) { + *tsd_thread_allocated_next_eventp_get(tsd) = v; + thread_event_recompute_fast_threshold(tsd); +} + /* * The function checks in debug mode whether the thread event counters are in * a consistent state, which forms the invariants before and after each round diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 6332a003..961fc1f5 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -110,7 +110,7 @@ typedef void (*test_callback_t)(int *); /* reentrancy_level */ 0, \ /* narenas_tdata */ 0, \ /* thread_allocated */ 0, \ - /* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \ + /* thread_allocated_next_event_fast */ 0, \ /* thread_deallocated */ 0, \ /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \ /* thread_allocated_last_event */ 0, \ diff --git a/src/jemalloc.c b/src/jemalloc.c index e25e064b..af72d41a 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2354,7 +2354,7 @@ je_malloc(size_t size) { } tsd_t *tsd = tsd_get(false); - if (unlikely((size > SC_LOOKUP_MAXCLASS) || !tsd || !tsd_fast(tsd))) { + if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) { return malloc_default(size); } @@ -2373,13 +2373,17 @@ je_malloc(size_t size) { assert(ind < SC_NBINS); assert(size <= SC_SMALL_MAXCLASS); - uint64_t thread_allocated_after = thread_allocated_get(tsd) + usize; - assert(thread_allocated_next_event_fast_get(tsd) <= - THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); - if (unlikely(thread_allocated_after >= - thread_allocated_next_event_fast_get(tsd))) { + uint64_t allocated = thread_allocated_malloc_fastpath(tsd); + uint64_t threshold = thread_allocated_next_event_malloc_fastpath(tsd); + /* + * Check for events and tsd non-nominal (fast_threshold will be set to + * 0) in a single branch. + */ + uint64_t allocated_after = allocated + usize; + if (unlikely(allocated_after >= threshold)) { return malloc_default(size); } + assert(tsd_fast(tsd)); tcache_t *tcache = tsd_tcachep_get(tsd); cache_bin_t *bin = tcache_small_bin_get(tcache, ind); @@ -2387,7 +2391,7 @@ je_malloc(size_t size) { void *ret = cache_bin_alloc_easy_reduced(bin, &tcache_success); if (tcache_success) { - thread_allocated_set(tsd, thread_allocated_after); + thread_allocated_set(tsd, allocated_after); if (config_stats) { bin->tstats.nrequests++; } diff --git a/src/thread_event.c b/src/thread_event.c index 9a1d0f9b..0657c841 100644 --- a/src/thread_event.c +++ b/src/thread_event.c @@ -103,10 +103,11 @@ thread_event_assert_invariants_debug(tsd_t *tsd) { uint64_t next_event_fast = thread_allocated_next_event_fast_get(tsd); assert(last_event != next_event); - if (next_event <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) { - assert(next_event_fast == next_event); - } else { + if (next_event > THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX || + !tsd_fast(tsd)) { assert(next_event_fast == 0U); + } else { + assert(next_event_fast == next_event); } /* The subtraction is intentionally susceptible to underflow. 
*/ @@ -128,15 +129,77 @@ thread_event_assert_invariants_debug(tsd_t *tsd) { (interval < min_wait && interval == THREAD_EVENT_MAX_INTERVAL)); } +/* + * Synchronization around the fast threshold in tsd -- + * There are two threads to consider in the synchronization here: + * - The owner of the tsd being updated by a slow path change + * - The remote thread, doing that slow path change. + * + * As a design constraint, we want to ensure that a slow-path transition cannot + * be ignored for arbitrarily long, and that if the remote thread causes a + * slow-path transition and then communicates with the owner thread that it has + * occurred, then the owner will go down the slow path on the next allocator + * operation (so that we don't want to just wait until the owner hits its slow + * path reset condition on its own). + * + * Here's our strategy to do that: + * + * The remote thread will update the slow-path stores to TSD variables, issue a + * SEQ_CST fence, and then update the TSD next_event_fast counter. The owner + * thread will update next_event_fast, issue an SEQ_CST fence, and then check + * its TSD to see if it's on the slow path. + + * This is fairly straightforward when 64-bit atomics are supported. Assume that + * the remote fence is sandwiched between two owner fences in the reset pathway. + * The case where there is no preceding or trailing owner fence (i.e. because + * the owner thread is near the beginning or end of its life) can be analyzed + * similarly. The owner store to next_event_fast preceding the earlier owner + * fence will be earlier in coherence order than the remote store to it, so that + * the owner thread will go down the slow path once the store becomes visible to + * it, which is no later than the time of the second fence. + + * The case where we don't support 64-bit atomics is trickier, since word + * tearing is possible. We'll repeat the same analysis, and look at the two + * owner fences sandwiching the remote fence. The next_event_fast stores done + * alongside the earlier owner fence cannot overwrite any of the remote stores + * (since they precede the earlier owner fence in sb, which precedes the remote + * fence in sc, which precedes the remote stores in sb). After the second owner + * fence there will be a re-check of the slow-path variables anyways, so the + * "owner will notice that it's on the slow path eventually" guarantee is + * satisfied. To make sure that the out-of-band-messaging constraint is as well, + * note that either the message passing is sequenced before the second owner + * fence (in which case the remote stores happen before the second set of owner + * stores, so malloc sees a value of zero for next_event_fast and goes down the + * slow path), or it is not (in which case the owner sees the tsd slow-path + * writes on its previous update). This leaves open the possibility that the + * remote thread will (at some arbitrary point in the future) zero out one half + * of the owner thread's next_event_fast, but that's always safe (it just sends + * it down the slow path earlier). + */ +void +thread_event_recompute_fast_threshold(tsd_t *tsd) { + if (tsd_state_get(tsd) != tsd_state_nominal) { + /* Check first because this is also called on purgatory. */ + thread_allocated_next_event_fast_set_non_nominal(tsd); + return; + } + uint64_t next_event = thread_allocated_next_event_get(tsd); + uint64_t next_event_fast = (next_event <= + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? 
next_event : 0U; + thread_allocated_next_event_fast_set(tsd, next_event_fast); + + atomic_fence(ATOMIC_SEQ_CST); + if (tsd_state_get(tsd) != tsd_state_nominal) { + thread_allocated_next_event_fast_set_non_nominal(tsd); + } +} + static void thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) { assert(wait <= THREAD_EVENT_MAX_START_WAIT); uint64_t next_event = thread_allocated_last_event_get(tsd) + (wait <= THREAD_EVENT_MAX_INTERVAL ? wait : THREAD_EVENT_MAX_INTERVAL); thread_allocated_next_event_set(tsd, next_event); - uint64_t next_event_fast = (next_event <= - THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? next_event : 0U; - thread_allocated_next_event_fast_set(tsd, next_event_fast); } static uint64_t diff --git a/src/tsd.c b/src/tsd.c index 6e0ee93c..17e9eed2 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -115,8 +115,11 @@ tsd_force_recompute(tsdn_t *tsdn) { ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) { assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED) <= tsd_state_nominal_max); - tsd_atomic_store(&remote_tsd->state, tsd_state_nominal_recompute, - ATOMIC_RELAXED); + tsd_atomic_store(&remote_tsd->state, + tsd_state_nominal_recompute, ATOMIC_RELAXED); + /* See comments in thread_event_recompute_fast_threshold(). */ + atomic_fence(ATOMIC_SEQ_CST); + thread_allocated_next_event_fast_set_non_nominal(remote_tsd); } malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock); } @@ -175,6 +178,8 @@ tsd_slow_update(tsd_t *tsd) { old_state = tsd_atomic_exchange(&tsd->state, new_state, ATOMIC_ACQUIRE); } while (old_state == tsd_state_nominal_recompute); + + thread_event_recompute_fast_threshold(tsd); } void @@ -213,6 +218,7 @@ tsd_state_set(tsd_t *tsd, uint8_t new_state) { tsd_slow_update(tsd); } } + thread_event_recompute_fast_threshold(tsd); } static bool diff --git a/test/unit/thread_event.c b/test/unit/thread_event.c index cf5b2e59..f016cc5d 100644 --- a/test/unit/thread_event.c +++ b/test/unit/thread_event.c @@ -7,8 +7,6 @@ TEST_BEGIN(test_next_event_fast_roll_back) { THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX - 8U); thread_allocated_next_event_set(tsd, THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); - thread_allocated_next_event_fast_set(tsd, - THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); #define E(event, condition) \ event##_event_wait_set(tsd, \ THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); @@ -27,7 +25,6 @@ TEST_BEGIN(test_next_event_fast_resume) { THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 8U); thread_allocated_next_event_set(tsd, THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U); - thread_allocated_next_event_fast_set(tsd, 0); #define E(event, condition) \ event##_event_wait_set(tsd, \ THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U);
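
Illustrative sketch (not jemalloc code): the standalone C program below mirrors the core idea of this patch, namely keeping a separate fast-path threshold that is zeroed whenever the thread is non-nominal or the real threshold does not fit the fast-path range, so that a single unsigned comparison on the allocation fast path covers both "an event is due" and "this thread must take the slow path". All names here (FAST_MAX, thread_state_t, recompute_fast_threshold, fast_path_alloc, slow_path_alloc, the 1024-byte event interval) are hypothetical stand-ins, and the cross-thread SEQ_CST fence handshake used by tsd_force_recompute() / thread_event_recompute_fast_threshold() is deliberately omitted to keep the sketch single-threaded.

/*
 * Sketch only -- hypothetical names, not the jemalloc API.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Largest threshold the fast path can represent; larger values go slow. */
#define FAST_MAX ((uint64_t)1 << 32)

typedef struct {
	bool nominal;            /* thread state fully initialized and usable */
	uint64_t allocated;      /* bytes allocated so far */
	uint64_t next_event;     /* full-width threshold for the next event */
	uint64_t fast_threshold; /* 0 when non-nominal or next_event > FAST_MAX */
} thread_state_t;

/* Recompute the fast threshold whenever next_event or the state changes. */
static void
recompute_fast_threshold(thread_state_t *ts) {
	if (!ts->nominal) {
		/* Zero forces every allocation down the slow path. */
		ts->fast_threshold = 0;
		return;
	}
	ts->fast_threshold = (ts->next_event <= FAST_MAX) ? ts->next_event : 0;
}

static void
slow_path_alloc(thread_state_t *ts, uint64_t size) {
	/* Handle events, non-nominal state, oversized thresholds, etc. */
	ts->allocated += size;
	ts->next_event = ts->allocated + 1024; /* pick the next event point */
	recompute_fast_threshold(ts);
}

static void
fast_path_alloc(thread_state_t *ts, uint64_t size) {
	uint64_t after = ts->allocated + size;
	/*
	 * One branch: taken if an event is due, or if fast_threshold was
	 * zeroed (after >= 0 is always true for an unsigned value).
	 */
	if (after >= ts->fast_threshold) {
		slow_path_alloc(ts, size);
		return;
	}
	ts->allocated = after;
}

int
main(void) {
	thread_state_t ts = {true, 0, 1024, 0};
	recompute_fast_threshold(&ts);
	for (int i = 0; i < 100; i++) {
		fast_path_alloc(&ts, 64);
	}
	printf("allocated %llu, next_event %llu\n",
	    (unsigned long long)ts.allocated,
	    (unsigned long long)ts.next_event);
	return 0;
}

In the patch itself the same shape appears in je_malloc(): thread_allocated_next_event_malloc_fastpath() reads the possibly-zeroed fast threshold through the unsafe getter, and the single unlikely() comparison routes both event handling and non-nominal tsd to malloc_default().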