From 152c0ef954f19fc2bbe53fead9c62c9824f06109 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Tue, 3 Sep 2019 15:04:48 -0700 Subject: [PATCH] Build a general purpose thread event handler --- Makefile.in | 10 +- include/jemalloc/internal/prof_inlines_b.h | 62 +++-- include/jemalloc/internal/thread_event.h | 139 ++++++++++ include/jemalloc/internal/tsd.h | 76 ++++-- .../projects/vc2015/jemalloc/jemalloc.vcxproj | 3 +- .../vc2015/jemalloc/jemalloc.vcxproj.filters | 27 +- .../projects/vc2017/jemalloc/jemalloc.vcxproj | 3 +- .../vc2017/jemalloc/jemalloc.vcxproj.filters | 33 +-- src/jemalloc.c | 78 +++--- src/prof.c | 21 +- src/thread_event.c | 255 ++++++++++++++++++ test/unit/thread_event.c | 57 ++++ test/unit/thread_event.sh | 5 + 13 files changed, 630 insertions(+), 139 deletions(-) create mode 100644 include/jemalloc/internal/thread_event.h create mode 100644 src/thread_event.c create mode 100644 test/unit/thread_event.c create mode 100644 test/unit/thread_event.sh diff --git a/Makefile.in b/Makefile.in index fede961d..7eba7742 100644 --- a/Makefile.in +++ b/Makefile.in @@ -123,11 +123,12 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/prof_log.c \ $(srcroot)src/rtree.c \ $(srcroot)src/safety_check.c \ - $(srcroot)src/stats.c \ $(srcroot)src/sc.c \ + $(srcroot)src/stats.c \ $(srcroot)src/sz.c \ $(srcroot)src/tcache.c \ $(srcroot)src/test_hooks.c \ + $(srcroot)src/thread_event.c \ $(srcroot)src/ticker.c \ $(srcroot)src/tsd.c \ $(srcroot)src/witness.c @@ -176,9 +177,9 @@ TESTS_UNIT := \ $(srcroot)test/unit/background_thread.c \ $(srcroot)test/unit/background_thread_enable.c \ $(srcroot)test/unit/base.c \ + $(srcroot)test/unit/binshard.c \ $(srcroot)test/unit/bitmap.c \ $(srcroot)test/unit/bit_util.c \ - $(srcroot)test/unit/binshard.c \ $(srcroot)test/unit/buf_writer.c \ $(srcroot)test/unit/cache_bin.c \ $(srcroot)test/unit/ckh.c \ @@ -200,6 +201,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/math.c \ $(srcroot)test/unit/mq.c \ $(srcroot)test/unit/mtx.c \ + $(srcroot)test/unit/nstime.c \ $(srcroot)test/unit/pack.c \ $(srcroot)test/unit/pages.c \ $(srcroot)test/unit/ph.c \ @@ -218,9 +220,9 @@ TESTS_UNIT := \ $(srcroot)test/unit/retained.c \ $(srcroot)test/unit/rtree.c \ $(srcroot)test/unit/safety_check.c \ + $(srcroot)test/unit/sc.c \ $(srcroot)test/unit/seq.c \ $(srcroot)test/unit/SFMT.c \ - $(srcroot)test/unit/sc.c \ $(srcroot)test/unit/size_classes.c \ $(srcroot)test/unit/slab.c \ $(srcroot)test/unit/smoothstep.c \ @@ -228,8 +230,8 @@ TESTS_UNIT := \ $(srcroot)test/unit/stats.c \ $(srcroot)test/unit/stats_print.c \ $(srcroot)test/unit/test_hooks.c \ + $(srcroot)test/unit/thread_event.c \ $(srcroot)test/unit/ticker.c \ - $(srcroot)test/unit/nstime.c \ $(srcroot)test/unit/tsd.c \ $(srcroot)test/unit/witness.c \ $(srcroot)test/unit/zero.c \ diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 6b10f5bf..b4e65c05 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -3,6 +3,7 @@ #include "jemalloc/internal/safety_check.h" #include "jemalloc/internal/sz.h" +#include "jemalloc/internal/thread_event.h" JEMALLOC_ALWAYS_INLINE bool prof_gdump_get_unlocked(void) { @@ -79,24 +80,6 @@ prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, nstime_t t) { arena_prof_alloc_time_set(tsdn, ptr, t); } -JEMALLOC_ALWAYS_INLINE bool -prof_sample_check(tsd_t *tsd, size_t usize, bool update) { - ssize_t check = update ? 
0 : usize; - - int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd); - if (update) { - bytes_until_sample -= usize; - if (tsd_nominal(tsd)) { - tsd_bytes_until_sample_set(tsd, bytes_until_sample); - } - } - if (likely(bytes_until_sample >= check)) { - return true; - } - - return false; -} - JEMALLOC_ALWAYS_INLINE bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, prof_tdata_t **tdata_out) { @@ -105,7 +88,7 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, cassert(config_prof); /* Fastpath: no need to load tdata */ - if (likely(prof_sample_check(tsd, usize, update))) { + if (likely(prof_sample_event_wait_get(tsd) > 0)) { return true; } @@ -127,13 +110,40 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, return true; } - /* - * If this was the first creation of tdata, then - * prof_tdata_get() reset bytes_until_sample, so decrement and - * check it again - */ - if (!booted && prof_sample_check(tsd, usize, update)) { - return true; + if (!booted) { + /* + * If this was the first creation of tdata, then it means that + * the previous thread_event() relied on the wrong prof_sample + * wait time, and that it should have relied on the new + * prof_sample wait time just set by prof_tdata_get(), so we + * now manually check again. + * + * If the check fails, then even though we relied on the wrong + * prof_sample wait time, we're now actually in perfect shape, + * in the sense that we can pretend that we have used the right + * prof_sample wait time. + * + * If the check succeeds, then we are now in a tougher + * situation, in the sense that we cannot pretend that we have + * used the right prof_sample wait time. A straightforward + * solution would be to fully roll back thread_event(), set the + * right prof_sample wait time, and then redo thread_event(). + * A simpler way, which is implemented below, is to just set a + * new prof_sample wait time that is usize less, and do nothing + * else. Strictly speaking, the thread event handler may end + * up in a wrong state, since it has still recorded an event + * whereas in reality there may be no event. However, the + * difference in the wait time offsets the wrongly recorded + * event, so that, functionally, the countdown to the next + * event will behave exactly as if we have used the right + * prof_sample wait time in the first place. + */ + uint64_t wait = prof_sample_event_wait_get(tsd); + assert(wait > 0); + if (usize < wait) { + thread_prof_sample_event_update(tsd, wait - usize); + return true; + } } /* Compute new sample threshold. */ diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h new file mode 100644 index 00000000..08678b74 --- /dev/null +++ b/include/jemalloc/internal/thread_event.h @@ -0,0 +1,139 @@ +#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_H +#define JEMALLOC_INTERNAL_THREAD_EVENT_H + +#include "jemalloc/internal/tsd.h" + +/* + * Maximum threshold on thread_allocated_next_event_fast, so that there is no + * need to check overflow in malloc fast path. (The allocation size in malloc + * fast path never exceeds SC_LOOKUP_MAXCLASS.) + */ +#define THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX \ + (UINT64_MAX - SC_LOOKUP_MAXCLASS + 1U) + +/* + * The max interval helps make sure that malloc stays on the fast path in the + * common case, i.e. thread_allocated < thread_allocated_next_event_fast. 
+ * When thread_allocated is within an event's distance to + * THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX above, thread_allocated_next_event_fast + * is wrapped around and we fall back to the medium-fast path. The max interval + * makes sure that we're not staying on the fallback case for too long, even if + * there's no active event or if all active events have long wait times. + */ +#define THREAD_EVENT_MAX_INTERVAL ((uint64_t)(4U << 20)) + +void thread_event_assert_invariants_debug(tsd_t *tsd); +void thread_event_trigger(tsd_t *tsd, bool delay_event); +void thread_event_rollback(tsd_t *tsd, size_t diff); +void thread_event_update(tsd_t *tsd); +void thread_event_boot(); + +/* + * List of all events, in the following format: + * E(event, (condition)) + */ +#define ITERATE_OVER_ALL_EVENTS \ + E(prof_sample, (config_prof && opt_prof)) + +#define E(event, condition) \ + C(event##_event_wait) + +/* List of all thread event counters. */ +#define ITERATE_OVER_ALL_COUNTERS \ + C(thread_allocated) \ + C(thread_allocated_next_event_fast) \ + C(thread_allocated_last_event) \ + C(thread_allocated_next_event) \ + ITERATE_OVER_ALL_EVENTS + +/* Getters directly wrap TSD getters. */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE uint64_t \ +counter##_get(tsd_t *tsd) { \ + return tsd_##counter##_get(tsd); \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * Setters call the TSD pointer getters rather than the TSD setters, so that + * the counters can be modified even when TSD state is reincarnated or + * minimal_initialized: if an event is triggered in such cases, we will + * temporarily delay the event and let it be immediately triggered at the next + * allocation call. + */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE void \ +counter##_set(tsd_t *tsd, uint64_t v) { \ + *tsd_##counter##p_get(tsd) = v; \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * For generating _event_wait getter / setter functions for each individual + * event. + */ +#undef E + +/* + * The function checks in debug mode whether the thread event counters are in + * a consistent state, which forms the invariants before and after each round + * of thread event handling that we can rely on and need to promise. + * The invariants are only temporarily violated in the middle of: + * (a) thread_event() if an event is triggered (the thread_event_trigger() call + * at the end will restore the invariants), + * (b) thread_##event##_event_update() (the thread_event_update() call at the + * end will restore the invariants), or + * (c) thread_event_rollback() if the rollback falls below the last_event (the + * thread_event_update() call at the end will restore the invariants). + */ +JEMALLOC_ALWAYS_INLINE void +thread_event_assert_invariants(tsd_t *tsd) { + if (config_debug) { + thread_event_assert_invariants_debug(tsd); + } +} + +JEMALLOC_ALWAYS_INLINE void +thread_event(tsd_t *tsd, size_t usize) { + thread_event_assert_invariants(tsd); + + uint64_t thread_allocated_before = thread_allocated_get(tsd); + thread_allocated_set(tsd, thread_allocated_before + usize); + + /* The subtraction is intentionally susceptible to underflow. 
*/ + if (likely(usize < thread_allocated_next_event_get(tsd) - + thread_allocated_before)) { + thread_event_assert_invariants(tsd); + } else { + thread_event_trigger(tsd, false); + } +} + +#define E(event, condition) \ +JEMALLOC_ALWAYS_INLINE void \ +thread_##event##_event_update(tsd_t *tsd, uint64_t event_wait) { \ + thread_event_assert_invariants(tsd); \ + assert(condition); \ + assert(tsd_nominal(tsd)); \ + assert(tsd_reentrancy_level_get(tsd) == 0); \ + assert(event_wait > 0U); \ + if (THREAD_EVENT_MIN_START_WAIT > 1U && \ + unlikely(event_wait < THREAD_EVENT_MIN_START_WAIT)) { \ + event_wait = THREAD_EVENT_MIN_START_WAIT; \ + } \ + if (THREAD_EVENT_MAX_START_WAIT < UINT64_MAX && \ + unlikely(event_wait > THREAD_EVENT_MAX_START_WAIT)) { \ + event_wait = THREAD_EVENT_MAX_START_WAIT; \ + } \ + event##_event_wait_set(tsd, event_wait); \ + thread_event_update(tsd); \ +} + +ITERATE_OVER_ALL_EVENTS +#undef E + +#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_H */ diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index e2cc7747..14ad53d7 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -15,39 +15,45 @@ /* * Thread-Specific-Data layout - * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof --- + * --- data accessed on tcache fast path: state, rtree_ctx, stats --- * s: state * e: tcache_enabled * m: thread_allocated + * k: thread_allocated_next_event_fast * f: thread_deallocated - * b: bytes_until_sample (config_prof) - * p: prof_tdata (config_prof) * c: rtree_ctx (rtree cache accessed on deallocation) * t: tcache * --- data not accessed on tcache fast path: arena-related fields --- * d: arenas_tdata_bypass * r: reentrancy_level * x: narenas_tdata + * l: thread_allocated_last_event + * j: thread_allocated_next_event + * w: prof_sample_event_wait (config_prof) + * p: prof_tdata (config_prof) * v: offset_state * i: iarena * a: arena * o: arenas_tdata + * b: binshards * Loading TSD data is on the critical path of basically all malloc operations. * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective. * Use a compact layout to reduce cache footprint. * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+ * |---------------------------- 1st cacheline ----------------------------| - * | sedrxxxx vvvvvvvv mmmmmmmm ffffffff bbbbbbbb pppppppp [c * 16 .......] | + * | sedrxxxx mmmmmmmm kkkkkkkk ffffffff [c * 32 ........ ........ .......] | * |---------------------------- 2nd cacheline ----------------------------| * | [c * 64 ........ ........ ........ ........ ........ ........ .......] | * |---------------------------- 3nd cacheline ----------------------------| - * | [c * 48 ........ ........ ........ ........ .......] iiiiiiii aaaaaaaa | + * | [c * 32 ........ ........ .......] llllllll jjjjjjjj wwwwwwww pppppppp | * +---------------------------- 4th cacheline ----------------------------+ - * | oooooooo [t...... ........ ........ ........ ........ ........ ........ | + * | vvvvvvvv iiiiiiii aaaaaaaa oooooooo [b...... ........ ........ ........ | + * +---------------------------- 5th cacheline ----------------------------+ + * | ..b][t.. ........ ........ ........ ........ ........ ........ ........ | * +-------------------------------------------------------------------------+ * Note: the entire tcache is embedded into TSD and spans multiple cachelines. 
* - * The last 3 members (i, a and o) before tcache isn't really needed on tcache + * The elements after rtree_ctx and before tcache aren't really needed on tcache * fast path. However we have a number of unused tcache bins and witnesses * (never touched unless config_debug) at the end of tcache, so we place them * there to avoid breaking the cachelines and possibly paging in an extra page. @@ -64,18 +70,21 @@ typedef void (*test_callback_t)(int *); # define MALLOC_TEST_TSD_INITIALIZER #endif -/* O(name, type, nullable type */ +/* O(name, type, nullable type) */ #define MALLOC_TSD \ O(tcache_enabled, bool, bool) \ O(arenas_tdata_bypass, bool, bool) \ O(reentrancy_level, int8_t, int8_t) \ O(narenas_tdata, uint32_t, uint32_t) \ - O(offset_state, uint64_t, uint64_t) \ O(thread_allocated, uint64_t, uint64_t) \ + O(thread_allocated_next_event_fast, uint64_t, uint64_t) \ O(thread_deallocated, uint64_t, uint64_t) \ - O(bytes_until_sample, int64_t, int64_t) \ - O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \ + O(thread_allocated_last_event, uint64_t, uint64_t) \ + O(thread_allocated_next_event, uint64_t, uint64_t) \ + O(prof_sample_event_wait, uint64_t, uint64_t) \ + O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ + O(offset_state, uint64_t, uint64_t) \ O(iarena, arena_t *, arena_t *) \ O(arena, arena_t *, arena_t *) \ O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ @@ -84,25 +93,34 @@ typedef void (*test_callback_t)(int *); O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ MALLOC_TEST_TSD +/* + * THREAD_EVENT_MIN_START_WAIT should not exceed the minimal allocation usize. + */ +#define THREAD_EVENT_MIN_START_WAIT ((uint64_t)1U) +#define THREAD_EVENT_MAX_START_WAIT UINT64_MAX + #define TSD_INITIALIZER { \ - ATOMIC_INIT(tsd_state_uninitialized), \ - TCACHE_ENABLED_ZERO_INITIALIZER, \ - false, \ - 0, \ - 0, \ - 0, \ - 0, \ - 0, \ - 0, \ - NULL, \ - RTREE_CTX_ZERO_INITIALIZER, \ - NULL, \ - NULL, \ - NULL, \ - TSD_BINSHARDS_ZERO_INITIALIZER, \ - TCACHE_ZERO_INITIALIZER, \ - WITNESS_TSD_INITIALIZER \ - MALLOC_TEST_TSD_INITIALIZER \ + /* state */ ATOMIC_INIT(tsd_state_uninitialized), \ + /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \ + /* arenas_tdata_bypass */ false, \ + /* reentrancy_level */ 0, \ + /* narenas_tdata */ 0, \ + /* thread_allocated */ 0, \ + /* thread_allocated_next_event_fast */ THREAD_EVENT_MIN_START_WAIT, \ + /* thread_deallocated */ 0, \ + /* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER, \ + /* thread_allocated_last_event */ 0, \ + /* thread_allocated_next_event */ THREAD_EVENT_MIN_START_WAIT, \ + /* prof_sample_event_wait */ THREAD_EVENT_MIN_START_WAIT, \ + /* prof_tdata */ NULL, \ + /* offset_state */ 0, \ + /* iarena */ NULL, \ + /* arena */ NULL, \ + /* arenas_tdata */ NULL, \ + /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \ + /* tcache */ TCACHE_ZERO_INITIALIZER, \ + /* witness */ WITNESS_TSD_INITIALIZER \ + /* test data */ MALLOC_TEST_TSD_INITIALIZER \ } void *malloc_tsd_malloc(size_t size); diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj index a9683384..5838e933 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj @@ -63,15 +63,16 @@ + + - {8D6BB292-9E1C-413D-9F98-4864BDC1514A} diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters index bc40883b..3551ba5e 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters +++ 
b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters @@ -16,6 +16,9 @@ Source Files + + Source Files + Source Files @@ -25,6 +28,9 @@ Source Files + + Source Files + Source Files @@ -46,6 +52,9 @@ Source Files + + Source Files + Source Files @@ -76,6 +85,9 @@ Source Files + + Source Files + Source Files @@ -91,6 +103,9 @@ Source Files + + Source Files + Source Files @@ -100,17 +115,5 @@ Source Files - - Source Files - - - Source Files - - - Source Files - - - Source Files - diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj index 72a57e56..b9d4f681 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj @@ -63,15 +63,16 @@ + + - {8D6BB292-9E1C-413D-9F98-4864BDC1514A} diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters index 622b93f1..3551ba5e 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters @@ -16,6 +16,9 @@ Source Files + + Source Files + Source Files @@ -25,6 +28,9 @@ Source Files + + Source Files + Source Files @@ -46,6 +52,9 @@ Source Files + + Source Files + Source Files @@ -76,6 +85,9 @@ Source Files + + Source Files + Source Files @@ -88,6 +100,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -97,20 +115,5 @@ Source Files - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - diff --git a/src/jemalloc.c b/src/jemalloc.c index 88064df4..63a1e302 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -18,6 +18,7 @@ #include "jemalloc/internal/spin.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" +#include "jemalloc/internal/thread_event.h" #include "jemalloc/internal/util.h" /******************************************************************************/ @@ -1530,6 +1531,7 @@ malloc_init_hard_a0_locked() { prof_boot0(); } malloc_conf_init(&sc_data, bin_shard_sizes); + thread_event_boot(); sz_boot(&sc_data); bin_info_boot(&sc_data, bin_shard_sizes); @@ -2128,6 +2130,8 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { dopts->arena_ind = 0; } + thread_event(tsd, usize); + /* * If dopts->alignment > 0, then ind is still 0, but usize was computed * in the previous if statement. Down the positive alignment path, @@ -2136,20 +2140,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { /* If profiling is on, get our profiling context. */ if (config_prof && opt_prof) { - /* - * The fast path modifies bytes_until_sample regardless of - * prof_active. We reset it to be the sample interval, so that - * there won't be excessive routings to the slow path, and that - * when prof_active is turned on later, the counting for - * sampling can immediately resume as normal (though the very - * first sampling interval is not randomized). 
- */ - if (unlikely(tsd_bytes_until_sample_get(tsd) < 0) && - !prof_active_get_unlocked()) { - tsd_bytes_until_sample_set(tsd, - (ssize_t)(1 << lg_prof_sample)); - } - prof_tctx_t *tctx = prof_alloc_prep( tsd, usize, prof_active_get_unlocked(), true); @@ -2167,24 +2157,17 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { } if (unlikely(allocation == NULL)) { + thread_event_rollback(tsd, usize); prof_alloc_rollback(tsd, tctx, true); goto label_oom; } prof_malloc(tsd_tsdn(tsd), allocation, usize, &alloc_ctx, tctx); } else { assert(!opt_prof); - /* - * The fast path modifies bytes_until_sample regardless of - * opt_prof. We reset it to a huge value here, so as to - * minimize the triggering for slow path. - */ - if (config_prof && - unlikely(tsd_bytes_until_sample_get(tsd) < 0)) { - tsd_bytes_until_sample_set(tsd, SSIZE_MAX); - } allocation = imalloc_no_sample(sopts, dopts, tsd, size, usize, ind); if (unlikely(allocation == NULL)) { + thread_event_rollback(tsd, usize); goto label_oom; } } @@ -2197,7 +2180,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { || ((uintptr_t)allocation & (dopts->alignment - 1)) == ZU(0)); assert(usize == isalloc(tsd_tsdn(tsd), allocation)); - *tsd_thread_allocatedp_get(tsd) += usize; if (sopts->slow) { UTRACE(0, size, allocation); @@ -2373,7 +2355,12 @@ je_malloc(size_t size) { } szind_t ind = sz_size2index_lookup(size); - /* usize is always needed to increment thread_allocated. */ + /* + * The thread_allocated counter in tsd serves as a general purpose + * accumulator for bytes of allocation to trigger different types of + * events. usize is always needed to advance thread_allocated, though + * it's not always needed in the core allocation logic. + */ size_t usize = sz_index2size(ind); /* * Fast path relies on size being a bin. @@ -2382,19 +2369,12 @@ je_malloc(size_t size) { assert(ind < SC_NBINS); assert(size <= SC_SMALL_MAXCLASS); - if (config_prof) { - int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd); - bytes_until_sample -= usize; - tsd_bytes_until_sample_set(tsd, bytes_until_sample); - - if (unlikely(bytes_until_sample < 0)) { - /* - * Avoid a prof_active check on the fastpath. - * If prof_active is false, bytes_until_sample will be - * reset in slow path. 
- */ - return malloc_default(size); - } + uint64_t thread_allocated_after = thread_allocated_get(tsd) + usize; + assert(thread_allocated_next_event_fast_get(tsd) <= + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + if (unlikely(thread_allocated_after >= + thread_allocated_next_event_fast_get(tsd))) { + return malloc_default(size); } cache_bin_t *bin = tcache_small_bin_get(tcache, ind); @@ -2402,7 +2382,7 @@ je_malloc(size_t size) { void *ret = cache_bin_alloc_easy_reduced(bin, &tcache_success); if (tcache_success) { - *tsd_thread_allocatedp_get(tsd) += usize; + thread_allocated_set(tsd, thread_allocated_after); if (config_stats) { bin->tstats.nrequests++; } @@ -3116,9 +3096,11 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { goto label_oom; } + thread_event(tsd, usize); p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, zero, tcache, arena, &alloc_ctx, &hook_args); if (unlikely(p == NULL)) { + thread_event_rollback(tsd, usize); goto label_oom; } } else { @@ -3128,10 +3110,10 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { goto label_oom; } usize = isalloc(tsd_tsdn(tsd), p); + thread_event(tsd, usize); } assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); - *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; UTRACE(ptr, size, p); @@ -3307,6 +3289,7 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, usize_max = SC_LARGE_MAXCLASS; } } + thread_event(tsd, usize_max); tctx = prof_alloc_prep(tsd, usize_max, prof_active, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { @@ -3316,6 +3299,18 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); } + if (usize <= usize_max) { + thread_event_rollback(tsd, usize_max - usize); + } else { + /* + * For downsizing request, usize_max can be less than usize. + * We here further increase thread event counters so as to + * record the true usize, and then when the execution goes back + * to xallocx(), the entire usize will be rolled back if it's + * equal to the old usize. 
+ */ + thread_event(tsd, usize - usize_max); + } if (usize == old_usize) { prof_alloc_rollback(tsd, tctx, false); return usize; @@ -3373,12 +3368,13 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { } else { usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); + thread_event(tsd, usize); } if (unlikely(usize == old_usize)) { + thread_event_rollback(tsd, usize); goto label_not_resized; } - *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; label_not_resized: diff --git a/src/prof.c b/src/prof.c index fc0c7d8a..7e219dc3 100644 --- a/src/prof.c +++ b/src/prof.c @@ -5,6 +5,7 @@ #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/assert.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/thread_event.h" /* * This file implements the profiling "APIs" needed by other parts of jemalloc, @@ -471,8 +472,11 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { return; } + tsd_t *tsd = tsd_fetch(); + if (lg_prof_sample == 0) { - tsd_bytes_until_sample_set(tsd_fetch(), 0); + thread_prof_sample_event_update(tsd, + THREAD_EVENT_MIN_START_WAIT); return; } @@ -480,11 +484,11 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { * Compute sample interval as a geometrically distributed random * variable with mean (2^lg_prof_sample). * - * __ __ - * | log(u) | 1 - * tdata->bytes_until_sample = | -------- |, where p = --------------- - * | log(1-p) | lg_prof_sample - * 2 + * __ __ + * | log(u) | 1 + * bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 * * For more information on the math, see: * @@ -499,10 +503,7 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { uint64_t bytes_until_sample = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; - if (bytes_until_sample > SSIZE_MAX) { - bytes_until_sample = SSIZE_MAX; - } - tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample); + thread_prof_sample_event_update(tsd, bytes_until_sample); #endif } diff --git a/src/thread_event.c b/src/thread_event.c new file mode 100644 index 00000000..c6542f46 --- /dev/null +++ b/src/thread_event.c @@ -0,0 +1,255 @@ +#define JEMALLOC_THREAD_EVENT_C_ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/thread_event.h" + +/* + * There's no lock for thread_event_active because write is only done in + * malloc_init(), where init_lock there serves as the guard, and ever since + * then thread_event_active becomes read only. + */ +static bool thread_event_active = false; + +/* Event handler function signatures. 
*/ +#define E(event, condition) \ +static void thread_##event##_event_handler(tsd_t *tsd); + +ITERATE_OVER_ALL_EVENTS +#undef E + +static uint64_t +thread_allocated_next_event_compute(tsd_t *tsd) { + uint64_t wait = THREAD_EVENT_MAX_START_WAIT; + bool no_event_on = true; + +#define E(event, condition) \ + if (condition) { \ + no_event_on = false; \ + uint64_t event_wait = \ + event##_event_wait_get(tsd); \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + if (event_wait > 0U && event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + assert(no_event_on == !thread_event_active); + assert(wait <= THREAD_EVENT_MAX_START_WAIT); + return wait; +} + +void +thread_event_assert_invariants_debug(tsd_t *tsd) { + uint64_t thread_allocated = thread_allocated_get(tsd); + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t next_event = thread_allocated_next_event_get(tsd); + uint64_t next_event_fast = thread_allocated_next_event_fast_get(tsd); + + assert(last_event != next_event); + if (next_event <= THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) { + assert(next_event_fast == next_event); + } else { + assert(next_event_fast == 0U); + } + + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t interval = next_event - last_event; + + /* The subtraction is intentionally susceptible to underflow. */ + assert(thread_allocated - last_event < interval); + + uint64_t min_wait = thread_allocated_next_event_compute(tsd); + + /* + * next_event should have been pushed up only except when no event is + * on and the TSD is just initialized. The last_event == 0U guard + * below is stronger than needed, but having an exactly accurate guard + * is more complicated to implement. + */ + assert((!thread_event_active && last_event == 0U) || + interval == min_wait || + (interval < min_wait && interval == THREAD_EVENT_MAX_INTERVAL)); +} + +static void +thread_event_adjust_thresholds_helper(tsd_t *tsd, uint64_t wait) { + assert(wait <= THREAD_EVENT_MAX_START_WAIT); + uint64_t next_event = thread_allocated_last_event_get(tsd) + (wait <= + THREAD_EVENT_MAX_INTERVAL ? wait : THREAD_EVENT_MAX_INTERVAL); + thread_allocated_next_event_set(tsd, next_event); + uint64_t next_event_fast = (next_event <= + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX) ? next_event : 0U; + thread_allocated_next_event_fast_set(tsd, next_event_fast); +} + +static void +thread_prof_sample_event_handler(tsd_t *tsd) { + assert(config_prof && opt_prof); + assert(prof_sample_event_wait_get(tsd) == 0U); + if (!prof_active_get_unlocked()) { + /* + * If prof_active is off, we reset prof_sample_event_wait to be + * the sample interval when it drops to 0, so that there won't + * be excessive routings to the slow path, and that when + * prof_active is turned on later, the counting for sampling + * can immediately resume as normal. 
+ */ + thread_prof_sample_event_update(tsd, + (uint64_t)(1 << lg_prof_sample)); + } +} + +static uint64_t +thread_event_trigger_batch_update(tsd_t *tsd, uint64_t accumbytes, + bool allow_event_trigger) { + uint64_t wait = THREAD_EVENT_MAX_START_WAIT; + +#define E(event, condition) \ + if (condition) { \ + uint64_t event_wait = event##_event_wait_get(tsd); \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + if (event_wait > accumbytes) { \ + event_wait -= accumbytes; \ + } else { \ + event_wait = 0U; \ + if (!allow_event_trigger) { \ + event_wait = \ + THREAD_EVENT_MIN_START_WAIT; \ + } \ + } \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + event##_event_wait_set(tsd, event_wait); \ + /* \ + * If there is a single event, then the remaining wait \ + * time may become zero, and we rely on either the \ + * event handler or a thread_event_update() call later \ + * to properly set next_event; if there are multiple \ + * events, then here we can get the minimum remaining \ + * wait time to the next already set event. \ + */ \ + if (event_wait > 0U && event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + assert(wait <= THREAD_EVENT_MAX_START_WAIT); + return wait; +} + +void +thread_event_trigger(tsd_t *tsd, bool delay_event) { + /* usize has already been added to thread_allocated. */ + uint64_t thread_allocated_after = thread_allocated_get(tsd); + + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t accumbytes = thread_allocated_after - + thread_allocated_last_event_get(tsd); + + /* Make sure that accumbytes cannot overflow uint64_t. */ + cassert(THREAD_EVENT_MAX_INTERVAL <= + UINT64_MAX - SC_LARGE_MAXCLASS + 1); + + thread_allocated_last_event_set(tsd, thread_allocated_after); + bool allow_event_trigger = !delay_event && tsd_nominal(tsd) && + tsd_reentrancy_level_get(tsd) == 0; + uint64_t wait = thread_event_trigger_batch_update(tsd, accumbytes, + allow_event_trigger); + thread_event_adjust_thresholds_helper(tsd, wait); + + thread_event_assert_invariants(tsd); + +#define E(event, condition) \ + if (condition && event##_event_wait_get(tsd) == 0U) { \ + assert(allow_event_trigger); \ + thread_##event##_event_handler(tsd); \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + thread_event_assert_invariants(tsd); +} + +void +thread_event_rollback(tsd_t *tsd, size_t diff) { + thread_event_assert_invariants(tsd); + + if (diff == 0U) { + return; + } + + uint64_t thread_allocated = thread_allocated_get(tsd); + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t thread_allocated_rollback = thread_allocated - diff; + thread_allocated_set(tsd, thread_allocated_rollback); + + uint64_t last_event = thread_allocated_last_event_get(tsd); + /* Both subtractions are intentionally susceptible to underflow. */ + if (thread_allocated_rollback - last_event <= + thread_allocated - last_event) { + thread_event_assert_invariants(tsd); + return; + } + + thread_allocated_last_event_set(tsd, thread_allocated_rollback); + + /* The subtraction is intentionally susceptible to underflow. 
*/ + uint64_t wait_diff = last_event - thread_allocated_rollback; + assert(wait_diff <= diff); + +#define E(event, condition) \ + if (condition) { \ + uint64_t event_wait = event##_event_wait_get(tsd); \ + assert(event_wait <= THREAD_EVENT_MAX_START_WAIT); \ + if (event_wait > 0U) { \ + if (wait_diff > \ + THREAD_EVENT_MAX_START_WAIT - event_wait) { \ + event_wait = \ + THREAD_EVENT_MAX_START_WAIT; \ + } else { \ + event_wait += wait_diff; \ + } \ + assert(event_wait <= \ + THREAD_EVENT_MAX_START_WAIT); \ + event##_event_wait_set(tsd, event_wait); \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + thread_event_update(tsd); +} + +void +thread_event_update(tsd_t *tsd) { + uint64_t wait = thread_allocated_next_event_compute(tsd); + thread_event_adjust_thresholds_helper(tsd, wait); + + uint64_t last_event = thread_allocated_last_event_get(tsd); + + /* Both subtractions are intentionally susceptible to underflow. */ + if (thread_allocated_get(tsd) - last_event >= + thread_allocated_next_event_get(tsd) - last_event) { + thread_event_trigger(tsd, true); + } else { + thread_event_assert_invariants(tsd); + } +} + +void thread_event_boot() { +#define E(event, condition) \ + if (condition) { \ + thread_event_active = true; \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E +} diff --git a/test/unit/thread_event.c b/test/unit/thread_event.c new file mode 100644 index 00000000..6817262b --- /dev/null +++ b/test/unit/thread_event.c @@ -0,0 +1,57 @@ +#include "test/jemalloc_test.h" + +TEST_BEGIN(test_next_event_fast_roll_back) { + tsd_t *tsd = tsd_fetch(); + thread_allocated_last_event_set(tsd, 0); + thread_allocated_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX - 8U); + thread_allocated_next_event_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + thread_allocated_next_event_fast_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + prof_sample_event_wait_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX); + void *p = malloc(16U); + assert_ptr_not_null(p, "malloc() failed"); + free(p); +} +TEST_END + +TEST_BEGIN(test_next_event_fast_resume) { + tsd_t *tsd = tsd_fetch(); + thread_allocated_last_event_set(tsd, 0); + thread_allocated_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 8U); + thread_allocated_next_event_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U); + thread_allocated_next_event_fast_set(tsd, 0); + prof_sample_event_wait_set(tsd, + THREAD_ALLOCATED_NEXT_EVENT_FAST_MAX + 16U); + void *p = malloc(SC_LOOKUP_MAXCLASS); + assert_ptr_not_null(p, "malloc() failed"); + free(p); +} +TEST_END + +TEST_BEGIN(test_event_rollback) { + tsd_t *tsd = tsd_fetch(); + const uint64_t diff = THREAD_EVENT_MAX_INTERVAL >> 2; + size_t count = 10; + uint64_t thread_allocated = thread_allocated_get(tsd); + while (count-- != 0) { + thread_event_rollback(tsd, diff); + uint64_t thread_allocated_after = thread_allocated_get(tsd); + assert_u64_eq(thread_allocated - thread_allocated_after, diff, + "thread event counters are not properly rolled back"); + thread_allocated = thread_allocated_after; + } +} +TEST_END + +int +main(void) { + return test( + test_next_event_fast_roll_back, + test_next_event_fast_resume, + test_event_rollback); +} diff --git a/test/unit/thread_event.sh b/test/unit/thread_event.sh new file mode 100644 index 00000000..8fcc7d8a --- /dev/null +++ b/test/unit/thread_event.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ "x${enable_prof}" = "x1" ] ; then + export MALLOC_CONF="prof:true,lg_prof_sample:0" +fi
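
A minimal standalone sketch of the accounting scheme that thread_event.{h,c} introduce, for readers following the patch: one per-thread byte counter is compared against a precomputed threshold on every allocation, and only when the threshold is crossed does the slow path subtract the accumulated bytes from each active event's wait time, fire the handlers whose wait reached zero, and recompute the threshold as last_event plus the minimum remaining wait (capped at the max interval). All names below (event_advance, event_trigger, sample_wait, the 64 KiB reset in the handler) are illustrative stand-ins rather than jemalloc identifiers, and the sketch models a single event with no rollback and no fast-path threshold; in the real code the extra thread_allocated_next_event_fast threshold is additionally clamped to UINT64_MAX - SC_LOOKUP_MAXCLASS + 1 so that the malloc fast path needs no overflow check.

/*
 * Toy model of the thread event accounting: a per-thread byte counter
 * ("allocated") is compared against a precomputed threshold
 * ("next_event"); only when the threshold is crossed does the slow path
 * run, charge the accumulated bytes against the event's wait time, fire
 * the handler when the wait reaches zero, and recompute the threshold.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EVENT_MAX_INTERVAL ((uint64_t)(4U << 20))	/* mirrors THREAD_EVENT_MAX_INTERVAL */

static uint64_t allocated;	/* total bytes allocated by this thread */
static uint64_t last_event;	/* value of "allocated" at the last trigger */
static uint64_t next_event;	/* value of "allocated" that triggers next */
static uint64_t sample_wait = 1;	/* stand-in event: bytes until it fires */

static void
event_update(void) {
	uint64_t wait = sample_wait < EVENT_MAX_INTERVAL ?
	    sample_wait : EVENT_MAX_INTERVAL;
	next_event = last_event + wait;
	/* Invariants in the spirit of thread_event_assert_invariants_debug(). */
	assert(next_event != last_event);
	assert(allocated - last_event < next_event - last_event);
}

static void
event_trigger(void) {
	uint64_t accumbytes = allocated - last_event;
	last_event = allocated;
	sample_wait = sample_wait > accumbytes ? sample_wait - accumbytes : 0;
	if (sample_wait == 0) {
		printf("sample event at %llu bytes\n",
		    (unsigned long long)allocated);
		sample_wait = 64 * 1024;	/* handler chooses the next wait */
	}
	event_update();
}

/* One comparison on every allocation; usize is the usable size. */
static void
event_advance(uint64_t usize) {
	uint64_t before = allocated;
	allocated += usize;
	/* Underflow-tolerant form of "allocated crossed next_event". */
	if (usize >= next_event - before) {
		event_trigger();
	}
}

int
main(void) {
	event_update();
	for (int i = 0; i < 64; i++) {
		event_advance(4096);
	}
	return 0;
}

Compiled as a plain C program, the loop leaves the single-comparison path only when the counter crosses the threshold (here roughly every 64 KiB), mirroring how thread_event() in the patch falls through to thread_event_trigger() only when thread_allocated crosses thread_allocated_next_event.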
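
The patch keeps the geometric sampling-interval draw in prof_sample_threshold_update() and only changes where the result lands (thread_prof_sample_event_update() instead of tsd_bytes_until_sample_set()). Below is a hedged, self-contained illustration of that draw; lg_prof_sample = 19 and libc rand() are assumptions made purely for the demo, standing in for the per-tdata PRNG state the real code uses.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void) {
	unsigned lg_prof_sample = 19;	/* mean interval 2^19 = 512 KiB */
	double p = 1.0 / (double)((uint64_t)1U << lg_prof_sample);
	double sum = 0.0;
	int n = 100000;
	for (int i = 0; i < n; i++) {
		/* u uniform in (0, 1). */
		double u = ((double)rand() + 1.0) / ((double)RAND_MAX + 2.0);
		/* bytes_until_sample = floor(log(u) / log(1 - p)) + 1, as in prof.c. */
		uint64_t wait = (uint64_t)(log(u) / log(1.0 - p)) + 1;
		sum += (double)wait;
	}
	printf("empirical mean: %.0f bytes (expected ~%llu)\n", sum / n,
	    (unsigned long long)((uint64_t)1U << lg_prof_sample));
	return 0;
}

Built with the math library (e.g. cc demo.c -lm), the empirical mean comes out close to 2^lg_prof_sample, which is why prof_sample_event_wait averages one sample event per 2^lg_prof_sample allocated bytes.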