From 441d88d1c78ecc38a7ffad3f88ea50513dabc0f8 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Mon, 9 Mar 2020 15:49:15 -0700 Subject: [PATCH] Rewrite profiling thread event --- include/jemalloc/internal/prof_externs.h | 2 +- include/jemalloc/internal/prof_inlines_b.h | 17 +++------ include/jemalloc/internal/thread_event.h | 7 ++++ src/jemalloc.c | 44 ++++++++-------------- src/prof.c | 17 +-------- src/thread_event.c | 12 +----- 6 files changed, 31 insertions(+), 68 deletions(-) diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h index 0b6fecd2..5a32754e 100644 --- a/include/jemalloc/internal/prof_externs.h +++ b/include/jemalloc/internal/prof_externs.h @@ -53,7 +53,7 @@ void prof_idump_rollback_impl(tsdn_t *tsdn, size_t usize); prof_tdata_t *prof_tdata_init(tsd_t *tsd); prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); -void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated); +void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx); void prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size, size_t usize, prof_tctx_t *tctx); void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info); diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 7e28d836..64983877 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -85,11 +85,11 @@ prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx) { } JEMALLOC_ALWAYS_INLINE bool -prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update) { +prof_sample_should_skip(tsd_t *tsd, size_t usize) { cassert(config_prof); /* Fastpath: no need to load tdata */ - if (likely(prof_sample_event_wait_get(tsd) > 0)) { + if (likely(!te_prof_sample_event_lookahead(tsd, usize))) { return true; } @@ -102,21 +102,16 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update) { return true; } - /* Compute new sample threshold. */ - if (update) { - prof_sample_threshold_update(tsd); - } return !tdata->active; } JEMALLOC_ALWAYS_INLINE prof_tctx_t * -prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active, bool update) { +prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active) { prof_tctx_t *ret; assert(usize == sz_s2u(usize)); - if (!prof_active || - likely(prof_sample_accum_update(tsd, usize, update))) { + if (!prof_active || likely(prof_sample_should_skip(tsd, usize))) { ret = (prof_tctx_t *)(uintptr_t)1U; } else { ret = prof_tctx_create(tsd); @@ -150,7 +145,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize, if (prof_active && ptr != NULL) { assert(usize == isalloc(tsd_tsdn(tsd), ptr)); - if (prof_sample_accum_update(tsd, usize, true)) { + if (prof_sample_should_skip(tsd, usize)) { /* * Don't sample. The usize passed to prof_alloc_prep() * was larger than what actually got allocated, so a @@ -158,7 +153,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize, * though its actual usize was insufficient to cross the * sample threshold. 
*/ - prof_alloc_rollback(tsd, tctx, true); + prof_alloc_rollback(tsd, tctx); tctx = (prof_tctx_t *)(uintptr_t)1U; } } diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h index d528c051..f9e2ba5c 100644 --- a/include/jemalloc/internal/thread_event.h +++ b/include/jemalloc/internal/thread_event.h @@ -218,6 +218,13 @@ te_ctx_get(tsd_t *tsd, te_ctx_t *ctx, bool is_alloc) { } } +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) { + return tsd_thread_allocated_get(tsd) + usize - + tsd_thread_allocated_last_event_get(tsd) >= + tsd_prof_sample_event_wait_get(tsd); +} + JEMALLOC_ALWAYS_INLINE void te_event_advance(tsd_t *tsd, size_t usize, bool is_alloc) { te_assert_invariants(tsd); diff --git a/src/jemalloc.c b/src/jemalloc.c index 758e3244..7a65db02 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2177,8 +2177,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { dopts->arena_ind = 0; } - thread_alloc_event(tsd, usize); - /* * If dopts->alignment > 0, then ind is still 0, but usize was computed * in the previous if statement. Down the positive alignment path, @@ -2187,8 +2185,8 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { /* If profiling is on, get our profiling context. */ if (config_prof && opt_prof) { - prof_tctx_t *tctx = prof_alloc_prep( - tsd, usize, prof_active_get_unlocked(), true); + bool prof_active = prof_active_get_unlocked(); + prof_tctx_t *tctx = prof_alloc_prep(tsd, usize, prof_active); emap_alloc_ctx_t alloc_ctx; if (likely((uintptr_t)tctx == (uintptr_t)1U)) { @@ -2204,8 +2202,7 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { } if (unlikely(allocation == NULL)) { - te_alloc_rollback(tsd, usize); - prof_alloc_rollback(tsd, tctx, true); + prof_alloc_rollback(tsd, tctx); goto label_oom; } prof_malloc(tsd, allocation, size, usize, &alloc_ctx, tctx); @@ -2214,7 +2211,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { allocation = imalloc_no_sample(sopts, dopts, tsd, size, usize, ind); if (unlikely(allocation == NULL)) { - te_alloc_rollback(tsd, usize); goto label_oom; } } @@ -2223,6 +2219,9 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { * Allocation has been done at this point. We still have some * post-allocation work to do though. */ + + thread_alloc_event(tsd, usize); + assert(dopts->alignment == 0 || ((uintptr_t)allocation & (dopts->alignment - 1)) == ZU(0)); @@ -3132,7 +3131,7 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, prof_info_t old_prof_info; prof_info_get_and_reset_recent(tsd, old_ptr, alloc_ctx, &old_prof_info); bool prof_active = prof_active_get_unlocked(); - prof_tctx_t *tctx = prof_alloc_prep(tsd, *usize, prof_active, false); + prof_tctx_t *tctx = prof_alloc_prep(tsd, *usize, prof_active); void *p; if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize, @@ -3142,7 +3141,7 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, zero, tcache, arena, hook_args); } if (unlikely(p == NULL)) { - prof_alloc_rollback(tsd, tctx, false); + prof_alloc_rollback(tsd, tctx); return NULL; } @@ -3155,8 +3154,10 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, * be the same as the current usize because of in-place large * reallocation. Therefore, query the actual value of usize. 
*/ + assert(*usize >= isalloc(tsd_tsdn(tsd), p)); *usize = isalloc(tsd_tsdn(tsd), p); } + prof_realloc(tsd, p, size, *usize, tctx, prof_active, old_ptr, old_usize, &old_prof_info); @@ -3214,11 +3215,9 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { goto label_oom; } - thread_alloc_event(tsd, usize); p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, zero, tcache, arena, &alloc_ctx, &hook_args); if (unlikely(p == NULL)) { - te_alloc_rollback(tsd, usize); goto label_oom; } } else { @@ -3228,9 +3227,9 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { goto label_oom; } usize = isalloc(tsd_tsdn(tsd), p); - thread_alloc_event(tsd, usize); } assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); + thread_alloc_event(tsd, usize); thread_dalloc_event(tsd, old_usize); UTRACE(ptr, size, p); @@ -3416,9 +3415,8 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, usize_max = SC_LARGE_MAXCLASS; } } - thread_alloc_event(tsd, usize_max); bool prof_active = prof_active_get_unlocked(); - prof_tctx_t *tctx = prof_alloc_prep(tsd, usize_max, prof_active, false); + prof_tctx_t *tctx = prof_alloc_prep(tsd, usize_max, prof_active); size_t usize; if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { @@ -3428,18 +3426,6 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); } - if (usize <= usize_max) { - te_alloc_rollback(tsd, usize_max - usize); - } else { - /* - * For downsizing request, usize_max can be less than usize. - * We here further increase thread event counters so as to - * record the true usize, and then when the execution goes back - * to xallocx(), the entire usize will be rolled back if it's - * equal to the old usize. - */ - thread_alloc_event(tsd, usize - usize_max); - } /* * At this point we can still safely get the original profiling @@ -3452,9 +3438,10 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, prof_info_t prof_info; if (usize == old_usize) { prof_info_get(tsd, ptr, alloc_ctx, &prof_info); - prof_alloc_rollback(tsd, tctx, false); + prof_alloc_rollback(tsd, tctx); } else { prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); + assert(usize <= usize_max); prof_realloc(tsd, ptr, size, usize, tctx, prof_active, ptr, old_usize, &prof_info); } @@ -3516,7 +3503,6 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { } else { usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); - thread_alloc_event(tsd, usize); } /* @@ -3527,9 +3513,9 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { == old_edata); if (unlikely(usize == old_usize)) { - te_alloc_rollback(tsd, usize); goto label_not_resized; } + thread_alloc_event(tsd, usize); thread_dalloc_event(tsd, old_usize); if (config_fill && malloc_slow) { diff --git a/src/prof.c b/src/prof.c index 82f88a21..73e6d914 100644 --- a/src/prof.c +++ b/src/prof.c @@ -118,7 +118,7 @@ prof_strncpy(char *UNUSED dest, const char *UNUSED src, size_t UNUSED size) { } void -prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated) { +prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx) { cassert(config_prof); if (tsd_reentrancy_level_get(tsd) > 0) { @@ -126,21 +126,6 @@ prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated) { return; } - prof_tdata_t *tdata; - - if (updated) { - /* - * Compute a new sample threshold. 
This isn't very important in - * practice, because this function is rarely executed, so the - * potential for sample bias is minimal except in contrived - * programs. - */ - tdata = prof_tdata_get(tsd, true); - if (tdata != NULL) { - prof_sample_threshold_update(tsd); - } - } - if ((uintptr_t)tctx > (uintptr_t)1U) { malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); tctx->prepared = false; diff --git a/src/thread_event.c b/src/thread_event.c index dadace38..75208f0e 100644 --- a/src/thread_event.c +++ b/src/thread_event.c @@ -78,17 +78,7 @@ te_prof_sample_event_handler(tsd_t *tsd) { if (prof_idump_accum(tsd_tsdn(tsd), last_event - last_sample_event)) { prof_idump(tsd_tsdn(tsd)); } - if (!prof_active_get_unlocked()) { - /* - * If prof_active is off, we reset prof_sample_event_wait to be - * the sample interval when it drops to 0, so that there won't - * be excessive routings to the slow path, and that when - * prof_active is turned on later, the counting for sampling - * can immediately resume as normal. - */ - te_prof_sample_event_update(tsd, - (uint64_t)(1 << lg_prof_sample)); - } + te_tsd_prof_sample_event_init(tsd); } static void
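
Below is a minimal standalone sketch (not jemalloc code; the struct and field names are hypothetical stand-ins for the corresponding tsd fields) of the lookahead check that the new te_prof_sample_event_lookahead() performs: would an allocation of usize bytes push the per-thread allocated-bytes counter across the pending prof-sample event threshold? This is the test prof_sample_should_skip() now uses on its fast path, and because it is a pure lookahead over the running byte counters, thread_alloc_event() can be deferred until after the allocation succeeds, which is what lets the patch drop the te_alloc_rollback() calls.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the three tsd fields the predicate reads. */
struct thread_event_state {
	uint64_t thread_allocated;            /* bytes allocated so far */
	uint64_t thread_allocated_last_event; /* counter value at the last event */
	uint64_t prof_sample_event_wait;      /* bytes to go until the next sample */
};

static bool
prof_sample_event_lookahead(const struct thread_event_state *st, size_t usize) {
	/*
	 * thread_allocated only ever grows, so with unsigned arithmetic the
	 * subtraction yields the bytes accumulated since the last event even
	 * if the counter has wrapped around.
	 */
	return st->thread_allocated + usize - st->thread_allocated_last_event >=
	    st->prof_sample_event_wait;
}

int
main(void) {
	struct thread_event_state st = {
		.thread_allocated = 1000,
		.thread_allocated_last_event = 900,
		.prof_sample_event_wait = 512,
	};
	/* 100 bytes already accumulated; 300 more stays below 512: skip. */
	printf("300 bytes -> sample? %d\n", prof_sample_event_lookahead(&st, 300));
	/* 100 + 500 = 600 >= 512: this allocation would be sampled. */
	printf("500 bytes -> sample? %d\n", prof_sample_event_lookahead(&st, 500));
	return 0;
}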
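
The reordering in imalloc_body()/do_rallocx() can be summarized with the following self-contained sketch (stub types and helpers only, not the jemalloc API): prepare the profiling context before allocating, advance the thread allocation event counter only once the allocation is known to have succeeded, and on OOM roll back just the profiling context. With that ordering there is nothing left for te_alloc_rollback() to undo.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the per-thread state jemalloc keeps in tsd. */
typedef struct {
	uint64_t thread_allocated; /* advanced only by the alloc event below */
	bool tctx_prepared;        /* stands in for a real prof_tctx_t */
} fake_tsd_t;

static void
fake_prof_alloc_prep(fake_tsd_t *tsd) {
	tsd->tctx_prepared = true;  /* decide whether to sample, reserve a tctx */
}

static void
fake_prof_alloc_rollback(fake_tsd_t *tsd) {
	tsd->tctx_prepared = false; /* undo the reservation on failure */
}

static void
fake_prof_malloc(fake_tsd_t *tsd) {
	tsd->tctx_prepared = false; /* the sample (if any) has been recorded */
}

static void
fake_thread_alloc_event(fake_tsd_t *tsd, size_t usize) {
	tsd->thread_allocated += usize; /* event accounting, post-allocation */
}

static void *
alloc_sketch(fake_tsd_t *tsd, size_t usize, bool simulate_oom) {
	fake_prof_alloc_prep(tsd);

	void *p = simulate_oom ? NULL : malloc(usize);
	if (p == NULL) {
		/* No event counter was advanced yet, so this is all the cleanup. */
		fake_prof_alloc_rollback(tsd);
		return NULL;
	}
	fake_prof_malloc(tsd);

	/* Account for the allocation only once it is known to have happened. */
	fake_thread_alloc_event(tsd, usize);
	return p;
}

int
main(void) {
	fake_tsd_t tsd = {0};

	void *p = alloc_sketch(&tsd, 64, /* simulate_oom */ false);
	printf("success: thread_allocated=%llu\n",
	    (unsigned long long)tsd.thread_allocated);
	free(p);

	(void)alloc_sketch(&tsd, 64, /* simulate_oom */ true);
	printf("oom:     thread_allocated=%llu (unchanged, nothing to roll back)\n",
	    (unsigned long long)tsd.thread_allocated);
	return 0;
}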