From a7619b7fa56f98d1ca99a23b458696dd37c12b77 Mon Sep 17 00:00:00 2001
From: Ben Maurer
Date: Tue, 15 Apr 2014 13:28:37 -0700
Subject: [PATCH 1/3] outline rare tcache_get codepaths

---
 include/jemalloc/internal/private_symbols.txt |  1 +
 include/jemalloc/internal/tcache.h            | 35 +---------------
 src/tcache.c                                  | 40 +++++++++++++++++++
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index f52d49f9..376f95d1 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -384,6 +384,7 @@ tcache_event
 tcache_event_hard
 tcache_flush
 tcache_get
+tcache_get_hard
 tcache_initialized
 tcache_maxclass
 tcache_salloc
diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h
index 51974136..96447f42 100644
--- a/include/jemalloc/internal/tcache.h
+++ b/include/jemalloc/internal/tcache.h
@@ -110,6 +110,7 @@ void	tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
     tcache_t *tcache);
 void	tcache_arena_associate(tcache_t *tcache, arena_t *arena);
 void	tcache_arena_dissociate(tcache_t *tcache);
+tcache_t *tcache_get_hard(tcache_t *tcache, bool create);
 tcache_t *tcache_create(arena_t *arena);
 void	tcache_destroy(tcache_t *tcache);
 void	tcache_thread_cleanup(void *arg);
@@ -220,39 +221,7 @@ tcache_get(bool create)
 	if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) {
 		if (tcache == TCACHE_STATE_DISABLED)
 			return (NULL);
-		if (tcache == NULL) {
-			if (create == false) {
-				/*
-				 * Creating a tcache here would cause
-				 * allocation as a side effect of free().
-				 * Ordinarily that would be okay since
-				 * tcache_create() failure is a soft failure
-				 * that doesn't propagate.  However, if TLS
-				 * data are freed via free() as in glibc,
-				 * subtle corruption could result from setting
-				 * a TLS variable after its backing memory is
-				 * freed.
-				 */
-				return (NULL);
-			}
-			if (tcache_enabled_get() == false) {
-				tcache_enabled_set(false); /* Memoize. */
-				return (NULL);
-			}
-			return (tcache_create(choose_arena(NULL)));
-		}
-		if (tcache == TCACHE_STATE_PURGATORY) {
-			/*
-			 * Make a note that an allocator function was called
-			 * after tcache_thread_cleanup() was called.
-			 */
-			tcache = TCACHE_STATE_REINCARNATED;
-			tcache_tsd_set(&tcache);
-			return (NULL);
-		}
-		if (tcache == TCACHE_STATE_REINCARNATED)
-			return (NULL);
-		not_reached();
+		tcache = tcache_get_hard(tcache, create);
 	}
 
 	return (tcache);
diff --git a/src/tcache.c b/src/tcache.c
index 6de92960..868f2d77 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -265,6 +265,46 @@ tcache_arena_dissociate(tcache_t *tcache)
 	}
 }
 
+tcache_t *
+tcache_get_hard(tcache_t *tcache, bool create)
+{
+
+	if (tcache == NULL) {
+		if (create == false) {
+			/*
+			 * Creating a tcache here would cause
+			 * allocation as a side effect of free().
+			 * Ordinarily that would be okay since
+			 * tcache_create() failure is a soft failure
+			 * that doesn't propagate.  However, if TLS
+			 * data are freed via free() as in glibc,
+			 * subtle corruption could result from setting
+			 * a TLS variable after its backing memory is
+			 * freed.
+			 */
+			return (NULL);
+		}
+		if (tcache_enabled_get() == false) {
+			tcache_enabled_set(false); /* Memoize. */
+			return (NULL);
+		}
+		return (tcache_create(choose_arena(NULL)));
+	}
+	if (tcache == TCACHE_STATE_PURGATORY) {
+		/*
+		 * Make a note that an allocator function was called
+		 * after tcache_thread_cleanup() was called.
+		 */
+		tcache = TCACHE_STATE_REINCARNATED;
+		tcache_tsd_set(&tcache);
+		return (NULL);
+	}
+	if (tcache == TCACHE_STATE_REINCARNATED)
+		return (NULL);
+	not_reached();
+	return (NULL);
+}
+
 tcache_t *
 tcache_create(arena_t *arena)
 {
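Why outline: tcache_get() is inlined into every allocation and deallocation fast path, so the rare tcache states (uninitialized, disabled, purgatory, reincarnated) previously added code to every call site. Moving them behind a real, non-inlined call keeps the hot path to a load, a compare, and a branch. Below is a minimal, self-contained sketch of the same fast-path/slow-path split; the names (cache_get, cache_get_hard, tls_cache) and the GCC-style attributes are illustrative stand-ins, not jemalloc's API.

#include <stdio.h>

struct cache { int id; };

/*
 * Outlined slow path: the rare initialization logic lives behind a real
 * call, so it adds no code at the inlined call sites.
 */
__attribute__((noinline)) static struct cache *
cache_get_hard(void)
{
	static struct cache global = { 42 };

	return (&global);	/* Stand-in for the real creation logic. */
}

/* Inlined fast path: one test and branch in the common case. */
static inline struct cache *
cache_get(void)
{
	static __thread struct cache *tls_cache;

	if (tls_cache != NULL)
		return (tls_cache);
	tls_cache = cache_get_hard();
	return (tls_cache);
}

int
main(void)
{

	printf("%d\n", cache_get()->id);	/* Prints 42. */
	return (0);
}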
From 6c39f9e059d0825f4c29d8cec9f318b798912c3c Mon Sep 17 00:00:00 2001
From: Ben Maurer
Date: Tue, 15 Apr 2014 13:47:13 -0700
Subject: [PATCH 2/3] refactor profiling. only use a bytes till next sample
 variable.

---
 include/jemalloc/internal/private_symbols.txt |   1 +
 include/jemalloc/internal/prof.h              | 222 ++++++------
 src/prof.c                                    |  65 ++++-
 3 files changed, 134 insertions(+), 154 deletions(-)

diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 376f95d1..032bed4f 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -299,6 +299,7 @@ prof_idump
 prof_interval
 prof_lookup
 prof_malloc
+prof_malloc_record_object
 prof_mdump
 prof_postfork_child
 prof_postfork_parent
diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h
index 56014f18..27be10c1 100644
--- a/include/jemalloc/internal/prof.h
+++ b/include/jemalloc/internal/prof.h
@@ -177,8 +177,7 @@ struct prof_tdata_s {
 
 	/* Sampling state. */
 	uint64_t	prng_state;
-	uint64_t	threshold;
-	uint64_t	accum;
+	uint64_t	bytes_until_sample;
 
 	/* State used to avoid dumping while operating on prof internals. */
 	bool		enq;
@@ -239,6 +238,7 @@ bool	prof_boot2(void);
 void	prof_prefork(void);
 void	prof_postfork_parent(void);
 void	prof_postfork_child(void);
+void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
@@ -250,49 +250,13 @@ void	prof_postfork_child(void);
 									\
 	assert(size == s2u(size));					\
 									\
-	prof_tdata = prof_tdata_get(true);				\
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
-		if (prof_tdata != NULL)					\
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
-		else							\
-			ret = NULL;					\
-		break;							\
-	}								\
-									\
-	if (opt_prof_active == false) {					\
-		/* Sampling is currently inactive, so avoid sampling. */\
+	if (!opt_prof_active ||						\
+	    prof_sample_accum_update(size, false, &prof_tdata)) {	\
 		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
-	} else if (opt_lg_prof_sample == 0) {				\
-		/* Don't bother with sampling logic, since sampling   */\
-		/* interval is 1.                                     */\
+	} else {							\
 		bt_init(&bt, prof_tdata->vec);				\
 		prof_backtrace(&bt, nignore);				\
 		ret = prof_lookup(&bt);					\
-	} else {							\
-		if (prof_tdata->threshold == 0) {			\
-			/* Initialize.  Seed the prng differently for */\
-			/* each thread.                               */\
-			prof_tdata->prng_state =			\
-			    (uint64_t)(uintptr_t)&size;			\
-			prof_sample_threshold_update(prof_tdata);	\
-		}							\
-									\
-		/* Determine whether to capture a backtrace based on */\
-		/* whether size is enough for prof_accum to reach    */\
-		/* prof_tdata->threshold.  However, delay updating   */\
-		/* these variables until prof_{m,re}alloc(), because */\
-		/* we don't know for sure that the allocation will   */\
-		/* succeed.                                           */\
-		/*                                                    */\
-		/* Use subtraction rather than addition to avoid     */\
-		/* potential integer overflow.                        */\
-		if (size >= prof_tdata->threshold -			\
-		    prof_tdata->accum) {				\
-			bt_init(&bt, prof_tdata->vec);			\
-			prof_backtrace(&bt, nignore);			\
-			ret = prof_lookup(&bt);				\
-		} else							\
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
 	}								\
 } while (0)
@@ -300,10 +264,13 @@ void	prof_postfork_child(void);
 malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)
 
 prof_tdata_t	*prof_tdata_get(bool create);
-void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
+bool	prof_sample_accum_update(size_t size, bool commit,
+    prof_tdata_t **prof_tdata_out);
 prof_ctx_t	*prof_ctx_get(const void *ptr);
 void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
-bool	prof_sample_accum_update(size_t size);
+void	prof_malloc_record_object(const void *ptr, size_t usize,
+    prof_thr_cnt_t *cnt);
 void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
 void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
     size_t old_usize, prof_ctx_t *old_ctx);
@@ -330,55 +297,6 @@ prof_tdata_get(bool create)
 	return (prof_tdata);
 }
 
-JEMALLOC_INLINE void
-prof_sample_threshold_update(prof_tdata_t *prof_tdata)
-{
-	/*
-	 * The body of this function is compiled out unless heap profiling is
-	 * enabled, so that it is possible to compile jemalloc with floating
-	 * point support completely disabled.  Avoiding floating point code is
-	 * important on memory-constrained systems, but it also enables a
-	 * workaround for versions of glibc that don't properly save/restore
-	 * floating point registers during dynamic lazy symbol loading (which
-	 * internally calls into whatever malloc implementation happens to be
-	 * integrated into the application).  Note that some compilers (e.g.
-	 * gcc 4.8) may use floating point registers for fast memory moves, so
-	 * jemalloc must be compiled with such optimizations disabled (e.g.
-	 * -mno-sse) in order for the workaround to be complete.
-	 */
-#ifdef JEMALLOC_PROF
-	uint64_t r;
-	double u;
-
-	cassert(config_prof);
-
-	/*
-	 * Compute sample threshold as a geometrically distributed random
-	 * variable with mean (2^opt_lg_prof_sample).
-	 *
-	 *                         __        __
-	 *                         |  log(u)  |                     1
-	 * prof_tdata->threshold = | -------- |, where p = -------------------
-	 *                         | log(1-p) |             opt_lg_prof_sample
-	 *                                                2
-	 *
-	 * For more information on the math, see:
-	 *
-	 *   Non-Uniform Random Variate Generation
-	 *   Luc Devroye
-	 *   Springer-Verlag, New York, 1986
-	 *   pp 500
-	 *   (http://luc.devroye.org/rnbookindex.html)
-	 */
-	prng64(r, 53, prof_tdata->prng_state,
-	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
-	u = (double)r * (1.0/9007199254740992.0L);
-	prof_tdata->threshold = (uint64_t)(log(u) /
-	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
-	    + (uint64_t)1U;
-#endif
-}
-
 JEMALLOC_INLINE prof_ctx_t *
 prof_ctx_get(const void *ptr)
 {
@@ -415,34 +333,58 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 }
 
 JEMALLOC_INLINE bool
-prof_sample_accum_update(size_t size)
+prof_sample_accum_update(size_t size, bool commit,
+    prof_tdata_t **prof_tdata_out)
 {
 	prof_tdata_t *prof_tdata;
 
 	cassert(config_prof);
-	/* Sampling logic is unnecessary if the interval is 1. */
-	assert(opt_lg_prof_sample != 0);
 
-	prof_tdata = prof_tdata_get(false);
+	prof_tdata = prof_tdata_get(true);
 	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+		prof_tdata = NULL;
+
+	if (prof_tdata_out != NULL)
+		*prof_tdata_out = prof_tdata;
+
+	if (prof_tdata == NULL)
 		return (true);
 
-	/* Take care to avoid integer overflow. */
-	if (size >= prof_tdata->threshold - prof_tdata->accum) {
-		prof_tdata->accum -= (prof_tdata->threshold - size);
-		/* Compute new sample threshold. */
-		prof_sample_threshold_update(prof_tdata);
-		while (prof_tdata->accum >= prof_tdata->threshold) {
-			prof_tdata->accum -= prof_tdata->threshold;
-			prof_sample_threshold_update(prof_tdata);
-		}
-		return (false);
-	} else {
-		prof_tdata->accum += size;
+	if (prof_tdata->bytes_until_sample >= size) {
+		if (commit)
+			prof_tdata->bytes_until_sample -= size;
 		return (true);
+	} else {
+		/* Compute new sample threshold. */
+		if (commit)
+			prof_sample_threshold_update(prof_tdata);
+		return (false);
 	}
 }
 
+JEMALLOC_INLINE void
+prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) {
+	prof_ctx_set(ptr, cnt->ctx);
+
+	cnt->epoch++;
+	/*********/
+	mb_write();
+	/*********/
+	cnt->cnts.curobjs++;
+	cnt->cnts.curbytes += usize;
+	if (opt_prof_accum) {
+		cnt->cnts.accumobjs++;
+		cnt->cnts.accumbytes += usize;
+	}
+	/*********/
+	mb_write();
+	/*********/
+	cnt->epoch++;
+	/*********/
+	mb_write();
+	/*********/
+}
+
 JEMALLOC_INLINE void
 prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
 {
@@ -451,40 +393,20 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
 	assert(ptr != NULL);
 	assert(usize == isalloc(ptr, true));
 
-	if (opt_lg_prof_sample != 0) {
-		if (prof_sample_accum_update(usize)) {
-			/*
-			 * Don't sample.  For malloc()-like allocation, it is
-			 * always possible to tell in advance how large an
-			 * object's usable size will be, so there should never
-			 * be a difference between the usize passed to
-			 * PROF_ALLOC_PREP() and prof_malloc().
-			 */
-			assert((uintptr_t)cnt == (uintptr_t)1U);
-		}
+	if (prof_sample_accum_update(usize, true, NULL)) {
+		/*
+		 * Don't sample.  For malloc()-like allocation, it is
+		 * always possible to tell in advance how large an
+		 * object's usable size will be, so there should never
+		 * be a difference between the usize passed to
+		 * PROF_ALLOC_PREP() and prof_malloc().
+		 */
+		assert((uintptr_t)cnt == (uintptr_t)1U);
 	}
 
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		prof_ctx_set(ptr, cnt->ctx);
-
-		cnt->epoch++;
-		/*********/
-		mb_write();
-		/*********/
-		cnt->cnts.curobjs++;
-		cnt->cnts.curbytes += usize;
-		if (opt_prof_accum) {
-			cnt->cnts.accumobjs++;
-			cnt->cnts.accumbytes += usize;
-		}
-		/*********/
-		mb_write();
-		/*********/
-		cnt->epoch++;
-		/*********/
-		mb_write();
-		/*********/
-	} else
+	if ((uintptr_t)cnt > (uintptr_t)1U)
+		prof_malloc_record_object(ptr, usize, cnt);
+	else
 		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
 }
 
@@ -499,18 +421,16 @@ prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
 
 	if (ptr != NULL) {
 		assert(usize == isalloc(ptr, true));
 
-		if (opt_lg_prof_sample != 0) {
-			if (prof_sample_accum_update(usize)) {
-				/*
-				 * Don't sample.  The usize passed to
-				 * PROF_ALLOC_PREP() was larger than what
-				 * actually got allocated, so a backtrace was
-				 * captured for this allocation, even though
-				 * its actual usize was insufficient to cross
-				 * the sample threshold.
-				 */
-				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
-			}
+		if (prof_sample_accum_update(usize, true, NULL)) {
+			/*
+			 * Don't sample.  The usize passed to
+			 * PROF_ALLOC_PREP() was larger than what
+			 * actually got allocated, so a backtrace was
+			 * captured for this allocation, even though
+			 * its actual usize was insufficient to cross
+			 * the sample threshold.
+			 */
+			cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 		}
 	}
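The epoch/mb_write() bracketing that prof_malloc_record_object() inherits from the old prof_malloc() body is a seqlock-style protocol: the writer makes cnt->epoch odd, updates the counters, then makes it even again, so a lock-free reader can detect a torn snapshot and retry. Below is a self-contained sketch of that protocol; the struct, the function names, and the __atomic_thread_fence() builtins are illustrative stand-ins for jemalloc's prof_thr_cnt_t and mb_write(), and the writer's plain increments assume a single writer per record, as in the original (each prof_thr_cnt_t belongs to one thread).

#include <stdint.h>

struct counters {
	uint64_t	epoch;		/* Odd while an update is in flight. */
	uint64_t	curobjs;
	uint64_t	curbytes;
};

/* Writer: bracket the update with epoch increments and release fences. */
static void
counters_update(struct counters *c, uint64_t usize)
{

	c->epoch++;					/* Now odd. */
	__atomic_thread_fence(__ATOMIC_RELEASE);
	c->curobjs++;
	c->curbytes += usize;
	__atomic_thread_fence(__ATOMIC_RELEASE);
	c->epoch++;					/* Even again. */
}

/* Reader: retry until both epoch reads agree and are even. */
static void
counters_read(const struct counters *c, uint64_t *objs, uint64_t *bytes)
{
	uint64_t start, end;

	do {
		start = c->epoch;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		*objs = c->curobjs;
		*bytes = c->curbytes;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		end = c->epoch;
	} while (start != end || (start & 1) != 0);
}

int
main(void)
{
	struct counters c = { 0, 0, 0 };
	uint64_t objs, bytes;

	counters_update(&c, 4096);
	counters_read(&c, &objs, &bytes);
	return ((objs == 1 && bytes == 4096) ? 0 : 1);
}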
+ */ + cnt = (prof_thr_cnt_t *)(uintptr_t)1U; } } diff --git a/src/prof.c b/src/prof.c index 1b1f7a84..82c7f703 100644 --- a/src/prof.c +++ b/src/prof.c @@ -645,6 +645,66 @@ prof_lookup(prof_bt_t *bt) return (ret.p); } + +void +prof_sample_threshold_update(prof_tdata_t *prof_tdata) +{ + /* + * The body of this function is compiled out unless heap profiling is + * enabled, so that it is possible to compile jemalloc with floating + * point support completely disabled. Avoiding floating point code is + * important on memory-constrained systems, but it also enables a + * workaround for versions of glibc that don't properly save/restore + * floating point registers during dynamic lazy symbol loading (which + * internally calls into whatever malloc implementation happens to be + * integrated into the application). Note that some compilers (e.g. + * gcc 4.8) may use floating point registers for fast memory moves, so + * jemalloc must be compiled with such optimizations disabled (e.g. + * -mno-sse) in order for the workaround to be complete. + */ +#ifdef JEMALLOC_PROF + uint64_t r; + double u; + + if (!config_prof) + return; + + if (prof_tdata == NULL) + prof_tdata = prof_tdata_get(false); + + if (opt_lg_prof_sample == 0) { + prof_tdata->bytes_until_sample = 0; + return; + } + + /* + * Compute sample threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + * + * __ __ + * | log(u) | 1 + * prof_tdata->threshold = | -------- |, where p = ------------------- + * | log(1-p) | opt_lg_prof_sample + * 2 + * + * For more information on the math, see: + * + * Non-Uniform Random Variate Generation + * Luc Devroye + * Springer-Verlag, New York, 1986 + * pp 500 + * (http://luc.devroye.org/rnbookindex.html) + */ + prng64(r, 53, prof_tdata->prng_state, + UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); + u = (double)r * (1.0/9007199254740992.0L); + prof_tdata->bytes_until_sample = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +#endif +} + + #ifdef JEMALLOC_JET size_t prof_bt_count(void) @@ -1224,9 +1284,8 @@ prof_tdata_init(void) return (NULL); } - prof_tdata->prng_state = 0; - prof_tdata->threshold = 0; - prof_tdata->accum = 0; + prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata; + prof_sample_threshold_update(prof_tdata); prof_tdata->enq = false; prof_tdata->enq_idump = false; From 021136ce4db79f50031a1fd5dd751891888fbc7b Mon Sep 17 00:00:00 2001 From: Ben Maurer Date: Wed, 16 Apr 2014 14:31:24 -0700 Subject: [PATCH 3/3] Create a const array with only a small bin to size map --- include/jemalloc/internal/arena.h | 3 ++- include/jemalloc/internal/jemalloc_internal.h.in | 4 ++-- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/tcache.h | 6 +++--- src/arena.c | 10 +++++++++- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 0e14c2c4..b435d0b0 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -385,6 +385,7 @@ extern ssize_t opt_lg_dirty_mult; * and all accesses are via the SMALL_SIZE2BIN macro. 
From 021136ce4db79f50031a1fd5dd751891888fbc7b Mon Sep 17 00:00:00 2001
From: Ben Maurer
Date: Wed, 16 Apr 2014 14:31:24 -0700
Subject: [PATCH 3/3] Create a const array with only a small bin to size map

---
 include/jemalloc/internal/arena.h                |  3 ++-
 include/jemalloc/internal/jemalloc_internal.h.in |  4 ++--
 include/jemalloc/internal/private_symbols.txt    |  1 +
 include/jemalloc/internal/tcache.h               |  6 +++---
 src/arena.c                                      | 10 +++++++++-
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index 0e14c2c4..b435d0b0 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -385,6 +385,7 @@ extern ssize_t	opt_lg_dirty_mult;
  * and all accesses are via the SMALL_SIZE2BIN macro.
  */
 extern uint8_t const	small_size2bin[];
+extern uint32_t const	small_bin2size[];
 #define	SMALL_SIZE2BIN(s)	(small_size2bin[(s-1) >> LG_TINY_MIN])
 
 extern arena_bin_info_t	arena_bin_info[NBINS];
@@ -964,7 +965,7 @@ arena_salloc(const void *ptr, bool demote)
 		assert(arena_mapbits_large_get(chunk, pageind) != 0 ||
 		    arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk,
 		    pageind)) == binind);
-		ret = arena_bin_info[binind].reg_size;
+		ret = small_bin2size[binind];
 	}
 
 	return (ret);
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 9c79ae00..17d77623 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -602,7 +602,7 @@ s2u(size_t size)
 {
 
 	if (size <= SMALL_MAXCLASS)
-		return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size);
+		return (small_bin2size[SMALL_SIZE2BIN(size)]);
 	if (size <= arena_maxclass)
 		return (PAGE_CEILING(size));
 	return (CHUNK_CEILING(size));
@@ -645,7 +645,7 @@ sa2u(size_t size, size_t alignment)
 
 	if (usize <= arena_maxclass && alignment <= PAGE) {
 		if (usize <= SMALL_MAXCLASS)
-			return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size);
+			return (small_bin2size[SMALL_SIZE2BIN(usize)]);
 		return (PAGE_CEILING(usize));
 	} else {
 		size_t run_size;
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 032bed4f..12d64dc4 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -346,6 +346,7 @@ rtree_set
 s2u
 sa2u
 set_errno
+small_bin2size
 small_size2bin
 stats_cactive
 stats_cactive_add
diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h
index 96447f42..098d19ae 100644
--- a/include/jemalloc/internal/tcache.h
+++ b/include/jemalloc/internal/tcache.h
@@ -266,14 +266,14 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 	binind = SMALL_SIZE2BIN(size);
 	assert(binind < NBINS);
 	tbin = &tcache->tbins[binind];
-	size = arena_bin_info[binind].reg_size;
+	size = small_bin2size[binind];
 	ret = tcache_alloc_easy(tbin);
 	if (ret == NULL) {
 		ret = tcache_alloc_small_hard(tcache, tbin, binind);
 		if (ret == NULL)
 			return (NULL);
 	}
-	assert(tcache_salloc(ret) == arena_bin_info[binind].reg_size);
+	assert(tcache_salloc(ret) == size);
 
 	if (zero == false) {
 		if (config_fill) {
@@ -296,7 +296,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 	if (config_stats)
 		tbin->tstats.nrequests++;
 	if (config_prof)
-		tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
+		tcache->prof_accumbytes += size;
 	tcache_event(tcache);
 	return (ret);
 }
diff --git a/src/arena.c b/src/arena.c
index d5741000..37487ffa 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -7,6 +7,14 @@ ssize_t		opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT;
 
 arena_bin_info_t	arena_bin_info[NBINS];
 
+JEMALLOC_ALIGNED(CACHELINE)
+const uint32_t	small_bin2size[NBINS] = {
+#define	SIZE_CLASS(bin, delta, size)					\
+	size,
+	SIZE_CLASSES
+#undef SIZE_CLASS
+};
+
 JEMALLOC_ALIGNED(CACHELINE)
 const uint8_t	small_size2bin[] = {
 #define	S2B_8(i)	i,
@@ -1615,7 +1623,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
 	binind = SMALL_SIZE2BIN(size);
 	assert(binind < NBINS);
 	bin = &arena->bins[binind];
-	size = arena_bin_info[binind].reg_size;
+	size = small_bin2size[binind];
 	malloc_mutex_lock(&bin->lock);
 	if ((run = bin->runcur) != NULL && run->nfree > 0)
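The point of the new table: arena_bin_info[] is a large array of structs initialized at runtime, so arena_bin_info[binind].reg_size is a strided load from writable memory. small_bin2size[] packs the same sizes into a dense, cache-line-aligned const array (sixteen uint32_t entries per 64-byte line) generated at compile time from the SIZE_CLASSES X-macro. Below is a sketch of that generation pattern with a made-up size-class list; jemalloc's real SIZE_CLASSES is platform-generated, so the entries here are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical size classes: SIZE_CLASS(bin index, delta, size). */
#define	SIZE_CLASSES							\
	SIZE_CLASS(0, 8, 8)						\
	SIZE_CLASS(1, 8, 16)						\
	SIZE_CLASS(2, 16, 32)						\
	SIZE_CLASS(3, 16, 48)

/* Expand the list, keeping only the size of each class. */
static const uint32_t	small_bin2size[] = {
#define	SIZE_CLASS(bin, delta, size)	size,
	SIZE_CLASSES
#undef SIZE_CLASS
};

int
main(void)
{
	unsigned i;

	for (i = 0; i < sizeof(small_bin2size) / sizeof(uint32_t); i++)
		printf("bin %u -> %u bytes\n", i, small_bin2size[i]);
	return (0);
}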