From 4d6a134e13e1e9705faff6f5dc6ca90d416aa0a4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 20 Oct 2010 19:05:59 -0700 Subject: [PATCH] Inline the fast path for heap sampling. Inline the heap sampling code that is executed for every allocation event (regardless of whether a sample is taken). Combine all prof TLS data into a single data structure, in order to reduce the TLS lookup volume. --- .../jemalloc/internal/jemalloc_internal.h.in | 10 +- jemalloc/include/jemalloc/internal/prof.h | 388 +++++++++++- jemalloc/src/prof.c | 557 +++--------------- 3 files changed, 449 insertions(+), 506 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index eb609624..99746bbd 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -27,6 +27,7 @@ #include #include #include +#include #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" @@ -210,10 +211,10 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif +#include "jemalloc/internal/prof.h" #undef JEMALLOC_H_TYPES /******************************************************************************/ @@ -233,10 +234,10 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif +#include "jemalloc/internal/prof.h" #undef JEMALLOC_H_STRUCTS /******************************************************************************/ @@ -355,10 +356,10 @@ void jemalloc_postfork(void); #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif +#include "jemalloc/internal/prof.h" #undef JEMALLOC_H_EXTERNS /******************************************************************************/ @@ -547,7 +548,6 @@ choose_arena(void) #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif @@ -729,5 +729,7 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, } #endif +#include "jemalloc/internal/prof.h" + #undef JEMALLOC_H_INLINES /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 0d139da7..a6bd797f 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -6,7 +6,7 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; -typedef struct prof_tcache_s prof_tcache_t; +typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ #define LG_PROF_BT_MAX_DEFAULT 7 @@ -38,8 +38,8 @@ typedef struct prof_tcache_s prof_tcache_t; struct prof_bt_s { /* Backtrace, stored as len program counters. 
*/ - void **vec; - unsigned len; + void **vec; + unsigned len; }; #ifdef JEMALLOC_PROF_LIBGCC @@ -124,22 +124,29 @@ struct prof_ctx_s { ql_head(prof_thr_cnt_t) cnts_ql; }; -/* - * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread - * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. Other threads may read the prof_thr_cnt_t contents, but no others - * will ever write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data - * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t - * objects. - */ -struct prof_tcache_s { - /* (prof_bt_t *)-->(prof_thr_cnt_t *). */ +struct prof_tdata_s { + /* + * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a + * cache of backtraces, with associated thread-specific prof_thr_cnt_t + * objects. Other threads may read the prof_thr_cnt_t contents, but no + * others will ever write them. + * + * Upon thread exit, the thread must merge all the prof_thr_cnt_t + * counter data into the associated prof_ctx_t objects, and unlink/free + * the prof_thr_cnt_t objects. + */ ckh_t bt2cnt; /* LRU for contents of bt2cnt. */ ql_head(prof_thr_cnt_t) lru_ql; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void **vec; + + /* Sampling state. */ + uint64_t prn_state; + uint64_t threshold; + uint64_t accum; }; #endif /* JEMALLOC_H_STRUCTS */ @@ -177,15 +184,39 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -prof_thr_cnt_t *prof_alloc_prep(size_t size); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, - const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx); -void prof_free(const void *ptr); +/* (1U << opt_lg_prof_bt_max). */ +extern unsigned prof_bt_max; + +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +#ifndef NO_TLS +extern __thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_TCACHE_GET() prof_tdata_tls +# define PROF_TCACHE_SET(v) do { \ + prof_tdata_tls = (v); \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#else +# define PROF_TCACHE_GET() \ + ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd)) +# define PROF_TCACHE_SET(v) do { \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#endif +/* + * Same contents as b2cnt_tls, but initialized such that the TSD destructor is + * called when a thread exits, so that prof_tdata_tls contents can be merged, + * unlinked, and deallocated. 
+ */ +extern pthread_key_t prof_tdata_tsd; + +void bt_init(prof_bt_t *bt, void **vec); +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); +prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); void prof_idump(void); bool prof_mdump(const char *filename); void prof_udump(void); +prof_tdata_t *prof_tdata_init(void); void prof_boot0(void); bool prof_boot1(void); @@ -193,6 +224,321 @@ bool prof_boot1(void); /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +prof_thr_cnt_t *prof_alloc_prep(size_t size); +prof_ctx_t *prof_ctx_get(const void *ptr); +void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +bool prof_sample_accum_update(size_t size); +void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); +void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx); +void prof_free(const void *ptr); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +JEMALLOC_INLINE void +prof_sample_threshold_update(prof_tdata_t *prof_tdata) +{ + uint64_t r; + double u; + + /* + * Compute prof_sample_threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + */ + prn64(r, 53, prof_tdata->prn_state, + (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); + u = (double)r * (1.0/9007199254740992.0L); + prof_tdata->threshold = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +} + +JEMALLOC_INLINE prof_thr_cnt_t * +prof_alloc_prep(size_t size) +{ +#ifdef JEMALLOC_ENABLE_INLINE + /* This function does not have its own stack frame, because it is inlined. */ +# define NIGNORE 1 +#else +# define NIGNORE 2 +#endif + prof_thr_cnt_t *ret; + prof_tdata_t *prof_tdata; + prof_bt_t bt; + + assert(size == s2u(size)); + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) + return (NULL); + } + + if (opt_prof_active == false) { + /* Sampling is currently inactive, so avoid sampling. */ + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } else if (opt_lg_prof_sample == 0) { + /* + * Don't bother with sampling logic, since sampling interval is + * 1. + */ + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else { + if (prof_tdata->threshold == 0) { + /* + * Initialize. Seed the prng differently for each + * thread. + */ + prof_tdata->prn_state = (uint64_t)(uintptr_t)&size; + prof_sample_threshold_update(prof_tdata); + } + + /* + * Determine whether to capture a backtrace based on whether + * size is enough for prof_accum to reach + * prof_tdata->threshold. However, delay updating these + * variables until prof_{m,re}alloc(), because we don't know + * for sure that the allocation will succeed. + * + * Use subtraction rather than addition to avoid potential + * integer overflow. + */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } + + return (ret); +#undef NIGNORE +} + +JEMALLOC_INLINE prof_ctx_t * +prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. 
*/ + assert(chunk->arena->magic == ARENA_MAGIC); + + ret = arena_prof_ctx_get(ptr); + } else + ret = huge_prof_ctx_get(ptr); + + return (ret); +} + +JEMALLOC_INLINE void +prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. */ + assert(chunk->arena->magic == ARENA_MAGIC); + + arena_prof_ctx_set(ptr, ctx); + } else + huge_prof_ctx_set(ptr, ctx); +} + +JEMALLOC_INLINE bool +prof_sample_accum_update(size_t size) +{ + prof_tdata_t *prof_tdata; + + /* Sampling logic is unnecessary if the interval is 1. */ + assert(opt_lg_prof_sample != 0); + + prof_tdata = PROF_TCACHE_GET(); + assert(prof_tdata != NULL); + + /* Take care to avoid integer overflow. */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + prof_tdata->accum -= (prof_tdata->threshold - size); + /* Compute new prof_sample_threshold. */ + prof_sample_threshold_update(prof_tdata); + while (prof_tdata->accum >= prof_tdata->threshold) { + prof_tdata->accum -= prof_tdata->threshold; + prof_sample_threshold_update(prof_tdata); + } + return (false); + } else { + prof_tdata->accum += size; + return (true); + } +} + +JEMALLOC_INLINE void +prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) +{ + + assert(ptr != NULL); + assert(size == s2u(size)); + + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. For malloc()-like allocation, it is + * always possible to tell in advance how large an + * object's usable size will be, so there should never + * be a difference between the size passed to + * prof_alloc_prep() and prof_malloc(). + */ + assert(false); + return; + } + } + + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); +} + +JEMALLOC_INLINE void +prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) +{ + prof_thr_cnt_t *told_cnt; + + assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + + if (ptr != NULL) { + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. The size passed to + * prof_alloc_prep() was larger than what + * actually got allocated., so a backtrace was + * captured for this allocation, even though + * its actual size was insufficient to cross + * the sample threshold. + */ + return; + } + } + } + + if ((uintptr_t)old_ctx > (uintptr_t)1U) { + told_cnt = prof_lookup(old_ctx->bt); + if (told_cnt == NULL) { + /* + * It's too late to propagate OOM for this realloc(), + * so operate directly on old_cnt->ctx->cnt_merged. 
+ */ + malloc_mutex_lock(&old_ctx->lock); + old_ctx->cnt_merged.curobjs--; + old_ctx->cnt_merged.curbytes -= old_size; + malloc_mutex_unlock(&old_ctx->lock); + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + } + } else + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + cnt->epoch++; + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) { + told_cnt->cnts.curobjs--; + told_cnt->cnts.curbytes -= old_size; + } + if ((uintptr_t)cnt > (uintptr_t)1U) { + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + } + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) + cnt->epoch++; + /*********/ + mb_write(); /* Not strictly necessary. */ +} + +JEMALLOC_INLINE void +prof_free(const void *ptr) +{ + prof_ctx_t *ctx = prof_ctx_get(ptr); + + if ((uintptr_t)ctx > (uintptr_t)1) { + size_t size = isalloc(ptr); + prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); + + if (tcnt != NULL) { + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + tcnt->cnts.curobjs--; + tcnt->cnts.curbytes -= size; + /*********/ + mb_write(); + /*********/ + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else { + /* + * OOM during free() cannot be propagated, so operate + * directly on cnt->ctx->cnt_merged. + */ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs--; + ctx->cnt_merged.curbytes -= size; + malloc_mutex_unlock(&ctx->lock); + } + } +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ #endif /* JEMALLOC_PROF */ diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 583a6e91..4a49e901 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -12,8 +12,6 @@ #include #endif -#include - /******************************************************************************/ /* Data. */ @@ -30,6 +28,14 @@ ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; uint64_t prof_interval; bool prof_promote; +unsigned prof_bt_max; + +#ifndef NO_TLS +__thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +#endif +pthread_key_t prof_tdata_tsd; + /* * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data * structure that knows about all backtraces currently captured. @@ -37,96 +43,6 @@ bool prof_promote; static ckh_t bt2ctx; static malloc_mutex_t bt2ctx_mtx; -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ -#ifndef NO_TLS -static __thread prof_tcache_t *prof_tcache_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define PROF_TCACHE_GET() prof_tcache_tls -# define PROF_TCACHE_SET(v) do { \ - prof_tcache_tls = (v); \ - pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ -} while (0) -#else -# define PROF_TCACHE_GET() \ - ((prof_tcache_t *)pthread_getspecific(prof_tcache_tsd)) -# define PROF_TCACHE_SET(v) do { \ - pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ -} while (0) -#endif -/* - * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that prof_tcache_tls contents can be merged, - * unlinked, and deallocated. 
- */ -static pthread_key_t prof_tcache_tsd; - -/* Thread-specific backtrace vector, used for calls to prof_backtrace(). */ -#ifndef NO_TLS -static __thread void **vec_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define VEC_GET() vec_tls -# define VEC_SET(v) do { \ - vec_tls = (v); \ - pthread_setspecific(vec_tsd, (void *)(v)); \ -} while (0) -#else -# define VEC_GET() ((void **)pthread_getspecific(vec_tsd)) -# define VEC_SET(v) do { \ - pthread_setspecific(vec_tsd, (void *)(v)); \ -} while (0) -#endif -/* - * Same contents as vec_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that vec_tls contents can be merged, - * unlinked, and deallocated. - */ -static pthread_key_t vec_tsd; - - -/* (1U << opt_lg_prof_bt_max). */ -static unsigned prof_bt_max; - -typedef struct prof_sample_state_s prof_sample_state_t; -struct prof_sample_state_s { - uint64_t prn_state; - uint64_t threshold; - uint64_t accum; -}; - -#ifndef NO_TLS -static __thread prof_sample_state_t prof_sample_state_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define PROF_SAMPLE_STATE_GET(r) do { \ - r = &prof_sample_state_tls; \ -} while (0) -#else -static pthread_key_t prof_sample_state_tsd; -/* Used only if an OOM error occurs in PROF_SAMPLE_STATE_GET(). */ -prof_sample_state_t prof_sample_state_oom; -# define PROF_SAMPLE_STATE_GET(r) do { \ - r = (prof_sample_state_t *)pthread_getspecific( \ - prof_sample_state_tsd); \ - if (r == NULL) { \ - r = ipalloc(sizeof(prof_sample_state_t), CACHELINE, \ - false); \ - if (r == NULL) { \ - malloc_write(": Error in heap " \ - "profiler: out of memory; subsequent heap " \ - "profiles may be inaccurate\n"); \ - if (opt_abort) \ - abort(); \ - /* Failure is not an option... */ \ - r = &prof_sample_state_oom; \ - } \ - pthread_setspecific(prof_sample_state_tsd, (void *)r); \ - } \ -} while (0) -# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) -# define ARENA_SET(v) do { \ - pthread_setspecific(arenas_tsd, (void *)(v)); \ -} while (0) -#endif - static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; static uint64_t prof_dump_iseq; @@ -154,7 +70,6 @@ static bool enq_udump; /* Function prototypes for non-inline static functions. 
*/ static prof_bt_t *bt_dup(prof_bt_t *bt); -static void bt_init(prof_bt_t *bt, void **vec); static void bt_destroy(prof_bt_t *bt); #ifdef JEMALLOC_PROF_LIBGCC static _Unwind_Reason_Code prof_unwind_init_callback( @@ -162,9 +77,6 @@ static _Unwind_Reason_Code prof_unwind_init_callback( static _Unwind_Reason_Code prof_unwind_callback( struct _Unwind_Context *context, void *arg); #endif -static void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); -static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); -static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); static bool prof_flush(bool propagate_err); static bool prof_write(const char *s, bool propagate_err); static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, @@ -181,15 +93,11 @@ static void prof_fdump(void); static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); -static void prof_tcache_cleanup(void *arg); -static void vec_cleanup(void *arg); -#ifdef NO_TLS -static void prof_sample_state_thread_cleanup(void *arg); -#endif +static void prof_tdata_cleanup(void *arg); /******************************************************************************/ -static void +void bt_init(prof_bt_t *bt, void **vec) { @@ -285,7 +193,7 @@ prof_unwind_callback(struct _Unwind_Context *context, void *arg) return (_URC_NO_REASON); } -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { prof_unwind_data_t data = {bt, nignore, max}; @@ -293,7 +201,7 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) _Unwind_Backtrace(prof_unwind_callback, &data); } #elif defined(JEMALLOC_PROF_LIBUNWIND) -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { unw_context_t uc; @@ -328,7 +236,7 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) } } #else -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { #define NIGNORE 3 @@ -509,32 +417,23 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) } #endif -static prof_thr_cnt_t * +prof_thr_cnt_t * prof_lookup(prof_bt_t *bt) { union { prof_thr_cnt_t *p; void *v; } ret; - prof_tcache_t *prof_tcache = PROF_TCACHE_GET(); + prof_tdata_t *prof_tdata; - if (prof_tcache == NULL) { - /* Initialize an empty cache for this thread. */ - prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t)); - if (prof_tcache == NULL) + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) return (NULL); - - if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS, - prof_bt_hash, prof_bt_keycomp)) { - idalloc(prof_tcache); - return (NULL); - } - ql_new(&prof_tcache->lru_ql); - - PROF_TCACHE_SET(prof_tcache); } - if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) { + if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { union { prof_bt_t *p; void *v; @@ -588,23 +487,23 @@ prof_lookup(prof_bt_t *bt) prof_leave(); /* Link a prof_thd_cnt_t into ctx for this thread. */ - if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt) + if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tdata->bt2cnt) == (ZU(1) << opt_lg_prof_tcmax)) { - assert(ckh_count(&prof_tcache->bt2cnt) > 0); + assert(ckh_count(&prof_tdata->bt2cnt) > 0); /* * Flush the least least recently used cnt in order to * keep bt2cnt from becoming too large. 
*/ - ret.p = ql_last(&prof_tcache->lru_ql, lru_link); + ret.p = ql_last(&prof_tdata->lru_ql, lru_link); assert(ret.v != NULL); - ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL, + ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, NULL, NULL); - ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); + ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); prof_ctx_merge(ret.p->ctx, ret.p); /* ret can now be re-used. */ } else { assert(opt_lg_prof_tcmax < 0 || - ckh_count(&prof_tcache->bt2cnt) < (ZU(1) << + ckh_count(&prof_tdata->bt2cnt) < (ZU(1) << opt_lg_prof_tcmax)); /* Allocate and partially initialize a new cnt. */ ret.v = imalloc(sizeof(prof_thr_cnt_t)); @@ -617,325 +516,22 @@ prof_lookup(prof_bt_t *bt) ret.p->ctx = ctx.p; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) { + if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) { idalloc(ret.v); return (NULL); } - ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link); malloc_mutex_unlock(&ctx.p->lock); } else { /* Move ret to the front of the LRU. */ - ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); - ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); + ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); } return (ret.p); } -static inline void -prof_sample_threshold_update(void) -{ - uint64_t r; - double u; - prof_sample_state_t *prof_sample_state; - - /* - * Compute prof_sample_threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). - */ - PROF_SAMPLE_STATE_GET(prof_sample_state); - prn64(r, 53, prof_sample_state->prn_state, - (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); - u = (double)r * (1.0/9007199254740992.0L); - prof_sample_state->threshold = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) - + (uint64_t)1U; -} - -prof_thr_cnt_t * -prof_alloc_prep(size_t size) -{ - prof_thr_cnt_t *ret; - void **vec; - prof_bt_t bt; - - assert(size == s2u(size)); - - vec = VEC_GET(); - if (vec == NULL) { - vec = imalloc(sizeof(void *) * prof_bt_max); - if (vec == NULL) - return (NULL); - VEC_SET(vec); - } - - if (opt_prof_active == false) { - /* Sampling is currently inactive, so avoid sampling. */ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; - } else if (opt_lg_prof_sample == 0) { - /* - * Don't bother with sampling logic, since sampling interval is - * 1. - */ - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); - } else { - prof_sample_state_t *prof_sample_state; - - PROF_SAMPLE_STATE_GET(prof_sample_state); - if (prof_sample_state->threshold == 0) { - /* - * Initialize. Seed the prng differently for each - * thread. - */ - prof_sample_state->prn_state = - (uint64_t)(uintptr_t)&size; - prof_sample_threshold_update(); - } - - /* - * Determine whether to capture a backtrace based on whether - * size is enough for prof_accum to reach - * prof_sample_state->threshold. However, delay updating these - * variables until prof_{m,re}alloc(), because we don't know - * for sure that the allocation will succeed. - * - * Use subtraction rather than addition to avoid potential - * integer overflow. 
- */ - if (size >= prof_sample_state->threshold - - prof_sample_state->accum) { - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); - } else - ret = (prof_thr_cnt_t *)(uintptr_t)1U; - } - - return (ret); -} - -prof_ctx_t * -prof_ctx_get(const void *ptr) -{ - prof_ctx_t *ret; - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); - - ret = arena_prof_ctx_get(ptr); - } else - ret = huge_prof_ctx_get(ptr); - - return (ret); -} - -static void -prof_ctx_set(const void *ptr, prof_ctx_t *ctx) -{ - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); - - arena_prof_ctx_set(ptr, ctx); - } else - huge_prof_ctx_set(ptr, ctx); -} - -static inline bool -prof_sample_accum_update(size_t size) -{ - prof_sample_state_t *prof_sample_state; - - /* Sampling logic is unnecessary if the interval is 1. */ - assert(opt_lg_prof_sample != 0); - - /* Take care to avoid integer overflow. */ - PROF_SAMPLE_STATE_GET(prof_sample_state); - if (size >= prof_sample_state->threshold - prof_sample_state->accum) { - prof_sample_state->accum -= (prof_sample_state->threshold - - size); - /* Compute new prof_sample_threshold. */ - prof_sample_threshold_update(); - while (prof_sample_state->accum >= - prof_sample_state->threshold) { - prof_sample_state->accum -= - prof_sample_state->threshold; - prof_sample_threshold_update(); - } - return (false); - } else { - prof_sample_state->accum += size; - return (true); - } -} - -void -prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) -{ - - assert(ptr != NULL); - assert(size == s2u(size)); - - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(size)) { - /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the size passed to - * prof_alloc_prep() and prof_malloc(). - */ - assert(false); - return; - } - } - - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); -} - -void -prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, - const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) -{ - prof_thr_cnt_t *told_cnt; - - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); - - if (ptr != NULL) { - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(size)) { - /* - * Don't sample. The size passed to - * prof_alloc_prep() was larger than what - * actually got allocated., so a backtrace was - * captured for this allocation, even though - * its actual size was insufficient to cross - * the sample threshold. - */ - return; - } - } - } - - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(old_ctx->bt); - if (told_cnt == NULL) { - /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(&old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_size; - malloc_mutex_unlock(&old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - cnt->epoch++; - } else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_size; - } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - } - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. */ -} - -void -prof_free(const void *ptr) -{ - prof_ctx_t *ctx = prof_ctx_get(ptr); - - if ((uintptr_t)ctx > (uintptr_t)1) { - size_t size = isalloc(ptr); - prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. - */ - malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(&ctx->lock); - } - } -} - static bool prof_flush(bool propagate_err) { @@ -1468,60 +1064,72 @@ prof_bt_keycomp(const void *k1, const void *k2) return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } -static void -prof_tcache_cleanup(void *arg) +prof_tdata_t * +prof_tdata_init(void) { - prof_tcache_t *prof_tcache; + prof_tdata_t *prof_tdata; - prof_tcache = PROF_TCACHE_GET(); - if (prof_tcache != NULL) { + /* Initialize an empty cache for this thread. */ + prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + if (prof_tdata == NULL) + return (NULL); + + if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp)) { + idalloc(prof_tdata); + return (NULL); + } + ql_new(&prof_tdata->lru_ql); + + prof_tdata->vec = imalloc(sizeof(void *) * prof_bt_max); + if (prof_tdata->vec == NULL) { + + ckh_delete(&prof_tdata->bt2cnt); + idalloc(prof_tdata); + return (NULL); + } + + prof_tdata->prn_state = 0; + prof_tdata->threshold = 0; + prof_tdata->accum = 0; + + PROF_TCACHE_SET(prof_tdata); + + return (prof_tdata); +} + +static void +prof_tdata_cleanup(void *arg) +{ + prof_tdata_t *prof_tdata; + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata != NULL) { prof_thr_cnt_t *cnt; /* * Delete the hash table. All of its contents can still be * iterated over via the LRU. */ - ckh_delete(&prof_tcache->bt2cnt); + ckh_delete(&prof_tdata->bt2cnt); /* * Iteratively merge cnt's into the global stats and delete * them. 
*/ - while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) != - NULL) { + while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) { prof_ctx_merge(cnt->ctx, cnt); - ql_remove(&prof_tcache->lru_ql, cnt, lru_link); + ql_remove(&prof_tdata->lru_ql, cnt, lru_link); idalloc(cnt); } - idalloc(prof_tcache); + idalloc(prof_tdata->vec); + + idalloc(prof_tdata); PROF_TCACHE_SET(NULL); } } -static void -vec_cleanup(void *arg) -{ - void **vec; - - vec = VEC_GET(); - if (vec != NULL) { - idalloc(vec); - VEC_SET(NULL); - } -} - -#ifdef NO_TLS -static void -prof_sample_state_thread_cleanup(void *arg) -{ - prof_sample_state_t *prof_sample_state = (prof_sample_state_t *)arg; - - if (prof_sample_state != &prof_sample_state_oom) - idalloc(prof_sample_state); -} -#endif - void prof_boot0(void) { @@ -1560,25 +1168,12 @@ prof_boot1(void) return (true); if (malloc_mutex_init(&bt2ctx_mtx)) return (true); - if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup) + if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup) != 0) { malloc_write( ": Error in pthread_key_create()\n"); abort(); } - if (pthread_key_create(&vec_tsd, vec_cleanup) != 0) { - malloc_write( - ": Error in pthread_key_create()\n"); - abort(); - } -#ifdef NO_TLS - if (pthread_key_create(&prof_sample_state_tsd, - prof_sample_state_thread_cleanup) != 0) { - malloc_write( - ": Error in pthread_key_create()\n"); - abort(); - } -#endif prof_bt_max = (1U << opt_lg_prof_bt_max); if (malloc_mutex_init(&prof_dump_seq_mtx))
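
For reference, a minimal standalone sketch (not part of the patch) of the sampling fast path that the hunks above inline: all per-thread sampling state lives in one TLS structure, mirroring prof_tdata_s, so the hot path touches a single thread-local slot instead of three, and each sampling threshold is drawn from a geometric distribution with mean 2^opt_lg_prof_sample. The names tdata_t, LG_SAMPLE, sample_threshold_update(), and sample_accum_update() are hypothetical stand-ins for prof_tdata_t, opt_lg_prof_sample, prof_sample_threshold_update(), and prof_sample_accum_update(); the constants match the prn64() call in the patch, but the prng step is written out as a plain 64-bit LCG rather than using jemalloc's prn64() macro.

/*
 * Illustrative sketch only; build with e.g. "cc -O2 sketch.c -lm".
 */
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LG_SAMPLE 19	/* Stand-in for opt_lg_prof_sample (mean 2^19 bytes). */

/* All per-thread sampling state in one struct, as in prof_tdata_s. */
typedef struct {
	uint64_t prn_state;
	uint64_t threshold;
	uint64_t accum;
} tdata_t;

static __thread tdata_t tdata_tls;	/* One TLS slot instead of three. */

/* Draw the next threshold from a geometric distribution. */
static void
sample_threshold_update(tdata_t *tdata)
{
	uint64_t r;
	double u;

	/* 64-bit LCG step; keep the top 53 bits, as prn64(r, 53, ...) does. */
	tdata->prn_state = tdata->prn_state * 6364136223846793005ULL +
	    1442695040888963407ULL;
	r = tdata->prn_state >> (64 - 53);
	u = (double)r * (1.0 / 9007199254740992.0);	/* u in [0, 1). */
	tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << LG_SAMPLE)))) +
	    (uint64_t)1U;
}

/* Return true iff an allocation of size bytes should be sampled. */
static bool
sample_accum_update(tdata_t *tdata, size_t size)
{

	if (tdata->threshold == 0) {
		/* Seed the prng differently for each thread. */
		tdata->prn_state = (uint64_t)(uintptr_t)&size;
		sample_threshold_update(tdata);
	}

	/* Subtraction avoids overflow, as in prof_sample_accum_update(). */
	if (size >= tdata->threshold - tdata->accum) {
		tdata->accum -= (tdata->threshold - size);
		sample_threshold_update(tdata);
		while (tdata->accum >= tdata->threshold) {
			tdata->accum -= tdata->threshold;
			sample_threshold_update(tdata);
		}
		return (true);
	}
	tdata->accum += size;
	return (false);
}

int
main(void)
{
	unsigned i, nsamples = 0;

	/* Simulate a stream of 64-byte allocations and count samples. */
	for (i = 0; i < 1000000; i++) {
		if (sample_accum_update(&tdata_tls, 64))
			nsamples++;
	}
	printf("sampled %u of 1000000 allocations\n", nsamples);
	return (0);
}

With LG_SAMPLE set to 19 the sketch samples roughly one allocation per 2^19 bytes, which is also why prof_alloc_prep() above skips the accumulator logic entirely when opt_lg_prof_sample is 0 (a sampling interval of 1 means every allocation is sampled).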