/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define PROF_PREFIX_DEFAULT		"jeprof"
#define LG_PROF_SAMPLE_DEFAULT		0
#define LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define PROF_BT_MAX			128

/* Maximum number of backtraces to store in each per-thread LRU cache. */
#define PROF_TCMAX			1024

/* Initial hash table size. */
#define PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUF_SIZE		65536

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	nignore;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
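/*
 * Illustrative sketch, not part of the original header: a reader that obtains
 * a consistent snapshot of the counters under the epoch protocol described
 * above.  mb_read() is a hypothetical read barrier (jemalloc only provides
 * mb_write()); the dump code in prof.c is the authoritative reader.
 */
#if 0
JEMALLOC_INLINE prof_cnt_t
prof_thr_cnt_read(const prof_thr_cnt_t *cnt)
{
	prof_cnt_t snap;
	unsigned e0, e1;

	do {
		e0 = cnt->epoch;
		mb_read();	/* Hypothetical; pairs with mb_write(). */
		snap = cnt->cnts;
		mb_read();
		e1 = cnt->epoch;
		/* Retry if a write was in progress (odd) or intervened. */
	} while ((e0 & 1U) != 0 || e0 != e1);

	return (snap);
}
#endif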
struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects cnt_merged and cnts_ql. */
	malloc_mutex_t		lock;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;
};

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prn_state;
	uint64_t		threshold;
	uint64_t		accum;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;

/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t	*prof_tdata_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
# define PROF_TCACHE_GET()	prof_tdata_tls
# define PROF_TCACHE_SET(v)	do {					\
	prof_tdata_tls = (v);						\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#else
# define PROF_TCACHE_GET()						\
	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
# define PROF_TCACHE_SET(v)	do {					\
	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
} while (0)
#endif
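/*
 * Illustrative sketch, not part of the original header: PROF_TCACHE_SET()
 * calls pthread_setspecific() even when fast __thread TLS is available
 * because only pthread TSD provides a destructor hook at thread exit.  Key
 * setup would look roughly like this; prof_tdata_cleanup is an assumed name
 * standing in for the real destructor in prof.c.
 */
#if 0
static void
prof_tdata_cleanup(void *arg)
{
	prof_tdata_t *prof_tdata = (prof_tdata_t *)arg;

	/* Merge prof_thr_cnt_t data into prof_ctx_t objects, then free. */
}

static bool
prof_tdata_tsd_boot(void)
{

	/* Returns true on error, matching jemalloc's boot conventions. */
	return (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup) != 0);
}
#endif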
/*
 * Same contents as prof_tdata_tls, but initialized such that the TSD
 * destructor is called when a thread exits, so that prof_tdata_tls contents
 * can be merged, unlinked, and deallocated.
 */
extern pthread_key_t	prof_tdata_tsd;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define PROF_ALLOC_PREP(nignore, size, ret) do {			\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	prof_tdata = PROF_TCACHE_GET();					\
	if (prof_tdata == NULL) {					\
		prof_tdata = prof_tdata_init();				\
		if (prof_tdata == NULL) {				\
			ret = NULL;					\
			break;						\
		}							\
	}								\
									\
	if (opt_prof_active == false) {					\
		/* Sampling is currently inactive, so avoid sampling. */\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else if (opt_lg_prof_sample == 0) {				\
		/* Don't bother with sampling logic, since sampling  */\
		/* interval is 1.                                    */\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt, nignore);				\
		ret = prof_lookup(&bt);					\
	} else {							\
		if (prof_tdata->threshold == 0) {			\
			/* Initialize.  Seed the prng differently for */\
			/* each thread.                               */\
			prof_tdata->prn_state =				\
			    (uint64_t)(uintptr_t)&size;			\
			prof_sample_threshold_update(prof_tdata);	\
		}							\
									\
		/* Determine whether to capture a backtrace based on */\
		/* whether size is enough for prof_accum to reach    */\
		/* prof_tdata->threshold.  However, delay updating   */\
		/* these variables until prof_{m,re}alloc(), because */\
		/* we don't know for sure that the allocation will   */\
		/* succeed.                                          */\
		/*                                                   */\
		/* Use subtraction rather than addition to avoid     */\
		/* potential integer overflow.                       */\
		if (size >= prof_tdata->threshold -			\
		    prof_tdata->accum) {				\
			bt_init(&bt, prof_tdata->vec);			\
			prof_backtrace(&bt, nignore);			\
			ret = prof_lookup(&bt);				\
		} else							\
			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
	}								\
} while (0)
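/*
 * Illustrative sketch, not part of the original header: how a malloc()-style
 * caller combines PROF_ALLOC_PREP() with prof_malloc().  This is simplified
 * (the real allocation path also handles prof_promote); imalloc() stands in
 * for the internal allocator, nignore == 1 skips the caller's own frame, and
 * (prof_thr_cnt_t *)1U is the "allocate but don't sample" sentinel.
 */
#if 0
void *
malloc_sketch(size_t size)
{
	void *p;
	size_t usize = s2u(size);	/* Usable size, as the macro asserts. */
	prof_thr_cnt_t *cnt;

	PROF_ALLOC_PREP(1, usize, cnt);
	if (cnt == NULL)
		return (NULL);		/* OOM while initializing prof_tdata. */
	p = imalloc(size);
	if (p == NULL)
		return (NULL);
	prof_malloc(p, usize, cnt);	/* Commit the sampling decision. */
	return (p);
}
#endif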
#ifndef JEMALLOC_ENABLE_INLINE
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool	prof_sample_accum_update(size_t size);
void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
	uint64_t r;
	double u;

	cassert(config_prof);

	/*
	 * Compute sample threshold as a geometrically distributed random
	 * variable with mean (2^opt_lg_prof_sample).
	 *
	 *                          __        __
	 *                         |  log(u)   |                    1
	 * prof_tdata->threshold = | --------- |, where p = -------------------
	 *                         | log(1-p)  |             opt_lg_prof_sample
	 *                                                  2
	 *
	 * For more information on the math, see:
	 *
	 *   Non-Uniform Random Variate Generation
	 *   Luc Devroye
	 *   Springer-Verlag, New York, 1986
	 *   pp 500
	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
	 */
	prn64(r, 53, prof_tdata->prn_state,
	    (uint64_t)6364136223846793005LLU,
	    (uint64_t)1442695040888963407LLU);
	u = (double)r * (1.0/9007199254740992.0L);
	prof_tdata->threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
	    + (uint64_t)1U;
}
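/*
 * Worked example of the computation above (values chosen for illustration):
 * r has 53 random bits, so u = r / 2^53 is uniform in [0, 1).  With
 * opt_lg_prof_sample == 19, p = 2^-19 and the threshold is geometrically
 * distributed with mean 2^19 = 512 KiB.  For the median draw u = 0.5:
 *
 *   threshold = floor(log(0.5) / log(1 - 2^-19)) + 1
 *             ~= floor(0.693147 / 1.907350e-6) + 1
 *             ~= 363409 bytes  (~= 2^19 * ln 2, as expected for the median)
 */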
JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);
	/* Sampling logic is unnecessary if the interval is 1. */
	assert(opt_lg_prof_sample != 0);

	prof_tdata = PROF_TCACHE_GET();
	assert(prof_tdata != NULL);

	/* Take care to avoid integer overflow. */
	if (size >= prof_tdata->threshold - prof_tdata->accum) {
		prof_tdata->accum -= (prof_tdata->threshold - size);
		/* Compute new sample threshold. */
		prof_sample_threshold_update(prof_tdata);
		while (prof_tdata->accum >= prof_tdata->threshold) {
			prof_tdata->accum -= prof_tdata->threshold;
			prof_sample_threshold_update(prof_tdata);
		}
		return (false);
	} else {
		prof_tdata->accum += size;
		return (true);
	}
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(size == isalloc(ptr));

	if (opt_lg_prof_sample != 0) {
		if (prof_sample_accum_update(size)) {
			/*
			 * Don't sample.  For malloc()-like allocation, it is
			 * always possible to tell in advance how large an
			 * object's usable size will be, so there should never
			 * be a difference between the size passed to
			 * PROF_ALLOC_PREP() and prof_malloc().
			 */
			assert((uintptr_t)cnt == (uintptr_t)1U);
		}
	}

	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);

		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
		/*********/
		mb_write();
		/*********/
		cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(size == isalloc(ptr));
		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  The size passed to
				 * PROF_ALLOC_PREP() was larger than what
				 * actually got allocated, so a backtrace was
				 * captured for this allocation, even though
				 * its actual size was insufficient to cross
				 * the sample threshold.
				 */
				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(&old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_size;
			malloc_mutex_unlock(&old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_size;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += size;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += size;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		assert(size == isalloc(ptr));
		prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(&ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(&ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/