server-skynet-source-3rd-je.../include/jemalloc/internal/prof.h
Jason Evans 0b526ff94d Remove the opt.lg_prof_tcmax option.
Remove the opt.lg_prof_tcmax option and hard-code a cache size of 1024.
This setting is something that users just shouldn't have to worry about.
If lock contention actually ends up being a problem, the simple solution
available to the user is to reduce sampling frequency.
2012-02-13 18:04:26 -08:00

552 lines
16 KiB
C

/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;
/* Option defaults. */
#define PROF_PREFIX_DEFAULT "jeprof"
#define LG_PROF_BT_MAX_DEFAULT 7
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1
/* Maximum number of backtraces to store in each per thread LRU cache. */
#define PROF_TCMAX 1024
/*
* Hard limit on stack backtrace depth. Note that the version of
* prof_backtrace() that is based on __builtin_return_address() necessarily has
* a hard-coded number of backtrace frame handlers.
*/
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
# define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
# define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define PROF_BT_MAX (1U << LG_PROF_BT_MAX)
/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64
/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUF_SIZE 65536
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
struct prof_bt_s {
/* Backtrace, stored as len program counters. */
void **vec;
unsigned len;
};
#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
prof_bt_t *bt;
unsigned nignore;
unsigned max;
} prof_unwind_data_t;
#endif
struct prof_cnt_s {
/*
* Profiling counters. An allocation/deallocation pair can operate on
* different prof_thr_cnt_t objects that are linked into the same
* prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
* negative. In principle it is possible for the *bytes counters to
* overflow/underflow, but a general solution would require something
* like 128-bit counters; this implementation doesn't bother to solve
* that problem.
*/
int64_t curobjs;
int64_t curbytes;
uint64_t accumobjs;
uint64_t accumbytes;
};
struct prof_thr_cnt_s {
/* Linkage into prof_ctx_t's cnts_ql. */
ql_elm(prof_thr_cnt_t) cnts_link;
/* Linkage into thread's LRU. */
ql_elm(prof_thr_cnt_t) lru_link;
/*
* Associated context. If a thread frees an object that it did not
* allocate, it is possible that the context is not cached in the
* thread's hash table, in which case it must be able to look up the
* context, insert a new prof_thr_cnt_t into the thread's hash table,
* and link it into the prof_ctx_t's cnts_ql.
*/
prof_ctx_t *ctx;
/*
* Threads use memory barriers to update the counters. Since there is
* only ever one writer, the only challenge is for the reader to get a
* consistent read of the counters.
*
* The writer uses this series of operations:
*
* 1) Increment epoch to an odd number.
* 2) Update counters.
* 3) Increment epoch to an even number.
*
* The reader must assure 1) that the epoch is even while it reads the
* counters, and 2) that the epoch doesn't change between the time it
* starts and finishes reading the counters.
*/
unsigned epoch;
/* Profiling counters. */
prof_cnt_t cnts;
};
struct prof_ctx_s {
/* Associated backtrace. */
prof_bt_t *bt;
/* Protects cnt_merged and cnts_ql. */
malloc_mutex_t lock;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* When threads exit, they merge their stats into cnt_merged. */
prof_cnt_t cnt_merged;
/*
* List of profile counters, one for each thread that has allocated in
* this context.
*/
ql_head(prof_thr_cnt_t) cnts_ql;
};
struct prof_tdata_s {
/*
* Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
* cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no
* others will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t
* counter data into the associated prof_ctx_t objects, and unlink/free
* the prof_thr_cnt_t objects.
*/
ckh_t bt2cnt;
/* LRU for contents of bt2cnt. */
ql_head(prof_thr_cnt_t) lru_ql;
/* Backtrace vector, used for calls to prof_backtrace(). */
void **vec;
/* Sampling state. */
uint64_t prn_state;
uint64_t threshold;
uint64_t accum;
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
extern bool opt_prof;
/*
* Even if opt_prof is true, sampling can be temporarily disabled by setting
* opt_prof_active to false. No locking is used when updating opt_prof_active,
* so there are no guarantees regarding how long it will take for all threads
* to notice state changes.
*/
extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
extern char opt_prof_prefix[PATH_MAX + 1];
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
* profile dump when it reaches this threshold. The effect is that the
* interval between profile dumps averages prof_interval, though the actual
* interval between dumps will tend to be sporadic, and the interval will be a
* maximum of approximately (prof_interval * narenas).
*/
extern uint64_t prof_interval;
/*
* If true, promote small sampled objects to large objects, since small run
* headers do not have embedded profile context pointers.
*/
extern bool prof_promote;
/* (1U << opt_lg_prof_bt_max). */
extern unsigned prof_bt_max;
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t *prof_tdata_tls
JEMALLOC_ATTR(tls_model("initial-exec"));
# define PROF_TCACHE_GET() prof_tdata_tls
# define PROF_TCACHE_SET(v) do { \
prof_tdata_tls = (v); \
pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
} while (0)
#else
# define PROF_TCACHE_GET() \
((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
# define PROF_TCACHE_SET(v) do { \
pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
} while (0)
#endif
/*
* Same contents as b2cnt_tls, but initialized such that the TSD destructor is
* called when a thread exits, so that prof_tdata_tls contents can be merged,
* unlinked, and deallocated.
*/
extern pthread_key_t prof_tdata_tsd;
void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
void prof_idump(void);
bool prof_mdump(const char *filename);
void prof_gdump(void);
prof_tdata_t *prof_tdata_init(void);
void prof_boot0(void);
void prof_boot1(void);
bool prof_boot2(void);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#define PROF_ALLOC_PREP(nignore, size, ret) do { \
prof_tdata_t *prof_tdata; \
prof_bt_t bt; \
\
assert(size == s2u(size)); \
\
prof_tdata = PROF_TCACHE_GET(); \
if (prof_tdata == NULL) { \
prof_tdata = prof_tdata_init(); \
if (prof_tdata == NULL) { \
ret = NULL; \
break; \
} \
} \
\
if (opt_prof_active == false) { \
/* Sampling is currently inactive, so avoid sampling. */\
ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
} else if (opt_lg_prof_sample == 0) { \
/* Don't bother with sampling logic, since sampling */\
/* interval is 1. */\
bt_init(&bt, prof_tdata->vec); \
prof_backtrace(&bt, nignore, prof_bt_max); \
ret = prof_lookup(&bt); \
} else { \
if (prof_tdata->threshold == 0) { \
/* Initialize. Seed the prng differently for */\
/* each thread. */\
prof_tdata->prn_state = \
(uint64_t)(uintptr_t)&size; \
prof_sample_threshold_update(prof_tdata); \
} \
\
/* Determine whether to capture a backtrace based on */\
/* whether size is enough for prof_accum to reach */\
/* prof_tdata->threshold. However, delay updating */\
/* these variables until prof_{m,re}alloc(), because */\
/* we don't know for sure that the allocation will */\
/* succeed. */\
/* */\
/* Use subtraction rather than addition to avoid */\
/* potential integer overflow. */\
if (size >= prof_tdata->threshold - \
prof_tdata->accum) { \
bt_init(&bt, prof_tdata->vec); \
prof_backtrace(&bt, nignore, prof_bt_max); \
ret = prof_lookup(&bt); \
} else \
ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
} \
} while (0)
#ifndef JEMALLOC_ENABLE_INLINE
void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool prof_sample_accum_update(size_t size);
void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
size_t old_size, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
uint64_t r;
double u;
cassert(config_prof);
/*
* Compute sample threshold as a geometrically distributed random
* variable with mean (2^opt_lg_prof_sample).
*
* __ __
* | log(u) | 1
* prof_tdata->threshold = | -------- |, where p = -------------------
* | log(1-p) | opt_lg_prof_sample
* 2
*
* For more information on the math, see:
*
* Non-Uniform Random Variate Generation
* Luc Devroye
* Springer-Verlag, New York, 1986
* pp 500
* (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
*/
prn64(r, 53, prof_tdata->prn_state,
(uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
u = (double)r * (1.0/9007199254740992.0L);
prof_tdata->threshold = (uint64_t)(log(u) /
log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+ (uint64_t)1U;
}
JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
prof_ctx_t *ret;
arena_chunk_t *chunk;
cassert(config_prof);
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
ret = arena_prof_ctx_get(ptr);
} else
ret = huge_prof_ctx_get(ptr);
return (ret);
}
JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
cassert(config_prof);
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
arena_prof_ctx_set(ptr, ctx);
} else
huge_prof_ctx_set(ptr, ctx);
}
JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
prof_tdata_t *prof_tdata;
cassert(config_prof);
/* Sampling logic is unnecessary if the interval is 1. */
assert(opt_lg_prof_sample != 0);
prof_tdata = PROF_TCACHE_GET();
assert(prof_tdata != NULL);
/* Take care to avoid integer overflow. */
if (size >= prof_tdata->threshold - prof_tdata->accum) {
prof_tdata->accum -= (prof_tdata->threshold - size);
/* Compute new sample threshold. */
prof_sample_threshold_update(prof_tdata);
while (prof_tdata->accum >= prof_tdata->threshold) {
prof_tdata->accum -= prof_tdata->threshold;
prof_sample_threshold_update(prof_tdata);
}
return (false);
} else {
prof_tdata->accum += size;
return (true);
}
}
JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{
cassert(config_prof);
assert(ptr != NULL);
assert(size == isalloc(ptr));
if (opt_lg_prof_sample != 0) {
if (prof_sample_accum_update(size)) {
/*
* Don't sample. For malloc()-like allocation, it is
* always possible to tell in advance how large an
* object's usable size will be, so there should never
* be a difference between the size passed to
* PROF_ALLOC_PREP() and prof_malloc().
*/
assert((uintptr_t)cnt == (uintptr_t)1U);
}
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
prof_ctx_set(ptr, cnt->ctx);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
} else
prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}
JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
size_t old_size, prof_ctx_t *old_ctx)
{
prof_thr_cnt_t *told_cnt;
cassert(config_prof);
assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
if (ptr != NULL) {
assert(size == isalloc(ptr));
if (opt_lg_prof_sample != 0) {
if (prof_sample_accum_update(size)) {
/*
* Don't sample. The size passed to
* PROF_ALLOC_PREP() was larger than what
* actually got allocated, so a backtrace was
* captured for this allocation, even though
* its actual size was insufficient to cross
* the sample threshold.
*/
cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
}
}
if ((uintptr_t)old_ctx > (uintptr_t)1U) {
told_cnt = prof_lookup(old_ctx->bt);
if (told_cnt == NULL) {
/*
* It's too late to propagate OOM for this realloc(),
* so operate directly on old_cnt->ctx->cnt_merged.
*/
malloc_mutex_lock(&old_ctx->lock);
old_ctx->cnt_merged.curobjs--;
old_ctx->cnt_merged.curbytes -= old_size;
malloc_mutex_unlock(&old_ctx->lock);
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
} else
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U) {
prof_ctx_set(ptr, cnt->ctx);
cnt->epoch++;
} else
prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U) {
told_cnt->cnts.curobjs--;
told_cnt->cnts.curbytes -= old_size;
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
}
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U)
cnt->epoch++;
/*********/
mb_write(); /* Not strictly necessary. */
}
JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
prof_ctx_t *ctx = prof_ctx_get(ptr);
cassert(config_prof);
if ((uintptr_t)ctx > (uintptr_t)1) {
assert(size == isalloc(ptr));
prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
if (tcnt != NULL) {
tcnt->epoch++;
/*********/
mb_write();
/*********/
tcnt->cnts.curobjs--;
tcnt->cnts.curbytes -= size;
/*********/
mb_write();
/*********/
tcnt->epoch++;
/*********/
mb_write();
/*********/
} else {
/*
* OOM during free() cannot be propagated, so operate
* directly on cnt->ctx->cnt_merged.
*/
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs--;
ctx->cnt_merged.curbytes -= size;
malloc_mutex_unlock(&ctx->lock);
}
}
}
#endif
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/