/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_tctx_s prof_tctx_t;
typedef struct prof_gctx_s prof_gctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#ifdef JEMALLOC_PROF
#  define PROF_PREFIX_DEFAULT		"jeprof"
#else
#  define PROF_PREFIX_DEFAULT		""
#endif
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1
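/*
 * Informal note on the defaults above: 2^19 bytes is a 512 KiB mean distance
 * between allocation samples, and an interval of -1 disables
 * interval-triggered dumping.
 */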

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128
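/*
 * Informal sketch of why a hard limit exists: __builtin_return_address()
 * requires a compile-time-constant frame index, so that flavor of
 * prof_backtrace() is built from one hand-written handler per frame, roughly
 * along the lines of
 *
 *	if (i < PROF_BT_MAX) {
 *		void *p = __builtin_return_address(i);	(i is a literal)
 *		if (p == NULL)
 *			return;
 *		bt->vec[i] = p;
 *		bt->len = i + 1;
 *	} else
 *		return;
 *
 * repeated for i = 0 .. PROF_BT_MAX-1; raising PROF_BT_MAX therefore also
 * requires adding handlers there.
 */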

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all gctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

/*
 * Number of mutexes shared among all tdata's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NTDATA_LOCKS		256

/*
 * prof_tdata pointers close to NULL encode state information used for
 * cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY
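/*
 * Consumers distinguish these sentinels from real pointers with an unsigned
 * comparison, as prof_sample_accum_update() does below:
 *
 *	if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 *		tdata = NULL;
 */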

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};
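/*
 * Typical use (mirroring prof_alloc_prep() below): the caller supplies the
 * storage behind vec, then captures and looks up the backtrace:
 *
 *	prof_bt_t bt;
 *	bt_init(&bt, tdata->vec);
 *	prof_backtrace(&bt);
 *	prof_tctx_t *tctx = prof_lookup(tsd, &bt);
 */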

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/* Profiling counters. */
	uint64_t	curobjs;
	uint64_t	curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};
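/*
 * Maintenance sketch (informal; prof_malloc_sample_object() and
 * prof_free_sampled_object() hold the authoritative logic):
 *
 *	sampled alloc of usize:	curobjs++; curbytes += usize;
 *				accumobjs/accumbytes likewise, if opt_prof_accum
 *	free of sampled object:	curobjs--; curbytes -= usize;
 *				the accum* counters never decrease
 */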

typedef enum {
	prof_tctx_state_initializing,
	prof_tctx_state_nominal,
	prof_tctx_state_dumping,
	prof_tctx_state_purgatory /* Dumper must finish destroying. */
} prof_tctx_state_t;
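/*
 * Rough lifecycle (hedged; prof.c is authoritative): initializing until the
 * tctx is fully hooked up, then nominal; nominal <--> dumping while a dump
 * snapshots the counters; dumping --> purgatory if the owning thread tries to
 * destroy the tctx mid-dump, in which case the dumper finishes the
 * destruction once the dump no longer needs it.
 */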

struct prof_tctx_s {
	/* Thread data for thread that performed the allocation. */
	prof_tdata_t		*tdata;

	/*
	 * Copy of tdata->thr_{uid,discrim}, necessary because tdata may be
	 * defunct during teardown.
	 */
	uint64_t		thr_uid;
	uint64_t		thr_discrim;

	/* Profiling counters, protected by tdata->lock. */
	prof_cnt_t		cnts;

	/* Associated global context. */
	prof_gctx_t		*gctx;

	/*
	 * UID that distinguishes multiple tctx's created by the same thread,
	 * but coexisting in gctx->tctxs.  There are two ways that such
	 * coexistence can occur:
	 * - A dumper thread can cause a tctx to be retained in the purgatory
	 *   state.
	 * - Although a single "producer" thread must create all tctx's which
	 *   share the same thr_uid, multiple "consumers" can each concurrently
	 *   execute portions of prof_tctx_destroy().  prof_tctx_destroy() only
	 *   gets called once each time cnts.cur{objs,bytes} drop to 0, but this
	 *   threshold can be hit again before the first consumer finishes
	 *   executing prof_tctx_destroy().
	 */
	uint64_t		tctx_uid;

	/* Linkage into gctx's tctxs. */
	rb_node(prof_tctx_t)	tctx_link;

	/*
	 * True during prof_alloc_prep()..prof_malloc_sample_object(); prevents
	 * a sample vs. destroy race.
	 */
	bool			prepared;

	/* Current dump-related state, protected by gctx->lock. */
	prof_tctx_state_t	state;

	/*
	 * Copy of cnts snapshotted during early dump phase, protected by
	 * dump_mtx.
	 */
	prof_cnt_t		dump_cnts;
};
typedef rb_tree(prof_tctx_t) prof_tctx_tree_t;

struct prof_gctx_s {
	/* Protects nlimbo, cnt_summed, and tctxs. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this gctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing this gctx.
	 *   - Initializing per thread counters associated with this gctx.
	 *   - Preparing to destroy this gctx.
	 *   - Dumping a heap profile that includes this gctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * gctx.
	 */
	unsigned		nlimbo;

	/*
	 * Tree of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	prof_tctx_tree_t	tctxs;

	/* Linkage for tree of contexts to be dumped. */
	rb_node(prof_gctx_t)	dump_link;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* Associated backtrace. */
	prof_bt_t		bt;

	/* Backtrace vector, variable size, referred to by bt. */
	void			*vec[1];
};
typedef rb_tree(prof_gctx_t) prof_gctx_tree_t;
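/*
 * Allocation sketch (informal): because bt.vec aliases the trailing vec
 * member, a gctx for a backtrace of len frames is created from a single
 * allocation of roughly
 *
 *	offsetof(prof_gctx_t, vec) + len * sizeof(void *)
 *
 * bytes rather than plain sizeof(prof_gctx_t).
 */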

struct prof_tdata_s {
	malloc_mutex_t		*lock;

	/* Monotonically increasing unique thread identifier. */
	uint64_t		thr_uid;

	/*
	 * Monotonically increasing discriminator among tdata structures
	 * associated with the same thr_uid.
	 */
	uint64_t		thr_discrim;

	/* Included in heap profile dumps if non-NULL. */
	char			*thread_name;

	bool			attached;
	bool			expired;

	rb_node(prof_tdata_t)	tdata_link;

	/*
	 * Counter used to initialize prof_tctx_t's tctx_uid.  No locking is
	 * necessary when incrementing this field, because only one thread ever
	 * does so.
	 */
	uint64_t		tctx_uid_next;

	/*
	 * Hash of (prof_bt_t *)-->(prof_tctx_t *).  Each thread tracks
	 * backtraces for which it has non-zero allocation/deallocation counters
	 * associated with thread-specific prof_tctx_t objects.  Other threads
	 * may write to prof_tctx_t contents when freeing associated objects.
	 */
	ckh_t			bt2tctx;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		bytes_until_sample;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;

	/*
	 * Set to true during an early dump phase for tdata's which are
	 * currently being dumped.  New threads' tdata's have this initialized
	 * to false so that they aren't accidentally included in later dump
	 * phases.
	 */
	bool			dumping;

	/*
	 * True if profiling is active for this tdata's thread
	 * (thread.prof.active mallctl).
	 */
	bool			active;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			*vec[PROF_BT_MAX];
};
typedef rb_tree(prof_tdata_t) prof_tdata_tree_t;

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
extern bool	opt_prof_active;
extern bool	opt_prof_thread_active_init;
extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/* Accessed via prof_active_[gs]et{_unlocked,}(). */
extern bool	prof_active;

/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
extern bool	prof_gdump_val;

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, with a maximum of
 * approximately (prof_interval * narenas).
 */
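/*
 * For example (informal): opt_lg_prof_interval == 30 gives prof_interval ==
 * 2^30 bytes, i.e. each arena triggers a dump per GiB it allocates, so with
 * four arenas successive dumps can be up to ~4 GiB of allocation apart.
 */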
extern uint64_t	prof_interval;

/*
 * Initialized as opt_lg_prof_sample, and potentially modified during profiling
 * resets.
 */
extern size_t	lg_prof_sample;

void	prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
void	prof_malloc_sample_object(tsdn_t *tsdn, extent_t *extent,
    const void *ptr, size_t usize, prof_tctx_t *tctx);
void	prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx);
void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt);
prof_tctx_t	*prof_lookup(tsd_t *tsd, prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_tdata_count(void);
size_t	prof_bt_count(void);
const prof_cnt_t *prof_cnt_all(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
typedef bool (prof_dump_header_t)(tsdn_t *, bool, const prof_cnt_t *);
extern prof_dump_header_t *prof_dump_header;
#endif
void	prof_idump(tsdn_t *tsdn);
bool	prof_mdump(tsd_t *tsd, const char *filename);
void	prof_gdump(tsdn_t *tsdn);
prof_tdata_t	*prof_tdata_init(tsd_t *tsd);
prof_tdata_t	*prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata);
void	prof_reset(tsd_t *tsd, size_t lg_sample);
void	prof_tdata_cleanup(tsd_t *tsd);
bool	prof_active_get(tsdn_t *tsdn);
bool	prof_active_set(tsdn_t *tsdn, bool active);
const char	*prof_thread_name_get(tsd_t *tsd);
int	prof_thread_name_set(tsd_t *tsd, const char *thread_name);
bool	prof_thread_active_get(tsd_t *tsd);
bool	prof_thread_active_set(tsd_t *tsd, bool active);
bool	prof_thread_active_init_get(tsdn_t *tsdn);
bool	prof_thread_active_init_set(tsdn_t *tsdn, bool active_init);
bool	prof_gdump_get(tsdn_t *tsdn);
bool	prof_gdump_set(tsdn_t *tsdn, bool active);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(tsd_t *tsd);
void	prof_prefork0(tsdn_t *tsdn);
void	prof_prefork1(tsdn_t *tsdn);
void	prof_postfork_parent(tsdn_t *tsdn);
void	prof_postfork_child(tsdn_t *tsdn);
void	prof_sample_threshold_update(prof_tdata_t *tdata);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#ifndef JEMALLOC_ENABLE_INLINE
bool	prof_active_get_unlocked(void);
bool	prof_gdump_get_unlocked(void);
prof_tdata_t	*prof_tdata_get(tsd_t *tsd, bool create);
prof_tctx_t	*prof_tctx_get(tsdn_t *tsdn, const extent_t *extent,
    const void *ptr);
void	prof_tctx_set(tsdn_t *tsdn, extent_t *extent, const void *ptr,
    size_t usize, prof_tctx_t *tctx);
void	prof_tctx_reset(tsdn_t *tsdn, extent_t *extent, const void *ptr,
    prof_tctx_t *tctx);
bool	prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
    prof_tdata_t **tdata_out);
prof_tctx_t	*prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active,
    bool update);
void	prof_malloc(tsdn_t *tsdn, extent_t *extent, const void *ptr,
    size_t usize, prof_tctx_t *tctx);
void	prof_realloc(tsd_t *tsd, extent_t *extent, const void *ptr,
    size_t usize, prof_tctx_t *tctx, bool prof_active, bool updated,
    extent_t *old_extent, const void *old_ptr, size_t old_usize,
    prof_tctx_t *old_tctx);
void	prof_free(tsd_t *tsd, const extent_t *extent, const void *ptr,
    size_t usize);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_ALWAYS_INLINE bool
prof_active_get_unlocked(void)
{

	/*
	 * Even if opt_prof is true, sampling can be temporarily disabled by
	 * setting prof_active to false.  No locking is used when reading
	 * prof_active in the fast path, so there are no guarantees regarding
	 * how long it will take for all threads to notice state changes.
	 */
	return (prof_active);
}

JEMALLOC_ALWAYS_INLINE bool
prof_gdump_get_unlocked(void)
{

	/*
	 * No locking is used when reading prof_gdump_val in the fast path, so
	 * there are no guarantees regarding how long it will take for all
	 * threads to notice state changes.
	 */
	return (prof_gdump_val);
}

JEMALLOC_ALWAYS_INLINE prof_tdata_t *
prof_tdata_get(tsd_t *tsd, bool create)
{
	prof_tdata_t *tdata;

	cassert(config_prof);

	tdata = tsd_prof_tdata_get(tsd);
	if (create) {
		if (unlikely(tdata == NULL)) {
			if (tsd_nominal(tsd)) {
				tdata = prof_tdata_init(tsd);
				tsd_prof_tdata_set(tsd, tdata);
			}
		} else if (unlikely(tdata->expired)) {
			tdata = prof_tdata_reinit(tsd, tdata);
			tsd_prof_tdata_set(tsd, tdata);
		}
		assert(tdata == NULL || tdata->attached);
	}

	return (tdata);
}

JEMALLOC_ALWAYS_INLINE prof_tctx_t *
prof_tctx_get(tsdn_t *tsdn, const extent_t *extent, const void *ptr)
{

	cassert(config_prof);
	assert(ptr != NULL);

	return (arena_prof_tctx_get(tsdn, extent, ptr));
}

JEMALLOC_ALWAYS_INLINE void
prof_tctx_set(tsdn_t *tsdn, extent_t *extent, const void *ptr, size_t usize,
    prof_tctx_t *tctx)
{

	cassert(config_prof);
	assert(ptr != NULL);

	arena_prof_tctx_set(tsdn, extent, ptr, usize, tctx);
}

JEMALLOC_ALWAYS_INLINE void
prof_tctx_reset(tsdn_t *tsdn, extent_t *extent, const void *ptr,
    prof_tctx_t *tctx)
{

	cassert(config_prof);
	assert(ptr != NULL);

	arena_prof_tctx_reset(tsdn, extent, ptr, tctx);
}

JEMALLOC_ALWAYS_INLINE bool
prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
    prof_tdata_t **tdata_out)
{
	prof_tdata_t *tdata;

	cassert(config_prof);

	tdata = prof_tdata_get(tsd, true);
	if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX))
		tdata = NULL;

	if (tdata_out != NULL)
		*tdata_out = tdata;

	if (unlikely(tdata == NULL))
		return (true);

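	/*
	 * bytes_until_sample is a per-thread countdown: prof_sample_threshold_update()
	 * (re)draws it so that samples occur on average once per
	 * 2^lg_prof_sample bytes allocated.
	 */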
	if (likely(tdata->bytes_until_sample >= usize)) {
		if (update)
			tdata->bytes_until_sample -= usize;
		return (true);
	} else {
		/* Compute new sample threshold. */
		if (update)
			prof_sample_threshold_update(tdata);
		return (!tdata->active);
	}
}

JEMALLOC_ALWAYS_INLINE prof_tctx_t *
prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active, bool update)
{
	prof_tctx_t *ret;
	prof_tdata_t *tdata;
	prof_bt_t bt;

	assert(usize == s2u(usize));

	if (!prof_active || likely(prof_sample_accum_update(tsd, usize, update,
	    &tdata)))
		ret = (prof_tctx_t *)(uintptr_t)1U;
	else {
		bt_init(&bt, tdata->vec);
		prof_backtrace(&bt);
		ret = prof_lookup(tsd, &bt);
	}

	return (ret);
}

JEMALLOC_ALWAYS_INLINE void
prof_malloc(tsdn_t *tsdn, extent_t *extent, const void *ptr, size_t usize,
    prof_tctx_t *tctx)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(tsdn, extent, ptr));

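	/*
	 * tctx values at or below (uintptr_t)1U are the "not sampled" tokens
	 * produced by prof_alloc_prep(); only strictly greater values are
	 * real prof_tctx_t pointers and are recorded as sampled objects.
	 */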
	if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
		prof_malloc_sample_object(tsdn, extent, ptr, usize, tctx);
	else {
		prof_tctx_set(tsdn, extent, ptr, usize,
		    (prof_tctx_t *)(uintptr_t)1U);
	}
}

JEMALLOC_ALWAYS_INLINE void
prof_realloc(tsd_t *tsd, extent_t *extent, const void *ptr, size_t usize,
    prof_tctx_t *tctx, bool prof_active, bool updated, extent_t *old_extent,
    const void *old_ptr, size_t old_usize, prof_tctx_t *old_tctx)
{
	bool sampled, old_sampled, moved;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);

	if (prof_active && !updated && ptr != NULL) {
		assert(usize == isalloc(tsd_tsdn(tsd), extent, ptr));
		if (prof_sample_accum_update(tsd, usize, true, NULL)) {
			/*
			 * Don't sample.  The usize passed to prof_alloc_prep()
			 * was larger than what actually got allocated, so a
			 * backtrace was captured for this allocation, even
			 * though its actual usize was insufficient to cross the
			 * sample threshold.
			 */
			prof_alloc_rollback(tsd, tctx, true);
			tctx = (prof_tctx_t *)(uintptr_t)1U;
		}
	}

	/*
	 * The following code must differentiate among eight possible cases,
	 * based on three boolean conditions: sampled, old_sampled, and moved.
	 */
	sampled = ((uintptr_t)tctx > (uintptr_t)1U);
	old_sampled = ((uintptr_t)old_tctx > (uintptr_t)1U);
	moved = (ptr != old_ptr);

	/*
	 * The following block must only execute if this is a non-moving
	 * reallocation, because for moving reallocation the old allocation will
	 * be deallocated via a separate call.
	 */
	if (unlikely(old_sampled) && !moved)
		prof_free_sampled_object(tsd, old_usize, old_tctx);

	if (unlikely(sampled)) {
		prof_malloc_sample_object(tsd_tsdn(tsd), extent, ptr, usize,
		    tctx);
	} else if (moved) {
		prof_tctx_set(tsd_tsdn(tsd), extent, ptr, usize,
		    (prof_tctx_t *)(uintptr_t)1U);
	} else if (unlikely(old_sampled))
		prof_tctx_reset(tsd_tsdn(tsd), extent, ptr, tctx);
}

JEMALLOC_ALWAYS_INLINE void
prof_free(tsd_t *tsd, const extent_t *extent, const void *ptr, size_t usize)
{
	prof_tctx_t *tctx = prof_tctx_get(tsd_tsdn(tsd), extent, ptr);

	cassert(config_prof);
	assert(usize == isalloc(tsd_tsdn(tsd), extent, ptr));

	if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
		prof_free_sampled_object(tsd, usize, tctx);
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/