Merge pull request #73 from bmaurer/smallmalloc

Smaller malloc hot path
2014-04-16 16:33:21 -07:00 · 2014-04-16 16:33:21 -07:00 · 3e3caf03af
commit 3e3caf03af
parent bd87b01999 021136ce4d
8 changed files with 194 additions and 194 deletions
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@ -385,6 +385,7 @@ extern ssize_t	opt_lg_dirty_mult;
 * and all accesses are via the SMALL_SIZE2BIN macro.
 */
 extern uint8_t const	small_size2bin[];
+extern uint32_t const	small_bin2size[];
 #define	SMALL_SIZE2BIN(s)	(small_size2bin[(s-1) >> LG_TINY_MIN])

 extern arena_bin_info_t	arena_bin_info[NBINS];
@ -964,7 +965,7 @@ arena_salloc(const void *ptr, bool demote)
 		assert(arena_mapbits_large_get(chunk, pageind) != 0 ||
 		    arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk,
 		    pageind)) == binind);
-		ret = arena_bin_info[binind].reg_size;
+		ret = small_bin2size[binind];
 	}

 	return (ret);
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@ -526,7 +526,7 @@ s2u(size_t size)
 {

 	if (size <= SMALL_MAXCLASS)
-		return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size);
+		return (small_bin2size[SMALL_SIZE2BIN(size)]);
 	if (size <= arena_maxclass)
 		return (PAGE_CEILING(size));
 	return (CHUNK_CEILING(size));
@ -569,7 +569,7 @@ sa2u(size_t size, size_t alignment)

 	if (usize <= arena_maxclass && alignment <= PAGE) {
 		if (usize <= SMALL_MAXCLASS)
-			return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size);
+			return (small_bin2size[SMALL_SIZE2BIN(usize)]);
 		return (PAGE_CEILING(usize));
 	} else {
 		size_t run_size;
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@ -298,6 +298,7 @@ prof_idump
 prof_interval
 prof_lookup
 prof_malloc
+prof_malloc_record_object
 prof_mdump
 prof_postfork_child
 prof_postfork_parent
@ -344,6 +345,7 @@ rtree_set
 s2u
 sa2u
 set_errno
+small_bin2size
 small_size2bin
 stats_cactive
 stats_cactive_add
@ -383,6 +385,7 @@ tcache_event
 tcache_event_hard
 tcache_flush
 tcache_get
+tcache_get_hard
 tcache_initialized
 tcache_maxclass
 tcache_salloc
--- a/include/jemalloc/internal/prof.h
+++ b/include/jemalloc/internal/prof.h
@ -177,8 +177,7 @@ struct prof_tdata_s {

 	/* Sampling state. */
 	uint64_t		prng_state;
-	uint64_t		threshold;
-	uint64_t		accum;
+	uint64_t		bytes_until_sample;

 	/* State used to avoid dumping while operating on prof internals. */
 	bool			enq;
@ -239,6 +238,7 @@ bool	prof_boot2(void);
 void	prof_prefork(void);
 void	prof_postfork_parent(void);
 void	prof_postfork_child(void);
+void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
@ -250,49 +250,13 @@ void	prof_postfork_child(void);
 									\
 	assert(size == s2u(size));					\
 									\
-	prof_tdata = prof_tdata_get(true);				\
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
-		if (prof_tdata != NULL)					\
+	if (!opt_prof_active ||						\
+	    prof_sample_accum_update(size, false, &prof_tdata)) {	\
 		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
-		else							\
-			ret = NULL;					\
-		break;							\
-	}								\
-									\
-	if (opt_prof_active == false) {					\
-		/* Sampling is currently inactive, so avoid sampling. */\
-		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
-	} else if (opt_lg_prof_sample == 0) {				\
-		/* Don't bother with sampling logic, since sampling   */\
-		/* interval is 1.                                     */\
-		bt_init(&bt, prof_tdata->vec);				\
-		prof_backtrace(&bt, nignore);				\
-		ret = prof_lookup(&bt);					\
 	} else {							\
-		if (prof_tdata->threshold == 0) {			\
-			/* Initialize.  Seed the prng differently for */\
-			/* each thread.                               */\
-			prof_tdata->prng_state =			\
-			    (uint64_t)(uintptr_t)&size;			\
-			prof_sample_threshold_update(prof_tdata);	\
-		}							\
-									\
-		/* Determine whether to capture a backtrace based on  */\
-		/* whether size is enough for prof_accum to reach     */\
-		/* prof_tdata->threshold.  However, delay updating    */\
-		/* these variables until prof_{m,re}alloc(), because  */\
-		/* we don't know for sure that the allocation will    */\
-		/* succeed.                                           */\
-		/*                                                    */\
-		/* Use subtraction rather than addition to avoid      */\
-		/* potential integer overflow.                        */\
-		if (size >= prof_tdata->threshold -			\
-		    prof_tdata->accum) {				\
 		bt_init(&bt, prof_tdata->vec);				\
 		prof_backtrace(&bt, nignore);				\
 		ret = prof_lookup(&bt);					\
-		} else							\
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
 	}								\
 } while (0)

@ -300,10 +264,13 @@ void	prof_postfork_child(void);
 malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

 prof_tdata_t	*prof_tdata_get(bool create);
-void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
+/* Returns false when the allocation should be sampled, true otherwise. */
+bool	prof_sample_accum_update(size_t size, bool commit,
+    prof_tdata_t **prof_tdata_out);
 prof_ctx_t	*prof_ctx_get(const void *ptr);
 void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
-bool	prof_sample_accum_update(size_t size);
+void	prof_malloc_record_object(const void *ptr, size_t usize,
+    prof_thr_cnt_t *cnt);
 void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
 void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
@ -330,55 +297,6 @@ prof_tdata_get(bool create)
 	return (prof_tdata);
 }

-JEMALLOC_INLINE void
-prof_sample_threshold_update(prof_tdata_t *prof_tdata)
-{
-	/*
-	 * The body of this function is compiled out unless heap profiling is
-	 * enabled, so that it is possible to compile jemalloc with floating
-	 * point support completely disabled.  Avoiding floating point code is
-	 * important on memory-constrained systems, but it also enables a
-	 * workaround for versions of glibc that don't properly save/restore
-	 * floating point registers during dynamic lazy symbol loading (which
-	 * internally calls into whatever malloc implementation happens to be
-	 * integrated into the application).  Note that some compilers (e.g.
-	 * gcc 4.8) may use floating point registers for fast memory moves, so
-	 * jemalloc must be compiled with such optimizations disabled (e.g.
-	 * -mno-sse) in order for the workaround to be complete.
-	 */
-#ifdef JEMALLOC_PROF
-	uint64_t r;
-	double u;
-
-	cassert(config_prof);
-
-	/*
-	 * Compute sample threshold as a geometrically distributed random
-	 * variable with mean (2^opt_lg_prof_sample).
-	 *
-	 *                         __        __
-	 *                         |  log(u)  |                     1
-	 * prof_tdata->threshold = | -------- |, where p = -------------------
-	 *                         | log(1-p) |             opt_lg_prof_sample
-	 *                                                 2
-	 *
-	 * For more information on the math, see:
-	 *
-	 *   Non-Uniform Random Variate Generation
-	 *   Luc Devroye
-	 *   Springer-Verlag, New York, 1986
-	 *   pp 500
-	 *   (http://luc.devroye.org/rnbookindex.html)
-	 */
-	prng64(r, 53, prof_tdata->prng_state,
-	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
-	u = (double)r * (1.0/9007199254740992.0L);
-	prof_tdata->threshold = (uint64_t)(log(u) /
-	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
-	    + (uint64_t)1U;
-#endif
-}
-
 JEMALLOC_INLINE prof_ctx_t *
 prof_ctx_get(const void *ptr)
 {
@ -415,56 +333,37 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 }

 JEMALLOC_INLINE bool
-prof_sample_accum_update(size_t size)
+prof_sample_accum_update(size_t size, bool commit,
+    prof_tdata_t **prof_tdata_out)
 {
 	prof_tdata_t *prof_tdata;

 	cassert(config_prof);
-	/* Sampling logic is unnecessary if the interval is 1. */
-	assert(opt_lg_prof_sample != 0);

-	prof_tdata = prof_tdata_get(false);
+	prof_tdata = prof_tdata_get(true);
 	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+		prof_tdata = NULL;
+
+	if (prof_tdata_out != NULL)
+		*prof_tdata_out = prof_tdata;
+
+	if (prof_tdata == NULL)
 		return (true);

-	/* Take care to avoid integer overflow. */
-	if (size >= prof_tdata->threshold - prof_tdata->accum) {
-		prof_tdata->accum -= (prof_tdata->threshold - size);
-		/* Compute new sample threshold. */
-		prof_sample_threshold_update(prof_tdata);
-		while (prof_tdata->accum >= prof_tdata->threshold) {
-			prof_tdata->accum -= prof_tdata->threshold;
-			prof_sample_threshold_update(prof_tdata);
-		}
-		return (false);
-	} else {
-		prof_tdata->accum += size;
+	if (prof_tdata->bytes_until_sample >= size) {
+		if (commit)
+			prof_tdata->bytes_until_sample -= size;
 		return (true);
+	} else {
+		/* Compute new sample threshold. */
+		if (commit)
+			prof_sample_threshold_update(prof_tdata);
+		return (false);
 	}
 }

 JEMALLOC_INLINE void
-prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
-{
-
-	cassert(config_prof);
-	assert(ptr != NULL);
-	assert(usize == isalloc(ptr, true));
-
-	if (opt_lg_prof_sample != 0) {
-		if (prof_sample_accum_update(usize)) {
-			/*
-			 * Don't sample.  For malloc()-like allocation, it is
-			 * always possible to tell in advance how large an
-			 * object's usable size will be, so there should never
-			 * be a difference between the usize passed to
-			 * PROF_ALLOC_PREP() and prof_malloc().
-			 */
-			assert((uintptr_t)cnt == (uintptr_t)1U);
-		}
-	}
-
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
+prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) {
 	prof_ctx_set(ptr, cnt->ctx);

 	cnt->epoch++;
@ -484,7 +383,30 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
 	/*********/
 	mb_write();
 	/*********/
-	} else
+}
+
+JEMALLOC_INLINE void
+prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
+{
+
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(usize == isalloc(ptr, true));
+
+	if (prof_sample_accum_update(usize, true, NULL)) {
+		/*
+		 * Don't sample.  For malloc()-like allocation, it is
+		 * always possible to tell in advance how large an
+		 * object's usable size will be, so there should never
+		 * be a difference between the usize passed to
+		 * PROF_ALLOC_PREP() and prof_malloc().
+		 */
+		assert((uintptr_t)cnt == (uintptr_t)1U);
+	}
+
+	if ((uintptr_t)cnt > (uintptr_t)1U)
+		prof_malloc_record_object(ptr, usize, cnt);
+	else
 		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
 }

@ -499,8 +421,7 @@ prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,

 	if (ptr != NULL) {
 		assert(usize == isalloc(ptr, true));
-		if (opt_lg_prof_sample != 0) {
-			if (prof_sample_accum_update(usize)) {
+		if (prof_sample_accum_update(usize, true, NULL)) {
 			/*
 			 * Don't sample.  The usize passed to
 			 * PROF_ALLOC_PREP() was larger than what
@ -512,7 +433,6 @@ prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
 			cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 		}
 	}
-	}

 	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
 		told_cnt = prof_lookup(old_ctx->bt);
--- a/include/jemalloc/internal/tcache.h
+++ b/include/jemalloc/internal/tcache.h
@ -110,6 +110,7 @@ void	tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
    tcache_t *tcache);
 void	tcache_arena_associate(tcache_t *tcache, arena_t *arena);
 void	tcache_arena_dissociate(tcache_t *tcache);
+tcache_t *tcache_get_hard(tcache_t *tcache, bool create);
 tcache_t *tcache_create(arena_t *arena);
 void	tcache_destroy(tcache_t *tcache);
 void	tcache_thread_cleanup(void *arg);
@ -220,39 +221,7 @@ tcache_get(bool create)
 	if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) {
 		if (tcache == TCACHE_STATE_DISABLED)
 			return (NULL);
-		if (tcache == NULL) {
-			if (create == false) {
-				/*
-				 * Creating a tcache here would cause
-				 * allocation as a side effect of free().
-				 * Ordinarily that would be okay since
-				 * tcache_create() failure is a soft failure
-				 * that doesn't propagate.  However, if TLS
-				 * data are freed via free() as in glibc,
-				 * subtle corruption could result from setting
-				 * a TLS variable after its backing memory is
-				 * freed.
-				 */
-				return (NULL);
-			}
-			if (tcache_enabled_get() == false) {
-				tcache_enabled_set(false); /* Memoize. */
-				return (NULL);
-			}
-			return (tcache_create(choose_arena(NULL)));
-		}
-		if (tcache == TCACHE_STATE_PURGATORY) {
-			/*
-			 * Make a note that an allocator function was called
-			 * after tcache_thread_cleanup() was called.
-			 */
-			tcache = TCACHE_STATE_REINCARNATED;
-			tcache_tsd_set(&tcache);
-			return (NULL);
-		}
-		if (tcache == TCACHE_STATE_REINCARNATED)
-			return (NULL);
-		not_reached();
+		tcache = tcache_get_hard(tcache, create);
 	}

 	return (tcache);
@ -297,14 +266,14 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 	binind = SMALL_SIZE2BIN(size);
 	assert(binind < NBINS);
 	tbin = &tcache->tbins[binind];
-	size = arena_bin_info[binind].reg_size;
+	size = small_bin2size[binind];
 	ret = tcache_alloc_easy(tbin);
 	if (ret == NULL) {
 		ret = tcache_alloc_small_hard(tcache, tbin, binind);
 		if (ret == NULL)
 			return (NULL);
 	}
-	assert(tcache_salloc(ret) == arena_bin_info[binind].reg_size);
+	assert(tcache_salloc(ret) == size);

 	if (zero == false) {
 		if (config_fill) {
@ -325,7 +294,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 	if (config_stats)
 		tbin->tstats.nrequests++;
 	if (config_prof)
-		tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
+		tcache->prof_accumbytes += size;
 	tcache_event(tcache);
 	return (ret);
 }
--- a/src/arena.c
+++ b/src/arena.c
@ -7,6 +7,14 @@
 ssize_t		opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT;
 arena_bin_info_t	arena_bin_info[NBINS];

+JEMALLOC_ALIGNED(CACHELINE)
+const uint32_t	small_bin2size[NBINS] = {
+#define SIZE_CLASS(bin, delta, size)		\
+	size,
+	SIZE_CLASSES
+#undef SIZE_CLASS
+};
+
 JEMALLOC_ALIGNED(CACHELINE)
 const uint8_t	small_size2bin[] = {
 #define	S2B_8(i)	i,
@ -1615,7 +1623,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
 	binind = SMALL_SIZE2BIN(size);
 	assert(binind < NBINS);
 	bin = &arena->bins[binind];
-	size = arena_bin_info[binind].reg_size;
+	size = small_bin2size[binind];

 	malloc_mutex_lock(&bin->lock);
 	if ((run = bin->runcur) != NULL && run->nfree > 0)
--- a/src/prof.c
+++ b/src/prof.c
@ -645,6 +645,66 @@ prof_lookup(prof_bt_t *bt)
 	return (ret.p);
 }

+
+void
+prof_sample_threshold_update(prof_tdata_t *prof_tdata)
+{
+	/*
+	 * The body of this function is compiled out unless heap profiling is
+	 * enabled, so that it is possible to compile jemalloc with floating
+	 * point support completely disabled.  Avoiding floating point code is
+	 * important on memory-constrained systems, but it also enables a
+	 * workaround for versions of glibc that don't properly save/restore
+	 * floating point registers during dynamic lazy symbol loading (which
+	 * internally calls into whatever malloc implementation happens to be
+	 * integrated into the application).  Note that some compilers (e.g.
+	 * gcc 4.8) may use floating point registers for fast memory moves, so
+	 * jemalloc must be compiled with such optimizations disabled (e.g.
+	 * -mno-sse) in order for the workaround to be complete.
+	 */
+#ifdef JEMALLOC_PROF
+	uint64_t r;
+	double u;
+
+	if (!config_prof)
+		return;
+
+	if (prof_tdata == NULL)
+		prof_tdata = prof_tdata_get(false);
+
+	if (opt_lg_prof_sample == 0) {
+		prof_tdata->bytes_until_sample = 0;
+		return;
+	}
+
+	/*
+	 * Compute sample threshold as a geometrically distributed random
+	 * variable with mean (2^opt_lg_prof_sample).
+	 *
+	 *                      __        __
+	 *                      |  log(u)  |                     1
+	 * bytes_until_sample = | -------- |, where p = -------------------
+	 *                      | log(1-p) |             opt_lg_prof_sample
+	 *                                              2
+	 *
+	 * For more information on the math, see:
+	 *
+	 *   Non-Uniform Random Variate Generation
+	 *   Luc Devroye
+	 *   Springer-Verlag, New York, 1986
+	 *   pp 500
+	 *   (http://luc.devroye.org/rnbookindex.html)
+	 */
+	prng64(r, 53, prof_tdata->prng_state,
+	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
+	u = (double)r * (1.0/9007199254740992.0L);
+	prof_tdata->bytes_until_sample = (uint64_t)(log(u) /
+	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+	    + (uint64_t)1U;
+#endif
+}
+
+
 #ifdef JEMALLOC_JET
 size_t
 prof_bt_count(void)
@ -1224,9 +1284,8 @@ prof_tdata_init(void)
 		return (NULL);
 	}

-	prof_tdata->prng_state = 0;
-	prof_tdata->threshold = 0;
-	prof_tdata->accum = 0;
+	prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata;
+	prof_sample_threshold_update(prof_tdata);

 	prof_tdata->enq = false;
 	prof_tdata->enq_idump = false;
--- a/src/tcache.c
+++ b/src/tcache.c
@ -265,6 +265,46 @@ tcache_arena_dissociate(tcache_t *tcache)
 	}
 }

+tcache_t *
+tcache_get_hard(tcache_t *tcache, bool create)
+{
+
+	if (tcache == NULL) {
+		if (create == false) {
+			/*
+			 * The caller passed create == false, so return NULL
+			 * instead of building a new tcache.  Creating a
+			 * tcache here would cause allocation as a side
+			 * effect of free().  Ordinarily that would be okay
+			 * since tcache_create() failure is a soft failure
+			 * that doesn't propagate.  However, if TLS data are
+			 * freed via free() as in glibc, subtle corruption
+			 * could result from setting a TLS variable after its
+			 * backing memory is freed.
+			 */
+			return (NULL);
+		}
+		if (tcache_enabled_get() == false) {
+			tcache_enabled_set(false); /* Memoize. */
+			return (NULL);
+		}
+		return (tcache_create(choose_arena(NULL)));
+	}
+	if (tcache == TCACHE_STATE_PURGATORY) {
+		/*
+		 * Make a note that an allocator function was called
+		 * after tcache_thread_cleanup() was called.
+		 */
+		tcache = TCACHE_STATE_REINCARNATED;
+		tcache_tsd_set(&tcache);
+		return (NULL);
+	}
+	if (tcache == TCACHE_STATE_REINCARNATED)
+		return (NULL);
+	not_reached();
+	return (NULL);
+}
+
 tcache_t *
 tcache_create(arena_t *arena)
 {