Enhance the H/h MALLOC_OPTIONS flags to control the number of tcache bin slots, rather than just enabling/disabling the tcache.

Fix an off-by-one bug in large object stats recording.
2010-01-03 16:16:10 -08:00 · 2010-01-03 16:16:10 -08:00 · 279e09d1ff
commit 279e09d1ff
parent 3f3ecfb8e8
2 changed files with 82 additions and 63 deletions
--- a/jemalloc/doc/jemalloc.3.in
+++ b/jemalloc/doc/jemalloc.3.in
@ -254,15 +254,21 @@ will disable dirty page purging.
@roff_tcache@.Ev JEMALLOC_OPTIONS=14g
@roff_tcache@will disable garbage collection.
@roff_tcache@.It H
-@roff_tcache@When there are multiple threads, use thread-specific caching for
-@roff_tcache@small and medium objects.
-@roff_tcache@This option is enabled by default.
+@roff_tcache@Double/halve the number of thread-specific cache slots per size
+@roff_tcache@class.
+@roff_tcache@When there are multiple threads, each thread uses a
+@roff_tcache@thread-specific cache for small and medium objects.
@roff_tcache@Thread-specific caching allows many allocations to be satisfied
@roff_tcache@without performing any thread synchronization, at the cost of
@roff_tcache@increased memory use.
@roff_tcache@See the
@roff_tcache@.Dq G
@roff_tcache@option for related tuning information.
+@roff_tcache@The default number of cache slots is 128;
+@roff_tcache@.Ev JEMALLOC_OPTIONS=7h
+@roff_tcache@will disable thread-specific caching.
+@roff_tcache@Note that one cache slot per size class is not a valid
+@roff_tcache@configuration due to implementation details.
@roff_fill@.It J
@roff_fill@Each byte of new memory allocated by
@roff_fill@.Fn @jemalloc_prefix@malloc
--- a/jemalloc/src/jemalloc.c
+++ b/jemalloc/src/jemalloc.c
@ -272,9 +272,11 @@ __FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.183 2008/12/01 10:20:59 jas
 	    PAGE_SHIFT)))

 #ifdef JEMALLOC_TCACHE
-   /* Number of cache slots for each bin in the thread cache. */
-#  define TCACHE_LG_NSLOTS	7
-#  define TCACHE_NSLOTS		(1U << TCACHE_LG_NSLOTS)
+   /*
+    * Default base-2 logarithm of the number of cache slots for each bin in
+    * the thread cache (0: disabled).
+    */
+#  define LG_TCACHE_NSLOTS_DEFAULT	7
   /*
    * (1U << opt_lg_tcache_gc_sweep) is the approximate number of
    * allocation events between full GC sweeps (-1: disabled).  Integer
@ -721,7 +723,7 @@ struct tcache_bin_s {
 	unsigned	low_water;	/* Min # cached since last GC. */
 	unsigned	high_water;	/* Max # cached since last GC. */
 	unsigned	ncached;	/* # of cached objects. */
-	void		*slots[TCACHE_NSLOTS];
+	void		*slots[1];	/* Dynamically sized. */
 };

 struct tcache_s {
@ -1038,6 +1040,12 @@ static __thread tcache_t	*tcache_tls
 */
 static pthread_key_t		tcache_tsd;

+/*
+ * Number of cache slots for each bin in the thread cache, or 0 if tcache is
+ * disabled.
+ */
+size_t				tcache_nslots;
+
 /* Number of tcache allocation/deallocation events between incremental GCs. */
 unsigned			tcache_gc_incr;
 #endif
@ -1080,7 +1088,7 @@ static bool	opt_junk = false;
 #  endif
 #endif
 #ifdef JEMALLOC_TCACHE
-static bool	opt_tcache = true;
+static size_t	opt_lg_tcache_nslots = LG_TCACHE_NSLOTS_DEFAULT;
 static ssize_t	opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT;
 #endif
 static ssize_t	opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT;
@ -3174,7 +3182,7 @@ tcache_bin_fill(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
 	arena = tcache->arena;
 	bin = &arena->bins[binind];
 	malloc_mutex_lock(&arena->lock);
-	for (i = 0; i < (TCACHE_NSLOTS >> 1); i++) {
+	for (i = 0; i < (tcache_nslots >> 1); i++) {
 		if ((run = bin->runcur) != NULL && run->nfree > 0)
 			ptr = arena_bin_malloc_easy(arena, bin, run);
 		else
@ -3185,7 +3193,7 @@ tcache_bin_fill(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
 		 * Fill tbin such that the objects lowest in memory are used
 		 * first.
 		 */
-		tbin->slots[(TCACHE_NSLOTS >> 1) - 1 - i] = ptr;
+		tbin->slots[(tcache_nslots >> 1) - 1 - i] = ptr;
 	}
 #ifdef JEMALLOC_STATS
 	bin->stats.nfills++;
@ -3384,12 +3392,12 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 #ifdef JEMALLOC_STATS
 	arena->stats.nmalloc_large++;
 	arena->stats.allocated_large += size;
-	arena->stats.lstats[size >> PAGE_SHIFT].nrequests++;
-	arena->stats.lstats[size >> PAGE_SHIFT].curruns++;
-	if (arena->stats.lstats[size >> PAGE_SHIFT].curruns >
-	    arena->stats.lstats[size >> PAGE_SHIFT].highruns) {
-		arena->stats.lstats[size >> PAGE_SHIFT].highruns =
-		    arena->stats.lstats[size >> PAGE_SHIFT].curruns;
+	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++;
+	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++;
+	if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns >
+	    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) {
+		arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns =
+		    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns;
 	}
 #endif
 	malloc_mutex_unlock(&arena->lock);
@ -3415,7 +3423,7 @@ arena_malloc(size_t size, bool zero)

 	if (size <= bin_maxclass) {
 #ifdef JEMALLOC_TCACHE
-		if (isthreaded && opt_tcache) {
+		if (isthreaded && tcache_nslots) {
 			tcache_t *tcache = tcache_tls;
 			if (tcache == NULL) {
 				tcache = tcache_create(choose_arena());
@ -3508,12 +3516,12 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)
 #ifdef JEMALLOC_STATS
 	arena->stats.nmalloc_large++;
 	arena->stats.allocated_large += size;
-	arena->stats.lstats[size >> PAGE_SHIFT].nrequests++;
-	arena->stats.lstats[size >> PAGE_SHIFT].curruns++;
-	if (arena->stats.lstats[size >> PAGE_SHIFT].curruns >
-	    arena->stats.lstats[size >> PAGE_SHIFT].highruns) {
-		arena->stats.lstats[size >> PAGE_SHIFT].highruns =
-		    arena->stats.lstats[size >> PAGE_SHIFT].curruns;
+	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++;
+	arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++;
+	if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns >
+	    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) {
+		arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns =
+		    arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns;
 	}
 #endif
 	malloc_mutex_unlock(&arena->lock);
@ -4013,7 +4021,7 @@ tcache_bin_sort(tcache_bin_t *tbin)
 {
 	unsigned e, i;
 	void **fr, **to;
-	void *mslots[TCACHE_NSLOTS];
+	void *mslots[tcache_nslots];

 	/*
 	 * Perform iterative merge sort, swapping source and destination arrays
@ -4153,9 +4161,9 @@ tcache_dalloc(tcache_t *tcache, void *ptr)
 		tcache->tbins[binind] = tbin;
 	}

-	if (tbin->ncached == TCACHE_NSLOTS)
-		tcache_bin_flush(tbin, binind, (TCACHE_NSLOTS >> 1));
-	assert(tbin->ncached < TCACHE_NSLOTS);
+	if (tbin->ncached == tcache_nslots)
+		tcache_bin_flush(tbin, binind, (tcache_nslots >> 1));
+	assert(tbin->ncached < tcache_nslots);
 	tbin->slots[tbin->ncached] = ptr;
 	tbin->ncached++;
 	if (tbin->ncached > tbin->high_water)
@ -4220,7 +4228,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 	if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) {
 		/* Small allocation. */
 #ifdef JEMALLOC_TCACHE
-		if (isthreaded && opt_tcache) {
+		if (isthreaded && tcache_nslots) {
 			tcache_t *tcache = tcache_tls;
 			if ((uintptr_t)tcache > (uintptr_t)1)
 				tcache_dalloc(tcache, ptr);
@ -4701,15 +4709,15 @@ static tcache_bin_t *
 tcache_bin_create(arena_t *arena)
 {
 	tcache_bin_t *ret;
+	size_t tsize;

-	if (sizeof(tcache_bin_t) <= small_maxclass) {
-		ret = (tcache_bin_t *)arena_malloc_small(arena,
-		    sizeof(tcache_bin_t), false);
-	} else if (sizeof(tcache_bin_t) <= bin_maxclass) {
-		ret = (tcache_bin_t *)arena_malloc_medium(arena,
-		    sizeof(tcache_bin_t), false);
-	} else
-		ret = imalloc(sizeof(tcache_bin_t));
+	tsize = sizeof(tcache_bin_t) + (sizeof(void *) * (tcache_nslots - 1));
+	if (tsize <= small_maxclass)
+		ret = (tcache_bin_t *)arena_malloc_small(arena, tsize, false);
+	else if (tsize <= bin_maxclass)
+		ret = (tcache_bin_t *)arena_malloc_medium(arena, tsize, false);
+	else
+		ret = (tcache_bin_t *)imalloc(tsize);
 	if (ret == NULL)
 		return (NULL);
 #ifdef JEMALLOC_STATS
@ -4727,7 +4735,7 @@ tcache_bin_destroy(tcache_t *tcache, tcache_bin_t *tbin, unsigned binind)
 {
 	arena_t *arena;
 	arena_chunk_t *chunk;
-	size_t pageind;
+	size_t pageind, tsize;
 	arena_chunk_map_t *mapelm;

 	chunk = CHUNK_ADDR2BASE(tbin);
@ -4750,7 +4758,8 @@ tcache_bin_destroy(tcache_t *tcache, tcache_bin_t *tbin, unsigned binind)
 #endif

 	assert(tbin->ncached == 0);
-	if (sizeof(tcache_bin_t) <= bin_maxclass) {
+	tsize = sizeof(tcache_bin_t) + (sizeof(void *) * (tcache_nslots - 1));
+	if (tsize <= bin_maxclass) {
 		malloc_mutex_lock(&arena->lock);
 		arena_dalloc_bin(arena, chunk, tbin, mapelm);
 		malloc_mutex_unlock(&arena->lock);
@ -5622,10 +5631,13 @@ MALLOC_OUT:
 						opt_lg_tcache_gc_sweep++;
 					break;
 				case 'h':
-					opt_tcache = false;
+					if (opt_lg_tcache_nslots > 0)
+						opt_lg_tcache_nslots--;
 					break;
 				case 'H':
-					opt_tcache = true;
+					if (opt_lg_tcache_nslots + 1 <
+					    (sizeof(size_t) << 3))
+						opt_lg_tcache_nslots++;
 					break;
 #endif
 #ifdef JEMALLOC_FILL
@ -5810,13 +5822,18 @@ MALLOC_OUT:
 	}

 #ifdef JEMALLOC_TCACHE
-	/* Compute incremental GC event threshold. */
-	if (opt_lg_tcache_gc_sweep >= 0) {
-		tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) /
-		    nbins) + (((1U << opt_lg_tcache_gc_sweep) % nbins == 0)
-		    ? 0 : 1);
+	if (opt_lg_tcache_nslots > 0) {
+		tcache_nslots = (1U << opt_lg_tcache_nslots);
+
+		/* Compute incremental GC event threshold. */
+		if (opt_lg_tcache_gc_sweep >= 0) {
+			tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) /
+			    nbins) + (((1U << opt_lg_tcache_gc_sweep) % nbins ==
+			    0) ? 0 : 1);
+		} else
+			tcache_gc_incr = 0;
 	} else
-		tcache_gc_incr = 0;
+		tcache_nslots = 0;
 #endif

 	/* Set variables according to the value of opt_lg_chunk. */
@ -5914,7 +5931,7 @@ MALLOC_OUT:
 #endif

 #ifdef JEMALLOC_TCACHE
-	if (opt_tcache) {
+	if (tcache_nslots) {
 		if (pthread_key_create(&tcache_tsd, tcache_thread_cleanup) !=
 		    0) {
 			malloc_message("<jemalloc>",
@ -5938,7 +5955,7 @@ MALLOC_OUT:
 		 * default.
 		 */
 #ifdef JEMALLOC_TCACHE
-		if (opt_tcache) {
+		if (tcache_nslots) {
 			/*
 			 * Only large object allocation/deallocation is
 			 * guaranteed to acquire an arena mutex, so we can get
@ -6397,9 +6414,6 @@ malloc_stats_print(const char *opts)
 		    "\n", "");
 		malloc_message("Boolean JEMALLOC_OPTIONS: ",
 		    opt_abort ? "A" : "a", "", "");
-#ifdef JEMALLOC_TCACHE
-		malloc_message(opt_tcache ? "H" : "h", "", "", "");
-#endif
 #ifdef JEMALLOC_FILL
 		malloc_message(opt_junk ? "J" : "j", "", "", "");
 #endif
@ -6459,18 +6473,17 @@ malloc_stats_print(const char *opts)
 			    "", "", "");
 		}
 #ifdef JEMALLOC_TCACHE
-		if (opt_tcache) {
-			malloc_message("Thread cache GC sweep interval: ",
-			    (tcache_gc_incr > 0) ?
-			    umax2s((1U << opt_lg_tcache_gc_sweep), 10, s)
-			    : "N/A",
-			    "", "");
-			malloc_message(" (increment interval: ",
-			    (tcache_gc_incr > 0) ?
-			    umax2s(tcache_gc_incr, 10, s)
-			    : "N/A",
-			    ")\n", "");
-		}
+		malloc_message("Thread cache slots per size class: ",
+		    tcache_nslots ? umax2s(tcache_nslots, 10, s) : "N/A",
+		    "\n", "");
+		malloc_message("Thread cache GC sweep interval: ",
+		    (tcache_nslots && tcache_gc_incr > 0) ?
+		    umax2s((1U << opt_lg_tcache_gc_sweep), 10, s) : "N/A",
+		    "", "");
+		malloc_message(" (increment interval: ",
+		    (tcache_nslots && tcache_gc_incr > 0) ?
+		    umax2s(tcache_gc_incr, 10, s) : "N/A",
+		    ")\n", "");
 #endif
 		malloc_message("Chunk size: ", umax2s(chunksize, 10, s), "",
 		    "");