Simplify small size class infrastructure.

Program-generate small size class tables for all valid combinations of
LG_TINY_MIN, LG_QUANTUM, and PAGE_SHIFT.  Use the appropriate table to generate
all relevant data structures, and remove the distinction between
tiny/quantum/cacheline/subpage bins.

Remove --enable-dynamic-page-shift.  This option didn't prove useful in
practice, and it prevented optimizations.

Add Tilera architecture support.
Jason Evans
2012-02-28 16:50:47 -08:00
parent 5389146191
commit b172610317
17 changed files with 327 additions and 858 deletions
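The main structural payoff of compile-time size class generation shows up in the arena.h and ctl.h hunks below: the dynamically sized trailer `arena_bin_t bins[1]` becomes a fixed `arena_bin_t bins[NBINS]`, and `malloc_bin_stats_t *bstats` becomes an in-struct array. A minimal sketch of the difference, using a stand-in bin_t and an assumed NBINS of 28 rather than jemalloc's real types:

#include <stdio.h>
#include <stdlib.h>

#define NBINS 28 /* Assumed here; the real value comes from size_classes.h. */

typedef struct { size_t reg_size; } bin_t; /* Stand-in for arena_bin_t. */

/* Old pattern: one-element trailing array, sized at runtime. */
typedef struct {
    unsigned nbins;
    bin_t bins[1]; /* Dynamically sized. */
} arena_old_t;

/* New pattern: NBINS is a compile-time constant, so the array is fixed. */
typedef struct {
    bin_t bins[NBINS];
} arena_new_t;

int
main(void)
{
    /* Old: allocate the struct plus space for the remaining bins. */
    arena_old_t *a_old = malloc(sizeof(arena_old_t) +
        (NBINS - 1) * sizeof(bin_t));
    a_old->nbins = NBINS;

    /* New: a plain sizeof suffices, and bins[] indexing is static. */
    arena_new_t *a_new = malloc(sizeof(arena_new_t));

    printf("old alloc: %zu bytes, new alloc: %zu bytes\n",
        sizeof(arena_old_t) + (NBINS - 1) * sizeof(bin_t),
        sizeof(arena_new_t));
    free(a_old);
    free(a_new);
    return (0);
}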

include/jemalloc/internal/arena.h

@@ -1,39 +1,6 @@
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
/*
* Subpages are an artificially designated partitioning of pages. Their only
* purpose is to support subpage-spaced size classes.
*
* There must be at least 4 subpages per page, due to the way size classes are
* handled.
*/
#define LG_SUBPAGE 8
#define SUBPAGE ((size_t)(1U << LG_SUBPAGE))
#define SUBPAGE_MASK (SUBPAGE - 1)
/* Return the smallest subpage multiple that is >= s. */
#define SUBPAGE_CEILING(s) \
(((s) + SUBPAGE_MASK) & ~SUBPAGE_MASK)
/* Smallest size class to support. */
#define LG_TINY_MIN 3
#define TINY_MIN (1U << LG_TINY_MIN)
/*
* Maximum size class that is a multiple of the quantum, but not (necessarily)
* a power of 2. Above this size, allocations are rounded up to the nearest
* power of 2.
*/
#define LG_QSPACE_MAX_DEFAULT 7
/*
* Maximum size class that is a multiple of the cacheline, but not (necessarily)
* a power of 2. Above this size, allocations are rounded up to the nearest
* power of 2.
*/
#define LG_CSPACE_MAX_DEFAULT 9
/*
* RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized
* as small as possible such that this setting is still honored, without
@@ -364,75 +331,26 @@ struct arena_s {
arena_avail_tree_t runs_avail_clean;
arena_avail_tree_t runs_avail_dirty;
/*
* bins is used to store trees of free regions of the following sizes,
* assuming a 64-bit system with 16-byte quantum, 4 KiB page size, and
* default MALLOC_CONF.
*
* bins[i] | size |
* --------+--------+
* 0 | 8 |
* --------+--------+
* 1 | 16 |
* 2 | 32 |
* 3 | 48 |
* : :
* 6 | 96 |
* 7 | 112 |
* 8 | 128 |
* --------+--------+
* 9 | 192 |
* 10 | 256 |
* 11 | 320 |
* 12 | 384 |
* 13 | 448 |
* 14 | 512 |
* --------+--------+
* 15 | 768 |
* 16 | 1024 |
* 17 | 1280 |
* : :
* 25 | 3328 |
* 26 | 3584 |
* 27 | 3840 |
* --------+--------+
*/
arena_bin_t bins[1]; /* Dynamically sized. */
/* bins is used to store trees of free regions. */
arena_bin_t bins[NBINS];
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
extern size_t opt_lg_qspace_max;
extern size_t opt_lg_cspace_max;
extern ssize_t opt_lg_dirty_mult;
/*
* small_size2bin is a compact lookup table that rounds request sizes up to
* size classes. In order to reduce cache footprint, the table is compressed,
* and all accesses are via the SMALL_SIZE2BIN macro.
*/
extern uint8_t const *small_size2bin;
extern uint8_t const small_size2bin[];
#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN])
extern arena_bin_info_t *arena_bin_info;
/* Various bin-related settings. */
/* Number of (2^n)-spaced tiny bins. */
#define ntbins ((unsigned)(LG_QUANTUM - LG_TINY_MIN))
extern unsigned nqbins; /* Number of quantum-spaced bins. */
extern unsigned ncbins; /* Number of cacheline-spaced bins. */
extern unsigned nsbins; /* Number of subpage-spaced bins. */
extern unsigned nbins;
#define tspace_max ((size_t)(QUANTUM >> 1))
#define qspace_min QUANTUM
extern size_t qspace_max;
extern size_t cspace_min;
extern size_t cspace_max;
extern size_t sspace_min;
extern size_t sspace_max;
#define small_maxclass sspace_max
extern arena_bin_info_t arena_bin_info[NBINS];
/* Number of large size classes. */
#define nlclasses (chunk_npages - map_bias)
void arena_purge_all(arena_t *arena);
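The small_size2bin table declared above is compressed by a factor of TINY_MIN: since no two size classes fall within the same 2^LG_TINY_MIN-aligned interval, one byte per 2^LG_TINY_MIN bytes of request size suffices, and SMALL_SIZE2BIN indexes it with (s-1) >> LG_TINY_MIN. A standalone sketch of that indexing, with LG_TINY_MIN = 3 and a hand-picked prefix of size classes (8, 16, 32, 48, 64) standing in for the generated table:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LG_TINY_MIN 3

/*
 * One entry per 8-byte interval of request size, covering requests of
 * 1..64 bytes.  Entry i holds the bin index for sizes in
 * (i << LG_TINY_MIN, (i+1) << LG_TINY_MIN].
 */
static const uint8_t small_size2bin[] = {
    0,      /*  1..8   -> bin 0 (size  8) */
    1,      /*  9..16  -> bin 1 (size 16) */
    2, 2,   /* 17..32  -> bin 2 (size 32) */
    3, 3,   /* 33..48  -> bin 3 (size 48) */
    4, 4    /* 49..64  -> bin 4 (size 64) */
};

#define SMALL_SIZE2BIN(s) (small_size2bin[((s) - 1) >> LG_TINY_MIN])

int
main(void)
{
    assert(SMALL_SIZE2BIN(1) == 0);
    assert(SMALL_SIZE2BIN(8) == 0);
    assert(SMALL_SIZE2BIN(9) == 1);
    assert(SMALL_SIZE2BIN(17) == 2);
    assert(SMALL_SIZE2BIN(48) == 3);
    assert(SMALL_SIZE2BIN(64) == 4);
    printf("lookup table entries: %zu\n", sizeof(small_size2bin));
    return (0);
}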
@@ -457,7 +375,7 @@ void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
void *arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero);
bool arena_new(arena_t *arena, unsigned ind);
bool arena_boot(void);
void arena_boot(void);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
@@ -478,7 +396,7 @@ JEMALLOC_INLINE size_t
arena_bin_index(arena_t *arena, arena_bin_t *bin)
{
size_t binind = bin - arena->bins;
assert(binind < nbins);
assert(binind < NBINS);
return (binind);
}
@@ -633,7 +551,7 @@ arena_malloc(size_t size, bool zero)
assert(size != 0);
assert(QUANTUM_CEILING(size) <= arena_maxclass);
if (size <= small_maxclass) {
if (size <= SMALL_MAXCLASS) {
if ((tcache = tcache_get()) != NULL)
return (tcache_alloc_small(tcache, size, zero));
else

include/jemalloc/internal/atomic.h

@@ -70,7 +70,7 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)
return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
}
#elif (defined(__amd64_) || defined(__x86_64__))
#elif (defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE uint64_t
atomic_add_uint64(uint64_t *p, uint64_t x)
{
@@ -133,7 +133,7 @@ atomic_sub_uint32(uint32_t *p, uint32_t x)
return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
}
#elif (defined(__i386__) || defined(__amd64_) || defined(__x86_64__))
#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE uint32_t
atomic_add_uint32(uint32_t *p, uint32_t x)
{

include/jemalloc/internal/ctl.h

@@ -40,7 +40,7 @@ struct ctl_arena_stats_s {
uint64_t ndalloc_small;
uint64_t nrequests_small;
malloc_bin_stats_t *bstats; /* nbins elements. */
malloc_bin_stats_t bstats[NBINS];
malloc_large_stats_t *lstats; /* nlclasses elements. */
};

include/jemalloc/internal/jemalloc_internal.h.in

@@ -229,33 +229,48 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
/* Size of stack-allocated buffer passed to buferror(). */
#define BUFERROR_BUF 64
/* Minimum alignment of allocations is 2^LG_QUANTUM bytes. */
#ifdef __i386__
# define LG_QUANTUM 4
#endif
#ifdef __ia64__
# define LG_QUANTUM 4
#endif
#ifdef __alpha__
# define LG_QUANTUM 4
#endif
#ifdef __sparc64__
# define LG_QUANTUM 4
#endif
#if (defined(__amd64__) || defined(__x86_64__))
# define LG_QUANTUM 4
#endif
#ifdef __arm__
# define LG_QUANTUM 3
#endif
#ifdef __mips__
# define LG_QUANTUM 3
#endif
#ifdef __powerpc__
# define LG_QUANTUM 4
#endif
#ifdef __s390x__
# define LG_QUANTUM 4
/* Smallest size class to support. */
#define LG_TINY_MIN 3
#define TINY_MIN (1U << LG_TINY_MIN)
/*
* Minimum alignment of allocations is 2^LG_QUANTUM bytes (ignoring tiny size
* classes).
*/
#ifndef LG_QUANTUM
# ifdef __i386__
# define LG_QUANTUM 4
# endif
# ifdef __ia64__
# define LG_QUANTUM 4
# endif
# ifdef __alpha__
# define LG_QUANTUM 4
# endif
# ifdef __sparc64__
# define LG_QUANTUM 4
# endif
# if (defined(__amd64__) || defined(__x86_64__))
# define LG_QUANTUM 4
# endif
# ifdef __arm__
# define LG_QUANTUM 3
# endif
# ifdef __mips__
# define LG_QUANTUM 3
# endif
# ifdef __powerpc__
# define LG_QUANTUM 4
# endif
# ifdef __s390x__
# define LG_QUANTUM 4
# endif
# ifdef __tile__
# define LG_QUANTUM 4
# endif
# ifndef LG_QUANTUM
# error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS"
# endif
#endif
#define QUANTUM ((size_t)(1U << LG_QUANTUM))
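The same add-and-mask idiom used by SUBPAGE_CEILING and CACHELINE_CEILING applies to the quantum, and QUANTUM_CEILING shows up in the arena_malloc hunk above. A worked sketch of the rounding, assuming LG_QUANTUM = 4 (the x86-64 case); the mask and ceiling macros are written out locally here rather than taken from the header:

#include <assert.h>
#include <stddef.h>

#define LG_QUANTUM   4
#define QUANTUM      ((size_t)(1U << LG_QUANTUM))
#define QUANTUM_MASK (QUANTUM - 1)

/* Return the smallest quantum multiple that is >= a. */
#define QUANTUM_CEILING(a) \
    (((a) + QUANTUM_MASK) & ~QUANTUM_MASK)

int
main(void)
{
    /* With a 16-byte quantum, requests round up to 16, 32, 48, ... */
    assert(QUANTUM_CEILING(1) == 16);
    assert(QUANTUM_CEILING(16) == 16);
    assert(QUANTUM_CEILING(17) == 32);
    assert(QUANTUM_CEILING(33) == 48);
    assert(QUANTUM_CEILING(48) == 48);
    return (0);
}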
@@ -291,15 +306,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#define CACHELINE_CEILING(s) \
(((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
/*
* Page size. STATIC_PAGE_SHIFT is determined by the configure script. If
* DYNAMIC_PAGE_SHIFT is enabled, only use the STATIC_PAGE_* macros where
* compile-time values are required for the purposes of defining data
* structures.
*/
/* Page size. STATIC_PAGE_SHIFT is determined by the configure script. */
#define STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT))
#define STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1))
#ifdef PAGE_SHIFT
# undef PAGE_SHIFT
#endif
@@ -309,16 +318,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#ifdef PAGE_MASK
# undef PAGE_MASK
#endif
#ifdef DYNAMIC_PAGE_SHIFT
# define PAGE_SHIFT lg_pagesize
# define PAGE_SIZE pagesize
# define PAGE_MASK pagesize_mask
#else
# define PAGE_SHIFT STATIC_PAGE_SHIFT
# define PAGE_SIZE STATIC_PAGE_SIZE
# define PAGE_MASK STATIC_PAGE_MASK
#endif
#define PAGE_SHIFT STATIC_PAGE_SHIFT
#define PAGE_SIZE STATIC_PAGE_SIZE
#define PAGE_MASK STATIC_PAGE_MASK
/* Return the smallest pagesize multiple that is >= s. */
#define PAGE_CEILING(s) \
@@ -327,6 +329,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -352,6 +355,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -455,6 +459,7 @@ void jemalloc_postfork(void);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -480,6 +485,7 @@ void jemalloc_postfork(void);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -525,7 +531,7 @@ JEMALLOC_INLINE size_t
s2u(size_t size)
{
if (size <= small_maxclass)
if (size <= SMALL_MAXCLASS)
return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size);
if (size <= arena_maxclass)
return (PAGE_CEILING(size));
@@ -570,7 +576,7 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p)
}
if (usize <= arena_maxclass && alignment <= PAGE_SIZE) {
if (usize <= small_maxclass)
if (usize <= SMALL_MAXCLASS)
return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size);
return (PAGE_CEILING(usize));
} else {

include/jemalloc/internal/mb.h

@@ -54,7 +54,7 @@ mb_write(void)
);
#endif
}
#elif (defined(__amd64_) || defined(__x86_64__))
#elif (defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE void
mb_write(void)
{
@@ -87,6 +87,13 @@ mb_write(void)
: "memory" /* Clobbers. */
);
}
#elif defined(__tile__)
JEMALLOC_INLINE void
mb_write(void)
{
__sync_synchronize();
}
#else
/*
* This is much slower than a simple memory barrier, but the semantics of mutex

include/jemalloc/internal/size_classes.sh

@@ -0,0 +1,132 @@
#!/bin/sh
# The following limits are chosen such that they cover all supported platforms.
# Range of quanta.
lg_qmin=3
lg_qmax=4
# The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)].
lg_tmin=3
# Range of page sizes.
lg_pmin=12
lg_pmax=16
function pow2() {
e=$1
pow2_result=1
while [ ${e} -gt 0 ] ; do
pow2_result=`expr ${pow2_result} + ${pow2_result}`
e=`expr ${e} - 1`
done
}
cat <<EOF
/* This file was automatically generated by size_classes.sh. */
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
EOF
lg_q=${lg_qmin}
while [ ${lg_q} -le ${lg_qmax} ] ; do
lg_t=${lg_tmin}
while [ ${lg_t} -le ${lg_q} ] ; do
lg_p=${lg_pmin}
while [ ${lg_p} -le ${lg_pmax} ] ; do
cat <<EOF
#if (LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && PAGE_SHIFT == ${lg_p})
#define SIZE_CLASSES_DEFINED
EOF
pow2 ${lg_q}; q=${pow2_result}
pow2 ${lg_t}; t=${pow2_result}
pow2 ${lg_p}; p=${pow2_result}
bin=0
psz=0
sz=${t}
delta=`expr ${sz} - ${psz}`
cat <<EOF
/* SIZE_CLASS(bin, delta, sz) */
#define SIZE_CLASSES \\
EOF
# Tiny size classes.
while [ ${sz} -lt ${q} ] ; do
cat <<EOF
SIZE_CLASS(${bin}, ${delta}, ${sz}) \\
EOF
bin=`expr ${bin} + 1`
psz=${sz}
sz=`expr ${sz} + ${sz}`
delta=`expr ${sz} - ${psz}`
done
# Quantum-multiple size classes. For each doubling of sz, as many as 4
# size classes exist. Their spacing is the greater of:
# - q
# - sz/4, where sz is a power of 2
while [ ${sz} -lt ${p} ] ; do
if [ ${sz} -ge `expr ${q} \* 4` ] ; then
i=`expr ${sz} / 4`
else
i=${q}
fi
next_2pow=`expr ${sz} \* 2`
while [ ${sz} -lt $next_2pow ] ; do
cat <<EOF
SIZE_CLASS(${bin}, ${delta}, ${sz}) \\
EOF
bin=`expr ${bin} + 1`
psz=${sz}
sz=`expr ${sz} + ${i}`
delta=`expr ${sz} - ${psz}`
done
done
cat <<EOF
#define NBINS ${bin}
#define SMALL_MAXCLASS ${psz}
#endif
EOF
lg_p=`expr ${lg_p} + 1`
done
lg_t=`expr ${lg_t} + 1`
done
lg_q=`expr ${lg_q} + 1`
done
cat <<EOF
#ifndef SIZE_CLASSES_DEFINED
# error "No size class definitions match configuration"
#endif
#undef SIZE_CLASSES_DEFINED
/*
* The small_size2bin lookup table uses uint8_t to encode each bin index, so we
* cannot support more than 256 small size classes. Further constrain NBINS to
* 255 to support prof_promote, since all small size classes, plus a "not
* small" size class must be stored in 8 bits of arena_chunk_map_t's bits
* field.
*/
#if (NBINS > 255)
# error "Too many small size classes"
#endif
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
EOF
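What the script emits is an X-macro: each SIZE_CLASS(bin, delta, sz) entry can be expanded into whatever table the consumer needs by redefining SIZE_CLASS before invoking SIZE_CLASSES. A minimal consumer sketch; the SIZE_CLASSES below is a hand-written excerpt (the first few classes for LG_TINY_MIN=3, LG_QUANTUM=4), not the full generated list:

#include <stdio.h>
#include <stddef.h>

/* Hand-written excerpt in the generated format: SIZE_CLASS(bin, delta, sz). */
#define SIZE_CLASSES         \
    SIZE_CLASS(0,  8,  8)    \
    SIZE_CLASS(1,  8, 16)    \
    SIZE_CLASS(2, 16, 32)    \
    SIZE_CLASS(3, 16, 48)    \
    SIZE_CLASS(4, 16, 64)
#define NBINS 5

/* Expansion 1: a table of region sizes, indexed by bin. */
static const size_t reg_sizes[NBINS] = {
#define SIZE_CLASS(bin, delta, sz) sz,
    SIZE_CLASSES
#undef SIZE_CLASS
};

int
main(void)
{
    /* Expansion 2: per-class code, generated inline. */
#define SIZE_CLASS(bin, delta, sz) \
    printf("bin %d: size %d (delta %d)\n", bin, sz, delta);
    SIZE_CLASSES
#undef SIZE_CLASS

    printf("reg_sizes[NBINS - 1] = %zu\n", reg_sizes[NBINS - 1]);
    return (0);
}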

include/jemalloc/internal/tcache.h

@@ -91,7 +91,7 @@ extern __thread tcache_t *tcache_tls
extern pthread_key_t tcache_tsd;
/*
* Number of tcache bins. There are nbins small-object bins, plus 0 or more
* Number of tcache bins. There are NBINS small-object bins, plus 0 or more
* large-object bins.
*/
extern size_t nhbins;
@@ -181,7 +181,7 @@ tcache_event(tcache_t *tcache)
* Flush (ceiling) 3/4 of the objects below the low
* water mark.
*/
if (binind < nbins) {
if (binind < NBINS) {
tcache_bin_flush_small(tbin, binind,
tbin->ncached - tbin->low_water +
(tbin->low_water >> 2), tcache);
@@ -238,7 +238,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
tcache_bin_t *tbin;
binind = SMALL_SIZE2BIN(size);
assert(binind < nbins);
assert(binind < NBINS);
tbin = &tcache->tbins[binind];
ret = tcache_alloc_easy(tbin);
if (ret == NULL) {
@@ -275,7 +275,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
size = PAGE_CEILING(size);
assert(size <= tcache_maxclass);
binind = nbins + (size >> PAGE_SHIFT) - 1;
binind = NBINS + (size >> PAGE_SHIFT) - 1;
assert(binind < nhbins);
tbin = &tcache->tbins[binind];
ret = tcache_alloc_easy(tbin);
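The large-object index arithmetic above packs page-multiple size classes directly after the small bins: binind = NBINS + (size >> PAGE_SHIFT) - 1. A worked sketch with assumed values NBINS = 28 and PAGE_SHIFT = 12 (4 KiB pages); the real constants come from size_classes.h and the configure script:

#include <assert.h>
#include <stddef.h>

#define NBINS 28        /* Assumed; generated by size_classes.sh. */
#ifdef PAGE_SHIFT       /* Mirror the header's guard against system macros. */
# undef PAGE_SHIFT
#endif
#define PAGE_SHIFT 12   /* Assumed 4 KiB pages. */

/* Map a page-multiple large size to its tcache bin index. */
static size_t
large_binind(size_t size)
{
    return (NBINS + (size >> PAGE_SHIFT) - 1);
}

int
main(void)
{
    size_t page = (size_t)1 << PAGE_SHIFT;

    /* The first large class (one page) lands right after the small bins. */
    assert(large_binind(1 * page) == NBINS);
    assert(large_binind(2 * page) == NBINS + 1);
    assert(large_binind(8 * page) == NBINS + 7);
    return (0);
}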
@@ -328,7 +328,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
size_t pageind, binind;
arena_chunk_map_t *mapelm;
assert(arena_salloc(ptr) <= small_maxclass);
assert(arena_salloc(ptr) <= SMALL_MAXCLASS);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
@@ -339,7 +339,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
bin = run->bin;
binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) /
sizeof(arena_bin_t);
assert(binind < nbins);
assert(binind < NBINS);
if (config_fill && opt_junk)
memset(ptr, 0x5a, arena_bin_info[binind].reg_size);
@@ -367,13 +367,13 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
tcache_bin_info_t *tbin_info;
assert((size & PAGE_MASK) == 0);
assert(arena_salloc(ptr) > small_maxclass);
assert(arena_salloc(ptr) > SMALL_MAXCLASS);
assert(arena_salloc(ptr) <= tcache_maxclass);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
binind = nbins + (size >> PAGE_SHIFT) - 1;
binind = NBINS + (size >> PAGE_SHIFT) - 1;
if (config_fill && opt_junk)
memset(ptr, 0x5a, size);