Simplify small size class infrastructure.

Program-generate small size class tables for all valid combinations of
LG_TINY_MIN, LG_QUANTUM, and PAGE_SHIFT.  Use the appropriate table to generate
all relevant data structures, and remove the distinction between
tiny/quantum/cacheline/subpage bins.

Remove --enable-dynamic-page-shift.  This option didn't prove useful in
practice, and it prevented optimizations.

Add Tilera architecture support.
Jason Evans
2012-02-28 16:50:47 -08:00
parent 5389146191
commit b172610317
17 changed files with 327 additions and 858 deletions
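The main structural payoff of compile-time size class generation shows up in the arena.h and ctl.h hunks below: the dynamically sized trailer `arena_bin_t bins[1]` becomes a fixed `arena_bin_t bins[NBINS]`, and `malloc_bin_stats_t *bstats` becomes an in-struct array. A minimal sketch of the difference, using a stand-in bin_t and an assumed NBINS of 28 rather than jemalloc's real types:

#include <stdio.h>
#include <stdlib.h>

#define NBINS 28 /* Assumed here; the real value comes from size_classes.h. */

typedef struct { size_t reg_size; } bin_t; /* Stand-in for arena_bin_t. */

/* Old pattern: one-element trailing array, sized at runtime. */
typedef struct {
    unsigned nbins;
    bin_t bins[1]; /* Dynamically sized. */
} arena_old_t;

/* New pattern: NBINS is a compile-time constant, so the array is fixed. */
typedef struct {
    bin_t bins[NBINS];
} arena_new_t;

int
main(void)
{
    /* Old: allocate the struct plus space for the remaining bins. */
    arena_old_t *a_old = malloc(sizeof(arena_old_t) +
        (NBINS - 1) * sizeof(bin_t));
    a_old->nbins = NBINS;

    /* New: a plain sizeof suffices, and bins[] indexing is static. */
    arena_new_t *a_new = malloc(sizeof(arena_new_t));

    printf("old alloc: %zu bytes, new alloc: %zu bytes\n",
        sizeof(arena_old_t) + (NBINS - 1) * sizeof(bin_t),
        sizeof(arena_new_t));
    free(a_old);
    free(a_new);
    return (0);
}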

include/jemalloc/internal/arena.h

@@ -1,39 +1,6 @@
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
/*
* Subpages are an artificially designated partitioning of pages. Their only
* purpose is to support subpage-spaced size classes.
*
* There must be at least 4 subpages per page, due to the way size classes are
* handled.
*/
#define LG_SUBPAGE 8
#define SUBPAGE ((size_t)(1U << LG_SUBPAGE))
#define SUBPAGE_MASK (SUBPAGE - 1)
/* Return the smallest subpage multiple that is >= s. */
#define SUBPAGE_CEILING(s) \
(((s) + SUBPAGE_MASK) & ~SUBPAGE_MASK)
/* Smallest size class to support. */
#define LG_TINY_MIN 3
#define TINY_MIN (1U << LG_TINY_MIN)
/*
* Maximum size class that is a multiple of the quantum, but not (necessarily)
* a power of 2. Above this size, allocations are rounded up to the nearest
* power of 2.
*/
#define LG_QSPACE_MAX_DEFAULT 7
/*
* Maximum size class that is a multiple of the cacheline, but not (necessarily)
* a power of 2. Above this size, allocations are rounded up to the nearest
* power of 2.
*/
#define LG_CSPACE_MAX_DEFAULT 9
/*
* RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized
* as small as possible such that this setting is still honored, without
@@ -364,75 +331,26 @@ struct arena_s {
arena_avail_tree_t runs_avail_clean;
arena_avail_tree_t runs_avail_dirty;
/*
* bins is used to store trees of free regions of the following sizes,
* assuming a 64-bit system with 16-byte quantum, 4 KiB page size, and
* default MALLOC_CONF.
*
* bins[i] | size |
* --------+--------+
* 0 | 8 |
* --------+--------+
* 1 | 16 |
* 2 | 32 |
* 3 | 48 |
* : :
* 6 | 96 |
* 7 | 112 |
* 8 | 128 |
* --------+--------+
* 9 | 192 |
* 10 | 256 |
* 11 | 320 |
* 12 | 384 |
* 13 | 448 |
* 14 | 512 |
* --------+--------+
* 15 | 768 |
* 16 | 1024 |
* 17 | 1280 |
* : :
* 25 | 3328 |
* 26 | 3584 |
* 27 | 3840 |
* --------+--------+
*/
arena_bin_t bins[1]; /* Dynamically sized. */
/* bins is used to store trees of free regions. */
arena_bin_t bins[NBINS];
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
extern size_t opt_lg_qspace_max;
extern size_t opt_lg_cspace_max;
extern ssize_t opt_lg_dirty_mult;
/*
* small_size2bin is a compact lookup table that rounds request sizes up to
* size classes. In order to reduce cache footprint, the table is compressed,
* and all accesses are via the SMALL_SIZE2BIN macro.
*/
extern uint8_t const *small_size2bin;
extern uint8_t const small_size2bin[];
#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN])
extern arena_bin_info_t *arena_bin_info;
/* Various bin-related settings. */
/* Number of (2^n)-spaced tiny bins. */
#define ntbins ((unsigned)(LG_QUANTUM - LG_TINY_MIN))
extern unsigned nqbins; /* Number of quantum-spaced bins. */
extern unsigned ncbins; /* Number of cacheline-spaced bins. */
extern unsigned nsbins; /* Number of subpage-spaced bins. */
extern unsigned nbins;
#define tspace_max ((size_t)(QUANTUM >> 1))
#define qspace_min QUANTUM
extern size_t qspace_max;
extern size_t cspace_min;
extern size_t cspace_max;
extern size_t sspace_min;
extern size_t sspace_max;
#define small_maxclass sspace_max
extern arena_bin_info_t arena_bin_info[NBINS];
/* Number of large size classes. */
#define nlclasses (chunk_npages - map_bias)
void arena_purge_all(arena_t *arena);
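The small_size2bin table declared above is compressed by a factor of TINY_MIN: since no two size classes fall within the same 2^LG_TINY_MIN-aligned interval, one byte per 2^LG_TINY_MIN bytes of request size suffices, and SMALL_SIZE2BIN indexes it with (s-1) >> LG_TINY_MIN. A standalone sketch of that indexing, with LG_TINY_MIN = 3 and a hand-picked prefix of size classes (8, 16, 32, 48, 64) standing in for the generated table:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LG_TINY_MIN 3

/*
 * One entry per 8-byte interval of request size, covering requests of
 * 1..64 bytes.  Entry i holds the bin index for sizes in
 * (i << LG_TINY_MIN, (i+1) << LG_TINY_MIN].
 */
static const uint8_t small_size2bin[] = {
    0,      /*  1..8   -> bin 0 (size  8) */
    1,      /*  9..16  -> bin 1 (size 16) */
    2, 2,   /* 17..32  -> bin 2 (size 32) */
    3, 3,   /* 33..48  -> bin 3 (size 48) */
    4, 4    /* 49..64  -> bin 4 (size 64) */
};

#define SMALL_SIZE2BIN(s) (small_size2bin[((s) - 1) >> LG_TINY_MIN])

int
main(void)
{
    assert(SMALL_SIZE2BIN(1) == 0);
    assert(SMALL_SIZE2BIN(8) == 0);
    assert(SMALL_SIZE2BIN(9) == 1);
    assert(SMALL_SIZE2BIN(17) == 2);
    assert(SMALL_SIZE2BIN(48) == 3);
    assert(SMALL_SIZE2BIN(64) == 4);
    printf("lookup table entries: %zu\n", sizeof(small_size2bin));
    return (0);
}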
@@ -457,7 +375,7 @@ void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
void *arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero);
bool arena_new(arena_t *arena, unsigned ind);
bool arena_boot(void);
void arena_boot(void);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
@@ -478,7 +396,7 @@ JEMALLOC_INLINE size_t
arena_bin_index(arena_t *arena, arena_bin_t *bin)
{
size_t binind = bin - arena->bins;
assert(binind < nbins);
assert(binind < NBINS);
return (binind);
}
@@ -633,7 +551,7 @@ arena_malloc(size_t size, bool zero)
assert(size != 0);
assert(QUANTUM_CEILING(size) <= arena_maxclass);
if (size <= small_maxclass) {
if (size <= SMALL_MAXCLASS) {
if ((tcache = tcache_get()) != NULL)
return (tcache_alloc_small(tcache, size, zero));
else

include/jemalloc/internal/atomic.h

@@ -70,7 +70,7 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)
return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
}
#elif (defined(__amd64_) || defined(__x86_64__))
#elif (defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE uint64_t
atomic_add_uint64(uint64_t *p, uint64_t x)
{
@@ -133,7 +133,7 @@ atomic_sub_uint32(uint32_t *p, uint32_t x)
return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
}
#elif (defined(__i386__) || defined(__amd64_) || defined(__x86_64__))
#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE uint32_t
atomic_add_uint32(uint32_t *p, uint32_t x)
{

include/jemalloc/internal/ctl.h

@@ -40,7 +40,7 @@ struct ctl_arena_stats_s {
uint64_t ndalloc_small;
uint64_t nrequests_small;
malloc_bin_stats_t *bstats; /* nbins elements. */
malloc_bin_stats_t bstats[NBINS];
malloc_large_stats_t *lstats; /* nlclasses elements. */
};

include/jemalloc/internal/jemalloc_internal.h.in

@@ -229,33 +229,48 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
/* Size of stack-allocated buffer passed to buferror(). */
#define BUFERROR_BUF 64
/* Minimum alignment of allocations is 2^LG_QUANTUM bytes. */
#ifdef __i386__
# define LG_QUANTUM 4
#endif
#ifdef __ia64__
# define LG_QUANTUM 4
#endif
#ifdef __alpha__
# define LG_QUANTUM 4
#endif
#ifdef __sparc64__
# define LG_QUANTUM 4
#endif
#if (defined(__amd64__) || defined(__x86_64__))
# define LG_QUANTUM 4
#endif
#ifdef __arm__
# define LG_QUANTUM 3
#endif
#ifdef __mips__
# define LG_QUANTUM 3
#endif
#ifdef __powerpc__
# define LG_QUANTUM 4
#endif
#ifdef __s390x__
# define LG_QUANTUM 4
/* Smallest size class to support. */
#define LG_TINY_MIN 3
#define TINY_MIN (1U << LG_TINY_MIN)
/*
* Minimum alignment of allocations is 2^LG_QUANTUM bytes (ignoring tiny size
* classes).
*/
#ifndef LG_QUANTUM
# ifdef __i386__
# define LG_QUANTUM 4
# endif
# ifdef __ia64__
# define LG_QUANTUM 4
# endif
# ifdef __alpha__
# define LG_QUANTUM 4
# endif
# ifdef __sparc64__
# define LG_QUANTUM 4
# endif
# if (defined(__amd64__) || defined(__x86_64__))
# define LG_QUANTUM 4
# endif
# ifdef __arm__
# define LG_QUANTUM 3
# endif
# ifdef __mips__
# define LG_QUANTUM 3
# endif
# ifdef __powerpc__
# define LG_QUANTUM 4
# endif
# ifdef __s390x__
# define LG_QUANTUM 4
# endif
# ifdef __tile__
# define LG_QUANTUM 4
# endif
# ifndef LG_QUANTUM
# error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS"
# endif
#endif
#define QUANTUM ((size_t)(1U << LG_QUANTUM))
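The same add-and-mask idiom used by SUBPAGE_CEILING and CACHELINE_CEILING applies to the quantum, and QUANTUM_CEILING shows up in the arena_malloc hunk above. A worked sketch of the rounding, assuming LG_QUANTUM = 4 (the x86-64 case); the mask and ceiling macros are written out locally here rather than taken from the header:

#include <assert.h>
#include <stddef.h>

#define LG_QUANTUM   4
#define QUANTUM      ((size_t)(1U << LG_QUANTUM))
#define QUANTUM_MASK (QUANTUM - 1)

/* Return the smallest quantum multiple that is >= a. */
#define QUANTUM_CEILING(a) \
    (((a) + QUANTUM_MASK) & ~QUANTUM_MASK)

int
main(void)
{
    /* With a 16-byte quantum, requests round up to 16, 32, 48, ... */
    assert(QUANTUM_CEILING(1) == 16);
    assert(QUANTUM_CEILING(16) == 16);
    assert(QUANTUM_CEILING(17) == 32);
    assert(QUANTUM_CEILING(33) == 48);
    assert(QUANTUM_CEILING(48) == 48);
    return (0);
}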
@@ -291,15 +306,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#define CACHELINE_CEILING(s) \
(((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
/*
* Page size. STATIC_PAGE_SHIFT is determined by the configure script. If
* DYNAMIC_PAGE_SHIFT is enabled, only use the STATIC_PAGE_* macros where
* compile-time values are required for the purposes of defining data
* structures.
*/
/* Page size. STATIC_PAGE_SHIFT is determined by the configure script. */
#define STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT))
#define STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1))
#ifdef PAGE_SHIFT
# undef PAGE_SHIFT
#endif
@@ -309,16 +318,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#ifdef PAGE_MASK
# undef PAGE_MASK
#endif
#ifdef DYNAMIC_PAGE_SHIFT
# define PAGE_SHIFT lg_pagesize
# define PAGE_SIZE pagesize
# define PAGE_MASK pagesize_mask
#else
# define PAGE_SHIFT STATIC_PAGE_SHIFT
# define PAGE_SIZE STATIC_PAGE_SIZE
# define PAGE_MASK STATIC_PAGE_MASK
#endif
#define PAGE_SHIFT STATIC_PAGE_SHIFT
#define PAGE_SIZE STATIC_PAGE_SIZE
#define PAGE_MASK STATIC_PAGE_MASK
/* Return the smallest pagesize multiple that is >= s. */
#define PAGE_CEILING(s) \
@@ -327,6 +329,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -352,6 +355,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -455,6 +459,7 @@ void jemalloc_postfork(void);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -480,6 +485,7 @@ void jemalloc_postfork(void);
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ctl.h"
#include "jemalloc/internal/mutex.h"
@@ -525,7 +531,7 @@ JEMALLOC_INLINE size_t
s2u(size_t size)
{
if (size <= small_maxclass)
if (size <= SMALL_MAXCLASS)
return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size);
if (size <= arena_maxclass)
return (PAGE_CEILING(size));
@@ -570,7 +576,7 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p)
}
if (usize <= arena_maxclass && alignment <= PAGE_SIZE) {
if (usize <= small_maxclass)
if (usize <= SMALL_MAXCLASS)
return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size);
return (PAGE_CEILING(usize));
} else {

include/jemalloc/internal/mb.h

@@ -54,7 +54,7 @@ mb_write(void)
);
#endif
}
#elif (defined(__amd64_) || defined(__x86_64__))
#elif (defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE void
mb_write(void)
{
@@ -87,6 +87,13 @@ mb_write(void)
: "memory" /* Clobbers. */
);
}
#elif defined(__tile__)
JEMALLOC_INLINE void
mb_write(void)
{
__sync_synchronize();
}
#else
/*
* This is much slower than a simple memory barrier, but the semantics of mutex

include/jemalloc/internal/size_classes.sh

@@ -0,0 +1,132 @@
#!/bin/sh
# The following limits are chosen such that they cover all supported platforms.
# Range of quanta.
lg_qmin=3
lg_qmax=4
# The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)].
lg_tmin=3
# Range of page sizes.
lg_pmin=12
lg_pmax=16
function pow2() {
e=$1
pow2_result=1
while [ ${e} -gt 0 ] ; do
pow2_result=`expr ${pow2_result} + ${pow2_result}`
e=`expr ${e} - 1`
done
}
cat <<EOF
/* This file was automatically generated by size_classes.sh. */
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
EOF
lg_q=${lg_qmin}
while [ ${lg_q} -le ${lg_qmax} ] ; do
lg_t=${lg_tmin}
while [ ${lg_t} -le ${lg_q} ] ; do
lg_p=${lg_pmin}
while [ ${lg_p} -le ${lg_pmax} ] ; do
cat <<EOF
#if (LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && PAGE_SHIFT == ${lg_p})
#define SIZE_CLASSES_DEFINED
EOF
pow2 ${lg_q}; q=${pow2_result}
pow2 ${lg_t}; t=${pow2_result}
pow2 ${lg_p}; p=${pow2_result}
bin=0
psz=0
sz=${t}
delta=`expr ${sz} - ${psz}`
cat <<EOF
/* SIZE_CLASS(bin, delta, sz) */
#define SIZE_CLASSES \\
EOF
# Tiny size classes.
while [ ${sz} -lt ${q} ] ; do
cat <<EOF
SIZE_CLASS(${bin}, ${delta}, ${sz}) \\
EOF
bin=`expr ${bin} + 1`
psz=${sz}
sz=`expr ${sz} + ${sz}`
delta=`expr ${sz} - ${psz}`
done
# Quantum-multiple size classes. For each doubling of sz, as many as 4
# size classes exist. Their spacing is the greater of:
# - q
# - sz/4, where sz is a power of 2
while [ ${sz} -lt ${p} ] ; do
if [ ${sz} -ge `expr ${q} \* 4` ] ; then
i=`expr ${sz} / 4`
else
i=${q}
fi
next_2pow=`expr ${sz} \* 2`
while [ ${sz} -lt $next_2pow ] ; do
cat <<EOF
SIZE_CLASS(${bin}, ${delta}, ${sz}) \\
EOF
bin=`expr ${bin} + 1`
psz=${sz}
sz=`expr ${sz} + ${i}`
delta=`expr ${sz} - ${psz}`
done
done
cat <<EOF
#define NBINS ${bin}
#define SMALL_MAXCLASS ${psz}
#endif
EOF
lg_p=`expr ${lg_p} + 1`
done
lg_t=`expr ${lg_t} + 1`
done
lg_q=`expr ${lg_q} + 1`
done
cat <<EOF
#ifndef SIZE_CLASSES_DEFINED
# error "No size class definitions match configuration"
#endif
#undef SIZE_CLASSES_DEFINED
/*
* The small_size2bin lookup table uses uint8_t to encode each bin index, so we
* cannot support more than 256 small size classes. Further constrain NBINS to
* 255 to support prof_promote, since all small size classes, plus a "not
* small" size class must be stored in 8 bits of arena_chunk_map_t's bits
* field.
*/
#if (NBINS > 255)
# error "Too many small size classes"
#endif
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
EOF
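What the script emits is an X-macro: each SIZE_CLASS(bin, delta, sz) entry can be expanded into whatever table the consumer needs by redefining SIZE_CLASS before invoking SIZE_CLASSES. A minimal consumer sketch; the SIZE_CLASSES below is a hand-written excerpt (the first few classes for LG_TINY_MIN=3, LG_QUANTUM=4), not the full generated list:

#include <stdio.h>
#include <stddef.h>

/* Hand-written excerpt in the generated format: SIZE_CLASS(bin, delta, sz). */
#define SIZE_CLASSES         \
    SIZE_CLASS(0,  8,  8)    \
    SIZE_CLASS(1,  8, 16)    \
    SIZE_CLASS(2, 16, 32)    \
    SIZE_CLASS(3, 16, 48)    \
    SIZE_CLASS(4, 16, 64)
#define NBINS 5

/* Expansion 1: a table of region sizes, indexed by bin. */
static const size_t reg_sizes[NBINS] = {
#define SIZE_CLASS(bin, delta, sz) sz,
    SIZE_CLASSES
#undef SIZE_CLASS
};

int
main(void)
{
    /* Expansion 2: per-class code, generated inline. */
#define SIZE_CLASS(bin, delta, sz) \
    printf("bin %d: size %d (delta %d)\n", bin, sz, delta);
    SIZE_CLASSES
#undef SIZE_CLASS

    printf("reg_sizes[NBINS - 1] = %zu\n", reg_sizes[NBINS - 1]);
    return (0);
}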

include/jemalloc/internal/tcache.h

@@ -91,7 +91,7 @@ extern __thread tcache_t *tcache_tls
extern pthread_key_t tcache_tsd;
/*
* Number of tcache bins. There are nbins small-object bins, plus 0 or more
* Number of tcache bins. There are NBINS small-object bins, plus 0 or more
* large-object bins.
*/
extern size_t nhbins;
@@ -181,7 +181,7 @@ tcache_event(tcache_t *tcache)
* Flush (ceiling) 3/4 of the objects below the low
* water mark.
*/
if (binind < nbins) {
if (binind < NBINS) {
tcache_bin_flush_small(tbin, binind,
tbin->ncached - tbin->low_water +
(tbin->low_water >> 2), tcache);
@@ -238,7 +238,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
tcache_bin_t *tbin;
binind = SMALL_SIZE2BIN(size);
assert(binind < nbins);
assert(binind < NBINS);
tbin = &tcache->tbins[binind];
ret = tcache_alloc_easy(tbin);
if (ret == NULL) {
@@ -275,7 +275,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
size = PAGE_CEILING(size);
assert(size <= tcache_maxclass);
binind = nbins + (size >> PAGE_SHIFT) - 1;
binind = NBINS + (size >> PAGE_SHIFT) - 1;
assert(binind < nhbins);
tbin = &tcache->tbins[binind];
ret = tcache_alloc_easy(tbin);
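The large-object index arithmetic above packs page-multiple size classes directly after the small bins: binind = NBINS + (size >> PAGE_SHIFT) - 1. A worked sketch with assumed values NBINS = 28 and PAGE_SHIFT = 12 (4 KiB pages); the real constants come from size_classes.h and the configure script:

#include <assert.h>
#include <stddef.h>

#define NBINS 28        /* Assumed; generated by size_classes.sh. */
#ifdef PAGE_SHIFT       /* Mirror the header's guard against system macros. */
# undef PAGE_SHIFT
#endif
#define PAGE_SHIFT 12   /* Assumed 4 KiB pages. */

/* Map a page-multiple large size to its tcache bin index. */
static size_t
large_binind(size_t size)
{
    return (NBINS + (size >> PAGE_SHIFT) - 1);
}

int
main(void)
{
    size_t page = (size_t)1 << PAGE_SHIFT;

    /* The first large class (one page) lands right after the small bins. */
    assert(large_binind(1 * page) == NBINS);
    assert(large_binind(2 * page) == NBINS + 1);
    assert(large_binind(8 * page) == NBINS + 7);
    return (0);
}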
@@ -328,7 +328,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
size_t pageind, binind;
arena_chunk_map_t *mapelm;
assert(arena_salloc(ptr) <= small_maxclass);
assert(arena_salloc(ptr) <= SMALL_MAXCLASS);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
@@ -339,7 +339,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
bin = run->bin;
binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) /
sizeof(arena_bin_t);
assert(binind < nbins);
assert(binind < NBINS);
if (config_fill && opt_junk)
memset(ptr, 0x5a, arena_bin_info[binind].reg_size);
@@ -367,13 +367,13 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
tcache_bin_info_t *tbin_info;
assert((size & PAGE_MASK) == 0);
assert(arena_salloc(ptr) > small_maxclass);
assert(arena_salloc(ptr) > SMALL_MAXCLASS);
assert(arena_salloc(ptr) <= tcache_maxclass);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
binind = nbins + (size >> PAGE_SHIFT) - 1;
binind = NBINS + (size >> PAGE_SHIFT) - 1;
if (config_fill && opt_junk)
memset(ptr, 0x5a, size);