Simplify small object allocation/deallocation.

Use chained run free lists instead of bitmaps to track free objects
within small runs.

Remove reference counting for small object run pages.
This commit is contained in:
Jason Evans 2010-03-13 13:41:58 -08:00
parent 3fa9a2fad8
commit 1e0a636c11
2 changed files with 135 additions and 343 deletions

View File

@ -18,11 +18,7 @@
#ifdef JEMALLOC_TINY #ifdef JEMALLOC_TINY
/* Smallest size class to support. */ /* Smallest size class to support. */
# ifdef JEMALLOC_TCACHE # define LG_TINY_MIN LG_SIZEOF_PTR
# define LG_TINY_MIN LG_SIZEOF_PTR
# else
# define LG_TINY_MIN 1
# endif
#endif #endif
/* /*
@ -77,12 +73,6 @@
#define RUN_MAX_OVRHD 0x0000003dU #define RUN_MAX_OVRHD 0x0000003dU
#define RUN_MAX_OVRHD_RELAX 0x00001800U #define RUN_MAX_OVRHD_RELAX 0x00001800U
/* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */
#define RUN_MAX_SMALL \
(arena_maxclass <= (1U << (CHUNK_MAP_LG_PG_RANGE + PAGE_SHIFT)) \
? arena_maxclass : (1U << (CHUNK_MAP_LG_PG_RANGE + \
PAGE_SHIFT)))
/* /*
* The minimum ratio of active:dirty pages per arena is computed as: * The minimum ratio of active:dirty pages per arena is computed as:
* *
@ -130,7 +120,6 @@ struct arena_chunk_map_s {
* Small/medium: Don't care. * Small/medium: Don't care.
* Large: Run size for first page, unset for trailing pages. * Large: Run size for first page, unset for trailing pages.
* - : Unused. * - : Unused.
* c : refcount (could overflow for PAGE_SIZE >= 128 KiB)
* d : dirty? * d : dirty?
* z : zeroed? * z : zeroed?
* l : large? * l : large?
@ -150,9 +139,9 @@ struct arena_chunk_map_s {
* ssssssss ssssssss ssss---- -----z-- * ssssssss ssssssss ssss---- -----z--
* *
* Small/medium: * Small/medium:
* pppppppp ppppcccc cccccccc cccc---a * pppppppp pppppppp pppp---- -------a
* pppppppp ppppcccc cccccccc cccc---a * pppppppp pppppppp pppp---- -------a
* pppppppp ppppcccc cccccccc cccc---a * pppppppp pppppppp pppp---- -------a
* *
* Large: * Large:
* ssssssss ssssssss ssss---- ------la * ssssssss ssssssss ssss---- ------la
@ -160,12 +149,9 @@ struct arena_chunk_map_s {
* -------- -------- -------- ------la * -------- -------- -------- ------la
*/ */
size_t bits; size_t bits;
#define CHUNK_MAP_PG_MASK ((size_t)0xfff00000U) #define CHUNK_MAP_PG_MASK ((size_t)0xfffff000U)
#define CHUNK_MAP_PG_SHIFT 20 #define CHUNK_MAP_PG_SHIFT 12
#define CHUNK_MAP_LG_PG_RANGE 12 #define CHUNK_MAP_LG_PG_RANGE 20
#define CHUNK_MAP_RC_MASK ((size_t)0xffff0U)
#define CHUNK_MAP_RC_ONE ((size_t)0x00010U)
#define CHUNK_MAP_FLAGS_MASK ((size_t)0xfU) #define CHUNK_MAP_FLAGS_MASK ((size_t)0xfU)
#define CHUNK_MAP_DIRTY ((size_t)0x8U) #define CHUNK_MAP_DIRTY ((size_t)0x8U)
@ -209,14 +195,14 @@ struct arena_run_s {
/* Bin this run is associated with. */ /* Bin this run is associated with. */
arena_bin_t *bin; arena_bin_t *bin;
/* Index of first element that might have a free region. */ /* Stack of available freed regions, or NULL. */
unsigned regs_minelm; void *avail;
/* Next region that has never been allocated, or run boundary. */
void *next;
/* Number of free regions in run. */ /* Number of free regions in run. */
unsigned nfree; unsigned nfree;
/* Bitmask of in-use regions (0: in use, 1: free). */
unsigned regs_mask[1]; /* Dynamically sized. */
}; };
struct arena_bin_s { struct arena_bin_s {
@ -244,9 +230,6 @@ struct arena_bin_s {
/* Total number of regions in a run for this bin's size class. */ /* Total number of regions in a run for this bin's size class. */
uint32_t nregs; uint32_t nregs;
/* Number of elements in a run's regs_mask for this bin's size class. */
uint32_t regs_mask_nelms;
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
/* /*
* Offset of first (prof_cnt_t *) in a run header for this bin's size * Offset of first (prof_cnt_t *) in a run header for this bin's size

View File

@ -55,21 +55,15 @@ static const uint8_t const_small_size2bin[STATIC_PAGE_SIZE - 255] = {
#if (LG_QUANTUM == 4) #if (LG_QUANTUM == 4)
/* 16-byte quantum **********************/ /* 16-byte quantum **********************/
# ifdef JEMALLOC_TINY # ifdef JEMALLOC_TINY
# if (LG_TINY_MIN == 1) # if (LG_TINY_MIN == 2)
S2B_2(0) /* 2 */
S2B_2(1) /* 4 */
S2B_4(2) /* 8 */
S2B_8(3) /* 16 */
# define S2B_QMIN 3
# elif (LG_TINY_MIN == 2)
S2B_4(0) /* 4 */ S2B_4(0) /* 4 */
S2B_4(1) /* 8 */ S2B_4(1) /* 8 */
S2B_8(2) /* 16 */ S2B_8(2) /* 16 */
# define S2B_QMIN 2 # define S2B_QMIN 2
# elif (LG_TINY_MIN == 3) # elif (LG_TINY_MIN == 3)
S2B_8(0) /* 8 */ S2B_8(0) /* 8 */
S2B_8(1) /* 16 */ S2B_8(1) /* 16 */
# define S2B_QMIN 1 # define S2B_QMIN 1
# else # else
# error "Unsupported LG_TINY_MIN" # error "Unsupported LG_TINY_MIN"
# endif # endif
@ -88,15 +82,10 @@ static const uint8_t const_small_size2bin[STATIC_PAGE_SIZE - 255] = {
#else #else
/* 8-byte quantum ***********************/ /* 8-byte quantum ***********************/
# ifdef JEMALLOC_TINY # ifdef JEMALLOC_TINY
# if (LG_TINY_MIN == 1) # if (LG_TINY_MIN == 2)
S2B_2(0) /* 2 */
S2B_2(1) /* 4 */
S2B_4(2) /* 8 */
# define S2B_QMIN 2
# elif (LG_TINY_MIN == 2)
S2B_4(0) /* 4 */ S2B_4(0) /* 4 */
S2B_4(1) /* 8 */ S2B_4(1) /* 8 */
# define S2B_QMIN 1 # define S2B_QMIN 1
# else # else
# error "Unsupported LG_TINY_MIN" # error "Unsupported LG_TINY_MIN"
# endif # endif
@ -260,246 +249,48 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t, rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t,
arena_chunk_map_t, link, arena_avail_comp) arena_chunk_map_t, link, arena_avail_comp)
static inline void
arena_run_rc_incr(arena_run_t *run, arena_bin_t *bin, const void *ptr)
{
arena_chunk_t *chunk;
arena_t *arena;
size_t pagebeg, pageend, i;
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
pagebeg = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
pageend = ((uintptr_t)ptr + (uintptr_t)(bin->reg_size - 1) -
(uintptr_t)chunk) >> PAGE_SHIFT;
for (i = pagebeg; i <= pageend; i++) {
size_t mapbits = chunk->map[i].bits;
if (mapbits & CHUNK_MAP_DIRTY) {
assert((mapbits & CHUNK_MAP_RC_MASK) == 0);
chunk->ndirty--;
arena->ndirty--;
mapbits ^= CHUNK_MAP_DIRTY;
}
assert((mapbits & CHUNK_MAP_RC_MASK) != CHUNK_MAP_RC_MASK);
mapbits += CHUNK_MAP_RC_ONE;
chunk->map[i].bits = mapbits;
}
}
static inline void
arena_run_rc_decr(arena_run_t *run, arena_bin_t *bin, const void *ptr)
{
arena_chunk_t *chunk;
arena_t *arena;
size_t pagebeg, pageend, mapbits, i;
bool dirtier = false;
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
pagebeg = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
pageend = ((uintptr_t)ptr + (uintptr_t)(bin->reg_size - 1) -
(uintptr_t)chunk) >> PAGE_SHIFT;
/* First page. */
mapbits = chunk->map[pagebeg].bits;
mapbits -= CHUNK_MAP_RC_ONE;
if ((mapbits & CHUNK_MAP_RC_MASK) == 0) {
dirtier = true;
assert((mapbits & CHUNK_MAP_DIRTY) == 0);
mapbits |= CHUNK_MAP_DIRTY;
chunk->ndirty++;
arena->ndirty++;
}
chunk->map[pagebeg].bits = mapbits;
if (pageend - pagebeg >= 1) {
/*
* Interior pages are completely consumed by the object being
* deallocated, which means that the pages can be
* unconditionally marked dirty.
*/
for (i = pagebeg + 1; i < pageend; i++) {
mapbits = chunk->map[i].bits;
mapbits -= CHUNK_MAP_RC_ONE;
assert((mapbits & CHUNK_MAP_RC_MASK) == 0);
dirtier = true;
assert((mapbits & CHUNK_MAP_DIRTY) == 0);
mapbits |= CHUNK_MAP_DIRTY;
chunk->ndirty++;
arena->ndirty++;
chunk->map[i].bits = mapbits;
}
/* Last page. */
mapbits = chunk->map[pageend].bits;
mapbits -= CHUNK_MAP_RC_ONE;
if ((mapbits & CHUNK_MAP_RC_MASK) == 0) {
dirtier = true;
assert((mapbits & CHUNK_MAP_DIRTY) == 0);
mapbits |= CHUNK_MAP_DIRTY;
chunk->ndirty++;
arena->ndirty++;
}
chunk->map[pageend].bits = mapbits;
}
if (dirtier) {
if (chunk->dirtied == false) {
ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty);
chunk->dirtied = true;
}
/* Enforce opt_lg_dirty_mult. */
if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages &&
(arena->nactive >> opt_lg_dirty_mult) < arena->ndirty)
arena_purge(arena);
}
}
static inline void * static inline void *
arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin)
{ {
void *ret; void *ret;
unsigned i, mask, bit, regind;
assert(run->magic == ARENA_RUN_MAGIC); assert(run->magic == ARENA_RUN_MAGIC);
assert(run->regs_minelm < bin->regs_mask_nelms); assert(run->nfree > 0);
/*
* Move the first check outside the loop, so that run->regs_minelm can
* be updated unconditionally, without the possibility of updating it
* multiple times.
*/
i = run->regs_minelm;
mask = run->regs_mask[i];
if (mask != 0) {
/* Usable allocation found. */
bit = ffs((int)mask) - 1;
regind = ((i << (LG_SIZEOF_INT + 3)) + bit);
assert(regind < bin->nregs);
ret = (void *)(((uintptr_t)run) + bin->reg0_offset
+ (bin->reg_size * regind));
/* Clear bit. */
mask ^= (1U << bit);
run->regs_mask[i] = mask;
arena_run_rc_incr(run, bin, ret);
run->nfree--;
ret = run->avail;
if (ret != NULL) {
run->avail = *(void **)ret;
/* Double free can cause assertion failure.*/
assert(ret != NULL);
/* Write-after free can cause assertion failure. */
assert((uintptr_t)ret >= (uintptr_t)run +
(uintptr_t)bin->reg0_offset);
assert((uintptr_t)ret < (uintptr_t)run->next);
assert(((uintptr_t)ret - ((uintptr_t)run +
(uintptr_t)bin->reg0_offset)) % (uintptr_t)bin->reg_size ==
0);
return (ret); return (ret);
} }
ret = run->next;
for (i++; i < bin->regs_mask_nelms; i++) { run->next = (void *)((uintptr_t)ret + (uintptr_t)bin->reg_size);
mask = run->regs_mask[i]; assert(ret != NULL);
if (mask != 0) { return (ret);
/* Usable allocation found. */
bit = ffs((int)mask) - 1;
regind = ((i << (LG_SIZEOF_INT + 3)) + bit);
assert(regind < bin->nregs);
ret = (void *)(((uintptr_t)run) + bin->reg0_offset
+ (bin->reg_size * regind));
/* Clear bit. */
mask ^= (1U << bit);
run->regs_mask[i] = mask;
/*
* Make a note that nothing before this element
* contains a free region.
*/
run->regs_minelm = i; /* Low payoff: + (mask == 0); */
arena_run_rc_incr(run, bin, ret);
return (ret);
}
}
/* Not reached. */
assert(0);
return (NULL);
}
static inline unsigned
arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
size_t size)
{
unsigned shift, diff, regind;
assert(run->magic == ARENA_RUN_MAGIC);
/*
* Avoid doing division with a variable divisor if possible. Using
* actual division here can reduce allocator throughput by over 20%!
*/
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
/* Rescale (factor powers of 2 out of the numerator and denominator). */
shift = ffs(size) - 1;
diff >>= shift;
size >>= shift;
if (size == 1) {
/* The divisor was a power of 2. */
regind = diff;
} else {
/*
* To divide by a number D that is not a power of two we
* multiply by (2^21 / D) and then right shift by 21 positions.
*
* X / D
*
* becomes
*
* (X * size_invs[D - 3]) >> SIZE_INV_SHIFT
*
* We can omit the first three elements, because we never
* divide by 0, and 1 and 2 are both powers of two, which are
* handled above.
*/
#define SIZE_INV_SHIFT 21
#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)
static const unsigned size_invs[] = {
SIZE_INV(3),
SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
};
if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2))
regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT;
else
regind = diff / size;
#undef SIZE_INV
#undef SIZE_INV_SHIFT
}
assert(diff == regind * size);
assert(regind < bin->nregs);
return (regind);
} }
static inline void static inline void
arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size) arena_run_reg_dalloc(arena_run_t *run, void *ptr)
{ {
unsigned regind, elm, bit;
regind = arena_run_regind(run, bin, ptr, size); assert(run->nfree < run->bin->nregs);
elm = regind >> (LG_SIZEOF_INT + 3); /* Freeing an interior pointer can cause assertion failure. */
if (elm < run->regs_minelm) assert(((uintptr_t)ptr - ((uintptr_t)run +
run->regs_minelm = elm; (uintptr_t)run->bin->reg0_offset)) % (uintptr_t)run->bin->reg_size
bit = regind - (elm << (LG_SIZEOF_INT + 3)); == 0);
assert((run->regs_mask[elm] & (1U << bit)) == 0);
run->regs_mask[elm] |= (1U << bit);
arena_run_rc_decr(run, bin, ptr); *(void **)ptr = run->avail;
run->avail = ptr;
run->nfree++;
} }
static void static void
@ -571,12 +362,6 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
* tries to operate on an interior pointer. * tries to operate on an interior pointer.
*/ */
chunk->map[run_ind].bits |= size; chunk->map[run_ind].bits |= size;
} else {
/*
* Initialize the first page's refcount to 1, so that the run
* header is protected from dirty page purging.
*/
chunk->map[run_ind].bits += CHUNK_MAP_RC_ONE;
} }
} }
@ -960,7 +745,6 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin)
{ {
arena_chunk_map_t *mapelm; arena_chunk_map_t *mapelm;
arena_run_t *run; arena_run_t *run;
unsigned i, remainder;
/* Look for a usable run. */ /* Look for a usable run. */
mapelm = arena_run_tree_first(&bin->runs); mapelm = arena_run_tree_first(&bin->runs);
@ -991,20 +775,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin)
/* Initialize run internals. */ /* Initialize run internals. */
run->bin = bin; run->bin = bin;
run->avail = NULL;
for (i = 0; i < bin->regs_mask_nelms - 1; i++) run->next = (void *)(((uintptr_t)run) + (uintptr_t)bin->reg0_offset);
run->regs_mask[i] = UINT_MAX;
remainder = bin->nregs & ((1U << (LG_SIZEOF_INT + 3)) - 1);
if (remainder == 0)
run->regs_mask[i] = UINT_MAX;
else {
/* The last element has spare bits that need to be unset. */
run->regs_mask[i] = (UINT_MAX >> ((1U << (LG_SIZEOF_INT + 3))
- remainder));
}
run->regs_minelm = 0;
run->nfree = bin->nregs; run->nfree = bin->nregs;
#ifdef JEMALLOC_DEBUG #ifdef JEMALLOC_DEBUG
run->magic = ARENA_RUN_MAGIC; run->magic = ARENA_RUN_MAGIC;
@ -1019,23 +791,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin)
return (run); return (run);
} }
/* bin->runcur must have space available before this function is called. */ /* Re-fill bin->runcur, then call arena_run_reg_alloc(). */
static inline void *
arena_bin_malloc_easy(arena_t *arena, arena_bin_t *bin, arena_run_t *run)
{
void *ret;
assert(run->magic == ARENA_RUN_MAGIC);
assert(run->nfree > 0);
ret = arena_run_reg_alloc(run, bin);
assert(ret != NULL);
run->nfree--;
return (ret);
}
/* Re-fill bin->runcur, then call arena_bin_malloc_easy(). */
static void * static void *
arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin)
{ {
@ -1046,7 +802,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin)
assert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->magic == ARENA_RUN_MAGIC);
assert(bin->runcur->nfree > 0); assert(bin->runcur->nfree > 0);
return (arena_bin_malloc_easy(arena, bin, bin->runcur)); return (arena_run_reg_alloc(bin->runcur, bin));
} }
#ifdef JEMALLOC_TCACHE #ifdef JEMALLOC_TCACHE
@ -1071,7 +827,7 @@ arena_tcache_fill(arena_t *arena, tcache_bin_t *tbin, size_t binind
#endif #endif
for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) {
if ((run = bin->runcur) != NULL && run->nfree > 0) if ((run = bin->runcur) != NULL && run->nfree > 0)
ptr = arena_bin_malloc_easy(arena, bin, run); ptr = arena_run_reg_alloc(run, bin);
else else
ptr = arena_bin_malloc_hard(arena, bin); ptr = arena_bin_malloc_hard(arena, bin);
if (ptr == NULL) if (ptr == NULL)
@ -1120,19 +876,17 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes)
* *
* *) bin->run_size >= min_run_size * *) bin->run_size >= min_run_size
* *) bin->run_size <= arena_maxclass * *) bin->run_size <= arena_maxclass
* *) bin->run_size <= RUN_MAX_SMALL
* *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed).
* *) run header size < PAGE_SIZE * *) run header size < PAGE_SIZE
* *
* bin->nregs, bin->regs_mask_nelms, and bin->reg0_offset are * bin->nregs and bin->reg0_offset are also calculated here, since these
* also calculated here, since these settings are all interdependent. * settings are all interdependent.
*/ */
static size_t static size_t
arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size) arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
{ {
size_t try_run_size, good_run_size; size_t try_run_size, good_run_size;
uint32_t try_nregs, good_nregs; uint32_t try_nregs, good_nregs;
uint32_t try_mask_nelms, good_mask_nelms;
uint32_t try_hdr_size, good_hdr_size; uint32_t try_hdr_size, good_hdr_size;
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
uint32_t try_cnt0_offset, good_cnt0_offset; uint32_t try_cnt0_offset, good_cnt0_offset;
@ -1141,7 +895,6 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
assert(min_run_size >= PAGE_SIZE); assert(min_run_size >= PAGE_SIZE);
assert(min_run_size <= arena_maxclass); assert(min_run_size <= arena_maxclass);
assert(min_run_size <= RUN_MAX_SMALL);
/* /*
* Calculate known-valid settings before entering the run_size * Calculate known-valid settings before entering the run_size
@ -1158,10 +911,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
+ 1; /* Counter-act try_nregs-- in loop. */ + 1; /* Counter-act try_nregs-- in loop. */
do { do {
try_nregs--; try_nregs--;
try_mask_nelms = (try_nregs >> (LG_SIZEOF_INT + 3)) + try_hdr_size = sizeof(arena_run_t);
((try_nregs & ((1U << (LG_SIZEOF_INT + 3)) - 1)) ? 1 : 0);
try_hdr_size = sizeof(arena_run_t) + (sizeof(unsigned) *
(try_mask_nelms - 1));
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
if (opt_prof) { if (opt_prof) {
/* Pad to a quantum boundary. */ /* Pad to a quantum boundary. */
@ -1182,7 +932,6 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
*/ */
good_run_size = try_run_size; good_run_size = try_run_size;
good_nregs = try_nregs; good_nregs = try_nregs;
good_mask_nelms = try_mask_nelms;
good_hdr_size = try_hdr_size; good_hdr_size = try_hdr_size;
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
good_cnt0_offset = try_cnt0_offset; good_cnt0_offset = try_cnt0_offset;
@ -1195,11 +944,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
bin->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ bin->reg_size) + 1; /* Counter-act try_nregs-- in loop. */
do { do {
try_nregs--; try_nregs--;
try_mask_nelms = (try_nregs >> (LG_SIZEOF_INT + 3)) + try_hdr_size = sizeof(arena_run_t);
((try_nregs & ((1U << (LG_SIZEOF_INT + 3)) - 1)) ?
1 : 0);
try_hdr_size = sizeof(arena_run_t) + (sizeof(unsigned) *
(try_mask_nelms - 1));
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
if (opt_prof) { if (opt_prof) {
/* Pad to a quantum boundary. */ /* Pad to a quantum boundary. */
@ -1216,18 +961,17 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
try_reg0_offset = try_run_size - (try_nregs * try_reg0_offset = try_run_size - (try_nregs *
bin->reg_size); bin->reg_size);
} while (try_hdr_size > try_reg0_offset); } while (try_hdr_size > try_reg0_offset);
} while (try_run_size <= arena_maxclass && try_run_size <= RUN_MAX_SMALL } while (try_run_size <= arena_maxclass
&& try_run_size <= arena_maxclass
&& RUN_MAX_OVRHD * (bin->reg_size << 3) > RUN_MAX_OVRHD_RELAX && RUN_MAX_OVRHD * (bin->reg_size << 3) > RUN_MAX_OVRHD_RELAX
&& (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size
&& try_hdr_size < PAGE_SIZE); && try_hdr_size < PAGE_SIZE);
assert(good_hdr_size <= good_reg0_offset); assert(good_hdr_size <= good_reg0_offset);
assert((good_mask_nelms << (LG_SIZEOF_INT + 3)) >= good_nregs);
/* Copy final settings. */ /* Copy final settings. */
bin->run_size = good_run_size; bin->run_size = good_run_size;
bin->nregs = good_nregs; bin->nregs = good_nregs;
bin->regs_mask_nelms = good_mask_nelms;
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
bin->cnt0_offset = good_cnt0_offset; bin->cnt0_offset = good_cnt0_offset;
#endif #endif
@ -1251,7 +995,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
malloc_mutex_lock(&arena->lock); malloc_mutex_lock(&arena->lock);
if ((run = bin->runcur) != NULL && run->nfree > 0) if ((run = bin->runcur) != NULL && run->nfree > 0)
ret = arena_bin_malloc_easy(arena, bin, run); ret = arena_run_reg_alloc(run, bin);
else else
ret = arena_bin_malloc_hard(arena, bin); ret = arena_bin_malloc_hard(arena, bin);
@ -1306,7 +1050,7 @@ arena_malloc_medium(arena_t *arena, size_t size, bool zero)
malloc_mutex_lock(&arena->lock); malloc_mutex_lock(&arena->lock);
if ((run = bin->runcur) != NULL && run->nfree > 0) if ((run = bin->runcur) != NULL && run->nfree > 0)
ret = arena_bin_malloc_easy(arena, bin, run); ret = arena_run_reg_alloc(run, bin);
else else
ret = arena_bin_malloc_hard(arena, bin); ret = arena_bin_malloc_hard(arena, bin);
@ -1522,6 +1266,69 @@ arena_salloc(const void *ptr)
} }
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
static inline unsigned
arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
size_t size)
{
unsigned shift, diff, regind;
assert(run->magic == ARENA_RUN_MAGIC);
/*
* Avoid doing division with a variable divisor if possible. Using
* actual division here can reduce allocator throughput by over 20%!
*/
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
/* Rescale (factor powers of 2 out of the numerator and denominator). */
shift = ffs(size) - 1;
diff >>= shift;
size >>= shift;
if (size == 1) {
/* The divisor was a power of 2. */
regind = diff;
} else {
/*
* To divide by a number D that is not a power of two we
* multiply by (2^21 / D) and then right shift by 21 positions.
*
* X / D
*
* becomes
*
* (X * size_invs[D - 3]) >> SIZE_INV_SHIFT
*
* We can omit the first three elements, because we never
* divide by 0, and 1 and 2 are both powers of two, which are
* handled above.
*/
#define SIZE_INV_SHIFT 21
#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)
static const unsigned size_invs[] = {
SIZE_INV(3),
SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
};
if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2))
regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT;
else
regind = diff / size;
#undef SIZE_INV
#undef SIZE_INV_SHIFT
}
assert(diff == regind * size);
assert(regind < bin->nregs);
return (regind);
}
prof_thr_cnt_t * prof_thr_cnt_t *
arena_prof_cnt_get(const void *ptr) arena_prof_cnt_get(const void *ptr)
{ {
@ -1589,7 +1396,7 @@ static void
arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
arena_bin_t *bin) arena_bin_t *bin)
{ {
size_t run_ind; size_t run_ind, past;
/* Deallocate run. */ /* Deallocate run. */
if (run == bin->runcur) if (run == bin->runcur)
@ -1606,17 +1413,16 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
*/ */
arena_run_tree_remove(&bin->runs, run_mapelm); arena_run_tree_remove(&bin->runs, run_mapelm);
} }
/* /* Mark all pages that were ever used for allocations as dirty. */
* Mark the first page as dirty. The dirty bit for every other page in
* the run is already properly set, which means we can call
* arena_run_dalloc(..., false), thus potentially avoiding the needless
* creation of many dirty pages.
*/
run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT);
assert((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) == 0); past = (size_t)(((uintptr_t)run->next - (uintptr_t)1U -
chunk->map[run_ind].bits |= CHUNK_MAP_DIRTY; (uintptr_t)chunk) >> PAGE_SHIFT) + 1;
chunk->ndirty++; chunk->ndirty += past - run_ind;
arena->ndirty++; arena->ndirty += past - run_ind;
for (; run_ind < past; run_ind++) {
assert((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) == 0);
chunk->map[run_ind].bits |= CHUNK_MAP_DIRTY;
}
#ifdef JEMALLOC_DEBUG #ifdef JEMALLOC_DEBUG
run->magic = 0; run->magic = 0;
@ -1643,7 +1449,9 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
size_t pageind; size_t pageind;
arena_run_t *run; arena_run_t *run;
arena_bin_t *bin; arena_bin_t *bin;
#if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS))
size_t size; size_t size;
#endif
pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
@ -1651,15 +1459,16 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
PAGE_SHIFT)); PAGE_SHIFT));
assert(run->magic == ARENA_RUN_MAGIC); assert(run->magic == ARENA_RUN_MAGIC);
bin = run->bin; bin = run->bin;
#if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS))
size = bin->reg_size; size = bin->reg_size;
#endif
#ifdef JEMALLOC_FILL #ifdef JEMALLOC_FILL
if (opt_junk) if (opt_junk)
memset(ptr, 0x5a, size); memset(ptr, 0x5a, size);
#endif #endif
arena_run_reg_dalloc(run, bin, ptr, size); arena_run_reg_dalloc(run, ptr);
run->nfree++;
if (run->nfree == bin->nregs) if (run->nfree == bin->nregs)
arena_dalloc_bin_run(arena, chunk, run, bin); arena_dalloc_bin_run(arena, chunk, run, bin);