From b718cf77e9917f6ae1995c2e2b219ff4219c9f46 Mon Sep 17 00:00:00 2001
From: Jason Evans <jasone@canonware.com>
Date: Sun, 7 Sep 2014 14:40:19 -0700
Subject: [PATCH] Optimize [nmd]alloc() fast paths.

Optimize [nmd]alloc() fast paths such that the (flags == 0) case is
streamlined, flags decoding only happens to the minimum degree
necessary, and no conditionals are repeated.
---
 include/jemalloc/internal/arena.h             |   4 +-
 .../jemalloc/internal/jemalloc_internal.h.in  |  52 ++--
 include/jemalloc/internal/private_symbols.txt |   1 -
 include/jemalloc/internal/size_classes.sh     |   3 +
 src/arena.c                                   |   2 +-
 src/huge.c                                    |   2 +-
 src/jemalloc.c                                | 239 ++++++++++--------
 7 files changed, 172 insertions(+), 131 deletions(-)

diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index 986bea92..166d0523 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -577,7 +577,7 @@ small_bin2size_lookup(size_t binind)
 
 	assert(binind < NBINS);
 	{
-		size_t ret = ((size_t)(small_bin2size_tab[binind]));
+		size_t ret = (size_t)small_bin2size_tab[binind];
 		assert(ret == small_bin2size_compute(binind));
 		return (ret);
 	}
@@ -615,7 +615,7 @@ small_s2u_compute(size_t size)
 JEMALLOC_ALWAYS_INLINE size_t
 small_s2u_lookup(size_t size)
 {
-	size_t ret = (small_bin2size(small_size2bin(size)));
+	size_t ret = small_bin2size(small_size2bin(size));
 
 	assert(ret == small_s2u_compute(size));
 	return (ret);
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 1c2f3d44..59ae8d55 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -165,7 +165,17 @@ static const bool config_ivsalloc =
 
 #include "jemalloc/internal/jemalloc_internal_macros.h"
 
+#define	MALLOCX_ARENA_MASK	((int)~0xff)
 #define	MALLOCX_LG_ALIGN_MASK	((int)0x3f)
+/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */
+#define	MALLOCX_ALIGN_GET_SPECIFIED(flags)				\
+    (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK))
+#define	MALLOCX_ALIGN_GET(flags)					\
+    (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1))
+#define	MALLOCX_ZERO_GET(flags)						\
+    ((bool)(flags & MALLOCX_ZERO))
+#define	MALLOCX_ARENA_GET(flags)					\
+    (((unsigned)(flags >> 8)) - 1)
 
 /* Smallest size class to support. */
 #define	LG_TINY_MIN		3
@@ -625,15 +635,13 @@ size_t	u2rz(size_t usize);
 size_t	p2rz(const void *ptr);
 void	idalloct(void *ptr, bool try_tcache);
 void	idalloc(void *ptr);
-void	iqalloct(void *ptr, bool try_tcache);
-void	iqalloc(void *ptr);
+void	iqalloc(void *ptr, bool try_tcache);
 void	*iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
     size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
     arena_t *arena);
-void	*iralloct(void *ptr, size_t size, size_t extra, size_t alignment,
-    bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena);
-void	*iralloc(void *ptr, size_t size, size_t extra, size_t alignment,
-    bool zero);
+void	*iralloct(void *ptr, size_t size, size_t alignment, bool zero,
+    bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena);
+void	*iralloc(void *ptr, size_t size, size_t alignment, bool zero);
 bool	ixalloc(void *ptr, size_t size, size_t extra, size_t alignment,
     bool zero);
 malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t)
@@ -787,7 +795,7 @@ idalloc(void *ptr)
 }
 
 JEMALLOC_ALWAYS_INLINE void
-iqalloct(void *ptr, bool try_tcache)
+iqalloc(void *ptr, bool try_tcache)
 {
 
 	if (config_fill && opt_quarantine)
@@ -796,13 +804,6 @@ iqalloct(void *ptr, bool try_tcache)
 	idalloct(ptr, try_tcache);
 }
 
-JEMALLOC_ALWAYS_INLINE void
-iqalloc(void *ptr)
-{
-
-	iqalloct(ptr, true);
-}
-
 JEMALLOC_ALWAYS_INLINE void *
 iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
     size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
@@ -832,12 +833,12 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
 	 */
 	copysize = (size < oldsize) ? size : oldsize;
 	memcpy(p, ptr, copysize);
-	iqalloct(ptr, try_tcache_dalloc);
+	iqalloc(ptr, try_tcache_dalloc);
 	return (p);
 }
 
 JEMALLOC_ALWAYS_INLINE void *
-iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
+iralloct(void *ptr, size_t size, size_t alignment, bool zero,
     bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena)
 {
 	size_t oldsize;
@@ -853,25 +854,24 @@ iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
 		 * Existing object alignment is inadequate; allocate new space
 		 * and copy.
 		 */
-		return (iralloct_realign(ptr, oldsize, size, extra, alignment,
-		    zero, try_tcache_alloc, try_tcache_dalloc, arena));
+		return (iralloct_realign(ptr, oldsize, size, 0, alignment, zero,
+		    try_tcache_alloc, try_tcache_dalloc, arena));
 	}
 
-	if (size + extra <= arena_maxclass) {
-		return (arena_ralloc(arena, ptr, oldsize, size, extra,
-		    alignment, zero, try_tcache_alloc,
-		    try_tcache_dalloc));
+	if (size <= arena_maxclass) {
+		return (arena_ralloc(arena, ptr, oldsize, size, 0, alignment,
+		    zero, try_tcache_alloc, try_tcache_dalloc));
 	} else {
-		return (huge_ralloc(arena, ptr, oldsize, size, extra,
-		    alignment, zero, try_tcache_dalloc));
+		return (huge_ralloc(arena, ptr, oldsize, size, 0, alignment,
+		    zero, try_tcache_dalloc));
 	}
 }
 
 JEMALLOC_ALWAYS_INLINE void *
-iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero)
+iralloc(void *ptr, size_t size, size_t alignment, bool zero)
 {
 
-	return (iralloct(ptr, size, extra, alignment, zero, true, true, NULL));
+	return (iralloct(ptr, size, alignment, zero, true, true, NULL));
 }
 
 JEMALLOC_ALWAYS_INLINE bool
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 9ca139ab..84f05910 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -224,7 +224,6 @@ in_valgrind
 ipalloc
 ipalloct
 iqalloc
-iqalloct
 iralloc
 iralloct
 iralloct_realign
diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh
index 379d36c2..0cfac72d 100755
--- a/include/jemalloc/internal/size_classes.sh
+++ b/include/jemalloc/internal/size_classes.sh
@@ -202,6 +202,7 @@ cat <<EOF

[A stretch of the patch was lost in extraction: the body of this
size_classes.sh hunk (which, judging from the diffstat and the new uses of
LARGE_MINCLASS below, adds a LARGE_MINCLASS definition), the one-line hunks
for src/arena.c and src/huge.c, and the start of the src/jemalloc.c diff,
where ifree() gains its try_tcache parameter. The diff resumes mid-hunk in
ifree():]

 	if (config_stats)
 		thread_allocated_tsd_get()->deallocated += usize;
 	if (config_valgrind && in_valgrind)
 		rzsize = p2rz(ptr);
-	iqalloc(ptr);
+	iqalloc(ptr, try_tcache);
 	JEMALLOC_VALGRIND_FREE(ptr, rzsize);
 }
@@ -1236,7 +1235,7 @@ je_realloc(void *ptr, size_t size)
 		if (ptr != NULL) {
 			/* realloc(ptr, 0) is equivalent to free(ptr). */
 			UTRACE(ptr, 0, 0);
-			ifree(ptr);
+			ifree(ptr, true);
 			return (NULL);
 		}
 		size = 1;
@@ -1261,7 +1260,7 @@ je_realloc(void *ptr, size_t size)
 		} else {
 			if (config_stats || (config_valgrind && in_valgrind))
 				usize = s2u(size);
-			ret = iralloc(ptr, size, 0, 0, false);
+			ret = iralloc(ptr, size, 0, false);
 		}
 	} else {
 		/* realloc(NULL, size) is equivalent to malloc(size). */
@@ -1295,7 +1294,7 @@ je_free(void *ptr)
 
 	UTRACE(ptr, 0, 0);
 	if (ptr != NULL)
-		ifree(ptr);
+		ifree(ptr, true);
 }
 
 /*
@@ -1363,99 +1362,153 @@ JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) =
  * Begin non-standard functions.
  */
 
+JEMALLOC_ALWAYS_INLINE_C void
+imallocx_flags_decode_hard(size_t size, int flags, size_t *usize,
+    size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena)
+{
+
+	if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) {
+		*alignment = 0;
+		*usize = s2u(size);
+	} else {
+		*alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags);
+		*usize = sa2u(size, *alignment);
+	}
+	*zero = MALLOCX_ZERO_GET(flags);
+	if ((flags & MALLOCX_ARENA_MASK) != 0) {
+		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
+		*try_tcache = false;
+		*arena = arenas[arena_ind];
+	} else {
+		*try_tcache = true;
+		*arena = NULL;
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE_C void
+imallocx_flags_decode(size_t size, int flags, size_t *usize, size_t *alignment,
+    bool *zero, bool *try_tcache, arena_t **arena)
+{
+
+	if (flags == 0) {
+		*usize = s2u(size);
+		assert(*usize != 0);
+		*alignment = 0;
+		*zero = false;
+		*try_tcache = true;
+		*arena = NULL;
+	} else {
+		imallocx_flags_decode_hard(size, flags, usize, alignment, zero,
+		    try_tcache, arena);
+	}
+}
+
 JEMALLOC_ALWAYS_INLINE_C void *
-imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
+imallocx_flags(size_t usize, size_t alignment, bool zero, bool try_tcache,
     arena_t *arena)
 {
 
-	assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize,
-	    alignment)));
-
 	if (alignment != 0)
 		return (ipalloct(usize, alignment, zero, try_tcache, arena));
-	else if (zero)
+	if (zero)
 		return (icalloct(usize, try_tcache, arena));
-	else
-		return (imalloct(usize, try_tcache, arena));
+	return (imalloct(usize, try_tcache, arena));
 }
+
+
+JEMALLOC_ALWAYS_INLINE_C void *
+imallocx_maybe_flags(size_t size, int flags, size_t usize, size_t alignment,
+    bool zero, bool try_tcache, arena_t *arena)
+{
+
+	if (flags == 0)
+		return (imalloc(size));
+	return (imallocx_flags(usize, alignment, zero, try_tcache, arena));
+}
 
 static void *
-imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache,
-    arena_t *arena, prof_tctx_t *tctx)
+imallocx_prof_sample(size_t size, int flags, size_t usize, size_t alignment,
+    bool zero, bool try_tcache, arena_t *arena)
 {
 	void *p;
 
-	if (tctx == NULL)
-		return (NULL);
 	if (usize <= SMALL_MAXCLASS) {
-		size_t usize_promoted = (alignment == 0) ?
-		    s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1, alignment);
-		assert(usize_promoted != 0);
-		p = imallocx(usize_promoted, alignment, zero, try_tcache,
-		    arena);
+		assert(((alignment == 0) ? s2u(LARGE_MINCLASS) :
+		    sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS);
+		p = imalloc(LARGE_MINCLASS);
 		if (p == NULL)
 			return (NULL);
 		arena_prof_promoted(p, usize);
-	} else
-		p = imallocx(usize, alignment, zero, try_tcache, arena);
+	} else {
+		p = imallocx_maybe_flags(size, flags, usize, alignment, zero,
+		    try_tcache, arena);
+	}
 
 	return (p);
 }
 
 JEMALLOC_ALWAYS_INLINE_C void *
-imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache,
-    arena_t *arena, prof_tctx_t *tctx)
+imallocx_prof(size_t size, int flags, size_t *usize)
 {
 	void *p;
+	size_t alignment;
+	bool zero;
+	bool try_tcache;
+	arena_t *arena;
+	prof_tctx_t *tctx;
 
-	if ((uintptr_t)tctx != (uintptr_t)1U) {
-		p = imallocx_prof_sample(usize, alignment, zero, try_tcache,
-		    arena, tctx);
+	imallocx_flags_decode(size, flags, usize, &alignment, &zero,
+	    &try_tcache, &arena);
+	tctx = prof_alloc_prep(*usize);
+	if ((uintptr_t)tctx == (uintptr_t)1U) {
+		p = imallocx_maybe_flags(size, flags, *usize, alignment, zero,
+		    try_tcache, arena);
+	} else if ((uintptr_t)tctx > (uintptr_t)1U) {
+		p = imallocx_prof_sample(size, flags, *usize, alignment, zero,
+		    try_tcache, arena);
 	} else
-		p = imallocx(usize, alignment, zero, try_tcache, arena);
+		p = NULL;
 	if (p == NULL)
 		return (NULL);
-	prof_malloc(p, usize, tctx);
+	prof_malloc(p, *usize, tctx);
 
 	return (p);
 }
 
+JEMALLOC_ALWAYS_INLINE_C void *
+imallocx_no_prof(size_t size, int flags, size_t *usize)
+{
+	size_t alignment;
+	bool zero;
+	bool try_tcache;
+	arena_t *arena;
+
+	if (flags == 0) {
+		if (config_stats || (config_valgrind && in_valgrind))
+			*usize = s2u(size);
+		return (imalloc(size));
+	}
+
+	imallocx_flags_decode_hard(size, flags, usize, &alignment, &zero,
+	    &try_tcache, &arena);
+	return (imallocx_flags(*usize, alignment, zero, try_tcache, arena));
+}
+
 void *
 je_mallocx(size_t size, int flags)
 {
 	void *p;
 	size_t usize;
-	size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
-	    & (SIZE_T_MAX-1));
-	bool zero = flags & MALLOCX_ZERO;
-	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
-	arena_t *arena;
-	bool try_tcache;
 
 	assert(size != 0);
 
 	if (malloc_init())
 		goto label_oom;
 
-	if (arena_ind != UINT_MAX) {
-		arena = arenas[arena_ind];
-		try_tcache = false;
-	} else {
-		arena = NULL;
-		try_tcache = true;
-	}
-
-	usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
-	assert(usize != 0);
-
-	if (config_prof && opt_prof) {
-		prof_tctx_t *tctx;
-
-		tctx = prof_alloc_prep(usize);
-		p = imallocx_prof(usize, alignment, zero, try_tcache, arena,
-		    tctx);
-	} else
-		p = imallocx(usize, alignment, zero, try_tcache, arena);
+	if (config_prof && opt_prof)
+		p = imallocx_prof(size, flags, &usize);
+	else
+		p = imallocx_no_prof(size, flags, &usize);
 	if (p == NULL)
 		goto label_oom;
@@ -1464,7 +1517,7 @@ je_mallocx(size_t size, int flags)
 		thread_allocated_tsd_get()->allocated += usize;
 	}
 	UTRACE(0, size, p);
-	JEMALLOC_VALGRIND_MALLOC(true, p, usize, zero);
+	JEMALLOC_VALGRIND_MALLOC(true, p, usize, MALLOCX_ZERO_GET(flags));
 	return (p);
 label_oom:
 	if (config_xmalloc && opt_xmalloc) {
@@ -1485,15 +1538,14 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize,
 	if (tctx == NULL)
 		return (NULL);
 	if (usize <= SMALL_MAXCLASS) {
-		p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >=
-		    size) ? 0 : size - (SMALL_MAXCLASS+1), alignment, zero,
+		p = iralloct(oldptr, LARGE_MINCLASS, alignment, zero,
 		    try_tcache_alloc, try_tcache_dalloc, arena);
 		if (p == NULL)
 			return (NULL);
 		arena_prof_promoted(p, usize);
 	} else {
-		p = iralloct(oldptr, size, 0, alignment, zero,
-		    try_tcache_alloc, try_tcache_dalloc, arena);
+		p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc,
+		    try_tcache_dalloc, arena);
 	}
 
 	return (p);
@@ -1512,8 +1564,8 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment,
 		p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero,
 		    try_tcache_alloc, try_tcache_dalloc, arena, tctx);
 	else {
-		p = iralloct(oldptr, size, 0, alignment, zero,
-		    try_tcache_alloc, try_tcache_dalloc, arena);
+		p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc,
+		    try_tcache_dalloc, arena);
 	}
 	if (p == NULL)
 		return (NULL);
@@ -1540,10 +1592,8 @@ je_rallocx(void *ptr, size_t size, int flags)
 	void *p;
 	size_t usize, old_usize;
 	UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
-	size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
-	    & (SIZE_T_MAX-1));
+	size_t alignment = MALLOCX_ALIGN_GET(flags);
 	bool zero = flags & MALLOCX_ZERO;
-	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
 	bool try_tcache_alloc, try_tcache_dalloc;
 	arena_t *arena;
 
@@ -1552,7 +1602,8 @@ je_rallocx(void *ptr, size_t size, int flags)
 	assert(malloc_initialized || IS_INITIALIZER);
 	malloc_thread_init();
 
-	if (arena_ind != UINT_MAX) {
+	if ((flags & MALLOCX_ARENA_MASK) != 0) {
+		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
 		arena_chunk_t *chunk;
 		try_tcache_alloc = false;
 		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
@@ -1582,7 +1633,7 @@ je_rallocx(void *ptr, size_t size, int flags)
 		if (p == NULL)
 			goto label_oom;
 	} else {
-		p = iralloct(ptr, size, 0, alignment, zero, try_tcache_alloc,
+		p = iralloct(ptr, size, alignment, zero, try_tcache_alloc,
 		    try_tcache_dalloc, arena);
 		if (p == NULL)
 			goto label_oom;
@@ -1677,10 +1728,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags)
 {
 	size_t usize, old_usize;
 	UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
-	size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
-	    & (SIZE_T_MAX-1));
+	size_t alignment = MALLOCX_ALIGN_GET(flags);
 	bool zero = flags & MALLOCX_ZERO;
-	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
 	arena_t *arena;
 
 	assert(ptr != NULL);
@@ -1689,9 +1738,10 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags)
 	assert(malloc_initialized || IS_INITIALIZER);
 	malloc_thread_init();
 
-	if (arena_ind != UINT_MAX)
+	if ((flags & MALLOCX_ARENA_MASK) != 0) {
+		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
 		arena = arenas[arena_ind];
-	else
+	} else
 		arena = NULL;
 
 	old_usize = isalloc(ptr, config_prof);
@@ -1753,15 +1803,13 @@ je_sallocx(const void *ptr, int flags)
 void
 je_dallocx(void *ptr, int flags)
 {
-	size_t usize;
-	UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
-	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
 	bool try_tcache;
 
 	assert(ptr != NULL);
 	assert(malloc_initialized || IS_INITIALIZER);
 
-	if (arena_ind != UINT_MAX) {
+	if ((flags & MALLOCX_ARENA_MASK) != 0) {
+		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 		try_tcache = (chunk == ptr || chunk->arena !=
 		    arenas[arena_ind]);
@@ -1769,34 +1817,25 @@ je_dallocx(void *ptr, int flags)
 		try_tcache = true;
 
 	UTRACE(ptr, 0, 0);
-	if (config_stats || config_valgrind)
-		usize = isalloc(ptr, config_prof);
-	if (config_prof && opt_prof) {
-		if (config_stats == false && config_valgrind == false)
-			usize = isalloc(ptr, config_prof);
-		prof_free(ptr, usize);
-	}
-	if (config_stats)
-		thread_allocated_tsd_get()->deallocated += usize;
-	if (config_valgrind && in_valgrind)
-		rzsize = p2rz(ptr);
-	iqalloct(ptr, try_tcache);
-	JEMALLOC_VALGRIND_FREE(ptr, rzsize);
+	ifree(ptr, try_tcache);
 }
 
 size_t
 je_nallocx(size_t size, int flags)
 {
 	size_t usize;
-	size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
-	    & (SIZE_T_MAX-1));
 
 	assert(size != 0);
 
 	if (malloc_init())
 		return (0);
 
-	usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
+	if ((flags & MALLOCX_LG_ALIGN_MASK) == 0)
+		usize = s2u(size);
+	else {
+		size_t alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags);
+		usize = sa2u(size, alignment);
+	}
 	assert(usize != 0);
 	return (usize);
 }
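
Postscript (illustration, not part of the commit): the MALLOCX_*_GET() macros
added above decode the mallocx(3) flags word, which packs lg(alignment) into
bits 0..5, MALLOCX_ZERO into bit 6, and (arena index + 1) into the bits above
bit 7, so flags == 0 means "no special treatment" and the streamlined fast
path can skip decoding entirely. The standalone sketch below pairs the new
decoders with the public flag constructors; the constructor values shown are
the ones in jemalloc's public header (assumed here, since only the decoders
appear in this diff), and the main() harness is illustrative only.

/*
 * flags_demo.c: sketch of the mallocx() flags encoding and decoding.
 * Build: cc -o flags_demo flags_demo.c
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	ZU(z)	((size_t)z)

/* Flag constructors (public mallocx(3) API). */
#define	MALLOCX_LG_ALIGN(la)	(la)			/* bits 0..5 */
#define	MALLOCX_ZERO		((int)0x40)		/* bit 6 */
#define	MALLOCX_ARENA(a)	((int)(((a)+1) << 8))	/* bits 8 and up */

/* Flag decoders, as added to jemalloc_internal.h.in by this patch. */
#define	MALLOCX_ARENA_MASK	((int)~0xff)
#define	MALLOCX_LG_ALIGN_MASK	((int)0x3f)
#define	MALLOCX_ALIGN_GET_SPECIFIED(flags)			\
    (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK))
#define	MALLOCX_ALIGN_GET(flags)				\
    (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_MAX-1))
#define	MALLOCX_ZERO_GET(flags)		((bool)(flags & MALLOCX_ZERO))
#define	MALLOCX_ARENA_GET(flags)	(((unsigned)(flags >> 8)) - 1)

int
main(void)
{
	int flags = MALLOCX_LG_ALIGN(4) | MALLOCX_ZERO | MALLOCX_ARENA(2);

	if (flags == 0) {
		/* The streamlined case: nothing to decode. */
		puts("flags == 0: fast path");
		return (0);
	}
	/*
	 * An unspecified alignment encodes as lg_align == 0, for which
	 * MALLOCX_ALIGN_GET_SPECIFIED() yields 1; the & (SIZE_MAX-1) in
	 * MALLOCX_ALIGN_GET() maps that 1 to 0.
	 */
	printf("alignment = %zu\n", MALLOCX_ALIGN_GET(flags));	/* 16 */
	printf("zero      = %d\n", MALLOCX_ZERO_GET(flags));	/* 1 */
	if ((flags & MALLOCX_ARENA_MASK) != 0)
		printf("arena     = %u\n", MALLOCX_ARENA_GET(flags)); /* 2 */
	return (0);
}

On the two alignment decoders: the hot paths in the patch (je_mallocx(),
je_nallocx()) only call MALLOCX_ALIGN_GET_SPECIFIED() after testing
(flags & MALLOCX_LG_ALIGN_MASK) != 0, so they never pay for the extra mask,
while MALLOCX_ALIGN_GET() is the safe form for callers such as je_rallocx()
and je_xallocx() that want 0 to mean "no alignment constraint" without a
prior test.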