From edbfe6912c1b7e8b561dfee1b058425de6c06285 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Mon, 8 Feb 2021 08:49:34 -0800
Subject: [PATCH] Inline malloc fastpath into operator new.

This saves a small but non-negligible amount of CPU in C++ programs.
---
 .../internal/jemalloc_internal_externs.h   |   3 +-
 .../internal/jemalloc_internal_inlines_c.h | 118 ++++++++++++++++++
 .../internal/jemalloc_internal_types.h     |   8 ++
 src/jemalloc.c                             | 116 +----------------
 src/jemalloc_cpp.cpp                       |  15 ++-
 5 files changed, 141 insertions(+), 119 deletions(-)

diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h
index 166c91d0..8054ad9c 100644
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -30,6 +30,7 @@
 extern bool opt_xmalloc;
 extern bool opt_zero;
 extern unsigned opt_narenas;
 extern zero_realloc_action_t opt_zero_realloc_action;
+extern malloc_init_t malloc_init_state;
 extern const char *zero_realloc_mode_names[];
 extern atomic_zu_t zero_realloc_count;
@@ -64,7 +65,7 @@
 size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
 void jemalloc_prefork(void);
 void jemalloc_postfork_parent(void);
 void jemalloc_postfork_child(void);
-bool malloc_initialized(void);
 void je_sdallocx_noflags(void *ptr, size_t size);
+void *malloc_default(size_t size);
 #endif /* JEMALLOC_INTERNAL_EXTERNS_H */
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
index 0a5ffba5..b0868b7d 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@@ -3,7 +3,9 @@
 
 #include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/log.h"
 #include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/thread_event.h"
 #include "jemalloc/internal/witness.h"
 
 /*
@@ -219,4 +221,120 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
 	    newsize);
 }
 
+JEMALLOC_ALWAYS_INLINE void
+fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
+    cache_bin_t *bin, void *ret) {
+	thread_allocated_set(tsd, allocated_after);
+	if (config_stats) {
+		bin->tstats.nrequests++;
+	}
+
+	LOG("core.malloc.exit", "result: %p", ret);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+malloc_initialized(void) {
+	return (malloc_init_state == malloc_init_initialized);
+}
+
+/*
+ * malloc() fastpath. Included here so that we can inline it into operator new;
+ * function call overhead there is non-negligible as a fraction of total CPU in
+ * allocation-heavy C++ programs. We take the fallback alloc to allow malloc
+ * (which can return NULL) to differ in its behavior from operator new (which
+ * can't). It matches the signature of malloc / operator new so that we can
+ * tail-call the fallback allocator, allowing us to avoid setting up the call
+ * frame in the common case.
+ *
+ * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
+ * tcache. If either of these is false, we tail-call to the slowpath,
+ * malloc_default(). Tail-calling is used to avoid any caller-saved
+ * registers.
+ *
+ * fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
+	LOG("core.malloc.entry", "size: %zu", size);
+	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+		return fallback_alloc(size);
+	}
+
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
+		return fallback_alloc(size);
+	}
+	/*
+	 * The code below till the branch checking the next_event threshold may
+	 * execute before malloc_init(), in which case the threshold is 0 to
+	 * trigger slow path and initialization.
+	 *
+	 * Note that when uninitialized, only the fast-path variants of the sz /
+	 * tsd facilities may be called.
+	 */
+	szind_t ind;
+	/*
+	 * The thread_allocated counter in tsd serves as a general purpose
+	 * accumulator for bytes of allocation to trigger different types of
+	 * events. usize is always needed to advance thread_allocated, though
+	 * it's not always needed in the core allocation logic.
+	 */
+	size_t usize;
+	sz_size2index_usize_fastpath(size, &ind, &usize);
+	/* Fast path relies on size being a bin. */
+	assert(ind < SC_NBINS);
+	assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
+	    (size <= SC_SMALL_MAXCLASS));
+
+	uint64_t allocated, threshold;
+	te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
+	uint64_t allocated_after = allocated + usize;
+	/*
+	 * The ind and usize might be uninitialized (or partially) before
+	 * malloc_init(). The assertions check for: 1) full correctness (usize
+	 * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
+	 * when !initialized.
+	 */
+	if (!malloc_initialized()) {
+		assert(threshold == 0);
+	} else {
+		assert(ind == sz_size2index(size));
+		assert(usize > 0 && usize == sz_index2size(ind));
+	}
+	/*
+	 * Check for events and tsd non-nominal (fast_threshold will be set to
+	 * 0) in a single branch.
+	 */
+	if (unlikely(allocated_after >= threshold)) {
+		return fallback_alloc(size);
+	}
+	assert(tsd_fast(tsd));
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+	assert(tcache == tcache_get(tsd));
+	cache_bin_t *bin = &tcache->bins[ind];
+	bool tcache_success;
+	void *ret;
+
+	/*
+	 * We split up the code this way so that redundant low-water
+	 * computation doesn't happen on the (more common) case in which we
+	 * don't touch the low water mark. The compiler won't do this
+	 * duplication on its own.
+	 */
+	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	if (tcache_success) {
+		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		return ret;
+	}
+	ret = cache_bin_alloc(bin, &tcache_success);
+	if (tcache_success) {
+		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		return ret;
+	}
+
+	return fallback_alloc(size);
+}
+
 #endif /* JEMALLOC_INTERNAL_INLINES_C_H */
diff --git a/include/jemalloc/internal/jemalloc_internal_types.h b/include/jemalloc/internal/jemalloc_internal_types.h
index 1ce0f3aa..61c1f31a 100644
--- a/include/jemalloc/internal/jemalloc_internal_types.h
+++ b/include/jemalloc/internal/jemalloc_internal_types.h
@@ -20,6 +20,14 @@ typedef enum zero_realloc_action_e zero_realloc_action_t;
 /* Signature of write callback. */
 typedef void (write_cb_t)(void *, const char *);
 
+enum malloc_init_e {
+	malloc_init_uninitialized = 3,
+	malloc_init_a0_initialized = 2,
+	malloc_init_recursible = 1,
+	malloc_init_initialized = 0 /* Common case --> jnz. */
+};
+typedef enum malloc_init_e malloc_init_t;
+
 /*
  * Flags bits:
  *
diff --git a/src/jemalloc.c b/src/jemalloc.c
index c2c75fa5..dc3c98b6 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -169,13 +169,7 @@ static arena_t *a0; /* arenas[0]. */
 unsigned narenas_auto;
 unsigned manual_arena_base;
 
-typedef enum {
-	malloc_init_uninitialized = 3,
-	malloc_init_a0_initialized = 2,
-	malloc_init_recursible = 1,
-	malloc_init_initialized = 0 /* Common case --> jnz. */
-} malloc_init_t;
-static malloc_init_t malloc_init_state = malloc_init_uninitialized;
+malloc_init_t malloc_init_state = malloc_init_uninitialized;
 
 /* False should be the common case. Set to true to trigger initialization. */
 bool malloc_slow = true;
@@ -280,11 +274,6 @@ static bool malloc_init_hard(void);
  * Begin miscellaneous support functions.
  */
 
-bool
-malloc_initialized(void) {
-	return (malloc_init_state == malloc_init_initialized);
-}
-
 JEMALLOC_ALWAYS_INLINE bool
 malloc_init_a0(void) {
 	if (unlikely(malloc_init_state == malloc_init_uninitialized)) {
@@ -2597,112 +2586,11 @@ malloc_default(size_t size) {
  * Begin malloc(3)-compatible functions.
  */
 
-JEMALLOC_ALWAYS_INLINE void
-fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
-    cache_bin_t *bin, void *ret) {
-	thread_allocated_set(tsd, allocated_after);
-	if (config_stats) {
-		bin->tstats.nrequests++;
-	}
-
-	LOG("core.malloc.exit", "result: %p", ret);
-}
-
-/*
- * malloc() fastpath.
- *
- * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
- * tcache. If either of these is false, we tail-call to the slowpath,
- * malloc_default(). Tail-calling is used to avoid any caller-saved
- * registers.
- *
- * fastpath supports ticker and profiling, both of which will also
- * tail-call to the slowpath if they fire.
- */
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
 je_malloc(size_t size) {
-	LOG("core.malloc.entry", "size: %zu", size);
-
-	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
-		return malloc_default(size);
-	}
-
-	tsd_t *tsd = tsd_get(false);
-	if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
-		return malloc_default(size);
-	}
-	/*
-	 * The code below till the branch checking the next_event threshold may
-	 * execute before malloc_init(), in which case the threshold is 0 to
-	 * trigger slow path and initialization.
-	 *
-	 * Note that when uninitialized, only the fast-path variants of the sz /
-	 * tsd facilities may be called.
-	 */
-	szind_t ind;
-	/*
-	 * The thread_allocated counter in tsd serves as a general purpose
-	 * accumulator for bytes of allocation to trigger different types of
-	 * events. usize is always needed to advance thread_allocated, though
-	 * it's not always needed in the core allocation logic.
-	 */
-	size_t usize;
-	sz_size2index_usize_fastpath(size, &ind, &usize);
-	/* Fast path relies on size being a bin. */
-	assert(ind < SC_NBINS);
-	assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
-	    (size <= SC_SMALL_MAXCLASS));
-
-	uint64_t allocated, threshold;
-	te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
-	uint64_t allocated_after = allocated + usize;
-	/*
-	 * The ind and usize might be uninitialized (or partially) before
-	 * malloc_init(). The assertions check for: 1) full correctness (usize
-	 * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
-	 * when !initialized.
- */ - if (!malloc_initialized()) { - assert(threshold == 0); - } else { - assert(ind == sz_size2index(size)); - assert(usize > 0 && usize == sz_index2size(ind)); - } - /* - * Check for events and tsd non-nominal (fast_threshold will be set to - * 0) in a single branch. - */ - if (unlikely(allocated_after >= threshold)) { - return malloc_default(size); - } - assert(tsd_fast(tsd)); - - tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC, - /* slow */ false, /* is_alloc */ true); - cache_bin_t *bin = &tcache->bins[ind]; - bool tcache_success; - void *ret; - - /* - * We split up the code this way so that redundant low-water - * computation doesn't happen on the (more common) case in which we - * don't touch the low water mark. The compiler won't do this - * duplication on its own. - */ - ret = cache_bin_alloc_easy(bin, &tcache_success); - if (tcache_success) { - fastpath_success_finish(tsd, allocated_after, bin, ret); - return ret; - } - ret = cache_bin_alloc(bin, &tcache_success); - if (tcache_success) { - fastpath_success_finish(tsd, allocated_after, bin, ret); - return ret; - } - - return malloc_default(size); + return imalloc_fastpath(size, &malloc_default); } JEMALLOC_EXPORT int JEMALLOC_NOTHROW diff --git a/src/jemalloc_cpp.cpp b/src/jemalloc_cpp.cpp index 6959b27f..47ba92a0 100644 --- a/src/jemalloc_cpp.cpp +++ b/src/jemalloc_cpp.cpp @@ -86,10 +86,10 @@ handleOOM(std::size_t size, bool nothrow) { } template -JEMALLOC_ALWAYS_INLINE -void * -newImpl(std::size_t size) noexcept(IsNoExcept) { - void *ptr = je_malloc(size); +JEMALLOC_NOINLINE +static void * +fallback_impl(std::size_t size) noexcept(IsNoExcept) { + void *ptr = malloc_default(size); if (likely(ptr != nullptr)) { return ptr; } @@ -97,6 +97,13 @@ newImpl(std::size_t size) noexcept(IsNoExcept) { return handleOOM(size, IsNoExcept); } +template +JEMALLOC_ALWAYS_INLINE +void * +newImpl(std::size_t size) noexcept(IsNoExcept) { + return imalloc_fastpath(size, &fallback_impl); +} + void * operator new(std::size_t size) { return newImpl(size);