Inline malloc fastpath into operator new.

This saves a small but non-negligible amount of CPU in C++ programs.
This commit is contained in:
David Goldblatt 2021-02-08 08:49:34 -08:00 committed by David Goldblatt
parent 79f81a3732
commit edbfe6912c
5 changed files with 141 additions and 119 deletions

View File

@ -30,6 +30,7 @@ extern bool opt_xmalloc;
extern bool opt_zero;
extern unsigned opt_narenas;
extern zero_realloc_action_t opt_zero_realloc_action;
extern malloc_init_t malloc_init_state;
extern const char *zero_realloc_mode_names[];
extern atomic_zu_t zero_realloc_count;
@ -64,7 +65,7 @@ size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
void jemalloc_prefork(void);
void jemalloc_postfork_parent(void);
void jemalloc_postfork_child(void);
bool malloc_initialized(void);
void je_sdallocx_noflags(void *ptr, size_t size);
void *malloc_default(size_t size);
#endif /* JEMALLOC_INTERNAL_EXTERNS_H */

View File

@ -3,7 +3,9 @@
#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/log.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/thread_event.h"
#include "jemalloc/internal/witness.h"
/*
@ -219,4 +221,120 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
newsize);
}
JEMALLOC_ALWAYS_INLINE void
fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
cache_bin_t *bin, void *ret) {
thread_allocated_set(tsd, allocated_after);
if (config_stats) {
bin->tstats.nrequests++;
}
LOG("core.malloc.exit", "result: %p", ret);
}
JEMALLOC_ALWAYS_INLINE bool
malloc_initialized(void) {
return (malloc_init_state == malloc_init_initialized);
}
/*
* malloc() fastpath. Included here so that we can inline it into operator new;
* function call overhead there is non-negligible as a fraction of total CPU in
* allocation-heavy C++ programs. We take the fallback alloc to allow malloc
* (which can return NULL) to differ in its behavior from operator new (which
* can't). It matches the signature of malloc / operator new so that we can
* tail-call the fallback allocator, allowing us to avoid setting up the call
* frame in the common case.
*
* Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
* tcache. If either of these is false, we tail-call to the slowpath,
* malloc_default(). Tail-calling is used to avoid any caller-saved
* registers.
*
* fastpath supports ticker and profiling, both of which will also
* tail-call to the slowpath if they fire.
*/
JEMALLOC_ALWAYS_INLINE void *
imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
LOG("core.malloc.entry", "size: %zu", size);
if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
return fallback_alloc(size);
}
tsd_t *tsd = tsd_get(false);
if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
return fallback_alloc(size);
}
/*
* The code below till the branch checking the next_event threshold may
* execute before malloc_init(), in which case the threshold is 0 to
* trigger slow path and initialization.
*
* Note that when uninitialized, only the fast-path variants of the sz /
* tsd facilities may be called.
*/
szind_t ind;
/*
* The thread_allocated counter in tsd serves as a general purpose
* accumulator for bytes of allocation to trigger different types of
* events. usize is always needed to advance thread_allocated, though
* it's not always needed in the core allocation logic.
*/
size_t usize;
sz_size2index_usize_fastpath(size, &ind, &usize);
/* Fast path relies on size being a bin. */
assert(ind < SC_NBINS);
assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
(size <= SC_SMALL_MAXCLASS));
uint64_t allocated, threshold;
te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
uint64_t allocated_after = allocated + usize;
/*
* The ind and usize might be uninitialized (or partially) before
* malloc_init(). The assertions check for: 1) full correctness (usize
* & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
* when !initialized.
*/
if (!malloc_initialized()) {
assert(threshold == 0);
} else {
assert(ind == sz_size2index(size));
assert(usize > 0 && usize == sz_index2size(ind));
}
/*
* Check for events and tsd non-nominal (fast_threshold will be set to
* 0) in a single branch.
*/
if (unlikely(allocated_after >= threshold)) {
return fallback_alloc(size);
}
assert(tsd_fast(tsd));
tcache_t *tcache = tsd_tcachep_get(tsd);
assert(tcache == tcache_get(tsd));
cache_bin_t *bin = &tcache->bins[ind];
bool tcache_success;
void *ret;
/*
* We split up the code this way so that redundant low-water
* computation doesn't happen on the (more common) case in which we
* don't touch the low water mark. The compiler won't do this
* duplication on its own.
*/
ret = cache_bin_alloc_easy(bin, &tcache_success);
if (tcache_success) {
fastpath_success_finish(tsd, allocated_after, bin, ret);
return ret;
}
ret = cache_bin_alloc(bin, &tcache_success);
if (tcache_success) {
fastpath_success_finish(tsd, allocated_after, bin, ret);
return ret;
}
return fallback_alloc(size);
}
#endif /* JEMALLOC_INTERNAL_INLINES_C_H */

View File

@ -20,6 +20,14 @@ typedef enum zero_realloc_action_e zero_realloc_action_t;
/* Signature of write callback. */
typedef void (write_cb_t)(void *, const char *);
enum malloc_init_e {
malloc_init_uninitialized = 3,
malloc_init_a0_initialized = 2,
malloc_init_recursible = 1,
malloc_init_initialized = 0 /* Common case --> jnz. */
};
typedef enum malloc_init_e malloc_init_t;
/*
* Flags bits:
*

View File

@ -169,13 +169,7 @@ static arena_t *a0; /* arenas[0]. */
unsigned narenas_auto;
unsigned manual_arena_base;
typedef enum {
malloc_init_uninitialized = 3,
malloc_init_a0_initialized = 2,
malloc_init_recursible = 1,
malloc_init_initialized = 0 /* Common case --> jnz. */
} malloc_init_t;
static malloc_init_t malloc_init_state = malloc_init_uninitialized;
malloc_init_t malloc_init_state = malloc_init_uninitialized;
/* False should be the common case. Set to true to trigger initialization. */
bool malloc_slow = true;
@ -280,11 +274,6 @@ static bool malloc_init_hard(void);
* Begin miscellaneous support functions.
*/
bool
malloc_initialized(void) {
return (malloc_init_state == malloc_init_initialized);
}
JEMALLOC_ALWAYS_INLINE bool
malloc_init_a0(void) {
if (unlikely(malloc_init_state == malloc_init_uninitialized)) {
@ -2597,112 +2586,11 @@ malloc_default(size_t size) {
* Begin malloc(3)-compatible functions.
*/
JEMALLOC_ALWAYS_INLINE void
fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
cache_bin_t *bin, void *ret) {
thread_allocated_set(tsd, allocated_after);
if (config_stats) {
bin->tstats.nrequests++;
}
LOG("core.malloc.exit", "result: %p", ret);
}
/*
* malloc() fastpath.
*
* Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
* tcache. If either of these is false, we tail-call to the slowpath,
* malloc_default(). Tail-calling is used to avoid any caller-saved
* registers.
*
* fastpath supports ticker and profiling, both of which will also
* tail-call to the slowpath if they fire.
*/
JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
void JEMALLOC_NOTHROW *
JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
je_malloc(size_t size) {
LOG("core.malloc.entry", "size: %zu", size);
if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
return malloc_default(size);
}
tsd_t *tsd = tsd_get(false);
if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
return malloc_default(size);
}
/*
* The code below till the branch checking the next_event threshold may
* execute before malloc_init(), in which case the threshold is 0 to
* trigger slow path and initialization.
*
* Note that when uninitialized, only the fast-path variants of the sz /
* tsd facilities may be called.
*/
szind_t ind;
/*
* The thread_allocated counter in tsd serves as a general purpose
* accumulator for bytes of allocation to trigger different types of
* events. usize is always needed to advance thread_allocated, though
* it's not always needed in the core allocation logic.
*/
size_t usize;
sz_size2index_usize_fastpath(size, &ind, &usize);
/* Fast path relies on size being a bin. */
assert(ind < SC_NBINS);
assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
(size <= SC_SMALL_MAXCLASS));
uint64_t allocated, threshold;
te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
uint64_t allocated_after = allocated + usize;
/*
* The ind and usize might be uninitialized (or partially) before
* malloc_init(). The assertions check for: 1) full correctness (usize
* & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
* when !initialized.
*/
if (!malloc_initialized()) {
assert(threshold == 0);
} else {
assert(ind == sz_size2index(size));
assert(usize > 0 && usize == sz_index2size(ind));
}
/*
* Check for events and tsd non-nominal (fast_threshold will be set to
* 0) in a single branch.
*/
if (unlikely(allocated_after >= threshold)) {
return malloc_default(size);
}
assert(tsd_fast(tsd));
tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC,
/* slow */ false, /* is_alloc */ true);
cache_bin_t *bin = &tcache->bins[ind];
bool tcache_success;
void *ret;
/*
* We split up the code this way so that redundant low-water
* computation doesn't happen on the (more common) case in which we
* don't touch the low water mark. The compiler won't do this
* duplication on its own.
*/
ret = cache_bin_alloc_easy(bin, &tcache_success);
if (tcache_success) {
fastpath_success_finish(tsd, allocated_after, bin, ret);
return ret;
}
ret = cache_bin_alloc(bin, &tcache_success);
if (tcache_success) {
fastpath_success_finish(tsd, allocated_after, bin, ret);
return ret;
}
return malloc_default(size);
return imalloc_fastpath(size, &malloc_default);
}
JEMALLOC_EXPORT int JEMALLOC_NOTHROW

View File

@ -86,10 +86,10 @@ handleOOM(std::size_t size, bool nothrow) {
}
template <bool IsNoExcept>
JEMALLOC_ALWAYS_INLINE
void *
newImpl(std::size_t size) noexcept(IsNoExcept) {
void *ptr = je_malloc(size);
JEMALLOC_NOINLINE
static void *
fallback_impl(std::size_t size) noexcept(IsNoExcept) {
void *ptr = malloc_default(size);
if (likely(ptr != nullptr)) {
return ptr;
}
@ -97,6 +97,13 @@ newImpl(std::size_t size) noexcept(IsNoExcept) {
return handleOOM(size, IsNoExcept);
}
template <bool IsNoExcept>
JEMALLOC_ALWAYS_INLINE
void *
newImpl(std::size_t size) noexcept(IsNoExcept) {
return imalloc_fastpath(size, &fallback_impl<IsNoExcept>);
}
void *
operator new(std::size_t size) {
return newImpl<false>(size);