Inline malloc fastpath into operator new.
This saves a small but non-negligible amount of CPU in C++ programs.
This commit is contained in:
parent
79f81a3732
commit
edbfe6912c
@ -30,6 +30,7 @@ extern bool opt_xmalloc;
|
||||
extern bool opt_zero;
|
||||
extern unsigned opt_narenas;
|
||||
extern zero_realloc_action_t opt_zero_realloc_action;
|
||||
extern malloc_init_t malloc_init_state;
|
||||
extern const char *zero_realloc_mode_names[];
|
||||
extern atomic_zu_t zero_realloc_count;
|
||||
|
||||
@ -64,7 +65,7 @@ size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
|
||||
void jemalloc_prefork(void);
|
||||
void jemalloc_postfork_parent(void);
|
||||
void jemalloc_postfork_child(void);
|
||||
bool malloc_initialized(void);
|
||||
void je_sdallocx_noflags(void *ptr, size_t size);
|
||||
void *malloc_default(size_t size);
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_EXTERNS_H */
|
||||
|
@ -3,7 +3,9 @@
|
||||
|
||||
#include "jemalloc/internal/hook.h"
|
||||
#include "jemalloc/internal/jemalloc_internal_types.h"
|
||||
#include "jemalloc/internal/log.h"
|
||||
#include "jemalloc/internal/sz.h"
|
||||
#include "jemalloc/internal/thread_event.h"
|
||||
#include "jemalloc/internal/witness.h"
|
||||
|
||||
/*
|
||||
@ -219,4 +221,120 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
|
||||
newsize);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
|
||||
cache_bin_t *bin, void *ret) {
|
||||
thread_allocated_set(tsd, allocated_after);
|
||||
if (config_stats) {
|
||||
bin->tstats.nrequests++;
|
||||
}
|
||||
|
||||
LOG("core.malloc.exit", "result: %p", ret);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
malloc_initialized(void) {
|
||||
return (malloc_init_state == malloc_init_initialized);
|
||||
}
|
||||
|
||||
/*
|
||||
* malloc() fastpath. Included here so that we can inline it into operator new;
|
||||
* function call overhead there is non-negligible as a fraction of total CPU in
|
||||
* allocation-heavy C++ programs. We take the fallback alloc to allow malloc
|
||||
* (which can return NULL) to differ in its behavior from operator new (which
|
||||
* can't). It matches the signature of malloc / operator new so that we can
|
||||
* tail-call the fallback allocator, allowing us to avoid setting up the call
|
||||
* frame in the common case.
|
||||
*
|
||||
* Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
|
||||
* tcache. If either of these is false, we tail-call to the slowpath,
|
||||
* malloc_default(). Tail-calling is used to avoid any caller-saved
|
||||
* registers.
|
||||
*
|
||||
* fastpath supports ticker and profiling, both of which will also
|
||||
* tail-call to the slowpath if they fire.
|
||||
*/
|
||||
JEMALLOC_ALWAYS_INLINE void *
|
||||
imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
|
||||
LOG("core.malloc.entry", "size: %zu", size);
|
||||
if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
|
||||
return fallback_alloc(size);
|
||||
}
|
||||
|
||||
tsd_t *tsd = tsd_get(false);
|
||||
if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
|
||||
return fallback_alloc(size);
|
||||
}
|
||||
/*
|
||||
* The code below till the branch checking the next_event threshold may
|
||||
* execute before malloc_init(), in which case the threshold is 0 to
|
||||
* trigger slow path and initialization.
|
||||
*
|
||||
* Note that when uninitialized, only the fast-path variants of the sz /
|
||||
* tsd facilities may be called.
|
||||
*/
|
||||
szind_t ind;
|
||||
/*
|
||||
* The thread_allocated counter in tsd serves as a general purpose
|
||||
* accumulator for bytes of allocation to trigger different types of
|
||||
* events. usize is always needed to advance thread_allocated, though
|
||||
* it's not always needed in the core allocation logic.
|
||||
*/
|
||||
size_t usize;
|
||||
sz_size2index_usize_fastpath(size, &ind, &usize);
|
||||
/* Fast path relies on size being a bin. */
|
||||
assert(ind < SC_NBINS);
|
||||
assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
|
||||
(size <= SC_SMALL_MAXCLASS));
|
||||
|
||||
uint64_t allocated, threshold;
|
||||
te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
|
||||
uint64_t allocated_after = allocated + usize;
|
||||
/*
|
||||
* The ind and usize might be uninitialized (or partially) before
|
||||
* malloc_init(). The assertions check for: 1) full correctness (usize
|
||||
* & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
|
||||
* when !initialized.
|
||||
*/
|
||||
if (!malloc_initialized()) {
|
||||
assert(threshold == 0);
|
||||
} else {
|
||||
assert(ind == sz_size2index(size));
|
||||
assert(usize > 0 && usize == sz_index2size(ind));
|
||||
}
|
||||
/*
|
||||
* Check for events and tsd non-nominal (fast_threshold will be set to
|
||||
* 0) in a single branch.
|
||||
*/
|
||||
if (unlikely(allocated_after >= threshold)) {
|
||||
return fallback_alloc(size);
|
||||
}
|
||||
assert(tsd_fast(tsd));
|
||||
|
||||
tcache_t *tcache = tsd_tcachep_get(tsd);
|
||||
assert(tcache == tcache_get(tsd));
|
||||
cache_bin_t *bin = &tcache->bins[ind];
|
||||
bool tcache_success;
|
||||
void *ret;
|
||||
|
||||
/*
|
||||
* We split up the code this way so that redundant low-water
|
||||
* computation doesn't happen on the (more common) case in which we
|
||||
* don't touch the low water mark. The compiler won't do this
|
||||
* duplication on its own.
|
||||
*/
|
||||
ret = cache_bin_alloc_easy(bin, &tcache_success);
|
||||
if (tcache_success) {
|
||||
fastpath_success_finish(tsd, allocated_after, bin, ret);
|
||||
return ret;
|
||||
}
|
||||
ret = cache_bin_alloc(bin, &tcache_success);
|
||||
if (tcache_success) {
|
||||
fastpath_success_finish(tsd, allocated_after, bin, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return fallback_alloc(size);
|
||||
}
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_INLINES_C_H */
|
||||
|
@ -20,6 +20,14 @@ typedef enum zero_realloc_action_e zero_realloc_action_t;
|
||||
/* Signature of write callback. */
|
||||
typedef void (write_cb_t)(void *, const char *);
|
||||
|
||||
/*
 * Allocator initialization states.  The numeric values are pinned
 * explicitly; malloc_init_initialized is 0 per the "jnz" note below.
 */
typedef enum malloc_init_e {
	malloc_init_initialized = 0, /* Common case --> jnz. */
	malloc_init_recursible = 1,
	malloc_init_a0_initialized = 2,
	malloc_init_uninitialized = 3
} malloc_init_t;
|
||||
|
||||
/*
|
||||
* Flags bits:
|
||||
*
|
||||
|
116
src/jemalloc.c
116
src/jemalloc.c
@ -169,13 +169,7 @@ static arena_t *a0; /* arenas[0]. */
|
||||
unsigned narenas_auto;
|
||||
unsigned manual_arena_base;
|
||||
|
||||
typedef enum {
|
||||
malloc_init_uninitialized = 3,
|
||||
malloc_init_a0_initialized = 2,
|
||||
malloc_init_recursible = 1,
|
||||
malloc_init_initialized = 0 /* Common case --> jnz. */
|
||||
} malloc_init_t;
|
||||
static malloc_init_t malloc_init_state = malloc_init_uninitialized;
|
||||
malloc_init_t malloc_init_state = malloc_init_uninitialized;
|
||||
|
||||
/* False should be the common case. Set to true to trigger initialization. */
|
||||
bool malloc_slow = true;
|
||||
@ -280,11 +274,6 @@ static bool malloc_init_hard(void);
|
||||
* Begin miscellaneous support functions.
|
||||
*/
|
||||
|
||||
bool
|
||||
malloc_initialized(void) {
|
||||
return (malloc_init_state == malloc_init_initialized);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
malloc_init_a0(void) {
|
||||
if (unlikely(malloc_init_state == malloc_init_uninitialized)) {
|
||||
@ -2597,112 +2586,11 @@ malloc_default(size_t size) {
|
||||
* Begin malloc(3)-compatible functions.
|
||||
*/
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
|
||||
cache_bin_t *bin, void *ret) {
|
||||
thread_allocated_set(tsd, allocated_after);
|
||||
if (config_stats) {
|
||||
bin->tstats.nrequests++;
|
||||
}
|
||||
|
||||
LOG("core.malloc.exit", "result: %p", ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* malloc() fastpath.
|
||||
*
|
||||
* Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
|
||||
* tcache. If either of these is false, we tail-call to the slowpath,
|
||||
* malloc_default(). Tail-calling is used to avoid any caller-saved
|
||||
* registers.
|
||||
*
|
||||
* fastpath supports ticker and profiling, both of which will also
|
||||
* tail-call to the slowpath if they fire.
|
||||
*/
|
||||
JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
|
||||
void JEMALLOC_NOTHROW *
|
||||
JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
|
||||
je_malloc(size_t size) {
|
||||
LOG("core.malloc.entry", "size: %zu", size);
|
||||
|
||||
if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
|
||||
return malloc_default(size);
|
||||
}
|
||||
|
||||
tsd_t *tsd = tsd_get(false);
|
||||
if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
|
||||
return malloc_default(size);
|
||||
}
|
||||
/*
|
||||
* The code below till the branch checking the next_event threshold may
|
||||
* execute before malloc_init(), in which case the threshold is 0 to
|
||||
* trigger slow path and initialization.
|
||||
*
|
||||
* Note that when uninitialized, only the fast-path variants of the sz /
|
||||
* tsd facilities may be called.
|
||||
*/
|
||||
szind_t ind;
|
||||
/*
|
||||
* The thread_allocated counter in tsd serves as a general purpose
|
||||
* accumulator for bytes of allocation to trigger different types of
|
||||
* events. usize is always needed to advance thread_allocated, though
|
||||
* it's not always needed in the core allocation logic.
|
||||
*/
|
||||
size_t usize;
|
||||
sz_size2index_usize_fastpath(size, &ind, &usize);
|
||||
/* Fast path relies on size being a bin. */
|
||||
assert(ind < SC_NBINS);
|
||||
assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
|
||||
(size <= SC_SMALL_MAXCLASS));
|
||||
|
||||
uint64_t allocated, threshold;
|
||||
te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
|
||||
uint64_t allocated_after = allocated + usize;
|
||||
/*
|
||||
* The ind and usize might be uninitialized (or partially) before
|
||||
* malloc_init(). The assertions check for: 1) full correctness (usize
|
||||
* & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
|
||||
* when !initialized.
|
||||
*/
|
||||
if (!malloc_initialized()) {
|
||||
assert(threshold == 0);
|
||||
} else {
|
||||
assert(ind == sz_size2index(size));
|
||||
assert(usize > 0 && usize == sz_index2size(ind));
|
||||
}
|
||||
/*
|
||||
* Check for events and tsd non-nominal (fast_threshold will be set to
|
||||
* 0) in a single branch.
|
||||
*/
|
||||
if (unlikely(allocated_after >= threshold)) {
|
||||
return malloc_default(size);
|
||||
}
|
||||
assert(tsd_fast(tsd));
|
||||
|
||||
tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC,
|
||||
/* slow */ false, /* is_alloc */ true);
|
||||
cache_bin_t *bin = &tcache->bins[ind];
|
||||
bool tcache_success;
|
||||
void *ret;
|
||||
|
||||
/*
|
||||
* We split up the code this way so that redundant low-water
|
||||
* computation doesn't happen on the (more common) case in which we
|
||||
* don't touch the low water mark. The compiler won't do this
|
||||
* duplication on its own.
|
||||
*/
|
||||
ret = cache_bin_alloc_easy(bin, &tcache_success);
|
||||
if (tcache_success) {
|
||||
fastpath_success_finish(tsd, allocated_after, bin, ret);
|
||||
return ret;
|
||||
}
|
||||
ret = cache_bin_alloc(bin, &tcache_success);
|
||||
if (tcache_success) {
|
||||
fastpath_success_finish(tsd, allocated_after, bin, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return malloc_default(size);
|
||||
return imalloc_fastpath(size, &malloc_default);
|
||||
}
|
||||
|
||||
JEMALLOC_EXPORT int JEMALLOC_NOTHROW
|
||||
|
@ -86,10 +86,10 @@ handleOOM(std::size_t size, bool nothrow) {
|
||||
}
|
||||
|
||||
template <bool IsNoExcept>
|
||||
JEMALLOC_ALWAYS_INLINE
|
||||
void *
|
||||
newImpl(std::size_t size) noexcept(IsNoExcept) {
|
||||
void *ptr = je_malloc(size);
|
||||
JEMALLOC_NOINLINE
|
||||
static void *
|
||||
fallback_impl(std::size_t size) noexcept(IsNoExcept) {
|
||||
void *ptr = malloc_default(size);
|
||||
if (likely(ptr != nullptr)) {
|
||||
return ptr;
|
||||
}
|
||||
@ -97,6 +97,13 @@ newImpl(std::size_t size) noexcept(IsNoExcept) {
|
||||
return handleOOM(size, IsNoExcept);
|
||||
}
|
||||
|
||||
template <bool IsNoExcept>
|
||||
JEMALLOC_ALWAYS_INLINE
|
||||
void *
|
||||
newImpl(std::size_t size) noexcept(IsNoExcept) {
|
||||
return imalloc_fastpath(size, &fallback_impl<IsNoExcept>);
|
||||
}
|
||||
|
||||
void *
|
||||
operator new(std::size_t size) {
|
||||
return newImpl<false>(size);
|
||||
|
Loading…
Reference in New Issue
Block a user