Inline malloc fastpath into operator new.
This saves a small but non-negligible amount of CPU in C++ programs.
parent 79f81a3732
commit edbfe6912c
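The change turns the malloc fast path into a header-only helper that takes its slow path as a parameter and calls it on every bail-out, so both je_malloc and operator new can inline the tcache hit path while keeping their different failure semantics out of line. As a rough orientation before the diff, here is a minimal standalone C++ sketch of that shape; it is illustrative only (fast_alloc, slow_alloc, throwing_fallback, and the toy bump cache are invented names, not jemalloc code):

#include <cstddef>
#include <cstdlib>
#include <new>

// Hypothetical out-of-line slow path (the analogue of malloc_default).
[[gnu::noinline]] static void *slow_alloc(std::size_t size) {
        return std::malloc(size);
}

// Hypothetical inline fast path, parameterized on the fallback allocator so
// that a malloc-style caller (may return nullptr) and an operator new-style
// caller (must throw) can share the same inlined hot path; every bail-out
// simply defers to the fallback.
template <typename Fallback>
inline void *fast_alloc(std::size_t size, Fallback fallback) {
        // Toy per-thread bump region standing in for the real tcache bins.
        alignas(16) static thread_local unsigned char cache[1 << 16];
        static thread_local std::size_t used = 0;
        constexpr std::size_t kSmallMax = 256;  // made-up small-size cutoff
        if (size == 0 || size > kSmallMax || used + size > sizeof(cache)) {
                return fallback(size);          // miss: take the slow path
        }
        void *ret = cache + used;
        used += (size + 15) & ~static_cast<std::size_t>(15);  // keep 16-byte alignment
        return ret;
}

// operator new-style wrapper: its fallback turns failure into an exception,
// so the shared fast path itself never has to know about exceptions.
static void *throwing_fallback(std::size_t size) {
        if (void *p = slow_alloc(size)) {
                return p;
        }
        throw std::bad_alloc();
}

int main() {
        int *p = static_cast<int *>(fast_alloc(sizeof(int), throwing_fallback));
        *p = 42;
        // The toy cache is thread-local static storage, so a cache hit needs
        // no free here; a real allocator would of course track ownership.
        return *p == 42 ? 0 : 1;
}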
include/jemalloc/internal/jemalloc_internal_externs.h

@@ -30,6 +30,7 @@ extern bool opt_xmalloc;
 extern bool opt_zero;
 extern unsigned opt_narenas;
 extern zero_realloc_action_t opt_zero_realloc_action;
+extern malloc_init_t malloc_init_state;
 extern const char *zero_realloc_mode_names[];
 extern atomic_zu_t zero_realloc_count;
 
@@ -64,7 +65,7 @@ size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
 void jemalloc_prefork(void);
 void jemalloc_postfork_parent(void);
 void jemalloc_postfork_child(void);
-bool malloc_initialized(void);
 void je_sdallocx_noflags(void *ptr, size_t size);
+void *malloc_default(size_t size);
 
 #endif /* JEMALLOC_INTERNAL_EXTERNS_H */
include/jemalloc/internal/jemalloc_internal_inlines_c.h

@@ -3,7 +3,9 @@
 
 #include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/log.h"
 #include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/thread_event.h"
 #include "jemalloc/internal/witness.h"
 
 /*
@@ -219,4 +221,120 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
             newsize);
 }
 
+JEMALLOC_ALWAYS_INLINE void
+fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
+    cache_bin_t *bin, void *ret) {
+        thread_allocated_set(tsd, allocated_after);
+        if (config_stats) {
+                bin->tstats.nrequests++;
+        }
+
+        LOG("core.malloc.exit", "result: %p", ret);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+malloc_initialized(void) {
+        return (malloc_init_state == malloc_init_initialized);
+}
+
+/*
+ * malloc() fastpath.  Included here so that we can inline it into operator new;
+ * function call overhead there is non-negligible as a fraction of total CPU in
+ * allocation-heavy C++ programs.  We take the fallback alloc to allow malloc
+ * (which can return NULL) to differ in its behavior from operator new (which
+ * can't).  It matches the signature of malloc / operator new so that we can
+ * tail-call the fallback allocator, allowing us to avoid setting up the call
+ * frame in the common case.
+ *
+ * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
+ * tcache.  If either of these is false, we tail-call to the slowpath,
+ * malloc_default().  Tail-calling is used to avoid any caller-saved
+ * registers.
+ *
+ * fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
+        LOG("core.malloc.entry", "size: %zu", size);
+        if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+                return fallback_alloc(size);
+        }
+
+        tsd_t *tsd = tsd_get(false);
+        if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
+                return fallback_alloc(size);
+        }
+        /*
+         * The code below till the branch checking the next_event threshold may
+         * execute before malloc_init(), in which case the threshold is 0 to
+         * trigger slow path and initialization.
+         *
+         * Note that when uninitialized, only the fast-path variants of the sz /
+         * tsd facilities may be called.
+         */
+        szind_t ind;
+        /*
+         * The thread_allocated counter in tsd serves as a general purpose
+         * accumulator for bytes of allocation to trigger different types of
+         * events.  usize is always needed to advance thread_allocated, though
+         * it's not always needed in the core allocation logic.
+         */
+        size_t usize;
+        sz_size2index_usize_fastpath(size, &ind, &usize);
+        /* Fast path relies on size being a bin. */
+        assert(ind < SC_NBINS);
+        assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
+            (size <= SC_SMALL_MAXCLASS));
+
+        uint64_t allocated, threshold;
+        te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
+        uint64_t allocated_after = allocated + usize;
+        /*
+         * The ind and usize might be uninitialized (or partially) before
+         * malloc_init().  The assertions check for: 1) full correctness (usize
+         * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
+         * when !initialized.
+         */
+        if (!malloc_initialized()) {
+                assert(threshold == 0);
+        } else {
+                assert(ind == sz_size2index(size));
+                assert(usize > 0 && usize == sz_index2size(ind));
+        }
+        /*
+         * Check for events and tsd non-nominal (fast_threshold will be set to
+         * 0) in a single branch.
+         */
+        if (unlikely(allocated_after >= threshold)) {
+                return fallback_alloc(size);
+        }
+        assert(tsd_fast(tsd));
+
+        tcache_t *tcache = tsd_tcachep_get(tsd);
+        assert(tcache == tcache_get(tsd));
+        cache_bin_t *bin = &tcache->bins[ind];
+        bool tcache_success;
+        void *ret;
+
+        /*
+         * We split up the code this way so that redundant low-water
+         * computation doesn't happen on the (more common) case in which we
+         * don't touch the low water mark.  The compiler won't do this
+         * duplication on its own.
+         */
+        ret = cache_bin_alloc_easy(bin, &tcache_success);
+        if (tcache_success) {
+                fastpath_success_finish(tsd, allocated_after, bin, ret);
+                return ret;
+        }
+        ret = cache_bin_alloc(bin, &tcache_success);
+        if (tcache_success) {
+                fastpath_success_finish(tsd, allocated_after, bin, ret);
+                return ret;
+        }
+
+        return fallback_alloc(size);
+}
+
 #endif /* JEMALLOC_INTERNAL_INLINES_C_H */
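The comments in imalloc_fastpath above describe how profiling, tickers, and a non-nominal tsd are all funneled through one comparison: the per-thread allocated-byte counter is checked against a single threshold, and anything that needs the slow path simply forces that threshold to 0. A standalone sketch of that accumulate-and-compare idiom follows; the names and numbers are invented for illustration and this is not jemalloc's thread_event machinery:

#include <cstdint>
#include <cstdio>

// Toy per-thread state standing in for tsd: a byte accumulator plus a single
// threshold that encodes both "next event" and "thread not fast" at once.
struct ThreadState {
        std::uint64_t allocated = 0;
        std::uint64_t threshold = 1 << 20;  // next event after ~1 MiB (made up)
};

thread_local ThreadState g_state;

// Slow path: handle whatever fired (event, reentrancy, teardown) and re-arm.
[[gnu::noinline]] void slow_path(std::uint64_t allocated_after) {
        std::printf("event at %llu bytes\n",
            static_cast<unsigned long long>(allocated_after));
        g_state.allocated = allocated_after;
        g_state.threshold = allocated_after + (1 << 20);  // re-arm next event
}

// Fast path: one add and one branch cover every "leave the fast path" reason,
// because anything that must divert simply sets threshold to 0.
inline void account(std::uint64_t usize) {
        std::uint64_t after = g_state.allocated + usize;
        if (after >= g_state.threshold) {   // single branch for all events
                slow_path(after);
                return;
        }
        g_state.allocated = after;
}

int main() {
        for (int i = 0; i < 100; ++i) {
                account(64 * 1024);  // trips the threshold every ~16 iterations
        }
}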
include/jemalloc/internal/jemalloc_internal_types.h

@@ -20,6 +20,14 @@ typedef enum zero_realloc_action_e zero_realloc_action_t;
 /* Signature of write callback. */
 typedef void (write_cb_t)(void *, const char *);
 
+enum malloc_init_e {
+        malloc_init_uninitialized = 3,
+        malloc_init_a0_initialized = 2,
+        malloc_init_recursible = 1,
+        malloc_init_initialized = 0 /* Common case --> jnz. */
+};
+typedef enum malloc_init_e malloc_init_t;
+
 /*
  * Flags bits:
  *
src/jemalloc.c

@@ -169,13 +169,7 @@ static arena_t *a0; /* arenas[0]. */
 unsigned narenas_auto;
 unsigned manual_arena_base;
 
-typedef enum {
-        malloc_init_uninitialized = 3,
-        malloc_init_a0_initialized = 2,
-        malloc_init_recursible = 1,
-        malloc_init_initialized = 0 /* Common case --> jnz. */
-} malloc_init_t;
-static malloc_init_t malloc_init_state = malloc_init_uninitialized;
+malloc_init_t malloc_init_state = malloc_init_uninitialized;
 
 /* False should be the common case.  Set to true to trigger initialization. */
 bool malloc_slow = true;
@@ -280,11 +274,6 @@ static bool malloc_init_hard(void);
  * Begin miscellaneous support functions.
  */
 
-bool
-malloc_initialized(void) {
-        return (malloc_init_state == malloc_init_initialized);
-}
-
 JEMALLOC_ALWAYS_INLINE bool
 malloc_init_a0(void) {
         if (unlikely(malloc_init_state == malloc_init_uninitialized)) {
@@ -2597,112 +2586,11 @@ malloc_default(size_t size) {
  * Begin malloc(3)-compatible functions.
  */
 
-JEMALLOC_ALWAYS_INLINE void
-fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
-    cache_bin_t *bin, void *ret) {
-        thread_allocated_set(tsd, allocated_after);
-        if (config_stats) {
-                bin->tstats.nrequests++;
-        }
-
-        LOG("core.malloc.exit", "result: %p", ret);
-}
-
-/*
- * malloc() fastpath.
- *
- * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
- * tcache.  If either of these is false, we tail-call to the slowpath,
- * malloc_default().  Tail-calling is used to avoid any caller-saved
- * registers.
- *
- * fastpath supports ticker and profiling, both of which will also
- * tail-call to the slowpath if they fire.
- */
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
 je_malloc(size_t size) {
-        LOG("core.malloc.entry", "size: %zu", size);
-
-        if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
-                return malloc_default(size);
-        }
-
-        tsd_t *tsd = tsd_get(false);
-        if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
-                return malloc_default(size);
-        }
-        /*
-         * The code below till the branch checking the next_event threshold may
-         * execute before malloc_init(), in which case the threshold is 0 to
-         * trigger slow path and initialization.
-         *
-         * Note that when uninitialized, only the fast-path variants of the sz /
-         * tsd facilities may be called.
-         */
-        szind_t ind;
-        /*
-         * The thread_allocated counter in tsd serves as a general purpose
-         * accumulator for bytes of allocation to trigger different types of
-         * events.  usize is always needed to advance thread_allocated, though
-         * it's not always needed in the core allocation logic.
-         */
-        size_t usize;
-        sz_size2index_usize_fastpath(size, &ind, &usize);
-        /* Fast path relies on size being a bin. */
-        assert(ind < SC_NBINS);
-        assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
-            (size <= SC_SMALL_MAXCLASS));
-
-        uint64_t allocated, threshold;
-        te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
-        uint64_t allocated_after = allocated + usize;
-        /*
-         * The ind and usize might be uninitialized (or partially) before
-         * malloc_init().  The assertions check for: 1) full correctness (usize
-         * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
-         * when !initialized.
-         */
-        if (!malloc_initialized()) {
-                assert(threshold == 0);
-        } else {
-                assert(ind == sz_size2index(size));
-                assert(usize > 0 && usize == sz_index2size(ind));
-        }
-        /*
-         * Check for events and tsd non-nominal (fast_threshold will be set to
-         * 0) in a single branch.
-         */
-        if (unlikely(allocated_after >= threshold)) {
-                return malloc_default(size);
-        }
-        assert(tsd_fast(tsd));
-
-        tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC,
-            /* slow */ false, /* is_alloc */ true);
-        cache_bin_t *bin = &tcache->bins[ind];
-        bool tcache_success;
-        void *ret;
-
-        /*
-         * We split up the code this way so that redundant low-water
-         * computation doesn't happen on the (more common) case in which we
-         * don't touch the low water mark.  The compiler won't do this
-         * duplication on its own.
-         */
-        ret = cache_bin_alloc_easy(bin, &tcache_success);
-        if (tcache_success) {
-                fastpath_success_finish(tsd, allocated_after, bin, ret);
-                return ret;
-        }
-        ret = cache_bin_alloc(bin, &tcache_success);
-        if (tcache_success) {
-                fastpath_success_finish(tsd, allocated_after, bin, ret);
-                return ret;
-        }
-
-        return malloc_default(size);
+        return imalloc_fastpath(size, &malloc_default);
 }
 
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
src/jemalloc_cpp.cpp

@@ -86,10 +86,10 @@ handleOOM(std::size_t size, bool nothrow) {
 }
 
 template <bool IsNoExcept>
-JEMALLOC_ALWAYS_INLINE
-void *
-newImpl(std::size_t size) noexcept(IsNoExcept) {
-        void *ptr = je_malloc(size);
+JEMALLOC_NOINLINE
+static void *
+fallback_impl(std::size_t size) noexcept(IsNoExcept) {
+        void *ptr = malloc_default(size);
         if (likely(ptr != nullptr)) {
                 return ptr;
         }
@@ -97,6 +97,13 @@ newImpl(std::size_t size) noexcept(IsNoExcept) {
         return handleOOM(size, IsNoExcept);
 }
 
+template <bool IsNoExcept>
+JEMALLOC_ALWAYS_INLINE
+void *
+newImpl(std::size_t size) noexcept(IsNoExcept) {
+        return imalloc_fastpath(size, &fallback_impl<IsNoExcept>);
+}
+
 void *
 operator new(std::size_t size) {
         return newImpl<false>(size);
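From the application's point of view nothing changes except performance: a plain new expression now inlines the tcache hit path via newImpl -> imalloc_fastpath, and only a cache miss, a pending event, or OOM handling leaves the inlined code through fallback_impl. A small usage sketch follows; the comments describe the intended path under this commit, and the nothrow route through newImpl<true> is an assumption based on the surrounding file rather than something shown in this hunk:

#include <cstdio>
#include <new>

struct Widget {
        int id;
        explicit Widget(int i) : id(i) {}
};

int main() {
        // Throwing form: operator new(std::size_t) -> newImpl<false> ->
        // imalloc_fastpath; a tcache hit never calls out of line.
        Widget *w = new Widget(1);

        // Nothrow form: assumed to reach newImpl<true>, whose fallback maps
        // failure to nullptr instead of throwing std::bad_alloc.
        Widget *maybe = new (std::nothrow) Widget(2);
        if (maybe == nullptr) {
                std::puts("allocation failed");
        }

        delete w;
        delete maybe;
        return 0;
}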