malloc: Add a fastpath
This diff adds a fastpath that assumes size <= SC_LOOKUP_MAXCLASS, and that we hit tcache. If either of these is false, we fall back to the previous codepath (renamed 'malloc_default').

Crucially, we only tail-call malloc_default, and with the same kind and number of arguments, so that tail-call optimization kicks in under both clang and gcc. malloc() therefore gets treated as a leaf function, and there are *no* caller-saved registers to spill. Previously malloc() spilled 5 caller-saved registers on x64, resulting in at least 10 extra memory-movement instructions.

In microbenchmarks this gives up to a ~10% improvement in the malloc() fastpath. In real programs, it is roughly a 1% overall CPU and latency improvement.
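To illustrate the pattern the message relies on, here is a minimal standalone sketch (not jemalloc code; fast_alloc, slow_alloc, FAST_MAXCLASS and the one-slot cache are hypothetical stand-ins for je_malloc, malloc_default, SC_LOOKUP_MAXCLASS and the tcache bin):

    #include <stddef.h>
    #include <stdlib.h>

    #define FAST_MAXCLASS 4096	/* hypothetical small-size limit, in the role of SC_LOOKUP_MAXCLASS */

    /* Out-of-line slowpath, in the role of malloc_default(). */
    __attribute__((noinline))
    static void *slow_alloc(size_t size) {
    	return malloc(size);
    }

    /* Hypothetical one-slot thread-local cache, in the role of a tcache bin. */
    static __thread void *cached_block;
    static __thread size_t cached_capacity;

    void *fast_alloc(size_t size) {
    	/* Fastpath: small request and the cache can serve it. */
    	if (size <= FAST_MAXCLASS && cached_block != NULL && cached_capacity >= size) {
    		void *ret = cached_block;
    		cached_block = NULL;
    		return ret;	/* no calls made on this path: fast_alloc() stays a leaf */
    	}
    	/*
    	 * Tail call: same argument, same return type, in tail position, so
    	 * clang and gcc can emit a plain jump instead of a call.  Nothing is
    	 * live in fast_alloc() afterwards, so no registers need to be saved.
    	 */
    	return slow_alloc(size);
    }

Because the fallback is the final action and takes exactly the same argument, the compiler can lower `return slow_alloc(size);` to a jump; that is what lets the fastpath avoid any register saves and restores.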
commit 0f8313659e (parent 0ec656eb71)
@@ -2152,15 +2152,9 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
 		return imalloc_body(sopts, dopts, tsd);
 	}
 }
 
-/******************************************************************************/
-/*
- * Begin malloc(3)-compatible functions.
- */
-
-JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
-void JEMALLOC_NOTHROW *
-JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
-je_malloc(size_t size) {
+void *
+malloc_default(size_t size) {
 	void *ret;
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
@@ -2193,6 +2187,93 @@ je_malloc(size_t size) {
 	return ret;
 }
 
+/******************************************************************************/
+/*
+ * Begin malloc(3)-compatible functions.
+ */
+
+/*
+ * malloc() fastpath.
+ *
+ * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
+ * tcache. If either of these is false, we tail-call to the slowpath,
+ * malloc_default(). Tail-calling is used to avoid any caller-saved
+ * registers.
+ *
+ * fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ */
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
+je_malloc(size_t size) {
+	LOG("core.malloc.entry", "size: %zu", size);
+
+	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+		return malloc_default(size);
+	}
+
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely(!tsd || !tsd_fast(tsd) || (size > SC_LOOKUP_MAXCLASS))) {
+		return malloc_default(size);
+	}
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+
+	if (unlikely(ticker_trytick(&tcache->gc_ticker))) {
+		return malloc_default(size);
+	}
+
+	szind_t ind = sz_size2index_lookup(size);
+	size_t usize;
+	if (config_stats || config_prof) {
+		usize = sz_index2size(ind);
+	}
+	/* Fast path relies on size being a bin. I.e. SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS */
+	assert(ind < SC_NBINS);
+	assert(size <= SC_SMALL_MAXCLASS);
+
+	if (config_prof) {
+		int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd);
+		bytes_until_sample -= usize;
+		tsd_bytes_until_sample_set(tsd, bytes_until_sample);
+
+		if (unlikely(bytes_until_sample < 0)) {
+			/*
+			 * Avoid a prof_active check on the fastpath.
+			 * If prof_active is false, set bytes_until_sample to
+			 * a large value. If prof_active is set to true,
+			 * bytes_until_sample will be reset.
+			 */
+			if (!prof_active) {
+				tsd_bytes_until_sample_set(tsd, SSIZE_MAX);
+			}
+			return malloc_default(size);
+		}
+	}
+
+	cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
+	bool tcache_success;
+	void* ret = cache_bin_alloc_easy(bin, &tcache_success);
+
+	if (tcache_success) {
+		if (config_stats) {
+			*tsd_thread_allocatedp_get(tsd) += usize;
+			bin->tstats.nrequests++;
+		}
+		if (config_prof) {
+			tcache->prof_accumbytes += usize;
+		}
+
+		LOG("core.malloc.exit", "result: %p", ret);
+
+		/* Fastpath success */
+		return ret;
+	}
+
+	return malloc_default(size);
+}
+
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
 JEMALLOC_ATTR(nonnull(1))
 je_posix_memalign(void **memptr, size_t alignment, size_t size) {
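As a point of reference for the "~10% in microbenchmarks" figure in the commit message, a malloc fastpath microbenchmark is typically a tight loop of small, same-sized malloc/free pairs, which stays on the tcache fastpath on every iteration. A minimal sketch of that kind of harness (not the one used for the numbers in this commit) might look like:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    int main(void) {
    	const size_t size = 64;		/* small constant size, well under SC_LOOKUP_MAXCLASS */
    	const long iters = 100000000;

    	struct timespec t0, t1;
    	clock_gettime(CLOCK_MONOTONIC, &t0);
    	for (long i = 0; i < iters; i++) {
    		void *p = malloc(size);
    		if (p == NULL) {
    			return 1;
    		}
    		*(volatile char *)p = 0;	/* touch the block so the pair isn't optimized away */
    		free(p);
    	}
    	clock_gettime(CLOCK_MONOTONIC, &t1);

    	double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (double)(t1.tv_nsec - t0.tv_nsec);
    	printf("%.2f ns per malloc/free pair\n", ns / (double)iters);
    	return 0;
    }

Run against jemalloc (linked or LD_PRELOADed), every iteration goes through the new je_malloc() fastpath; the slowpath is only reached when the gc ticker fires or the cache bin runs empty.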