2010-01-17 01:53:50 +08:00
|
|
|
#define JEMALLOC_CHUNK_C_
|
2010-02-12 06:45:59 +08:00
|
|
|
#include "jemalloc/internal/jemalloc_internal.h"
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
/* Data. */
|
|
|
|
|
2012-10-12 04:53:15 +08:00
|
|
|
const char *opt_dss = DSS_DEFAULT;
|
|
|
|
size_t opt_lg_chunk = LG_CHUNK_DEFAULT;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-01-28 05:10:55 +08:00
|
|
|
malloc_mutex_t chunks_mtx;
|
2010-01-17 01:53:50 +08:00
|
|
|
chunk_stats_t stats_chunks;
|
|
|
|
|
2012-04-13 11:20:58 +08:00
|
|
|
/*
|
|
|
|
* Trees of chunks that were previously allocated (trees differ only in node
|
|
|
|
* ordering). These are used when allocating chunks, in an attempt to re-use
|
|
|
|
* address space. Depending on function, different tree orderings are needed,
|
|
|
|
* which is why there are two trees with the same contents.
|
|
|
|
*/
|
2012-10-12 04:53:15 +08:00
|
|
|
static extent_tree_t chunks_szad_mmap;
|
|
|
|
static extent_tree_t chunks_ad_mmap;
|
|
|
|
static extent_tree_t chunks_szad_dss;
|
|
|
|
static extent_tree_t chunks_ad_dss;
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2010-09-06 01:35:13 +08:00
|
|
|
rtree_t *chunks_rtree;
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
/* Various chunk-related settings. */
|
|
|
|
size_t chunksize;
|
|
|
|
size_t chunksize_mask; /* (chunksize - 1). */
|
|
|
|
size_t chunk_npages;
|
|
|
|
|
|
|
|
/******************************************************************************/
|
2014-05-16 13:22:27 +08:00
|
|
|
/*
|
|
|
|
* Function prototypes for static functions that are referenced prior to
|
|
|
|
* definition.
|
|
|
|
*/
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
/*
 * Declared early because chunk_recycle() and chunk_alloc_base() call it on
 * their failure paths to give back a chunk they can no longer use.
 */
static void	chunk_dalloc_core(void *chunk, size_t size);
|
2012-04-13 11:20:58 +08:00
|
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
|
|
|
|
static void *
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad,
|
|
|
|
void *new_addr, size_t size, size_t alignment, bool base, bool *zero)
|
2012-04-13 11:20:58 +08:00
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
extent_node_t *node;
|
|
|
|
extent_node_t key;
|
|
|
|
size_t alloc_size, leadsize, trailsize;
|
2012-10-09 08:56:11 +08:00
|
|
|
bool zeroed;
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2012-05-03 11:41:42 +08:00
|
|
|
if (base) {
|
|
|
|
/*
|
|
|
|
* This function may need to call base_node_{,de}alloc(), but
|
|
|
|
* the current chunk allocation request is on behalf of the
|
|
|
|
* base allocator. Avoid deadlock (and if that weren't an
|
|
|
|
* issue, potential for infinite recursion) by returning NULL.
|
|
|
|
*/
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2012-04-13 11:20:58 +08:00
|
|
|
alloc_size = size + alignment - chunksize;
|
|
|
|
/* Beware size_t wrap-around. */
|
|
|
|
if (alloc_size < size)
|
|
|
|
return (NULL);
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
key.addr = new_addr;
|
2012-04-13 11:20:58 +08:00
|
|
|
key.size = alloc_size;
|
|
|
|
malloc_mutex_lock(&chunks_mtx);
|
2012-10-12 04:53:15 +08:00
|
|
|
node = extent_tree_szad_nsearch(chunks_szad, &key);
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
if (node == NULL || (new_addr && node->addr != new_addr)) {
|
2012-04-13 11:20:58 +08:00
|
|
|
malloc_mutex_unlock(&chunks_mtx);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
leadsize = ALIGNMENT_CEILING((uintptr_t)node->addr, alignment) -
|
|
|
|
(uintptr_t)node->addr;
|
2012-05-10 05:48:35 +08:00
|
|
|
assert(node->size >= leadsize + size);
|
|
|
|
trailsize = node->size - leadsize - size;
|
2012-04-13 11:20:58 +08:00
|
|
|
ret = (void *)((uintptr_t)node->addr + leadsize);
|
2013-01-22 11:56:34 +08:00
|
|
|
zeroed = node->zeroed;
|
|
|
|
if (zeroed)
|
|
|
|
*zero = true;
|
2012-04-13 11:20:58 +08:00
|
|
|
/* Remove node from the tree. */
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_remove(chunks_szad, node);
|
|
|
|
extent_tree_ad_remove(chunks_ad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
if (leadsize != 0) {
|
|
|
|
/* Insert the leading space as a smaller chunk. */
|
|
|
|
node->size = leadsize;
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_insert(chunks_szad, node);
|
|
|
|
extent_tree_ad_insert(chunks_ad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
node = NULL;
|
|
|
|
}
|
|
|
|
if (trailsize != 0) {
|
|
|
|
/* Insert the trailing space as a smaller chunk. */
|
|
|
|
if (node == NULL) {
|
|
|
|
/*
|
|
|
|
* An additional node is required, but
|
|
|
|
* base_node_alloc() can cause a new base chunk to be
|
|
|
|
* allocated. Drop chunks_mtx in order to avoid
|
|
|
|
* deadlock, and if node allocation fails, deallocate
|
|
|
|
* the result before returning an error.
|
|
|
|
*/
|
|
|
|
malloc_mutex_unlock(&chunks_mtx);
|
|
|
|
node = base_node_alloc();
|
|
|
|
if (node == NULL) {
|
2014-05-16 13:22:27 +08:00
|
|
|
chunk_dalloc_core(ret, size);
|
2012-04-13 11:20:58 +08:00
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
malloc_mutex_lock(&chunks_mtx);
|
|
|
|
}
|
|
|
|
node->addr = (void *)((uintptr_t)(ret) + size);
|
|
|
|
node->size = trailsize;
|
2013-02-01 08:53:58 +08:00
|
|
|
node->zeroed = zeroed;
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_insert(chunks_szad, node);
|
|
|
|
extent_tree_ad_insert(chunks_ad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
node = NULL;
|
|
|
|
}
|
|
|
|
malloc_mutex_unlock(&chunks_mtx);
|
|
|
|
|
2013-01-22 11:56:34 +08:00
|
|
|
if (node != NULL)
|
2014-05-16 13:22:27 +08:00
|
|
|
base_node_dalloc(node);
|
2013-01-22 11:56:34 +08:00
|
|
|
if (*zero) {
|
2014-10-04 01:16:09 +08:00
|
|
|
if (!zeroed)
|
2013-01-22 11:56:34 +08:00
|
|
|
memset(ret, 0, size);
|
|
|
|
else if (config_debug) {
|
|
|
|
size_t i;
|
|
|
|
size_t *p = (size_t *)(uintptr_t)ret;
|
|
|
|
|
2014-04-16 07:35:08 +08:00
|
|
|
JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, size);
|
2013-01-22 11:56:34 +08:00
|
|
|
for (i = 0; i < size / sizeof(size_t); i++)
|
|
|
|
assert(p[i] == 0);
|
|
|
|
}
|
|
|
|
}
|
2012-04-13 11:20:58 +08:00
|
|
|
return (ret);
|
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-01-25 09:13:07 +08:00
|
|
|
/*
|
2014-10-04 01:16:09 +08:00
|
|
|
* If the caller specifies (!*zero), it is still possible to receive zeroed
|
|
|
|
* memory, in which case *zero is toggled to true. arena_chunk_alloc() takes
|
|
|
|
* advantage of this to avoid demanding zeroed chunks, but taking advantage of
|
|
|
|
* them if they are returned.
|
2010-01-25 09:13:07 +08:00
|
|
|
*/
|
2014-05-06 06:16:56 +08:00
|
|
|
static void *
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base,
|
|
|
|
bool *zero, dss_prec_t dss_prec)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
assert(size != 0);
|
|
|
|
assert((size & chunksize_mask) == 0);
|
2012-05-10 04:05:04 +08:00
|
|
|
assert(alignment != 0);
|
2012-04-11 01:50:33 +08:00
|
|
|
assert((alignment & chunksize_mask) == 0);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2012-10-12 04:53:15 +08:00
|
|
|
/* "primary" dss. */
|
2014-04-16 03:09:48 +08:00
|
|
|
if (have_dss && dss_prec == dss_prec_primary) {
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss,
|
|
|
|
new_addr, size, alignment, base, zero)) != NULL)
|
2014-05-06 06:16:56 +08:00
|
|
|
return (ret);
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
/* requesting an address only implemented for recycle */
|
|
|
|
if (new_addr == NULL
|
|
|
|
&& (ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
|
2014-05-06 06:16:56 +08:00
|
|
|
return (ret);
|
2012-10-17 13:06:56 +08:00
|
|
|
}
|
2012-10-12 04:53:15 +08:00
|
|
|
/* mmap. */
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr,
|
|
|
|
size, alignment, base, zero)) != NULL)
|
2014-05-06 06:16:56 +08:00
|
|
|
return (ret);
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
/* requesting an address only implemented for recycle */
|
|
|
|
if (new_addr == NULL &&
|
|
|
|
(ret = chunk_alloc_mmap(size, alignment, zero)) != NULL)
|
2014-05-06 06:16:56 +08:00
|
|
|
return (ret);
|
2012-10-12 04:53:15 +08:00
|
|
|
/* "secondary" dss. */
|
2014-04-16 03:09:48 +08:00
|
|
|
if (have_dss && dss_prec == dss_prec_secondary) {
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss,
|
|
|
|
new_addr, size, alignment, base, zero)) != NULL)
|
2014-05-06 06:16:56 +08:00
|
|
|
return (ret);
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
/* requesting an address only implemented for recycle */
|
|
|
|
if (new_addr == NULL &&
|
|
|
|
(ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
|
2014-05-06 06:16:56 +08:00
|
|
|
return (ret);
|
2012-10-17 13:06:56 +08:00
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
/* All strategies for allocation failed. */
|
2014-05-06 06:16:56 +08:00
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
static bool
|
|
|
|
chunk_register(void *chunk, size_t size, bool base)
|
2014-05-06 06:16:56 +08:00
|
|
|
{
|
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
assert(chunk != NULL);
|
|
|
|
assert(CHUNK_ADDR2BASE(chunk) == chunk);
|
|
|
|
|
2014-10-04 01:16:09 +08:00
|
|
|
if (config_ivsalloc && !base) {
|
2014-05-16 13:22:27 +08:00
|
|
|
if (rtree_set(chunks_rtree, (uintptr_t)chunk, 1))
|
|
|
|
return (true);
|
|
|
|
}
|
|
|
|
if (config_stats || config_prof) {
|
|
|
|
bool gdump;
|
|
|
|
malloc_mutex_lock(&chunks_mtx);
|
|
|
|
if (config_stats)
|
|
|
|
stats_chunks.nchunks += (size / chunksize);
|
|
|
|
stats_chunks.curchunks += (size / chunksize);
|
|
|
|
if (stats_chunks.curchunks > stats_chunks.highchunks) {
|
|
|
|
stats_chunks.highchunks =
|
|
|
|
stats_chunks.curchunks;
|
|
|
|
if (config_prof)
|
|
|
|
gdump = true;
|
|
|
|
} else if (config_prof)
|
|
|
|
gdump = false;
|
|
|
|
malloc_mutex_unlock(&chunks_mtx);
|
|
|
|
if (config_prof && opt_prof && opt_prof_gdump && gdump)
|
|
|
|
prof_gdump();
|
|
|
|
}
|
|
|
|
if (config_valgrind)
|
|
|
|
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(chunk, size);
|
|
|
|
return (false);
|
2014-05-06 06:16:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
2014-05-16 13:22:27 +08:00
|
|
|
chunk_alloc_base(size_t size)
|
2014-05-06 06:16:56 +08:00
|
|
|
{
|
|
|
|
void *ret;
|
2014-05-16 13:22:27 +08:00
|
|
|
bool zero;
|
2014-05-06 06:16:56 +08:00
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
zero = false;
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
ret = chunk_alloc_core(NULL, size, chunksize, true, &zero,
|
2014-05-16 13:22:27 +08:00
|
|
|
chunk_dss_prec_get());
|
|
|
|
if (ret == NULL)
|
|
|
|
return (NULL);
|
|
|
|
if (chunk_register(ret, size, true)) {
|
|
|
|
chunk_dalloc_core(ret, size);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
return (ret);
|
|
|
|
}
|
2014-05-06 06:16:56 +08:00
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
void *
|
|
|
|
chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc,
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
unsigned arena_ind, void *new_addr, size_t size, size_t alignment,
|
|
|
|
bool *zero)
|
2014-05-16 13:22:27 +08:00
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
ret = chunk_alloc(new_addr, size, alignment, zero, arena_ind);
|
2014-05-16 13:22:27 +08:00
|
|
|
if (ret != NULL && chunk_register(ret, size, false)) {
|
|
|
|
chunk_dalloc(ret, size, arena_ind);
|
|
|
|
ret = NULL;
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
2014-05-16 13:22:27 +08:00
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
/* Default arena chunk allocation routine in the absence of user override. */
|
|
|
|
void *
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero,
|
2014-05-16 13:22:27 +08:00
|
|
|
unsigned arena_ind)
|
|
|
|
{
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
arena_t *arena;
|
|
|
|
|
|
|
|
arena = arena_get(tsd_fetch(), arena_ind, false, true);
|
|
|
|
/*
|
|
|
|
* The arena we're allocating on behalf of must have been initialized
|
|
|
|
* already.
|
|
|
|
*/
|
|
|
|
assert(arena != NULL);
|
2014-05-16 13:22:27 +08:00
|
|
|
|
Attempt to expand huge allocations in-place.
This adds support for expanding huge allocations in-place by requesting
memory at a specific address from the chunk allocator.
It's currently only implemented for the chunk recycling path, although
in theory it could also be done by optimistically allocating new chunks.
On Linux, it could attempt an in-place mremap. However, that won't work
in practice since the heap is grown downwards and memory is not unmapped
(in a normal build, at least).
Repeated vector reallocation micro-benchmark:
#include <string.h>
#include <stdlib.h>
int main(void) {
for (size_t i = 0; i < 100; i++) {
void *ptr = NULL;
size_t old_size = 0;
for (size_t size = 4; size < (1 << 30); size *= 2) {
ptr = realloc(ptr, size);
if (!ptr) return 1;
memset(ptr + old_size, 0xff, size - old_size);
old_size = size;
}
free(ptr);
}
}
The glibc allocator fails to do any in-place reallocations on this
benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it
elides the cost of copies via mremap, which is currently not something
that jemalloc can use.
With this improvement, jemalloc still fails to do any in-place huge
reallocations for the first outer loop, but then succeeds 100% of the
time for the remaining 99 iterations. The time spent doing allocations
and copies drops down to under 5%, with nearly all of it spent doing
purging + faulting (when huge pages are disabled) and the array memset.
An improved mremap API (MREMAP_RETAIN - #138) would be far more general
but this is a portable optimization and would still be useful on Linux
for xallocx.
Numbers with transparent huge pages enabled:
glibc (copies elided via MREMAP_MAYMOVE): 8.471s
jemalloc: 17.816s
jemalloc + no-op madvise: 13.236s
jemalloc + this commit: 6.787s
jemalloc + this commit + no-op madvise: 6.144s
Numbers with transparent huge pages disabled:
glibc (copies elided via MREMAP_MAYMOVE): 15.403s
jemalloc: 39.456s
jemalloc + no-op madvise: 12.768s
jemalloc + this commit: 15.534s
jemalloc + this commit + no-op madvise: 6.354s
Closes #137
2014-10-04 13:39:32 +08:00
|
|
|
return (chunk_alloc_core(new_addr, size, alignment, false, zero,
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
arena->dss_prec));
|
2014-05-16 13:22:27 +08:00
|
|
|
}
|
|
|
|
|
2012-04-13 11:20:58 +08:00
|
|
|
static void
|
2012-10-12 04:53:15 +08:00
|
|
|
chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
|
|
|
|
size_t size)
|
2012-04-13 11:20:58 +08:00
|
|
|
{
|
2012-10-09 08:56:11 +08:00
|
|
|
bool unzeroed;
|
2013-04-23 13:36:18 +08:00
|
|
|
extent_node_t *xnode, *node, *prev, *xprev, key;
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2012-10-09 08:56:11 +08:00
|
|
|
unzeroed = pages_purge(chunk, size);
|
2014-04-16 07:35:08 +08:00
|
|
|
JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size);
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2012-05-10 05:48:35 +08:00
|
|
|
/*
|
|
|
|
* Allocate a node before acquiring chunks_mtx even though it might not
|
|
|
|
* be needed, because base_node_alloc() may cause a new base chunk to
|
|
|
|
* be allocated, which could cause deadlock if chunks_mtx were already
|
|
|
|
* held.
|
|
|
|
*/
|
|
|
|
xnode = base_node_alloc();
|
2013-04-23 13:36:18 +08:00
|
|
|
/* Use xprev to implement conditional deferred deallocation of prev. */
|
|
|
|
xprev = NULL;
|
2012-05-10 05:48:35 +08:00
|
|
|
|
2012-04-13 11:20:58 +08:00
|
|
|
malloc_mutex_lock(&chunks_mtx);
|
2012-05-10 05:48:35 +08:00
|
|
|
key.addr = (void *)((uintptr_t)chunk + size);
|
2012-10-12 04:53:15 +08:00
|
|
|
node = extent_tree_ad_nsearch(chunks_ad, &key);
|
2012-05-10 05:48:35 +08:00
|
|
|
/* Try to coalesce forward. */
|
|
|
|
if (node != NULL && node->addr == key.addr) {
|
|
|
|
/*
|
|
|
|
* Coalesce chunk with the following address range. This does
|
|
|
|
* not change the position within chunks_ad, so only
|
|
|
|
* remove/insert from/into chunks_szad.
|
|
|
|
*/
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_remove(chunks_szad, node);
|
2012-05-10 05:48:35 +08:00
|
|
|
node->addr = chunk;
|
|
|
|
node->size += size;
|
2014-10-04 01:16:09 +08:00
|
|
|
node->zeroed = (node->zeroed && !unzeroed);
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_insert(chunks_szad, node);
|
2012-05-10 05:48:35 +08:00
|
|
|
} else {
|
|
|
|
/* Coalescing forward failed, so insert a new node. */
|
|
|
|
if (xnode == NULL) {
|
2012-04-13 11:20:58 +08:00
|
|
|
/*
|
2012-05-10 05:48:35 +08:00
|
|
|
* base_node_alloc() failed, which is an exceedingly
|
|
|
|
* unlikely failure. Leak chunk; its pages have
|
|
|
|
* already been purged, so this is only a virtual
|
|
|
|
* memory leak.
|
2012-04-13 11:20:58 +08:00
|
|
|
*/
|
2013-04-18 00:57:11 +08:00
|
|
|
goto label_return;
|
2012-04-13 11:20:58 +08:00
|
|
|
}
|
2012-05-10 05:48:35 +08:00
|
|
|
node = xnode;
|
2013-04-18 00:57:11 +08:00
|
|
|
xnode = NULL; /* Prevent deallocation below. */
|
2012-05-10 05:48:35 +08:00
|
|
|
node->addr = chunk;
|
|
|
|
node->size = size;
|
2014-10-04 01:16:09 +08:00
|
|
|
node->zeroed = !unzeroed;
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_ad_insert(chunks_ad, node);
|
|
|
|
extent_tree_szad_insert(chunks_szad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Try to coalesce backward. */
|
2012-10-12 04:53:15 +08:00
|
|
|
prev = extent_tree_ad_prev(chunks_ad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
if (prev != NULL && (void *)((uintptr_t)prev->addr + prev->size) ==
|
|
|
|
chunk) {
|
|
|
|
/*
|
|
|
|
* Coalesce chunk with the previous address range. This does
|
|
|
|
* not change the position within chunks_ad, so only
|
|
|
|
* remove/insert node from/into chunks_szad.
|
|
|
|
*/
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_remove(chunks_szad, prev);
|
|
|
|
extent_tree_ad_remove(chunks_ad, prev);
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_remove(chunks_szad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
node->addr = prev->addr;
|
|
|
|
node->size += prev->size;
|
2012-10-09 08:56:11 +08:00
|
|
|
node->zeroed = (node->zeroed && prev->zeroed);
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_insert(chunks_szad, node);
|
2012-04-13 11:20:58 +08:00
|
|
|
|
2013-04-23 13:36:18 +08:00
|
|
|
xprev = prev;
|
2012-04-13 11:20:58 +08:00
|
|
|
}
|
2013-04-18 00:57:11 +08:00
|
|
|
|
|
|
|
label_return:
|
2012-04-13 11:20:58 +08:00
|
|
|
malloc_mutex_unlock(&chunks_mtx);
|
2013-04-23 13:36:18 +08:00
|
|
|
/*
|
|
|
|
* Deallocate xnode and/or xprev after unlocking chunks_mtx in order to
|
|
|
|
* avoid potential deadlock.
|
|
|
|
*/
|
|
|
|
if (xnode != NULL)
|
2014-05-16 13:22:27 +08:00
|
|
|
base_node_dalloc(xnode);
|
2013-04-23 13:36:18 +08:00
|
|
|
if (xprev != NULL)
|
2014-05-16 13:22:27 +08:00
|
|
|
base_node_dalloc(xprev);
|
2012-04-13 11:20:58 +08:00
|
|
|
}
|
|
|
|
|
2012-10-12 04:53:15 +08:00
|
|
|
void
|
|
|
|
chunk_unmap(void *chunk, size_t size)
|
|
|
|
{
|
|
|
|
assert(chunk != NULL);
|
|
|
|
assert(CHUNK_ADDR2BASE(chunk) == chunk);
|
|
|
|
assert(size != 0);
|
|
|
|
assert((size & chunksize_mask) == 0);
|
|
|
|
|
2014-04-16 03:09:48 +08:00
|
|
|
if (have_dss && chunk_in_dss(chunk))
|
2012-10-12 04:53:15 +08:00
|
|
|
chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size);
|
2014-05-16 13:22:27 +08:00
|
|
|
else if (chunk_dalloc_mmap(chunk, size))
|
2012-10-12 04:53:15 +08:00
|
|
|
chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size);
|
|
|
|
}
|
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
static void
|
|
|
|
chunk_dalloc_core(void *chunk, size_t size)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
|
|
|
|
|
|
|
assert(chunk != NULL);
|
|
|
|
assert(CHUNK_ADDR2BASE(chunk) == chunk);
|
|
|
|
assert(size != 0);
|
|
|
|
assert((size & chunksize_mask) == 0);
|
|
|
|
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_ivsalloc)
|
2014-01-03 09:36:38 +08:00
|
|
|
rtree_set(chunks_rtree, (uintptr_t)chunk, 0);
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_stats || config_prof) {
|
|
|
|
malloc_mutex_lock(&chunks_mtx);
|
2012-10-12 04:53:15 +08:00
|
|
|
assert(stats_chunks.curchunks >= (size / chunksize));
|
2012-02-11 12:22:09 +08:00
|
|
|
stats_chunks.curchunks -= (size / chunksize);
|
|
|
|
malloc_mutex_unlock(&chunks_mtx);
|
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2014-05-16 13:22:27 +08:00
|
|
|
chunk_unmap(chunk, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Default arena chunk deallocation routine in the absence of user override.
 *
 * Forwards to chunk_dalloc_core(); arena_ind is not consulted.  Always
 * returns false.
 */
bool
chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind)
{

	chunk_dalloc_core(chunk, size);

	return (false);
}
|
|
|
|
|
|
|
|
bool
|
2012-04-22 10:17:21 +08:00
|
|
|
chunk_boot(void)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
|
|
|
|
|
|
|
/* Set variables according to the value of opt_lg_chunk. */
|
2010-09-06 01:35:13 +08:00
|
|
|
chunksize = (ZU(1) << opt_lg_chunk);
|
2012-04-02 22:04:34 +08:00
|
|
|
assert(chunksize >= PAGE);
|
2010-01-17 01:53:50 +08:00
|
|
|
chunksize_mask = chunksize - 1;
|
2012-04-02 22:04:34 +08:00
|
|
|
chunk_npages = (chunksize >> LG_PAGE);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2014-10-17 03:33:18 +08:00
|
|
|
if (malloc_mutex_init(&chunks_mtx))
|
|
|
|
return (true);
|
|
|
|
if (config_stats || config_prof)
|
2012-02-11 12:22:09 +08:00
|
|
|
memset(&stats_chunks, 0, sizeof(chunk_stats_t));
|
2014-04-16 03:09:48 +08:00
|
|
|
if (have_dss && chunk_dss_boot())
|
2010-01-17 01:53:50 +08:00
|
|
|
return (true);
|
2012-10-12 04:53:15 +08:00
|
|
|
extent_tree_szad_new(&chunks_szad_mmap);
|
|
|
|
extent_tree_ad_new(&chunks_ad_mmap);
|
|
|
|
extent_tree_szad_new(&chunks_szad_dss);
|
|
|
|
extent_tree_ad_new(&chunks_ad_dss);
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_ivsalloc) {
|
|
|
|
chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) -
|
2014-01-03 08:08:28 +08:00
|
|
|
opt_lg_chunk, base_alloc, NULL);
|
2012-02-11 12:22:09 +08:00
|
|
|
if (chunks_rtree == NULL)
|
|
|
|
return (true);
|
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
return (false);
|
|
|
|
}
|
2012-10-10 05:46:22 +08:00
|
|
|
|
|
|
|
void
|
|
|
|
chunk_prefork(void)
|
|
|
|
{
|
|
|
|
|
2013-10-22 05:59:10 +08:00
|
|
|
malloc_mutex_prefork(&chunks_mtx);
|
2012-10-10 05:46:22 +08:00
|
|
|
if (config_ivsalloc)
|
|
|
|
rtree_prefork(chunks_rtree);
|
|
|
|
chunk_dss_prefork();
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
chunk_postfork_parent(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
chunk_dss_postfork_parent();
|
|
|
|
if (config_ivsalloc)
|
|
|
|
rtree_postfork_parent(chunks_rtree);
|
|
|
|
malloc_mutex_postfork_parent(&chunks_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
chunk_postfork_child(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
chunk_dss_postfork_child();
|
|
|
|
if (config_ivsalloc)
|
|
|
|
rtree_postfork_child(chunks_rtree);
|
|
|
|
malloc_mutex_postfork_child(&chunks_mtx);
|
|
|
|
}
|