server-skynet-source-3rd-je.../src/huge.c

451 lines
12 KiB
C
Raw Normal View History

#define JEMALLOC_HUGE_C_
2010-02-12 06:45:59 +08:00
#include "jemalloc/internal/jemalloc_internal.h"
/******************************************************************************/
static extent_node_t *
huge_node_get(const void *ptr)
{
extent_node_t *node;
node = chunk_lookup(ptr, true);
assert(!extent_node_achunk_get(node));
return (node);
}
static bool
huge_node_set(tsd_t *tsd, const void *ptr, extent_node_t *node)
{
assert(extent_node_addr_get(node) == ptr);
assert(!extent_node_achunk_get(node));
return (chunk_register(tsd, ptr, node));
}
static void
huge_node_unset(const void *ptr, const extent_node_t *node)
{
chunk_deregister(ptr, node);
}
void *
huge_malloc(tsd_t *tsd, arena_t *arena, size_t usize, bool zero)
{
assert(usize == s2u(usize));
return (huge_palloc(tsd, arena, usize, chunksize, zero));
}
void *
huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
bool zero)
{
void *ret;
size_t ausize;
extent_node_t *node;
bool is_zeroed;
/* Allocate one or more contiguous chunks for this request. */
ausize = sa2u(usize, alignment);
if (unlikely(ausize == 0 || ausize > HUGE_MAXCLASS))
return (NULL);
assert(ausize >= chunksize);
/* Allocate an extent node with which to track the chunk. */
assert(tsd != NULL || arena != NULL);
node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)),
CACHELINE, false, NULL, true, arena_ichoose(tsd, arena));
if (node == NULL)
return (NULL);
/*
* Copy zero into is_zeroed and pass the copy to chunk_alloc(), so that
* it is possible to make correct junk/zero fill decisions below.
*/
is_zeroed = zero;
arena = arena_choose(tsd, arena);
if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(tsd, arena,
usize, alignment, &is_zeroed)) == NULL) {
idalloctm(tsd, node, NULL, true, true);
return (NULL);
}
extent_node_init(node, arena, ret, usize, is_zeroed, true);
if (huge_node_set(tsd, ret, node)) {
arena_chunk_dalloc_huge(tsd, arena, ret, usize);
idalloctm(tsd, node, NULL, true, true);
return (NULL);
}
/* Insert node into huge. */
malloc_mutex_lock(tsd, &arena->huge_mtx);
ql_elm_new(node, ql_link);
ql_tail_insert(&arena->huge, node, ql_link);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
if (zero || (config_fill && unlikely(opt_zero))) {
if (!is_zeroed)
memset(ret, 0, usize);
} else if (config_fill && unlikely(opt_junk_alloc))
memset(ret, JEMALLOC_ALLOC_JUNK, usize);
arena_decay_tick(tsd, arena);
return (ret);
}
#ifdef JEMALLOC_JET
#undef huge_dalloc_junk
#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl)
#endif
static void
huge_dalloc_junk(tsd_t *tsd, void *ptr, size_t usize)
{
if (config_fill && have_dss && unlikely(opt_junk_free)) {
/*
* Only bother junk filling if the chunk isn't about to be
* unmapped.
*/
if (!config_munmap || (have_dss && chunk_in_dss(tsd, ptr)))
memset(ptr, JEMALLOC_FREE_JUNK, usize);
}
}
#ifdef JEMALLOC_JET
#undef huge_dalloc_junk
#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk)
huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl);
#endif
static void
huge_ralloc_no_move_similar(tsd_t *tsd, void *ptr, size_t oldsize,
size_t usize_min, size_t usize_max, bool zero)
{
size_t usize, usize_next;
extent_node_t *node;
arena_t *arena;
chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER;
bool pre_zeroed, post_zeroed;
/* Increase usize to incorporate extra. */
for (usize = usize_min; usize < usize_max && (usize_next = s2u(usize+1))
<= oldsize; usize = usize_next)
; /* Do nothing. */
2014-10-15 13:20:00 +08:00
if (oldsize == usize)
return;
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
pre_zeroed = extent_node_zeroed_get(node);
/* Fill if necessary (shrinking). */
if (oldsize > usize) {
size_t sdiff = oldsize - usize;
if (config_fill && unlikely(opt_junk_free)) {
memset((void *)((uintptr_t)ptr + usize),
JEMALLOC_FREE_JUNK, sdiff);
post_zeroed = false;
} else {
post_zeroed = !chunk_purge_wrapper(tsd, arena,
&chunk_hooks, ptr, CHUNK_CEILING(oldsize), usize,
sdiff);
}
} else
post_zeroed = pre_zeroed;
malloc_mutex_lock(tsd, &arena->huge_mtx);
2014-10-15 13:20:00 +08:00
/* Update the size of the huge allocation. */
assert(extent_node_size_get(node) != usize);
extent_node_size_set(node, usize);
/* Update zeroed. */
extent_node_zeroed_set(node, post_zeroed);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
arena_chunk_ralloc_huge_similar(tsd, arena, ptr, oldsize, usize);
2014-10-15 13:20:00 +08:00
/* Fill if necessary (growing). */
if (oldsize < usize) {
if (zero || (config_fill && unlikely(opt_zero))) {
if (!pre_zeroed) {
memset((void *)((uintptr_t)ptr + oldsize), 0,
usize - oldsize);
}
} else if (config_fill && unlikely(opt_junk_alloc)) {
memset((void *)((uintptr_t)ptr + oldsize),
JEMALLOC_ALLOC_JUNK, usize - oldsize);
}
2014-10-15 13:20:00 +08:00
}
}
static bool
huge_ralloc_no_move_shrink(tsd_t *tsd, void *ptr, size_t oldsize, size_t usize)
{
extent_node_t *node;
arena_t *arena;
chunk_hooks_t chunk_hooks;
size_t cdiff;
bool pre_zeroed, post_zeroed;
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
pre_zeroed = extent_node_zeroed_get(node);
chunk_hooks = chunk_hooks_get(tsd, arena);
assert(oldsize > usize);
/* Split excess chunks. */
cdiff = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize);
if (cdiff != 0 && chunk_hooks.split(ptr, CHUNK_CEILING(oldsize),
CHUNK_CEILING(usize), cdiff, true, arena->ind))
return (true);
if (oldsize > usize) {
size_t sdiff = oldsize - usize;
if (config_fill && unlikely(opt_junk_free)) {
huge_dalloc_junk(tsd, (void *)((uintptr_t)ptr + usize),
sdiff);
post_zeroed = false;
} else {
post_zeroed = !chunk_purge_wrapper(tsd, arena,
&chunk_hooks, CHUNK_ADDR2BASE((uintptr_t)ptr +
usize), CHUNK_CEILING(oldsize),
CHUNK_ADDR2OFFSET((uintptr_t)ptr + usize), sdiff);
}
} else
post_zeroed = pre_zeroed;
malloc_mutex_lock(tsd, &arena->huge_mtx);
/* Update the size of the huge allocation. */
extent_node_size_set(node, usize);
/* Update zeroed. */
extent_node_zeroed_set(node, post_zeroed);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
/* Zap the excess chunks. */
arena_chunk_ralloc_huge_shrink(tsd, arena, ptr, oldsize, usize);
return (false);
}
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
static bool
huge_ralloc_no_move_expand(tsd_t *tsd, void *ptr, size_t oldsize, size_t usize,
bool zero) {
extent_node_t *node;
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
arena_t *arena;
bool is_zeroed_subchunk, is_zeroed_chunk;
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
malloc_mutex_lock(tsd, &arena->huge_mtx);
is_zeroed_subchunk = extent_node_zeroed_get(node);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
/*
* Copy zero into is_zeroed_chunk and pass the copy to chunk_alloc(), so
* that it is possible to make correct junk/zero fill decisions below.
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
*/
is_zeroed_chunk = zero;
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
if (arena_chunk_ralloc_huge_expand(tsd, arena, ptr, oldsize, usize,
&is_zeroed_chunk))
2014-10-15 13:20:00 +08:00
return (true);
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
malloc_mutex_lock(tsd, &arena->huge_mtx);
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
/* Update the size of the huge allocation. */
extent_node_size_set(node, usize);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
if (zero || (config_fill && unlikely(opt_zero))) {
if (!is_zeroed_subchunk) {
memset((void *)((uintptr_t)ptr + oldsize), 0,
CHUNK_CEILING(oldsize) - oldsize);
}
if (!is_zeroed_chunk) {
memset((void *)((uintptr_t)ptr +
CHUNK_CEILING(oldsize)), 0, usize -
CHUNK_CEILING(oldsize));
}
} else if (config_fill && unlikely(opt_junk_alloc)) {
memset((void *)((uintptr_t)ptr + oldsize), JEMALLOC_ALLOC_JUNK,
usize - oldsize);
}
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
return (false);
}
bool
huge_ralloc_no_move(tsd_t *tsd, void *ptr, size_t oldsize, size_t usize_min,
size_t usize_max, bool zero)
{
assert(s2u(oldsize) == oldsize);
/* The following should have been caught by callers. */
assert(usize_min > 0 && usize_max <= HUGE_MAXCLASS);
/* Both allocations must be huge to avoid a move. */
if (oldsize < chunksize || usize_max < chunksize)
return (true);
if (CHUNK_CEILING(usize_max) > CHUNK_CEILING(oldsize)) {
/* Attempt to expand the allocation in-place. */
if (!huge_ralloc_no_move_expand(tsd, ptr, oldsize, usize_max,
zero)) {
arena_decay_tick(tsd, huge_aalloc(ptr));
return (false);
}
/* Try again, this time with usize_min. */
if (usize_min < usize_max && CHUNK_CEILING(usize_min) >
CHUNK_CEILING(oldsize) && huge_ralloc_no_move_expand(tsd,
ptr, oldsize, usize_min, zero)) {
arena_decay_tick(tsd, huge_aalloc(ptr));
return (false);
}
}
/*
* Avoid moving the allocation if the existing chunk size accommodates
* the new size.
*/
if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize_min)
&& CHUNK_CEILING(oldsize) <= CHUNK_CEILING(usize_max)) {
huge_ralloc_no_move_similar(tsd, ptr, oldsize, usize_min,
usize_max, zero);
arena_decay_tick(tsd, huge_aalloc(ptr));
return (false);
}
/* Attempt to shrink the allocation in-place. */
if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(usize_max)) {
if (!huge_ralloc_no_move_shrink(tsd, ptr, oldsize, usize_max)) {
arena_decay_tick(tsd, huge_aalloc(ptr));
return (false);
}
}
return (true);
}
static void *
huge_ralloc_move_helper(tsd_t *tsd, arena_t *arena, size_t usize,
size_t alignment, bool zero)
{
Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137
2014-10-04 13:39:32 +08:00
if (alignment <= chunksize)
return (huge_malloc(tsd, arena, usize, zero));
return (huge_palloc(tsd, arena, usize, alignment, zero));
}
void *
huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t usize,
size_t alignment, bool zero, tcache_t *tcache)
{
void *ret;
size_t copysize;
/* The following should have been caught by callers. */
assert(usize > 0 && usize <= HUGE_MAXCLASS);
/* Try to avoid moving the allocation. */
if (!huge_ralloc_no_move(tsd, ptr, oldsize, usize, usize, zero))
return (ptr);
/*
* usize and oldsize are different enough that we need to use a
* different size class. In that case, fall back to allocating new
* space and copying.
*/
ret = huge_ralloc_move_helper(tsd, arena, usize, alignment, zero);
if (ret == NULL)
return (NULL);
copysize = (usize < oldsize) ? usize : oldsize;
memcpy(ret, ptr, copysize);
isqalloc(tsd, ptr, oldsize, tcache);
return (ret);
}
void
huge_dalloc(tsd_t *tsd, void *ptr)
{
extent_node_t *node;
arena_t *arena;
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
huge_node_unset(ptr, node);
malloc_mutex_lock(tsd, &arena->huge_mtx);
ql_remove(&arena->huge, node, ql_link);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
huge_dalloc_junk(tsd, extent_node_addr_get(node),
extent_node_size_get(node));
arena_chunk_dalloc_huge(tsd, extent_node_arena_get(node),
extent_node_addr_get(node), extent_node_size_get(node));
idalloctm(tsd, node, NULL, true, true);
arena_decay_tick(tsd, arena);
}
arena_t *
huge_aalloc(const void *ptr)
{
return (extent_node_arena_get(huge_node_get(ptr)));
}
size_t
huge_salloc(tsd_t *tsd, const void *ptr)
{
size_t size;
extent_node_t *node;
arena_t *arena;
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
malloc_mutex_lock(tsd, &arena->huge_mtx);
size = extent_node_size_get(node);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
return (size);
}
prof_tctx_t *
huge_prof_tctx_get(tsd_t *tsd, const void *ptr)
{
prof_tctx_t *tctx;
extent_node_t *node;
arena_t *arena;
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
malloc_mutex_lock(tsd, &arena->huge_mtx);
tctx = extent_node_prof_tctx_get(node);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
return (tctx);
}
void
huge_prof_tctx_set(tsd_t *tsd, const void *ptr, prof_tctx_t *tctx)
{
extent_node_t *node;
arena_t *arena;
node = huge_node_get(ptr);
arena = extent_node_arena_get(node);
malloc_mutex_lock(tsd, &arena->huge_mtx);
extent_node_prof_tctx_set(node, tctx);
malloc_mutex_unlock(tsd, &arena->huge_mtx);
}
void
huge_prof_tctx_reset(tsd_t *tsd, const void *ptr)
{
huge_prof_tctx_set(tsd, ptr, (prof_tctx_t *)(uintptr_t)1U);
}