Refactor rtree to be lock-free.

Recent huge allocation refactoring associates huge allocations with
arenas, but it remains necessary to quickly look up huge allocation
metadata during reallocation/deallocation.  A global radix tree remains
a good solution to this problem, but locking would have become the
primary bottleneck after the (upcoming) migration of chunk management
from global to per-arena data structures.

This lock-free implementation uses double-checked reads to traverse the
tree, so that in the steady state, each read or write requires only a
single atomic operation.
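
Concretely, each traversal step pairs a racy plain read with an atomic
re-read, and only falls through to the CAS-based slow path (shown in
the rtree.c diff below) when the node is missing.  A minimal sketch of
the fast path; the real inline helpers live in rtree.h, which this
excerpt does not include, so the bodies here are paraphrased:

    /* Fast path: plain read, double-checked with an atomic read. */
    static inline rtree_node_elm_t *
    rtree_child_tryread(rtree_node_elm_t *elm)
    {
        rtree_node_elm_t *child;

        /* Double-checked read (the first, non-atomic read may be stale). */
        child = elm->child;
        if (child == NULL)
            child = atomic_read_p((void **)&elm->child);
        return (child);
    }

    static inline rtree_node_elm_t *
    rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level)
    {
        rtree_node_elm_t *child;

        child = rtree_child_tryread(elm);
        if (child == NULL) {
            /* Miss: create or await the node. */
            child = rtree_child_read_hard(rtree, elm, level);
        }
        return (child);
    }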

This implementation also ensures that no more than two tree levels
actually exist, through a combination of careful virtual memory
allocation that makes large sparse nodes cheap, and skipping the root
node on x64 (possible because the top 16 bits are all 0 in practice).
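
As a worked example with illustrative numbers (2 MiB chunks, i.e.
opt_lg_chunk = 21, and a hypothetical RTREE_BITS_PER_LEVEL of 16):
chunk_boot() below requests 64 - 21 = 43 key bits, for which rtree_new()
computes bits_in_leaf = 43 % 16 = 11 and height = 1 + (43 - 11) / 16 = 3.
The root level would cover address bits 63..48, which are all 0 for
practical x64 addresses, so every lookup starts at the second level and
only the bottom two levels are ever instantiated.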
Author: Jason Evans
Date:   2015-01-30 22:54:08 -08:00
Commit: 8d0e04d42f (parent: c810fcea1f)

7 changed files with 384 additions and 229 deletions

diff --git a/src/chunk.c b/src/chunk.c
--- a/src/chunk.c
+++ b/src/chunk.c

@@ -21,7 +21,7 @@ static extent_tree_t chunks_ad_mmap;
 static extent_tree_t chunks_szad_dss;
 static extent_tree_t chunks_ad_dss;
 
-rtree_t *chunks_rtree;
+rtree_t chunks_rtree;
 
 /* Various chunk-related settings. */
 size_t chunksize;
@@ -200,7 +200,7 @@ chunk_register(void *chunk, size_t size, bool base)
 	assert(CHUNK_ADDR2BASE(chunk) == chunk);
 
 	if (config_ivsalloc && !base) {
-		if (rtree_set(chunks_rtree, (uintptr_t)chunk, 1))
+		if (rtree_set(&chunks_rtree, (uintptr_t)chunk, chunk))
 			return (true);
 	}
 	if (config_stats || config_prof) {
@@ -395,7 +395,7 @@ chunk_dalloc_core(void *chunk, size_t size)
 	assert((size & chunksize_mask) == 0);
 
 	if (config_ivsalloc)
-		rtree_set(chunks_rtree, (uintptr_t)chunk, 0);
+		rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL);
 	if (config_stats || config_prof) {
 		malloc_mutex_lock(&chunks_mtx);
 		assert(stats_chunks.curchunks >= (size / chunksize));
@@ -415,6 +415,14 @@ chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind)
 	return (false);
 }
 
+static rtree_node_elm_t *
+chunks_rtree_node_alloc(size_t nelms)
+{
+
+	return ((rtree_node_elm_t *)base_alloc(nelms *
+	    sizeof(rtree_node_elm_t)));
+}
+
 bool
 chunk_boot(void)
 {
@@ -436,9 +444,8 @@ chunk_boot(void)
 	extent_tree_szad_new(&chunks_szad_dss);
 	extent_tree_ad_new(&chunks_ad_dss);
 	if (config_ivsalloc) {
-		chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) -
-		    opt_lg_chunk, base_alloc, NULL);
-		if (chunks_rtree == NULL)
+		if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) -
+		    opt_lg_chunk, chunks_rtree_node_alloc, NULL))
 			return (true);
 	}
 
@@ -450,8 +457,6 @@ chunk_prefork(void)
 {
 
 	malloc_mutex_prefork(&chunks_mtx);
-	if (config_ivsalloc)
-		rtree_prefork(chunks_rtree);
 	chunk_dss_prefork();
 }
 
@@ -460,8 +465,6 @@ chunk_postfork_parent(void)
 {
 
 	chunk_dss_postfork_parent();
-	if (config_ivsalloc)
-		rtree_postfork_parent(chunks_rtree);
 	malloc_mutex_postfork_parent(&chunks_mtx);
 }
 
@@ -470,7 +473,5 @@ chunk_postfork_child(void)
 {
 
 	chunk_dss_postfork_child();
-	if (config_ivsalloc)
-		rtree_postfork_child(chunks_rtree);
 	malloc_mutex_postfork_child(&chunks_mtx);
 }
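
Taken together, the chunk.c changes make chunks_rtree a plain global
that is initialized in place, with node memory coming from base_alloc()
(which never frees, hence the NULL dalloc argument), and with chunk
pointers rather than a boolean 1 stored as values, so a successful
lookup yields the chunk address directly.  The prefork/postfork hooks
disappear because the rtree no longer contains a mutex.  A condensed
sketch of the resulting call pattern, assembled from the hunks above:

    rtree_t chunks_rtree;	/* statically allocated; no rtree_t header allocation */

    /* Boot: rtree_new() now initializes in place and returns true on error. */
    if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk,
        chunks_rtree_node_alloc, NULL))
        return (true);

    /* Register, then deregister, a chunk, as in chunk_register() and
     * chunk_dalloc_core(). */
    rtree_set(&chunks_rtree, (uintptr_t)chunk, chunk);
    rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL);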

diff --git a/src/rtree.c b/src/rtree.c
--- a/src/rtree.c
+++ b/src/rtree.c

@@ -1,75 +1,74 @@
 #define JEMALLOC_RTREE_C_
 #include "jemalloc/internal/jemalloc_internal.h"
 
-rtree_t *
-rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
+static unsigned
+hmin(unsigned ha, unsigned hb)
 {
-	rtree_t *ret;
-	unsigned bits_per_level, bits_in_leaf, height, i;
+
+	return (ha < hb ? ha : hb);
+}
+
+/* Only the most significant bits of keys passed to rtree_[gs]et() are used. */
+bool
+rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc,
+    rtree_node_dalloc_t *dalloc)
+{
+	unsigned bits_in_leaf, height, i;
 
 	assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));
 
-	bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void
-	    *)))) - 1;
-	bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE /
-	    sizeof(uint8_t)))) - 1;
+	bits_in_leaf = (bits % RTREE_BITS_PER_LEVEL) == 0 ? RTREE_BITS_PER_LEVEL
+	    : (bits % RTREE_BITS_PER_LEVEL);
 	if (bits > bits_in_leaf) {
-		height = 1 + (bits - bits_in_leaf) / bits_per_level;
-		if ((height-1) * bits_per_level + bits_in_leaf != bits)
+		height = 1 + (bits - bits_in_leaf) / RTREE_BITS_PER_LEVEL;
+		if ((height-1) * RTREE_BITS_PER_LEVEL + bits_in_leaf != bits)
 			height++;
-	} else {
-		height = 1;
-	}
-	assert((height-1) * bits_per_level + bits_in_leaf >= bits);
-
-	ret = (rtree_t*)alloc(offsetof(rtree_t, level2bits) +
-	    (sizeof(unsigned) * height));
-	if (ret == NULL)
-		return (NULL);
-	memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) *
-	    height));
-
-	ret->alloc = alloc;
-	ret->dalloc = dalloc;
-	if (malloc_mutex_init(&ret->mutex)) {
-		if (dalloc != NULL)
-			dalloc(ret);
-		return (NULL);
-	}
-	ret->height = height;
-	if (height > 1) {
-		if ((height-1) * bits_per_level + bits_in_leaf > bits) {
-			ret->level2bits[0] = (bits - bits_in_leaf) %
-			    bits_per_level;
-		} else
-			ret->level2bits[0] = bits_per_level;
-		for (i = 1; i < height-1; i++)
-			ret->level2bits[i] = bits_per_level;
-		ret->level2bits[height-1] = bits_in_leaf;
 	} else
-		ret->level2bits[0] = bits;
+		height = 1;
+	assert((height-1) * RTREE_BITS_PER_LEVEL + bits_in_leaf == bits);
 
-	ret->root = (void**)alloc(sizeof(void *) << ret->level2bits[0]);
-	if (ret->root == NULL) {
-		if (dalloc != NULL)
-			dalloc(ret);
-		return (NULL);
+	rtree->alloc = alloc;
+	rtree->dalloc = dalloc;
+	rtree->height = height;
+
+	/* Root level. */
+	rtree->levels[0].subtree = NULL;
+	rtree->levels[0].bits = (height > 1) ? RTREE_BITS_PER_LEVEL :
+	    bits_in_leaf;
+	rtree->levels[0].cumbits = rtree->levels[0].bits;
+	/* Interior levels. */
+	for (i = 1; i < height-1; i++) {
+		rtree->levels[i].subtree = NULL;
+		rtree->levels[i].bits = RTREE_BITS_PER_LEVEL;
+		rtree->levels[i].cumbits = rtree->levels[i-1].cumbits +
+		    RTREE_BITS_PER_LEVEL;
+	}
+	/* Leaf level. */
+	if (height > 1) {
+		rtree->levels[height-1].subtree = NULL;
+		rtree->levels[height-1].bits = bits_in_leaf;
+		rtree->levels[height-1].cumbits = bits;
 	}
-	memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]);
 
-	return (ret);
+	/* Compute lookup table to be used by rtree_start_level(). */
+	for (i = 0; i < RTREE_HEIGHT_MAX; i++) {
+		rtree->start_level[i] = hmin(RTREE_HEIGHT_MAX - 1 - i, height -
+		    1);
+	}
+
+	return (false);
 }
 
 static void
-rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level)
+rtree_delete_subtree(rtree_t *rtree, rtree_node_elm_t *node, unsigned level)
 {
 
 	if (level < rtree->height - 1) {
 		size_t nchildren, i;
 
-		nchildren = ZU(1) << rtree->level2bits[level];
+		nchildren = ZU(1) << rtree->levels[level].bits;
 		for (i = 0; i < nchildren; i++) {
-			void **child = (void **)node[i];
+			rtree_node_elm_t *child = node[i].child;
 			if (child != NULL)
 				rtree_delete_subtree(rtree, child, level + 1);
 		}
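
One subtlety in the new rtree_new(): the start_level table implements
the root skipping described in the commit message.  A worked trace
under assumed values (RTREE_HEIGHT_MAX = 4, height = 3):

    start_level[0] = hmin(3, 2) = 2	/* key fits in the leaf level */
    start_level[1] = hmin(2, 2) = 2
    start_level[2] = hmin(1, 2) = 1	/* root level skipped */
    start_level[3] = hmin(0, 2) = 0	/* key needs every level */

The companion header (not part of this excerpt) indexes the table by
the position of the key's highest set bit, so keys with many leading
zero bits, i.e. all practical x64 addresses, begin traversal below the
root.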
@@ -80,28 +79,49 @@ rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level)
 void
 rtree_delete(rtree_t *rtree)
 {
+	unsigned i;
 
-	rtree_delete_subtree(rtree, rtree->root, 0);
-	rtree->dalloc(rtree);
+	for (i = 0; i < rtree->height; i++) {
+		rtree_node_elm_t *subtree = rtree->levels[i].subtree;
+		if (subtree != NULL)
+			rtree_delete_subtree(rtree, subtree, i);
+	}
 }
 
-void
-rtree_prefork(rtree_t *rtree)
+static rtree_node_elm_t *
+rtree_node_init(rtree_t *rtree, unsigned level, rtree_node_elm_t **elmp)
 {
+	rtree_node_elm_t *node;
+
+	if (atomic_cas_p((void **)elmp, NULL, RTREE_NODE_INITIALIZING)) {
+		/*
+		 * Another thread is already in the process of initializing.
+		 * Spin-wait until initialization is complete.
+		 */
+		do {
+			CPU_SPINWAIT;
+			node = atomic_read_p((void **)elmp);
+		} while (node == RTREE_NODE_INITIALIZING);
+	} else {
+		node = rtree->alloc(ZU(1) << rtree->levels[level].bits);
+		if (node == NULL)
+			return (NULL);
+		atomic_write_p((void **)elmp, node);
+	}
+
+	return (node);
+}
+
+rtree_node_elm_t *
+rtree_subtree_read_hard(rtree_t *rtree, unsigned level)
+{
 
-	malloc_mutex_prefork(&rtree->mutex);
+	return (rtree_node_init(rtree, level, &rtree->levels[level].subtree));
 }
 
-void
-rtree_postfork_parent(rtree_t *rtree)
+rtree_node_elm_t *
+rtree_child_read_hard(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level)
 {
 
-	malloc_mutex_postfork_parent(&rtree->mutex);
-}
-
-void
-rtree_postfork_child(rtree_t *rtree)
-{
-
-	malloc_mutex_postfork_child(&rtree->mutex);
+	return (rtree_node_init(rtree, level, &elm->child));
 }
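
For orientation, a hypothetical stand-alone driver of the new API
(modeled on how a unit test might exercise it; the malloc-backed node
allocator and the example() wrapper are inventions of this sketch, not
code from the commit):

    #include <stdint.h>
    #include <stdlib.h>
    /* Plus the jemalloc internal header for rtree_t et al. */

    /* Nodes are assumed to come back zero-filled; base_alloc() returns
     * zeroed memory, and calloc() matches that here. */
    static rtree_node_elm_t *
    node_alloc(size_t nelms)
    {
        return ((rtree_node_elm_t *)calloc(nelms, sizeof(rtree_node_elm_t)));
    }

    static void
    node_dalloc(rtree_node_elm_t *node)
    {
        free(node);
    }

    static void
    example(void *chunk)
    {
        rtree_t rtree;

        /* Track 43 key bits, as chunk_boot() does for 2 MiB chunks. */
        if (rtree_new(&rtree, 43, node_alloc, node_dalloc))
            abort();
        /* Keys are consumed from the most significant end (see the
         * comment above rtree_new()), so callers pass whole addresses,
         * exactly as src/chunk.c does. */
        rtree_set(&rtree, (uintptr_t)chunk, chunk);
        rtree_set(&rtree, (uintptr_t)chunk, NULL);	/* clear the mapping */
        rtree_delete(&rtree);	/* frees nodes via node_dalloc() */
    }

Unlike the old API, failure is reported via a bool return rather than a
NULL rtree_t pointer, and teardown no longer frees the rtree_t itself,
since the caller owns it.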