Refactor rtree to be lock-free.

Recent huge allocation refactoring associates huge allocations with
arenas, but it remains necessary to quickly look up huge allocation
metadata during reallocation/deallocation.  A global radix tree remains
a good solution to this problem, but locking would have become the
primary bottleneck after (upcoming) migration of chunk management from
global to per arena data structures.

This lock-free implementation uses double-checked reads to traverse the
tree, so that in the steady state, each read or write requires only a
single atomic operation.

This implementation also assures that no more than two tree levels
actually exist, through a combination of careful virtual memory
allocation which makes large sparse nodes cheap, and skipping the root
node on x64 (possible because the top 16 bits are all 0 in practice).
This commit is contained in:
Jason Evans
2015-01-30 22:54:08 -08:00
parent c810fcea1f
commit 8d0e04d42f
7 changed files with 384 additions and 229 deletions

View File

@@ -35,7 +35,7 @@ extern malloc_mutex_t chunks_mtx;
/* Chunk statistics. */
extern chunk_stats_t stats_chunks;
extern rtree_t *chunks_rtree;
extern rtree_t chunks_rtree;
extern size_t chunksize;
extern size_t chunksize_mask; /* (chunksize - 1). */

View File

@@ -955,7 +955,7 @@ ivsalloc(const void *ptr, bool demote)
{
/* Return 0 if ptr is not within a chunk managed by jemalloc. */
if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0)
if (rtree_get(&chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0)
return (0);
return (isalloc(ptr, demote));

View File

@@ -369,14 +369,21 @@ quarantine_alloc_hook
quarantine_alloc_hook_work
quarantine_cleanup
register_zone
rtree_child_read
rtree_child_read_hard
rtree_child_tryread
rtree_delete
rtree_get
rtree_get_locked
rtree_new
rtree_postfork_child
rtree_postfork_parent
rtree_prefork
rtree_node_valid
rtree_set
rtree_start_level
rtree_subkey
rtree_subtree_read
rtree_subtree_read_hard
rtree_subtree_tryread
rtree_val_read
rtree_val_write
s2u
s2u_compute
s2u_lookup

View File

@@ -1,170 +1,270 @@
/*
* This radix tree implementation is tailored to the singular purpose of
* tracking which chunks are currently owned by jemalloc. This functionality
* is mandatory for OS X, where jemalloc must be able to respond to object
* ownership queries.
* associating metadata with chunks that are currently owned by jemalloc.
*
*******************************************************************************
*/
#ifdef JEMALLOC_H_TYPES
typedef struct rtree_node_elm_s rtree_node_elm_t;
typedef struct rtree_level_s rtree_level_t;
typedef struct rtree_s rtree_t;
/*
* Size of each radix tree node (must be a power of 2). This impacts tree
* depth.
* RTREE_BITS_PER_LEVEL must be a power of two that is no larger than the
* machine address width.
*/
#define RTREE_NODESIZE (1U << 16)
#define LG_RTREE_BITS_PER_LEVEL 4
#define RTREE_BITS_PER_LEVEL (ZU(1) << LG_RTREE_BITS_PER_LEVEL)
#define RTREE_HEIGHT_MAX \
((ZU(1) << (LG_SIZEOF_PTR+3)) / RTREE_BITS_PER_LEVEL)
typedef void *(rtree_alloc_t)(size_t);
typedef void (rtree_dalloc_t)(void *);
/* Used for two-stage lock-free node initialization. */
#define RTREE_NODE_INITIALIZING ((rtree_node_elm_t *)0x1)
/*
* The node allocation callback function's argument is the number of contiguous
* rtree_node_elm_t structures to allocate, and the resulting memory must be
* zeroed.
*/
typedef rtree_node_elm_t *(rtree_node_alloc_t)(size_t);
typedef void (rtree_node_dalloc_t)(rtree_node_elm_t *);
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
struct rtree_node_elm_s {
union {
rtree_node_elm_t *child;
void *val;
};
};
struct rtree_level_s {
/*
* A non-NULL subtree points to a subtree rooted along the hypothetical
* path to the leaf node corresponding to key 0. Depending on what keys
* have been used to store to the tree, an arbitrary combination of
* subtree pointers may remain NULL.
*
* Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4.
* This results in a 3-level tree, and the leftmost leaf can be directly
* accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding
* 0x00000000) can be accessed via subtrees[1], and the remainder of the
* tree can be accessed via subtrees[0].
*
* levels[0] : [<unused> | 0x0001******** | 0x0002******** | ...]
*
* levels[1] : [<unused> | 0x00000001**** | 0x00000002**** | ... ]
*
* levels[2] : [val(0x000000000000) | val(0x000000000001) | ...]
*
* This has practical implications on x64, which currently uses only the
* lower 47 bits of virtual address space in userland, thus leaving
* subtrees[0] unused and avoiding a level of tree traversal.
*/
rtree_node_elm_t *subtree;
/* Number of key bits distinguished by this level. */
unsigned bits;
/*
* Cumulative number of key bits distinguished by traversing to
* corresponding tree level.
*/
unsigned cumbits;
};
struct rtree_s {
rtree_alloc_t *alloc;
rtree_dalloc_t *dalloc;
malloc_mutex_t mutex;
void **root;
unsigned height;
unsigned level2bits[1]; /* Dynamically sized. */
rtree_node_alloc_t *alloc;
rtree_node_dalloc_t *dalloc;
unsigned height;
/*
* Precomputed table used to convert from the number of leading 0 key
* bits to which subtree level to start at.
*/
unsigned start_level[RTREE_HEIGHT_MAX];
rtree_level_t levels[RTREE_HEIGHT_MAX];
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
rtree_t *rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc);
bool rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc,
rtree_node_dalloc_t *dalloc);
void rtree_delete(rtree_t *rtree);
void rtree_prefork(rtree_t *rtree);
void rtree_postfork_parent(rtree_t *rtree);
void rtree_postfork_child(rtree_t *rtree);
rtree_node_elm_t *rtree_subtree_read_hard(rtree_t *rtree,
unsigned level);
rtree_node_elm_t *rtree_child_read_hard(rtree_t *rtree,
rtree_node_elm_t *elm, unsigned level);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
#ifdef JEMALLOC_DEBUG
uint8_t rtree_get_locked(rtree_t *rtree, uintptr_t key);
#endif
uint8_t rtree_get(rtree_t *rtree, uintptr_t key);
bool rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val);
unsigned rtree_start_level(rtree_t *rtree, uintptr_t key);
uintptr_t rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level);
bool rtree_node_valid(rtree_node_elm_t *node);
rtree_node_elm_t *rtree_child_tryread(rtree_node_elm_t *elm);
rtree_node_elm_t *rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm,
unsigned level);
void *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm);
void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, void *val);
rtree_node_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level);
rtree_node_elm_t *rtree_subtree_read(rtree_t *rtree, unsigned level);
void *rtree_get(rtree_t *rtree, uintptr_t key);
bool rtree_set(rtree_t *rtree, uintptr_t key, void *val);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
#define RTREE_GET_GENERATE(f) \
/* The least significant bits of the key are ignored. */ \
JEMALLOC_INLINE uint8_t \
f(rtree_t *rtree, uintptr_t key) \
{ \
uint8_t ret; \
uintptr_t subkey; \
unsigned i, lshift, height, bits; \
void **node, **child; \
\
RTREE_LOCK(&rtree->mutex); \
for (i = lshift = 0, height = rtree->height, node = rtree->root;\
i < height - 1; \
i++, lshift += bits, node = child) { \
bits = rtree->level2bits[i]; \
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \
3)) - bits); \
child = (void**)node[subkey]; \
if (child == NULL) { \
RTREE_UNLOCK(&rtree->mutex); \
return (0); \
} \
} \
\
/* \
* node is a leaf, so it contains values rather than node \
* pointers. \
*/ \
bits = rtree->level2bits[i]; \
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \
bits); \
{ \
uint8_t *leaf = (uint8_t *)node; \
ret = leaf[subkey]; \
} \
RTREE_UNLOCK(&rtree->mutex); \
\
RTREE_GET_VALIDATE \
return (ret); \
JEMALLOC_INLINE unsigned
rtree_start_level(rtree_t *rtree, uintptr_t key)
{
unsigned start_level;
if (unlikely(key == 0))
return (rtree->height - 1);
start_level = rtree->start_level[lg_floor(key) >>
LG_RTREE_BITS_PER_LEVEL];
assert(start_level < rtree->height);
return (start_level);
}
#ifdef JEMALLOC_DEBUG
# define RTREE_LOCK(l) malloc_mutex_lock(l)
# define RTREE_UNLOCK(l) malloc_mutex_unlock(l)
# define RTREE_GET_VALIDATE
RTREE_GET_GENERATE(rtree_get_locked)
# undef RTREE_LOCK
# undef RTREE_UNLOCK
# undef RTREE_GET_VALIDATE
#endif
JEMALLOC_INLINE uintptr_t
rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level)
{
#define RTREE_LOCK(l)
#define RTREE_UNLOCK(l)
#ifdef JEMALLOC_DEBUG
/*
* Suppose that it were possible for a jemalloc-allocated chunk to be
* munmap()ped, followed by a different allocator in another thread re-using
* overlapping virtual memory, all without invalidating the cached rtree
* value. The result would be a false positive (the rtree would claim that
* jemalloc owns memory that it had actually discarded). This scenario
* seems impossible, but the following assertion is a prudent sanity check.
*/
# define RTREE_GET_VALIDATE \
assert(rtree_get_locked(rtree, key) == ret);
#else
# define RTREE_GET_VALIDATE
#endif
RTREE_GET_GENERATE(rtree_get)
#undef RTREE_LOCK
#undef RTREE_UNLOCK
#undef RTREE_GET_VALIDATE
return ((key >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
rtree->levels[level].cumbits)) & ((ZU(1) <<
rtree->levels[level].bits) - 1));
}
JEMALLOC_INLINE bool
rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val)
rtree_node_valid(rtree_node_elm_t *node)
{
return ((uintptr_t)node > (uintptr_t)RTREE_NODE_INITIALIZING);
}
JEMALLOC_INLINE rtree_node_elm_t *
rtree_child_tryread(rtree_node_elm_t *elm)
{
rtree_node_elm_t *child;
/* Double-checked read (first read may be stale. */
child = elm->child;
if (!rtree_node_valid(child))
child = atomic_read_p((void **)&elm->child);
return (child);
}
JEMALLOC_INLINE rtree_node_elm_t *
rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level)
{
rtree_node_elm_t *child;
child = rtree_child_tryread(elm);
if (unlikely(!rtree_node_valid(child)))
child = rtree_child_read_hard(rtree, elm, level);
return (child);
}
JEMALLOC_INLINE void *
rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm)
{
return (atomic_read_p(&elm->val));
}
JEMALLOC_INLINE void
rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, void *val)
{
atomic_write_p(&elm->val, val);
}
JEMALLOC_INLINE rtree_node_elm_t *
rtree_subtree_tryread(rtree_t *rtree, unsigned level)
{
rtree_node_elm_t *subtree;
/* Double-checked read (first read may be stale. */
subtree = rtree->levels[level].subtree;
if (!rtree_node_valid(subtree))
subtree = atomic_read_p((void **)&rtree->levels[level].subtree);
return (subtree);
}
JEMALLOC_INLINE rtree_node_elm_t *
rtree_subtree_read(rtree_t *rtree, unsigned level)
{
rtree_node_elm_t *subtree;
subtree = rtree_subtree_tryread(rtree, level);
if (unlikely(!rtree_node_valid(subtree)))
subtree = rtree_subtree_read_hard(rtree, level);
return (subtree);
}
JEMALLOC_INLINE void *
rtree_get(rtree_t *rtree, uintptr_t key)
{
uintptr_t subkey;
unsigned i, lshift, height, bits;
void **node, **child;
unsigned i, start_level;
rtree_node_elm_t *node, *child;
malloc_mutex_lock(&rtree->mutex);
for (i = lshift = 0, height = rtree->height, node = rtree->root;
i < height - 1;
i++, lshift += bits, node = child) {
bits = rtree->level2bits[i];
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
bits);
child = (void**)node[subkey];
if (child == NULL) {
size_t size = ((i + 1 < height - 1) ? sizeof(void *)
: (sizeof(uint8_t))) << rtree->level2bits[i+1];
child = (void**)rtree->alloc(size);
if (child == NULL) {
malloc_mutex_unlock(&rtree->mutex);
return (true);
}
memset(child, 0, size);
node[subkey] = child;
start_level = rtree_start_level(rtree, key);
for (i = start_level, node = rtree_subtree_tryread(rtree, start_level);
/**/; i++, node = child) {
if (unlikely(!rtree_node_valid(node)))
return (NULL);
subkey = rtree_subkey(rtree, key, i);
if (i == rtree->height - 1) {
/*
* node is a leaf, so it contains values rather than
* child pointers.
*/
return (rtree_val_read(rtree, &node[subkey]));
}
assert(i < rtree->height - 1);
child = rtree_child_tryread(&node[subkey]);
}
not_reached();
}
/* node is a leaf, so it contains values rather than node pointers. */
bits = rtree->level2bits[i];
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits);
{
uint8_t *leaf = (uint8_t *)node;
leaf[subkey] = val;
JEMALLOC_INLINE bool
rtree_set(rtree_t *rtree, uintptr_t key, void *val)
{
uintptr_t subkey;
unsigned i, start_level;
rtree_node_elm_t *node, *child;
start_level = rtree_start_level(rtree, key);
node = rtree_subtree_read(rtree, start_level);
if (node == NULL)
return (true);
for (i = start_level; /**/; i++, node = child) {
subkey = rtree_subkey(rtree, key, i);
if (i == rtree->height - 1) {
/*
* node is a leaf, so it contains values rather than
* child pointers.
*/
rtree_val_write(rtree, &node[subkey], val);
return (false);
}
assert(i < rtree->height - 1);
child = rtree_child_read(rtree, &node[subkey], i);
if (child == NULL)
return (true);
}
malloc_mutex_unlock(&rtree->mutex);
return (false);
not_reached();
}
#endif