Use rtree tracked states to protect edata outside of ecache locks.

This avoids the addr-based mutexes (i.e. the mutex_pool), and instead relies on
the metadata tracked in the rtree leaf: the head state and the extent state.  Before
trying to access a neighbor edata (e.g. for coalescing), the states are
verified first -- only neighbor edatas from the same arena and in the same
state will be accessed.
Author: Qi Wang (committed by Qi Wang)
Date: 2021-03-04 14:33:40 -08:00
Commit: 1784939688 (parent: 9ea235f8fe)
7 changed files with 366 additions and 277 deletions
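
To make the intended flow concrete, here is a minimal caller-side sketch
(illustrative only; the function name and exact sequence are hypothetical, and
the real callers in the extent code carry more state) of how the acquire /
release API declared in this diff is meant to be used when coalescing:

/*
 * Hypothetical caller: try to coalesce with the neighbor to the right.
 * ecache_state is the stable state the neighbor is expected to be in
 * (e.g. extent_state_dirty).
 */
static void
coalesce_forward_sketch(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
    extent_state_t ecache_state) {
	/*
	 * Succeeds only if the rtree-tracked state of the neighbor matches
	 * ecache_state and the neighbor belongs to the same arena; on
	 * success the neighbor is marked with an intermediate state (e.g.
	 * extent_state_merging) so that no other thread will touch it.
	 */
	edata_t *neighbor = emap_try_acquire_edata_neighbor(tsdn, emap, edata,
	    edata_pai_get(edata), ecache_state, /* forward */ true);
	if (neighbor == NULL) {
		/* Neighbor absent, busy, or in a different state / arena. */
		return;
	}
	/* ... merge edata and neighbor here (extent merge elided) ... */

	/* Restore a stable state in the rtree so the result is visible. */
	emap_release_edata(tsdn, emap, edata, ecache_state);
}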


@@ -23,7 +23,11 @@ enum extent_state_e {
extent_state_active = 0,
extent_state_dirty = 1,
extent_state_muzzy = 2,
extent_state_retained = 3
extent_state_retained = 3,
extent_state_transition = 4, /* States below are intermediate. */
extent_state_updating = 4,
extent_state_merging = 5,
extent_state_max = 5 /* Sanity checking only. */
};
typedef enum extent_state_e extent_state_t;
@@ -550,6 +554,11 @@ edata_is_head_set(edata_t *edata, bool is_head) {
((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT);
}
static inline bool
edata_state_in_transition(extent_state_t state) {
return state >= extent_state_transition;
}
/*
* Because this function is implemented as a sequence of bitfield modifications,
* even though each individual bit is properly initialized, we technically read


@@ -5,6 +5,15 @@
#include "jemalloc/internal/mutex_pool.h"
#include "jemalloc/internal/rtree.h"
/*
* Note: Ends without a semicolon, so that
* EMAP_DECLARE_RTREE_CTX;
* in uses will avoid empty-statement warnings.
*/
#define EMAP_DECLARE_RTREE_CTX \
rtree_ctx_t rtree_ctx_fallback; \
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
typedef struct emap_s emap_t;
struct emap_s {
rtree_t rtree;
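
For reference, the note above describes this kind of expansion: the use site
provides the terminating semicolon, so the macro must not (hypothetical caller
shown; the converted lookups later in this diff follow the same pattern):

static inline edata_t *
lookup_example(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
	/* Expands to the fallback ctx declaration plus the ctx lookup. */
	EMAP_DECLARE_RTREE_CTX;

	return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata;
}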
@@ -31,20 +40,16 @@ bool emap_init(emap_t *emap, base_t *base, bool zeroed);
void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind,
bool slab);
/*
* Grab the lock or locks associated with the edata or edatas indicated (which
* is done just by simple address hashing). The hashing strategy means that
* it's never safe to grab locks incrementally -- you have to grab all the locks
* you'll need at once, and release them all at once.
*/
void emap_lock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
void emap_unlock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
void emap_lock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1,
edata_t *edata2);
void emap_unlock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1,
edata_t *edata2);
edata_t *emap_lock_edata_from_addr(tsdn_t *tsdn, emap_t *emap, void *addr,
bool inactive_only);
void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t state);
edata_t *emap_try_acquire_edata(tsdn_t *tsdn, emap_t *emap, void *addr,
extent_state_t expected_state, bool allow_head_extent);
edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap,
edata_t *edata, extent_pai_t pai, extent_state_t expected_state,
bool forward);
void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t new_state);
/*
* Associate the given edata with its beginning and end address, setting the
@@ -136,43 +141,66 @@ emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
}
}
static inline void
emap_update_rtree_at_addr(tsdn_t *tsdn, rtree_t *rtree, edata_t *expected_edata,
uintptr_t addr, extent_state_t state) {
witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE);
JEMALLOC_ALWAYS_INLINE bool
emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
assert(config_debug);
emap_assert_mapped(tsdn, emap, edata);
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)edata_base_get(edata));
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx,
addr, /* dependent */ true, /* init_missing */ false);
assert(elm != NULL);
rtree_contents_t contents = rtree_leaf_elm_read(tsdn, rtree, elm,
/* dependent */ true);
assert(contents.edata == expected_edata);
contents.metadata.state = state;
rtree_leaf_elm_write(tsdn, rtree, elm, contents);
return edata_state_in_transition(contents.metadata.state);
}
static inline void
emap_edata_state_update(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t state) {
/* Only emap is allowed to modify the edata internal state. */
edata_state_set(edata, state);
JEMALLOC_ALWAYS_INLINE bool
emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
if (!config_debug) {
/* For assertions only. */
return false;
}
emap_update_rtree_at_addr(tsdn, &emap->rtree, edata,
(uintptr_t)edata_base_get(edata), state);
emap_update_rtree_at_addr(tsdn, &emap->rtree, edata,
(uintptr_t)edata_last_get(edata), state);
/*
* The edata is considered acquired if no other threads will attempt to
* read / write any fields from it. This includes a few cases:
*
* 1) edata not hooked into emap yet -- This implies the edata just got
* allocated or initialized.
*
* 2) in an active or transition state -- In both cases, the edata can
be discovered from the emap; however, the state tracked in the rtree
* will prevent other threads from accessing the actual edata.
*/
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true,
/* init_missing */ false);
if (elm == NULL) {
return true;
}
rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
/* dependent */ true);
if (contents.edata == NULL ||
contents.metadata.state == extent_state_active ||
edata_state_in_transition(contents.metadata.state)) {
return true;
}
emap_assert_mapped(tsdn, emap, edata);
return false;
}
JEMALLOC_ALWAYS_INLINE void
extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) {
assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer));
assert(edata_pai_get(inner) == edata_pai_get(outer));
assert(edata_committed_get(inner) == edata_committed_get(outer));
assert(edata_state_get(inner) == extent_state_active);
assert(edata_state_get(outer) == extent_state_merging);
}
JEMALLOC_ALWAYS_INLINE edata_t *
emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata;
}
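
As a concrete (hypothetical) illustration of the emap_edata_is_acquired
comment above: the helper is meant for assertion-only call sites that are
about to write edata fields, e.g. (caller name is made up, and the sketch
assumes the edata_size_set accessor):

static void
edata_resize_sketch(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
    size_t new_size) {
	/* No other thread may read or write the edata at this point. */
	assert(emap_edata_is_acquired(tsdn, emap, edata));
	edata_size_set(edata, new_size);
}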
@@ -181,8 +209,7 @@ emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
JEMALLOC_ALWAYS_INLINE void
emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_alloc_ctx_t *alloc_ctx) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)ptr);
@@ -194,8 +221,7 @@ emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
JEMALLOC_ALWAYS_INLINE void
emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_full_alloc_ctx_t *full_alloc_ctx) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)ptr);
@@ -212,8 +238,7 @@ emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
JEMALLOC_ALWAYS_INLINE bool
emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_full_alloc_ctx_t *full_alloc_ctx) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents;
bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx,


@@ -81,7 +81,7 @@ struct rtree_leaf_elm_s {
#else
atomic_p_t le_edata; /* (edata_t *) */
/*
* From low to high bits: slab, is_head, state.
* From high to low bits: szind (8 bits), state (4 bits), is_head, slab
*/
atomic_u_t le_metadata;
#endif
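
The layout comment above can be illustrated with a standalone sketch of the
packing.  The EX_* constants are hypothetical; the real shifts and masks come
from RTREE_LEAF_STATE_SHIFT, RTREE_LEAF_STATE_WIDTH, and RTREE_LEAF_STATE_MASK
used later in this file:

#include <assert.h>

/* Hypothetical constants mirroring the documented layout. */
#define EX_SLAB_BIT	(1U << 0)
#define EX_IS_HEAD_BIT	(1U << 1)
#define EX_STATE_SHIFT	2
#define EX_STATE_WIDTH	4
#define EX_STATE_MASK	(((1U << EX_STATE_WIDTH) - 1) << EX_STATE_SHIFT)
#define EX_SZIND_SHIFT	(EX_STATE_SHIFT + EX_STATE_WIDTH)

int
main(void) {
	/* Pack: szind (high bits), then state, is_head, slab (low bits). */
	unsigned bits = (7U << EX_SZIND_SHIFT) | (3U << EX_STATE_SHIFT) |
	    EX_IS_HEAD_BIT;
	/* Unpack the same way the decode paths below do. */
	assert(((bits & EX_STATE_MASK) >> EX_STATE_SHIFT) == 3);
	assert((bits >> EX_SZIND_SHIFT) == 7);
	assert((bits & EX_IS_HEAD_BIT) != 0);
	assert((bits & EX_SLAB_BIT) == 0);
	return 0;
}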
@@ -187,6 +187,7 @@ rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree,
JEMALLOC_ALWAYS_INLINE uintptr_t
rtree_leaf_elm_bits_encode(rtree_contents_t contents) {
assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0);
uintptr_t edata_bits = (uintptr_t)contents.edata
& (((uintptr_t)1 << LG_VADDR) - 1);
@@ -212,6 +213,7 @@ rtree_leaf_elm_bits_decode(uintptr_t bits) {
uintptr_t state_bits = (bits & RTREE_LEAF_STATE_MASK) >>
RTREE_LEAF_STATE_SHIFT;
assert(state_bits <= extent_state_max);
contents.metadata.state = (extent_state_t)state_bits;
uintptr_t low_bit_mask = ~((uintptr_t)EDATA_ALIGNMENT - 1);
@@ -229,6 +231,7 @@ rtree_leaf_elm_bits_decode(uintptr_t bits) {
contents.edata = (edata_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB)
>> RTREE_NHIB) & low_bit_mask);
# endif
assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0);
return contents;
}
@@ -250,6 +253,7 @@ rtree_leaf_elm_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
uintptr_t state_bits = (metadata_bits & RTREE_LEAF_STATE_MASK) >>
RTREE_LEAF_STATE_SHIFT;
assert(state_bits <= extent_state_max);
contents.metadata.state = (extent_state_t)state_bits;
contents.metadata.szind = metadata_bits >> (RTREE_LEAF_STATE_SHIFT +
RTREE_LEAF_STATE_WIDTH);
@@ -283,6 +287,31 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree,
#endif
}
/* The state field can be updated independently (and more frequently). */
static inline void
rtree_leaf_elm_state_update(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm1, rtree_leaf_elm_t *elm2, extent_state_t state) {
assert(elm1 != NULL);
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm1,
/* dependent */ true);
bits &= ~RTREE_LEAF_STATE_MASK;
bits |= state << RTREE_LEAF_STATE_SHIFT;
atomic_store_p(&elm1->le_bits, (void *)bits, ATOMIC_RELEASE);
if (elm2 != NULL) {
atomic_store_p(&elm2->le_bits, (void *)bits, ATOMIC_RELEASE);
}
#else
unsigned bits = atomic_load_u(&elm1->le_metadata, ATOMIC_RELAXED);
bits &= ~RTREE_LEAF_STATE_MASK;
bits |= state << RTREE_LEAF_STATE_SHIFT;
atomic_store_u(&elm1->le_metadata, bits, ATOMIC_RELEASE);
if (elm2 != NULL) {
atomic_store_u(&elm2->le_metadata, bits, ATOMIC_RELEASE);
}
#endif
}
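
A hedged sketch of how the emap layer might drive this helper for one extent,
looking up the leaf elements at the extent's base and last addresses and
updating both in a single call; the function name is hypothetical and the real
implementation in emap.c may differ:

static void
emap_state_update_sketch(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
    extent_state_t state) {
	EMAP_DECLARE_RTREE_CTX;

	rtree_leaf_elm_t *elm_a = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
	    rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true,
	    /* init_missing */ false);
	rtree_leaf_elm_t *elm_b = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
	    rtree_ctx, (uintptr_t)edata_last_get(edata), /* dependent */ true,
	    /* init_missing */ false);
	/* Update the rtree-tracked state at both ends of the extent. */
	rtree_leaf_elm_state_update(tsdn, &emap->rtree, elm_a, elm_b, state);

	/* Keep the edata's own state field in sync. */
	edata_state_set(edata, state);
}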
/*
* Tries to look up the key in the L1 cache, returning it if there's a hit, or
* NULL if there's a miss.