Use rtree tracked states to protect edata outside of ecache locks.

This avoids the addr-based mutexes (i.e. the mutex_pool), and instead relies on
the metadata tracked in the rtree leaf: the head state and extent_state.  Before
trying to access the neighbor edata (e.g. for coalescing), the states will be
verified first -- only neighbor edatas from the same arena and with the same
state will be accessed.
Qi Wang 2021-03-04 14:33:40 -08:00 committed by Qi Wang
parent 9ea235f8fe
commit 1784939688
7 changed files with 366 additions and 277 deletions
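For orientation, here is a minimal standalone sketch of the protocol the diff below implements (hypothetical toy names, not jemalloc code): each rtree leaf publishes an arena index plus an extent state, acquiring an extent means flipping it into an intermediate state, and a neighbor's edata is only dereferenced after that published metadata has been checked. The sketch leaves out the ecache mutex under which the real code performs the state flip.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* The first three states stand in for the settled ones; the last two are intermediate. */
typedef enum {
	TOY_STATE_ACTIVE,
	TOY_STATE_DIRTY,
	TOY_STATE_RETAINED,
	TOY_STATE_UPDATING,	/* acquired for reuse */
	TOY_STATE_MERGING	/* acquired for coalescing */
} toy_state_t;

/* What an rtree leaf would publish about one extent. */
typedef struct {
	unsigned arena_ind;
	toy_state_t state;
} toy_slot_t;

/*
 * Acquire a neighbor for merging: succeed only if it belongs to the same
 * arena and is in the expected settled state.  Acquisition is just a state
 * flip, so no per-address mutex is involved.
 */
static bool
toy_try_acquire_neighbor(const toy_slot_t *self, toy_slot_t *neighbor,
    toy_state_t expected) {
	if (neighbor->arena_ind != self->arena_ind) {
		return false;
	}
	if (neighbor->state != expected) {
		/* Covers neighbors that are active or already in transition. */
		return false;
	}
	neighbor->state = TOY_STATE_MERGING;
	return true;
}

int
main(void) {
	toy_slot_t cur = {0, TOY_STATE_ACTIVE};
	toy_slot_t next = {0, TOY_STATE_DIRTY};
	assert(toy_try_acquire_neighbor(&cur, &next, TOY_STATE_DIRTY));
	/* A second acquirer now sees the transition state and backs off. */
	assert(!toy_try_acquire_neighbor(&cur, &next, TOY_STATE_DIRTY));
	next.state = TOY_STATE_DIRTY;	/* release back to a settled state */
	printf("acquired and released via state transitions\n");
	return 0;
}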

View File

@ -23,7 +23,11 @@ enum extent_state_e {
extent_state_active = 0,
extent_state_dirty = 1,
extent_state_muzzy = 2,
extent_state_retained = 3
extent_state_retained = 3,
extent_state_transition = 4, /* States below are intermediate. */
extent_state_updating = 4,
extent_state_merging = 5,
extent_state_max = 5 /* Sanity checking only. */
};
typedef enum extent_state_e extent_state_t;
@ -550,6 +554,11 @@ edata_is_head_set(edata_t *edata, bool is_head) {
((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT);
}
static inline bool
edata_state_in_transition(extent_state_t state) {
return state >= extent_state_transition;
}
/*
* Because this function is implemented as a sequence of bitfield modifications,
* even though each individual bit is properly initialized, we technically read

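A quick illustration of the new edata_state_in_transition() predicate, using the enum values above: extent_state_transition is aliased to extent_state_updating (both 4), so the single >= comparison classifies updating and merging as intermediate while leaving the settled states alone. The following assertions would hold (shown for illustration only):

assert(!edata_state_in_transition(extent_state_active));
assert(!edata_state_in_transition(extent_state_dirty));
assert(!edata_state_in_transition(extent_state_retained));
assert(edata_state_in_transition(extent_state_updating));	/* == extent_state_transition */
assert(edata_state_in_transition(extent_state_merging));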
View File

@ -5,6 +5,15 @@
#include "jemalloc/internal/mutex_pool.h"
#include "jemalloc/internal/rtree.h"
/*
* Note: Ends without a semicolon, so that
* EMAP_DECLARE_RTREE_CTX;
* in uses will avoid empty-statement warnings.
*/
#define EMAP_DECLARE_RTREE_CTX \
rtree_ctx_t rtree_ctx_fallback; \
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
typedef struct emap_s emap_t;
struct emap_s {
rtree_t rtree;
@ -31,20 +40,16 @@ bool emap_init(emap_t *emap, base_t *base, bool zeroed);
void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind,
bool slab);
/*
* Grab the lock or locks associated with the edata or edatas indicated (which
* is done just by simple address hashing). The hashing strategy means that
* it's never safe to grab locks incrementally -- you have to grab all the locks
* you'll need at once, and release them all at once.
*/
void emap_lock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
void emap_unlock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
void emap_lock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1,
edata_t *edata2);
void emap_unlock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1,
edata_t *edata2);
edata_t *emap_lock_edata_from_addr(tsdn_t *tsdn, emap_t *emap, void *addr,
bool inactive_only);
void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t state);
edata_t *emap_try_acquire_edata(tsdn_t *tsdn, emap_t *emap, void *addr,
extent_state_t expected_state, bool allow_head_extent);
edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap,
edata_t *edata, extent_pai_t pai, extent_state_t expected_state,
bool forward);
void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t new_state);
/*
* Associate the given edata with its beginning and end address, setting the
@ -136,43 +141,66 @@ emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
}
}
static inline void
emap_update_rtree_at_addr(tsdn_t *tsdn, rtree_t *rtree, edata_t *expected_edata,
uintptr_t addr, extent_state_t state) {
witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE);
JEMALLOC_ALWAYS_INLINE bool
emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
assert(config_debug);
emap_assert_mapped(tsdn, emap, edata);
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)edata_base_get(edata));
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx,
addr, /* dependent */ true, /* init_missing */ false);
assert(elm != NULL);
rtree_contents_t contents = rtree_leaf_elm_read(tsdn, rtree, elm,
/* dependent */ true);
assert(contents.edata == expected_edata);
contents.metadata.state = state;
rtree_leaf_elm_write(tsdn, rtree, elm, contents);
return edata_state_in_transition(contents.metadata.state);
}
static inline void
emap_edata_state_update(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t state) {
/* Only emap is allowed to modify the edata internal state. */
edata_state_set(edata, state);
JEMALLOC_ALWAYS_INLINE bool
emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
if (!config_debug) {
/* For assertions only. */
return false;
}
emap_update_rtree_at_addr(tsdn, &emap->rtree, edata,
(uintptr_t)edata_base_get(edata), state);
emap_update_rtree_at_addr(tsdn, &emap->rtree, edata,
(uintptr_t)edata_last_get(edata), state);
/*
* The edata is considered acquired if no other thread will attempt to
* read or write any of its fields. This includes a few cases:
*
* 1) edata not hooked into emap yet -- This implies the edata just got
* allocated or initialized.
*
* 2) in an active or transition state -- In both cases, the edata can
* be discovered from the emap; however, the state tracked in the rtree
* will prevent other threads from accessing the actual edata.
*/
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true,
/* init_missing */ false);
if (elm == NULL) {
return true;
}
rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
/* dependent */ true);
if (contents.edata == NULL ||
contents.metadata.state == extent_state_active ||
edata_state_in_transition(contents.metadata.state)) {
return true;
}
emap_assert_mapped(tsdn, emap, edata);
return false;
}
JEMALLOC_ALWAYS_INLINE void
extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) {
assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer));
assert(edata_pai_get(inner) == edata_pai_get(outer));
assert(edata_committed_get(inner) == edata_committed_get(outer));
assert(edata_state_get(inner) == extent_state_active);
assert(edata_state_get(outer) == extent_state_merging);
}
JEMALLOC_ALWAYS_INLINE edata_t *
emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata;
}
@ -181,8 +209,7 @@ emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
JEMALLOC_ALWAYS_INLINE void
emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_alloc_ctx_t *alloc_ctx) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)ptr);
@ -194,8 +221,7 @@ emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
JEMALLOC_ALWAYS_INLINE void
emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_full_alloc_ctx_t *full_alloc_ctx) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)ptr);
@ -212,8 +238,7 @@ emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
JEMALLOC_ALWAYS_INLINE bool
emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_full_alloc_ctx_t *full_alloc_ctx) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents;
bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx,

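To make the contract of the new acquire/release pair concrete, a small usage sketch (assumes the jemalloc-internal context; pac, tsdn and edata are as used later in this diff, and try_merge() is a hypothetical placeholder): a successful acquire moves the neighbor into extent_state_merging, after which its fields may be read safely, and the caller either merges it away or hands it back via emap_release_edata().

/* Try to grab the extent physically following edata, for coalescing. */
edata_t *next = emap_try_acquire_edata_neighbor(tsdn, pac->emap, edata,
    EXTENT_PAI_PAC, /* expected_state */ extent_state_dirty,
    /* forward */ true);
if (next != NULL) {
	/* next is now in extent_state_merging; no other thread will touch it. */
	if (try_merge(edata, next)) {	/* hypothetical merge helper */
		/* Merged: the mapping for next is rewritten by the merge path. */
	} else {
		/* Failed: return the neighbor to its settled state. */
		emap_release_edata(tsdn, pac->emap, next, extent_state_dirty);
	}
}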
View File

@ -81,7 +81,7 @@ struct rtree_leaf_elm_s {
#else
atomic_p_t le_edata; /* (edata_t *) */
/*
* From low to high bits: slab, is_head, state.
* From high to low bits: szind (8 bits), state (4 bits), is_head, slab
*/
atomic_u_t le_metadata;
#endif
@ -187,6 +187,7 @@ rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree,
JEMALLOC_ALWAYS_INLINE uintptr_t
rtree_leaf_elm_bits_encode(rtree_contents_t contents) {
assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0);
uintptr_t edata_bits = (uintptr_t)contents.edata
& (((uintptr_t)1 << LG_VADDR) - 1);
@ -212,6 +213,7 @@ rtree_leaf_elm_bits_decode(uintptr_t bits) {
uintptr_t state_bits = (bits & RTREE_LEAF_STATE_MASK) >>
RTREE_LEAF_STATE_SHIFT;
assert(state_bits <= extent_state_max);
contents.metadata.state = (extent_state_t)state_bits;
uintptr_t low_bit_mask = ~((uintptr_t)EDATA_ALIGNMENT - 1);
@ -229,6 +231,7 @@ rtree_leaf_elm_bits_decode(uintptr_t bits) {
contents.edata = (edata_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB)
>> RTREE_NHIB) & low_bit_mask);
# endif
assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0);
return contents;
}
@ -250,6 +253,7 @@ rtree_leaf_elm_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
uintptr_t state_bits = (metadata_bits & RTREE_LEAF_STATE_MASK) >>
RTREE_LEAF_STATE_SHIFT;
assert(state_bits <= extent_state_max);
contents.metadata.state = (extent_state_t)state_bits;
contents.metadata.szind = metadata_bits >> (RTREE_LEAF_STATE_SHIFT +
RTREE_LEAF_STATE_WIDTH);
@ -283,6 +287,31 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree,
#endif
}
/* The state field can be updated independently (and more frequently). */
static inline void
rtree_leaf_elm_state_update(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm1, rtree_leaf_elm_t *elm2, extent_state_t state) {
assert(elm1 != NULL);
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm1,
/* dependent */ true);
bits &= ~RTREE_LEAF_STATE_MASK;
bits |= state << RTREE_LEAF_STATE_SHIFT;
atomic_store_p(&elm1->le_bits, (void *)bits, ATOMIC_RELEASE);
if (elm2 != NULL) {
atomic_store_p(&elm2->le_bits, (void *)bits, ATOMIC_RELEASE);
}
#else
unsigned bits = atomic_load_u(&elm1->le_metadata, ATOMIC_RELAXED);
bits &= ~RTREE_LEAF_STATE_MASK;
bits |= state << RTREE_LEAF_STATE_SHIFT;
atomic_store_u(&elm1->le_metadata, bits, ATOMIC_RELEASE);
if (elm2 != NULL) {
atomic_store_u(&elm2->le_metadata, bits, ATOMIC_RELEASE);
}
#endif
}
/*
* Tries to look up the key in the L1 cache, returning it if there's a hit, or
* NULL if there's a miss.

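For reference, a toy read-modify-write of a packed metadata word, consistent with the layout comment above (from low to high bits: slab, is_head, four state bits, then szind). The shift and mask here are illustrative stand-ins for RTREE_LEAF_STATE_SHIFT / RTREE_LEAF_STATE_MASK, not the exact jemalloc definitions.

#include <assert.h>

#define TOY_STATE_SHIFT	2	/* above the slab and is_head bits */
#define TOY_STATE_WIDTH	4
#define TOY_STATE_MASK	(((1u << TOY_STATE_WIDTH) - 1) << TOY_STATE_SHIFT)

/* Rewrite only the state field; szind, is_head and slab are preserved. */
static unsigned
toy_metadata_state_update(unsigned bits, unsigned state) {
	bits &= ~TOY_STATE_MASK;
	bits |= state << TOY_STATE_SHIFT;
	return bits;
}

int
main(void) {
	/* szind 38, state 1, is_head and slab bits set (toy values). */
	unsigned bits = (38u << 6) | (1u << TOY_STATE_SHIFT) | 0x3u;
	bits = toy_metadata_state_update(bits, 5);	/* e.g. switch to merging */
	assert(((bits & TOY_STATE_MASK) >> TOY_STATE_SHIFT) == 5);
	assert((bits >> 6) == 38 && (bits & 0x3u) == 0x3u);	/* other fields intact */
	return 0;
}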
View File

@ -3,15 +3,6 @@
#include "jemalloc/internal/emap.h"
/*
* Note: Ends without at semicolon, so that
* EMAP_DECLARE_RTREE_CTX;
* in uses will avoid empty-statement warnings.
*/
#define EMAP_DECLARE_RTREE_CTX \
rtree_ctx_t rtree_ctx_fallback; \
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
enum emap_lock_result_e {
emap_lock_result_success,
emap_lock_result_failure,
@ -35,82 +26,186 @@ emap_init(emap_t *emap, base_t *base, bool zeroed) {
}
void
emap_lock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
assert(edata != NULL);
mutex_pool_lock(tsdn, &emap->mtx_pool, (uintptr_t)edata);
}
emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t state) {
witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE);
void
emap_unlock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
assert(edata != NULL);
mutex_pool_unlock(tsdn, &emap->mtx_pool, (uintptr_t)edata);
}
edata_state_set(edata, state);
void
emap_lock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1,
edata_t *edata2) {
assert(edata1 != NULL && edata2 != NULL);
mutex_pool_lock2(tsdn, &emap->mtx_pool, (uintptr_t)edata1,
(uintptr_t)edata2);
}
void
emap_unlock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1,
edata_t *edata2) {
assert(edata1 != NULL && edata2 != NULL);
mutex_pool_unlock2(tsdn, &emap->mtx_pool, (uintptr_t)edata1,
(uintptr_t)edata2);
}
static inline emap_lock_result_t
emap_try_lock_rtree_leaf_elm(tsdn_t *tsdn, emap_t *emap, rtree_leaf_elm_t *elm,
edata_t **result, bool inactive_only) {
edata_t *edata1 = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
/* dependent */ true).edata;
/* Slab implies active extents and should be skipped. */
if (edata1 == NULL || (inactive_only && rtree_leaf_elm_read(tsdn,
&emap->rtree, elm, /* dependent */ true).metadata.slab)) {
return emap_lock_result_no_extent;
}
/*
* It's possible that the extent changed out from under us, and with it
* the leaf->edata mapping. We have to recheck while holding the lock.
*/
emap_lock_edata(tsdn, emap, edata1);
edata_t *edata2 = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
/* dependent */ true).edata;
if (edata1 == edata2) {
*result = edata1;
return emap_lock_result_success;
} else {
emap_unlock_edata(tsdn, emap, edata1);
return emap_lock_result_failure;
}
}
/*
* Returns a pool-locked edata_t * if there's one associated with the given
* address, and NULL otherwise.
*/
edata_t *
emap_lock_edata_from_addr(tsdn_t *tsdn, emap_t *emap, void *addr,
bool inactive_only) {
EMAP_DECLARE_RTREE_CTX;
edata_t *ret = NULL;
rtree_leaf_elm_t *elm1 = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true,
/* init_missing */ false);
assert(elm1 != NULL);
rtree_leaf_elm_t *elm2 = edata_size_get(edata) == PAGE ? NULL :
rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)edata_last_get(edata), /* dependent */ true,
/* init_missing */ false);
rtree_leaf_elm_state_update(tsdn, &emap->rtree, elm1, elm2, state);
emap_assert_mapped(tsdn, emap, edata);
}
static inline bool
edata_neighbor_head_state_mergeable(bool edata_is_head,
bool neighbor_is_head, bool forward) {
/*
* Head state checking: disallow merging if the higher-addr extent is a
* head extent. This helps preserve first-fit, and more importantly
* ensures that extents are never merged across arenas.
*/
if (forward) {
if (neighbor_is_head) {
return false;
}
} else {
if (edata_is_head) {
return false;
}
}
return true;
}
static inline bool
edata_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
extent_pai_t pai, extent_state_t expected_state, bool forward) {
edata_t *neighbor = contents.edata;
if (neighbor == NULL) {
return false;
}
/* It's not safe to access *neighbor yet; must verify states first. */
bool neighbor_is_head = contents.metadata.is_head;
if (!edata_neighbor_head_state_mergeable(edata_is_head_get(edata),
neighbor_is_head, forward)) {
return false;
}
extent_state_t neighbor_state = contents.metadata.state;
if (pai == EXTENT_PAI_PAC) {
if (neighbor_state != expected_state) {
return false;
}
/* From this point, it's safe to access *neighbor. */
if (edata_committed_get(edata) !=
edata_committed_get(neighbor)) {
/*
* Some platforms (e.g. Windows) require an explicit
* commit step (and writing to uncommitted memory is not
* allowed).
*/
return false;
}
} else {
if (neighbor_state == extent_state_active) {
return false;
}
/* From this point, it's safe to access *neighbor. */
}
assert(edata_pai_get(edata) == pai);
if (edata_pai_get(neighbor) != pai) {
return false;
}
if (opt_retain) {
assert(edata_arena_ind_get(edata) ==
edata_arena_ind_get(neighbor));
} else {
/*
* This isn't entirely safe in the presence of arena_reset /
* destroy, in which case the neighbor edata can be destroyed if
* it belongs to a manual arena. More on that later.
*/
if (edata_arena_ind_get(edata) !=
edata_arena_ind_get(neighbor)) {
return false;
}
}
return true;
}
/* Will be removed in the next commit. */
edata_t *
emap_try_acquire_edata(tsdn_t *tsdn, emap_t *emap, void *addr,
extent_state_t expected_state, bool allow_head_extent) {
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)addr, false, false);
if (elm == NULL) {
return NULL;
}
emap_lock_result_t lock_result;
do {
lock_result = emap_try_lock_rtree_leaf_elm(tsdn, emap, elm,
&ret, inactive_only);
} while (lock_result == emap_lock_result_failure);
return ret;
rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
/* dependent */ true);
if (!allow_head_extent && contents.metadata.is_head) {
/* !allow_head_extent indicates the expanding path. */
return NULL;
}
edata_t *edata = contents.edata;
if (edata == NULL || contents.metadata.state != expected_state) {
return NULL;
}
assert(edata_state_get(edata) == expected_state);
emap_update_edata_state(tsdn, emap, edata, extent_state_updating);
return edata;
}
void
emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t new_state) {
assert(emap_edata_in_transition(tsdn, emap, edata));
assert(emap_edata_is_acquired(tsdn, emap, edata));
emap_update_edata_state(tsdn, emap, edata, new_state);
}
edata_t *
emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_pai_t pai, extent_state_t expected_state, bool forward) {
witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE);
assert(!edata_state_in_transition(expected_state));
assert(expected_state == extent_state_dirty ||
expected_state == extent_state_muzzy ||
expected_state == extent_state_retained);
void *neighbor_addr = forward ? edata_past_get(edata) :
edata_before_get(edata);
/*
* This is subtle; the rtree code asserts that its input pointer is
* non-NULL, and this is a useful thing to check. But it's possible
* that edata corresponds to an address of (void *)PAGE (in practice,
* this has only been observed on FreeBSD when address-space
* randomization is on, but it could in principle happen anywhere). In
* this case, edata_before_get(edata) is NULL, triggering the assert.
*/
if (neighbor_addr == NULL) {
return NULL;
}
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)neighbor_addr, /* dependent */ false,
/* init_missing */ false);
if (elm == NULL) {
return NULL;
}
rtree_contents_t neighbor_contents = rtree_leaf_elm_read(tsdn,
&emap->rtree, elm, /* dependent */ true);
if (!edata_can_acquire_neighbor(edata, neighbor_contents, pai,
expected_state, forward)) {
return NULL;
}
/* From this point, the neighbor edata can be safely acquired. */
edata_t *neighbor = neighbor_contents.edata;
emap_update_edata_state(tsdn, emap, neighbor, extent_state_merging);
extent_assert_can_coalesce(edata, neighbor);
return neighbor;
}
static bool
@ -153,6 +248,7 @@ emap_rtree_write_acquired(tsdn_t *tsdn, emap_t *emap, rtree_leaf_elm_t *elm_a,
bool
emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
szind_t szind, bool slab) {
assert(edata_state_get(edata) == extent_state_active);
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm_a, *elm_b;
@ -161,6 +257,10 @@ emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
if (err) {
return true;
}
assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_a,
/* dependent */ false).edata == NULL);
assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_b,
/* dependent */ false).edata == NULL);
emap_rtree_write_acquired(tsdn, emap, elm_a, elm_b, edata, szind, slab);
return false;
}
@ -190,6 +290,15 @@ emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
void
emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
/*
* The edata must be either in an acquired state, or protected by
* state-based locks.
*/
if (!emap_edata_is_acquired(tsdn, emap, edata)) {
witness_assert_positive_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE);
}
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm_a, *elm_b;

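The head-state rule in edata_neighbor_head_state_mergeable() above is deliberately asymmetric, which is easy to get backwards, so here it is spelled out as assertions that would hold (forward means the neighbor sits at the higher address): merging is refused exactly when the higher-address side of the pair is a head extent.

/* Forward merge: the neighbor is the higher-address extent. */
assert(edata_neighbor_head_state_mergeable(/* edata_is_head */ false,
    /* neighbor_is_head */ false, /* forward */ true));
assert(!edata_neighbor_head_state_mergeable(false, /* head neighbor */ true,
    true));
/* Backward merge: edata itself is the higher-address extent. */
assert(!edata_neighbor_head_state_mergeable(/* head edata */ true, false,
    false));
assert(edata_neighbor_head_state_mergeable(false, /* head lead is fine */ true,
    false));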
View File

@ -78,7 +78,8 @@ eset_insert(eset_t *eset, edata_t *edata) {
void
eset_remove(eset_t *eset, edata_t *edata) {
assert(edata_state_get(edata) == eset->state);
assert(edata_state_get(edata) == eset->state ||
edata_state_in_transition(edata_state_get(edata)));
size_t size = edata_size_get(edata);
size_t psz = sz_psz_quantize_floor(size);

View File

@ -64,12 +64,12 @@ extent_may_force_decay(pac_t *pac) {
static bool
extent_try_delayed_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, edata_t *edata) {
emap_edata_state_update(tsdn, pac->emap, edata, extent_state_active);
emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active);
bool coalesced;
edata = extent_try_coalesce(tsdn, pac, ehooks, ecache,
edata, &coalesced, false);
emap_edata_state_update(tsdn, pac->emap, edata, ecache->state);
emap_update_edata_state(tsdn, pac->emap, edata, ecache->state);
if (!coalesced) {
return true;
@ -183,7 +183,7 @@ ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
not_reached();
case extent_state_dirty:
case extent_state_muzzy:
emap_edata_state_update(tsdn, pac->emap, edata,
emap_update_edata_state(tsdn, pac->emap, edata,
extent_state_active);
break;
case extent_state_retained:
@ -230,7 +230,7 @@ extent_deactivate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache,
assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache));
assert(edata_state_get(edata) == extent_state_active);
emap_edata_state_update(tsdn, pac->emap, edata, ecache->state);
emap_update_edata_state(tsdn, pac->emap, edata, ecache->state);
eset_insert(&ecache->eset, edata);
}
@ -245,10 +245,11 @@ static void
extent_activate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache,
edata_t *edata) {
assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache));
assert(edata_state_get(edata) == ecache->state);
assert(edata_state_get(edata) == ecache->state ||
edata_state_get(edata) == extent_state_updating);
eset_remove(&ecache->eset, edata);
emap_edata_state_update(tsdn, pac->emap, edata, extent_state_active);
emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active);
}
static void
@ -290,20 +291,16 @@ extent_gdump_sub(tsdn_t *tsdn, const edata_t *edata) {
static bool
extent_register_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, bool gdump_add) {
assert(edata_state_get(edata) == extent_state_active);
/*
* We need to hold the lock to protect against a concurrent coalesce
* operation that sees us in a partial state.
* No locking needed, as the edata must be in the active state, which
* prevents other threads from accessing it.
*/
emap_lock_edata(tsdn, pac->emap, edata);
if (emap_register_boundary(tsdn, pac->emap, edata, SC_NSIZES,
/* slab */ false)) {
emap_unlock_edata(tsdn, pac->emap, edata);
return true;
}
emap_unlock_edata(tsdn, pac->emap, edata);
if (config_prof && gdump_add) {
extent_gdump_add(tsdn, edata);
}
@ -333,9 +330,7 @@ extent_reregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata) {
static void
extent_deregister_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata,
bool gdump) {
emap_lock_edata(tsdn, pac->emap, edata);
emap_deregister_boundary(tsdn, pac->emap, edata);
emap_unlock_edata(tsdn, pac->emap, edata);
if (config_prof && gdump) {
extent_gdump_sub(tsdn, edata);
@ -383,22 +378,18 @@ extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
malloc_mutex_lock(tsdn, &ecache->mtx);
edata_t *edata;
if (new_addr != NULL) {
edata = emap_lock_edata_from_addr(tsdn, pac->emap, new_addr,
false);
edata = emap_try_acquire_edata(tsdn, pac->emap, new_addr,
ecache->state, /* allow_head_extent */ false);
if (edata != NULL) {
/*
* We might null-out edata to report an error, but we
* still need to unlock the associated mutex after.
*/
edata_t *unlock_edata = edata;
assert(edata_base_get(edata) == new_addr);
if (edata_arena_ind_get(edata) != ecache_ind_get(ecache)
|| edata_size_get(edata) < size
|| edata_state_get(edata)
!= ecache->state) {
assert(edata_arena_ind_get(edata) ==
ecache_ind_get(ecache));
assert(edata_state_get(edata) == extent_state_updating);
if (edata_size_get(edata) < size) {
emap_release_edata(tsdn, pac->emap, edata,
ecache->state);
edata = NULL;
}
emap_unlock_edata(tsdn, pac->emap, unlock_edata);
}
} else {
/*
@ -557,8 +548,8 @@ extent_recycle_split(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
extent_deregister_no_gdump_sub(tsdn, pac, to_leak);
extents_abandon_vm(tsdn, pac, ehooks, ecache, to_leak,
growing_retained);
assert(emap_lock_edata_from_addr(tsdn, pac->emap,
leak, false) == NULL);
assert(emap_try_acquire_edata(tsdn, pac->emap,
leak, ecache->state, true) == NULL);
}
return NULL;
}
@ -806,42 +797,11 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
return edata;
}
static bool
extent_can_coalesce(ecache_t *ecache, const edata_t *inner,
const edata_t *outer) {
assert(edata_arena_ind_get(inner) == ecache_ind_get(ecache));
if (edata_arena_ind_get(inner) != edata_arena_ind_get(outer)) {
return false;
}
/*
* We wouldn't really get into this situation because one or the other
* edata would have to have a head bit set to true, but this is
* conceptually correct and cheap.
*/
if (edata_pai_get(inner) != edata_pai_get(outer)) {
return false;
}
assert(edata_state_get(inner) == extent_state_active);
if (edata_state_get(outer) != ecache->state) {
return false;
}
if (edata_committed_get(inner) != edata_committed_get(outer)) {
return false;
}
return true;
}
static bool
extent_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
edata_t *inner, edata_t *outer, bool forward, bool growing_retained) {
assert(extent_can_coalesce(ecache, inner, outer));
extent_activate_locked(tsdn, pac, ecache, outer);
extent_assert_can_coalesce(inner, outer);
eset_remove(&ecache->eset, outer);
malloc_mutex_unlock(tsdn, &ecache->mtx);
bool err = extent_merge_impl(tsdn, pac, ehooks,
@ -873,22 +833,11 @@ extent_try_coalesce_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
again = false;
/* Try to coalesce forward. */
edata_t *next = emap_lock_edata_from_addr(tsdn, pac->emap,
edata_past_get(edata), inactive_only);
edata_t *next = emap_try_acquire_edata_neighbor(tsdn, pac->emap,
edata, EXTENT_PAI_PAC, ecache->state, /* forward */ true);
if (next != NULL) {
/*
* ecache->mtx only protects against races for
* like-state extents, so call extent_can_coalesce()
* before releasing next's pool lock.
*/
bool can_coalesce = extent_can_coalesce(ecache,
edata, next);
emap_unlock_edata(tsdn, pac->emap, next);
if (can_coalesce && !extent_coalesce(tsdn, pac,
ehooks, ecache, edata, next, true,
growing_retained)) {
if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata,
next, true, growing_retained)) {
if (ecache->delay_coalesce) {
/* Do minimal coalescing. */
*coalesced = true;
@ -899,30 +848,11 @@ extent_try_coalesce_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
}
/* Try to coalesce backward. */
edata_t *prev = NULL;
if (edata_before_get(edata) != NULL) {
/*
* This is subtle; the rtree code asserts that its input
* pointer is non-NULL, and this is a useful thing to
* check. But it's possible that edata corresponds to
* an address of (void *)PAGE (in practice, this has
* only been observed on FreeBSD when address-space
* randomization is on, but it could in principle happen
* anywhere). In this case, edata_before_get(edata) is
* NULL, triggering the assert.
*/
prev = emap_lock_edata_from_addr(tsdn, pac->emap,
edata_before_get(edata), inactive_only);
}
edata_t *prev = emap_try_acquire_edata_neighbor(tsdn, pac->emap,
edata, EXTENT_PAI_PAC, ecache->state, /* forward */ false);
if (prev != NULL) {
bool can_coalesce = extent_can_coalesce(ecache, edata,
prev);
emap_unlock_edata(tsdn, pac->emap, prev);
if (can_coalesce && !extent_coalesce(tsdn, pac,
ehooks, ecache, edata, prev, false,
growing_retained)) {
if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata,
prev, false, growing_retained)) {
edata = prev;
if (ecache->delay_coalesce) {
/* Do minimal coalescing. */
@ -1218,24 +1148,27 @@ extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
goto label_error_b;
}
emap_lock_edata2(tsdn, pac->emap, edata, trail);
/*
* No need to acquire trail or edata, because: 1) trail was new (just
* allocated); and 2) edata is either an active allocation (the shrink
* path), or in an acquired state (extracted from the ecache on the
* extent_recycle_split path).
*/
assert(emap_edata_is_acquired(tsdn, pac->emap, edata));
assert(emap_edata_is_acquired(tsdn, pac->emap, trail));
err = ehooks_split(tsdn, ehooks, edata_base_get(edata), size_a + size_b,
size_a, size_b, edata_committed_get(edata));
if (err) {
goto label_error_c;
goto label_error_b;
}
edata_size_set(edata, size_a);
emap_split_commit(tsdn, pac->emap, &prepare, edata, size_a, trail,
size_b);
emap_unlock_edata2(tsdn, pac->emap, edata, trail);
return trail;
label_error_c:
emap_unlock_edata2(tsdn, pac->emap, edata, trail);
label_error_b:
edata_cache_put(tsdn, pac->edata_cache, trail);
label_error_a:
@ -1277,15 +1210,15 @@ extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a,
emap_prepare_t prepare;
emap_merge_prepare(tsdn, pac->emap, &prepare, a, b);
emap_lock_edata2(tsdn, pac->emap, a, b);
assert(edata_state_get(a) == extent_state_active ||
edata_state_get(a) == extent_state_merging);
edata_state_set(a, extent_state_active);
edata_size_set(a, edata_size_get(a) + edata_size_get(b));
edata_sn_set(a, (edata_sn_get(a) < edata_sn_get(b)) ?
edata_sn_get(a) : edata_sn_get(b));
edata_zeroed_set(a, edata_zeroed_get(a) && edata_zeroed_get(b));
emap_merge_commit(tsdn, pac->emap, &prepare, a, b);
emap_unlock_edata2(tsdn, pac->emap, a, b);
edata_cache_put(tsdn, pac->edata_cache, b);

View File

@ -33,15 +33,16 @@ hpa_central_split(tsdn_t *tsdn, hpa_central_t *central, edata_t *edata,
emap_prepare_t prepare;
bool err = emap_split_prepare(tsdn, central->emap, &prepare, edata,
size, trail, cursize - size);
assert(edata_state_get(edata) == edata_state_get(trail));
if (err) {
edata_cache_small_put(tsdn, &central->ecs, trail);
return NULL;
}
emap_lock_edata2(tsdn, central->emap, edata, trail);
assert(edata_state_get(edata) == edata_state_get(trail));
edata_size_set(edata, size);
emap_split_commit(tsdn, central->emap, &prepare, edata, size, trail,
cursize - size);
emap_unlock_edata2(tsdn, central->emap, edata, trail);
return trail;
}
@ -91,7 +92,7 @@ label_success:
*/
assert(edata_state_get(edata) == extent_state_dirty);
assert(edata_base_get(edata) == edata_addr_get(edata));
emap_edata_state_update(tsdn, central->emap, edata,
emap_update_edata_state(tsdn, central->emap, edata,
extent_state_active);
return edata;
}
@ -137,43 +138,22 @@ hpa_central_alloc_grow(tsdn_t *tsdn, hpa_central_t *central,
edata_sn_set(edata, sn);
edata_sn_set(trail, sn);
emap_edata_state_update(tsdn, central->emap, trail, extent_state_dirty);
emap_update_edata_state(tsdn, central->emap, trail, extent_state_dirty);
eset_insert(&central->eset, trail);
return false;
}
static edata_t *
hpa_central_dalloc_get_merge_candidate(tsdn_t *tsdn, hpa_central_t *central,
void *addr) {
edata_t *edata = emap_lock_edata_from_addr(tsdn, central->emap, addr,
/* inactive_only */ true);
if (edata == NULL) {
return NULL;
}
extent_pai_t pai = edata_pai_get(edata);
extent_state_t state = edata_state_get(edata);
emap_unlock_edata(tsdn, central->emap, edata);
if (pai != EXTENT_PAI_HPA) {
return NULL;
}
if (state == extent_state_active) {
return NULL;
}
return edata;
}
/* Merges b into a, freeing b back to the edata cache. */
static void
hpa_central_dalloc_merge(tsdn_t *tsdn, hpa_central_t *central, edata_t *a,
edata_t *b) {
assert(emap_edata_is_acquired(tsdn, central->emap, a));
assert(emap_edata_is_acquired(tsdn, central->emap, b));
emap_prepare_t prepare;
emap_merge_prepare(tsdn, central->emap, &prepare, a, b);
emap_lock_edata2(tsdn, central->emap, a, b);
edata_size_set(a, edata_size_get(a) + edata_size_get(b));
emap_merge_commit(tsdn, central->emap, &prepare, a, b);
emap_unlock_edata2(tsdn, central->emap, a, b);
edata_cache_small_put(tsdn, &central->ecs, b);
}
@ -189,21 +169,24 @@ hpa_central_dalloc(tsdn_t *tsdn, hpa_central_t *central, edata_t *edata) {
edata_addr_set(edata, edata_base_get(edata));
edata_zeroed_set(edata, false);
if (!edata_is_head_get(edata)) {
edata_t *lead = hpa_central_dalloc_get_merge_candidate(tsdn,
central, edata_before_get(edata));
/*
* Merge forward first, so that the original *edata stays in the active
* state for the second acquire (only necessary for sanity checking).
*/
edata_t *trail = emap_try_acquire_edata_neighbor(tsdn, central->emap,
edata, EXTENT_PAI_HPA, extent_state_dirty, /* forward */ true);
if (trail != NULL) {
eset_remove(&central->eset, trail);
hpa_central_dalloc_merge(tsdn, central, edata, trail);
}
edata_t *lead = emap_try_acquire_edata_neighbor(tsdn, central->emap,
edata, EXTENT_PAI_HPA, extent_state_dirty, /* forward */ false);
if (lead != NULL) {
eset_remove(&central->eset, lead);
hpa_central_dalloc_merge(tsdn, central, lead, edata);
edata = lead;
}
}
edata_t *trail = hpa_central_dalloc_get_merge_candidate(tsdn, central,
edata_past_get(edata));
if (trail != NULL && !edata_is_head_get(trail)) {
eset_remove(&central->eset, trail);
hpa_central_dalloc_merge(tsdn, central, edata, trail);
}
emap_edata_state_update(tsdn, central->emap, edata, extent_state_dirty);
emap_update_edata_state(tsdn, central->emap, edata, extent_state_dirty);
eset_insert(&central->eset, edata);
}