From 1784939688b86e459ecb39615e463176dd609685 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Thu, 4 Mar 2021 14:33:40 -0800
Subject: [PATCH] Use rtree tracked states to protect edata outside of ecache
 locks.

This avoids the addr-based mutexes (i.e. the mutex_pool), and instead relies
on the metadata tracked in the rtree leaf: the head state and the extent
state.  Before trying to access the neighbor edata (e.g. for coalescing), the
states will be verified first -- only neighbor edatas from the same arena and
with the same state will be accessed.
---
 include/jemalloc/internal/edata.h |  11 +-
 include/jemalloc/internal/emap.h  | 119 ++++++++------
 include/jemalloc/internal/rtree.h |  31 +++-
 src/emap.c                        | 263 +++++++++++++++++++++---------
 src/eset.c                        |   3 +-
 src/extent.c                      | 151 +++++-------------
 src/hpa_central.c                 |  65 +++-----
 7 files changed, 366 insertions(+), 277 deletions(-)

diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h
index 648b478e..b2e6ee9f 100644
--- a/include/jemalloc/internal/edata.h
+++ b/include/jemalloc/internal/edata.h
@@ -23,7 +23,11 @@ enum extent_state_e {
 	extent_state_active = 0,
 	extent_state_dirty = 1,
 	extent_state_muzzy = 2,
-	extent_state_retained = 3
+	extent_state_retained = 3,
+	extent_state_transition = 4, /* States below are intermediate. */
+	extent_state_updating = 4,
+	extent_state_merging = 5,
+	extent_state_max = 5 /* Sanity checking only. */
 };
 typedef enum extent_state_e extent_state_t;
 
@@ -550,6 +554,11 @@ edata_is_head_set(edata_t *edata, bool is_head) {
 	    ((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT);
 }
 
+static inline bool
+edata_state_in_transition(extent_state_t state) {
+	return state >= extent_state_transition;
+}
+
 /*
  * Because this function is implemented as a sequence of bitfield modifications,
  * even though each individual bit is properly initialized, we technically read
diff --git a/include/jemalloc/internal/emap.h b/include/jemalloc/internal/emap.h
index afb4983c..239f3e43 100644
--- a/include/jemalloc/internal/emap.h
+++ b/include/jemalloc/internal/emap.h
@@ -5,6 +5,15 @@
 #include "jemalloc/internal/mutex_pool.h"
 #include "jemalloc/internal/rtree.h"
 
+/*
+ * Note: Ends without a semicolon, so that
+ *     EMAP_DECLARE_RTREE_CTX;
+ * in uses will avoid empty-statement warnings.
+ */
+#define EMAP_DECLARE_RTREE_CTX						\
+	rtree_ctx_t rtree_ctx_fallback;					\
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
+
 typedef struct emap_s emap_t;
 struct emap_s {
 	rtree_t rtree;
@@ -31,20 +40,16 @@ bool emap_init(emap_t *emap, base_t *base, bool zeroed);
 void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind,
     bool slab);
 
-/*
- * Grab the lock or locks associated with the edata or edatas indicated (which
- * is done just by simple address hashing).  The hashing strategy means that
- * it's never safe to grab locks incrementally -- you have to grab all the locks
- * you'll need at once, and release them all at once.
- */ -void emap_lock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata); -void emap_unlock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata); -void emap_lock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1, - edata_t *edata2); -void emap_unlock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1, - edata_t *edata2); -edata_t *emap_lock_edata_from_addr(tsdn_t *tsdn, emap_t *emap, void *addr, - bool inactive_only); +void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t state); + +edata_t *emap_try_acquire_edata(tsdn_t *tsdn, emap_t *emap, void *addr, + extent_state_t expected_state, bool allow_head_extent); +edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state, + bool forward); +void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t new_state); /* * Associate the given edata with its beginning and end address, setting the @@ -136,43 +141,66 @@ emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { } } -static inline void -emap_update_rtree_at_addr(tsdn_t *tsdn, rtree_t *rtree, edata_t *expected_edata, - uintptr_t addr, extent_state_t state) { - witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), - WITNESS_RANK_CORE); +JEMALLOC_ALWAYS_INLINE bool +emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + assert(config_debug); + emap_assert_mapped(tsdn, emap, edata); - rtree_ctx_t rtree_ctx_fallback; - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + EMAP_DECLARE_RTREE_CTX; + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata)); - rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, - addr, /* dependent */ true, /* init_missing */ false); - assert(elm != NULL); - rtree_contents_t contents = rtree_leaf_elm_read(tsdn, rtree, elm, - /* dependent */ true); - assert(contents.edata == expected_edata); - contents.metadata.state = state; - rtree_leaf_elm_write(tsdn, rtree, elm, contents); + return edata_state_in_transition(contents.metadata.state); } -static inline void -emap_edata_state_update(tsdn_t *tsdn, emap_t *emap, edata_t *edata, - extent_state_t state) { - /* Only emap is allowed to modify the edata internal state. */ - edata_state_set(edata, state); +JEMALLOC_ALWAYS_INLINE bool +emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + if (!config_debug) { + /* For assertions only. */ + return false; + } - emap_update_rtree_at_addr(tsdn, &emap->rtree, edata, - (uintptr_t)edata_base_get(edata), state); - emap_update_rtree_at_addr(tsdn, &emap->rtree, edata, - (uintptr_t)edata_last_get(edata), state); + /* + * The edata is considered acquired if no other threads will attempt to + * read / write any fields from it. This includes a few cases: + * + * 1) edata not hooked into emap yet -- This implies the edata just got + * allocated or initialized. + * + * 2) in an active or transition state -- In both cases, the edata can + * be discovered from the emap, however the state tracked in the rtree + * will prevent other threads from accessing the actual edata. 
+ */ + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true, + /* init_missing */ false); + if (elm == NULL) { + return true; + } + rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm, + /* dependent */ true); + if (contents.edata == NULL || + contents.metadata.state == extent_state_active || + edata_state_in_transition(contents.metadata.state)) { + return true; + } - emap_assert_mapped(tsdn, emap, edata); + return false; +} + +JEMALLOC_ALWAYS_INLINE void +extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) { + assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer)); + assert(edata_pai_get(inner) == edata_pai_get(outer)); + assert(edata_committed_get(inner) == edata_committed_get(outer)); + assert(edata_state_get(inner) == extent_state_active); + assert(edata_state_get(outer) == extent_state_merging); } JEMALLOC_ALWAYS_INLINE edata_t * emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) { - rtree_ctx_t rtree_ctx_fallback; - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + EMAP_DECLARE_RTREE_CTX; return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata; } @@ -181,8 +209,7 @@ emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) { JEMALLOC_ALWAYS_INLINE void emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, emap_alloc_ctx_t *alloc_ctx) { - rtree_ctx_t rtree_ctx_fallback; - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + EMAP_DECLARE_RTREE_CTX; rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr); @@ -194,8 +221,7 @@ emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, JEMALLOC_ALWAYS_INLINE void emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, emap_full_alloc_ctx_t *full_alloc_ctx) { - rtree_ctx_t rtree_ctx_fallback; - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + EMAP_DECLARE_RTREE_CTX; rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr); @@ -212,8 +238,7 @@ emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, JEMALLOC_ALWAYS_INLINE bool emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, emap_full_alloc_ctx_t *full_alloc_ctx) { - rtree_ctx_t rtree_ctx_fallback; - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + EMAP_DECLARE_RTREE_CTX; rtree_contents_t contents; bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx, diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index 89c08cb0..42aa11c9 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -81,7 +81,7 @@ struct rtree_leaf_elm_s { #else atomic_p_t le_edata; /* (edata_t *) */ /* - * From low to high bits: slab, is_head, state. 
+ * From high to low bits: szind (8 bits), state (4 bits), is_head, slab */ atomic_u_t le_metadata; #endif @@ -187,6 +187,7 @@ rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, JEMALLOC_ALWAYS_INLINE uintptr_t rtree_leaf_elm_bits_encode(rtree_contents_t contents) { + assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0); uintptr_t edata_bits = (uintptr_t)contents.edata & (((uintptr_t)1 << LG_VADDR) - 1); @@ -212,6 +213,7 @@ rtree_leaf_elm_bits_decode(uintptr_t bits) { uintptr_t state_bits = (bits & RTREE_LEAF_STATE_MASK) >> RTREE_LEAF_STATE_SHIFT; + assert(state_bits <= extent_state_max); contents.metadata.state = (extent_state_t)state_bits; uintptr_t low_bit_mask = ~((uintptr_t)EDATA_ALIGNMENT - 1); @@ -229,6 +231,7 @@ rtree_leaf_elm_bits_decode(uintptr_t bits) { contents.edata = (edata_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) >> RTREE_NHIB) & low_bit_mask); # endif + assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0); return contents; } @@ -250,6 +253,7 @@ rtree_leaf_elm_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, uintptr_t state_bits = (metadata_bits & RTREE_LEAF_STATE_MASK) >> RTREE_LEAF_STATE_SHIFT; + assert(state_bits <= extent_state_max); contents.metadata.state = (extent_state_t)state_bits; contents.metadata.szind = metadata_bits >> (RTREE_LEAF_STATE_SHIFT + RTREE_LEAF_STATE_WIDTH); @@ -283,6 +287,31 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, #endif } +/* The state field can be updated independently (and more frequently). */ +static inline void +rtree_leaf_elm_state_update(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm1, rtree_leaf_elm_t *elm2, extent_state_t state) { + assert(elm1 != NULL); +#ifdef RTREE_LEAF_COMPACT + uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm1, + /* dependent */ true); + bits &= ~RTREE_LEAF_STATE_MASK; + bits |= state << RTREE_LEAF_STATE_SHIFT; + atomic_store_p(&elm1->le_bits, (void *)bits, ATOMIC_RELEASE); + if (elm2 != NULL) { + atomic_store_p(&elm2->le_bits, (void *)bits, ATOMIC_RELEASE); + } +#else + unsigned bits = atomic_load_u(&elm1->le_metadata, ATOMIC_RELAXED); + bits &= ~RTREE_LEAF_STATE_MASK; + bits |= state << RTREE_LEAF_STATE_SHIFT; + atomic_store_u(&elm1->le_metadata, bits, ATOMIC_RELEASE); + if (elm2 != NULL) { + atomic_store_u(&elm2->le_metadata, bits, ATOMIC_RELEASE); + } +#endif +} + /* * Tries to look up the key in the L1 cache, returning it if there's a hit, or * NULL if there's a miss. diff --git a/src/emap.c b/src/emap.c index 4f3915b5..26a079cb 100644 --- a/src/emap.c +++ b/src/emap.c @@ -3,15 +3,6 @@ #include "jemalloc/internal/emap.h" -/* - * Note: Ends without at semicolon, so that - * EMAP_DECLARE_RTREE_CTX; - * in uses will avoid empty-statement warnings. 
- */ -#define EMAP_DECLARE_RTREE_CTX \ - rtree_ctx_t rtree_ctx_fallback; \ - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback) - enum emap_lock_result_e { emap_lock_result_success, emap_lock_result_failure, @@ -35,82 +26,186 @@ emap_init(emap_t *emap, base_t *base, bool zeroed) { } void -emap_lock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { - assert(edata != NULL); - mutex_pool_lock(tsdn, &emap->mtx_pool, (uintptr_t)edata); -} +emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t state) { + witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE); -void -emap_unlock_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { - assert(edata != NULL); - mutex_pool_unlock(tsdn, &emap->mtx_pool, (uintptr_t)edata); -} + edata_state_set(edata, state); -void -emap_lock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1, - edata_t *edata2) { - assert(edata1 != NULL && edata2 != NULL); - mutex_pool_lock2(tsdn, &emap->mtx_pool, (uintptr_t)edata1, - (uintptr_t)edata2); -} - -void -emap_unlock_edata2(tsdn_t *tsdn, emap_t *emap, edata_t *edata1, - edata_t *edata2) { - assert(edata1 != NULL && edata2 != NULL); - mutex_pool_unlock2(tsdn, &emap->mtx_pool, (uintptr_t)edata1, - (uintptr_t)edata2); -} - -static inline emap_lock_result_t -emap_try_lock_rtree_leaf_elm(tsdn_t *tsdn, emap_t *emap, rtree_leaf_elm_t *elm, - edata_t **result, bool inactive_only) { - edata_t *edata1 = rtree_leaf_elm_read(tsdn, &emap->rtree, elm, - /* dependent */ true).edata; - - /* Slab implies active extents and should be skipped. */ - if (edata1 == NULL || (inactive_only && rtree_leaf_elm_read(tsdn, - &emap->rtree, elm, /* dependent */ true).metadata.slab)) { - return emap_lock_result_no_extent; - } - - /* - * It's possible that the extent changed out from under us, and with it - * the leaf->edata mapping. We have to recheck while holding the lock. - */ - emap_lock_edata(tsdn, emap, edata1); - edata_t *edata2 = rtree_leaf_elm_read(tsdn, &emap->rtree, elm, - /* dependent */ true).edata; - - if (edata1 == edata2) { - *result = edata1; - return emap_lock_result_success; - } else { - emap_unlock_edata(tsdn, emap, edata1); - return emap_lock_result_failure; - } -} - -/* - * Returns a pool-locked edata_t * if there's one associated with the given - * address, and NULL otherwise. - */ -edata_t * -emap_lock_edata_from_addr(tsdn_t *tsdn, emap_t *emap, void *addr, - bool inactive_only) { EMAP_DECLARE_RTREE_CTX; - edata_t *ret = NULL; + rtree_leaf_elm_t *elm1 = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true, + /* init_missing */ false); + assert(elm1 != NULL); + rtree_leaf_elm_t *elm2 = edata_size_get(edata) == PAGE ? NULL : + rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_last_get(edata), /* dependent */ true, + /* init_missing */ false); + + rtree_leaf_elm_state_update(tsdn, &emap->rtree, elm1, elm2, state); + + emap_assert_mapped(tsdn, emap, edata); +} + +static inline bool +edata_neighbor_head_state_mergeable(bool edata_is_head, + bool neighbor_is_head, bool forward) { + /* + * Head states checking: disallow merging if the higher addr extent is a + * head extent. This helps preserve first-fit, and more importantly + * makes sure no merge across arenas. 
+ */ + if (forward) { + if (neighbor_is_head) { + return false; + } + } else { + if (edata_is_head) { + return false; + } + } + return true; +} + +static inline bool +edata_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents, + extent_pai_t pai, extent_state_t expected_state, bool forward) { + edata_t *neighbor = contents.edata; + if (neighbor == NULL) { + return false; + } + /* It's not safe to access *neighbor yet; must verify states first. */ + bool neighbor_is_head = contents.metadata.is_head; + if (!edata_neighbor_head_state_mergeable(edata_is_head_get(edata), + neighbor_is_head, forward)) { + return NULL; + } + extent_state_t neighbor_state = contents.metadata.state; + if (pai == EXTENT_PAI_PAC) { + if (neighbor_state != expected_state) { + return false; + } + /* From this point, it's safe to access *neighbor. */ + if (edata_committed_get(edata) != + edata_committed_get(neighbor)) { + /* + * Some platforms (e.g. Windows) require an explicit + * commit step (and writing to uncomitted memory is not + * allowed). + */ + return false; + } + } else { + if (neighbor_state == extent_state_active) { + return false; + } + /* From this point, it's safe to access *neighbor. */ + } + + assert(edata_pai_get(edata) == pai); + if (edata_pai_get(neighbor) != pai) { + return false; + } + if (opt_retain) { + assert(edata_arena_ind_get(edata) == + edata_arena_ind_get(neighbor)); + } else { + /* + * This isn't entirely safe with the presence of arena_reset / + * destroy, in which case the neighbor edata can be destoryed if + * it belongs to a manual arena. More on that later. + */ + if (edata_arena_ind_get(edata) != + edata_arena_ind_get(neighbor)) { + return false; + } + } + + return true; +} + +/* Will be removed in the next commit. */ +edata_t * +emap_try_acquire_edata(tsdn_t *tsdn, emap_t *emap, void *addr, + extent_state_t expected_state, bool allow_head_extent) { + EMAP_DECLARE_RTREE_CTX; rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)addr, false, false); if (elm == NULL) { return NULL; } - emap_lock_result_t lock_result; - do { - lock_result = emap_try_lock_rtree_leaf_elm(tsdn, emap, elm, - &ret, inactive_only); - } while (lock_result == emap_lock_result_failure); - return ret; + rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm, + /* dependent */ true); + if (!allow_head_extent && contents.metadata.is_head) { + /* !allow_head_extent indicates the expanding path. */ + return NULL; + } + + edata_t *edata = contents.edata; + if (edata == NULL || contents.metadata.state != expected_state) { + return NULL; + } + assert(edata_state_get(edata) == expected_state); + emap_update_edata_state(tsdn, emap, edata, extent_state_updating); + + return edata; +} + +void +emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t new_state) { + assert(emap_edata_in_transition(tsdn, emap, edata)); + assert(emap_edata_is_acquired(tsdn, emap, edata)); + + emap_update_edata_state(tsdn, emap, edata, new_state); +} + +edata_t * +emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_pai_t pai, extent_state_t expected_state, bool forward) { + witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE); + assert(!edata_state_in_transition(expected_state)); + assert(expected_state == extent_state_dirty || + expected_state == extent_state_muzzy || + expected_state == extent_state_retained); + + void *neighbor_addr = forward ? 
edata_past_get(edata) : + edata_before_get(edata); + /* + * This is subtle; the rtree code asserts that its input pointer is + * non-NULL, and this is a useful thing to check. But it's possible + * that edata corresponds to an address of (void *)PAGE (in practice, + * this has only been observed on FreeBSD when address-space + * randomization is on, but it could in principle happen anywhere). In + * this case, edata_before_get(edata) is NULL, triggering the assert. + */ + if (neighbor_addr == NULL) { + return NULL; + } + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)neighbor_addr, /* dependent*/ false, + /* init_missing */ false); + if (elm == NULL) { + return NULL; + } + + rtree_contents_t neighbor_contents = rtree_leaf_elm_read(tsdn, + &emap->rtree, elm, /* dependent */ true); + if (!edata_can_acquire_neighbor(edata, neighbor_contents, pai, + expected_state, forward)) { + return NULL; + } + + /* From this point, the neighbor edata can be safely acquired. */ + edata_t *neighbor = neighbor_contents.edata; + emap_update_edata_state(tsdn, emap, neighbor, extent_state_merging); + extent_assert_can_coalesce(edata, neighbor); + + return neighbor; } static bool @@ -153,6 +248,7 @@ emap_rtree_write_acquired(tsdn_t *tsdn, emap_t *emap, rtree_leaf_elm_t *elm_a, bool emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, bool slab) { + assert(edata_state_get(edata) == extent_state_active); EMAP_DECLARE_RTREE_CTX; rtree_leaf_elm_t *elm_a, *elm_b; @@ -161,6 +257,10 @@ emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata, if (err) { return true; } + assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_a, + /* dependent */ false).edata == NULL); + assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_b, + /* dependent */ false).edata == NULL); emap_rtree_write_acquired(tsdn, emap, elm_a, elm_b, edata, szind, slab); return false; } @@ -190,6 +290,15 @@ emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata, void emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + /* + * The edata must be either in an acquired state, or protected by state + * based locks. 
+ */ + if (!emap_edata_is_acquired(tsdn, emap, edata)) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } + EMAP_DECLARE_RTREE_CTX; rtree_leaf_elm_t *elm_a, *elm_b; diff --git a/src/eset.c b/src/eset.c index a52a6f7c..9183ac67 100644 --- a/src/eset.c +++ b/src/eset.c @@ -78,7 +78,8 @@ eset_insert(eset_t *eset, edata_t *edata) { void eset_remove(eset_t *eset, edata_t *edata) { - assert(edata_state_get(edata) == eset->state); + assert(edata_state_get(edata) == eset->state || + edata_state_in_transition(edata_state_get(edata))); size_t size = edata_size_get(edata); size_t psz = sz_psz_quantize_floor(size); diff --git a/src/extent.c b/src/extent.c index 56ea33f6..e660d4c5 100644 --- a/src/extent.c +++ b/src/extent.c @@ -64,12 +64,12 @@ extent_may_force_decay(pac_t *pac) { static bool extent_try_delayed_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, edata_t *edata) { - emap_edata_state_update(tsdn, pac->emap, edata, extent_state_active); + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); bool coalesced; edata = extent_try_coalesce(tsdn, pac, ehooks, ecache, edata, &coalesced, false); - emap_edata_state_update(tsdn, pac->emap, edata, ecache->state); + emap_update_edata_state(tsdn, pac->emap, edata, ecache->state); if (!coalesced) { return true; @@ -183,7 +183,7 @@ ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, not_reached(); case extent_state_dirty: case extent_state_muzzy: - emap_edata_state_update(tsdn, pac->emap, edata, + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); break; case extent_state_retained: @@ -230,7 +230,7 @@ extent_deactivate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache)); assert(edata_state_get(edata) == extent_state_active); - emap_edata_state_update(tsdn, pac->emap, edata, ecache->state); + emap_update_edata_state(tsdn, pac->emap, edata, ecache->state); eset_insert(&ecache->eset, edata); } @@ -245,10 +245,11 @@ static void extent_activate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, edata_t *edata) { assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache)); - assert(edata_state_get(edata) == ecache->state); + assert(edata_state_get(edata) == ecache->state || + edata_state_get(edata) == extent_state_updating); eset_remove(&ecache->eset, edata); - emap_edata_state_update(tsdn, pac->emap, edata, extent_state_active); + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); } static void @@ -290,20 +291,16 @@ extent_gdump_sub(tsdn_t *tsdn, const edata_t *edata) { static bool extent_register_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, bool gdump_add) { + assert(edata_state_get(edata) == extent_state_active); /* - * We need to hold the lock to protect against a concurrent coalesce - * operation that sees us in a partial state. + * No locking needed, as the edata must be in active state, which + * prevents other threads from accessing the edata. 
*/ - emap_lock_edata(tsdn, pac->emap, edata); - if (emap_register_boundary(tsdn, pac->emap, edata, SC_NSIZES, /* slab */ false)) { - emap_unlock_edata(tsdn, pac->emap, edata); return true; } - emap_unlock_edata(tsdn, pac->emap, edata); - if (config_prof && gdump_add) { extent_gdump_add(tsdn, edata); } @@ -333,9 +330,7 @@ extent_reregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { static void extent_deregister_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, bool gdump) { - emap_lock_edata(tsdn, pac->emap, edata); emap_deregister_boundary(tsdn, pac->emap, edata); - emap_unlock_edata(tsdn, pac->emap, edata); if (config_prof && gdump) { extent_gdump_sub(tsdn, edata); @@ -383,22 +378,18 @@ extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, malloc_mutex_lock(tsdn, &ecache->mtx); edata_t *edata; if (new_addr != NULL) { - edata = emap_lock_edata_from_addr(tsdn, pac->emap, new_addr, - false); + edata = emap_try_acquire_edata(tsdn, pac->emap, new_addr, + ecache->state, /* allow_head_extent*/ false); if (edata != NULL) { - /* - * We might null-out edata to report an error, but we - * still need to unlock the associated mutex after. - */ - edata_t *unlock_edata = edata; assert(edata_base_get(edata) == new_addr); - if (edata_arena_ind_get(edata) != ecache_ind_get(ecache) - || edata_size_get(edata) < size - || edata_state_get(edata) - != ecache->state) { + assert(edata_arena_ind_get(edata) == + ecache_ind_get(ecache)); + assert(edata_state_get(edata) == extent_state_updating); + if (edata_size_get(edata) < size) { + emap_release_edata(tsdn, pac->emap, edata, + ecache->state); edata = NULL; } - emap_unlock_edata(tsdn, pac->emap, unlock_edata); } } else { /* @@ -557,8 +548,8 @@ extent_recycle_split(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, extent_deregister_no_gdump_sub(tsdn, pac, to_leak); extents_abandon_vm(tsdn, pac, ehooks, ecache, to_leak, growing_retained); - assert(emap_lock_edata_from_addr(tsdn, pac->emap, - leak, false) == NULL); + assert(emap_try_acquire_edata(tsdn, pac->emap, + leak, ecache->state, true) == NULL); } return NULL; } @@ -806,42 +797,11 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, return edata; } -static bool -extent_can_coalesce(ecache_t *ecache, const edata_t *inner, - const edata_t *outer) { - assert(edata_arena_ind_get(inner) == ecache_ind_get(ecache)); - - if (edata_arena_ind_get(inner) != edata_arena_ind_get(outer)) { - return false; - } - - /* - * We wouldn't really get into this situation because one or the other - * edata would have to have a head bit set to true, but this is - * conceptually correct and cheap. - */ - if (edata_pai_get(inner) != edata_pai_get(outer)) { - return false; - } - - assert(edata_state_get(inner) == extent_state_active); - if (edata_state_get(outer) != ecache->state) { - return false; - } - - if (edata_committed_get(inner) != edata_committed_get(outer)) { - return false; - } - - return true; -} - static bool extent_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, edata_t *inner, edata_t *outer, bool forward, bool growing_retained) { - assert(extent_can_coalesce(ecache, inner, outer)); - - extent_activate_locked(tsdn, pac, ecache, outer); + extent_assert_can_coalesce(inner, outer); + eset_remove(&ecache->eset, outer); malloc_mutex_unlock(tsdn, &ecache->mtx); bool err = extent_merge_impl(tsdn, pac, ehooks, @@ -873,22 +833,11 @@ extent_try_coalesce_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, again = false; /* Try to coalesce forward. 
*/ - edata_t *next = emap_lock_edata_from_addr(tsdn, pac->emap, - edata_past_get(edata), inactive_only); + edata_t *next = emap_try_acquire_edata_neighbor(tsdn, pac->emap, + edata, EXTENT_PAI_PAC, ecache->state, /* forward */ true); if (next != NULL) { - /* - * ecache->mtx only protects against races for - * like-state extents, so call extent_can_coalesce() - * before releasing next's pool lock. - */ - bool can_coalesce = extent_can_coalesce(ecache, - edata, next); - - emap_unlock_edata(tsdn, pac->emap, next); - - if (can_coalesce && !extent_coalesce(tsdn, pac, - ehooks, ecache, edata, next, true, - growing_retained)) { + if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata, + next, true, growing_retained)) { if (ecache->delay_coalesce) { /* Do minimal coalescing. */ *coalesced = true; @@ -899,30 +848,11 @@ extent_try_coalesce_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, } /* Try to coalesce backward. */ - edata_t *prev = NULL; - if (edata_before_get(edata) != NULL) { - /* - * This is subtle; the rtree code asserts that its input - * pointer is non-NULL, and this is a useful thing to - * check. But it's possible that edata corresponds to - * an address of (void *)PAGE (in practice, this has - * only been observed on FreeBSD when address-space - * randomization is on, but it could in principle happen - * anywhere). In this case, edata_before_get(edata) is - * NULL, triggering the assert. - */ - prev = emap_lock_edata_from_addr(tsdn, pac->emap, - edata_before_get(edata), inactive_only); - - } + edata_t *prev = emap_try_acquire_edata_neighbor(tsdn, pac->emap, + edata, EXTENT_PAI_PAC, ecache->state, /* forward */ false); if (prev != NULL) { - bool can_coalesce = extent_can_coalesce(ecache, edata, - prev); - emap_unlock_edata(tsdn, pac->emap, prev); - - if (can_coalesce && !extent_coalesce(tsdn, pac, - ehooks, ecache, edata, prev, false, - growing_retained)) { + if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata, + prev, false, growing_retained)) { edata = prev; if (ecache->delay_coalesce) { /* Do minimal coalescing. */ @@ -1218,24 +1148,27 @@ extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, goto label_error_b; } - emap_lock_edata2(tsdn, pac->emap, edata, trail); + /* + * No need to acquire trail or edata, because: 1) trail was new (just + * allocated); and 2) edata is either an active allocation (the shrink + * path), or in an acquired state (extracted from the ecache on the + * extent_recycle_split path). 
+ */ + assert(emap_edata_is_acquired(tsdn, pac->emap, edata)); + assert(emap_edata_is_acquired(tsdn, pac->emap, trail)); err = ehooks_split(tsdn, ehooks, edata_base_get(edata), size_a + size_b, size_a, size_b, edata_committed_get(edata)); if (err) { - goto label_error_c; + goto label_error_b; } edata_size_set(edata, size_a); emap_split_commit(tsdn, pac->emap, &prepare, edata, size_a, trail, size_b); - emap_unlock_edata2(tsdn, pac->emap, edata, trail); - return trail; -label_error_c: - emap_unlock_edata2(tsdn, pac->emap, edata, trail); label_error_b: edata_cache_put(tsdn, pac->edata_cache, trail); label_error_a: @@ -1277,15 +1210,15 @@ extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a, emap_prepare_t prepare; emap_merge_prepare(tsdn, pac->emap, &prepare, a, b); - emap_lock_edata2(tsdn, pac->emap, a, b); - + assert(edata_state_get(a) == extent_state_active || + edata_state_get(a) == extent_state_merging); + edata_state_set(a, extent_state_active); edata_size_set(a, edata_size_get(a) + edata_size_get(b)); edata_sn_set(a, (edata_sn_get(a) < edata_sn_get(b)) ? edata_sn_get(a) : edata_sn_get(b)); edata_zeroed_set(a, edata_zeroed_get(a) && edata_zeroed_get(b)); emap_merge_commit(tsdn, pac->emap, &prepare, a, b); - emap_unlock_edata2(tsdn, pac->emap, a, b); edata_cache_put(tsdn, pac->edata_cache, b); diff --git a/src/hpa_central.c b/src/hpa_central.c index 36758a03..9e00dd64 100644 --- a/src/hpa_central.c +++ b/src/hpa_central.c @@ -33,15 +33,16 @@ hpa_central_split(tsdn_t *tsdn, hpa_central_t *central, edata_t *edata, emap_prepare_t prepare; bool err = emap_split_prepare(tsdn, central->emap, &prepare, edata, size, trail, cursize - size); + assert(edata_state_get(edata) == edata_state_get(trail)); if (err) { edata_cache_small_put(tsdn, ¢ral->ecs, trail); return NULL; } - emap_lock_edata2(tsdn, central->emap, edata, trail); + assert(edata_state_get(edata) == edata_state_get(trail)); + edata_size_set(edata, size); emap_split_commit(tsdn, central->emap, &prepare, edata, size, trail, cursize - size); - emap_unlock_edata2(tsdn, central->emap, edata, trail); return trail; } @@ -91,7 +92,7 @@ label_success: */ assert(edata_state_get(edata) == extent_state_dirty); assert(edata_base_get(edata) == edata_addr_get(edata)); - emap_edata_state_update(tsdn, central->emap, edata, + emap_update_edata_state(tsdn, central->emap, edata, extent_state_active); return edata; } @@ -137,43 +138,22 @@ hpa_central_alloc_grow(tsdn_t *tsdn, hpa_central_t *central, edata_sn_set(edata, sn); edata_sn_set(trail, sn); - emap_edata_state_update(tsdn, central->emap, trail, extent_state_dirty); + emap_update_edata_state(tsdn, central->emap, trail, extent_state_dirty); eset_insert(¢ral->eset, trail); return false; } -static edata_t * -hpa_central_dalloc_get_merge_candidate(tsdn_t *tsdn, hpa_central_t *central, - void *addr) { - edata_t *edata = emap_lock_edata_from_addr(tsdn, central->emap, addr, - /* inactive_only */ true); - if (edata == NULL) { - return NULL; - } - extent_pai_t pai = edata_pai_get(edata); - extent_state_t state = edata_state_get(edata); - emap_unlock_edata(tsdn, central->emap, edata); - - if (pai != EXTENT_PAI_HPA) { - return NULL; - } - if (state == extent_state_active) { - return NULL; - } - - return edata; -} - /* Merges b into a, freeing b back to the edata cache.. 
 static void
 hpa_central_dalloc_merge(tsdn_t *tsdn, hpa_central_t *central, edata_t *a,
     edata_t *b) {
+	assert(emap_edata_is_acquired(tsdn, central->emap, a));
+	assert(emap_edata_is_acquired(tsdn, central->emap, b));
+
 	emap_prepare_t prepare;
 	emap_merge_prepare(tsdn, central->emap, &prepare, a, b);
-	emap_lock_edata2(tsdn, central->emap, a, b);
 	edata_size_set(a, edata_size_get(a) + edata_size_get(b));
 	emap_merge_commit(tsdn, central->emap, &prepare, a, b);
-	emap_unlock_edata2(tsdn, central->emap, a, b);
 	edata_cache_small_put(tsdn, &central->ecs, b);
 }
 
@@ -189,21 +169,24 @@ hpa_central_dalloc(tsdn_t *tsdn, hpa_central_t *central, edata_t *edata) {
 	edata_addr_set(edata, edata_base_get(edata));
 	edata_zeroed_set(edata, false);
 
-	if (!edata_is_head_get(edata)) {
-		edata_t *lead = hpa_central_dalloc_get_merge_candidate(tsdn,
-		    central, edata_before_get(edata));
-		if (lead != NULL) {
-			eset_remove(&central->eset, lead);
-			hpa_central_dalloc_merge(tsdn, central, lead, edata);
-			edata = lead;
-		}
-	}
-	edata_t *trail = hpa_central_dalloc_get_merge_candidate(tsdn, central,
-	    edata_past_get(edata));
-	if (trail != NULL && !edata_is_head_get(trail)) {
+	/*
+	 * Merge forward first, so that the original *edata stays in active
+	 * state for the second acquire (only necessary for sanity checking).
+	 */
+	edata_t *trail = emap_try_acquire_edata_neighbor(tsdn, central->emap,
+	    edata, EXTENT_PAI_HPA, extent_state_dirty, /* forward */ true);
+	if (trail != NULL) {
 		eset_remove(&central->eset, trail);
 		hpa_central_dalloc_merge(tsdn, central, edata, trail);
 	}
-	emap_edata_state_update(tsdn, central->emap, edata, extent_state_dirty);
+	edata_t *lead = emap_try_acquire_edata_neighbor(tsdn, central->emap,
+	    edata, EXTENT_PAI_HPA, extent_state_dirty, /* forward */ false);
+	if (lead != NULL) {
+		eset_remove(&central->eset, lead);
+		hpa_central_dalloc_merge(tsdn, central, lead, edata);
+		edata = lead;
+	}
+
+	emap_update_edata_state(tsdn, central->emap, edata, extent_state_dirty);
 	eset_insert(&central->eset, edata);
 }
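As a companion to the diff above, the sketch below illustrates the synchronization idea the commit message describes: the address map (the rtree in jemalloc) records an extent state next to each edata pointer, and a neighboring extent is only dereferenced after that tracked state has been checked and flipped to an intermediate value while the cache mutex is held, so no per-edata mutex pool is required. Everything here is a simplified, hypothetical stand-in written for illustration (page_map_t, map_entry_t, try_acquire_prev_neighbor, and so on are not jemalloc APIs), and the real patch keeps the state in rtree leaf bits rather than in a flat array.

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef enum {
	STATE_ACTIVE,	/* owned by an allocating thread; never touched here */
	STATE_DIRTY,	/* cached and mergeable */
	STATE_MERGING	/* intermediate: an acquirer is coalescing it */
} state_t;

typedef struct {
	size_t first_page;
	size_t npages;
} extent_t;

/* One entry per page; the state lives here, not in the extent itself. */
typedef struct {
	extent_t *ext;
	state_t state;
} map_entry_t;

#define NPAGES 64

typedef struct {
	pthread_mutex_t mtx;		/* stands in for the ecache mutex */
	map_entry_t entries[NPAGES];	/* stands in for the rtree */
} page_map_t;

/*
 * Try to take ownership of the extent that ends right before `ext`.  The
 * neighbor's extent_t is only dereferenced after its map-tracked state has
 * been verified, so no per-extent mutex is needed.  Returns the neighbor in
 * STATE_MERGING, or NULL if there is nothing safely mergeable.
 */
static extent_t *
try_acquire_prev_neighbor(page_map_t *map, const extent_t *ext) {
	extent_t *neighbor = NULL;
	pthread_mutex_lock(&map->mtx);
	if (ext->first_page > 0) {
		map_entry_t *e = &map->entries[ext->first_page - 1];
		/* Check the tracked state before touching *e->ext. */
		if (e->ext != NULL && e->state == STATE_DIRTY) {
			e->state = STATE_MERGING;
			neighbor = e->ext;
		}
	}
	pthread_mutex_unlock(&map->mtx);
	return neighbor;
}

/* Settle an extent back into a stable state once the operation is done. */
static void
release(page_map_t *map, const extent_t *ext, state_t new_state) {
	pthread_mutex_lock(&map->mtx);
	for (size_t i = ext->first_page; i < ext->first_page + ext->npages; i++) {
		map->entries[i].state = new_state;
	}
	pthread_mutex_unlock(&map->mtx);
}

int
main(void) {
	static extent_t left = {0, 2}, right = {2, 2};
	static page_map_t map = {.mtx = PTHREAD_MUTEX_INITIALIZER};
	for (size_t i = 0; i < 2; i++) {
		map.entries[i] = (map_entry_t){&left, STATE_DIRTY};
		map.entries[i + 2] = (map_entry_t){&right, STATE_ACTIVE};
	}
	extent_t *prev = try_acquire_prev_neighbor(&map, &right);
	assert(prev == &left);
	release(&map, prev, STATE_ACTIVE);
	printf("acquired neighbor at page %zu\n", prev->first_page);
	return 0;
}

The property mirrored from the patch is that the map entry, not the extent itself, is the source of truth: a reader never touches *ext until the entry's state says it is safe, and intermediate states (STATE_MERGING here; extent_state_updating / extent_state_merging in the patch) keep two threads from acquiring the same extent.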
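The head-bit rule enforced by edata_neighbor_head_state_mergeable() in src/emap.c above can be restated by the small stand-alone helper below (illustrative only; merge_allowed is a hypothetical name, not part of the patch). A coalesce is refused whenever the higher-address extent of the pair is a head extent, which keeps first-fit ordering intact and, as the patch comment notes, prevents merging across arenas.

#include <assert.h>
#include <stdbool.h>

/*
 * `forward` means the neighbor sits at the higher address.  Whichever extent
 * of the pair ends up at the higher address must not be a head extent.
 */
static bool
merge_allowed(bool self_is_head, bool neighbor_is_head, bool forward) {
	bool higher_addr_is_head = forward ? neighbor_is_head : self_is_head;
	return !higher_addr_is_head;
}

int
main(void) {
	/* Forward merge into a head extent is refused... */
	assert(!merge_allowed(false, true, true));
	/* ...but merging a non-head forward neighbor is allowed. */
	assert(merge_allowed(true, false, true));
	/* Backward merge: the current extent is the higher-address one. */
	assert(!merge_allowed(true, false, false));
	return 0;
}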