Implement compact rtree leaf element representation.

If a single virtual address pointer has enough unused bits to pack
{szind_t, extent_t *, bool, bool}, use a single pointer-sized field in
each rtree leaf element, rather than using three separate fields.  This
has little impact on access speed (fewer loads/stores, but more bit
twiddling); the main benefit is that the denser representation
increases TLB effectiveness.
Jason Evans 2017-03-20 16:38:21 -07:00
parent ce41ab0c57
commit 0ee0e0c155
5 changed files with 163 additions and 7 deletions
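For orientation, here is a standalone sketch of the packing scheme described above. It is illustrative only, not code from this commit: it assumes LG_VADDR = 48 significant virtual address bits (so 16 spare high bits) and extents aligned to at least 4 bytes, and all names are hypothetical.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define LG_VADDR 48           /* significant virtual address bits */
#define NHIB (64 - LG_VADDR)  /* high insignificant bits: 16 */

typedef unsigned szind_t;     /* size-class index */

static uintptr_t
pack(const void *extent, szind_t szind, bool slab, bool locked) {
	/* Extent pointers are at least 4-byte aligned: bits 0-1 are free. */
	assert(((uintptr_t)extent & (uintptr_t)0x3) == 0);
	return ((uintptr_t)szind << LG_VADDR) |
	    ((uintptr_t)extent & (((uintptr_t)1 << LG_VADDR) - 1)) |
	    ((uintptr_t)slab << 1) | (uintptr_t)locked;
}

static void *
unpack_extent(uintptr_t bits) {
	/*
	 * Shift the szind bits out the top, then shift back arithmetically
	 * so that sign extension restores the canonical high address bits,
	 * then mask off the slab and lock bits.
	 */
	return (void *)((uintptr_t)((intptr_t)(bits << NHIB) >> NHIB) &
	    ~(uintptr_t)0x3);
}

static szind_t
unpack_szind(uintptr_t bits) {
	return (szind_t)(bits >> LG_VADDR);
}

A read is then a single load plus shifts and masks, rather than separate loads of three fields; the bit twiddling mirrors rtree_leaf_elm_bits_extent_get() and rtree_leaf_elm_bits_szind_get() in the diff below.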

View File

@@ -419,6 +419,11 @@ rtree_extent_szind_read
rtree_leaf_alloc
rtree_leaf_dalloc
rtree_leaf_elm_acquire
rtree_leaf_elm_bits_extent_get
rtree_leaf_elm_bits_locked_get
rtree_leaf_elm_bits_read
rtree_leaf_elm_bits_slab_get
rtree_leaf_elm_bits_szind_get
rtree_leaf_elm_extent_read
rtree_leaf_elm_extent_write
rtree_leaf_elm_lookup

View File

@@ -4,6 +4,14 @@
#ifndef JEMALLOC_ENABLE_INLINE
uintptr_t rtree_leafkey(uintptr_t key);
uintptr_t rtree_subkey(uintptr_t key, unsigned level);
# ifdef RTREE_LEAF_COMPACT
uintptr_t rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool acquired, bool dependent);
extent_t *rtree_leaf_elm_bits_extent_get(uintptr_t bits);
szind_t rtree_leaf_elm_bits_szind_get(uintptr_t bits);
bool rtree_leaf_elm_bits_slab_get(uintptr_t bits);
bool rtree_leaf_elm_bits_locked_get(uintptr_t bits);
# endif
extent_t *rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool acquired, bool dependent);
szind_t rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree,
@@ -75,6 +83,42 @@ rtree_subkey(uintptr_t key, unsigned level) {
* dependent on a previous rtree write, which means a stale read
* could result if synchronization were omitted here.
*/
# ifdef RTREE_LEAF_COMPACT
JEMALLOC_ALWAYS_INLINE uintptr_t
rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool acquired, bool dependent) {
if (config_debug && acquired) {
assert(dependent);
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
return (uintptr_t)atomic_load_p(&elm->le_bits, dependent
? ATOMIC_RELAXED : ATOMIC_ACQUIRE);
}
JEMALLOC_ALWAYS_INLINE extent_t *
rtree_leaf_elm_bits_extent_get(uintptr_t bits) {
/* Restore sign-extended high bits, mask slab and lock bits. */
return (extent_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) >>
RTREE_NHIB) & ~((uintptr_t)0x3));
}
JEMALLOC_ALWAYS_INLINE szind_t
rtree_leaf_elm_bits_szind_get(uintptr_t bits) {
return (szind_t)(bits >> LG_VADDR);
}
JEMALLOC_ALWAYS_INLINE bool
rtree_leaf_elm_bits_slab_get(uintptr_t bits) {
return (bool)((bits >> 1) & (uintptr_t)0x1);
}
JEMALLOC_ALWAYS_INLINE bool
rtree_leaf_elm_bits_locked_get(uintptr_t bits) {
return (bool)(bits & (uintptr_t)0x1);
}
# endif
JEMALLOC_ALWAYS_INLINE extent_t *
rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool acquired, bool dependent) {
@@ -83,6 +127,12 @@ rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, acquired,
dependent);
assert(!acquired || rtree_leaf_elm_bits_locked_get(bits));
return rtree_leaf_elm_bits_extent_get(bits);
#else
extent_t *extent = (extent_t *)atomic_load_p(&elm->le_extent, dependent
? ATOMIC_RELAXED : ATOMIC_ACQUIRE);
assert(!acquired || ((uintptr_t)extent & (uintptr_t)0x1) ==
(uintptr_t)0x1);
@@ -90,6 +140,7 @@ rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
/* Mask lock bit. */
extent = (extent_t *)((uintptr_t)extent & ~((uintptr_t)0x1));
return extent;
#endif
}
JEMALLOC_ALWAYS_INLINE szind_t
@@ -100,8 +151,15 @@ rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, acquired,
dependent);
assert(!acquired || rtree_leaf_elm_bits_locked_get(bits));
return rtree_leaf_elm_bits_szind_get(bits);
#else
return (szind_t)atomic_load_u(&elm->le_szind, dependent ? ATOMIC_RELAXED
: ATOMIC_ACQUIRE);
#endif
}
JEMALLOC_ALWAYS_INLINE bool
@@ -112,8 +170,15 @@ rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, acquired,
dependent);
assert(!acquired || rtree_leaf_elm_bits_locked_get(bits));
return rtree_leaf_elm_bits_slab_get(bits);
#else
return atomic_load_b(&elm->le_slab, dependent ? ATOMIC_RELAXED :
ATOMIC_ACQUIRE);
#endif
}
JEMALLOC_INLINE void
@@ -124,11 +189,21 @@ rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
assert(((uintptr_t)extent & (uintptr_t)0x1) == (uintptr_t)0x0);
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
acquired, acquired);
uintptr_t bits = ((uintptr_t)rtree_leaf_elm_bits_szind_get(old_bits) <<
LG_VADDR) | ((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1))
| ((uintptr_t)rtree_leaf_elm_bits_slab_get(old_bits) << 1) |
(uintptr_t)acquired;
atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
#else
if (acquired) {
/* Overlay lock bit. */
extent = (extent_t *)((uintptr_t)extent | (uintptr_t)0x1);
}
atomic_store_p(&elm->le_extent, extent, ATOMIC_RELEASE);
#endif
}
JEMALLOC_INLINE void
@@ -139,7 +214,18 @@ rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
assert(szind <= NSIZES);
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
acquired, acquired);
uintptr_t bits = ((uintptr_t)szind << LG_VADDR) |
((uintptr_t)rtree_leaf_elm_bits_extent_get(old_bits) &
(((uintptr_t)0x1 << LG_VADDR) - 1)) |
((uintptr_t)rtree_leaf_elm_bits_slab_get(old_bits) << 1) |
(uintptr_t)acquired;
atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
#else
atomic_store_u(&elm->le_szind, szind, ATOMIC_RELEASE);
#endif
}
JEMALLOC_INLINE void
@@ -149,12 +235,35 @@ rtree_leaf_elm_slab_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
acquired, acquired);
uintptr_t bits = ((uintptr_t)rtree_leaf_elm_bits_szind_get(old_bits) <<
LG_VADDR) | ((uintptr_t)rtree_leaf_elm_bits_extent_get(old_bits) &
(((uintptr_t)0x1 << LG_VADDR) - 1)) | ((uintptr_t)slab << 1) |
(uintptr_t)acquired;
atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
#else
atomic_store_b(&elm->le_slab, slab, ATOMIC_RELEASE);
#endif
}
JEMALLOC_INLINE void
rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool acquired, extent_t *extent, szind_t szind, bool slab) {
#ifdef RTREE_LEAF_COMPACT
if (config_debug && acquired) {
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
assert(!slab || szind < NBINS);
uintptr_t bits = ((uintptr_t)szind << LG_VADDR) |
((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1)) |
((uintptr_t)slab << 1) |
(uintptr_t)acquired;
atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
#else
rtree_leaf_elm_slab_write(tsdn, rtree, elm, acquired, slab);
rtree_leaf_elm_szind_write(tsdn, rtree, elm, acquired, szind);
/*
* Write extent last, since the element is atomically considered valid
@@ -162,6 +271,7 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
* as soon as the extent field is non-NULL.
*/
rtree_leaf_elm_extent_write(tsdn, rtree, elm, acquired, extent);
#endif
}
JEMALLOC_ALWAYS_INLINE rtree_leaf_elm_t *
@@ -317,19 +427,24 @@ rtree_leaf_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 	spin_t spinner = SPIN_INITIALIZER;
 	while (true) {
 		/* The least significant bit serves as a lock. */
-		void *extent_and_lock = atomic_load_p(&elm->le_extent,
+#ifdef RTREE_LEAF_COMPACT
+# define RTREE_FIELD_WITH_LOCK le_bits
+#else
+# define RTREE_FIELD_WITH_LOCK le_extent
+#endif
+		void *bits = atomic_load_p(&elm->RTREE_FIELD_WITH_LOCK,
 		    ATOMIC_RELAXED);
-		if (likely(((uintptr_t)extent_and_lock & (uintptr_t)0x1) == 0))
-		{
-			void *locked = (void *)((uintptr_t)extent_and_lock
-			    | (uintptr_t)0x1);
+		if (likely(((uintptr_t)bits & (uintptr_t)0x1) == 0)) {
+			void *locked = (void *)((uintptr_t)bits |
+			    (uintptr_t)0x1);
 			if (likely(atomic_compare_exchange_strong_p(
-			    &elm->le_extent, &extent_and_lock, locked,
+			    &elm->RTREE_FIELD_WITH_LOCK, &bits, locked,
 			    ATOMIC_ACQUIRE, ATOMIC_RELAXED))) {
 				break;
 			}
 		}
 		spin_adaptive(&spinner);
+#undef RTREE_FIELD_WITH_LOCK
 	}
 	if (config_debug) {

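The acquire loop above treats the low bit of the leaf element word as a spinlock: a relaxed load first, then a compare-and-swap with acquire ordering to set the bit. Below is a standalone C11 rendering of the same pattern; the names are illustrative, and a plain _Atomic uintptr_t stands in for jemalloc's atomic_p_t.

#include <stdatomic.h>
#include <stdint.h>

static void
leaf_elm_lock(_Atomic uintptr_t *bits) {
	for (;;) {
		uintptr_t cur = atomic_load_explicit(bits,
		    memory_order_relaxed);
		if ((cur & (uintptr_t)0x1) == 0 &&
		    atomic_compare_exchange_strong_explicit(bits, &cur,
		    cur | (uintptr_t)0x1, memory_order_acquire,
		    memory_order_relaxed)) {
			return; /* Low bit set; element is now locked. */
		}
		/* A real implementation backs off here, cf. spin_adaptive(). */
	}
}

The RTREE_FIELD_WITH_LOCK macro in the hunk above exists only because the word holding the lock bit is le_bits in the compact layout and le_extent otherwise.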
View File

@@ -6,9 +6,26 @@ struct rtree_node_elm_s {
 };
 struct rtree_leaf_elm_s {
-	atomic_p_t le_extent; /* (extent_t *) */
+#ifdef RTREE_LEAF_COMPACT
+	/*
+	 * Single pointer-width field containing all three leaf element fields.
+	 * For example, on a 64-bit x64 system with 48 significant virtual
+	 * memory address bits, the index, extent, and slab fields are packed as
+	 * such:
+	 *
+	 * x: index
+	 * e: extent
+	 * b: slab
+	 * k: lock
+	 *
+	 * 00000000 xxxxxxxx eeeeeeee [...] eeeeeeee eeee00bk
+	 */
+	atomic_p_t le_bits;
+#else
+	atomic_p_t le_extent; /* (extent_t *), lock in low bit */
+	atomic_u_t le_szind; /* (szind_t) */
+	atomic_b_t le_slab; /* (bool) */
+#endif
 };
 struct rtree_leaf_elm_witness_s {

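Spelled out, the diagram corresponds to the following masks and shifts. These are illustrative definitions assuming LG_VADDR = 48, not part of the commit; bits 2-3 read as zero because extents are aligned well beyond 4 bytes.

#define RTREE_LOCK_BIT ((uintptr_t)0x1)      /* k: bit 0 */
#define RTREE_SLAB_BIT ((uintptr_t)0x1 << 1) /* b: bit 1 */
#define RTREE_EXTENT_MASK \
    ((((uintptr_t)0x1 << 48) - 1) & ~(uintptr_t)0x3) /* e: bits 2-47 */
#define RTREE_SZIND_SHIFT 48 /* x: bits 48-55; bits 56-63 remain zero */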
View File

@@ -33,6 +33,10 @@ typedef struct rtree_s rtree_t;
#else
# error Unsupported number of significant virtual address bits
#endif
/* Use compact leaf representation if virtual address encoding allows. */
#if RTREE_NHIB >= LG_CEIL_NSIZES
# define RTREE_LEAF_COMPACT
#endif
/*
* Number of leafkey/leaf pairs to cache. Each entry supports an entire leaf,

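A worked example of this gate, with illustrative values (48 significant address bits, a size-class count in the low hundreds):

#include <assert.h>

#define LG_VADDR 48                /* assumed for this example */
#define RTREE_NHIB (64 - LG_VADDR) /* spare high pointer bits: 16 */
#define LG_CEIL_NSIZES 8           /* assumed; enough for ~256 classes */

/* 16 spare bits >= 8 index bits, so RTREE_LEAF_COMPACT would be defined. */
static_assert(RTREE_NHIB >= LG_CEIL_NSIZES, "szind does not fit in a pointer");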
View File

@@ -40,6 +40,17 @@ lg() {
done
}
lg_ceil() {
y=$1
lg ${y}; lg_floor=${lg_result}
pow2 ${lg_floor}; pow2_floor=${pow2_result}
if [ ${pow2_floor} -lt ${y} ] ; then
lg_ceil_result=$((${lg_floor} + 1))
else
lg_ceil_result=${lg_floor}
fi
}
reg_size_compute() {
lg_grp=$1
lg_delta=$2
@@ -246,12 +257,14 @@ size_classes() {
done
echo
nsizes=${index}
lg_ceil ${nsizes}; lg_ceil_nsizes=${lg_ceil_result}
# Defined upon completion:
# - ntbins
# - nlbins
# - nbins
# - nsizes
# - lg_ceil_nsizes
# - npsizes
# - lg_tiny_maxclass
# - lookup_maxclass
@@ -286,6 +299,7 @@ cat <<EOF
* NLBINS: Number of bins supported by the lookup table.
* NBINS: Number of small size class bins.
* NSIZES: Number of size classes.
* LG_CEIL_NSIZES: Number of bits required to store NSIZES.
* NPSIZES: Number of size classes that are a multiple of (1U << LG_PAGE).
* LG_TINY_MAXCLASS: Lg of maximum tiny size class.
* LOOKUP_MAXCLASS: Maximum size class included in lookup table.
@@ -311,6 +325,7 @@ for lg_z in ${lg_zarr} ; do
echo "#define NLBINS ${nlbins}"
echo "#define NBINS ${nbins}"
echo "#define NSIZES ${nsizes}"
echo "#define LG_CEIL_NSIZES ${lg_ceil_nsizes}"
echo "#define NPSIZES ${npsizes}"
echo "#define LG_TINY_MAXCLASS ${lg_tiny_maxclass}"
echo "#define LOOKUP_MAXCLASS ${lookup_maxclass}"