From 04b463546e57ecd9ebc334739881a1c69623813a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 7 Nov 2016 10:52:44 -0800 Subject: [PATCH] Refactor prng to not use 64-bit atomics on 32-bit platforms. This resolves #495. --- include/jemalloc/internal/arena.h | 2 +- include/jemalloc/internal/extent.h | 4 +- include/jemalloc/internal/private_symbols.txt | 12 +- include/jemalloc/internal/prng.h | 155 +++++++++++-- src/arena.c | 6 +- src/ckh.c | 8 +- src/prof.c | 2 +- test/unit/prng.c | 209 ++++++++++++++++-- 8 files changed, 340 insertions(+), 58 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index ce9d8b5e..dbd334e6 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -182,7 +182,7 @@ struct arena_s { * PRNG state for cache index randomization of large allocation base * pointers. */ - uint64_t offset_state; + size_t offset_state; dss_prec_t dss_prec; diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 673cac2f..531d853c 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -325,8 +325,8 @@ extent_addr_randomize(tsdn_t *tsdn, extent_t *extent, size_t alignment) if (alignment < PAGE) { unsigned lg_range = LG_PAGE - lg_floor(CACHELINE_CEILING(alignment)); - uint64_t r = - prng_lg_range(&extent_arena_get(extent)->offset_state, + size_t r = + prng_lg_range_zu(&extent_arena_get(extent)->offset_state, lg_range, true); uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE - lg_range); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 2949de10..f178daf7 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -328,9 +328,15 @@ pind2sz_tab pow2_ceil_u32 pow2_ceil_u64 pow2_ceil_zu -prng_lg_range -prng_range -prng_state_next +prng_lg_range_u32 +prng_lg_range_u64 +prng_lg_range_zu +prng_range_u32 +prng_range_u64 +prng_range_zu +prng_state_next_u32 +prng_state_next_u64 +prng_state_next_zu prof_active prof_active_get prof_active_get_unlocked diff --git a/include/jemalloc/internal/prng.h b/include/jemalloc/internal/prng.h index ebe916f8..c2bda19c 100644 --- a/include/jemalloc/internal/prng.h +++ b/include/jemalloc/internal/prng.h @@ -19,8 +19,12 @@ * the next has a cycle of 4, etc. For this reason, we prefer to use the upper * bits. */ -#define PRNG_A UINT64_C(6364136223846793005) -#define PRNG_C UINT64_C(1442695040888963407) + +#define PRNG_A_32 UINT32_C(1103515241) +#define PRNG_C_32 UINT32_C(12347) + +#define PRNG_A_64 UINT64_C(6364136223846793005) +#define PRNG_C_64 UINT64_C(1442695040888963407) #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ @@ -35,45 +39,133 @@ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -uint64_t prng_state_next(uint64_t state); -uint64_t prng_lg_range(uint64_t *state, unsigned lg_range, bool atomic); -uint64_t prng_range(uint64_t *state, uint64_t range, bool atomic); +uint32_t prng_state_next_u32(uint32_t state); +uint64_t prng_state_next_u64(uint64_t state); +size_t prng_state_next_zu(size_t state); + +uint32_t prng_lg_range_u32(uint32_t *state, unsigned lg_range, + bool atomic); +uint64_t prng_lg_range_u64(uint64_t *state, unsigned lg_range); +size_t prng_lg_range_zu(size_t *state, unsigned lg_range, bool atomic); + +uint32_t prng_range_u32(uint32_t *state, uint32_t range, bool atomic); +uint64_t prng_range_u64(uint64_t *state, uint64_t range); +size_t prng_range_zu(size_t *state, size_t range, bool atomic); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PRNG_C_)) -JEMALLOC_ALWAYS_INLINE uint64_t -prng_state_next(uint64_t state) +JEMALLOC_ALWAYS_INLINE uint32_t +prng_state_next_u32(uint32_t state) { - return ((state * PRNG_A) + PRNG_C); + return ((state * PRNG_A_32) + PRNG_C_32); } JEMALLOC_ALWAYS_INLINE uint64_t -prng_lg_range(uint64_t *state, unsigned lg_range, bool atomic) +prng_state_next_u64(uint64_t state) +{ + + return ((state * PRNG_A_64) + PRNG_C_64); +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_state_next_zu(size_t state) +{ + +#if LG_SIZEOF_PTR == 2 + return ((state * PRNG_A_32) + PRNG_C_32); +#elif LG_SIZEOF_PTR == 3 + return ((state * PRNG_A_64) + PRNG_C_64); +#else +#error Unsupported pointer size +#endif +} + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_lg_range_u32(uint32_t *state, unsigned lg_range, bool atomic) +{ + uint32_t ret, state1; + + assert(lg_range > 0); + assert(lg_range <= 32); + + if (atomic) { + uint32_t state0; + + do { + state0 = atomic_read_uint32(state); + state1 = prng_state_next_u32(state0); + } while (atomic_cas_uint32(state, state0, state1)); + } else { + state1 = prng_state_next_u32(*state); + *state = state1; + } + ret = state1 >> (32 - lg_range); + + return (ret); +} + +/* 64-bit atomic operations cannot be supported on all relevant platforms. */ +JEMALLOC_ALWAYS_INLINE uint64_t +prng_lg_range_u64(uint64_t *state, unsigned lg_range) { uint64_t ret, state1; assert(lg_range > 0); assert(lg_range <= 64); - if (atomic) { - uint64_t state0; - - do { - state0 = atomic_read_uint64(state); - state1 = prng_state_next(state0); - } while (atomic_cas_uint64(state, state0, state1)); - } else { - state1 = prng_state_next(*state); - *state = state1; - } + state1 = prng_state_next_u64(*state); + *state = state1; ret = state1 >> (64 - lg_range); return (ret); } +JEMALLOC_ALWAYS_INLINE size_t +prng_lg_range_zu(size_t *state, unsigned lg_range, bool atomic) +{ + size_t ret, state1; + + assert(lg_range > 0); + assert(lg_range <= ZU(1) << (3 + LG_SIZEOF_PTR)); + + if (atomic) { + size_t state0; + + do { + state0 = atomic_read_z(state); + state1 = prng_state_next_zu(state0); + } while (atomic_cas_z(state, state0, state1)); + } else { + state1 = prng_state_next_zu(*state); + *state = state1; + } + ret = state1 >> ((ZU(1) << (3 + LG_SIZEOF_PTR)) - lg_range); + + return (ret); +} + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_range_u32(uint32_t *state, uint32_t range, bool atomic) +{ + uint32_t ret; + unsigned lg_range; + + assert(range > 1); + + /* Compute the ceiling of lg(range). */ + lg_range = ffs_u32(pow2_ceil_u32(range)) - 1; + + /* Generate a result in [0..range) via repeated trial. */ + do { + ret = prng_lg_range_u32(state, lg_range, atomic); + } while (ret >= range); + + return (ret); +} + JEMALLOC_ALWAYS_INLINE uint64_t -prng_range(uint64_t *state, uint64_t range, bool atomic) +prng_range_u64(uint64_t *state, uint64_t range) { uint64_t ret; unsigned lg_range; @@ -85,7 +177,26 @@ prng_range(uint64_t *state, uint64_t range, bool atomic) /* Generate a result in [0..range) via repeated trial. */ do { - ret = prng_lg_range(state, lg_range, atomic); + ret = prng_lg_range_u64(state, lg_range); + } while (ret >= range); + + return (ret); +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_range_zu(size_t *state, size_t range, bool atomic) +{ + size_t ret; + unsigned lg_range; + + assert(range > 1); + + /* Compute the ceiling of lg(range). */ + lg_range = ffs_u64(pow2_ceil_u64(range)) - 1; + + /* Generate a result in [0..range) via repeated trial. */ + do { + ret = prng_lg_range_zu(state, lg_range, atomic); } while (ret >= range); return (ret); diff --git a/src/arena.c b/src/arena.c index dd8e4d9c..4b104a0e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -422,8 +422,8 @@ arena_decay_deadline_init(arena_t *arena) if (arena->decay.time > 0) { nstime_t jitter; - nstime_init(&jitter, prng_range(&arena->decay.jitter_state, - nstime_ns(&arena->decay.interval), false)); + nstime_init(&jitter, prng_range_u64(&arena->decay.jitter_state, + nstime_ns(&arena->decay.interval))); nstime_add(&arena->decay.deadline, &jitter); } } @@ -1680,7 +1680,7 @@ arena_new(tsdn_t *tsdn, unsigned ind) * deterministic seed. */ arena->offset_state = config_debug ? ind : - (uint64_t)(uintptr_t)arena; + (size_t)(uintptr_t)arena; } arena->dss_prec = extent_dss_prec_get(); diff --git a/src/ckh.c b/src/ckh.c index 75376017..6f16565f 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -99,8 +99,8 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, * Cycle through the cells in the bucket, starting at a random position. * The randomness avoids worst-case search overhead as buckets fill up. */ - offset = (unsigned)prng_lg_range(&ckh->prng_state, LG_CKH_BUCKET_CELLS, - false); + offset = (unsigned)prng_lg_range_u64(&ckh->prng_state, + LG_CKH_BUCKET_CELLS); for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))]; @@ -142,8 +142,8 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, * were an item for which both hashes indicated the same * bucket. */ - i = (unsigned)prng_lg_range(&ckh->prng_state, - LG_CKH_BUCKET_CELLS, false); + i = (unsigned)prng_lg_range_u64(&ckh->prng_state, + LG_CKH_BUCKET_CELLS); cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; assert(cell->key != NULL); diff --git a/src/prof.c b/src/prof.c index 4bafb39a..19c8fb71 100644 --- a/src/prof.c +++ b/src/prof.c @@ -878,7 +878,7 @@ prof_sample_threshold_update(prof_tdata_t *tdata) * pp 500 * (http://luc.devroye.org/rnbookindex.html) */ - r = prng_lg_range(&tdata->prng_state, 53, false); + r = prng_lg_range_u64(&tdata->prng_state, 53); u = (double)r * (1.0/9007199254740992.0L); tdata->bytes_until_sample = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) diff --git a/test/unit/prng.c b/test/unit/prng.c index f3234455..111fa59f 100644 --- a/test/unit/prng.c +++ b/test/unit/prng.c @@ -1,34 +1,71 @@ #include "test/jemalloc_test.h" static void -test_prng_lg_range(bool atomic) +test_prng_lg_range_u32(bool atomic) +{ + uint32_t sa, sb, ra, rb; + unsigned lg_range; + + sa = 42; + ra = prng_lg_range_u32(&sa, 32, atomic); + sa = 42; + rb = prng_lg_range_u32(&sa, 32, atomic); + assert_u32_eq(ra, rb, + "Repeated generation should produce repeated results"); + + sb = 42; + rb = prng_lg_range_u32(&sb, 32, atomic); + assert_u32_eq(ra, rb, + "Equivalent generation should produce equivalent results"); + + sa = 42; + ra = prng_lg_range_u32(&sa, 32, atomic); + rb = prng_lg_range_u32(&sa, 32, atomic); + assert_u32_ne(ra, rb, + "Full-width results must not immediately repeat"); + + sa = 42; + ra = prng_lg_range_u32(&sa, 32, atomic); + for (lg_range = 31; lg_range > 0; lg_range--) { + sb = 42; + rb = prng_lg_range_u32(&sb, lg_range, atomic); + assert_u32_eq((rb & (UINT32_C(0xffffffff) << lg_range)), + 0, "High order bits should be 0, lg_range=%u", lg_range); + assert_u32_eq(rb, (ra >> (32 - lg_range)), + "Expected high order bits of full-width result, " + "lg_range=%u", lg_range); + } +} + +static void +test_prng_lg_range_u64(void) { uint64_t sa, sb, ra, rb; unsigned lg_range; sa = 42; - ra = prng_lg_range(&sa, 64, atomic); + ra = prng_lg_range_u64(&sa, 64); sa = 42; - rb = prng_lg_range(&sa, 64, atomic); + rb = prng_lg_range_u64(&sa, 64); assert_u64_eq(ra, rb, "Repeated generation should produce repeated results"); sb = 42; - rb = prng_lg_range(&sb, 64, atomic); + rb = prng_lg_range_u64(&sb, 64); assert_u64_eq(ra, rb, "Equivalent generation should produce equivalent results"); sa = 42; - ra = prng_lg_range(&sa, 64, atomic); - rb = prng_lg_range(&sa, 64, atomic); + ra = prng_lg_range_u64(&sa, 64); + rb = prng_lg_range_u64(&sa, 64); assert_u64_ne(ra, rb, "Full-width results must not immediately repeat"); sa = 42; - ra = prng_lg_range(&sa, 64, atomic); + ra = prng_lg_range_u64(&sa, 64); for (lg_range = 63; lg_range > 0; lg_range--) { sb = 42; - rb = prng_lg_range(&sb, lg_range, atomic); + rb = prng_lg_range_u64(&sb, lg_range); assert_u64_eq((rb & (UINT64_C(0xffffffffffffffff) << lg_range)), 0, "High order bits should be 0, lg_range=%u", lg_range); assert_u64_eq(rb, (ra >> (64 - lg_range)), @@ -37,22 +74,102 @@ test_prng_lg_range(bool atomic) } } -TEST_BEGIN(test_prng_lg_range_nonatomic) +static void +test_prng_lg_range_zu(bool atomic) +{ + uint64_t sa, sb, ra, rb; + unsigned lg_range; + + sa = 42; + ra = prng_lg_range_zu(&sa, 64, atomic); + sa = 42; + rb = prng_lg_range_zu(&sa, 64, atomic); + assert_zu_eq(ra, rb, + "Repeated generation should produce repeated results"); + + sb = 42; + rb = prng_lg_range_zu(&sb, 64, atomic); + assert_zu_eq(ra, rb, + "Equivalent generation should produce equivalent results"); + + sa = 42; + ra = prng_lg_range_zu(&sa, 64, atomic); + rb = prng_lg_range_zu(&sa, 64, atomic); + assert_zu_ne(ra, rb, + "Full-width results must not immediately repeat"); + + sa = 42; + ra = prng_lg_range_zu(&sa, 64, atomic); + for (lg_range = (ZU(1) << (3 + LG_SIZEOF_PTR)) - 1; lg_range > 0; + lg_range--) { + sb = 42; + rb = prng_lg_range_zu(&sb, lg_range, atomic); + assert_zu_eq((rb & (SIZE_T_MAX << lg_range)), + 0, "High order bits should be 0, lg_range=%u", lg_range); + assert_zu_eq(rb, (ra >> (64 - lg_range)), + "Expected high order bits of full-width result, " + "lg_range=%u", lg_range); + } +} + +TEST_BEGIN(test_prng_lg_range_u32_nonatomic) { - test_prng_lg_range(false); + test_prng_lg_range_u32(false); } TEST_END -TEST_BEGIN(test_prng_lg_range_atomic) +TEST_BEGIN(test_prng_lg_range_u32_atomic) { - test_prng_lg_range(true); + test_prng_lg_range_u32(true); +} +TEST_END + +TEST_BEGIN(test_prng_lg_range_u64_nonatomic) +{ + + test_prng_lg_range_u64(); +} +TEST_END + +TEST_BEGIN(test_prng_lg_range_zu_nonatomic) +{ + + test_prng_lg_range_zu(false); +} +TEST_END + +TEST_BEGIN(test_prng_lg_range_zu_atomic) +{ + + test_prng_lg_range_zu(true); } TEST_END static void -test_prng_range(bool atomic) +test_prng_range_u32(bool atomic) +{ + uint32_t range; +#define MAX_RANGE 10000000 +#define RANGE_STEP 97 +#define NREPS 10 + + for (range = 2; range < MAX_RANGE; range += RANGE_STEP) { + uint32_t s; + unsigned rep; + + s = range; + for (rep = 0; rep < NREPS; rep++) { + uint32_t r = prng_range_u32(&s, range, atomic); + + assert_u32_lt(r, range, "Out of range"); + } + } +} + +static void +test_prng_range_u64(void) { uint64_t range; #define MAX_RANGE 10000000 @@ -65,24 +182,66 @@ test_prng_range(bool atomic) s = range; for (rep = 0; rep < NREPS; rep++) { - uint64_t r = prng_range(&s, range, atomic); + uint64_t r = prng_range_u64(&s, range); assert_u64_lt(r, range, "Out of range"); } } } -TEST_BEGIN(test_prng_range_nonatomic) +static void +test_prng_range_zu(bool atomic) +{ + size_t range; +#define MAX_RANGE 10000000 +#define RANGE_STEP 97 +#define NREPS 10 + + for (range = 2; range < MAX_RANGE; range += RANGE_STEP) { + size_t s; + unsigned rep; + + s = range; + for (rep = 0; rep < NREPS; rep++) { + size_t r = prng_range_zu(&s, range, atomic); + + assert_zu_lt(r, range, "Out of range"); + } + } +} + +TEST_BEGIN(test_prng_range_u32_nonatomic) { - test_prng_range(false); + test_prng_range_u32(false); } TEST_END -TEST_BEGIN(test_prng_range_atomic) +TEST_BEGIN(test_prng_range_u32_atomic) { - test_prng_range(true); + test_prng_range_u32(true); +} +TEST_END + +TEST_BEGIN(test_prng_range_u64_nonatomic) +{ + + test_prng_range_u64(); +} +TEST_END + +TEST_BEGIN(test_prng_range_zu_nonatomic) +{ + + test_prng_range_zu(false); +} +TEST_END + +TEST_BEGIN(test_prng_range_zu_atomic) +{ + + test_prng_range_zu(true); } TEST_END @@ -91,8 +250,14 @@ main(void) { return (test( - test_prng_lg_range_nonatomic, - test_prng_lg_range_atomic, - test_prng_range_nonatomic, - test_prng_range_atomic)); + test_prng_lg_range_u32_nonatomic, + test_prng_lg_range_u32_atomic, + test_prng_lg_range_u64_nonatomic, + test_prng_lg_range_zu_nonatomic, + test_prng_lg_range_zu_atomic, + test_prng_range_u32_nonatomic, + test_prng_range_u32_atomic, + test_prng_range_u64_nonatomic, + test_prng_range_zu_nonatomic, + test_prng_range_zu_atomic)); }