diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 9408b2ca..5ac85e14 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -77,6 +77,7 @@ typedef ql_elm(tsd_t) tsd_link_t; O(iarena, arena_t *, arena_t *) \ O(arena, arena_t *, arena_t *) \ O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ + O(sec_shard, uint8_t, uint8_t) \ O(binshards, tsd_binshards_t, tsd_binshards_t)\ O(tsd_link, tsd_link_t, tsd_link_t) \ O(in_hook, bool, bool) \ @@ -106,6 +107,7 @@ typedef ql_elm(tsd_t) tsd_link_t; /* iarena */ NULL, \ /* arena */ NULL, \ /* arenas_tdata */ NULL, \ + /* sec_shard */ (uint8_t)-1, \ /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \ /* tsd_link */ {NULL}, \ /* in_hook */ false, \ diff --git a/src/sec.c b/src/sec.c index f3c906bc..262d813d 100644 --- a/src/sec.c +++ b/src/sec.c @@ -61,14 +61,20 @@ sec_shard_pick(tsdn_t *tsdn, sec_t *sec) { return &sec->shards[0]; } tsd_t *tsd = tsdn_tsd(tsdn); - /* - * Use the trick from Daniel Lemire's "A fast alternative to the modulo - * reduction. Use a 64 bit number to store 32 bits, since we'll - * deliberately overflow when we multiply by the number of shards. - */ - uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32); - uint32_t idx = (uint32_t)((rand32 * (uint64_t)sec->nshards) >> 32); - return &sec->shards[idx]; + uint8_t *idxp = tsd_sec_shardp_get(tsd); + if (*idxp == (uint8_t)-1) { + /* + * First use; initialize using the trick from Daniel Lemire's + * "A fast alternative to the modulo reduction". Use a 64 bit + * number to store 32 bits, since we'll deliberately overflow + * when we multiply by the number of shards. + */ + uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32); + uint32_t idx = (uint32_t)((rand32 * (uint64_t)sec->nshards) >> 32); + assert(idx < (uint32_t)sec->nshards); + *idxp = (uint8_t)idx; + } + return &sec->shards[*idxp]; } static edata_t *