From 4fc2acf5aef9ea8fe7e2dd39ee8b6a5050c5ff7f Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Wed, 8 Mar 2017 15:56:31 -0800
Subject: [PATCH] Switch atomic uint64_ts in arena_stats_t to C11 atomics

I expect this to be the trickiest conversion we will see, since we want
atomics on 64-bit platforms, but are also always able to piggyback on
some sort of external synchronization on non-64-bit platforms.
---
 include/jemalloc/internal/stats_structs.h | 33 ++++++----
 src/arena.c                               | 60 +++++++++++------
 src/ctl.c                                 | 80 +++++++++++++++++------
 3 files changed, 119 insertions(+), 54 deletions(-)

diff --git a/include/jemalloc/internal/stats_structs.h b/include/jemalloc/internal/stats_structs.h
index 06ba95fc..b64ba2d2 100644
--- a/include/jemalloc/internal/stats_structs.h
+++ b/include/jemalloc/internal/stats_structs.h
@@ -1,6 +1,13 @@
 #ifndef JEMALLOC_INTERNAL_STATS_STRUCTS_H
 #define JEMALLOC_INTERNAL_STATS_STRUCTS_H
 
+#ifdef JEMALLOC_ATOMIC_U64
+typedef atomic_u64_t arena_stats_u64_t;
+#else
+/* Must hold the arena stats mutex while reading atomically. */
+typedef uint64_t arena_stats_u64_t;
+#endif
+
 struct tcache_bin_stats_s {
 	/*
 	 * Number of allocation requests that corresponded to the size of this
@@ -56,15 +63,15 @@ struct malloc_large_stats_s {
 	 * Total number of allocation/deallocation requests served directly by
 	 * the arena.
 	 */
-	uint64_t	nmalloc;
-	uint64_t	ndalloc;
+	arena_stats_u64_t	nmalloc;
+	arena_stats_u64_t	ndalloc;
 
 	/*
 	 * Number of allocation requests that correspond to this size class.
 	 * This includes requests served by tcache, though tcache only
 	 * periodically merges into this counter.
	 */
-	uint64_t	nrequests;	/* Partially derived. */
+	arena_stats_u64_t	nrequests;	/* Partially derived. */
 
 	/* Current number of allocations of this size class. */
 	size_t		curlextents;	/* Derived. */
@@ -96,18 +103,18 @@ struct arena_stats_s {
 	 * and total pages purged in order to keep dirty unused memory under
 	 * control.
 	 */
-	uint64_t	npurge;
-	uint64_t	nmadvise;
-	uint64_t	purged;
+	arena_stats_u64_t	npurge;
+	arena_stats_u64_t	nmadvise;
+	arena_stats_u64_t	purged;
 
-	size_t		base; /* Derived. */
-	size_t		internal;
-	size_t		resident; /* Derived. */
+	size_t			base; /* Derived. */
+	size_t			internal;
+	size_t			resident; /* Derived. */
 
-	size_t		allocated_large; /* Derived. */
-	uint64_t	nmalloc_large; /* Derived. */
-	uint64_t	ndalloc_large; /* Derived. */
-	uint64_t	nrequests_large; /* Derived. */
+	size_t			allocated_large; /* Derived. */
+	arena_stats_u64_t	nmalloc_large; /* Derived. */
+	arena_stats_u64_t	ndalloc_large; /* Derived. */
+	arena_stats_u64_t	nrequests_large; /* Derived. */
 
 	/* Number of bytes cached in tcache associated with this arena. */
 	size_t		tcache_bytes; /* Derived. */
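A minimal, self-contained sketch of the pattern this typedef enables (illustration only, not part of the diff to apply): counters become C11 atomics where 64-bit atomics are available, and stay plain uint64_t guarded by a mutex everywhere else. The names HAVE_ATOMIC_U64, stats_u64_t, stats_t, and the pthread mutex are hypothetical stand-ins for JEMALLOC_ATOMIC_U64, arena_stats_u64_t, arena_stats_t, and jemalloc's malloc_mutex; the sketch uses <stdatomic.h> directly rather than jemalloc's atomic_* wrappers, and it locks the mutex inside the helpers, whereas the helpers added in this patch only assert that the caller already holds it.

#include <stdint.h>
#include <pthread.h>

#ifdef HAVE_ATOMIC_U64
#include <stdatomic.h>
/* 64-bit atomics available: the counter is a C11 atomic. */
typedef _Atomic uint64_t stats_u64_t;
#else
/* No 64-bit atomics: the counter is plain and relies on the stats mutex. */
typedef uint64_t stats_u64_t;
#endif

typedef struct {
	pthread_mutex_t	mtx;		/* Initialize with pthread_mutex_init(). */
	stats_u64_t	nmalloc;	/* Example counter. */
} stats_t;

static uint64_t
stats_read_nmalloc(stats_t *stats) {
#ifdef HAVE_ATOMIC_U64
	/* Relaxed ordering: these are statistics, not synchronization flags. */
	return atomic_load_explicit(&stats->nmalloc, memory_order_relaxed);
#else
	uint64_t v;

	pthread_mutex_lock(&stats->mtx);
	v = stats->nmalloc;
	pthread_mutex_unlock(&stats->mtx);
	return v;
#endif
}

static void
stats_add_nmalloc(stats_t *stats, uint64_t x) {
#ifdef HAVE_ATOMIC_U64
	atomic_fetch_add_explicit(&stats->nmalloc, x, memory_order_relaxed);
#else
	pthread_mutex_lock(&stats->mtx);
	stats->nmalloc += x;
	pthread_mutex_unlock(&stats->mtx);
#endif
}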
diff --git a/src/arena.c b/src/arena.c
index cb0194ae..1fbf87dd 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -78,9 +78,10 @@ arena_stats_unlock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
 }
 
 static uint64_t
-arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, uint64_t *p) {
+arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    arena_stats_u64_t *p) {
 #ifdef JEMALLOC_ATOMIC_U64
-	return atomic_read_u64(p);
+	return atomic_load_u64(p, ATOMIC_RELAXED);
 #else
 	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
 	return *p;
@@ -88,10 +89,10 @@ arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, uint64_t *p) {
 }
 
 static void
-arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, uint64_t *p,
-    uint64_t x) {
+arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    arena_stats_u64_t *p, uint64_t x) {
 #ifdef JEMALLOC_ATOMIC_U64
-	atomic_add_u64(p, x);
+	atomic_fetch_add_u64(p, x, ATOMIC_RELAXED);
 #else
 	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
 	*p += x;
@@ -99,11 +100,11 @@ arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, uint64_t *p,
 }
 
 UNUSED static void
-arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, uint64_t *p,
-    uint64_t x) {
+arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    arena_stats_u64_t *p, uint64_t x) {
 #ifdef JEMALLOC_ATOMIC_U64
-	UNUSED uint64_t r = atomic_sub_u64(p, x);
-	assert(r + x >= r);
+	UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
+	assert(r - x <= r);
 #else
 	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
 	*p -= x;
@@ -111,6 +112,21 @@ arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, uint64_t *p,
 #endif
 }
 
+/*
+ * Non-atomically sets *dst += src.  *dst needs external synchronization.
+ * This lets us avoid the cost of a fetch_add when it's unnecessary (note that
+ * the types here are atomic).
+ */
+static void
+arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) {
+#ifdef JEMALLOC_ATOMIC_U64
+	uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
+	atomic_store_u64(dst, src + cur_dst, ATOMIC_RELAXED);
+#else
+	*dst += src;
+#endif
+}
+
 static size_t
 arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t *p) {
 #ifdef JEMALLOC_ATOMIC_U64
@@ -191,12 +207,12 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	    &arena->stats.mapped);
 	astats->retained += (extents_npages_get(&arena->extents_retained) <<
 	    LG_PAGE);
-	astats->npurge += arena_stats_read_u64(tsdn, &arena->stats,
-	    &arena->stats.npurge);
-	astats->nmadvise += arena_stats_read_u64(tsdn, &arena->stats,
-	    &arena->stats.nmadvise);
-	astats->purged += arena_stats_read_u64(tsdn, &arena->stats,
-	    &arena->stats.purged);
+	arena_stats_accum_u64(&astats->npurge, arena_stats_read_u64(tsdn,
+	    &arena->stats, &arena->stats.npurge));
+	arena_stats_accum_u64(&astats->nmadvise, arena_stats_read_u64(tsdn,
+	    &arena->stats, &arena->stats.nmadvise));
+	arena_stats_accum_u64(&astats->purged, arena_stats_read_u64(tsdn,
+	    &arena->stats, &arena->stats.purged));
 	astats->base += base_allocated;
 	astats->internal += arena_internal_get(arena);
 	astats->resident += base_resident + (((atomic_read_zu(&arena->nactive) +
@@ -205,18 +221,20 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	for (szind_t i = 0; i < NSIZES - NBINS; i++) {
 		uint64_t nmalloc = arena_stats_read_u64(tsdn, &arena->stats,
 		    &arena->stats.lstats[i].nmalloc);
-		lstats[i].nmalloc += nmalloc;
-		astats->nmalloc_large += nmalloc;
+		arena_stats_accum_u64(&lstats[i].nmalloc, nmalloc);
+		arena_stats_accum_u64(&astats->nmalloc_large, nmalloc);
 
 		uint64_t ndalloc = arena_stats_read_u64(tsdn, &arena->stats,
 		    &arena->stats.lstats[i].ndalloc);
-		lstats[i].ndalloc += ndalloc;
-		astats->ndalloc_large += ndalloc;
+		arena_stats_accum_u64(&lstats[i].ndalloc, ndalloc);
+		arena_stats_accum_u64(&astats->ndalloc_large, ndalloc);
 
 		uint64_t nrequests = arena_stats_read_u64(tsdn, &arena->stats,
 		    &arena->stats.lstats[i].nrequests);
-		lstats[i].nrequests += nmalloc + nrequests;
-		astats->nrequests_large += nmalloc + nrequests;
+		arena_stats_accum_u64(&lstats[i].nrequests,
+		    nmalloc + nrequests);
+		arena_stats_accum_u64(&astats->nrequests_large,
+		    nmalloc + nrequests);
 
 		assert(nmalloc >= ndalloc);
 		assert(nmalloc - ndalloc <= SIZE_T_MAX);
diff --git a/src/ctl.c b/src/ctl.c
index d4ab699f..bb835836 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -431,6 +431,33 @@ static const ctl_named_node_t super_root_node[] = {
 
 /******************************************************************************/
 
+/*
+ * Non-atomically sets *dst += *src.  This is safe, since everything is
+ * synchronized by the ctl mutex.
+ */
+static void
+accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
+#ifdef JEMALLOC_ATOMIC_U64
+	uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
+	uint64_t cur_src = atomic_load_u64(src, ATOMIC_RELAXED);
+	atomic_store_u64(dst, cur_dst + cur_src, ATOMIC_RELAXED);
+#else
+	*dst += *src;
+#endif
+}
+
+/* Likewise: with ctl mutex synchronization, reading is simple. */
+static uint64_t
+arena_stats_read_u64(arena_stats_u64_t *p) {
+#ifdef JEMALLOC_ATOMIC_U64
+	return atomic_load_u64(p, ATOMIC_RELAXED);
+#else
+	return *p;
+#endif
+}
+
+/******************************************************************************/
+
 static unsigned
 arenas_i2a_impl(size_t i, bool compat, bool validate) {
 	unsigned a;
@@ -589,9 +616,12 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena,
 			sdstats->astats.mapped += astats->astats.mapped;
 			sdstats->astats.retained += astats->astats.retained;
 		}
-		sdstats->astats.npurge += astats->astats.npurge;
-		sdstats->astats.nmadvise += astats->astats.nmadvise;
-		sdstats->astats.purged += astats->astats.purged;
+		accum_arena_stats_u64(&sdstats->astats.npurge,
+		    &astats->astats.npurge);
+		accum_arena_stats_u64(&sdstats->astats.nmadvise,
+		    &astats->astats.nmadvise);
+		accum_arena_stats_u64(&sdstats->astats.purged,
+		    &astats->astats.purged);
 
 		if (!destroyed) {
 			sdstats->astats.base += astats->astats.base;
@@ -616,10 +646,12 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena,
 		} else {
 			assert(astats->astats.allocated_large == 0);
 		}
-		sdstats->astats.nmalloc_large += astats->astats.nmalloc_large;
-		sdstats->astats.ndalloc_large += astats->astats.ndalloc_large;
-		sdstats->astats.nrequests_large +=
-		    astats->astats.nrequests_large;
+		accum_arena_stats_u64(&sdstats->astats.nmalloc_large,
+		    &astats->astats.nmalloc_large);
+		accum_arena_stats_u64(&sdstats->astats.ndalloc_large,
+		    &astats->astats.ndalloc_large);
+		accum_arena_stats_u64(&sdstats->astats.nrequests_large,
+		    &astats->astats.nrequests_large);
 
 		if (config_tcache) {
 			sdstats->astats.tcache_bytes +=
@@ -654,10 +686,12 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena,
 		}
 
 		for (i = 0; i < NSIZES - NBINS; i++) {
-			sdstats->lstats[i].nmalloc += astats->lstats[i].nmalloc;
-			sdstats->lstats[i].ndalloc += astats->lstats[i].ndalloc;
-			sdstats->lstats[i].nrequests +=
-			    astats->lstats[i].nrequests;
+			accum_arena_stats_u64(&sdstats->lstats[i].nmalloc,
+			    &astats->lstats[i].nmalloc);
+			accum_arena_stats_u64(&sdstats->lstats[i].ndalloc,
+			    &astats->lstats[i].ndalloc);
+			accum_arena_stats_u64(&sdstats->lstats[i].nrequests,
+			    &astats->lstats[i].nrequests);
 			if (!destroyed) {
 				sdstats->lstats[i].curlextents +=
 				    astats->lstats[i].curlextents;
@@ -2139,11 +2173,11 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
 CTL_RO_CGEN(config_stats, stats_arenas_i_retained,
     arenas_i(mib[2])->astats->astats.retained, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_npurge,
-    arenas_i(mib[2])->astats->astats.npurge, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.npurge), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise,
-    arenas_i(mib[2])->astats->astats.nmadvise, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmadvise), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_purged,
-    arenas_i(mib[2])->astats->astats.purged, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.purged), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_base,
     arenas_i(mib[2])->astats->astats.base, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_internal,
@@ -2164,11 +2198,14 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_small_nrequests,
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_allocated,
     arenas_i(mib[2])->astats->astats.allocated_large, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_nmalloc,
-    arenas_i(mib[2])->astats->astats.nmalloc_large, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large),
+    uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc,
-    arenas_i(mib[2])->astats->astats.ndalloc_large, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.ndalloc_large),
+    uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests,
-    arenas_i(mib[2])->astats->astats.nmalloc_large, uint64_t) /* Intentional. */
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large),
+    uint64_t) /* Intentional. */
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nmalloc,
     arenas_i(mib[2])->astats->bstats[mib[4]].nmalloc, uint64_t)
@@ -2199,11 +2236,14 @@ stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
 }
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nmalloc,
-    arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc),
+    uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_ndalloc,
-    arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc, uint64_t)
+    arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc),
+    uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nrequests,
-    arenas_i(mib[2])->astats->lstats[mib[4]].nrequests, uint64_t)
+    arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents,
     arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t)
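For the accumulation helpers added above (arena_stats_accum_u64() and accum_arena_stats_u64()), a short sketch of why a relaxed load followed by a relaxed store is enough: per the patch's own comments, the destination counters are externally synchronized (the arena stats merge path, or the ctl mutex), so the read-modify-write does not need to be a single atomic fetch_add; atomic operations appear at all only because the type is atomic on 64-bit platforms. The sketch assumes hypothetical names (HAVE_ATOMIC_U64, stats_u64_t, stats_accum_u64, stats_add_u64_concurrent) and plain C11 <stdatomic.h> instead of jemalloc's atomic_* wrappers.

#include <stdint.h>

#ifdef HAVE_ATOMIC_U64
#include <stdatomic.h>
typedef _Atomic uint64_t stats_u64_t;
#else
typedef uint64_t stats_u64_t;
#endif

/*
 * Accumulate src into *dst.  The caller provides external synchronization,
 * so the read-modify-write need not be one atomic operation; the relaxed
 * load/store are used only because the type itself is atomic on 64-bit
 * platforms.
 */
static void
stats_accum_u64(stats_u64_t *dst, uint64_t src) {
#ifdef HAVE_ATOMIC_U64
	uint64_t cur_dst = atomic_load_explicit(dst, memory_order_relaxed);
	atomic_store_explicit(dst, cur_dst + src, memory_order_relaxed);
#else
	*dst += src;
#endif
}

#ifdef HAVE_ATOMIC_U64
/*
 * Contrast: a counter bumped concurrently by many threads with no lock held
 * does need a single atomic read-modify-write, i.e. a fetch_add.
 */
static void
stats_add_u64_concurrent(stats_u64_t *ctr, uint64_t x) {
	atomic_fetch_add_explicit(ctr, x, memory_order_relaxed);
}
#endif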