From 3025b021b9206478d2edcf017f1df7657d35e615 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Mon, 23 Oct 2023 13:00:10 -0700
Subject: [PATCH] Optimize mutex and bin alignment / locality.

---
 include/jemalloc/internal/arena_structs.h |  5 ++++-
 include/jemalloc/internal/mutex.h         | 23 ++++++++++++-----------
 src/arena.c                               |  7 ++++++-
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/include/jemalloc/internal/arena_structs.h b/include/jemalloc/internal/arena_structs.h
index 6f79be97..803ed25c 100644
--- a/include/jemalloc/internal/arena_structs.h
+++ b/include/jemalloc/internal/arena_structs.h
@@ -98,10 +98,13 @@ struct arena_s {
 	/*
 	 * The arena is allocated alongside its bins; really this is a
	 * dynamically sized array determined by the binshard settings.
+	 * Enforcing cacheline-alignment to minimize the number of cachelines
+	 * touched on the hot paths.
 	 */
 	JEMALLOC_WARN_ON_USAGE("Do not use this field directly. "
 	    "Use `arena_get_bin` instead.")
-	bin_t all_bins[0];
+	JEMALLOC_ALIGNED(CACHELINE)
+	bin_t all_bins[0];
 };
 
 #endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */
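The hunk above forces the trailing all_bins array onto a cacheline boundary; the src/arena.c hunk at the end of the patch rounds sizeof(arena_t) up to CACHELINE before appending the bins and asserts that the last bin still fits in the allocation. Below is a minimal standalone sketch of that layout idea, assuming C11: CACHELINE_SZ, ALIGN_UP, bin_t, and arena_hdr_t are illustrative stand-ins, and a standard flexible array member stands in for jemalloc's zero-length all_bins[0].

#include <assert.h>
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHELINE_SZ 64
#define ALIGN_UP(x, a) (((x) + ((a) - 1)) & ~((size_t)(a) - 1))

typedef struct {
	unsigned char state[48];	/* stand-in for real per-bin state */
} bin_t;

typedef struct {
	unsigned nbins;
	/* Cacheline-align the trailing bin array, as JEMALLOC_ALIGNED(CACHELINE) does. */
	alignas(CACHELINE_SZ) bin_t all_bins[];
} arena_hdr_t;

int
main(void) {
	unsigned nbins = 5;
	/* Header rounded up to a cacheline, then the dynamically sized bin array. */
	size_t sz = ALIGN_UP(sizeof(arena_hdr_t), CACHELINE_SZ) + sizeof(bin_t) * nbins;
	arena_hdr_t *a = aligned_alloc(CACHELINE_SZ, ALIGN_UP(sz, CACHELINE_SZ));
	if (a == NULL) {
		return 1;
	}
	a->nbins = nbins;
	/* The last bin must fit inside the allocation (mirrors the new assert in arena_new). */
	assert((uintptr_t)&a->all_bins[nbins - 1] + sizeof(bin_t) <= (uintptr_t)a + sz);
	printf("all_bins starts at offset %zu\n", offsetof(arena_hdr_t, all_bins));
	free(a);
	return 0;
}

With a 64-byte cacheline the sketch places all_bins at offset 64, so the bins begin on their own cacheline instead of sharing the header's last line.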
diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h
index 46f22aec..75abf298 100644
--- a/include/jemalloc/internal/mutex.h
+++ b/include/jemalloc/internal/mutex.h
@@ -32,6 +32,12 @@ struct malloc_mutex_s {
 			 * unlocking thread).
 			 */
 			mutex_prof_data_t	prof_data;
+			/*
+			 * Hint flag to avoid exclusive cache line contention
+			 * during spin waiting.  Placed along with prof_data
+			 * since it's always modified even with no contention.
+			 */
+			atomic_b_t	locked;
 #ifdef _WIN32
 #  if _WIN32_WINNT >= 0x0600
 			SRWLOCK	lock;
@@ -46,11 +52,6 @@ struct malloc_mutex_s {
 #else
 			pthread_mutex_t	lock;
 #endif
-			/*
-			 * Hint flag to avoid exclusive cache line contention
-			 * during spin waiting
-			 */
-			atomic_b_t	locked;
 		};
 		/*
 		 * We only touch witness when configured w/ debug.  However we
@@ -99,21 +100,21 @@ struct malloc_mutex_s {
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 #  if defined(JEMALLOC_DEBUG)
 #    define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \
 	    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
 #  else
 #    define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \
 	    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 #  endif
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 #  if (defined(JEMALLOC_DEBUG))
 #    define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER, NULL}}, \
 	    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
 #  else
 #    define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER, NULL}}, \
 	    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 #  endif
 
@@ -121,11 +122,11 @@ struct malloc_mutex_s {
 #  define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
 #  if defined(JEMALLOC_DEBUG)
 #    define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER}}, \
 	    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
 #  else
 #    define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER}}, \
 	    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 #  endif
 #endif
diff --git a/src/arena.c b/src/arena.c
index 9a8e5d64..b4ead26a 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -1666,11 +1666,16 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
 		}
 	}
 
-	size_t arena_size = sizeof(arena_t) + sizeof(bin_t) * nbins_total;
+	size_t arena_size = ALIGNMENT_CEILING(sizeof(arena_t), CACHELINE) +
+	    sizeof(bin_t) * nbins_total;
 	arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE);
 	if (arena == NULL) {
 		goto label_error;
 	}
+	JEMALLOC_SUPPRESS_WARN_ON_USAGE(
+	    assert((uintptr_t)&arena->all_bins[nbins_total - 1] + sizeof(bin_t) <=
+	        (uintptr_t)arena + arena_size);
+	)
 
 	atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED);
 	atomic_store_u(&arena->nthreads[1], 0, ATOMIC_RELAXED);
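The mutex.h hunks above move the locked hint next to prof_data: the hint is written on every acquisition and release even without contention, so it is grouped with the profiling data rather than with the lock word itself, and spinners poll it with plain atomic loads instead of probing the lock and pulling its cacheline into exclusive state. Below is a minimal standalone sketch of that spin-hint pattern, assuming C11 atomics and a pthread mutex; hinted_mutex_t, SPIN_LIMIT, and the function names are illustrative, not jemalloc's malloc_mutex API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define SPIN_LIMIT 1000	/* arbitrary bound for the example */

typedef struct {
	/* Hint written on every lock/unlock, kept apart from the lock word. */
	atomic_bool locked;
	pthread_mutex_t lock;
} hinted_mutex_t;

static void
hinted_mutex_lock(hinted_mutex_t *m) {
	if (pthread_mutex_trylock(&m->lock) != 0) {
		/*
		 * Spin on the hint with read-only probes, which keep its cache
		 * line in shared state, instead of hammering the lock itself.
		 */
		for (int i = 0; i < SPIN_LIMIT &&
		    atomic_load_explicit(&m->locked, memory_order_relaxed); i++) {
			/* busy-wait; the hint is advisory only */
		}
		pthread_mutex_lock(&m->lock);
	}
	/* Correctness comes from the pthread mutex; locked is only a hint. */
	atomic_store_explicit(&m->locked, true, memory_order_relaxed);
}

static void
hinted_mutex_unlock(hinted_mutex_t *m) {
	atomic_store_explicit(&m->locked, false, memory_order_relaxed);
	pthread_mutex_unlock(&m->lock);
}

int
main(void) {
	static hinted_mutex_t m = {false, PTHREAD_MUTEX_INITIALIZER};
	hinted_mutex_lock(&m);
	hinted_mutex_unlock(&m);
	return 0;
}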