From b23336af96e6ef9efb47591ce7bf2c8a1eab866b Mon Sep 17 00:00:00 2001
From: Dave Watson
Date: Mon, 26 Nov 2018 08:11:00 -0800
Subject: [PATCH] mutex: fix trylock spin wait contention

If there are 3 or more threads spin-waiting on the same mutex, there
will be excessive exclusive cacheline contention because
pthread_mutex_trylock() immediately tries to CAS in a new value instead
of first checking whether the lock is already held.

This diff adds a 'locked' hint flag; while it is set, waiters spin
without calling trylock(). I don't know of any other portable way to
get the same behavior as pthread_mutex_lock().

This is pretty easy to test via ttest, e.g.

  ./ttest1 500 3 10000 1 100

Throughput is nearly 3x higher with this change.

The regression blames to the mutex profiling changes; however, properly
configured production workloads almost never have 3 or more threads
contending on one mutex, so it is rarely hit there. Still worth fixing.
---
 include/jemalloc/internal/mutex.h | 21 +++++++++++++++------
 src/mutex.c                       |  3 ++-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h
index c530cc9d..8f4a3072 100644
--- a/include/jemalloc/internal/mutex.h
+++ b/include/jemalloc/internal/mutex.h
@@ -43,6 +43,11 @@ struct malloc_mutex_s {
 #else
 	pthread_mutex_t lock;
 #endif
+	/*
+	 * Hint flag to avoid exclusive cache line contention
+	 * during spin waiting.
+	 */
+	atomic_b_t locked;
 };
 /*
  * We only touch witness when configured w/ debug.  However we
@@ -97,21 +102,21 @@ struct malloc_mutex_s {
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 # if defined(JEMALLOC_DEBUG)
 # define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
 	WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
 # else
 # define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
 	WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 # endif
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 # if (defined(JEMALLOC_DEBUG))
 # define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
 	WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
 # else
 # define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
 	WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 # endif
 
@@ -119,11 +124,11 @@ struct malloc_mutex_s {
 # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
 # if defined(JEMALLOC_DEBUG)
 # define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
 	WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
 # else
 # define MALLOC_MUTEX_INITIALIZER \
-	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
+	{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
 	WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 # endif
 #endif
@@ -148,6 +153,7 @@ void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
 static inline void
 malloc_mutex_lock_final(malloc_mutex_t *mutex) {
 	MALLOC_MUTEX_LOCK(mutex);
+	atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
 }
 
 static inline bool
@@ -173,6 +179,7 @@ malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 	witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
 	if (isthreaded) {
 		if (malloc_mutex_trylock_final(mutex)) {
+			atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
 			return true;
 		}
 		mutex_owner_stats_update(tsdn, mutex);
@@ -212,6 +219,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 	if (isthreaded) {
 		if (malloc_mutex_trylock_final(mutex)) {
 			malloc_mutex_lock_slow(mutex);
+			atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
 		}
 		mutex_owner_stats_update(tsdn, mutex);
 	}
@@ -220,6 +228,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 
 static inline void
 malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
 	witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
 	if (isthreaded) {
 		MALLOC_MUTEX_UNLOCK(mutex);
diff --git a/src/mutex.c b/src/mutex.c
index eb6c4c6d..3f920f5b 100644
--- a/src/mutex.c
+++ b/src/mutex.c
@@ -55,7 +55,8 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
 	int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN;
 	do {
 		spin_cpu_spinwait();
-		if (!malloc_mutex_trylock_final(mutex)) {
+		if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED)
+		    && !malloc_mutex_trylock_final(mutex)) {
 			data->n_spin_acquired++;
 			return;
 		}
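
For context outside the jemalloc tree, the change above is a
test-and-test-and-set style spin: waiters poll the relaxed 'locked'
hint with a plain load, which keeps the mutex's cacheline in shared
state across CPUs, and only attempt the CAS-based trylock once the hint
reads false. Below is a minimal standalone sketch of the same pattern
in C11; the hint_mutex_* names, MAX_SPIN, and cpu_spinwait() are
hypothetical stand-ins for jemalloc's malloc_mutex_* machinery, not
real jemalloc APIs.

/*
 * Standalone sketch of the 'locked' hint pattern, assuming C11
 * atomics and pthreads. Error handling is elided.
 * Compile with: cc -std=c11 -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define MAX_SPIN 250	/* stand-in for MALLOC_MUTEX_MAX_SPIN */

typedef struct {
	pthread_mutex_t lock;
	atomic_bool locked;	/* hint only; never needed for correctness */
} hint_mutex_t;

#define HINT_MUTEX_INITIALIZER { PTHREAD_MUTEX_INITIALIZER, false }

static inline void
cpu_spinwait(void) {
#if defined(__x86_64__) || defined(__i386__)
	__asm__ __volatile__("pause");	/* ease pipeline/power while spinning */
#endif
}

static void
hint_mutex_lock(hint_mutex_t *m) {
	if (pthread_mutex_trylock(&m->lock) != 0) {
		/*
		 * Spin phase: read the hint with a relaxed load (cacheline
		 * stays shared) and only trylock when it reads false.
		 */
		for (int cnt = 0; cnt < MAX_SPIN; cnt++) {
			cpu_spinwait();
			if (!atomic_load_explicit(&m->locked,
			    memory_order_relaxed) &&
			    pthread_mutex_trylock(&m->lock) == 0) {
				goto acquired;
			}
		}
		/* Spinning failed; fall back to a blocking lock. */
		pthread_mutex_lock(&m->lock);
	}
acquired:
	/* Publish the hint; relaxed is fine since it is advisory only. */
	atomic_store_explicit(&m->locked, true, memory_order_relaxed);
}

static void
hint_mutex_unlock(hint_mutex_t *m) {
	/* Clear the hint before the real unlock, as the patch does. */
	atomic_store_explicit(&m->locked, false, memory_order_relaxed);
	pthread_mutex_unlock(&m->lock);
}

Because the hint is read and written with relaxed ordering it may be
momentarily stale in either direction; mutual exclusion still comes
entirely from the underlying pthread mutex. One deviation from the
patch: jemalloc's malloc_mutex_lock skips the hint store on the
uncontended fast path (only the slow path, lock_final, and a failed
trylock store it), saving a write when there is no contention; the
sketch stores unconditionally for simplicity.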