diff --git a/include/jemalloc/internal/mutex_inlines.h b/include/jemalloc/internal/mutex_inlines.h
index c0c3cfe9..cf0ce23a 100644
--- a/include/jemalloc/internal/mutex_inlines.h
+++ b/include/jemalloc/internal/mutex_inlines.h
@@ -1,31 +1,42 @@
 #ifndef JEMALLOC_INTERNAL_MUTEX_INLINES_H
 #define JEMALLOC_INTERNAL_MUTEX_INLINES_H
 
+void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
+
 #ifndef JEMALLOC_ENABLE_INLINE
 void malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex);
+bool malloc_mutex_trylock(malloc_mutex_t *mutex);
 void malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex);
 void malloc_mutex_assert_owner(tsdn_t *tsdn, malloc_mutex_t *mutex);
 void malloc_mutex_assert_not_owner(tsdn_t *tsdn, malloc_mutex_t *mutex);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MUTEX_C_))
+JEMALLOC_INLINE void
+malloc_mutex_lock_final(malloc_mutex_t *mutex) {
+	MALLOC_MUTEX_LOCK(mutex);
+}
+
+/* Trylock: return false if the lock is successfully acquired. */
+JEMALLOC_INLINE bool
+malloc_mutex_trylock(malloc_mutex_t *mutex) {
+	return MALLOC_MUTEX_TRYLOCK(mutex);
+}
+
 JEMALLOC_INLINE void
 malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 	witness_assert_not_owner(tsdn, &mutex->witness);
 	if (isthreaded) {
-#ifdef _WIN32
-# if _WIN32_WINNT >= 0x0600
-		AcquireSRWLockExclusive(&mutex->lock);
-# else
-		EnterCriticalSection(&mutex->lock);
-# endif
-#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-		os_unfair_lock_lock(&mutex->lock);
-#elif (defined(JEMALLOC_OSSPIN))
-		OSSpinLockLock(&mutex->lock);
-#else
-		pthread_mutex_lock(&mutex->lock);
-#endif
+		if (malloc_mutex_trylock(mutex)) {
+			malloc_mutex_lock_slow(mutex);
+		}
+		/* We own the lock now. Update a few counters. */
+		lock_prof_data_t *data = &mutex->prof_data;
+		data->n_lock_ops++;
+		if (data->prev_owner != tsdn) {
+			data->prev_owner = tsdn;
+			data->n_owner_switches++;
+		}
 	}
 	witness_lock(tsdn, &mutex->witness);
 }
@@ -34,19 +45,7 @@ JEMALLOC_INLINE void
 malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 	witness_unlock(tsdn, &mutex->witness);
 	if (isthreaded) {
-#ifdef _WIN32
-# if _WIN32_WINNT >= 0x0600
-		ReleaseSRWLockExclusive(&mutex->lock);
-# else
-		LeaveCriticalSection(&mutex->lock);
-# endif
-#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-		os_unfair_lock_unlock(&mutex->lock);
-#elif (defined(JEMALLOC_OSSPIN))
-		OSSpinLockUnlock(&mutex->lock);
-#else
-		pthread_mutex_unlock(&mutex->lock);
-#endif
+		MALLOC_MUTEX_UNLOCK(mutex);
 	}
 }
 
diff --git a/include/jemalloc/internal/mutex_structs.h b/include/jemalloc/internal/mutex_structs.h
index c34c1d47..7065c997 100644
--- a/include/jemalloc/internal/mutex_structs.h
+++ b/include/jemalloc/internal/mutex_structs.h
@@ -1,9 +1,50 @@
 #ifndef JEMALLOC_INTERNAL_MUTEX_STRUCTS_H
 #define JEMALLOC_INTERNAL_MUTEX_STRUCTS_H
 
+struct lock_prof_data_s {
+	/*
+	 * Counters touched on the slow path, i.e. when there is lock
+	 * contention. We update them once we have the lock.
+	 */
+	/* Total time spent waiting on this lock. */
+	nstime_t tot_wait_time;
+	/* Max time spent on a single lock operation. */
+	nstime_t max_wait_time;
+	/* # of times we have to wait for this lock (after spinning). */
+	uint64_t n_wait_times;
+	/* # of times we acquired the lock through local spinning. */
+	uint64_t n_spin_acquired;
+	/* Max # of threads waiting for the lock at the same time. */
+	uint32_t max_n_thds;
+	/* Current # of threads waiting on the lock. Updated atomically. */
+	uint32_t n_waiting_thds;
+
+	/*
+	 * Data touched on the fast path. These are modified right after we
+	 * grab the lock, so they're placed closest to the end (i.e. right
+	 * before the lock) so that we have a higher chance of them being on
+	 * the same cacheline as the lock.
+	 */
+	/* # of times the new lock holder is different from the previous one. */
+	uint64_t n_owner_switches;
+	/* Previous lock holder, to facilitate n_owner_switches. */
+	tsdn_t *prev_owner;
+	/* # of lock() operations in total. */
+	uint64_t n_lock_ops;
+};
+
 struct malloc_mutex_s {
 	union {
 		struct {
+			/*
+			 * prof_data is defined first to reduce cacheline
+			 * bouncing: the data is not touched by the lock holder
+			 * during unlocking, while it might be modified by
+			 * contenders. Having it before the lock itself could
+			 * avoid prefetching a modified cacheline (for the
+			 * unlocking thread).
+			 */
+			lock_prof_data_t prof_data;
 #ifdef _WIN32
 # if _WIN32_WINNT >= 0x0600
 			SRWLOCK lock;
@@ -22,7 +63,7 @@ struct malloc_mutex_s {
 #endif
 	};
 	/*
-	 * We only touch witness when configured w/ debug.  However we
+	 * We only touch witness when configured w/ debug. However we
 	 * keep the field in a union when !debug so that we don't have
 	 * to pollute the code base with #ifdefs, while avoid paying the
 	 * memory cost.
diff --git a/include/jemalloc/internal/mutex_types.h b/include/jemalloc/internal/mutex_types.h
index b7e3a7a1..d7c7f04f 100644
--- a/include/jemalloc/internal/mutex_types.h
+++ b/include/jemalloc/internal/mutex_types.h
@@ -1,31 +1,63 @@
 #ifndef JEMALLOC_INTERNAL_MUTEX_TYPES_H
 #define JEMALLOC_INTERNAL_MUTEX_TYPES_H
 
+typedef struct lock_prof_data_s lock_prof_data_t;
 typedef struct malloc_mutex_s malloc_mutex_t;
 
+#ifdef _WIN32
+# if _WIN32_WINNT >= 0x0600
+# define MALLOC_MUTEX_LOCK(m) AcquireSRWLockExclusive(&(m)->lock)
+# define MALLOC_MUTEX_UNLOCK(m) ReleaseSRWLockExclusive(&(m)->lock)
+# define MALLOC_MUTEX_TRYLOCK(m) (!TryAcquireSRWLockExclusive(&(m)->lock))
+# else
+# define MALLOC_MUTEX_LOCK(m) EnterCriticalSection(&(m)->lock)
+# define MALLOC_MUTEX_UNLOCK(m) LeaveCriticalSection(&(m)->lock)
+# define MALLOC_MUTEX_TRYLOCK(m) (!TryEnterCriticalSection(&(m)->lock))
+# endif
+#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
+# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock)
+# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock)
+# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
+#elif (defined(JEMALLOC_OSSPIN))
+# define MALLOC_MUTEX_LOCK(m) OSSpinLockLock(&(m)->lock)
+# define MALLOC_MUTEX_UNLOCK(m) OSSpinLockUnlock(&(m)->lock)
+# define MALLOC_MUTEX_TRYLOCK(m) (!OSSpinLockTry(&(m)->lock))
+#else
+# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock)
+# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock)
+# define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0)
+#endif
+
+#define LOCK_PROF_DATA_INITIALIZER \
+    {NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0, 0, 0, NULL, 0}
+
 #ifdef _WIN32
 # define MALLOC_MUTEX_INITIALIZER
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 # define MALLOC_MUTEX_INITIALIZER \
-    {{{OS_UNFAIR_LOCK_INIT}}, WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+    {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
+    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 #elif (defined(JEMALLOC_OSSPIN))
 # define MALLOC_MUTEX_INITIALIZER \
-    {{{0}}, WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+    {{{LOCK_PROF_DATA_INITIALIZER, 0}}, \
+    WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 # define MALLOC_MUTEX_INITIALIZER \
-    {{{PTHREAD_MUTEX_INITIALIZER, NULL}}, \
+    {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
     WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 #else
+/* TODO: get rid of adaptive mutex once we do our own spin. */
 # if (defined(JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) && \
     defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP))
 # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP
 # define MALLOC_MUTEX_INITIALIZER \
-    {{{PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP}}, \
+    {{{LOCK_PROF_DATA_INITIALIZER, \
+    PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP}}, \
     WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 # else
 # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
 # define MALLOC_MUTEX_INITIALIZER \
-    {{{PTHREAD_MUTEX_INITIALIZER}}, \
+    {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
     WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
 # endif
 #endif
diff --git a/include/jemalloc/internal/nstime_types.h b/include/jemalloc/internal/nstime_types.h
index d6039e03..6e7e74cf 100644
--- a/include/jemalloc/internal/nstime_types.h
+++ b/include/jemalloc/internal/nstime_types.h
@@ -6,4 +6,6 @@ typedef struct nstime_s nstime_t;
 /* Maximum supported number of seconds (~584 years). */
 #define NSTIME_SEC_MAX KQU(18446744072)
 
+#define NSTIME_ZERO_INITIALIZER {0}
+
 #endif /* JEMALLOC_INTERNAL_NSTIME_TYPES_H */
diff --git a/src/mutex.c b/src/mutex.c
index f1aa155e..2b80a9d9 100644
--- a/src/mutex.c
+++ b/src/mutex.c
@@ -65,6 +65,49 @@ JEMALLOC_EXPORT int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
     void *(calloc_cb)(size_t, size_t));
 #endif
 
+void
+malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
+	lock_prof_data_t *data = &mutex->prof_data;
+	bool spin_success = false;
+
+	{	/* TODO: a smart spin policy. */
+		if (!malloc_mutex_trylock(mutex)) {
+			spin_success = true;
+			goto label_locked;
+		}
+	}
+
+	nstime_t now, before;
+	uint32_t n_thds;
+	nstime_init(&now, 0);
+	nstime_update(&now);
+	n_thds = atomic_add_u32(&data->n_waiting_thds, 1);
+	/* One last try, as the above two calls may take quite some cycles. */
+	if (!malloc_mutex_trylock(mutex)) {
+		spin_success = true;
+		atomic_sub_u32(&data->n_waiting_thds, 1);
+		goto label_locked;
+	}
+	nstime_copy(&before, &now);
+	malloc_mutex_lock_final(mutex);
+	atomic_sub_u32(&data->n_waiting_thds, 1);
+	nstime_update(&now);
+	nstime_subtract(&now, &before);
+label_locked:
+	if (spin_success) {
+		data->n_spin_acquired++;
+	} else {
+		data->n_wait_times++;
+		nstime_add(&data->tot_wait_time, &now);
+		if (nstime_compare(&now, &data->max_wait_time) > 0) {
+			nstime_copy(&data->max_wait_time, &now);
+		}
+		if (n_thds > data->max_n_thds) {
+			data->max_n_thds = n_thds;
+		}
+	}
+}
+
 bool
 malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
     witness_rank_t rank) {
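
Note (not part of the patch): the following is a minimal standalone C sketch of the locking pattern the patch introduces -- trylock first on the fast path, and only fall into a profiled slow path (which times the wait and tracks contention) when the trylock fails. It uses plain pthreads, C11 atomics, and clock_gettime() in place of jemalloc's MALLOC_MUTEX_* macros, nstime_t, and atomic_add_u32(); names such as prof_mutex_t and prof_lock() are illustrative only, and the spin policy, owner tracking, and witness integration are omitted.

/* Sketch of trylock-first locking with contention profiling (not jemalloc code). */
#define _POSIX_C_SOURCE 200809L
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

typedef struct {
	uint64_t n_lock_ops;	/* Total lock() calls. */
	uint64_t n_wait_times;	/* Times we had to block. */
	uint64_t tot_wait_ns;	/* Total time spent blocked. */
	uint64_t max_wait_ns;	/* Worst single wait. */
	atomic_uint_fast32_t n_waiting_thds;	/* Current waiters; needs atomics. */
	uint32_t max_n_thds;	/* Max concurrent waiters observed. */
	pthread_mutex_t lock;
} prof_mutex_t;

static uint64_t
now_ns(void) {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

static void
prof_lock_slow(prof_mutex_t *m) {
	/* Register as a waiter, then time the blocking acquire. */
	uint32_t n_thds = (uint32_t)atomic_fetch_add(&m->n_waiting_thds, 1) + 1;
	uint64_t before = now_ns();
	pthread_mutex_lock(&m->lock);
	atomic_fetch_sub(&m->n_waiting_thds, 1);
	uint64_t waited = now_ns() - before;

	/* The counters below are only updated while holding the lock. */
	m->n_wait_times++;
	m->tot_wait_ns += waited;
	if (waited > m->max_wait_ns) {
		m->max_wait_ns = waited;
	}
	if (n_thds > m->max_n_thds) {
		m->max_n_thds = n_thds;
	}
}

static void
prof_lock(prof_mutex_t *m) {
	/* Fast path: an uncontended trylock avoids any timing overhead. */
	if (pthread_mutex_trylock(&m->lock) != 0) {
		prof_lock_slow(m);
	}
	m->n_lock_ops++;	/* We own the lock here. */
}

static void
prof_unlock(prof_mutex_t *m) {
	pthread_mutex_unlock(&m->lock);
}

int
main(void) {
	prof_mutex_t m = {.lock = PTHREAD_MUTEX_INITIALIZER};
	prof_lock(&m);
	prof_unlock(&m);
	printf("ops=%llu waits=%llu\n",
	    (unsigned long long)m.n_lock_ops,
	    (unsigned long long)m.n_wait_times);
	return 0;
}

As in the patch, only the waiter count needs atomic updates; every other counter is touched after the lock is acquired, so the fast path stays a single trylock plus a few plain stores.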