First stage of mutex profiling.

Switched to trylock and update counters based on state.
Qi Wang, 2017-02-23 14:18:07 -08:00 (committed by Qi Wang)
parent 32e7cf51cd
commit 6309df628f
5 changed files with 149 additions and 32 deletions


@@ -1,31 +1,42 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_INLINES_H
#define JEMALLOC_INTERNAL_MUTEX_INLINES_H
void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
#ifndef JEMALLOC_ENABLE_INLINE
void malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex);
bool malloc_mutex_trylock(malloc_mutex_t *mutex);
void malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex);
void malloc_mutex_assert_owner(tsdn_t *tsdn, malloc_mutex_t *mutex);
void malloc_mutex_assert_not_owner(tsdn_t *tsdn, malloc_mutex_t *mutex);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MUTEX_C_))
JEMALLOC_INLINE void
malloc_mutex_lock_final(malloc_mutex_t *mutex) {
MALLOC_MUTEX_LOCK(mutex);
}
/* Trylock: return false if the lock is successfully acquired. */
JEMALLOC_INLINE bool
malloc_mutex_trylock(malloc_mutex_t *mutex) {
return MALLOC_MUTEX_TRYLOCK(mutex);
}
JEMALLOC_INLINE void
malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_assert_not_owner(tsdn, &mutex->witness);
if (isthreaded) {
-#ifdef _WIN32
-# if _WIN32_WINNT >= 0x0600
-AcquireSRWLockExclusive(&mutex->lock);
-# else
-EnterCriticalSection(&mutex->lock);
-# endif
-#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-os_unfair_lock_lock(&mutex->lock);
-#elif (defined(JEMALLOC_OSSPIN))
-OSSpinLockLock(&mutex->lock);
-#else
-pthread_mutex_lock(&mutex->lock);
-#endif
+if (malloc_mutex_trylock(mutex)) {
+malloc_mutex_lock_slow(mutex);
+}
+/* We own the lock now. Update a few counters. */
+lock_prof_data_t *data = &mutex->prof_data;
+data->n_lock_ops++;
+if (data->prev_owner != tsdn) {
+data->prev_owner = tsdn;
+data->n_owner_switches++;
+}
}
witness_lock(tsdn, &mutex->witness);
}
@@ -34,19 +45,7 @@ JEMALLOC_INLINE void
malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_unlock(tsdn, &mutex->witness);
if (isthreaded) {
-#ifdef _WIN32
-# if _WIN32_WINNT >= 0x0600
-ReleaseSRWLockExclusive(&mutex->lock);
-# else
-LeaveCriticalSection(&mutex->lock);
-# endif
-#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-os_unfair_lock_unlock(&mutex->lock);
-#elif (defined(JEMALLOC_OSSPIN))
-OSSpinLockUnlock(&mutex->lock);
-#else
-pthread_mutex_unlock(&mutex->lock);
-#endif
+MALLOC_MUTEX_UNLOCK(mutex);
}
}
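The change above is the core of the commit: the fast path takes a single trylock and only enters the slow path on contention, and the counters are bumped only while the lock is already held, so they need no extra synchronization. A minimal standalone sketch of the same pattern, using plain pthreads and hypothetical names rather than jemalloc's actual types:

    #include <pthread.h>
    #include <stdint.h>

    /* Illustrative analogue of lock_prof_data_t; names are hypothetical. */
    typedef struct {
        uint64_t n_lock_ops;        /* total lock() operations */
        uint64_t n_owner_switches;  /* holder differs from the previous one */
        const void *prev_owner;     /* opaque id of the previous holder */
    } prof_counters_t;

    typedef struct {
        prof_counters_t prof;       /* profiling data placed before the lock */
        pthread_mutex_t lock;
    } prof_mutex_t;

    /* Slow path: block on the lock (wait-time accounting omitted here). */
    static void prof_mutex_lock_slow(prof_mutex_t *m) {
        pthread_mutex_lock(&m->lock);
    }

    /* Fast path: trylock first, fall back to the slow path on contention,
     * then update the counters while holding the lock. */
    static void prof_mutex_lock(prof_mutex_t *m, const void *self) {
        if (pthread_mutex_trylock(&m->lock) != 0) {
            prof_mutex_lock_slow(m);
        }
        m->prof.n_lock_ops++;
        if (m->prof.prev_owner != self) {
            m->prof.prev_owner = self;
            m->prof.n_owner_switches++;
        }
    }

Because only the current holder writes n_lock_ops, prev_owner, and n_owner_switches, plain increments suffice; only the waiting-thread counter touched by the slow path needs atomics.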


@@ -1,9 +1,50 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_STRUCTS_H
#define JEMALLOC_INTERNAL_MUTEX_STRUCTS_H
struct lock_prof_data_s {
/*
* Counters touched on the slow path, i.e. when there is lock
* contention. We update them once we have the lock.
*/
/* Total time spent waiting on this lock. */
nstime_t tot_wait_time;
/* Max time spent on a single lock operation. */
nstime_t max_wait_time;
/* # of times we had to wait for this lock (after spinning). */
uint64_t n_wait_times;
/* # of times acquired the lock through local spinning. */
uint64_t n_spin_acquired;
/* Max # of threads waiting for the lock at the same time. */
uint32_t max_n_thds;
/* Current # of threads waiting on the lock. Atomically synced. */
uint32_t n_waiting_thds;
/*
* Data touched on the fast path. These are modified right after we
* grab the lock, so they are placed closest to the end (i.e. right before
* the lock) so that we have a higher chance of them being on the same
* cacheline.
*/
/* # of times the new lock holder is different from the previous one. */
uint64_t n_owner_switches;
/* Previous lock holder, to facilitate n_owner_switches. */
tsdn_t *prev_owner;
/* # of lock() operations in total. */
uint64_t n_lock_ops;
};
struct malloc_mutex_s {
union {
struct {
/*
* prof_data is defined first to reduce cacheline
* bouncing: the data is not touched by the lock holder
* during unlocking, while it might be modified by
* contenders. Having it before the lock itself could
* avoid prefetching a modified cacheline (for the
* unlocking thread).
*/
lock_prof_data_t prof_data;
#ifdef _WIN32
# if _WIN32_WINNT >= 0x0600
SRWLOCK lock;
@@ -22,7 +63,7 @@ struct malloc_mutex_s {
#endif
};
/*
* We only touch witness when configured w/ debug. However we
* keep the field in a union when !debug so that we don't have
* to pollute the code base with #ifdefs, while avoid paying the
* memory cost.


@@ -1,31 +1,63 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_TYPES_H
#define JEMALLOC_INTERNAL_MUTEX_TYPES_H
typedef struct lock_prof_data_s lock_prof_data_t;
typedef struct malloc_mutex_s malloc_mutex_t;
#ifdef _WIN32
# if _WIN32_WINNT >= 0x0600
# define MALLOC_MUTEX_LOCK(m) AcquireSRWLockExclusive(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) ReleaseSRWLockExclusive(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!TryAcquireSRWLockExclusive(&(m)->lock))
# else
# define MALLOC_MUTEX_LOCK(m) EnterCriticalSection(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) LeaveCriticalSection(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!TryEnterCriticalSection(&(m)->lock))
# endif
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
#elif (defined(JEMALLOC_OSSPIN))
# define MALLOC_MUTEX_LOCK(m) OSSpinLockLock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) OSSpinLockUnlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!OSSpinLockTry(&(m)->lock))
#else
# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0)
#endif
#define LOCK_PROF_DATA_INITIALIZER \
{NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0, 0, 0, NULL, 0}
#ifdef _WIN32
# define MALLOC_MUTEX_INITIALIZER
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
# define MALLOC_MUTEX_INITIALIZER \
-{{{OS_UNFAIR_LOCK_INIT}}, WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
+WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
#elif (defined(JEMALLOC_OSSPIN))
# define MALLOC_MUTEX_INITIALIZER \
-{{{0}}, WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+{{{LOCK_PROF_DATA_INITIALIZER, 0}}, \
+WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
# define MALLOC_MUTEX_INITIALIZER \
-{{{PTHREAD_MUTEX_INITIALIZER, NULL}}, \
+{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
#else
/* TODO: get rid of adaptive mutex once we do our own spin. */
# if (defined(JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) && \
defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP))
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP
# define MALLOC_MUTEX_INITIALIZER \
-{{{PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP}}, \
+{{{LOCK_PROF_DATA_INITIALIZER, \
+PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# else
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
# define MALLOC_MUTEX_INITIALIZER \
-{{{PTHREAD_MUTEX_INITIALIZER}}, \
+{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
#endif
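Two things worth noting about the new macros. First, every MALLOC_MUTEX_TRYLOCK variant is normalized to the convention documented on malloc_mutex_trylock: false means the lock was acquired. The underlying primitives disagree (pthread_mutex_trylock returns 0 on success, while TryAcquireSRWLockExclusive, TryEnterCriticalSection, os_unfair_lock_trylock, and OSSpinLockTry return nonzero on success), hence the negation on some variants and the "!= 0" on the pthread one. Second, LOCK_PROF_DATA_INITIALIZER supplies one zero value per field of lock_prof_data_s in declaration order, with NSTIME_ZERO_INITIALIZER covering the two nstime_t fields. A tiny usage sketch of the trylock convention (sketch only; m is assumed to be a malloc_mutex_t *):

    /* Mirrors the fast-path shape used in malloc_mutex_lock(). */
    if (MALLOC_MUTEX_TRYLOCK(m)) {
        /* Trylock returned true, i.e. it failed; block instead. */
        MALLOC_MUTEX_LOCK(m);
    }
    /* ... critical section ... */
    MALLOC_MUTEX_UNLOCK(m);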


@@ -6,4 +6,6 @@ typedef struct nstime_s nstime_t;
/* Maximum supported number of seconds (~584 years). */
#define NSTIME_SEC_MAX KQU(18446744072)
#define NSTIME_ZERO_INITIALIZER {0}
#endif /* JEMALLOC_INTERNAL_NSTIME_TYPES_H */


@@ -65,6 +65,49 @@ JEMALLOC_EXPORT int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
void *(calloc_cb)(size_t, size_t));
#endif
void
malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
lock_prof_data_t *data = &mutex->prof_data;
bool spin_success = false;
{ // TODO: a smart spin policy
if (!malloc_mutex_trylock(mutex)) {
spin_success = true;
goto label_locked;
}
}
nstime_t now, before;
uint32_t n_thds;
nstime_init(&now, 0);
nstime_update(&now);
n_thds = atomic_add_u32(&data->n_waiting_thds, 1);
/* One last try, as the above two calls may take quite some cycles. */
if (!malloc_mutex_trylock(mutex)) {
spin_success = true;
atomic_sub_u32(&data->n_waiting_thds, 1);
goto label_locked;
}
nstime_copy(&before, &now);
malloc_mutex_lock_final(mutex);
atomic_sub_u32(&data->n_waiting_thds, 1);
nstime_update(&now);
nstime_subtract(&now, &before);
label_locked:
if (spin_success) {
data->n_spin_acquired++;
} else {
data->n_wait_times++;
nstime_add(&data->tot_wait_time, &now);
if (nstime_compare(&now, &data->max_wait_time) > 0) {
nstime_copy(&data->max_wait_time, &now);
}
if (n_thds > data->max_n_thds) {
data->max_n_thds = n_thds;
}
}
}
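The slow path charges only the time actually spent blocked: it samples the clock, registers itself as a waiter with atomic_add_u32, makes one last trylock attempt (since the clock and atomic calls cost cycles), and only then blocks and measures the elapsed wait; the waiter count returned by atomic_add_u32 is what feeds max_n_thds. A rough standalone analogue of the timing portion, using clock_gettime(CLOCK_MONOTONIC) instead of jemalloc's nstime helpers (all names here are illustrative):

    #include <pthread.h>
    #include <stdint.h>
    #include <time.h>

    static uint64_t monotonic_ns(void) {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
    }

    /* Block on the lock; return the time spent waiting, in nanoseconds
     * (0 if the final trylock succeeded without blocking). */
    static uint64_t lock_and_measure(pthread_mutex_t *lock) {
        if (pthread_mutex_trylock(lock) == 0) {
            return 0; /* counts as a "spin" acquisition in the real code */
        }
        uint64_t before = monotonic_ns();
        pthread_mutex_lock(lock);
        return monotonic_ns() - before;
    }

The caller would then add the measured wait to tot_wait_time, bump n_wait_times, and raise max_wait_time when the new wait exceeds it, as the label_locked block above does.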
bool
malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
witness_rank_t rank) {