From 6da5418ded9170b087c35960e0010006430117c1 Mon Sep 17 00:00:00 2001
From: Jason Evans
Date: Fri, 23 Mar 2012 18:05:51 -0700
Subject: [PATCH] Remove ephemeral mutexes.

Remove ephemeral mutexes from the prof machinery, and remove
malloc_mutex_destroy().  This simplifies mutex management on systems
that call malloc()/free() inside pthread_mutex_{init,destroy}().

Add atomic_*_u() for operations on unsigned values.

Fix prof_printf() to call malloc_vsnprintf() rather than
malloc_snprintf().
---
 include/jemalloc/internal/atomic.h | 30 ++++++++++++++
 include/jemalloc/internal/mutex.h  |  1 -
 include/jemalloc/internal/prof.h   | 16 +++++---
 src/jemalloc.c                     | 10 ++---
 src/mutex.c                        | 12 ------
 src/prof.c                         | 63 ++++++++++++++++++++----------
 6 files changed, 89 insertions(+), 43 deletions(-)

diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h
index afeb9cb7..7a9cb61e 100644
--- a/include/jemalloc/internal/atomic.h
+++ b/include/jemalloc/internal/atomic.h
@@ -12,6 +12,7 @@
 #define atomic_read_uint64(p) atomic_add_uint64(p, 0)
 #define atomic_read_uint32(p) atomic_add_uint32(p, 0)
 #define atomic_read_z(p) atomic_add_z(p, 0)
+#define atomic_read_u(p) atomic_add_u(p, 0)
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
@@ -24,6 +25,8 @@ uint32_t atomic_add_uint32(uint32_t *p, uint32_t x);
 uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x);
 size_t atomic_add_z(size_t *p, size_t x);
 size_t atomic_sub_z(size_t *p, size_t x);
+unsigned atomic_add_u(unsigned *p, unsigned x);
+unsigned atomic_sub_u(unsigned *p, unsigned x);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
@@ -192,6 +195,33 @@ atomic_sub_z(size_t *p, size_t x)
     (uint32_t)-((int32_t)x)));
 #endif
 }
+
+/******************************************************************************/
+/* unsigned operations. */
+JEMALLOC_INLINE unsigned
+atomic_add_u(unsigned *p, unsigned x)
+{
+
+#if (LG_SIZEOF_INT == 3)
+    return ((unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)x));
+#elif (LG_SIZEOF_INT == 2)
+    return ((unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)x));
+#endif
+}
+
+JEMALLOC_INLINE unsigned
+atomic_sub_u(unsigned *p, unsigned x)
+{
+
+#if (LG_SIZEOF_INT == 3)
+    return ((unsigned)atomic_add_uint64((uint64_t *)p,
+        (uint64_t)-((int64_t)x)));
+#elif (LG_SIZEOF_INT == 2)
+    return ((unsigned)atomic_add_uint32((uint32_t *)p,
+        (uint32_t)-((int32_t)x)));
+#endif
+}
+/******************************************************************************/
 
 #endif
 
 #endif /* JEMALLOC_H_INLINES */
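[Note: the hand-rolled LG_SIZEOF_INT dispatch above predates C11 atomics. A C11 sketch of the intended semantics, not part of the patch: both operations return the value of *p after the update, and atomic_sub_u() is implemented as addition of the two's-complement negation.]

    #include <assert.h>
    #include <stdatomic.h>

    static _Atomic unsigned counter;

    int
    main(void)
    {
        /* atomic_fetch_add() returns the OLD value; the jemalloc API
         * returns the NEW value, so add x back in. */
        unsigned after_add = atomic_fetch_add(&counter, 5u) + 5u;
        assert(after_add == 5);

        /* Subtraction as addition of the two's-complement negation,
         * the same trick atomic_sub_u() uses. */
        unsigned after_sub = atomic_fetch_add(&counter, -2u) - 2u;
        assert(after_sub == 3);

        return (0);
    }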
diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h
index 9d136585..10637e92 100644
--- a/include/jemalloc/internal/mutex.h
+++ b/include/jemalloc/internal/mutex.h
@@ -28,7 +28,6 @@ extern bool isthreaded;
 #endif
 
 bool malloc_mutex_init(malloc_mutex_t *mutex);
-void malloc_mutex_destroy(malloc_mutex_t *mutex);
 void malloc_mutex_prefork(malloc_mutex_t *mutex);
 void malloc_mutex_postfork_parent(malloc_mutex_t *mutex);
 void malloc_mutex_postfork_child(malloc_mutex_t *mutex);
diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h
index 231a3876..34929e7e 100644
--- a/include/jemalloc/internal/prof.h
+++ b/include/jemalloc/internal/prof.h
@@ -31,6 +31,12 @@ typedef struct prof_tdata_s prof_tdata_t;
 /* Size of stack-allocated buffer used by prof_printf(). */
 #define PROF_PRINTF_BUFSIZE 128
 
+/*
+ * Number of mutexes shared among all ctx's.  No space is allocated for these
+ * unless profiling is enabled, so it's okay to over-provision.
+ */
+#define PROF_NCTX_LOCKS 1024
+
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
@@ -108,7 +114,7 @@ struct prof_ctx_s {
     prof_bt_t *bt;
 
     /* Protects cnt_merged and cnts_ql. */
-    malloc_mutex_t lock;
+    malloc_mutex_t *lock;
 
     /* Temporary storage for summation during dump. */
     prof_cnt_t cnt_summed;
@@ -444,10 +450,10 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
              * It's too late to propagate OOM for this realloc(),
              * so operate directly on old_cnt->ctx->cnt_merged.
              */
-            malloc_mutex_lock(&old_ctx->lock);
+            malloc_mutex_lock(old_ctx->lock);
             old_ctx->cnt_merged.curobjs--;
             old_ctx->cnt_merged.curbytes -= old_size;
-            malloc_mutex_unlock(&old_ctx->lock);
+            malloc_mutex_unlock(old_ctx->lock);
             told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
         }
     } else
@@ -516,10 +522,10 @@ prof_free(const void *ptr, size_t size)
              * OOM during free() cannot be propagated, so operate
              * directly on cnt->ctx->cnt_merged.
              */
-            malloc_mutex_lock(&ctx->lock);
+            malloc_mutex_lock(ctx->lock);
             ctx->cnt_merged.curobjs--;
             ctx->cnt_merged.curbytes -= size;
-            malloc_mutex_unlock(&ctx->lock);
+            malloc_mutex_unlock(ctx->lock);
         }
     }
 }
diff --git a/src/jemalloc.c b/src/jemalloc.c
index f9451796..b3e898c1 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -638,11 +638,6 @@ malloc_init_hard(void)
         return (true);
     }
 
-    if (config_prof && prof_boot2()) {
-        malloc_mutex_unlock(&init_lock);
-        return (true);
-    }
-
     if (arenas_tsd_boot()) {
         malloc_mutex_unlock(&init_lock);
         return (true);
@@ -653,6 +648,11 @@ malloc_init_hard(void)
         return (true);
     }
 
+    if (config_prof && prof_boot2()) {
+        malloc_mutex_unlock(&init_lock);
+        return (true);
+    }
+
     /* Get number of CPUs. */
     malloc_initializer = pthread_self();
     malloc_mutex_unlock(&init_lock);
diff --git a/src/mutex.c b/src/mutex.c
index 243b7129..07d2a033 100644
--- a/src/mutex.c
+++ b/src/mutex.c
@@ -81,18 +81,6 @@ malloc_mutex_init(malloc_mutex_t *mutex)
     return (false);
 }
 
-void
-malloc_mutex_destroy(malloc_mutex_t *mutex)
-{
-
-#ifndef JEMALLOC_OSSPIN
-    if (pthread_mutex_destroy(mutex) != 0) {
-        malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
-        abort();
-    }
-#endif
-}
-
 void
 malloc_mutex_prefork(malloc_mutex_t *mutex)
 {
diff --git a/src/prof.c b/src/prof.c
index ba0b64e1..bc21d894 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -28,6 +28,16 @@ char opt_prof_prefix[PATH_MAX + 1];
 uint64_t prof_interval;
 bool prof_promote;
 
+/*
+ * Table of mutexes that are shared among ctx's.  These are leaf locks, so
+ * there is no problem with using them for more than one ctx at the same time.
+ * The primary motivation for this sharing though is that ctx's are ephemeral,
+ * and destroying mutexes causes complications for systems that allocate when
+ * creating/destroying mutexes.
+ */
+static malloc_mutex_t *ctx_locks;
+static unsigned cum_ctxs; /* Atomic counter. */
+
 /*
  * Global hash of (prof_bt_t *)-->(prof_ctx_t *).  This is the master data
  * structure that knows about all backtraces currently captured.
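[Note: the prof.c hunk above is the heart of the change: instead of embedding a mutex in each ephemeral ctx, every ctx borrows one of PROF_NCTX_LOCKS permanently initialized leaf locks (lock striping), so no mutex is ever destroyed. A minimal standalone sketch of the scheme, using POSIX threads and a GCC __sync builtin in place of jemalloc's internal malloc_mutex_t and atomic_add_u(); the *_demo names are illustrative, not jemalloc API.]

    #include <pthread.h>
    #include <stdbool.h>

    #define NCTX_LOCKS 1024 /* mirrors PROF_NCTX_LOCKS */

    static pthread_mutex_t ctx_locks_demo[NCTX_LOCKS];
    static unsigned cum_ctxs_demo; /* atomic counter */

    /* One-time setup, analogous to the loop added to prof_boot2() below. */
    static bool
    ctx_locks_boot_demo(void)
    {
        unsigned i;

        for (i = 0; i < NCTX_LOCKS; i++) {
            if (pthread_mutex_init(&ctx_locks_demo[i], NULL) != 0)
                return (true); /* error */
        }
        return (false);
    }

    /* Hand out the shared locks round-robin, as prof_ctx_mutex_choose()
     * does below; __sync_add_and_fetch() returns the incremented value,
     * matching atomic_add_u(). */
    static pthread_mutex_t *
    ctx_lock_choose_demo(void)
    {
        unsigned nctxs = __sync_add_and_fetch(&cum_ctxs_demo, 1);

        return (&ctx_locks_demo[(nctxs - 1) % NCTX_LOCKS]);
    }

[Sharing is safe because these are leaf locks, and since the table lives for the process lifetime, platforms whose pthread_mutex_destroy() allocates or frees memory can no longer re-enter the allocator during mutex teardown.]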
@@ -87,6 +97,7 @@ static void prof_fdump(void);
 static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1,
     size_t *hash2);
 static bool prof_bt_keycomp(const void *k1, const void *k2);
+static malloc_mutex_t *prof_ctx_mutex_choose(void);
 
 /******************************************************************************/
 
@@ -471,18 +482,12 @@ prof_lookup(prof_bt_t *bt)
             return (NULL);
         }
         ctx.p->bt = btkey.p;
-        if (malloc_mutex_init(&ctx.p->lock)) {
-            prof_leave();
-            idalloc(btkey.v);
-            idalloc(ctx.v);
-            return (NULL);
-        }
+        ctx.p->lock = prof_ctx_mutex_choose();
         memset(&ctx.p->cnt_merged, 0, sizeof(prof_cnt_t));
         ql_new(&ctx.p->cnts_ql);
         if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
             /* OOM. */
             prof_leave();
-            malloc_mutex_destroy(&ctx.p->lock);
             idalloc(btkey.v);
             idalloc(ctx.v);
             return (NULL);
@@ -502,9 +507,9 @@ prof_lookup(prof_bt_t *bt)
              * Artificially raise curobjs, in order to avoid a race
              * condition with prof_ctx_merge()/prof_ctx_destroy().
              */
-            malloc_mutex_lock(&ctx.p->lock);
+            malloc_mutex_lock(ctx.p->lock);
             ctx.p->cnt_merged.curobjs++;
-            malloc_mutex_unlock(&ctx.p->lock);
+            malloc_mutex_unlock(ctx.p->lock);
             new_ctx = false;
         }
         prof_leave();
@@ -547,10 +552,10 @@ prof_lookup(prof_bt_t *bt)
             return (NULL);
         }
         ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link);
-        malloc_mutex_lock(&ctx.p->lock);
+        malloc_mutex_lock(ctx.p->lock);
         ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
         ctx.p->cnt_merged.curobjs--;
-        malloc_mutex_unlock(&ctx.p->lock);
+        malloc_mutex_unlock(ctx.p->lock);
     } else {
         /* Move ret to the front of the LRU. */
         ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
@@ -622,7 +627,7 @@ prof_printf(bool propagate_err, const char *format, ...)
     char buf[PROF_PRINTF_BUFSIZE];
 
     va_start(ap, format);
-    malloc_snprintf(buf, sizeof(buf), format, ap);
+    malloc_vsnprintf(buf, sizeof(buf), format, ap);
     va_end(ap);
     ret = prof_write(propagate_err, buf);
 
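[Note: the prof_printf() hunk just above fixes a classic varargs bug: a va_list must be consumed by a v-variant function. Passing ap to the variadic malloc_snprintf() makes the va_list object itself a single (meaningless) conversion argument, which is undefined behavior. The same bug class, sketched with the standard stdio pair rather than jemalloc's malloc_snprintf()/malloc_vsnprintf(); log_printf is an illustrative name, not jemalloc API.]

    #include <stdarg.h>
    #include <stdio.h>

    static void
    log_printf(const char *format, ...)
    {
        char buf[128];
        va_list ap;

        va_start(ap, format);
        /*
         * Wrong: snprintf(buf, sizeof(buf), format, ap);
         * That passes ap itself as one ordinary argument -- undefined
         * behavior.  The v-variant consumes the va_list instead:
         */
        vsnprintf(buf, sizeof(buf), format, ap);
        va_end(ap);

        fputs(buf, stderr);
    }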
@@ -637,7 +642,7 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
 
     cassert(config_prof);
 
-    malloc_mutex_lock(&ctx->lock);
+    malloc_mutex_lock(ctx->lock);
 
     memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t));
     ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) {
@@ -676,7 +681,7 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
         cnt_all->accumbytes += ctx->cnt_summed.accumbytes;
     }
 
-    malloc_mutex_unlock(&ctx->lock);
+    malloc_mutex_unlock(ctx->lock);
 }
 
 static void
@@ -693,7 +698,7 @@ prof_ctx_destroy(prof_ctx_t *ctx)
      * prof_ctx_merge() and entry into this function.
      */
     prof_enter();
-    malloc_mutex_lock(&ctx->lock);
+    malloc_mutex_lock(ctx->lock);
     if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 1) {
         assert(ctx->cnt_merged.curbytes == 0);
         assert(ctx->cnt_merged.accumobjs == 0);
@@ -703,9 +708,8 @@ prof_ctx_destroy(prof_ctx_t *ctx)
             assert(false);
         prof_leave();
         /* Destroy ctx. */
-        malloc_mutex_unlock(&ctx->lock);
+        malloc_mutex_unlock(ctx->lock);
         bt_destroy(ctx->bt);
-        malloc_mutex_destroy(&ctx->lock);
         idalloc(ctx);
     } else {
         /*
@@ -713,7 +717,7 @@ prof_ctx_destroy(prof_ctx_t *ctx)
          * prof_lookup().
          */
         ctx->cnt_merged.curobjs--;
-        malloc_mutex_unlock(&ctx->lock);
+        malloc_mutex_unlock(ctx->lock);
         prof_leave();
     }
 }
@@ -726,7 +730,7 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
     cassert(config_prof);
 
     /* Merge cnt stats and detach from ctx. */
-    malloc_mutex_lock(&ctx->lock);
+    malloc_mutex_lock(ctx->lock);
     ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
     ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
     ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
@@ -751,7 +755,7 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
         destroy = true;
     } else
         destroy = false;
-    malloc_mutex_unlock(&ctx->lock);
+    malloc_mutex_unlock(ctx->lock);
     if (destroy)
         prof_ctx_destroy(ctx);
 }
@@ -1067,6 +1071,14 @@ prof_bt_keycomp(const void *k1, const void *k2)
     return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0);
 }
 
+static malloc_mutex_t *
+prof_ctx_mutex_choose(void)
+{
+    unsigned nctxs = atomic_add_u(&cum_ctxs, 1);
+
+    return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]);
+}
+
 prof_tdata_t *
 prof_tdata_init(void)
 {
@@ -1177,6 +1189,8 @@ prof_boot2(void)
     cassert(config_prof);
 
     if (opt_prof) {
+        unsigned i;
+
         if (ckh_new(&bt2ctx, PROF_CKH_MINITEMS, prof_bt_hash,
             prof_bt_keycomp))
             return (true);
@@ -1202,6 +1216,15 @@ prof_boot2(void)
             if (opt_abort)
                 abort();
         }
+
+        ctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS *
+            sizeof(malloc_mutex_t));
+        if (ctx_locks == NULL)
+            return (true);
+        for (i = 0; i < PROF_NCTX_LOCKS; i++) {
+            if (malloc_mutex_init(&ctx_locks[i]))
+                return (true);
+        }
     }
 
 #ifdef JEMALLOC_PROF_LIBGCC
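[Note: prof_ctx_mutex_choose() deliberately lets cum_ctxs overflow. Because PROF_NCTX_LOCKS is a power of two, it divides UINT_MAX + 1 evenly, so the (nctxs - 1) % PROF_NCTX_LOCKS sequence stays uniformly distributed across the wrap with no seam. A worked check, assuming 32-bit unsigned (LG_SIZEOF_INT == 2); the same argument holds at any width.]

    #include <assert.h>
    #include <limits.h>

    int
    main(void)
    {
        /* Indices handed out just before, at, and after cum_ctxs wraps: */
        assert((UINT_MAX - 1) % 1024 == 1022); /* nctxs == UINT_MAX    */
        assert((0u - 1u) % 1024 == 1023);      /* nctxs == 0 (wrapped) */
        assert((1u - 1u) % 1024 == 0);         /* nctxs == 1           */
        return (0);
    }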