SEC: Reduce lock hold times.

Only flush a subset of extents during flushing, and drop the lock while doing so.
2021-01-05 15:52:25 -08:00 · 2021-01-05 15:52:25 -08:00 · bf448d7a5a
commit bf448d7a5a
parent 1944ebbe7f
3 changed files with 110 additions and 24 deletions
--- a/include/jemalloc/internal/sec.h
+++ b/include/jemalloc/internal/sec.h
@ -8,13 +8,9 @@
 * Small extent cache.
 *
 * This includes some utilities to cache small extents.  We have a per-pszind
- * bin with its own lock and edata heap (including only extents of that size).
+ * bin with its own list of extents of that size.  We don't try to do any
- * We don't try to do any coalescing of extents (since it would require
+ * coalescing of extents (since it would in general require cross-shard locks or
- * cross-bin locks).  As a result, we need to be careful about fragmentation.
+ * knowledge of the underlying PAI implementation).
 * As a gesture in that direction, we limit the size of caches, apply first-fit
 * within the bins, and, when flushing a bin, flush all of its extents rather
 * than just those up to some threshold.  When we allocate again, we'll get a
 * chance to move to better ones.
 */
 /*
@ -46,6 +42,19 @@ sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
 	dst->bytes += src->bytes;
 }
 /* A collections of free extents, all of the same size. */
 typedef struct sec_bin_s sec_bin_t;
 struct sec_bin_s {
 	/*
 	 * Number of bytes in this particular bin (as opposed to the
 	 * sec_shard_t's bytes_cur.  This isn't user visible or reported in
 	 * stats; rather, it allows us to quickly determine the change in the
 	 * centralized counter when flushing.
 	 */
 	size_t bytes_cur;
 	edata_list_active_t freelist;
 };
 typedef struct sec_shard_s sec_shard_t;
 struct sec_shard_s {
 	/*
@ -64,8 +73,11 @@ struct sec_shard_s {
 	 * hooks are installed.
 	 */
 	bool enabled;
-	edata_list_active_t freelist[SEC_NPSIZES];
+	sec_bin_t bins[SEC_NPSIZES];
 	/* Number of bytes in all bins in the shard. */
 	size_t bytes_cur;
 	/* The next pszind to flush in the flush-some pathways. */
 	pszind_t to_flush_next;
 };
 typedef struct sec_s sec_t;
@ -83,6 +95,18 @@ struct sec_s {
 	 * the bins in that shard to be flushed.
 	 */
 	size_t bytes_max;
 	/*
 	 * The number of bytes (in all bins) we flush down to when we exceed
 	 * bytes_cur.  We want this to be less than bytes_cur, because
 	 * otherwise we could get into situations where a shard undergoing
 	 * net-deallocation keeps bytes_cur very near to bytes_max, so that
 	 * most deallocations get immediately forwarded to the underlying PAI
 	 * implementation, defeating the point of the SEC.
 	 *
 	 * Currently this is just set to bytes_max / 2, but eventually can be
 	 * configurable.
 	 */
 	size_t bytes_after_flush;
 	/*
 	 * We don't necessarily always use all the shards; requests are
--- a/src/sec.c
+++ b/src/sec.c
@ -11,7 +11,14 @@ static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size);
 static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
-bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
+static void
 sec_bin_init(sec_bin_t *bin) {
 	bin->bytes_cur = 0;
 	edata_list_active_init(&bin->freelist);
 }
 bool
 sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
    size_t bytes_max) {
 	if (nshards > SEC_NSHARDS_MAX) {
 		nshards = SEC_NSHARDS_MAX;
@ -25,9 +32,10 @@ bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 		}
 		shard->enabled = true;
 		for (pszind_t j = 0; j < SEC_NPSIZES; j++) {
-			edata_list_active_init(&shard->freelist[j]);
+			sec_bin_init(&shard->bins[j]);
 		}
 		shard->bytes_cur = 0;
 		shard->to_flush_next = 0;
 	}
 	sec->fallback = fallback;
 	sec->alloc_max = alloc_max;
@ -36,6 +44,7 @@ bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 	}
 	sec->bytes_max = bytes_max;
 	sec->bytes_after_flush = bytes_max / 2;
 	sec->nshards = nshards;
 	/*
@ -85,9 +94,12 @@ sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	if (!shard->enabled) {
 		return NULL;
 	}
-	edata_t *edata = edata_list_active_first(&shard->freelist[pszind]);
+	sec_bin_t *bin = &shard->bins[pszind];
 	edata_t *edata = edata_list_active_first(&bin->freelist);
 	if (edata != NULL) {
-		edata_list_active_remove(&shard->freelist[pszind], edata);
+		edata_list_active_remove(&bin->freelist, edata);
 		assert(edata_size_get(edata) <= bin->bytes_cur);
 		bin->bytes_cur -= edata_size_get(edata);
 		assert(edata_size_get(edata) <= shard->bytes_cur);
 		shard->bytes_cur -= edata_size_get(edata);
 	}
@ -135,30 +147,75 @@ sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
 }
 static void
-sec_do_flush_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	shard->bytes_cur = 0;
 	edata_list_active_t to_flush;
 	edata_list_active_init(&to_flush);
 	for (pszind_t i = 0; i < SEC_NPSIZES; i++) {
-		edata_list_active_concat(&to_flush, &shard->freelist[i]);
+		sec_bin_t *bin = &shard->bins[i];
 		bin->bytes_cur = 0;
 		edata_list_active_concat(&to_flush, &bin->freelist);
 	}
 	/*
 	 * Ordinarily we would try to avoid doing the batch deallocation while
 	 * holding the shard mutex, but the flush_all pathways only happen when
 	 * we're disabling the HPA or resetting the arena, both of which are
 	 * rare pathways.
 	 */
 	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
 }
 static void
-sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
+sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	edata_list_active_t to_flush;
 	edata_list_active_init(&to_flush);
 	while (shard->bytes_cur > sec->bytes_after_flush) {
 		/* Pick a victim. */
 		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
 		/* Update our victim-picking state. */
 		shard->to_flush_next++;
 		if (shard->to_flush_next == SEC_NPSIZES) {
 			shard->to_flush_next = 0;
 		}
 		assert(shard->bytes_cur >= bin->bytes_cur);
 		if (bin->bytes_cur != 0) {
 			shard->bytes_cur -= bin->bytes_cur;
 			bin->bytes_cur = 0;
 			edata_list_active_concat(&to_flush, &bin->freelist);
 		}
 		/*
 		 * Either bin->bytes_cur was 0, in which case we didn't touch
 		 * the bin list but it should be empty anyways (or else we
 		 * missed a bytes_cur update on a list modification), or it
 		 * *was* 0 and we emptied it ourselves.  Either way, it should
 		 * be empty now.
 		 */
 		assert(edata_list_active_empty(&bin->freelist));
 	}
 	malloc_mutex_unlock(tsdn, &shard->mtx);
 	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
 }
 static void
 sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    edata_t *edata) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	assert(shard->bytes_cur <= sec->bytes_max);
 	size_t size = edata_size_get(edata);
 	pszind_t pszind = sz_psz2ind(size);
 	/*
-	 * Prepending here results in FIFO allocation per bin, which seems
+	 * Prepending here results in LIFO allocation per bin, which seems
 	 * reasonable.
 	 */
-	edata_list_active_prepend(&shard->freelist[pszind], edata);
+	sec_bin_t *bin = &shard->bins[pszind];
 	edata_list_active_prepend(&bin->freelist, edata);
 	bin->bytes_cur += size;
 	shard->bytes_cur += size;
 	if (shard->bytes_cur > sec->bytes_max) {
 		/*
@ -170,7 +227,10 @@ sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 		 * in the backing allocator).  This has the extra advantage of
 		 * not requiring advanced cache balancing strategies.
 		 */
-		sec_do_flush_locked(tsdn, sec, shard);
+		sec_flush_some_and_unlock(tsdn, sec, shard);
 		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
 	} else {
 		malloc_mutex_unlock(tsdn, &shard->mtx);
 	}
 }
@ -184,8 +244,7 @@ sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
 	malloc_mutex_lock(tsdn, &shard->mtx);
 	if (shard->enabled) {
-		sec_shard_dalloc_locked(tsdn, sec, shard, edata);
+		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
 		malloc_mutex_unlock(tsdn, &shard->mtx);
 	} else {
 		malloc_mutex_unlock(tsdn, &shard->mtx);
 		pai_dalloc(tsdn, sec->fallback, edata);
@ -196,7 +255,7 @@ void
 sec_flush(tsdn_t *tsdn, sec_t *sec) {
 	for (size_t i = 0; i < sec->nshards; i++) {
 		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
-		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
+		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
 		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
 	}
 }
@ -206,7 +265,7 @@ sec_disable(tsdn_t *tsdn, sec_t *sec) {
 	for (size_t i = 0; i < sec->nshards; i++) {
 		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
 		sec->shards[i].enabled = false;
-		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
+		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
 		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
 	}
 }
--- a/test/unit/sec.c
+++ b/test/unit/sec.c
@ -200,8 +200,11 @@ TEST_BEGIN(test_auto_flush) {
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
 	/*
-	 * Free the extra allocation; this should trigger a flush of all
+	 * Free the extra allocation; this should trigger a flush.  The internal
-	 * extents in the cache.
+	 * flushing logic is allowed to get complicated; for now, we rely on our
 	 * whitebox knowledge of the fact that the SEC flushes bins in their
 	 * entirety when it decides to do so, and it has only one bin active
 	 * right now.
 	 */
 	pai_dalloc(tsdn, &sec.pai, extra_alloc);
 	expect_zu_eq(NALLOCS + 1, ta.alloc_count,