From bf448d7a5a4c2aecbda7ef11767a75829d9aaf77 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Tue, 5 Jan 2021 15:52:25 -0800
Subject: [PATCH] SEC: Reduce lock hold times.

Only flush a subset of extents during flushing, and drop the lock while
doing so.
---
 include/jemalloc/internal/sec.h | 40 ++++++++++++---
 src/sec.c                       | 87 +++++++++++++++++++++++++++------
 test/unit/sec.c                 |  7 ++-
 3 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/include/jemalloc/internal/sec.h b/include/jemalloc/internal/sec.h
index 6bf5687d..815b4bbc 100644
--- a/include/jemalloc/internal/sec.h
+++ b/include/jemalloc/internal/sec.h
@@ -8,13 +8,9 @@
  * Small extent cache.
  *
  * This includes some utilities to cache small extents. We have a per-pszind
- * bin with its own lock and edata heap (including only extents of that size).
- * We don't try to do any coalescing of extents (since it would require
- * cross-bin locks). As a result, we need to be careful about fragmentation.
- * As a gesture in that direction, we limit the size of caches, apply first-fit
- * within the bins, and, when flushing a bin, flush all of its extents rather
- * than just those up to some threshold. When we allocate again, we'll get a
- * chance to move to better ones.
+ * bin with its own list of extents of that size. We don't try to do any
+ * coalescing of extents (since it would in general require cross-shard locks
+ * or knowledge of the underlying PAI implementation).
  */
 
 /*
@@ -46,6 +42,19 @@ sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
 	dst->bytes += src->bytes;
 }
 
+/* A collection of free extents, all of the same size. */
+typedef struct sec_bin_s sec_bin_t;
+struct sec_bin_s {
+	/*
+	 * Number of bytes in this particular bin (as opposed to the
+	 * sec_shard_t's bytes_cur). This isn't user visible or reported in
+	 * stats; rather, it allows us to quickly determine the change in the
+	 * centralized counter when flushing.
+	 */
+	size_t bytes_cur;
+	edata_list_active_t freelist;
+};
+
 typedef struct sec_shard_s sec_shard_t;
 struct sec_shard_s {
 	/*
@@ -64,8 +73,11 @@ struct sec_shard_s {
 	 * hooks are installed.
	 */
 	bool enabled;
-	edata_list_active_t freelist[SEC_NPSIZES];
+	sec_bin_t bins[SEC_NPSIZES];
+	/* Number of bytes in all bins in the shard. */
 	size_t bytes_cur;
+	/* The next pszind to flush in the flush-some pathways. */
+	pszind_t to_flush_next;
 };
 
 typedef struct sec_s sec_t;
@@ -83,6 +95,18 @@ struct sec_s {
 	 * the bins in that shard to be flushed.
 	 */
 	size_t bytes_max;
+	/*
+	 * The number of bytes (in all bins) we flush down to when we exceed
+	 * bytes_max. We want this to be less than bytes_max, because
+	 * otherwise we could get into situations where a shard undergoing
+	 * net-deallocation keeps bytes_cur very near to bytes_max, so that
+	 * most deallocations get immediately forwarded to the underlying PAI
+	 * implementation, defeating the point of the SEC.
+	 *
+	 * Currently this is just set to bytes_max / 2, but it could
+	 * eventually be made configurable.
+	 */
+	size_t bytes_after_flush;
 
 	/*
 	 * We don't necessarily always use all the shards; requests are
diff --git a/src/sec.c b/src/sec.c
index 3a3a0b90..49b41047 100644
--- a/src/sec.c
+++ b/src/sec.c
@@ -11,7 +11,14 @@ static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
     size_t old_size, size_t new_size);
 static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
 
-bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
+static void
+sec_bin_init(sec_bin_t *bin) {
+	bin->bytes_cur = 0;
+	edata_list_active_init(&bin->freelist);
+}
+
+bool
+sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
     size_t bytes_max) {
 	if (nshards > SEC_NSHARDS_MAX) {
 		nshards = SEC_NSHARDS_MAX;
@@ -25,9 +32,10 @@ bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 		}
 		shard->enabled = true;
 		for (pszind_t j = 0; j < SEC_NPSIZES; j++) {
-			edata_list_active_init(&shard->freelist[j]);
+			sec_bin_init(&shard->bins[j]);
 		}
 		shard->bytes_cur = 0;
+		shard->to_flush_next = 0;
 	}
 	sec->fallback = fallback;
 	sec->alloc_max = alloc_max;
@@ -36,6 +44,7 @@ bool sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 	}
 
 	sec->bytes_max = bytes_max;
+	sec->bytes_after_flush = bytes_max / 2;
 	sec->nshards = nshards;
 
 	/*
@@ -85,9 +94,12 @@ sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	if (!shard->enabled) {
 		return NULL;
 	}
-	edata_t *edata = edata_list_active_first(&shard->freelist[pszind]);
+	sec_bin_t *bin = &shard->bins[pszind];
+	edata_t *edata = edata_list_active_first(&bin->freelist);
 	if (edata != NULL) {
-		edata_list_active_remove(&shard->freelist[pszind], edata);
+		edata_list_active_remove(&bin->freelist, edata);
+		assert(edata_size_get(edata) <= bin->bytes_cur);
+		bin->bytes_cur -= edata_size_get(edata);
 		assert(edata_size_get(edata) <= shard->bytes_cur);
 		shard->bytes_cur -= edata_size_get(edata);
 	}
@@ -135,30 +147,75 @@ sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
 }
 
 static void
-sec_do_flush_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	shard->bytes_cur = 0;
 	edata_list_active_t to_flush;
 	edata_list_active_init(&to_flush);
 	for (pszind_t i = 0; i < SEC_NPSIZES; i++) {
-		edata_list_active_concat(&to_flush, &shard->freelist[i]);
+		sec_bin_t *bin = &shard->bins[i];
+		bin->bytes_cur = 0;
+		edata_list_active_concat(&to_flush, &bin->freelist);
 	}
+	/*
+	 * Ordinarily we would try to avoid doing the batch deallocation while
+	 * holding the shard mutex, but the flush_all pathways only happen when
+	 * we're disabling the HPA or resetting the arena, both of which are
+	 * rare pathways.
+	 */
 	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
 }
 
 static void
-sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
+sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+	malloc_mutex_assert_owner(tsdn, &shard->mtx);
+	edata_list_active_t to_flush;
+	edata_list_active_init(&to_flush);
+	while (shard->bytes_cur > sec->bytes_after_flush) {
+		/* Pick a victim. */
+		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
+
+		/* Update our victim-picking state. */
+		shard->to_flush_next++;
+		if (shard->to_flush_next == SEC_NPSIZES) {
+			shard->to_flush_next = 0;
+		}
+
+		assert(shard->bytes_cur >= bin->bytes_cur);
+		if (bin->bytes_cur != 0) {
+			shard->bytes_cur -= bin->bytes_cur;
+			bin->bytes_cur = 0;
+			edata_list_active_concat(&to_flush, &bin->freelist);
+		}
+		/*
+		 * Either bin->bytes_cur was 0, in which case we didn't touch
+		 * the bin list but it should be empty anyways (or else we
+		 * missed a bytes_cur update on a list modification), or it
+		 * *wasn't* 0 and we emptied it ourselves. Either way, it
+		 * should be empty now.
+		 */
+		assert(edata_list_active_empty(&bin->freelist));
+	}
+
+	malloc_mutex_unlock(tsdn, &shard->mtx);
+	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
+}
+
+static void
+sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	edata_t *edata) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	assert(shard->bytes_cur <= sec->bytes_max);
 	size_t size = edata_size_get(edata);
 	pszind_t pszind = sz_psz2ind(size);
 	/*
-	 * Prepending here results in FIFO allocation per bin, which seems
+	 * Prepending here results in LIFO allocation per bin, which seems
 	 * reasonable.
 	 */
-	edata_list_active_prepend(&shard->freelist[pszind], edata);
+	sec_bin_t *bin = &shard->bins[pszind];
+	edata_list_active_prepend(&bin->freelist, edata);
+	bin->bytes_cur += size;
 	shard->bytes_cur += size;
 	if (shard->bytes_cur > sec->bytes_max) {
 		/*
@@ -170,7 +227,10 @@ sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 		 * in the backing allocator). This has the extra advantage of
 		 * not requiring advanced cache balancing strategies.
 		 */
-		sec_do_flush_locked(tsdn, sec, shard);
+		sec_flush_some_and_unlock(tsdn, sec, shard);
+		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
+	} else {
+		malloc_mutex_unlock(tsdn, &shard->mtx);
 	}
 }
 
@@ -184,8 +244,7 @@ sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
 	malloc_mutex_lock(tsdn, &shard->mtx);
 	if (shard->enabled) {
-		sec_shard_dalloc_locked(tsdn, sec, shard, edata);
-		malloc_mutex_unlock(tsdn, &shard->mtx);
+		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
 	} else {
 		malloc_mutex_unlock(tsdn, &shard->mtx);
 		pai_dalloc(tsdn, sec->fallback, edata);
@@ -196,7 +255,7 @@ void
 sec_flush(tsdn_t *tsdn, sec_t *sec) {
 	for (size_t i = 0; i < sec->nshards; i++) {
 		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
-		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
+		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
 		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
 	}
 }
@@ -206,7 +265,7 @@ sec_disable(tsdn_t *tsdn, sec_t *sec) {
 	for (size_t i = 0; i < sec->nshards; i++) {
 		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
 		sec->shards[i].enabled = false;
-		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
+		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
 		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
 	}
 }
diff --git a/test/unit/sec.c b/test/unit/sec.c
index 7657537b..5fe3550c 100644
--- a/test/unit/sec.c
+++ b/test/unit/sec.c
@@ -200,8 +200,11 @@ TEST_BEGIN(test_auto_flush) {
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
 	/*
-	 * Free the extra allocation; this should trigger a flush of all
-	 * extents in the cache.
+	 * Free the extra allocation; this should trigger a flush. The internal
+	 * flushing logic is allowed to get complicated; for now, we rely on our
+	 * whitebox knowledge of the fact that the SEC flushes bins in their
+	 * entirety when it decides to do so, and it has only one bin active
+	 * right now.
 	 */
 	pai_dalloc(tsdn, &sec.pai, extra_alloc);
 	expect_zu_eq(NALLOCS + 1, ta.alloc_count,
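
For readers skimming the patch, here is a minimal standalone sketch of the policy that the new sec_flush_some_and_unlock() implements. It is not jemalloc code: the toy_* names, NBINS, and the byte counts are invented for illustration, and the real locking, freelists, and pai_dalloc_batch() call are omitted. The idea it shows is the hysteresis plus round-robin victim picking: once a shard's cached bytes exceed bytes_max, whole bins are flushed starting from to_flush_next until the total drops to bytes_after_flush (bytes_max / 2), so a shard undergoing net deallocation does not flush on every free.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define NBINS 4

/* Toy stand-ins for the per-bin and per-shard byte counters. */
typedef struct {
	size_t bytes_cur;	/* bytes cached in this bin */
} toy_bin_t;

typedef struct {
	toy_bin_t bins[NBINS];
	size_t bytes_cur;	/* sum over all bins */
	size_t to_flush_next;	/* round-robin victim cursor */
} toy_shard_t;

/*
 * Flush whole bins, round-robin, until the shard is at or below the
 * post-flush watermark.  The trigger for calling this is exceeding
 * bytes_max, but we flush down to bytes_after_flush (< bytes_max).
 */
static void
toy_flush_some(toy_shard_t *shard, size_t bytes_after_flush) {
	while (shard->bytes_cur > bytes_after_flush) {
		toy_bin_t *bin = &shard->bins[shard->to_flush_next];
		shard->to_flush_next = (shard->to_flush_next + 1) % NBINS;

		assert(shard->bytes_cur >= bin->bytes_cur);
		shard->bytes_cur -= bin->bytes_cur;
		bin->bytes_cur = 0;	/* "flush" the whole bin */
	}
}

int
main(void) {
	toy_shard_t shard = {
		.bins = {{10}, {40}, {30}, {30}},
		.bytes_cur = 110,
		.to_flush_next = 0,
	};
	size_t bytes_max = 100, bytes_after_flush = bytes_max / 2;

	if (shard.bytes_cur > bytes_max) {	/* the dalloc path's trigger */
		toy_flush_some(&shard, bytes_after_flush);
	}
	printf("bytes_cur=%zu next_victim=%zu\n",
	    shard.bytes_cur, shard.to_flush_next);	/* bytes_cur=30 next_victim=3 */
	return 0;
}

In the patch itself, the shard mutex is released before the collected extents are handed to pai_dalloc_batch(), which is where the reduction in lock hold times comes from; the flush_all pathways keep the simpler flush-everything-while-locked behavior because they only run on rare operations like disabling the HPA or resetting the arena.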