From cdae6706a6dbe6ab75688ea24a82ef4165c3b0b1 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Tue, 19 Jan 2021 13:06:43 -0800
Subject: [PATCH] SEC: Use batch fills.

Currently, this doesn't help much, since no PAI implementation supports
flushing. This will change in subsequent commits.
---
 include/jemalloc/internal/sec.h |  28 ++++++
 src/sec.c                       | 147 ++++++++++++++++++++++----------
 test/unit/sec.c                 |  63 ++++++++------
 3 files changed, 167 insertions(+), 71 deletions(-)

diff --git a/include/jemalloc/internal/sec.h b/include/jemalloc/internal/sec.h
index 815b4bbc..fadf4b61 100644
--- a/include/jemalloc/internal/sec.h
+++ b/include/jemalloc/internal/sec.h
@@ -45,6 +45,24 @@ sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
 /* A collections of free extents, all of the same size. */
 typedef struct sec_bin_s sec_bin_t;
 struct sec_bin_s {
+	/*
+	 * When we fail to fulfill an allocation, we do a batch-alloc on the
+	 * underlying allocator to fill extra items, as well. We drop the SEC
+	 * lock while doing so, to allow operations on other bins to succeed.
+	 * That introduces the possibility of other threads also trying to
+	 * allocate out of this bin, failing, and also going to the backing
+	 * allocator. To avoid a thundering herd problem in which lots of
+	 * threads do batch allocs and overfill this bin as a result, we only
+	 * allow one batch allocation at a time for a bin. This bool tracks
+	 * whether or not some thread is already batch allocating.
+	 *
+	 * Eventually, the right answer may be a smarter sharding policy for the
+	 * bins (e.g. a mutex per bin, which would also be more scalable
+	 * generally; the batch-allocating thread could hold it while
+	 * batch-allocating).
+	 */
+	bool being_batch_filled;
+
 	/*
 	 * Number of bytes in this particular bin (as opposed to the
 	 * sec_shard_t's bytes_cur. This isn't user visible or reported in
@@ -108,6 +126,16 @@ struct sec_s {
 	 */
 	size_t bytes_after_flush;
 
+	/*
+	 * When we can't satisfy an allocation out of the SEC because there are
+	 * no available ones cached, we allocate multiple of that size out of
+	 * the fallback allocator. Eventually we might want to do something
+	 * cleverer, but for now we just grab a fixed number.
+	 *
+	 * For now, just the constant 4. Eventually, it should be configurable.
+	 */
+	size_t batch_fill_extra;
+
 	/*
 	 * We don't necessarily always use all the shards; requests are
 	 * distributed across shards [0, nshards - 1).
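The two fields added to the header cooperate on a cache miss: a thread that finds its bin empty claims the fill by setting being_batch_filled under the shard lock, drops the lock, asks the fallback for 1 + batch_fill_extra extents, and then retakes the lock to publish the leftovers and clear the flag. A rough illustration of that claim/fill/publish protocol (a sketch only, not part of the diff; it uses plain pthreads and hypothetical toy_* names in place of jemalloc's malloc_mutex_t and edata lists):

/* Illustrative sketch only: simplified stand-ins for the jemalloc types. */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct toy_bin_s {
	bool being_batch_filled;	/* Some thread is already batch filling. */
	size_t bytes_cur;		/* Bytes currently cached in this bin. */
} toy_bin_t;

typedef struct toy_shard_s {
	pthread_mutex_t mtx;
	toy_bin_t bin;
} toy_shard_t;

/*
 * On a miss, at most one thread claims the right to batch-fill; the others
 * fall back to a single allocation so they don't overfill the same bin.
 */
static bool
toy_claim_fill(toy_shard_t *shard) {
	bool do_fill = false;
	pthread_mutex_lock(&shard->mtx);
	if (!shard->bin.being_batch_filled) {
		shard->bin.being_batch_filled = true;
		do_fill = true;
	}
	pthread_mutex_unlock(&shard->mtx);
	return do_fill;
}

/* After the batch allocation (done without the lock held), publish the leftovers. */
static void
toy_publish_fill(toy_shard_t *shard, size_t cached_bytes) {
	pthread_mutex_lock(&shard->mtx);
	shard->bin.being_batch_filled = false;	/* Let the next miss fill again. */
	shard->bin.bytes_cur += cached_bytes;
	pthread_mutex_unlock(&shard->mtx);
}

In the patch itself the same roles are played by sec_alloc (claim) and sec_batch_fill_and_alloc (publish), with the shard mutex standing in for the toy lock.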
diff --git a/src/sec.c b/src/sec.c
index af7c2910..f177bbee 100644
--- a/src/sec.c
+++ b/src/sec.c
@@ -13,6 +13,7 @@ static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);
 
 static void
 sec_bin_init(sec_bin_t *bin) {
+	bin->being_batch_filled = false;
 	bin->bytes_cur = 0;
 	edata_list_active_init(&bin->freelist);
 }
@@ -45,6 +46,7 @@ sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
 
 	sec->bytes_max = bytes_max;
 	sec->bytes_after_flush = bytes_max / 2;
+	sec->batch_fill_extra = 4;
 	sec->nshards = nshards;
 
 	/*
@@ -88,14 +90,52 @@ sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
 	return &sec->shards[*idxp];
 }
 
+/*
+ * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an
+ * empty cache, we'll try to fill it, which can push the shard over its limit.
+ */
+static void
+sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
+	malloc_mutex_assert_owner(tsdn, &shard->mtx);
+	edata_list_active_t to_flush;
+	edata_list_active_init(&to_flush);
+	while (shard->bytes_cur > sec->bytes_after_flush) {
+		/* Pick a victim. */
+		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
+
+		/* Update our victim-picking state. */
+		shard->to_flush_next++;
+		if (shard->to_flush_next == SEC_NPSIZES) {
+			shard->to_flush_next = 0;
+		}
+
+		assert(shard->bytes_cur >= bin->bytes_cur);
+		if (bin->bytes_cur != 0) {
+			shard->bytes_cur -= bin->bytes_cur;
+			bin->bytes_cur = 0;
+			edata_list_active_concat(&to_flush, &bin->freelist);
+		}
+		/*
+		 * Either bin->bytes_cur was 0, in which case we didn't touch
+		 * the bin list but it should be empty anyways (or else we
+		 * missed a bytes_cur update on a list modification), or it
+		 * *wasn't* 0 and we emptied it ourselves. Either way, it should
+		 * be empty now.
+		 */
+		assert(edata_list_active_empty(&bin->freelist));
+	}
+
+	malloc_mutex_unlock(tsdn, &shard->mtx);
+	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
+}
+
 static edata_t *
 sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
-    pszind_t pszind) {
+    sec_bin_t *bin) {
 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
 	if (!shard->enabled) {
 		return NULL;
 	}
-	sec_bin_t *bin = &shard->bins[pszind];
 	edata_t *edata = edata_list_active_first(&bin->freelist);
 	if (edata != NULL) {
 		edata_list_active_remove(&bin->freelist, edata);
@@ -107,6 +147,50 @@ sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
 	return edata;
 }
 
+static edata_t *
+sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
+    sec_bin_t *bin, size_t size) {
+	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
+
+	edata_list_active_t result;
+	edata_list_active_init(&result);
+	size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size,
+	    1 + sec->batch_fill_extra, &result);
+
+	edata_t *ret = edata_list_active_first(&result);
+	if (ret != NULL) {
+		edata_list_active_remove(&result, ret);
+	}
+
+	malloc_mutex_lock(tsdn, &shard->mtx);
+	bin->being_batch_filled = false;
+	/*
+	 * Handle the easy case first: nothing to cache. Note that this can
+	 * only happen in case of OOM, since sec_alloc checks the expected
+	 * number of allocs, and doesn't bother going down the batch_fill
+	 * pathway if there won't be anything left to cache. So to be in this
+	 * code path, we must have asked for > 1 alloc, but only gotten 1 back.
+	 */
+	if (nalloc <= 1) {
+		malloc_mutex_unlock(tsdn, &shard->mtx);
+		return ret;
+	}
+
+	size_t new_cached_bytes = (nalloc - 1) * size;
+
+	edata_list_active_concat(&bin->freelist, &result);
+	bin->bytes_cur += new_cached_bytes;
+	shard->bytes_cur += new_cached_bytes;
+
+	if (shard->bytes_cur > sec->bytes_max) {
+		sec_flush_some_and_unlock(tsdn, sec, shard);
+	} else {
+		malloc_mutex_unlock(tsdn, &shard->mtx);
+	}
+
+	return ret;
+}
+
 static edata_t *
 sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) {
 	assert((size & PAGE_MASK) == 0);
@@ -119,16 +203,26 @@ sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) {
 	}
 	pszind_t pszind = sz_psz2ind(size);
 	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
+	sec_bin_t *bin = &shard->bins[pszind];
+	bool do_batch_fill = false;
+
 	malloc_mutex_lock(tsdn, &shard->mtx);
-	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, pszind);
+	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin);
+	if (edata == NULL) {
+		if (!bin->being_batch_filled && sec->batch_fill_extra > 0) {
+			bin->being_batch_filled = true;
+			do_batch_fill = true;
+		}
+	}
 	malloc_mutex_unlock(tsdn, &shard->mtx);
 	if (edata == NULL) {
-		/*
-		 * See the note in dalloc, below; really, we should add a
-		 * batch_alloc method to the PAI and get more than one extent at
-		 * a time.
-		 */
-		edata = pai_alloc(tsdn, sec->fallback, size, alignment, zero);
+		if (do_batch_fill) {
+			edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin,
+			    size);
+		} else {
+			edata = pai_alloc(tsdn, sec->fallback, size, alignment,
+			    zero);
+		}
 	}
 	return edata;
 }
@@ -168,41 +262,6 @@ sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
 	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
 }
 
-static void
-sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
-	malloc_mutex_assert_owner(tsdn, &shard->mtx);
-	edata_list_active_t to_flush;
-	edata_list_active_init(&to_flush);
-	while (shard->bytes_cur > sec->bytes_after_flush) {
-		/* Pick a victim. */
-		sec_bin_t *bin = &shard->bins[shard->to_flush_next];
-
-		/* Update our victim-picking state. */
-		shard->to_flush_next++;
-		if (shard->to_flush_next == SEC_NPSIZES) {
-			shard->to_flush_next = 0;
-		}
-
-		assert(shard->bytes_cur >= bin->bytes_cur);
-		if (bin->bytes_cur != 0) {
-			shard->bytes_cur -= bin->bytes_cur;
-			bin->bytes_cur = 0;
-			edata_list_active_concat(&to_flush, &bin->freelist);
-		}
-		/*
-		 * Either bin->bytes_cur was 0, in which case we didn't touch
-		 * the bin list but it should be empty anyways (or else we
-		 * missed a bytes_cur update on a list modification), or it
-		 * *was* 0 and we emptied it ourselves. Either way, it should
-		 * be empty now.
-		 */
-		assert(edata_list_active_empty(&bin->freelist));
-	}
-
-	malloc_mutex_unlock(tsdn, &shard->mtx);
-	pai_dalloc_batch(tsdn, sec->fallback, &to_flush);
-}
-
 static void
 sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
     edata_t *edata) {
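The interaction between the fill and flush limits above is easiest to see with concrete numbers: a batch fill can push a shard past bytes_max, and sec_flush_some_and_unlock then trims it back down to bytes_after_flush (half of bytes_max). A small self-contained model of that accounting (illustrative only; the constants are assumed, and the real values come from sec_init):

#include <stddef.h>
#include <stdio.h>

int main(void) {
	const size_t page = 4096;
	const size_t batch_fill_extra = 4;	/* Mirrors sec->batch_fill_extra. */
	const size_t bytes_max = 16 * page;	/* Assumed shard limit. */
	const size_t bytes_after_flush = bytes_max / 2;

	size_t bytes_cur = 14 * page;		/* Shard is already nearly full. */

	/* A miss on a one-page bin requests 1 + batch_fill_extra extents... */
	size_t nalloc = 1 + batch_fill_extra;	/* Assume the fallback returns all 5. */
	size_t new_cached_bytes = (nalloc - 1) * page;	/* One goes to the caller. */
	bytes_cur += new_cached_bytes;		/* 18 pages: now over bytes_max. */

	/* ...so the fill path flushes victims until we are back under target. */
	while (bytes_cur > bytes_after_flush) {
		bytes_cur -= page;		/* Pretend each victim bin held one page. */
	}

	printf("cached after flush: %zu pages\n", bytes_cur / page);
	return 0;
}

The model ends exactly at the 8-page target; the real code flushes whole bins at a time, so it may undershoot bytes_after_flush.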
diff --git a/test/unit/sec.c b/test/unit/sec.c
index 69132c1f..ff39453c 100644
--- a/test/unit/sec.c
+++ b/test/unit/sec.c
@@ -134,14 +134,17 @@ TEST_BEGIN(test_reuse) {
 	 */
 	tsdn_t *tsdn = TSDN_NULL;
 	/*
-	 * 10-allocs apiece of 1-PAGE and 2-PAGE objects means that we should be
-	 * able to get to 30 pages in the cache before triggering a flush.
+	 * 11 allocs apiece of 1-PAGE and 2-PAGE objects means that we should be
+	 * able to get to 33 pages in the cache before triggering a flush. We
+	 * set the flush limit to twice this amount, to avoid accidentally
+	 * triggering a flush caused by the batch-allocation down the cache fill
+	 * pathway disrupting ordering.
 	 */
-	enum { NALLOCS = 10 };
+	enum { NALLOCS = 11 };
 	edata_t *one_page[NALLOCS];
 	edata_t *two_page[NALLOCS];
 	sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ 2 * PAGE,
-	    /* bytes_max */ NALLOCS * PAGE + NALLOCS * 2 * PAGE);
+	    /* bytes_max */ 2 * (NALLOCS * PAGE + NALLOCS * 2 * PAGE));
 	for (int i = 0; i < NALLOCS; i++) {
 		one_page[i] = pai_alloc(tsdn, &sec.pai, PAGE, PAGE,
 		    /* zero */ false);
@@ -150,7 +153,9 @@ TEST_BEGIN(test_reuse) {
 		    /* zero */ false);
 		expect_ptr_not_null(one_page[i], "Unexpected alloc failure");
 	}
-	expect_zu_eq(2 * NALLOCS, ta.alloc_count,
+	expect_zu_eq(0, ta.alloc_count, "Should be using batch allocs");
+	size_t max_allocs = ta.alloc_count + ta.alloc_batch_count;
+	expect_zu_le(2 * NALLOCS, max_allocs,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
@@ -164,7 +169,7 @@ TEST_BEGIN(test_reuse) {
 	for (int i = NALLOCS - 1; i >= 0; i--) {
 		pai_dalloc(tsdn, &sec.pai, two_page[i]);
 	}
-	expect_zu_eq(2 * NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
@@ -182,7 +187,7 @@ TEST_BEGIN(test_reuse) {
 		expect_ptr_eq(two_page[i], alloc2,
 		    "Got unexpected allocation");
 	}
-	expect_zu_eq(2 * NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
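The doubled bytes_max in test_reuse leaves headroom for extents that batch fills park in the bins beyond what the test explicitly frees back. With the default batch_fill_extra of 4, the worst case is easy to bound; the arithmetic below (an illustrative sketch using the test's own constants, not code from the test) shows why a 33-page limit would no longer be safe while twice that is:

#include <stddef.h>
#include <stdio.h>

int main(void) {
	const size_t nallocs = 11;		/* NALLOCS in test_reuse. */
	const size_t batch_fill_extra = 4;	/* SEC default. */

	/* Pages the test itself frees back into the cache: 11 * 1 + 11 * 2. */
	size_t explicit_pages = nallocs * 1 + nallocs * 2;	/* 33 */

	/*
	 * Each bin can additionally hold up to batch_fill_extra leftover
	 * extents from its last batch fill: 4 one-page plus 4 two-page.
	 */
	size_t fill_leftover_pages = batch_fill_extra * 1 + batch_fill_extra * 2;	/* 12 */

	size_t worst_case = explicit_pages + fill_leftover_pages;	/* 45 */
	printf("worst case %zu pages: more than a %zu-page bytes_max, "
	    "but within the doubled %zu\n",
	    worst_case, explicit_pages, 2 * explicit_pages);
	return 0;
}

With NALLOCS * PAGE + NALLOCS * 2 * PAGE as the limit, the leftover fill extents alone could push the shard over it and trigger exactly the flush the test is trying to avoid.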
We + * set the flush liimt to twice this amount, to avoid accidentally + * triggering a flush caused by the batch-allocation down the cache fill + * pathway disrupting ordering. */ - enum { NALLOCS = 10 }; + enum { NALLOCS = 11 }; edata_t *one_page[NALLOCS]; edata_t *two_page[NALLOCS]; sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ 2 * PAGE, - /* bytes_max */ NALLOCS * PAGE + NALLOCS * 2 * PAGE); + /* bytes_max */ 2 * (NALLOCS * PAGE + NALLOCS * 2 * PAGE)); for (int i = 0; i < NALLOCS; i++) { one_page[i] = pai_alloc(tsdn, &sec.pai, PAGE, PAGE, /* zero */ false); @@ -150,7 +153,9 @@ TEST_BEGIN(test_reuse) { /* zero */ false); expect_ptr_not_null(one_page[i], "Unexpected alloc failure"); } - expect_zu_eq(2 * NALLOCS, ta.alloc_count, + expect_zu_eq(0, ta.alloc_count, "Should be using batch allocs"); + size_t max_allocs = ta.alloc_count + ta.alloc_batch_count; + expect_zu_le(2 * NALLOCS, max_allocs, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -164,7 +169,7 @@ TEST_BEGIN(test_reuse) { for (int i = NALLOCS - 1; i >= 0; i--) { pai_dalloc(tsdn, &sec.pai, two_page[i]); } - expect_zu_eq(2 * NALLOCS, ta.alloc_count, + expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -182,7 +187,7 @@ TEST_BEGIN(test_reuse) { expect_ptr_eq(two_page[i], alloc2, "Got unexpected allocation"); } - expect_zu_eq(2 * NALLOCS, ta.alloc_count, + expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -198,7 +203,12 @@ TEST_BEGIN(test_auto_flush) { tsdn_t *tsdn = TSDN_NULL; /* * 10-allocs apiece of 1-PAGE and 2-PAGE objects means that we should be - * able to get to 30 pages in the cache before triggering a flush. + * able to get to 30 pages in the cache before triggering a flush. The + * choice of NALLOCS here is chosen to match the batch allocation + * default (4 extra + 1 == 5; so 10 allocations leaves the cache exactly + * empty, even in the presence of batch allocation on fill). + * Eventually, once our allocation batching strategies become smarter, + * this should change. */ enum { NALLOCS = 10 }; edata_t *extra_alloc; @@ -212,7 +222,8 @@ TEST_BEGIN(test_auto_flush) { } extra_alloc = pai_alloc(tsdn, &sec.pai, PAGE, PAGE, /* zero */ false); expect_ptr_not_null(extra_alloc, "Unexpected alloc failure"); - expect_zu_eq(NALLOCS + 1, ta.alloc_count, + size_t max_allocs = ta.alloc_count + ta.alloc_batch_count; + expect_zu_le(NALLOCS + 1, max_allocs, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -220,7 +231,7 @@ TEST_BEGIN(test_auto_flush) { for (int i = 0; i < NALLOCS; i++) { pai_dalloc(tsdn, &sec.pai, allocs[i]); } - expect_zu_eq(NALLOCS + 1, ta.alloc_count, + expect_zu_le(NALLOCS + 1, max_allocs, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of allocations"); @@ -232,7 +243,7 @@ TEST_BEGIN(test_auto_flush) { * right now. */ pai_dalloc(tsdn, &sec.pai, extra_alloc); - expect_zu_eq(NALLOCS + 1, ta.alloc_count, + expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count, "Incorrect number of allocations"); expect_zu_eq(0, ta.dalloc_count, "Incorrect number of (non-batch) deallocations"); @@ -253,7 +264,7 @@ do_disable_flush_test(bool is_disable) { /* See the note above -- we can't use the real tsd. 
@@ -253,7 +264,7 @@ do_disable_flush_test(bool is_disable) {
 	/* See the note above -- we can't use the real tsd. */
 	tsdn_t *tsdn = TSDN_NULL;
 
-	enum { NALLOCS = 10 };
+	enum { NALLOCS = 11 };
 	edata_t *allocs[NALLOCS];
 	sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ PAGE,
 	    /* bytes_max */ NALLOCS * PAGE);
@@ -266,8 +277,9 @@ do_disable_flush_test(bool is_disable) {
 	for (int i = 0; i < NALLOCS - 1; i++) {
 		pai_dalloc(tsdn, &sec.pai, allocs[i]);
 	}
-	expect_zu_eq(NALLOCS, ta.alloc_count,
-	    "Incorrect number of allocations");
+	size_t max_allocs = ta.alloc_count + ta.alloc_batch_count;
+
+	expect_zu_le(NALLOCS, max_allocs, "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of allocations");
 
@@ -277,12 +289,13 @@ do_disable_flush_test(bool is_disable) {
 		sec_flush(tsdn, &sec);
 	}
 
-	expect_zu_eq(NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(0, ta.dalloc_count,
 	    "Incorrect number of (non-batch) deallocations");
-	expect_zu_eq(NALLOCS - 1, ta.dalloc_batch_count,
+	expect_zu_le(NALLOCS - 1, ta.dalloc_batch_count,
 	    "Incorrect number of batch deallocations");
+	size_t old_dalloc_batch_count = ta.dalloc_batch_count;
 
 	/*
 	 * If we free into a disabled SEC, it should forward to the fallback.
 	 */
 	pai_dalloc(tsdn, &sec.pai, allocs[NALLOCS - 1]);
 
-	expect_zu_eq(NALLOCS, ta.alloc_count,
+	expect_zu_eq(max_allocs, ta.alloc_count + ta.alloc_batch_count,
 	    "Incorrect number of allocations");
 	expect_zu_eq(is_disable ? 1 : 0, ta.dalloc_count,
 	    "Incorrect number of (non-batch) deallocations");
-	expect_zu_eq(NALLOCS - 1, ta.dalloc_batch_count,
+	expect_zu_eq(old_dalloc_batch_count, ta.dalloc_batch_count,
 	    "Incorrect number of batch deallocations");
 }
 
@@ -404,7 +417,7 @@ expect_stats_pages(tsdn_t *tsdn, sec_t *sec, size_t npages) {
 	 */
 	stats.bytes = 123;
 	sec_stats_merge(tsdn, sec, &stats);
-	assert_zu_eq(npages * PAGE + 123, stats.bytes, "");
+	assert_zu_le(npages * PAGE + 123, stats.bytes, "");
 }
 
 TEST_BEGIN(test_stats_simple) {
@@ -417,7 +430,7 @@ TEST_BEGIN(test_stats_simple) {
 
 	enum {
 		NITERS = 100,
-		FLUSH_PAGES = 10,
+		FLUSH_PAGES = 20,
 	};
 
 	sec_init(&sec, &ta.pai, /* nshards */ 1, /* alloc_max */ PAGE,
@@ -470,26 +483,22 @@ TEST_BEGIN(test_stats_auto_flush) {
 	for (size_t i = 0; i < 2 * FLUSH_PAGES; i++) {
 		allocs[i] = pai_alloc(tsdn, &sec.pai, PAGE, PAGE,
 		    /* zero */ false);
-		expect_stats_pages(tsdn, &sec, 0);
 	}
 	for (size_t i = 0; i < FLUSH_PAGES; i++) {
 		pai_dalloc(tsdn, &sec.pai, allocs[i]);
-		expect_stats_pages(tsdn, &sec, i + 1);
 	}
 	pai_dalloc(tsdn, &sec.pai, extra_alloc0);
-	/* The last dalloc should have triggered a flush. */
-	expect_stats_pages(tsdn, &sec, 0);
 
 	/* Flush the remaining pages; stats should still work. */
 	for (size_t i = 0; i < FLUSH_PAGES; i++) {
 		pai_dalloc(tsdn, &sec.pai, allocs[FLUSH_PAGES + i]);
-		expect_stats_pages(tsdn, &sec, i + 1);
 	}
 	pai_dalloc(tsdn, &sec.pai, extra_alloc1);
-	/* The last dalloc should have triggered a flush, again. */
-	expect_stats_pages(tsdn, &sec, 0);
+
+	expect_stats_pages(tsdn, &sec, ta.alloc_count + ta.alloc_batch_count
+	    - ta.dalloc_count - ta.dalloc_batch_count);
 }
 TEST_END
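With both fills and flushes batched, test_stats_auto_flush no longer asserts intermediate page counts; the final check instead derives the expected residency from the mock allocator's counters. The identity it relies on, restated as a hypothetical helper (the test inlines the expression directly; every allocation in this test is a single page, so extents and pages coincide):

#include <stddef.h>

/*
 * Pages still cached in the SEC = extents obtained from the fallback
 * (singleton + batch) minus extents returned to it (singleton + batch).
 * Hypothetical helper; not part of the test harness.
 */
static inline size_t
expected_cached_pages(size_t alloc_count, size_t alloc_batch_count,
    size_t dalloc_count, size_t dalloc_batch_count) {
	return alloc_count + alloc_batch_count
	    - dalloc_count - dalloc_batch_count;
}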