From d96e4525adaefbde79f349d024eb5f94e72faf50 Mon Sep 17 00:00:00 2001
From: Yinan Zhang
Date: Thu, 12 Nov 2020 14:54:25 -0800
Subject: [PATCH] Route batch allocation of small batch size to tcache

---
 src/jemalloc.c          | 106 ++++++++++++++++++++++++++--------
 test/unit/batch_alloc.c |  76 +++++++++------------------
 2 files changed, 91 insertions(+), 91 deletions(-)

diff --git a/src/jemalloc.c b/src/jemalloc.c
index 2a791e17..575a63cf 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -4088,32 +4088,15 @@ batch_alloc(void **ptrs, size_t num, size_t size, int flags) {
 	if (aligned_usize_get(size, alignment, &usize, NULL, false)) {
 		goto label_done;
 	}
 	szind_t ind = sz_size2index(usize);
-	if (unlikely(ind >= SC_NBINS)) {
-		/* No optimization for large sizes. */
-		void *p;
-		while (filled < num && (p = je_mallocx(size, flags)) != NULL) {
-			ptrs[filled++] = p;
-		}
-		goto label_done;
-	}
 	bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true);
-	unsigned arena_ind = mallocx_arena_get(flags);
-	arena_t *arena;
-	if (arena_get_from_ind(tsd, arena_ind, &arena)) {
-		goto label_done;
-	}
-	if (arena == NULL) {
-		arena = arena_choose(tsd, NULL);
-	} else {
-		/* When a manual arena is specified, bypass the tcache. */
-		flags |= MALLOCX_TCACHE_NONE;
-	}
-	if (unlikely(arena == NULL)) {
-		goto label_done;
+	cache_bin_t *bin = NULL;
+	arena_t *arena = NULL;
+	size_t nregs = 0;
+	if (likely(ind < SC_NBINS)) {
+		nregs = bin_infos[ind].nregs;
+		assert(nregs > 0);
 	}
 
 	while (filled < num) {
@@ -4132,9 +4115,63 @@ batch_alloc(void **ptrs, size_t num, size_t size, int flags) {
 			batch_alloc_prof_sample_assert(tsd, batch, usize);
 		}
 
-		size_t n = arena_fill_small_fresh(tsd_tsdn(tsd), arena,
-		    ind, ptrs + filled, batch, zero);
-		filled += n;
+		size_t progress = 0;
+
+		if (likely(ind < SC_NBINS) && batch >= nregs) {
+			if (arena == NULL) {
+				unsigned arena_ind = mallocx_arena_get(flags);
+				if (arena_get_from_ind(tsd, arena_ind,
+				    &arena)) {
+					goto label_done;
+				}
+				if (arena == NULL) {
+					arena = arena_choose(tsd, NULL);
+				}
+				if (unlikely(arena == NULL)) {
+					goto label_done;
+				}
+			}
+			size_t arena_batch = batch - batch % nregs;
+			size_t n = arena_fill_small_fresh(tsd_tsdn(tsd), arena,
+			    ind, ptrs + filled, arena_batch, zero);
+			progress += n;
+			filled += n;
+		}
+
+		if (likely(ind < nhbins) && progress < batch) {
+			if (bin == NULL) {
+				unsigned tcache_ind = mallocx_tcache_get(flags);
+				tcache_t *tcache = tcache_get_from_ind(tsd,
+				    tcache_ind, /* slow */ true,
+				    /* is_alloc */ true);
+				if (tcache != NULL) {
+					bin = &tcache->bins[ind];
+				}
+			}
+			if (bin != NULL) {
+				size_t bin_batch = batch - progress;
+				size_t n = cache_bin_alloc_batch(bin, bin_batch,
+				    ptrs + filled);
+				if (config_stats) {
+					bin->tstats.nrequests += n;
+				}
+				if (zero) {
+					for (size_t i = 0; i < n; ++i) {
+						memset(ptrs[filled + i], 0,
+						    usize);
+					}
+				}
+				if (config_prof && opt_prof
+				    && unlikely(ind >= SC_NBINS)) {
+					for (size_t i = 0; i < n; ++i) {
+						prof_tctx_reset_sampled(tsd,
+						    ptrs[filled + i]);
+					}
+				}
+				progress += n;
+				filled += n;
+			}
+		}
 
 		/*
 		 * For thread events other than prof sampling, trigger them as
@@ -4146,23 +4183,16 @@ batch_alloc(void **ptrs, size_t num, size_t size, int flags) {
 		 * were handled individually, but it would do no harm (or
 		 * even be beneficial) to coalesce the triggerings.
 		 */
-		thread_alloc_event(tsd, n * usize);
+		thread_alloc_event(tsd, progress * usize);
 
-		if (n < batch) { /* OOM */
-			break;
-		}
-
-		if (prof_sample_event) {
-			/*
-			 * The next allocation will be prof sampled.  The
-			 * thread event logic is handled within the mallocx()
-			 * call.
-			 */
+		if (progress < batch || prof_sample_event) {
 			void *p = je_mallocx(size, flags);
 			if (p == NULL) { /* OOM */
 				break;
 			}
-			assert(prof_sampled(tsd, p));
+			if (progress == batch) {
+				assert(prof_sampled(tsd, p));
+			}
 			ptrs[filled++] = p;
 		}
 	}
diff --git a/test/unit/batch_alloc.c b/test/unit/batch_alloc.c
index cb46513d..992990f3 100644
--- a/test/unit/batch_alloc.c
+++ b/test/unit/batch_alloc.c
@@ -5,35 +5,6 @@
 static void *ptrs[BATCH_MAX];
 
 #define PAGE_ALIGNED(ptr) (((uintptr_t)ptr & PAGE_MASK) == 0)
 
-static void
-verify_stats(bin_stats_t *before, bin_stats_t *after, size_t batch,
-    unsigned nregs) {
-	if (!config_stats) {
-		return;
-	}
-	if (config_prof && opt_prof) {
-		/*
-		 * Checking the stats when prof is on is feasible but
-		 * complicated, while checking the non-prof case suffices for
-		 * unit-test purpose.
-		 */
-		return;
-	}
-	expect_u64_eq(before->nmalloc + batch, after->nmalloc, "");
-	expect_u64_eq(before->nrequests + batch, after->nrequests, "");
-	expect_zu_eq(before->curregs + batch, after->curregs, "");
-	size_t nslab = batch / nregs;
-	size_t n_nonfull = 0;
-	if (batch % nregs != 0) {
-		++nslab;
-		++n_nonfull;
-	}
-	expect_u64_eq(before->nslabs + nslab, after->nslabs, "");
-	expect_zu_eq(before->curslabs + nslab, after->curslabs, "");
-	expect_zu_eq(before->nonfull_slabs + n_nonfull, after->nonfull_slabs,
-	    "");
-}
-
 static void
 verify_batch_basic(tsd_t *tsd, void **ptrs, size_t batch, size_t usize,
     bool zero) {
@@ -51,10 +22,21 @@
 static void
 verify_batch_locality(tsd_t *tsd, void **ptrs, size_t batch, size_t usize,
     arena_t *arena, unsigned nregs) {
+	if (config_prof && opt_prof) {
+		/*
+		 * Checking batch locality when prof is on is feasible but
+		 * complicated, while checking the non-prof case suffices for
+		 * unit-test purpose.
+ */ + return; + } for (size_t i = 0, j = 0; i < batch; ++i, ++j) { if (j == nregs) { j = 0; } + if (j == 0 && batch - i < nregs) { + break; + } void *p = ptrs[i]; expect_ptr_eq(iaalloc(tsd_tsdn(tsd), p), arena, ""); if (j == 0) { @@ -63,21 +45,8 @@ verify_batch_locality(tsd_t *tsd, void **ptrs, size_t batch, size_t usize, } assert(i > 0); void *q = ptrs[i - 1]; - bool adjacent = (uintptr_t)p > (uintptr_t)q - && (size_t)((uintptr_t)p - (uintptr_t)q) == usize; - if (config_prof && opt_prof) { - if (adjacent) { - expect_false(prof_sampled(tsd, p) - || prof_sampled(tsd, q), ""); - } else { - expect_true(prof_sampled(tsd, p) - || prof_sampled(tsd, q), ""); - expect_true(PAGE_ALIGNED(p), ""); - j = 0; - } - } else { - expect_true(adjacent, ""); - } + expect_true((uintptr_t)p > (uintptr_t)q + && (size_t)((uintptr_t)p - (uintptr_t)q) == usize, ""); } } @@ -124,8 +93,6 @@ test_wrapper(size_t size, size_t alignment, bool zero, unsigned arena_flag) { arena = arena_choose(tsd, NULL); } assert(arena != NULL); - bin_t *bin = arena_bin_choose(tsd_tsdn(tsd), arena, ind, NULL); - assert(bin != NULL); int flags = arena_flag; if (alignment != 0) { flags |= MALLOCX_ALIGN(alignment); @@ -155,13 +122,9 @@ test_wrapper(size_t size, size_t alignment, bool zero, unsigned arena_flag) { } size_t batch = base + (size_t)j; assert(batch < BATCH_MAX); - bin_stats_t stats_before, stats_after; - memcpy(&stats_before, &bin->stats, sizeof(bin_stats_t)); size_t filled = batch_alloc_wrapper(ptrs, batch, size, flags); assert_zu_eq(filled, batch, ""); - memcpy(&stats_after, &bin->stats, sizeof(bin_stats_t)); - verify_stats(&stats_before, &stats_after, batch, nregs); verify_batch_basic(tsd, ptrs, batch, usize, zero); verify_batch_locality(tsd, ptrs, batch, usize, arena, nregs); @@ -196,8 +159,15 @@ TEST_BEGIN(test_batch_alloc_manual_arena) { } TEST_END -TEST_BEGIN(test_batch_alloc_fallback) { - const size_t size = SC_LARGE_MINCLASS; +TEST_BEGIN(test_batch_alloc_large) { + size_t size = SC_LARGE_MINCLASS; + for (size_t batch = 0; batch < 4; ++batch) { + assert(batch < BATCH_MAX); + size_t filled = batch_alloc(ptrs, batch, size, 0); + assert_zu_eq(filled, batch, ""); + release_batch(ptrs, batch, size); + } + size = tcache_maxclass + 1; for (size_t batch = 0; batch < 4; ++batch) { assert(batch < BATCH_MAX); size_t filled = batch_alloc(ptrs, batch, size, 0); @@ -214,5 +184,5 @@ main(void) { test_batch_alloc_zero, test_batch_alloc_aligned, test_batch_alloc_manual_arena, - test_batch_alloc_fallback); + test_batch_alloc_large); }
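
Editorial note, not part of the patch: with this change, batch_alloc() serves each
iteration of its loop in up to three stages. If the size class is small and the
remaining batch covers at least one slab's worth of regions (bin_infos[ind].nregs),
whole slabs are filled directly via arena_fill_small_fresh(); any remainder of a
tcache-eligible size class (ind < nhbins) is popped from the thread's cache bin via
cache_bin_alloc_batch(); whatever is still unfilled, as well as the next prof-sampled
allocation, falls back to je_mallocx(). The sketch below is a minimal model of that
routing decision only; route_batch() and the SKETCH_* constants are hypothetical
stand-ins for SC_NBINS, nhbins and bin_infos[ind].nregs, and the sketch ignores
partial tcache refills, zeroing and prof sampling.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for SC_NBINS, nhbins and bin_infos[ind].nregs. */
#define SKETCH_NBINS	36
#define SKETCH_NHBINS	40
#define SKETCH_NREGS	64

static void
route_batch(size_t ind, size_t batch) {
	size_t from_arena = 0, from_tcache = 0, from_mallocx = 0;

	/* Stage 1: whole slabs' worth of a small size class -> fresh slabs. */
	if (ind < SKETCH_NBINS && batch >= SKETCH_NREGS) {
		from_arena = batch - batch % SKETCH_NREGS;
	}
	if (ind < SKETCH_NHBINS) {
		/* Stage 2: remainder of a tcache-eligible size class -> cache bin. */
		from_tcache = batch - from_arena;
	} else {
		/* Stage 3: sizes the tcache does not cache -> mallocx() fallback. */
		from_mallocx = batch - from_arena;
	}
	printf("ind=%zu batch=%zu -> arena=%zu tcache=%zu mallocx=%zu\n",
	    ind, batch, from_arena, from_tcache, from_mallocx);
}

int
main(void) {
	route_batch(5, 7);	/* small batch of a small size class: tcache only */
	route_batch(5, 200);	/* large batch: three fresh slabs plus a tcache tail */
	route_batch(50, 3);	/* size class beyond nhbins: mallocx() fallback */
	return 0;
}

In the patch itself the tcache stage may also return fewer regions than requested;
the loop then falls through to je_mallocx() for one allocation and retries, so the
split shown by the sketch is only the ideal case.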