From 72cfdce71806443f4ccdbfe10aa5d50346a3d07e Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Wed, 13 Sep 2023 21:51:54 -0700
Subject: [PATCH] Allocate tcache stack from base allocator

When using metadata_thp, allocate tcache bin stacks from base0, which means
they will be placed on huge pages along with other metadata, instead of being
mixed with other regular allocations.

In order to do so, the base allocator is modified to support limited reuse:
freed tcache stacks (from thread termination) are returned to base0 and made
available for reuse, but no merging is attempted, since they were bump
allocated out of base blocks. These reused base extents are managed by
separately allocated base edata_t -- they are cached in base->edata_avail
when the extent they describe is fully allocated.

One tricky part is that stats updates must be skipped for such reused extents
(they were already accounted for, and base never purges). This requires
tracking the "is reused" state explicitly and bypassing the stats updates
when allocating from them.
---
 include/jemalloc/internal/base.h      |   5 +
 include/jemalloc/internal/cache_bin.h |   1 +
 include/jemalloc/internal/edata.h     |   6 +-
 src/base.c                            | 151 +++++++++++++++++++++++---
 src/cache_bin.c                       |  18 ++-
 src/tcache.c                          |  22 +++-
 test/unit/tcache_max.c                |  26 +++++
 7 files changed, 202 insertions(+), 27 deletions(-)

diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h
index 6b41aa6f..451be10f 100644
--- a/include/jemalloc/internal/base.h
+++ b/include/jemalloc/internal/base.h
@@ -73,6 +73,9 @@ struct base_s {
 	/* Heap of extents that track unused trailing space within blocks. */
 	edata_heap_t avail[SC_NSIZES];
 
+	/* Contains reusable base edata (used by tcache_stacks currently). */
+	edata_avail_t edata_avail;
+
 	/* Stats, only maintained if config_stats. */
 	size_t allocated;
 	size_t resident;
@@ -101,6 +104,8 @@ extent_hooks_t *base_extent_hooks_set(base_t *base,
     extent_hooks_t *extent_hooks);
 void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
 edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
+void *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size);
+void b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack);
 void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
     size_t *resident, size_t *mapped, size_t *n_thp);
 void base_prefork(tsdn_t *tsdn, base_t *base);
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index 4cfc3f1d..78ac3295 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -704,5 +704,6 @@ void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
  * not cache_bin_init was called on it.
  */
 bool cache_bin_still_zero_initialized(cache_bin_t *bin);
+bool cache_bin_stack_use_thp(void);
 
 #endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h
index baf5187f..17befd92 100644
--- a/include/jemalloc/internal/edata.h
+++ b/include/jemalloc/internal/edata.h
@@ -621,7 +621,8 @@ edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
 }
 
 static inline void
-edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
+edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn,
+    bool reused) {
 	edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
 	edata_addr_set(edata, addr);
 	edata_bsize_set(edata, bsize);
@@ -629,7 +630,8 @@ edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
 	edata_szind_set(edata, SC_NSIZES);
 	edata_sn_set(edata, sn);
 	edata_state_set(edata, extent_state_active);
-	edata_guarded_set(edata, false);
+	/* See comments in base_edata_is_reused. */
+	edata_guarded_set(edata, reused);
 	edata_zeroed_set(edata, true);
 	edata_committed_set(edata, true);
 	/*
diff --git a/src/base.c b/src/base.c
index 8e4606d0..e1dfe604 100644
--- a/src/base.c
+++ b/src/base.c
@@ -110,6 +110,16 @@ label_done:
 	}
 }
 
+static inline bool
+base_edata_is_reused(edata_t *edata) {
+	/*
+	 * Borrow the guarded bit to indicate whether the extent is a recycled
+	 * one, i.e. returned to base for reuse; currently only tcache bin
+	 * stacks. Stats updating is skipped if so (the flag's only purpose).
+	 */
+	return edata_guarded_get(edata);
+}
+
 static void
 base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr,
     size_t size) {
@@ -118,7 +128,7 @@ base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr,
 	sn = *extent_sn_next;
 	(*extent_sn_next)++;
 
-	edata_binit(edata, addr, size, sn);
+	edata_binit(edata, addr, size, sn, false /* is_reused */);
 }
 
 static size_t
@@ -185,24 +195,57 @@ base_extent_bump_alloc_helper(edata_t *edata, size_t *gap_size, size_t size,
 	assert(edata_bsize_get(edata) >= *gap_size + size);
 	edata_binit(edata, (void *)((byte_t *)edata_addr_get(edata) +
 	    *gap_size + size), edata_bsize_get(edata) - *gap_size - size,
-	    edata_sn_get(edata));
+	    edata_sn_get(edata), base_edata_is_reused(edata));
 	return ret;
 }
 
 static void
-base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size,
-    void *addr, size_t size) {
-	if (edata_bsize_get(edata) > 0) {
-		/*
-		 * Compute the index for the largest size class that does not
-		 * exceed extent's size.
-		 */
-		szind_t index_floor =
-		    sz_size2index(edata_bsize_get(edata) + 1) - 1;
-		edata_heap_insert(&base->avail[index_floor], edata);
+base_edata_heap_insert(tsdn_t *tsdn, base_t *base, edata_t *edata) {
+	malloc_mutex_assert_owner(tsdn, &base->mtx);
+
+	size_t bsize = edata_bsize_get(edata);
+	assert(bsize > 0);
+	/*
+	 * Compute the index for the largest size class that does not exceed
+	 * extent's size.
+	 */
+	szind_t index_floor = sz_size2index(bsize + 1) - 1;
+	edata_heap_insert(&base->avail[index_floor], edata);
+}
+
+/*
+ * Can only be called by top-level functions, since it may call base_alloc
+ * internally when the cache is empty.
+ */
+static edata_t *
+base_alloc_base_edata(tsdn_t *tsdn, base_t *base) {
+	edata_t *edata;
+
+	malloc_mutex_lock(tsdn, &base->mtx);
+	edata = edata_avail_first(&base->edata_avail);
+	if (edata != NULL) {
+		edata_avail_remove(&base->edata_avail, edata);
+	}
+	malloc_mutex_unlock(tsdn, &base->mtx);
+
+	if (edata == NULL) {
+		edata = base_alloc_edata(tsdn, base);
 	}
-	if (config_stats) {
+
+	return edata;
+}
+
+static void
+base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, edata_t *edata,
+    size_t gap_size, void *addr, size_t size) {
+	if (edata_bsize_get(edata) > 0) {
+		base_edata_heap_insert(tsdn, base, edata);
+	} else {
+		/* Freed base edata_t stored in edata_avail. */
+		edata_avail_insert(&base->edata_avail, edata);
+	}
+
+	if (config_stats && !base_edata_is_reused(edata)) {
 		base->allocated += size;
 		/*
 		 * Add one PAGE to base_resident for every page boundary that is
@@ -224,13 +267,13 @@ base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size,
 }
 
 static void *
-base_extent_bump_alloc(base_t *base, edata_t *edata, size_t size,
+base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, edata_t *edata, size_t size,
     size_t alignment) {
 	void *ret;
 	size_t gap_size;
 
 	ret = base_extent_bump_alloc_helper(edata, &gap_size, size, alignment);
-	base_extent_bump_alloc_post(base, edata, gap_size, ret, size);
+	base_extent_bump_alloc_post(tsdn, base, edata, gap_size, ret, size);
 	return ret;
 }
 
@@ -384,6 +427,8 @@ base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
 	for (szind_t i = 0; i < SC_NSIZES; i++) {
 		edata_heap_new(&base->avail[i]);
 	}
+	edata_avail_new(&base->edata_avail);
+
 	if (config_stats) {
 		base->allocated = sizeof(base_block_t);
 		base->resident = PAGE_CEILING(sizeof(base_block_t));
@@ -395,8 +440,12 @@ base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
 		assert(base->resident <= base->mapped);
 		assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
 	}
-	base_extent_bump_alloc_post(base, &block->edata, gap_size, base,
+
+	/* Locking here is only necessary because of assertions. */
+	malloc_mutex_lock(tsdn, &base->mtx);
+	base_extent_bump_alloc_post(tsdn, base, &block->edata, gap_size, base,
 	    base_size);
+	malloc_mutex_unlock(tsdn, &base->mtx);
 
 	return base;
 }
@@ -457,7 +506,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
 		goto label_return;
 	}
 
-	ret = base_extent_bump_alloc(base, edata, usize, alignment);
+	ret = base_extent_bump_alloc(tsdn, base, edata, usize, alignment);
 	if (esn != NULL) {
 		*esn = (size_t)edata_sn_get(edata);
 	}
@@ -491,6 +540,74 @@ base_alloc_edata(tsdn_t *tsdn, base_t *base) {
 	return edata;
 }
 
+static inline void
+b0_alloc_header_size(size_t *header_size, size_t *alignment) {
+	*alignment = QUANTUM;
+	*header_size = QUANTUM > sizeof(edata_t *) ? QUANTUM :
+	    sizeof(edata_t *);
+}
+
+/*
+ * Each piece allocated here is managed by a separate edata, because it was
+ * bump allocated and cannot be merged back into the original base block. This
+ * means it is not general purpose: 1) the pieces are neither page aligned nor
+ * page sized, and 2) the requested size should not be too small (since each
+ * piece comes with an edata_t). Currently only used for tcache bin stacks.
+ */
+void *
+b0_alloc_tcache_stack(tsdn_t *tsdn, size_t stack_size) {
+	base_t *base = b0get();
+	edata_t *edata = base_alloc_base_edata(tsdn, base);
+	if (edata == NULL) {
+		return NULL;
+	}
+
+	/*
+	 * Reserve room for the header, which stores a pointer to the managing
+	 * edata_t. The header is located right before the returned address, so
+	 * that the edata can be retrieved on dalloc. Bump up to usize to
+	 * improve reusability -- otherwise a freed stack would fall under the
+	 * next smaller size class and be missed on reuse.
+	 */
+	size_t esn, alignment, header_size;
+	b0_alloc_header_size(&header_size, &alignment);
+
+	size_t alloc_size = sz_s2u(stack_size + header_size);
+	void *addr = base_alloc_impl(tsdn, base, alloc_size, alignment, &esn);
+	if (addr == NULL) {
+		edata_avail_insert(&base->edata_avail, edata);
+		return NULL;
+	}
+
+	/* Set is_reused: see comments in base_edata_is_reused. */
+	edata_binit(edata, addr, alloc_size, esn, true /* is_reused */);
+	*(edata_t **)addr = edata;
+
+	return (byte_t *)addr + header_size;
+}
+
+void
+b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack) {
+	/* edata_t pointer stored in header. */
+	size_t alignment, header_size;
+	b0_alloc_header_size(&header_size, &alignment);
+
+	edata_t *edata = *(edata_t **)((byte_t *)tcache_stack - header_size);
+	void *addr = edata_addr_get(edata);
+	size_t bsize = edata_bsize_get(edata);
+	/* Marked as "reused" to avoid double-counting stats. */
+	assert(base_edata_is_reused(edata));
+	assert(addr != NULL && bsize > 0);
+
+	/* Zero out since base_alloc returns zeroed memory. */
+	memset(addr, 0, bsize);
+
+	base_t *base = b0get();
+	malloc_mutex_lock(tsdn, &base->mtx);
+	base_edata_heap_insert(tsdn, base, edata);
+	malloc_mutex_unlock(tsdn, &base->mtx);
+}
+
 void
 base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
     size_t *mapped, size_t *n_thp) {
diff --git a/src/cache_bin.c b/src/cache_bin.c
index 03577084..2ad2062d 100644
--- a/src/cache_bin.c
+++ b/src/cache_bin.c
@@ -14,6 +14,17 @@ cache_bin_info_init(cache_bin_info_t *info,
 	info->ncached_max = (cache_bin_sz_t)ncached_max;
 }
 
+bool
+cache_bin_stack_use_thp(void) {
+	/*
+	 * If metadata_thp is enabled, allocate tcache stacks from the base
+	 * allocator for efficiency gains. The downside, however, is that the
+	 * base allocator never purges freed memory and may cache a fair amount
+	 * of memory after many threads have terminated, if it is not reused.
+	 */
+	return metadata_thp_enabled();
+}
+
 void
 cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
     size_t *size, size_t *alignment) {
@@ -31,10 +42,11 @@ cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
 	}
 
 	/*
-	 * Align to at least PAGE, to minimize the # of TLBs needed by the
-	 * smaller sizes; also helps if the larger sizes don't get used at all.
+	 * When not using THP, align to at least PAGE, to minimize the # of TLBs
+	 * needed by the smaller sizes; also helps if the larger sizes don't get
+	 * used at all.
 	 */
-	*alignment = PAGE;
+	*alignment = cache_bin_stack_use_thp() ? QUANTUM : PAGE;
 }
 
 void
diff --git a/src/tcache.c b/src/tcache.c
index ae68c08b..2c0a7e2e 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -2,6 +2,7 @@
 #include "jemalloc/internal/jemalloc_internal_includes.h"
 
 #include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/base.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/safety_check.h"
 #include "jemalloc/internal/san.h"
@@ -814,10 +815,17 @@ tsd_tcache_data_init(tsd_t *tsd, arena_t *arena) {
 	tcache_bin_info_compute(tcache_bin_info, tcache_nhbins);
 	cache_bin_info_compute_alloc(tcache_bin_info, tcache_nhbins, &size,
 	    &alignment);
 
-	size = sz_sa2u(size, alignment);
-	void *mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL,
-	    true, arena_get(TSDN_NULL, 0, true));
+	void *mem;
+	if (cache_bin_stack_use_thp()) {
+		/* Alignment is ignored since it comes from THP. */
+		assert(alignment == QUANTUM);
+		mem = b0_alloc_tcache_stack(tsd_tsdn(tsd), size);
+	} else {
+		size = sz_sa2u(size, alignment);
+		mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL,
+		    true, arena_get(TSDN_NULL, 0, true));
+	}
 	if (mem == NULL) {
 		return true;
 	}
@@ -925,8 +933,12 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 		cache_bin_t *cache_bin = &tcache->bins[0];
 		cache_bin_assert_empty(cache_bin, &cache_bin->bin_info);
 	}
-	idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL, true,
-	    true);
+	if (tsd_tcache && cache_bin_stack_use_thp()) {
+		b0_dalloc_tcache_stack(tsd_tsdn(tsd), tcache_slow->dyn_alloc);
+	} else {
+		idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL,
+		    true, true);
+	}
 
 	/*
 	 * The deallocation and tcache flush above may not trigger decay since
diff --git a/test/unit/tcache_max.c b/test/unit/tcache_max.c
index 0a563c2f..6481504e 100644
--- a/test/unit/tcache_max.c
+++ b/test/unit/tcache_max.c
@@ -215,6 +215,29 @@ tcache_max2nhbins(size_t tcache_max) {
 	return sz_size2index(tcache_max) + 1;
 }
 
+static void
+validate_tcache_stack(tcache_t *tcache) {
+	/* Assume bins[0] is enabled. */
+	void *tcache_stack = tcache->bins[0].stack_head;
+	bool expect_found = cache_bin_stack_use_thp() ? true : false;
+
+	/* Walk through all blocks to see if the stack is within range. */
+	base_t *base = b0get();
+	base_block_t *next = base->blocks;
+	bool found = false;
+	do {
+		base_block_t *block = next;
+		if ((byte_t *)tcache_stack >= (byte_t *)block &&
+		    (byte_t *)tcache_stack < ((byte_t *)block + block->size)) {
+			found = true;
+			break;
+		}
+		next = block->next;
+	} while (next != NULL);
+
+	expect_true(found == expect_found, "Unexpected tcache stack source");
+}
+
 static void *
 tcache_check(void *arg) {
 	size_t old_tcache_max, new_tcache_max, min_tcache_max, sz;
@@ -235,6 +258,7 @@ tcache_check(void *arg) {
 	tcache_nhbins = tcache_nhbins_get(tcache);
 	expect_zu_eq(tcache_nhbins, (size_t)global_do_not_change_nhbins,
 	    "Unexpected default value for tcache_nhbins");
+	validate_tcache_stack(tcache);
 
 	/*
 	 * Close the tcache and test the set.
@@ -280,6 +304,7 @@ tcache_check(void *arg) {
 	    "Unexpected mallctl() failure");
 	expect_zu_eq(old_tcache_max, min_tcache_max,
 	    "Unexpected value for tcache_max");
+	validate_tcache_stack(tcache);
 
 	/*
 	 * Check the thread's tcache_max and nhbins both through mallctl
@@ -303,6 +328,7 @@ tcache_check(void *arg) {
 			test_tcache_max_impl(new_tcache_max, alloc_option,
 			    dalloc_option);
 		}
+		validate_tcache_stack(tcache);
 	}
 
 	return NULL;
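
An illustrative, standalone sketch of the header scheme that b0_alloc_tcache_stack()
and b0_dalloc_tcache_stack() rely on: a small, alignment-preserving header is placed
right before the address handed back to the caller, holding a pointer to the managing
metadata so it can be recovered on deallocation. Everything below is a stand-in for
illustration only -- the stack_md_t type, the malloc/posix_memalign backing, and the
hard-coded 16-byte SKETCH_QUANTUM are not jemalloc internals; the patch stores an
edata_t * and draws the memory from base0 instead.

#define _POSIX_C_SOURCE 200112L /* for posix_memalign */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for QUANTUM; jemalloc derives the real value from the platform. */
#define SKETCH_QUANTUM 16

/* Stand-in for the edata_t that the patch stores in the header. */
typedef struct stack_md_s {
	void *addr;   /* start of the underlying allocation, header included */
	size_t bsize; /* total size of that allocation */
} stack_md_t;

/* Header is at least pointer-sized and keeps the payload QUANTUM-aligned. */
static size_t
header_size(void) {
	return SKETCH_QUANTUM > sizeof(stack_md_t *) ? SKETCH_QUANTUM :
	    sizeof(stack_md_t *);
}

static void *
stack_alloc(size_t stack_size) {
	stack_md_t *md = malloc(sizeof(stack_md_t));
	if (md == NULL) {
		return NULL;
	}
	size_t total = stack_size + header_size();
	void *addr;
	if (posix_memalign(&addr, SKETCH_QUANTUM, total) != 0) {
		free(md);
		return NULL;
	}
	/* The base allocator hands out zeroed memory; malloc does not. */
	memset(addr, 0, total);
	md->addr = addr;
	md->bsize = total;
	/* Stash the metadata pointer right before the returned address. */
	*(stack_md_t **)addr = md;
	return (char *)addr + header_size();
}

static void
stack_dalloc(void *stack) {
	/* Recover the metadata pointer from the header. */
	stack_md_t *md = *(stack_md_t **)((char *)stack - header_size());
	/* Zero the whole piece before recycling, mirroring b0_dalloc_tcache_stack. */
	memset(md->addr, 0, md->bsize);
	free(md->addr);
	free(md);
}

int
main(void) {
	void *stack = stack_alloc(1024);
	assert(stack != NULL);
	assert(((uintptr_t)stack % SKETCH_QUANTUM) == 0);
	stack_dalloc(stack);
	printf("header scheme ok\n");
	return 0;
}

In the patch itself the freed piece is not unmapped: it goes back through
base_edata_heap_insert(), so the zeroed stack becomes available for the next
thread's tcache.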