Allocate tcache stack from base allocator

When metadata_thp is enabled, allocate tcache bin stacks from base0, which means
they are placed on huge pages along with other metadata, instead of being mixed
in with regular allocations.

In order to do so, modify the base allocator to support limited reuse: tcache
stacks freed on thread termination are returned to base0 and made available for
reuse, but no merging is attempted, since they were bump allocated out of base
blocks. These reused base extents are managed by separately allocated base
edata_t structures -- each edata_t is cached in base->edata_avail once the
extent it tracks is fully allocated.

One tricky part is that stats updates must be skipped for such reused extents
(they were already accounted for, and base never purges). This requires tracking
the "is reused" state explicitly and bypassing the stats updates when allocating
from them.
Qi Wang authored on 2023-09-13 21:51:54 -07:00; committed by Qi Wang
parent a442d9b895
commit 72cfdce718
7 changed files with 202 additions and 27 deletions
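
For orientation before the diff: the sketch below illustrates the header scheme
used by b0_alloc_tcache_stack() / b0_dalloc_tcache_stack() in the base allocator
hunks further down -- a pointer to the managing edata_t is stored in a
quantum-sized header right before the returned stack, so the deallocation path
can recover the edata_t and hand the whole piece back to base. This is a minimal
standalone sketch under stated assumptions, not jemalloc code: sketch_edata_t,
stack_alloc, and stack_dalloc are invented names, calloc stands in for the base
bump allocator, and QUANTUM is assumed to be 16 bytes.

/*
 * Sketch of the tcache-stack header scheme (hypothetical names; see the
 * lead-in above). Not jemalloc source.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define QUANTUM 16 /* assumption: 16-byte quantum, typical for 64-bit targets */

typedef struct sketch_edata_s {
    void *addr;   /* base address of the piece, header included */
    size_t bsize; /* total size of the piece */
} sketch_edata_t;

/* Header is at least QUANTUM so the returned stack stays quantum-aligned. */
static size_t
header_size(void) {
    return QUANTUM > sizeof(sketch_edata_t *) ? QUANTUM :
        sizeof(sketch_edata_t *);
}

/* Hand out stack_size usable bytes; record the managing edata in the header. */
static void *
stack_alloc(sketch_edata_t *edata, size_t stack_size) {
    size_t total = stack_size + header_size();
    void *addr = calloc(1, total); /* stand-in for the base bump allocation */
    if (addr == NULL) {
        return NULL;
    }
    edata->addr = addr;
    edata->bsize = total;
    *(sketch_edata_t **)addr = edata;    /* header: pointer to the edata */
    return (char *)addr + header_size(); /* user pointer right after it */
}

/* Recover the edata from the header stored just before the user pointer. */
static sketch_edata_t *
stack_dalloc(void *stack) {
    sketch_edata_t *edata =
        *(sketch_edata_t **)((char *)stack - header_size());
    /* base_alloc returns zeroed memory, so zero the piece before reuse. */
    memset(edata->addr, 0, edata->bsize);
    return edata; /* a real caller would reinsert this into base->avail */
}

int
main(void) {
    sketch_edata_t ed;
    void *stack = stack_alloc(&ed, 1024);
    assert(stack != NULL);
    sketch_edata_t *recovered = stack_dalloc(stack);
    assert(recovered == &ed);
    (void)recovered;
    free(ed.addr);
    return 0;
}

Rounding the header up to QUANTUM keeps the returned stack quantum-aligned,
matching the QUANTUM alignment that cache_bin_info_compute_alloc requests once
cache_bin_stack_use_thp() returns true.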


@ -73,6 +73,9 @@ struct base_s {
/* Heap of extents that track unused trailing space within blocks. */
edata_heap_t avail[SC_NSIZES];
/* Contains reusable base edata (used by tcache_stacks currently). */
edata_avail_t edata_avail;
/* Stats, only maintained if config_stats. */
size_t allocated;
size_t resident;
@ -101,6 +104,8 @@ extent_hooks_t *base_extent_hooks_set(base_t *base,
extent_hooks_t *extent_hooks);
void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
void *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size);
void b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack);
void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
size_t *resident, size_t *mapped, size_t *n_thp);
void base_prefork(tsdn_t *tsdn, base_t *base);


@ -704,5 +704,6 @@ void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
* not cache_bin_init was called on it.
*/
bool cache_bin_still_zero_initialized(cache_bin_t *bin);
bool cache_bin_stack_use_thp(void);
#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */


@ -621,7 +621,8 @@ edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
}
static inline void
-edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
+edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn,
+bool reused) {
edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
edata_addr_set(edata, addr);
edata_bsize_set(edata, bsize);
@ -629,7 +630,8 @@ edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
edata_szind_set(edata, SC_NSIZES);
edata_sn_set(edata, sn);
edata_state_set(edata, extent_state_active);
-edata_guarded_set(edata, false);
+/* See comments in base_edata_is_reused. */
+edata_guarded_set(edata, reused);
edata_zeroed_set(edata, true);
edata_committed_set(edata, true);
/*


@ -110,6 +110,16 @@ label_done:
}
}
static inline bool
base_edata_is_reused(edata_t *edata) {
/*
* Borrow the guarded bit to indicate if the extent is a recycled one,
* i.e. the ones returned to base for reuse; currently only tcache bin
* stacks. Skips stats updating if so (needed for this purpose only).
*/
return edata_guarded_get(edata);
}
static void
base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr,
size_t size) {
@ -118,7 +128,7 @@ base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr,
sn = *extent_sn_next;
(*extent_sn_next)++;
-edata_binit(edata, addr, size, sn);
+edata_binit(edata, addr, size, sn, false /* is_reused */);
}
static size_t
@ -185,24 +195,57 @@ base_extent_bump_alloc_helper(edata_t *edata, size_t *gap_size, size_t size,
assert(edata_bsize_get(edata) >= *gap_size + size);
edata_binit(edata, (void *)((byte_t *)edata_addr_get(edata) +
*gap_size + size), edata_bsize_get(edata) - *gap_size - size,
-edata_sn_get(edata));
+edata_sn_get(edata), base_edata_is_reused(edata));
return ret;
}
static void
-base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size,
-void *addr, size_t size) {
-if (edata_bsize_get(edata) > 0) {
+base_edata_heap_insert(tsdn_t *tsdn, base_t *base, edata_t *edata) {
+malloc_mutex_assert_owner(tsdn, &base->mtx);
+size_t bsize = edata_bsize_get(edata);
+assert(bsize > 0);
/*
-* Compute the index for the largest size class that does not
-* exceed extent's size.
+* Compute the index for the largest size class that does not exceed
+* extent's size.
*/
-szind_t index_floor =
-sz_size2index(edata_bsize_get(edata) + 1) - 1;
+szind_t index_floor = sz_size2index(bsize + 1) - 1;
edata_heap_insert(&base->avail[index_floor], edata);
}
-if (config_stats) {
+/*
+* Only can be called by top-level functions, since it may call base_alloc
+* internally when cache is empty.
+*/
+static edata_t *
+base_alloc_base_edata(tsdn_t *tsdn, base_t *base) {
+edata_t *edata;
+malloc_mutex_lock(tsdn, &base->mtx);
+edata = edata_avail_first(&base->edata_avail);
+if (edata != NULL) {
+edata_avail_remove(&base->edata_avail, edata);
+}
+malloc_mutex_unlock(tsdn, &base->mtx);
+if (edata == NULL) {
+edata = base_alloc_edata(tsdn, base);
+}
+return edata;
+}
+static void
+base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, edata_t *edata,
+size_t gap_size, void *addr, size_t size) {
+if (edata_bsize_get(edata) > 0) {
+base_edata_heap_insert(tsdn, base, edata);
+} else {
+/* Freed base edata_t stored in edata_avail. */
+edata_avail_insert(&base->edata_avail, edata);
+}
+if (config_stats && !base_edata_is_reused(edata)) {
base->allocated += size;
/*
* Add one PAGE to base_resident for every page boundary that is
@ -224,13 +267,13 @@ base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size,
}
static void *
-base_extent_bump_alloc(base_t *base, edata_t *edata, size_t size,
+base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, edata_t *edata, size_t size,
size_t alignment) {
void *ret;
size_t gap_size;
ret = base_extent_bump_alloc_helper(edata, &gap_size, size, alignment);
-base_extent_bump_alloc_post(base, edata, gap_size, ret, size);
+base_extent_bump_alloc_post(tsdn, base, edata, gap_size, ret, size);
return ret;
}
@ -384,6 +427,8 @@ base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
for (szind_t i = 0; i < SC_NSIZES; i++) {
edata_heap_new(&base->avail[i]);
}
edata_avail_new(&base->edata_avail);
if (config_stats) {
base->allocated = sizeof(base_block_t);
base->resident = PAGE_CEILING(sizeof(base_block_t));
@ -395,8 +440,12 @@ base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
assert(base->resident <= base->mapped);
assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
}
-base_extent_bump_alloc_post(base, &block->edata, gap_size, base,
+/* Locking here is only necessary because of assertions. */
+malloc_mutex_lock(tsdn, &base->mtx);
+base_extent_bump_alloc_post(tsdn, base, &block->edata, gap_size, base,
base_size);
+malloc_mutex_unlock(tsdn, &base->mtx);
return base;
}
@ -457,7 +506,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
goto label_return;
}
-ret = base_extent_bump_alloc(base, edata, usize, alignment);
+ret = base_extent_bump_alloc(tsdn, base, edata, usize, alignment);
if (esn != NULL) {
*esn = (size_t)edata_sn_get(edata);
}
@ -491,6 +540,74 @@ base_alloc_edata(tsdn_t *tsdn, base_t *base) {
return edata;
}
static inline void
b0_alloc_header_size(size_t *header_size, size_t *alignment) {
*alignment = QUANTUM;
*header_size = QUANTUM > sizeof(edata_t *) ? QUANTUM :
sizeof(edata_t *);
}
/*
* Each piece allocated here is managed by a separate edata, because it was bump
* allocated and cannot be merged back into the original base_block. This means
* it's not for general purpose: 1) they are not page aligned, nor page sized,
* and 2) the requested size should not be too small (as each piece comes with
* an edata_t). Only used for tcache bin stack allocation now.
*/
void *
b0_alloc_tcache_stack(tsdn_t *tsdn, size_t stack_size) {
base_t *base = b0get();
edata_t *edata = base_alloc_base_edata(tsdn, base);
if (edata == NULL) {
return NULL;
}
/*
* Reserve room for the header, which stores a pointer to the managing
* edata_t. The header itself is located right before the return
* address, so that edata can be retrieved on dalloc. Bump up to usize
* to improve reusability -- otherwise the freed stacks will be put back
* into the previous size class.
*/
size_t esn, alignment, header_size;
b0_alloc_header_size(&header_size, &alignment);
size_t alloc_size = sz_s2u(stack_size + header_size);
void *addr = base_alloc_impl(tsdn, base, alloc_size, alignment, &esn);
if (addr == NULL) {
edata_avail_insert(&base->edata_avail, edata);
return NULL;
}
/* Set is_reused: see comments in base_edata_is_reused. */
edata_binit(edata, addr, alloc_size, esn, true /* is_reused */);
*(edata_t **)addr = edata;
return (byte_t *)addr + header_size;
}
void
b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack) {
/* edata_t pointer stored in header. */
size_t alignment, header_size;
b0_alloc_header_size(&header_size, &alignment);
edata_t *edata = *(edata_t **)((byte_t *)tcache_stack - header_size);
void *addr = edata_addr_get(edata);
size_t bsize = edata_bsize_get(edata);
/* Marked as "reused" to avoid double counting stats. */
assert(base_edata_is_reused(edata));
assert(addr != NULL && bsize > 0);
/* Zero out since base_alloc returns zeroed memory. */
memset(addr, 0, bsize);
base_t *base = b0get();
malloc_mutex_lock(tsdn, &base->mtx);
base_edata_heap_insert(tsdn, base, edata);
malloc_mutex_unlock(tsdn, &base->mtx);
}
void
base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
size_t *mapped, size_t *n_thp) {


@ -14,6 +14,17 @@ cache_bin_info_init(cache_bin_info_t *info,
info->ncached_max = (cache_bin_sz_t)ncached_max;
}
bool
cache_bin_stack_use_thp(void) {
/*
* If metadata_thp is enabled, allocate tcache stacks from the base
* allocator for efficiency gains. The downside, however, is that the base
* allocator never purges freed memory, and may cache a fair amount of
* memory after many threads are terminated and not reused.
*/
return metadata_thp_enabled();
}
void
cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
size_t *size, size_t *alignment) {
@ -31,10 +42,11 @@ cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
}
/*
-* Align to at least PAGE, to minimize the # of TLBs needed by the
-* smaller sizes; also helps if the larger sizes don't get used at all.
+* When not using THP, align to at least PAGE, to minimize the # of TLBs
+* needed by the smaller sizes; also helps if the larger sizes don't get
+* used at all.
*/
-*alignment = PAGE;
+*alignment = cache_bin_stack_use_thp() ? QUANTUM : PAGE;
}
void


@ -2,6 +2,7 @@
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/san.h"
@ -814,10 +815,17 @@ tsd_tcache_data_init(tsd_t *tsd, arena_t *arena) {
tcache_bin_info_compute(tcache_bin_info, tcache_nhbins);
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nhbins,
&size, &alignment);
-size = sz_sa2u(size, alignment);
-void *mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL,
+void *mem;
+if (cache_bin_stack_use_thp()) {
+/* Alignment is ignored since it comes from THP. */
+assert(alignment == QUANTUM);
+mem = b0_alloc_tcache_stack(tsd_tsdn(tsd), size);
+} else {
+size = sz_sa2u(size, alignment);
+mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL,
true, arena_get(TSDN_NULL, 0, true));
+}
if (mem == NULL) {
return true;
}
@ -925,8 +933,12 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
cache_bin_t *cache_bin = &tcache->bins[0];
cache_bin_assert_empty(cache_bin, &cache_bin->bin_info);
}
-idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL, true,
-true);
+if (tsd_tcache && cache_bin_stack_use_thp()) {
+b0_dalloc_tcache_stack(tsd_tsdn(tsd), tcache_slow->dyn_alloc);
+} else {
+idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL,
+true, true);
+}
/*
* The deallocation and tcache flush above may not trigger decay since


@ -215,6 +215,29 @@ tcache_max2nhbins(size_t tcache_max) {
return sz_size2index(tcache_max) + 1;
}
static void
validate_tcache_stack(tcache_t *tcache) {
/* Assume bins[0] is enabled. */
void *tcache_stack = tcache->bins[0].stack_head;
bool expect_found = cache_bin_stack_use_thp() ? true : false;
/* Walk through all blocks to see if the stack is within range. */
base_t *base = b0get();
base_block_t *next = base->blocks;
bool found = false;
do {
base_block_t *block = next;
if ((byte_t *)tcache_stack >= (byte_t *)block &&
(byte_t *)tcache_stack < ((byte_t *)block + block->size)) {
found = true;
break;
}
next = block->next;
} while (next != NULL);
expect_true(found == expect_found, "Unexpected tcache stack source");
}
static void *
tcache_check(void *arg) {
size_t old_tcache_max, new_tcache_max, min_tcache_max, sz;
@ -235,6 +258,7 @@ tcache_check(void *arg) {
tcache_nhbins = tcache_nhbins_get(tcache);
expect_zu_eq(tcache_nhbins, (size_t)global_do_not_change_nhbins,
"Unexpected default value for tcache_nhbins");
validate_tcache_stack(tcache);
/*
* Close the tcache and test the set.
@ -280,6 +304,7 @@ tcache_check(void *arg) {
"Unexpected.mallctl().failure");
expect_zu_eq(old_tcache_max, min_tcache_max,
"Unexpected value for tcache_max");
validate_tcache_stack(tcache);
/*
* Check the thread's tcache_max and nhbins both through mallctl
@ -303,6 +328,7 @@ tcache_check(void *arg) {
test_tcache_max_impl(new_tcache_max,
alloc_option, dalloc_option);
}
validate_tcache_stack(tcache);
}
return NULL;