From 143e9c4a2f4eb8916e9802323485fd91260fd17c Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Fri, 21 Oct 2022 15:10:48 -0700
Subject: [PATCH] Enable fast thread locals for dealloc-only threads.

Previously, if a thread does only deallocations, it stays on the slow
path / minimal initialized state forever.  However, dealloc-only is a
valid pattern for dedicated reclamation threads -- this means the
thread cache is disabled (no batched flush) for them, which causes high
overhead and contention.

Added a condition to fully initialize TSD when a fair amount of dealloc
activity is observed.
---
 include/jemalloc/internal/tsd.h |  4 +++
 src/tsd.c                       | 18 ++++++++++
 test/unit/tsd.c                 | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 66d68822..c6bf28fc 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -59,6 +59,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
 #define TSD_DATA_SLOW \
 	O(tcache_enabled, bool, bool) \
 	O(reentrancy_level, int8_t, int8_t) \
+	O(min_init_state_nfetched, uint8_t, uint8_t) \
 	O(thread_allocated_last_event, uint64_t, uint64_t) \
 	O(thread_allocated_next_event, uint64_t, uint64_t) \
 	O(thread_deallocated_last_event, uint64_t, uint64_t) \
@@ -91,6 +92,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
 #define TSD_DATA_SLOW_INITIALIZER \
 	/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
 	/* reentrancy_level */ 0, \
+	/* min_init_state_nfetched */ 0, \
 	/* thread_allocated_last_event */ 0, \
 	/* thread_allocated_next_event */ 0, \
 	/* thread_deallocated_last_event */ 0, \
@@ -177,6 +179,8 @@ void tsd_global_slow_inc(tsdn_t *tsdn);
 void tsd_global_slow_dec(tsdn_t *tsdn);
 bool tsd_global_slow();
 
+#define TSD_MIN_INIT_STATE_MAX_FETCHED (128)
+
 enum {
 	/* Common case --> jnz. */
 	tsd_state_nominal = 0,
diff --git a/src/tsd.c b/src/tsd.c
index e8e4f3a3..cef7ba58 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -300,9 +300,25 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 			tsd_state_set(tsd, tsd_state_minimal_initialized);
 			tsd_set(tsd);
 			tsd_data_init_nocleanup(tsd);
+			*tsd_min_init_state_nfetchedp_get(tsd) = 1;
 		}
 	} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
-		if (!minimal) {
+		/*
+		 * If a thread only ever deallocates (e.g. dedicated
+		 * reclamation threads), we want to help it eventually escape
+		 * the slow path (caused by the minimal initialized state).
+		 * The nfetched counter tracks the number of times the tsd has
+		 * been accessed under the min init state, and triggers the
+		 * switch to nominal once the max allowed count is reached.
+		 *
+		 * This means at most 128 deallocations stay on the slow path.
+		 *
+		 * Also see comments in free_default().
+		 */
+		uint8_t *nfetched = tsd_min_init_state_nfetchedp_get(tsd);
+		assert(*nfetched >= 1);
+		(*nfetched)++;
+		if (!minimal || *nfetched == TSD_MIN_INIT_STATE_MAX_FETCHED) {
 			/* Switch to fully initialized. */
 			tsd_state_set(tsd, tsd_state_nominal);
 			assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 205d8708..bb5cd9f6 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -136,6 +136,61 @@ TEST_BEGIN(test_tsd_reincarnation) {
 }
 TEST_END
 
+static void *
+thd_start_dalloc_only(void *arg) {
+	void **ptrs = (void **)arg;
+
+	tsd_t *tsd = tsd_fetch_min();
+	if (tsd_state_get(tsd) != tsd_state_minimal_initialized) {
+		/* Allocation happened implicitly. */
+		expect_u_eq(tsd_state_get(tsd), tsd_state_nominal,
+		    "TSD state should be nominal");
+		return NULL;
+	}
+
+	void *ptr;
+	for (size_t i = 0; (ptr = ptrs[i]) != NULL; i++) {
+		/* Offset by 1 because of the manual tsd_fetch_min above. */
+		if (i + 1 < TSD_MIN_INIT_STATE_MAX_FETCHED) {
+			expect_u_eq(tsd_state_get(tsd),
+			    tsd_state_minimal_initialized,
+			    "TSD should be minimal initialized");
+		} else {
+			/* State may be nominal or nominal_slow. */
+			expect_true(tsd_nominal(tsd), "TSD should be nominal");
+		}
+		free(ptr);
+	}
+
+	return NULL;
+}
+
+static void
+test_sub_thread_n_dalloc(size_t nptrs) {
+	void **ptrs = (void **)malloc(sizeof(void *) * (nptrs + 1));
+	for (size_t i = 0; i < nptrs; i++) {
+		ptrs[i] = malloc(8);
+	}
+	ptrs[nptrs] = NULL;
+
+	thd_t thd;
+	thd_create(&thd, thd_start_dalloc_only, (void *)ptrs);
+	thd_join(thd, NULL);
+	free(ptrs);
+}
+
+TEST_BEGIN(test_tsd_sub_thread_dalloc_only) {
+	test_sub_thread_n_dalloc(1);
+	test_sub_thread_n_dalloc(16);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 2);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 1);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 1);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 2);
+	test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED * 2);
+}
+TEST_END
+
 typedef struct {
 	atomic_u32_t phase;
 	atomic_b_t error;
@@ -269,6 +324,7 @@ main(void) {
 	return test_no_reentrancy(
 	    test_tsd_main_thread,
 	    test_tsd_sub_thread,
+	    test_tsd_sub_thread_dalloc_only,
 	    test_tsd_reincarnation,
 	    test_tsd_global_slow);
 }
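
Illustration (not part of the patch): the dedicated reclamation-thread
pattern described in the commit message looks roughly like the
stand-alone sketch below -- the main thread allocates, a worker thread
only frees.  The names in the sketch (dealloc_only_worker, NPTRS, the
64-byte allocation size) are made up for the example; only malloc(),
free(), and the TSD behavior described above come from jemalloc.
Previously the worker's TSD would stay minimal initialized for its
whole lifetime, so every free() took the slow path with the thread
cache disabled; with this change, once its TSD has been fetched
TSD_MIN_INIT_STATE_MAX_FETCHED (128) times in the minimal initialized
state, it is promoted to tsd_state_nominal and the remaining frees can
use the thread cache (assuming tcache is enabled).

#include <pthread.h>
#include <stdlib.h>

#define NPTRS 4096

/* Dealloc-only worker: never allocates, only frees what it is handed. */
static void *
dealloc_only_worker(void *arg) {
	void **ptrs = (void **)arg;
	for (size_t i = 0; i < NPTRS; i++) {
		free(ptrs[i]);
	}
	return NULL;
}

int
main(void) {
	static void *ptrs[NPTRS];
	for (size_t i = 0; i < NPTRS; i++) {
		ptrs[i] = malloc(64);
	}
	pthread_t thd;
	pthread_create(&thd, NULL, dealloc_only_worker, (void *)ptrs);
	pthread_join(thd, NULL);
	return 0;
}

The new test_tsd_sub_thread_dalloc_only test above exercises the same
pattern directly against the TSD state machine.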