From b348ba29bb94b6e9da8dcea1105d4614556aceb9 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Tue, 11 Apr 2017 23:13:45 -0700
Subject: [PATCH] Bundle 3 branches on fast path into tsd_state.

Added tsd_state_nominal_slow, which on fast path malloc() incorporates
tcache_enabled check, and on fast path free() bundles both malloc_slow and
tcache_enabled branches.
---
 .../internal/jemalloc_internal_externs.h      |   4 +
 include/jemalloc/internal/private_symbols.txt |   3 +
 include/jemalloc/internal/tcache_inlines.h    |   1 +
 include/jemalloc/internal/tsd_externs.h       |   2 +
 include/jemalloc/internal/tsd_inlines.h       |  83 +++++++-------
 include/jemalloc/internal/tsd_structs.h       |   2 +-
 include/jemalloc/internal/tsd_types.h         |  10 +-
 src/jemalloc.c                                | 101 +++++++++++-------
 src/tcache.c                                  |   2 +
 src/tsd.c                                     |  40 ++++++-
 10 files changed, 170 insertions(+), 78 deletions(-)

diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h
index 56d39d48..45c119f8 100644
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -3,6 +3,10 @@
 
 #include "jemalloc/internal/atomic.h"
 
+/* TSD checks this to set thread local slow state accordingly. */
+extern bool malloc_slow;
+
+/* Run-time options. */
 extern bool opt_abort;
 extern const char *opt_junk;
 extern bool opt_junk_alloc;
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 4931d489..c1573aa6 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -289,6 +289,7 @@ malloc_mutex_postfork_parent
 malloc_mutex_prefork
 malloc_mutex_unlock
 malloc_printf
+malloc_slow
 malloc_snprintf
 malloc_strtoumax
 malloc_tsd_boot0
@@ -526,6 +527,7 @@ tsd_cleanup
 tsd_cleanup_wrapper
 tsd_fetch
 tsd_fetch_impl
+tsd_fetch_slow
 tsd_get
 tsd_get_allocates
 tsd_iarena_get
@@ -541,6 +543,7 @@ tsd_narenas_tdatap_get
 tsd_reentrancy_level_get
 tsd_reentrancy_level_set
 tsd_reentrancy_levelp_get
+tsd_slow_update
 tsd_wrapper_get
 tsd_wrapper_set
 tsd_nominal
diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index dae43f99..ea29f350 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -40,6 +40,7 @@ tcache_enabled_set(tsd_t *tsd, bool enabled) {
 	}
 	/* Commit the state last. Above calls check current state. */
 	tsd_tcache_enabled_set(tsd, enabled);
+	tsd_slow_update(tsd);
 }
 
 JEMALLOC_ALWAYS_INLINE void
diff --git a/include/jemalloc/internal/tsd_externs.h b/include/jemalloc/internal/tsd_externs.h
index d15fd591..6b9dfdc6 100644
--- a/include/jemalloc/internal/tsd_externs.h
+++ b/include/jemalloc/internal/tsd_externs.h
@@ -14,5 +14,7 @@ void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block);
 #endif
 bool tsd_data_init(void *arg);
 void tsd_cleanup(void *arg);
+tsd_t *tsd_fetch_slow(tsd_t *tsd);
+void tsd_slow_update(tsd_t *tsd);
 
 #endif /* JEMALLOC_INTERNAL_TSD_EXTERNS_H */
diff --git a/include/jemalloc/internal/tsd_inlines.h b/include/jemalloc/internal/tsd_inlines.h
index 7c3fba5f..46eefb6e 100644
--- a/include/jemalloc/internal/tsd_inlines.h
+++ b/include/jemalloc/internal/tsd_inlines.h
@@ -19,12 +19,54 @@ bool tsdn_null(const tsdn_t *tsdn);
 tsd_t *tsdn_tsd(tsdn_t *tsdn);
 rtree_ctx_t *tsd_rtree_ctx(tsd_t *tsd);
 rtree_ctx_t *tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback);
+bool tsd_fast(tsd_t *tsd);
+void tsd_assert_fast(tsd_t *tsd);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_))
 malloc_tsd_externs(, tsd_t)
 malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup)
 
+#define MALLOC_TSD_getset_yes(n, t) \
+JEMALLOC_ALWAYS_INLINE t \
+tsd_##n##_get(tsd_t *tsd) { \
+	return *tsd_##n##p_get(tsd); \
+} \
+JEMALLOC_ALWAYS_INLINE void \
+tsd_##n##_set(tsd_t *tsd, t n) { \
+	assert(tsd->state == tsd_state_nominal || \
+	    tsd->state == tsd_state_nominal_slow || \
+	    tsd->state == tsd_state_reincarnated); \
+	tsd->n = n; \
+}
+#define MALLOC_TSD_getset_no(n, t)
+#define O(n, t, gs, i, c) \
+JEMALLOC_ALWAYS_INLINE t * \
+tsd_##n##p_get(tsd_t *tsd) { \
+	return &tsd->n; \
+} \
+	\
+MALLOC_TSD_getset_##gs(n, t)
+MALLOC_TSD
+#undef MALLOC_TSD_getset_yes
+#undef MALLOC_TSD_getset_no
+#undef O
+
+JEMALLOC_ALWAYS_INLINE void
+tsd_assert_fast(tsd_t *tsd) {
+	assert(!malloc_slow && tsd_tcache_enabled_get(tsd));
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+tsd_fast(tsd_t *tsd) {
+	bool fast = (tsd->state == tsd_state_nominal);
+	if (fast) {
+		tsd_assert_fast(tsd);
+	}
+
+	return fast;
+}
+
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_fetch_impl(bool init) {
 	tsd_t *tsd = tsd_get(init);
@@ -35,19 +77,10 @@ tsd_fetch_impl(bool init) {
 	assert(tsd != NULL);
 
 	if (unlikely(tsd->state != tsd_state_nominal)) {
-		if (tsd->state == tsd_state_uninitialized) {
-			tsd->state = tsd_state_nominal;
-			/* Trigger cleanup handler registration. */
-			tsd_set(tsd);
-			tsd_data_init(tsd);
-		} else if (tsd->state == tsd_state_purgatory) {
-			tsd->state = tsd_state_reincarnated;
-			tsd_set(tsd);
-			tsd_data_init(tsd);
-		} else {
-			assert(tsd->state == tsd_state_reincarnated);
-		}
+		return tsd_fetch_slow(tsd);
 	}
+	assert(tsd_fast(tsd));
+	tsd_assert_fast(tsd);
 
 	return tsd;
 }
@@ -64,33 +97,9 @@ tsd_tsdn(tsd_t *tsd) {
 
 JEMALLOC_INLINE bool
 tsd_nominal(tsd_t *tsd) {
-	return (tsd->state == tsd_state_nominal);
+	return (tsd->state <= tsd_state_nominal_max);
 }
 
-#define MALLOC_TSD_getset_yes(n, t) \
-JEMALLOC_ALWAYS_INLINE t \
-tsd_##n##_get(tsd_t *tsd) { \
-	return *tsd_##n##p_get(tsd); \
-} \
-JEMALLOC_ALWAYS_INLINE void \
-tsd_##n##_set(tsd_t *tsd, t n) { \
-	assert(tsd->state == tsd_state_nominal || \
-	    tsd->state == tsd_state_reincarnated); \
-	tsd->n = n; \
-}
-#define MALLOC_TSD_getset_no(n, t)
-#define O(n, t, gs, i, c) \
-JEMALLOC_ALWAYS_INLINE t * \
-tsd_##n##p_get(tsd_t *tsd) { \
-	return &tsd->n; \
-} \
-	\
-MALLOC_TSD_getset_##gs(n, t)
-MALLOC_TSD
-#undef MALLOC_TSD_getset_yes
-#undef MALLOC_TSD_getset_no
-#undef O
-
 JEMALLOC_ALWAYS_INLINE tsdn_t *
 tsdn_fetch(void) {
 	if (!tsd_booted_get()) {
diff --git a/include/jemalloc/internal/tsd_structs.h b/include/jemalloc/internal/tsd_structs.h
index ac74152c..c166fe6b 100644
--- a/include/jemalloc/internal/tsd_structs.h
+++ b/include/jemalloc/internal/tsd_structs.h
@@ -64,7 +64,7 @@ struct tsd_init_head_s {
 	O(iarena, arena_t *, yes, no, yes) \
 	O(arena, arena_t *, yes, no, yes) \
 	O(arenas_tdata, arena_tdata_t *, yes, no, yes) \
-	O(tcache, tcache_t, yes, no, yes) \
+	O(tcache, tcache_t, no, no, yes) \
 	O(witnesses, witness_list_t, no, no, yes) \
 	O(rtree_leaf_elm_witnesses, rtree_leaf_elm_witness_tsd_t, \
 	    no, no, no) \
diff --git a/include/jemalloc/internal/tsd_types.h b/include/jemalloc/internal/tsd_types.h
index 27afd1d6..dc9efbb6 100644
--- a/include/jemalloc/internal/tsd_types.h
+++ b/include/jemalloc/internal/tsd_types.h
@@ -20,11 +20,15 @@ typedef struct tsdn_s tsdn_t;
 #define TSDN_NULL ((tsdn_t *)0)
 
 enum {
-	tsd_state_uninitialized = 0,
-	tsd_state_nominal = 1,
+	tsd_state_nominal = 0, /* Common case --> jnz. */
+	tsd_state_nominal_slow = 1, /* Initialized but on slow path. */
+	/* the above 2 nominal states should be lower values. */
+	tsd_state_nominal_max = 1, /* used for comparison only. */
 	tsd_state_purgatory = 2,
-	tsd_state_reincarnated = 3
+	tsd_state_reincarnated = 3,
+	tsd_state_uninitialized = 4
 };
+
 /* Manually limit tsd_state_t to a single byte. */
 typedef uint8_t tsd_state_t;
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index fb164ee9..4bec2dea 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -76,7 +76,7 @@ typedef enum {
 static malloc_init_t malloc_init_state = malloc_init_uninitialized;
 
 /* False should be the common case. Set to true to trigger initialization. */
-static bool malloc_slow = true;
+bool malloc_slow = true;
 
 /* When malloc_slow is true, set the corresponding bits for sanity check. */
 enum {
@@ -1539,7 +1539,13 @@ imalloc_no_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd,
 
 	/* Fill in the tcache. */
 	if (dopts->tcache_ind == TCACHE_IND_AUTOMATIC) {
-		tcache = tcache_get(tsd);
+		if (likely(!sopts->slow)) {
+			/* Getting tcache ptr unconditionally. */
+			tcache = tsd_tcachep_get(tsd);
+			assert(tcache == tcache_get(tsd));
+		} else {
+			tcache = tcache_get(tsd);
+		}
 	} else if (dopts->tcache_ind == TCACHE_IND_NONE) {
 		tcache = NULL;
 	} else {
@@ -1640,13 +1646,11 @@ compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts,
 }
 
 JEMALLOC_ALWAYS_INLINE_C int
-imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts) {
+imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 	/* Where the actual allocated memory will live. */
 	void *allocation = NULL;
 	/* Filled in by compute_size_with_overflow below. */
 	size_t size = 0;
-	/* We compute a value for this right before allocating. */
-	tsd_t *tsd = NULL;
 	/*
 	 * For unaligned allocations, we need only ind. For aligned
 	 * allocations, or in case of stats or profiling we need usize.
@@ -1667,13 +1671,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts) {
 	 */
 	int8_t *reentrancy_level = NULL;
 
-	/* Initialize (if we can't prove we don't have to). */
-	if (sopts->slow) {
-		if (unlikely(malloc_init())) {
-			goto label_oom;
-		}
-	}
-
 	/* Compute the amount of memory the user wants. */
 	if (unlikely(compute_size_with_overflow(sopts->may_overflow, dopts,
 	    &size))) {
@@ -1714,11 +1711,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts) {
 		}
 	}
 
-	/*
-	 * We always need the tsd, even if we aren't going to use the tcache for
-	 * some reason. Let's grab it right away.
-	 */
-	tsd = tsd_fetch();
 
 	/*
 	 * If we need to handle reentrancy, we can do it out of a
@@ -1752,11 +1744,7 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts) {
 
 		alloc_ctx_t alloc_ctx;
 		if (likely((uintptr_t)tctx == (uintptr_t)1U)) {
-			if (usize > SMALL_MAXCLASS) {
-				alloc_ctx.slab = false;
-			} else {
-				alloc_ctx.slab = true;
-			}
+			alloc_ctx.slab = (usize <= SMALL_MAXCLASS);
 			allocation = imalloc_no_sample(
 			    sopts, dopts, tsd, usize, usize, ind);
 		} else if ((uintptr_t)tctx > (uintptr_t)1U) {
@@ -1879,12 +1867,29 @@ label_invalid_alignment:
 /* Returns the errno-style error code of the allocation. */
 JEMALLOC_ALWAYS_INLINE_C int
 imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
-	if (unlikely(malloc_slow)) {
-		sopts->slow = true;
-		return imalloc_body(sopts, dopts);
-	} else {
+	if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) {
+		if (config_xmalloc && unlikely(opt_xmalloc)) {
+			malloc_write(sopts->oom_string);
+			abort();
+		}
+		UTRACE(NULL, dopts->num_items * dopts->item_size, NULL);
+		set_errno(ENOMEM);
+		*dopts->result = NULL;
+
+		return ENOMEM;
+	}
+
+	/* We always need the tsd. Let's grab it right away. */
+	tsd_t *tsd = tsd_fetch();
+	assert(tsd);
+	if (likely(tsd_fast(tsd))) {
+		/* Fast and common path. */
+		tsd_assert_fast(tsd);
 		sopts->slow = false;
-		return imalloc_body(sopts, dopts);
+		return imalloc_body(sopts, dopts, tsd);
+	} else {
+		sopts->slow = true;
+		return imalloc_body(sopts, dopts, tsd);
 	}
 }
 /******************************************************************************/
@@ -2198,13 +2203,23 @@ je_free(void *ptr) {
 	if (*tsd_reentrancy_levelp_get(tsd) == 0) {
 		witness_assert_lockless(tsd_tsdn(tsd));
 	}
-	tcache_t *tcache = NULL;
-	if (likely(*tsd_reentrancy_levelp_get(tsd) == 0)) {
-		tcache = tcache_get(tsd);
-	}
-	if (likely(!malloc_slow)) {
+	tcache_t *tcache;
+	if (likely(tsd_fast(tsd))) {
+		tsd_assert_fast(tsd);
+		if (likely(*tsd_reentrancy_levelp_get(tsd) == 0)) {
+			/* Getting tcache ptr unconditionally. */
+			tcache = tsd_tcachep_get(tsd);
+			assert(tcache == tcache_get(tsd));
+		} else {
+			tcache = NULL;
+		}
 		ifree(tsd, ptr, tcache, false);
 	} else {
+		if (likely(*tsd_reentrancy_levelp_get(tsd) == 0)) {
+			tcache = tcache_get(tsd);
+		} else {
+			tcache = NULL;
+		}
 		ifree(tsd, ptr, tcache, true);
 	}
 	if (*tsd_reentrancy_levelp_get(tsd) == 0) {
@@ -2699,6 +2714,7 @@ je_dallocx(void *ptr, int flags) {
 	assert(malloc_initialized() || IS_INITIALIZER);
 
 	tsd = tsd_fetch();
+	bool fast = tsd_fast(tsd);
 	witness_assert_lockless(tsd_tsdn(tsd));
 	if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) {
 		/* Not allowed to be reentrant and specify a custom tcache. */
@@ -2710,14 +2726,20 @@ je_dallocx(void *ptr, int flags) {
 		}
 	} else {
 		if (likely(*tsd_reentrancy_levelp_get(tsd) == 0)) {
-			tcache = tcache_get(tsd);
+			if (likely(fast)) {
+				tcache = tsd_tcachep_get(tsd);
+				assert(tcache == tcache_get(tsd));
+			} else {
+				tcache = tcache_get(tsd);
+			}
 		} else {
 			tcache = NULL;
 		}
 	}
 
 	UTRACE(ptr, 0, 0);
-	if (likely(!malloc_slow)) {
+	if (likely(fast)) {
+		tsd_assert_fast(tsd);
 		ifree(tsd, ptr, tcache, false);
 	} else {
 		ifree(tsd, ptr, tcache, true);
@@ -2749,6 +2771,7 @@ je_sdallocx(void *ptr, size_t size, int flags) {
 	assert(ptr != NULL);
 	assert(malloc_initialized() || IS_INITIALIZER);
 	tsd = tsd_fetch();
+	bool fast = tsd_fast(tsd);
 	usize = inallocx(tsd_tsdn(tsd), size, flags);
 	assert(usize == isalloc(tsd_tsdn(tsd), ptr));
 
@@ -2763,14 +2786,20 @@ je_sdallocx(void *ptr, size_t size, int flags) {
 		}
 	} else {
 		if (likely(*tsd_reentrancy_levelp_get(tsd) == 0)) {
-			tcache = tcache_get(tsd);
+			if (likely(fast)) {
+				tcache = tsd_tcachep_get(tsd);
+				assert(tcache == tcache_get(tsd));
+			} else {
+				tcache = tcache_get(tsd);
+			}
 		} else {
 			tcache = NULL;
 		}
 	}
 
 	UTRACE(ptr, 0, 0);
-	if (likely(!malloc_slow)) {
+	if (likely(fast)) {
+		tsd_assert_fast(tsd);
 		isfree(tsd, ptr, usize, tcache, false);
 	} else {
 		isfree(tsd, ptr, usize, tcache, true);
diff --git a/src/tcache.c b/src/tcache.c
index 99749fbc..7e71bb6a 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -334,6 +334,8 @@ bool
 tsd_tcache_enabled_data_init(tsd_t *tsd) {
 	/* Called upon tsd initialization. */
 	tsd_tcache_enabled_set(tsd, opt_tcache);
+	tsd_slow_update(tsd);
+
 	if (opt_tcache) {
 		/* Trigger tcache init. */
 		tsd_tcache_data_init(tsd);
diff --git a/src/tsd.c b/src/tsd.c
index 3e72548c..bdd3f3c5 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -12,6 +12,40 @@ malloc_tsd_data(, , tsd_t, TSD_INITIALIZER)
 
 /******************************************************************************/
 
+void
+tsd_slow_update(tsd_t *tsd) {
+	if (tsd_nominal(tsd)) {
+		if (malloc_slow || !tsd->tcache_enabled) {
+			tsd->state = tsd_state_nominal_slow;
+		} else {
+			tsd->state = tsd_state_nominal;
+		}
+	}
+}
+
+tsd_t *
+tsd_fetch_slow(tsd_t *tsd) {
+	if (tsd->state == tsd_state_nominal_slow) {
+		/* On slow path but no work needed. */
+		assert(malloc_slow || !tsd_tcache_enabled_get(tsd) ||
+		    *tsd_arenas_tdata_bypassp_get(tsd));
+	} else if (tsd->state == tsd_state_uninitialized) {
+		tsd->state = tsd_state_nominal;
+		tsd_slow_update(tsd);
+		/* Trigger cleanup handler registration. */
+		tsd_set(tsd);
+		tsd_data_init(tsd);
+	} else if (tsd->state == tsd_state_purgatory) {
+		tsd->state = tsd_state_reincarnated;
+		tsd_set(tsd);
+		tsd_data_init(tsd);
+	} else {
+		assert(tsd->state == tsd_state_reincarnated);
+	}
+
+	return tsd;
+}
+
 void *
 malloc_tsd_malloc(size_t size) {
 	return a0malloc(CACHELINE_CEILING(size));
@@ -82,6 +116,7 @@ tsd_cleanup(void *arg) {
 		/* Do nothing. */
 		break;
 	case tsd_state_nominal:
+	case tsd_state_nominal_slow:
 	case tsd_state_reincarnated:
 		/*
 		 * Reincarnated means another destructor deallocated memory
@@ -129,7 +164,10 @@ malloc_tsd_boot0(void) {
 void
 malloc_tsd_boot1(void) {
 	tsd_boot1();
-	*tsd_arenas_tdata_bypassp_get(tsd_fetch()) = false;
+	tsd_t *tsd = tsd_fetch();
+	/* malloc_slow has been set properly. Update tsd_slow. */
+	tsd_slow_update(tsd);
+	*tsd_arenas_tdata_bypassp_get(tsd) = false;
 }
 
 #ifdef _WIN32
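
Editor's note (not part of the patch): the sketch below is a minimal,
self-contained C illustration of the technique this commit applies, under
simplified assumptions. tsd_slow_update() folds the global malloc_slow flag
and the per-thread tcache_enabled flag into the single tsd state byte, and
the nominal states are assigned the lowest enum values, so the hot paths
test one predictable branch instead of three. All toy_* and STATE_* names
are hypothetical stand-ins, not jemalloc APIs.

#include <stdbool.h>
#include <stdio.h>

/* Nominal states get the lowest values; one compare classifies the tsd. */
enum {
	STATE_NOMINAL = 0,      /* Common case: compare against zero. */
	STATE_NOMINAL_SLOW = 1, /* Initialized, but a slow-path check applies. */
	STATE_NOMINAL_MAX = 1,  /* Used for comparison only. */
	STATE_UNINITIALIZED = 4
};

static bool global_slow = false; /* Stands in for malloc_slow. */

struct toy_tsd {
	unsigned char state;
	bool tcache_enabled;
};

/* Fold both boolean conditions into the state byte (cf. tsd_slow_update). */
static void
toy_slow_update(struct toy_tsd *tsd) {
	if (tsd->state <= STATE_NOMINAL_MAX) {
		tsd->state = (global_slow || !tsd->tcache_enabled) ?
		    STATE_NOMINAL_SLOW : STATE_NOMINAL;
	}
}

/* Hot path: a single branch replaces three separate checks. */
static void
toy_free(struct toy_tsd *tsd) {
	if (tsd->state == STATE_NOMINAL) {
		/* Implied: initialized, !global_slow, tcache enabled. */
		puts("fast path");
	} else {
		puts("slow path");
	}
}

int
main(void) {
	struct toy_tsd tsd = { STATE_NOMINAL, true };
	toy_slow_update(&tsd);
	toy_free(&tsd); /* Fast path. */

	global_slow = true; /* E.g. junk filling or stats were enabled. */
	toy_slow_update(&tsd);
	toy_free(&tsd); /* Slow path. */
	return 0;
}

Keeping tsd_state_nominal equal to zero is what the "Common case --> jnz"
comment in tsd_types.h refers to: the compiler can test the state byte and
branch on non-zero, which stays cheap and well predicted on every hot call.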