diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index e3cfcee2..f06cb345 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -2,6 +2,7 @@
 #define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
 
 #include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/div.h"
 #include "jemalloc/internal/extent_dss.h"
 #include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/pages.h"
@@ -13,6 +14,8 @@ extern ssize_t opt_muzzy_decay_ms;
 extern percpu_arena_mode_t opt_percpu_arena;
 extern const char *percpu_arena_mode_names[];
 
+extern div_info_t arena_binind_div_info[SC_NBINS];
+
 extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
 extern malloc_mutex_t arenas_lock;
 extern emap_t arena_emap_global;
@@ -29,9 +32,6 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     bin_stats_data_t *bstats, arena_stats_large_t *lstats,
     pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
 void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena);
-#ifdef JEMALLOC_JET
-size_t arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr);
-#endif
 edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
     size_t usize, size_t alignment, bool zero);
 void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
@@ -59,8 +59,11 @@ void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
 void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
     bool slow_path);
 void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
-bool arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
-    szind_t binind, edata_t *edata, void *ptr);
+
+void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin);
+void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin);
 void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
 bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
     size_t extra, bool zero, size_t *newsize);
diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h
index aaef45c0..66dcff07 100644
--- a/include/jemalloc/internal/arena_inlines_b.h
+++ b/include/jemalloc/internal/arena_inlines_b.h
@@ -1,6 +1,7 @@
 #ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
 #define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
 
+#include "jemalloc/internal/div.h"
 #include "jemalloc/internal/emap.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
 #include "jemalloc/internal/mutex.h"
@@ -441,4 +442,96 @@ arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
 	}
 }
 
+/*
+ * The dalloc bin info contains just the information that the common paths
+ * need during tcache flushes.  By force-inlining these paths, and using
+ * local copies of data (so that the compiler knows it's constant), we avoid
+ * a whole bunch of redundant loads and stores by leaving this information
+ * in registers.
+ */
+typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t;
+struct arena_dalloc_bin_locked_info_s {
+	div_info_t div_info;
+	uint32_t nregs;
+	uint64_t ndalloc;
+};
+
+JEMALLOC_ALWAYS_INLINE size_t
+arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
+    edata_t *slab, const void *ptr) {
+	size_t diff, regind;
+
+	/* Freeing a pointer outside the slab can cause assertion failure. */
+	assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
+	assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
+	/* Freeing an interior pointer can cause assertion failure. */
+	assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
+	    (uintptr_t)bin_infos[binind].reg_size == 0);
+
+	diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
+
+	/* Avoid doing division with a variable divisor. */
+	regind = div_compute(&info->div_info, diff);
+
+	assert(regind < bin_infos[binind].nregs);
+
+	return regind;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
+    szind_t binind) {
+	info->div_info = arena_binind_div_info[binind];
+	info->nregs = bin_infos[binind].nregs;
+	info->ndalloc = 0;
+}
+
+/*
+ * Does the deallocation work associated with freeing a single pointer (a
+ * "step") in between an arena_dalloc_bin_locked begin and end call.
+ *
+ * Returns true if arena_slab_dalloc must be called on slab.  Doesn't do
+ * stats updates, which happen during finish (this lets running counts get
+ * left in a register).
+ */
+JEMALLOC_ALWAYS_INLINE bool
+arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
+    void *ptr) {
+	const bin_info_t *bin_info = &bin_infos[binind];
+	size_t regind = arena_slab_regind(info, binind, slab, ptr);
+	slab_data_t *slab_data = edata_slab_data_get(slab);
+
+	assert(edata_nfree_get(slab) < bin_info->nregs);
+	/* Freeing an unallocated pointer can cause assertion failure. */
+	assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
+
+	bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
+	edata_nfree_inc(slab);
+
+	if (config_stats) {
+		info->ndalloc++;
+	}
+
+	unsigned nfree = edata_nfree_get(slab);
+	if (nfree == bin_info->nregs) {
+		arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
+		    bin);
+		return true;
+	} else if (nfree == 1 && slab != bin->slabcur) {
+		arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena,
+		    slab, bin);
+	}
+	return false;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    arena_dalloc_bin_locked_info_t *info) {
+	if (config_stats) {
+		bin->stats.ndalloc += info->ndalloc;
+		assert(bin->stats.curregs >= (size_t)info->ndalloc);
+		bin->stats.curregs -= (size_t)info->ndalloc;
+	}
+}
+
 #endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */
diff --git a/src/arena.c b/src/arena.c
index 914e63f1..56c34af5 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -3,7 +3,6 @@
 
 #include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/decay.h"
-#include "jemalloc/internal/div.h"
 #include "jemalloc/internal/ehooks.h"
 #include "jemalloc/internal/extent_dss.h"
 #include "jemalloc/internal/extent_mmap.h"
@@ -45,7 +44,7 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
 #undef STEP
 };
 
-static div_info_t arena_binind_div_info[SC_NBINS];
+div_info_t arena_binind_div_info[SC_NBINS];
 
 size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
 size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
@@ -260,44 +259,6 @@ arena_slab_reg_alloc_batch(edata_t *slab, const bin_info_t *bin_info,
 	edata_nfree_sub(slab, cnt);
 }
 
-#ifndef JEMALLOC_JET
-static
-#endif
-size_t
-arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr) {
-	size_t diff, regind;
-
-	/* Freeing a pointer outside the slab can cause assertion failure. */
-	assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
-	assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
-	/* Freeing an interior pointer can cause assertion failure. */
-	assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
-	    (uintptr_t)bin_infos[binind].reg_size == 0);
-
-	diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
-
-	/* Avoid doing division with a variable divisor. */
-	regind = div_compute(&arena_binind_div_info[binind], diff);
-
-	assert(regind < bin_infos[binind].nregs);
-
-	return regind;
-}
-
-static void
-arena_slab_reg_dalloc(edata_t *slab, slab_data_t *slab_data, void *ptr) {
-	szind_t binind = edata_szind_get(slab);
-	const bin_info_t *bin_info = &bin_infos[binind];
-	size_t regind = arena_slab_regind(slab, binind, ptr);
-
-	assert(edata_nfree_get(slab) < bin_info->nregs);
-	/* Freeing an unallocated pointer can cause assertion failure. */
-	assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
-
-	bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
-	edata_nfree_inc(slab);
-}
-
 static void
 arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
 	szind_t index, hindex;
@@ -1189,37 +1150,18 @@ arena_dalloc_bin_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin) {
 	}
 }
 
-/* Returns true if arena_slab_dalloc must be called on slab */
-static bool
-arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
-    szind_t binind, edata_t *slab, void *ptr) {
-	const bin_info_t *bin_info = &bin_infos[binind];
-	arena_slab_reg_dalloc(slab, edata_slab_data_get(slab), ptr);
-
-	bool ret = false;
-	unsigned nfree = edata_nfree_get(slab);
-	if (nfree == bin_info->nregs) {
-		arena_dissociate_bin_slab(arena, slab, bin);
-		arena_dalloc_bin_slab_prepare(tsdn, slab, bin);
-		ret = true;
-	} else if (nfree == 1 && slab != bin->slabcur) {
-		arena_bin_slabs_full_remove(arena, bin, slab);
-		arena_bin_lower_slab(tsdn, arena, slab, bin);
-	}
-
-	if (config_stats) {
-		bin->stats.ndalloc++;
-		bin->stats.curregs--;
-	}
-
-	return ret;
+void
+arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin) {
+	arena_dissociate_bin_slab(arena, slab, bin);
+	arena_dalloc_bin_slab_prepare(tsdn, slab, bin);
 }
 
-bool
-arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
-    szind_t binind, edata_t *edata, void *ptr) {
-	return arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, edata,
-	    ptr);
+void
+arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin) {
+	arena_bin_slabs_full_remove(arena, bin, slab);
+	arena_bin_lower_slab(tsdn, arena, slab, bin);
 }
 
 static void
@@ -1229,8 +1171,11 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
 	bin_t *bin = &arena->bins[binind].bin_shards[binshard];
 
 	malloc_mutex_lock(tsdn, &bin->lock);
-	bool ret = arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, edata,
-	    ptr);
+	arena_dalloc_bin_locked_info_t info;
+	arena_dalloc_bin_locked_begin(&info, binind);
+	bool ret = arena_dalloc_bin_locked_step(tsdn, arena, bin,
+	    &info, binind, edata, ptr);
+	arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
 	malloc_mutex_unlock(tsdn, &bin->lock);
 
 	if (ret) {
diff --git a/src/tcache.c b/src/tcache.c
index 3daf4263..c7bdbf99 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -399,6 +399,10 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 
 		/* Deallocate whatever we can. */
 		unsigned ndeferred = 0;
+		arena_dalloc_bin_locked_info_t dalloc_bin_info;
+		if (small) {
+			arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
+		}
 		for (unsigned i = 0; i < nflush; i++) {
 			void *ptr = cache_bin_ptr_array_get(&ptrs, i);
 			edata = item_edata[i].edata;
@@ -417,8 +421,9 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 				continue;
 			}
 			if (small) {
-				if (arena_dalloc_bin_locked(tsdn, cur_arena,
-				    cur_bin, binind, edata, ptr)) {
+				if (arena_dalloc_bin_locked_step(tsdn,
+				    cur_arena, cur_bin, &dalloc_bin_info,
+				    binind, edata, ptr)) {
 					dalloc_slabs[dalloc_count] = edata;
 					dalloc_count++;
 				}
@@ -432,6 +437,8 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		}
 
 		if (small) {
+			arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
+			    &dalloc_bin_info);
 			malloc_mutex_unlock(tsdn, &cur_bin->lock);
 		}
 		arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
diff --git a/test/unit/slab.c b/test/unit/slab.c
index 6baa9d3a..70fc5c7d 100644
--- a/test/unit/slab.c
+++ b/test/unit/slab.c
@@ -16,10 +16,13 @@ TEST_BEGIN(test_arena_slab_regind) {
 		    EXTENT_NOT_HEAD);
 		expect_ptr_not_null(edata_addr_get(&slab),
 		    "Unexpected malloc() failure");
+		arena_dalloc_bin_locked_info_t dalloc_info;
+		arena_dalloc_bin_locked_begin(&dalloc_info, binind);
 		for (regind = 0; regind < bin_info->nregs; regind++) {
 			void *reg = (void *)((uintptr_t)edata_addr_get(&slab) +
 			    (bin_info->reg_size * regind));
-			expect_zu_eq(arena_slab_regind(&slab, binind, reg),
+			expect_zu_eq(arena_slab_regind(&dalloc_info, binind,
+			    &slab, reg),
 			    regind,
 			    "Incorrect region index computed for size %zu",
 			    bin_info->reg_size);
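---

Reviewer note (not part of the patch): the old single arena_dalloc_bin_locked()
call becomes a begin/step/finish sequence so that the per-bin constants (the
division magic and nregs) and the running dalloc count stay in registers across
a whole flush, with the stats written back once per batch under the bin lock.
A minimal sketch of the intended call pattern follows; the names slabs, ptrs,
n, and dalloc_slabs are hypothetical locals, and the real loop is the one in
tcache_bin_flush_impl above.

/*
 * Sketch only -- uses jemalloc-internal types, so it does not compile
 * outside the tree.
 */
static void
flush_small_bin_sketch(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    szind_t binind, edata_t **slabs, void **ptrs, unsigned n,
    edata_t **dalloc_slabs) {
	arena_dalloc_bin_locked_info_t info;
	unsigned ndalloc_slabs = 0;

	malloc_mutex_lock(tsdn, &bin->lock);
	/* Pull div_info/nregs into locals; zero the batched dalloc count. */
	arena_dalloc_bin_locked_begin(&info, binind);
	for (unsigned i = 0; i < n; i++) {
		/*
		 * A true return means slabs[i] became empty and must go to
		 * arena_slab_dalloc() once the bin lock is dropped.
		 */
		if (arena_dalloc_bin_locked_step(tsdn, arena, bin, &info,
		    binind, slabs[i], ptrs[i])) {
			dalloc_slabs[ndalloc_slabs++] = slabs[i];
		}
	}
	/* Publish the batched ndalloc/curregs stats updates in one shot. */
	arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
	malloc_mutex_unlock(tsdn, &bin->lock);

	for (unsigned i = 0; i < ndalloc_slabs; i++) {
		arena_slab_dalloc(tsdn, arena, dalloc_slabs[i]);
	}
}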
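Reviewer note (not part of the patch): the hot-path win in arena_slab_regind()
comes from div_compute() turning the divide by reg_size into a multiply by a
precomputed reciprocal ("Avoid doing division with a variable divisor").
div.h itself is outside this diff; the standalone sketch below shows the
underlying trick under the same preconditions the asserts establish (the
dividend is an exact multiple of the divisor and fits in 32 bits). The
sketch_* names are illustrative, not jemalloc's API, whose details may differ.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Divide-by-invariant-constant via multiply.  With magic = ceil(2^32 / d)
 * and n = m * d < 2^32, the product n * magic equals m * 2^32 plus a
 * remainder term smaller than 2^32, so the high 32 bits are exactly m.
 */
typedef struct {
	uint32_t magic;
} sketch_div_info_t;

static void
sketch_div_init(sketch_div_info_t *info, size_t d) {
	assert(d > 1);
	uint64_t two_to_32 = (uint64_t)1 << 32;
	/* magic = ceil(2^32 / d); round up unless d divides 2^32 exactly. */
	info->magic = (uint32_t)(two_to_32 / d) + (two_to_32 % d != 0);
}

static size_t
sketch_div_compute(const sketch_div_info_t *info, size_t n) {
	assert(n <= UINT32_MAX);
	/* One multiply and a shift; no hardware divide on the hot path. */
	return (size_t)(((uint64_t)n * info->magic) >> 32);
}

int
main(void) {
	/* E.g. a 48-byte bin: the offset of region 17 maps back to 17. */
	sketch_div_info_t info;
	sketch_div_init(&info, 48);
	assert(sketch_div_compute(&info, 48 * 17) == 17);
	return 0;
}

The exactness argument: write 2^32 = q * d + r with 0 <= r < d; then for
n = m * d, n * magic = m * 2^32 + m * (d - r), and m * (d - r) <= n < 2^32,
so the shift recovers m with no correction step needed.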