Tcache flush: keep common path state in registers.

By carefully force-inlining the division constants and the running count of
deallocations, we can eliminate redundant loads and stores in the arena-level
dalloc function.  Do so.
This commit is contained in:
David Goldblatt 2021-01-29 16:06:28 -08:00 committed by David Goldblatt
parent 31a629c3de
commit 229994a204
5 changed files with 130 additions and 79 deletions

View File

@ -2,6 +2,7 @@
#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H #define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#include "jemalloc/internal/bin.h" #include "jemalloc/internal/bin.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/hook.h" #include "jemalloc/internal/hook.h"
#include "jemalloc/internal/pages.h" #include "jemalloc/internal/pages.h"
@ -13,6 +14,8 @@ extern ssize_t opt_muzzy_decay_ms;
extern percpu_arena_mode_t opt_percpu_arena; extern percpu_arena_mode_t opt_percpu_arena;
extern const char *percpu_arena_mode_names[]; extern const char *percpu_arena_mode_names[];
extern div_info_t arena_binind_div_info[SC_NBINS];
extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS]; extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
extern malloc_mutex_t arenas_lock; extern malloc_mutex_t arenas_lock;
extern emap_t arena_emap_global; extern emap_t arena_emap_global;
@ -29,9 +32,6 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
bin_stats_data_t *bstats, arena_stats_large_t *lstats, bin_stats_data_t *bstats, arena_stats_large_t *lstats,
pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats); pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena); void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena);
#ifdef JEMALLOC_JET
size_t arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr);
#endif
edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
size_t usize, size_t alignment, bool zero); size_t usize, size_t alignment, bool zero);
void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena, void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
@ -59,8 +59,11 @@ void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache, void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
bool slow_path); bool slow_path);
void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab); void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
bool arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
szind_t binind, edata_t *edata, void *ptr); void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
edata_t *slab, bin_t *bin);
void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
edata_t *slab, bin_t *bin);
void arena_dalloc_small(tsdn_t *tsdn, void *ptr); void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero, size_t *newsize); size_t extra, bool zero, size_t *newsize);

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H #ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H #define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/emap.h" #include "jemalloc/internal/emap.h"
#include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mutex.h"
@ -441,4 +442,96 @@ arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
} }
} }
/*
 * The dalloc bin info contains just the information that the common paths need
 * during tcache flushes.  By force-inlining these paths, and using local copies
 * of data (so that the compiler knows it's constant), we avoid a whole bunch of
 * redundant loads and stores by leaving this information in registers.
 *
 * An instance is filled in by arena_dalloc_bin_locked_begin, passed to each
 * arena_dalloc_bin_locked_step, and consumed by
 * arena_dalloc_bin_locked_finish.
 */
typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t;
struct arena_dalloc_bin_locked_info_s {
/* Copy of arena_binind_div_info[binind]; fed to div_compute. */
div_info_t div_info;
/* Copy of bin_infos[binind].nregs. */
uint32_t nregs;
/*
 * Zeroed at begin, incremented once per step when config_stats is set, and
 * folded into bin->stats at finish.
 */
uint64_t ndalloc;
};
/*
 * Computes the index, within slab, of the region that ptr points to.  The
 * byte offset of ptr from the slab's base address is divided using the
 * precomputed divisor stored in info->div_info; binind is only consulted by
 * the sanity assertions (via bin_infos[binind]).
 */
JEMALLOC_ALWAYS_INLINE size_t
arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
    edata_t *slab, const void *ptr) {
	uintptr_t slab_base = (uintptr_t)edata_addr_get(slab);
	uintptr_t addr = (uintptr_t)ptr;

	/* Freeing a pointer outside the slab can cause assertion failure. */
	assert(addr >= slab_base);
	assert(addr < (uintptr_t)edata_past_get(slab));
	/* Freeing an interior pointer can cause assertion failure. */
	assert((addr - slab_base) %
	    (uintptr_t)bin_infos[binind].reg_size == 0);

	size_t offset = (size_t)(addr - slab_base);

	/* Avoid doing division with a variable divisor. */
	size_t index = div_compute(&info->div_info, offset);

	assert(index < bin_infos[binind].nregs);

	return index;
}
/*
 * Initializes info for a sequence of arena_dalloc_bin_locked_step calls on
 * bin index binind: snapshots that bin's division constants and region count
 * into info and resets the running deallocation count to zero.
 */
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
    szind_t binind) {
	const bin_info_t *bin_info = &bin_infos[binind];

	info->ndalloc = 0;
	info->nregs = bin_info->nregs;
	info->div_info = arena_binind_div_info[binind];
}
/*
 * Does the deallocation work associated with freeing a single pointer (a
 * "step") in between an arena_dalloc_bin_locked_begin call and an
 * arena_dalloc_bin_locked_finish call.  info must have been initialized by
 * arena_dalloc_bin_locked_begin with the same binind that is passed here.
 *
 * Returns true if arena_slab_dalloc must be called on slab.  Doesn't do
 * stats updates, which happen during finish (this lets running counts get left
 * in a register).
 */
JEMALLOC_ALWAYS_INLINE bool
arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
    void *ptr) {
	const bin_info_t *bin_info = &bin_infos[binind];
	size_t regind = arena_slab_regind(info, binind, slab, ptr);
	slab_data_t *slab_data = edata_slab_data_get(slab);

	/* The cached copy must describe the bin we are operating on. */
	assert(info->nregs == bin_info->nregs);
	assert(edata_nfree_get(slab) < info->nregs);
	/* Freeing an unallocated pointer can cause assertion failure. */
	assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));

	bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
	edata_nfree_inc(slab);

	/* Only the local counter is touched here; bin->stats waits for finish. */
	if (config_stats) {
		info->ndalloc++;
	}

	/*
	 * Compare against the copy cached in info rather than re-reading
	 * bin_info->nregs from the global table on every step; that copy is
	 * what begin stored for this purpose.
	 */
	unsigned nfree = edata_nfree_get(slab);
	if (nfree == info->nregs) {
		/* Every region in the slab is now free. */
		arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
		    bin);
		return true;
	} else if (nfree == 1 && slab != bin->slabcur) {
		/* The slab just went from zero free regions to one. */
		arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab,
		    bin);
	}
	return false;
}
/*
 * Completes a begin/step sequence.  When config_stats is set, applies the
 * deallocation count accumulated in info to bin->stats (ndalloc goes up,
 * curregs goes down by the same amount); otherwise does nothing.
 */
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    arena_dalloc_bin_locked_info_t *info) {
	if (!config_stats) {
		return;
	}

	size_t nfreed = (size_t)info->ndalloc;

	bin->stats.ndalloc += info->ndalloc;
	/* More regions cannot be freed than were live in the bin. */
	assert(bin->stats.curregs >= nfreed);
	bin->stats.curregs -= nfreed;
}
#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */ #endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */

View File

@ -3,7 +3,6 @@
#include "jemalloc/internal/assert.h" #include "jemalloc/internal/assert.h"
#include "jemalloc/internal/decay.h" #include "jemalloc/internal/decay.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/ehooks.h" #include "jemalloc/internal/ehooks.h"
#include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/extent_mmap.h"
@ -45,7 +44,7 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
#undef STEP #undef STEP
}; };
static div_info_t arena_binind_div_info[SC_NBINS]; div_info_t arena_binind_div_info[SC_NBINS];
size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
@ -260,44 +259,6 @@ arena_slab_reg_alloc_batch(edata_t *slab, const bin_info_t *bin_info,
edata_nfree_sub(slab, cnt); edata_nfree_sub(slab, cnt);
} }
/*
 * Computes the index, within slab, of the region that ptr points to.  binind
 * selects the bin whose reg_size and nregs (from bin_infos) are used in the
 * assertions and whose precomputed divisor (from arena_binind_div_info) is
 * used for the division.  The function is static unless JEMALLOC_JET is
 * defined.
 */
#ifndef JEMALLOC_JET
static
#endif
size_t
arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr) {
size_t diff, regind;
/* Freeing a pointer outside the slab can cause assertion failure. */
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
(uintptr_t)bin_infos[binind].reg_size == 0);
/* Byte offset of ptr from the start of the slab. */
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
/* Avoid doing division with a variable divisor. */
regind = div_compute(&arena_binind_div_info[binind], diff);
assert(regind < bin_infos[binind].nregs);
return regind;
}
/*
 * Releases the region at ptr back to slab: looks up the region's index via
 * arena_slab_regind, unsets that index in slab_data->bitmap, and bumps the
 * slab's nfree count via edata_nfree_inc.  The bin index is read from the
 * slab itself (edata_szind_get).
 */
static void
arena_slab_reg_dalloc(edata_t *slab, slab_data_t *slab_data, void *ptr) {
szind_t binind = edata_szind_get(slab);
const bin_info_t *bin_info = &bin_infos[binind];
size_t regind = arena_slab_regind(slab, binind, ptr);
/* At least one region must currently be allocated. */
assert(edata_nfree_get(slab) < bin_info->nregs);
/* Freeing an unallocated pointer can cause assertion failure. */
assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
edata_nfree_inc(slab);
}
static void static void
arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
szind_t index, hindex; szind_t index, hindex;
@ -1189,39 +1150,20 @@ arena_dalloc_bin_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin) {
} }
} }
/* Returns true if arena_slab_dalloc must be called on slab */ void
static bool arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin, edata_t *slab, bin_t *bin) {
szind_t binind, edata_t *slab, void *ptr) {
const bin_info_t *bin_info = &bin_infos[binind];
arena_slab_reg_dalloc(slab, edata_slab_data_get(slab), ptr);
bool ret = false;
unsigned nfree = edata_nfree_get(slab);
if (nfree == bin_info->nregs) {
arena_dissociate_bin_slab(arena, slab, bin); arena_dissociate_bin_slab(arena, slab, bin);
arena_dalloc_bin_slab_prepare(tsdn, slab, bin); arena_dalloc_bin_slab_prepare(tsdn, slab, bin);
ret = true; }
} else if (nfree == 1 && slab != bin->slabcur) {
void
arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
edata_t *slab, bin_t *bin) {
arena_bin_slabs_full_remove(arena, bin, slab); arena_bin_slabs_full_remove(arena, bin, slab);
arena_bin_lower_slab(tsdn, arena, slab, bin); arena_bin_lower_slab(tsdn, arena, slab, bin);
} }
if (config_stats) {
bin->stats.ndalloc++;
bin->stats.curregs--;
}
return ret;
}
bool
arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
szind_t binind, edata_t *edata, void *ptr) {
return arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, edata,
ptr);
}
static void static void
arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) { arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
szind_t binind = edata_szind_get(edata); szind_t binind = edata_szind_get(edata);
@ -1229,8 +1171,11 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
bin_t *bin = &arena->bins[binind].bin_shards[binshard]; bin_t *bin = &arena->bins[binind].bin_shards[binshard];
malloc_mutex_lock(tsdn, &bin->lock); malloc_mutex_lock(tsdn, &bin->lock);
bool ret = arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, edata, arena_dalloc_bin_locked_info_t info;
ptr); arena_dalloc_bin_locked_begin(&info, binind);
bool ret = arena_dalloc_bin_locked_step(tsdn, arena, bin,
&info, binind, edata, ptr);
arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
malloc_mutex_unlock(tsdn, &bin->lock); malloc_mutex_unlock(tsdn, &bin->lock);
if (ret) { if (ret) {

View File

@ -399,6 +399,10 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
/* Deallocate whatever we can. */ /* Deallocate whatever we can. */
unsigned ndeferred = 0; unsigned ndeferred = 0;
arena_dalloc_bin_locked_info_t dalloc_bin_info;
if (small) {
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
}
for (unsigned i = 0; i < nflush; i++) { for (unsigned i = 0; i < nflush; i++) {
void *ptr = cache_bin_ptr_array_get(&ptrs, i); void *ptr = cache_bin_ptr_array_get(&ptrs, i);
edata = item_edata[i].edata; edata = item_edata[i].edata;
@ -417,8 +421,9 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
continue; continue;
} }
if (small) { if (small) {
if (arena_dalloc_bin_locked(tsdn, cur_arena, if (arena_dalloc_bin_locked_step(tsdn,
cur_bin, binind, edata, ptr)) { cur_arena, cur_bin, &dalloc_bin_info,
binind, edata, ptr)) {
dalloc_slabs[dalloc_count] = edata; dalloc_slabs[dalloc_count] = edata;
dalloc_count++; dalloc_count++;
} }
@ -432,6 +437,8 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
} }
if (small) { if (small) {
arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
&dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock); malloc_mutex_unlock(tsdn, &cur_bin->lock);
} }
arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred); arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);

View File

@ -16,10 +16,13 @@ TEST_BEGIN(test_arena_slab_regind) {
EXTENT_NOT_HEAD); EXTENT_NOT_HEAD);
expect_ptr_not_null(edata_addr_get(&slab), expect_ptr_not_null(edata_addr_get(&slab),
"Unexpected malloc() failure"); "Unexpected malloc() failure");
arena_dalloc_bin_locked_info_t dalloc_info;
arena_dalloc_bin_locked_begin(&dalloc_info, binind);
for (regind = 0; regind < bin_info->nregs; regind++) { for (regind = 0; regind < bin_info->nregs; regind++) {
void *reg = (void *)((uintptr_t)edata_addr_get(&slab) + void *reg = (void *)((uintptr_t)edata_addr_get(&slab) +
(bin_info->reg_size * regind)); (bin_info->reg_size * regind));
expect_zu_eq(arena_slab_regind(&slab, binind, reg), expect_zu_eq(arena_slab_regind(&dalloc_info, binind,
&slab, reg),
regind, regind,
"Incorrect region index computed for size %zu", "Incorrect region index computed for size %zu",
bin_info->reg_size); bin_info->reg_size);