Tcache flush: keep common path state in registers.
By carefully force-inlining the division constants and the operation sum count, we can eliminate redundant operations in the arena-level dalloc function. Do so.
This commit is contained in:
parent
31a629c3de
commit
229994a204
@ -2,6 +2,7 @@
|
||||
#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
|
||||
|
||||
#include "jemalloc/internal/bin.h"
|
||||
#include "jemalloc/internal/div.h"
|
||||
#include "jemalloc/internal/extent_dss.h"
|
||||
#include "jemalloc/internal/hook.h"
|
||||
#include "jemalloc/internal/pages.h"
|
||||
@ -13,6 +14,8 @@ extern ssize_t opt_muzzy_decay_ms;
|
||||
extern percpu_arena_mode_t opt_percpu_arena;
|
||||
extern const char *percpu_arena_mode_names[];
|
||||
|
||||
extern div_info_t arena_binind_div_info[SC_NBINS];
|
||||
|
||||
extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
|
||||
extern malloc_mutex_t arenas_lock;
|
||||
extern emap_t arena_emap_global;
|
||||
@ -29,9 +32,6 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
|
||||
bin_stats_data_t *bstats, arena_stats_large_t *lstats,
|
||||
pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
|
||||
void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena);
|
||||
#ifdef JEMALLOC_JET
|
||||
size_t arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr);
|
||||
#endif
|
||||
edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
|
||||
size_t usize, size_t alignment, bool zero);
|
||||
void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
|
||||
@ -59,8 +59,11 @@ void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
|
||||
void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
|
||||
bool slow_path);
|
||||
void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
|
||||
bool arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
|
||||
szind_t binind, edata_t *edata, void *ptr);
|
||||
|
||||
void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
|
||||
edata_t *slab, bin_t *bin);
|
||||
void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
|
||||
edata_t *slab, bin_t *bin);
|
||||
void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
|
||||
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
|
||||
size_t extra, bool zero, size_t *newsize);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
|
||||
#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
|
||||
|
||||
#include "jemalloc/internal/div.h"
|
||||
#include "jemalloc/internal/emap.h"
|
||||
#include "jemalloc/internal/jemalloc_internal_types.h"
|
||||
#include "jemalloc/internal/mutex.h"
|
||||
@ -441,4 +442,96 @@ arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The dalloc bin info contains just the information that the common paths need
|
||||
* during tcache flushes. By force-inlining these paths, and using local copies
|
||||
* of data (so that the compiler knows it's constant), we avoid a whole bunch of
|
||||
* redundant loads and stores by leaving this information in registers.
|
||||
*/
|
||||
typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t;
|
||||
struct arena_dalloc_bin_locked_info_s {
|
||||
div_info_t div_info;
|
||||
uint32_t nregs;
|
||||
uint64_t ndalloc;
|
||||
};
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE size_t
|
||||
arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
|
||||
edata_t *slab, const void *ptr) {
|
||||
size_t diff, regind;
|
||||
|
||||
/* Freeing a pointer outside the slab can cause assertion failure. */
|
||||
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
|
||||
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
|
||||
/* Freeing an interior pointer can cause assertion failure. */
|
||||
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
|
||||
(uintptr_t)bin_infos[binind].reg_size == 0);
|
||||
|
||||
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
|
||||
|
||||
/* Avoid doing division with a variable divisor. */
|
||||
regind = div_compute(&info->div_info, diff);
|
||||
|
||||
assert(regind < bin_infos[binind].nregs);
|
||||
|
||||
return regind;
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
|
||||
szind_t binind) {
|
||||
info->div_info = arena_binind_div_info[binind];
|
||||
info->nregs = bin_infos[binind].nregs;
|
||||
info->ndalloc = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Does the deallocation work associated with freeing a single pointer (a
|
||||
* "step") in between a arena_dalloc_bin_locked begin and end call.
|
||||
*
|
||||
* Returns true if arena_slab_dalloc must be called on slab. Doesn't do
|
||||
* stats updates, which happen during finish (this lets running counts get left
|
||||
* in a register).
|
||||
*/
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
|
||||
arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
|
||||
void *ptr) {
|
||||
const bin_info_t *bin_info = &bin_infos[binind];
|
||||
size_t regind = arena_slab_regind(info, binind, slab, ptr);
|
||||
slab_data_t *slab_data = edata_slab_data_get(slab);
|
||||
|
||||
assert(edata_nfree_get(slab) < bin_info->nregs);
|
||||
/* Freeing an unallocated pointer can cause assertion failure. */
|
||||
assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
|
||||
|
||||
bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
|
||||
edata_nfree_inc(slab);
|
||||
|
||||
if (config_stats) {
|
||||
info->ndalloc++;
|
||||
}
|
||||
|
||||
unsigned nfree = edata_nfree_get(slab);
|
||||
if (nfree == bin_info->nregs) {
|
||||
arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
|
||||
bin);
|
||||
return true;
|
||||
} else if (nfree == 1 && slab != bin->slabcur) {
|
||||
arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab,
|
||||
bin);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
|
||||
arena_dalloc_bin_locked_info_t *info) {
|
||||
if (config_stats) {
|
||||
bin->stats.ndalloc += info->ndalloc;
|
||||
assert(bin->stats.curregs >= (size_t)info->ndalloc);
|
||||
bin->stats.curregs -= (size_t)info->ndalloc;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */
|
||||
|
83
src/arena.c
83
src/arena.c
@ -3,7 +3,6 @@
|
||||
|
||||
#include "jemalloc/internal/assert.h"
|
||||
#include "jemalloc/internal/decay.h"
|
||||
#include "jemalloc/internal/div.h"
|
||||
#include "jemalloc/internal/ehooks.h"
|
||||
#include "jemalloc/internal/extent_dss.h"
|
||||
#include "jemalloc/internal/extent_mmap.h"
|
||||
@ -45,7 +44,7 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
|
||||
#undef STEP
|
||||
};
|
||||
|
||||
static div_info_t arena_binind_div_info[SC_NBINS];
|
||||
div_info_t arena_binind_div_info[SC_NBINS];
|
||||
|
||||
size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
|
||||
size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
|
||||
@ -260,44 +259,6 @@ arena_slab_reg_alloc_batch(edata_t *slab, const bin_info_t *bin_info,
|
||||
edata_nfree_sub(slab, cnt);
|
||||
}
|
||||
|
||||
#ifndef JEMALLOC_JET
|
||||
static
|
||||
#endif
|
||||
size_t
|
||||
arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr) {
|
||||
size_t diff, regind;
|
||||
|
||||
/* Freeing a pointer outside the slab can cause assertion failure. */
|
||||
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
|
||||
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
|
||||
/* Freeing an interior pointer can cause assertion failure. */
|
||||
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
|
||||
(uintptr_t)bin_infos[binind].reg_size == 0);
|
||||
|
||||
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
|
||||
|
||||
/* Avoid doing division with a variable divisor. */
|
||||
regind = div_compute(&arena_binind_div_info[binind], diff);
|
||||
|
||||
assert(regind < bin_infos[binind].nregs);
|
||||
|
||||
return regind;
|
||||
}
|
||||
|
||||
static void
|
||||
arena_slab_reg_dalloc(edata_t *slab, slab_data_t *slab_data, void *ptr) {
|
||||
szind_t binind = edata_szind_get(slab);
|
||||
const bin_info_t *bin_info = &bin_infos[binind];
|
||||
size_t regind = arena_slab_regind(slab, binind, ptr);
|
||||
|
||||
assert(edata_nfree_get(slab) < bin_info->nregs);
|
||||
/* Freeing an unallocated pointer can cause assertion failure. */
|
||||
assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
|
||||
|
||||
bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
|
||||
edata_nfree_inc(slab);
|
||||
}
|
||||
|
||||
static void
|
||||
arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
|
||||
szind_t index, hindex;
|
||||
@ -1189,37 +1150,18 @@ arena_dalloc_bin_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns true if arena_slab_dalloc must be called on slab */
|
||||
static bool
|
||||
arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
|
||||
szind_t binind, edata_t *slab, void *ptr) {
|
||||
const bin_info_t *bin_info = &bin_infos[binind];
|
||||
arena_slab_reg_dalloc(slab, edata_slab_data_get(slab), ptr);
|
||||
|
||||
bool ret = false;
|
||||
unsigned nfree = edata_nfree_get(slab);
|
||||
if (nfree == bin_info->nregs) {
|
||||
void
|
||||
arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
|
||||
edata_t *slab, bin_t *bin) {
|
||||
arena_dissociate_bin_slab(arena, slab, bin);
|
||||
arena_dalloc_bin_slab_prepare(tsdn, slab, bin);
|
||||
ret = true;
|
||||
} else if (nfree == 1 && slab != bin->slabcur) {
|
||||
arena_bin_slabs_full_remove(arena, bin, slab);
|
||||
arena_bin_lower_slab(tsdn, arena, slab, bin);
|
||||
}
|
||||
|
||||
if (config_stats) {
|
||||
bin->stats.ndalloc++;
|
||||
bin->stats.curregs--;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool
|
||||
arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
|
||||
szind_t binind, edata_t *edata, void *ptr) {
|
||||
return arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, edata,
|
||||
ptr);
|
||||
void
|
||||
arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
|
||||
edata_t *slab, bin_t *bin) {
|
||||
arena_bin_slabs_full_remove(arena, bin, slab);
|
||||
arena_bin_lower_slab(tsdn, arena, slab, bin);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -1229,8 +1171,11 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
|
||||
bin_t *bin = &arena->bins[binind].bin_shards[binshard];
|
||||
|
||||
malloc_mutex_lock(tsdn, &bin->lock);
|
||||
bool ret = arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, edata,
|
||||
ptr);
|
||||
arena_dalloc_bin_locked_info_t info;
|
||||
arena_dalloc_bin_locked_begin(&info, binind);
|
||||
bool ret = arena_dalloc_bin_locked_step(tsdn, arena, bin,
|
||||
&info, binind, edata, ptr);
|
||||
arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
|
||||
malloc_mutex_unlock(tsdn, &bin->lock);
|
||||
|
||||
if (ret) {
|
||||
|
11
src/tcache.c
11
src/tcache.c
@ -399,6 +399,10 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
|
||||
/* Deallocate whatever we can. */
|
||||
unsigned ndeferred = 0;
|
||||
arena_dalloc_bin_locked_info_t dalloc_bin_info;
|
||||
if (small) {
|
||||
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
|
||||
}
|
||||
for (unsigned i = 0; i < nflush; i++) {
|
||||
void *ptr = cache_bin_ptr_array_get(&ptrs, i);
|
||||
edata = item_edata[i].edata;
|
||||
@ -417,8 +421,9 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
continue;
|
||||
}
|
||||
if (small) {
|
||||
if (arena_dalloc_bin_locked(tsdn, cur_arena,
|
||||
cur_bin, binind, edata, ptr)) {
|
||||
if (arena_dalloc_bin_locked_step(tsdn,
|
||||
cur_arena, cur_bin, &dalloc_bin_info,
|
||||
binind, edata, ptr)) {
|
||||
dalloc_slabs[dalloc_count] = edata;
|
||||
dalloc_count++;
|
||||
}
|
||||
@ -432,6 +437,8 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
}
|
||||
|
||||
if (small) {
|
||||
arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
|
||||
&dalloc_bin_info);
|
||||
malloc_mutex_unlock(tsdn, &cur_bin->lock);
|
||||
}
|
||||
arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
|
||||
|
@ -16,10 +16,13 @@ TEST_BEGIN(test_arena_slab_regind) {
|
||||
EXTENT_NOT_HEAD);
|
||||
expect_ptr_not_null(edata_addr_get(&slab),
|
||||
"Unexpected malloc() failure");
|
||||
arena_dalloc_bin_locked_info_t dalloc_info;
|
||||
arena_dalloc_bin_locked_begin(&dalloc_info, binind);
|
||||
for (regind = 0; regind < bin_info->nregs; regind++) {
|
||||
void *reg = (void *)((uintptr_t)edata_addr_get(&slab) +
|
||||
(bin_info->reg_size * regind));
|
||||
expect_zu_eq(arena_slab_regind(&slab, binind, reg),
|
||||
expect_zu_eq(arena_slab_regind(&dalloc_info, binind,
|
||||
&slab, reg),
|
||||
regind,
|
||||
"Incorrect region index computed for size %zu",
|
||||
bin_info->reg_size);
|
||||
|
Loading…
Reference in New Issue
Block a user