Tcache flush: keep common path state in registers.

By carefully force-inlining the division constants and the operation sum count,
we can eliminate redundant operations in the arena-level dalloc function.  Do
so.
Author: David Goldblatt, 2021-01-29 16:06:28 -08:00
Committed by: David Goldblatt
parent 31a629c3de
commit 229994a204
5 changed files with 130 additions and 79 deletions
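
Background (editor's sketch, not part of the commit): the flush fast path divides a pointer's offset within its slab by the bin's region size to get a region index. div_info_t stores a precomputed multiplier so that division becomes a multiply and a shift, and this commit copies that constant (plus a running dealloc count) into a local struct so the force-inlined loop can keep them in registers. Below is a standalone C sketch of both ideas; the definitions are simplified and hypothetical, and only mirror the names used in the real include/jemalloc/internal/div.h.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Precomputed reciprocal: divide by a fixed d via multiply + shift. */
typedef struct {
        uint32_t magic;
} div_info_t;

static void
div_init(div_info_t *info, size_t d) {
        /*
         * magic = ceil(2^32 / d).  The quotient below is exact whenever n is
         * a multiple of d and n < 2^32, which is the slab-offset case.
         */
        uint64_t two_to_32 = (uint64_t)1 << 32;
        uint32_t magic = (uint32_t)(two_to_32 / d);
        if (two_to_32 % d != 0) {
                magic++;
        }
        info->magic = magic;
}

static inline size_t
div_compute(const div_info_t *info, size_t n) {
        return (size_t)(((uint64_t)n * info->magic) >> 32);
}

int
main(void) {
        /*
         * Hoist the per-bin constant into a local once; an always-inlined hot
         * loop can then keep `local` (and the running counter) in registers
         * instead of reloading them from memory on every iteration.
         */
        div_info_t table[1];
        div_init(&table[0], 48);                /* e.g. a 48-byte size class */
        div_info_t local = table[0];
        uint64_t count = 0;
        for (size_t diff = 0; diff < 48 * 100; diff += 48) {
                assert(div_compute(&local, diff) == diff / 48);
                count++;
        }
        assert(count == 100);
        return 0;
}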

include/jemalloc/internal/arena_externs.h

@@ -2,6 +2,7 @@
#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H

#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/pages.h"
@@ -13,6 +14,8 @@ extern ssize_t opt_muzzy_decay_ms;
extern percpu_arena_mode_t opt_percpu_arena;
extern const char *percpu_arena_mode_names[];

extern div_info_t arena_binind_div_info[SC_NBINS];

extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
extern malloc_mutex_t arenas_lock;
extern emap_t arena_emap_global;
@@ -29,9 +32,6 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
    bin_stats_data_t *bstats, arena_stats_large_t *lstats,
    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
void arena_handle_new_dirty_pages(tsdn_t *tsdn, arena_t *arena);
#ifdef JEMALLOC_JET
size_t arena_slab_regind(edata_t *slab, szind_t binind, const void *ptr);
#endif
edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
    size_t usize, size_t alignment, bool zero);
void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
@@ -59,8 +59,11 @@ void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
    bool slow_path);
void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
bool arena_dalloc_bin_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    szind_t binind, edata_t *edata, void *ptr);
void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
    edata_t *slab, bin_t *bin);
void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
    edata_t *slab, bin_t *bin);
void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
    size_t extra, bool zero, size_t *newsize);

include/jemalloc/internal/arena_inlines_b.h

@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H

#include "jemalloc/internal/div.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/mutex.h"
@@ -441,4 +442,96 @@ arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
        }
}

/*
 * The dalloc bin info contains just the information that the common paths need
 * during tcache flushes. By force-inlining these paths, and using local copies
 * of data (so that the compiler knows it's constant), we avoid a whole bunch of
 * redundant loads and stores by leaving this information in registers.
 */
typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t;
struct arena_dalloc_bin_locked_info_s {
        div_info_t div_info;
        uint32_t nregs;
        uint64_t ndalloc;
};

JEMALLOC_ALWAYS_INLINE size_t
arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
    edata_t *slab, const void *ptr) {
        size_t diff, regind;

        /* Freeing a pointer outside the slab can cause assertion failure. */
        assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
        assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
        /* Freeing an interior pointer can cause assertion failure. */
        assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
            (uintptr_t)bin_infos[binind].reg_size == 0);

        diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));

        /* Avoid doing division with a variable divisor. */
        regind = div_compute(&info->div_info, diff);

        assert(regind < bin_infos[binind].nregs);

        return regind;
}

JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
    szind_t binind) {
        info->div_info = arena_binind_div_info[binind];
        info->nregs = bin_infos[binind].nregs;
        info->ndalloc = 0;
}

/*
 * Does the deallocation work associated with freeing a single pointer (a
 * "step") in between an arena_dalloc_bin_locked begin and end call.
 *
 * Returns true if arena_slab_dalloc must be called on slab. Doesn't do
 * stats updates, which happen during finish (this lets running counts get left
 * in a register).
 */
JEMALLOC_ALWAYS_INLINE bool
arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
    void *ptr) {
        const bin_info_t *bin_info = &bin_infos[binind];
        size_t regind = arena_slab_regind(info, binind, slab, ptr);
        slab_data_t *slab_data = edata_slab_data_get(slab);

        assert(edata_nfree_get(slab) < bin_info->nregs);
        /* Freeing an unallocated pointer can cause assertion failure. */
        assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));

        bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
        edata_nfree_inc(slab);

        if (config_stats) {
                info->ndalloc++;
        }

        unsigned nfree = edata_nfree_get(slab);
        if (nfree == bin_info->nregs) {
                arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
                    bin);
                return true;
        } else if (nfree == 1 && slab != bin->slabcur) {
                arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab,
                    bin);
        }
        return false;
}

JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    arena_dalloc_bin_locked_info_t *info) {
        if (config_stats) {
                bin->stats.ndalloc += info->ndalloc;
                assert(bin->stats.curregs >= (size_t)info->ndalloc);
                bin->stats.curregs -= (size_t)info->ndalloc;
        }
}
#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */
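
A possible caller, shown for illustration only (this sketch is not part of the commit; the real consumer is jemalloc's tcache flush path, which handles pointers spanning multiple slabs and arenas and differs in detail). It assumes a batch of pointers that all come from a single slab of bin binind, and it uses only declarations visible in the headers above.

/*
 * Hypothetical flush helper for one slab's worth of pointers.  Begin copies
 * the per-bin constants into `info`; each step does the bitmap/nfree work and
 * defers stats; finish applies the accumulated counts once.
 */
static void
flush_one_slab(tsdn_t *tsdn, arena_t *arena, bin_t *bin, szind_t binind,
    edata_t *slab, void **ptrs, unsigned nptrs) {
        edata_t *dalloc_slab = NULL;
        arena_dalloc_bin_locked_info_t info;

        malloc_mutex_lock(tsdn, &bin->lock);
        arena_dalloc_bin_locked_begin(&info, binind);
        for (unsigned i = 0; i < nptrs; i++) {
                if (arena_dalloc_bin_locked_step(tsdn, arena, bin, &info,
                    binind, slab, ptrs[i])) {
                        /* Slab became fully free; dalloc it after unlocking. */
                        dalloc_slab = slab;
                }
        }
        arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
        malloc_mutex_unlock(tsdn, &bin->lock);

        if (dalloc_slab != NULL) {
                arena_slab_dalloc(tsdn, arena, dalloc_slab);
        }
}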