Tcache flush: prefetch edata contents.

This frontloads more of the miss latency.  It also moves it to a pathway where
we have not yet acquired any locks, so it should (hopefully) reduce lock hold
times.
David Goldblatt 2021-01-28 16:14:39 -08:00 committed by David Goldblatt
parent 9f9247a62e
commit 31a629c3de
2 changed files with 50 additions and 0 deletions
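
As a rough illustration of the intent (a hedged sketch; flush_one, meta_t, and
bin_lock are invented names, not jemalloc code): issuing the write prefetch
while no lock is held lets the cache miss overlap with lock acquisition instead
of lengthening the critical section.

#include <pthread.h>
#include <stddef.h>

/* Hypothetical stand-in for the per-extent metadata (edata_t). */
typedef struct {
	size_t nfree;
} meta_t;

static pthread_mutex_t bin_lock = PTHREAD_MUTEX_INITIALIZER;

static void
flush_one(meta_t *meta) {
	/* Start the write-intent prefetch before any lock is taken. */
	__builtin_prefetch(meta, /* write */ 1, /* high locality */ 3);
	pthread_mutex_lock(&bin_lock);
	meta->nfree++;	/* The line has (hopefully) arrived in cache by now. */
	pthread_mutex_unlock(&bin_lock);
}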

include/jemalloc/internal/util.h

@@ -69,6 +69,55 @@ util_assume(bool b) {
	}
}

/* ptr should be valid. */
JEMALLOC_ALWAYS_INLINE void
util_prefetch_read(void *ptr) {
	/*
	 * This should arguably be a config check; but any version of GCC so old
	 * that it doesn't support __builtin_prefetch is also too old to build
	 * jemalloc.
	 */
#ifdef __GNUC__
	if (config_debug) {
		/* Enforce the "valid ptr" requirement. */
		*(volatile char *)ptr;
	}
	__builtin_prefetch(ptr, /* read or write */ 0, /* locality hint */ 3);
#else
	*(volatile char *)ptr;
#endif
}

JEMALLOC_ALWAYS_INLINE void
util_prefetch_write(void *ptr) {
#ifdef __GNUC__
	if (config_debug) {
		*(volatile char *)ptr;
	}
	/*
	 * The only difference from the read variant is that this has a 1 as the
	 * second argument (the write hint).
	 */
	__builtin_prefetch(ptr, 1, 3);
#else
	*(volatile char *)ptr;
#endif
}

JEMALLOC_ALWAYS_INLINE void
util_prefetch_read_range(void *ptr, size_t sz) {
	for (size_t i = 0; i < sz; i += CACHELINE) {
		util_prefetch_read((void *)((uintptr_t)ptr + i));
	}
}

JEMALLOC_ALWAYS_INLINE void
util_prefetch_write_range(void *ptr, size_t sz) {
	for (size_t i = 0; i < sz; i += CACHELINE) {
		util_prefetch_write((void *)((uintptr_t)ptr + i));
	}
}

#undef UTIL_INLINE

#endif /* JEMALLOC_INTERNAL_UTIL_H */
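
For illustration, a sketch of how a caller might use the new range helper (the
caller and struct below are invented for this example and assume the util.h
declarations above are in scope): warm every cache line of an object before a
loop that reads it.

#include <stddef.h>

/* Hypothetical multi-cache-line object; not part of this commit. */
typedef struct {
	int vals[64];
} big_t;

static int
sum_vals(big_t *obj) {
	/* One read prefetch per CACHELINE-sized step across *obj. */
	util_prefetch_read_range(obj, sizeof(*obj));

	int sum = 0;
	for (size_t i = 0; i < sizeof(obj->vals) / sizeof(obj->vals[0]); i++) {
		sum += obj->vals[i];
	}
	return sum;
}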

src/tcache.c

@@ -247,6 +247,7 @@ tcache_bin_flush_metadata_visitor(void *szind_sum_ctx,
    emap_full_alloc_ctx_t *alloc_ctx) {
	size_t *szind_sum = (size_t *)szind_sum_ctx;
	*szind_sum -= alloc_ctx->szind;
	util_prefetch_write_range(alloc_ctx->edata, sizeof(edata_t));
}

static void
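
To sketch why the prefetch goes in this visitor (hypothetical names throughout;
this is not the actual jemalloc flush code): the metadata-gathering pass visits
every pointer in the bin before any lock is taken, so a write prefetch of each
pointer's edata here gives the misses time to resolve before the later, locked
pass writes to those same structures.

#include <pthread.h>
#include <stddef.h>

#define CACHE_LINE 64

/* Stand-in for edata_t: per-extent metadata spanning a couple of cache lines. */
typedef struct {
	size_t nfree;
	char pad[120];
} extent_md_t;

static pthread_mutex_t arena_lock = PTHREAD_MUTEX_INITIALIZER;

static void
flush_batch(extent_md_t **mds, size_t n) {
	/* Pass 1: lock-free visit; start the miss for each metadata object. */
	for (size_t i = 0; i < n; i++) {
		for (size_t off = 0; off < sizeof(extent_md_t); off += CACHE_LINE) {
			__builtin_prefetch((char *)mds[i] + off, /* write */ 1, 3);
		}
	}
	/* Pass 2: locked; the metadata writes should now mostly hit in cache. */
	pthread_mutex_lock(&arena_lock);
	for (size_t i = 0; i < n; i++) {
		mds[i]->nfree++;
	}
	pthread_mutex_unlock(&arena_lock);
}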