From 978f830ee300c15460085bdc49b4bdb9ef1a16d8 Mon Sep 17 00:00:00 2001
From: Yinan Zhang
Date: Thu, 23 Apr 2020 15:46:45 -0700
Subject: [PATCH] Add batch allocation API

---
 Makefile.in                               |   4 +-
 .../internal/jemalloc_internal_externs.h  |   1 +
 include/jemalloc/internal/prof_inlines.h  |  11 +
 include/jemalloc/internal/thread_event.h  |   1 -
 src/jemalloc.c                            | 117 +++++++++++
 test/unit/batch_alloc.c                   | 190 ++++++++++++++++++
 test/unit/batch_alloc.sh                  |   3 +
 test/unit/batch_alloc_prof.c              |   1 +
 test/unit/batch_alloc_prof.sh             |   3 +
 9 files changed, 329 insertions(+), 2 deletions(-)
 create mode 100644 test/unit/batch_alloc.c
 create mode 100644 test/unit/batch_alloc.sh
 create mode 100644 test/unit/batch_alloc_prof.c
 create mode 100644 test/unit/batch_alloc_prof.sh

diff --git a/Makefile.in b/Makefile.in
index 10c5392e..da094f08 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -191,6 +191,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/background_thread.c \
 	$(srcroot)test/unit/background_thread_enable.c \
 	$(srcroot)test/unit/base.c \
+	$(srcroot)test/unit/batch_alloc.c \
 	$(srcroot)test/unit/binshard.c \
 	$(srcroot)test/unit/bitmap.c \
 	$(srcroot)test/unit/bit_util.c \
@@ -264,7 +265,8 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/zero_reallocs.c
 ifeq (@enable_prof@, 1)
 TESTS_UNIT += \
-	$(srcroot)test/unit/arena_reset_prof.c
+	$(srcroot)test/unit/arena_reset_prof.c \
+	$(srcroot)test/unit/batch_alloc_prof.c
 endif
 TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
 	$(srcroot)test/integration/allocated.c \
diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h
index 3dea1e21..3e7124d5 100644
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -54,6 +54,7 @@ void arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind);
 void iarena_cleanup(tsd_t *tsd);
 void arena_cleanup(tsd_t *tsd);
 void arenas_tdata_cleanup(tsd_t *tsd);
+size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
 void jemalloc_prefork(void);
 void jemalloc_postfork_parent(void);
 void jemalloc_postfork_child(void);
diff --git a/include/jemalloc/internal/prof_inlines.h b/include/jemalloc/internal/prof_inlines.h
index d8f401d1..3d0bd14a 100644
--- a/include/jemalloc/internal/prof_inlines.h
+++ b/include/jemalloc/internal/prof_inlines.h
@@ -229,6 +229,17 @@ prof_sample_aligned(const void *ptr) {
 	return ((uintptr_t)ptr & PAGE_MASK) == 0;
 }
 
+JEMALLOC_ALWAYS_INLINE bool
+prof_sampled(tsd_t *tsd, const void *ptr) {
+	prof_info_t prof_info;
+	prof_info_get(tsd, ptr, NULL, &prof_info);
+	bool sampled = (uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U;
+	if (sampled) {
+		assert(prof_sample_aligned(ptr));
+	}
+	return sampled;
+}
+
 JEMALLOC_ALWAYS_INLINE void
 prof_free(tsd_t *tsd, const void *ptr, size_t usize,
     emap_alloc_ctx_t *alloc_ctx) {
diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h
index 5925563a..525019b6 100644
--- a/include/jemalloc/internal/thread_event.h
+++ b/include/jemalloc/internal/thread_event.h
@@ -274,7 +274,6 @@ te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize,
 
 JEMALLOC_ALWAYS_INLINE bool
 te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) {
-	assert(usize == sz_s2u(usize));
 	return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL);
 }
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 9b5ce681..f2e5f8eb 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -3916,6 +3916,123 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
 	return ret;
 }
 
+static void
+batch_alloc_prof_sample_assert(tsd_t *tsd, size_t batch, size_t usize) {
+	assert(config_prof && opt_prof);
+	bool prof_sample_event = te_prof_sample_event_lookahead(tsd,
+	    batch * usize);
+	assert(!prof_sample_event);
+	size_t surplus;
+	prof_sample_event = te_prof_sample_event_lookahead_surplus(tsd,
+	    (batch + 1) * usize, &surplus);
+	assert(prof_sample_event);
+	assert(surplus < usize);
+}
+
+size_t
+batch_alloc(void **ptrs, size_t num, size_t size, int flags) {
+	LOG("core.batch_alloc.entry",
+	    "ptrs: %p, num: %zu, size: %zu, flags: %d", ptrs, num, size, flags);
+
+	tsd_t *tsd = tsd_fetch();
+	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	size_t filled = 0;
+
+	if (unlikely(tsd == NULL || tsd_reentrancy_level_get(tsd) > 0)) {
+		goto label_done;
+	}
+
+	size_t alignment = MALLOCX_ALIGN_GET(flags);
+	size_t usize;
+	if (aligned_usize_get(size, alignment, &usize, NULL, false)) {
+		goto label_done;
+	}
+
+	szind_t ind = sz_size2index(usize);
+	if (unlikely(ind >= SC_NBINS)) {
+		/* No optimization for large sizes. */
+		void *p;
+		while (filled < num && (p = je_mallocx(size, flags)) != NULL) {
+			ptrs[filled++] = p;
+		}
+		goto label_done;
+	}
+
+	bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true);
+
+	unsigned arena_ind = mallocx_arena_get(flags);
+	arena_t *arena;
+	if (arena_get_from_ind(tsd, arena_ind, &arena)) {
+		goto label_done;
+	}
+	if (arena == NULL) {
+		arena = arena_choose(tsd, NULL);
+	} else {
+		/* When a manual arena is specified, bypass the tcache. */
+		flags |= MALLOCX_TCACHE_NONE;
+	}
+	if (unlikely(arena == NULL)) {
+		goto label_done;
+	}
+
+	while (filled < num) {
+		size_t batch = num - filled;
+		size_t surplus = SIZE_MAX; /* Dead store. */
+		bool prof_sample_event = config_prof && opt_prof
+		    && te_prof_sample_event_lookahead_surplus(tsd,
+		    batch * usize, &surplus);
+
+		if (prof_sample_event) {
+			/*
+			 * Adjust so that the batch does not trigger prof
+			 * sampling.
+			 */
+			batch -= surplus / usize + 1;
+			batch_alloc_prof_sample_assert(tsd, batch, usize);
+		}
+
+		size_t n = arena_fill_small_fresh(tsd_tsdn(tsd), arena,
+		    ind, ptrs + filled, batch, zero);
+		filled += n;
+
+		/*
+		 * For thread events other than prof sampling, trigger them as
+		 * if there's a single allocation of size (n * usize).  This is
+		 * fine because:
+		 * (a) these events do not alter the allocation itself, and
+		 * (b) it's possible that some event would have been triggered
+		 *     multiple times, instead of only once, if the allocations
+		 *     were handled individually, but it would do no harm (or
+		 *     even be beneficial) to coalesce the triggerings.
+		 */
+		thread_alloc_event(tsd, n * usize);
+
+		if (n < batch) { /* OOM */
+			break;
+		}
+
+		if (prof_sample_event) {
+			/*
+			 * The next allocation will be prof sampled.  The
+			 * thread event logic is handled within the mallocx()
+			 * call.
+			 */
+			void *p = je_mallocx(size, flags);
+			if (p == NULL) { /* OOM */
+				break;
+			}
+			assert(prof_sampled(tsd, p));
+			ptrs[filled++] = p;
+		}
+	}
+
+label_done:
+	check_entry_exit_locking(tsd_tsdn(tsd));
+	LOG("core.batch_alloc.exit", "result: %zu", filled);
+	return filled;
+}
+
 /*
  * End non-standard functions.
  */
diff --git a/test/unit/batch_alloc.c b/test/unit/batch_alloc.c
new file mode 100644
index 00000000..66e0565f
--- /dev/null
+++ b/test/unit/batch_alloc.c
@@ -0,0 +1,190 @@
+#include "test/jemalloc_test.h"
+
+#define BATCH_MAX ((1U << 16) + 1024)
+static void *ptrs[BATCH_MAX];
+
+#define PAGE_ALIGNED(ptr) (((uintptr_t)ptr & PAGE_MASK) == 0)
+
+static void
+verify_stats(bin_stats_t *before, bin_stats_t *after, size_t batch,
+    unsigned nregs) {
+	if (!config_stats) {
+		return;
+	}
+	if (config_prof && opt_prof) {
+		/*
+		 * Checking the stats when prof is on is feasible but
+		 * complicated, while checking the non-prof case suffices for
+		 * unit-test purpose.
+		 */
+		return;
+	}
+	expect_u64_eq(before->nmalloc + batch, after->nmalloc, "");
+	expect_u64_eq(before->nrequests + batch, after->nrequests, "");
+	expect_zu_eq(before->curregs + batch, after->curregs, "");
+	size_t nslab = batch / nregs;
+	size_t n_nonfull = 0;
+	if (batch % nregs != 0) {
+		++nslab;
+		++n_nonfull;
+	}
+	expect_u64_eq(before->nslabs + nslab, after->nslabs, "");
+	expect_zu_eq(before->curslabs + nslab, after->curslabs, "");
+	expect_zu_eq(before->nonfull_slabs + n_nonfull, after->nonfull_slabs,
+	    "");
+}
+
+static void
+verify_batch(tsd_t *tsd, void **ptrs, size_t batch, size_t usize, bool zero,
+    arena_t *arena, unsigned nregs) {
+	for (size_t i = 0, j = 0; i < batch; ++i, ++j) {
+		if (j == nregs) {
+			j = 0;
+		}
+		void *p = ptrs[i];
+		expect_zu_eq(isalloc(tsd_tsdn(tsd), p), usize, "");
+		expect_ptr_eq(iaalloc(tsd_tsdn(tsd), p), arena, "");
+		if (zero) {
+			for (size_t k = 0; k < usize; ++k) {
+				expect_true(*((unsigned char *)p + k) == 0, "");
+			}
+		}
+		if (j == 0) {
+			expect_true(PAGE_ALIGNED(p), "");
+			continue;
+		}
+		assert(i > 0);
+		void *q = ptrs[i - 1];
+		bool adjacent = (uintptr_t)p > (uintptr_t)q
+		    && (size_t)((uintptr_t)p - (uintptr_t)q) == usize;
+		if (config_prof && opt_prof) {
+			if (adjacent) {
+				expect_false(prof_sampled(tsd, p)
+				    || prof_sampled(tsd, q), "");
+			} else {
+				expect_true(prof_sampled(tsd, p)
+				    || prof_sampled(tsd, q), "");
+				expect_true(PAGE_ALIGNED(p), "");
+				j = 0;
+			}
+		} else {
+			expect_true(adjacent, "");
+		}
+	}
+}
+
+static void
+release_batch(void **ptrs, size_t batch, size_t size) {
+	for (size_t i = 0; i < batch; ++i) {
+		sdallocx(ptrs[i], size, 0);
+	}
+}
+
+static void
+test_wrapper(size_t size, size_t alignment, bool zero, unsigned arena_flag) {
+	tsd_t *tsd = tsd_fetch();
+	assert(tsd != NULL);
+	const size_t usize =
+	    (alignment != 0 ? sz_sa2u(size, alignment) : sz_s2u(size));
+	const szind_t ind = sz_size2index(usize);
+	const bin_info_t *bin_info = &bin_infos[ind];
+	const unsigned nregs = bin_info->nregs;
+	assert(nregs > 0);
+	arena_t *arena;
+	if (arena_flag != 0) {
+		arena = arena_get(tsd_tsdn(tsd), MALLOCX_ARENA_GET(arena_flag),
+		    false);
+	} else {
+		arena = arena_choose(tsd, NULL);
+	}
+	assert(arena != NULL);
+	bin_t *bin = arena_bin_choose(tsd_tsdn(tsd), arena, ind, NULL);
+	assert(bin != NULL);
+	int flags = arena_flag;
+	if (alignment != 0) {
+		flags |= MALLOCX_ALIGN(alignment);
+	}
+	if (zero) {
+		flags |= MALLOCX_ZERO;
+	}
+
+	/*
+	 * Allocate for the purpose of bootstrapping arena_tdata, so that the
+	 * change in bin stats won't contaminate the stats to be verified below.
+	 */
+	void *p = mallocx(size, flags | MALLOCX_TCACHE_NONE);
+
+	for (size_t i = 0; i < 4; ++i) {
+		size_t base = 0;
+		if (i == 1) {
+			base = nregs;
+		} else if (i == 2) {
+			base = nregs * 2;
+		} else if (i == 3) {
+			base = (1 << 16);
+		}
+		for (int j = -1; j <= 1; ++j) {
+			if (base == 0 && j == -1) {
+				continue;
+			}
+			size_t batch = base + (size_t)j;
+			assert(batch < BATCH_MAX);
+			bin_stats_t stats_before, stats_after;
+			memcpy(&stats_before, &bin->stats, sizeof(bin_stats_t));
+			size_t filled = batch_alloc(ptrs, batch, size, flags);
+			assert_zu_eq(filled, batch, "");
+			memcpy(&stats_after, &bin->stats, sizeof(bin_stats_t));
+			verify_stats(&stats_before, &stats_after, batch, nregs);
+			verify_batch(tsd, ptrs, batch, usize, zero, arena,
+			    nregs);
+			release_batch(ptrs, batch, usize);
+		}
+	}
+
+	free(p);
+}
+
+TEST_BEGIN(test_batch_alloc) {
+	test_wrapper(11, 0, false, 0);
+}
+TEST_END
+
+TEST_BEGIN(test_batch_alloc_zero) {
+	test_wrapper(11, 0, true, 0);
+}
+TEST_END
+
+TEST_BEGIN(test_batch_alloc_aligned) {
+	test_wrapper(7, 16, false, 0);
+}
+TEST_END
+
+TEST_BEGIN(test_batch_alloc_manual_arena) {
+	unsigned arena_ind;
+	size_t len_unsigned = sizeof(unsigned);
+	assert_d_eq(mallctl("arenas.create", &arena_ind, &len_unsigned, NULL,
+	    0), 0, "");
+	test_wrapper(11, 0, false, MALLOCX_ARENA(arena_ind));
+}
+TEST_END
+
+TEST_BEGIN(test_batch_alloc_fallback) {
+	const size_t size = SC_LARGE_MINCLASS;
+	for (size_t batch = 0; batch < 4; ++batch) {
+		assert(batch < BATCH_MAX);
+		size_t filled = batch_alloc(ptrs, batch, size, 0);
+		assert_zu_eq(filled, batch, "");
+		release_batch(ptrs, batch, size);
+	}
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    test_batch_alloc,
+	    test_batch_alloc_zero,
+	    test_batch_alloc_aligned,
+	    test_batch_alloc_manual_arena,
+	    test_batch_alloc_fallback);
+}
diff --git a/test/unit/batch_alloc.sh b/test/unit/batch_alloc.sh
new file mode 100644
index 00000000..9d81010a
--- /dev/null
+++ b/test/unit/batch_alloc.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+export MALLOC_CONF="tcache_gc_incr_bytes:2147483648"
diff --git a/test/unit/batch_alloc_prof.c b/test/unit/batch_alloc_prof.c
new file mode 100644
index 00000000..ef644586
--- /dev/null
+++ b/test/unit/batch_alloc_prof.c
@@ -0,0 +1 @@
+#include "batch_alloc.c"
diff --git a/test/unit/batch_alloc_prof.sh b/test/unit/batch_alloc_prof.sh
new file mode 100644
index 00000000..a2697a61
--- /dev/null
+++ b/test/unit/batch_alloc_prof.sh
@@ -0,0 +1,3 @@
+#!/bin/sh

+export MALLOC_CONF="prof:true,lg_prof_sample:14"
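Reviewer note (not part of the patch): below is a minimal caller-side sketch of the new API. It assumes the batch_alloc prototype is reachable from application code; in this patch the prototype lives only in the internal header include/jemalloc/internal/jemalloc_internal_externs.h (the unit tests build against internals), so a real caller would need a static build or a follow-up change exposing the symbol. The allocation sizes and counts used here are arbitrary.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Assumed visible and linkable; declared internally by this patch. */
size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);

int
main(void) {
	enum { NUM = 1000, SIZE = 48 };
	void *ptrs[NUM];

	/* Request NUM zero-initialized allocations of SIZE bytes each. */
	size_t filled = batch_alloc(ptrs, NUM, SIZE, MALLOCX_ZERO);

	/*
	 * filled < NUM means the fill stopped early (e.g. OOM); only the
	 * first filled slots hold valid pointers.
	 */
	printf("filled %zu of %d\n", filled, NUM);

	/* Pointers are released individually, like any mallocx() result. */
	for (size_t i = 0; i < filled; ++i) {
		dallocx(ptrs[i], 0);
	}
	return 0;
}

The flags argument accepts the same MALLOCX_* flags as mallocx(): per the implementation above, alignment, zeroing, and arena selection are honored, and requests whose usize falls outside the small size classes fall back to per-pointer je_mallocx() calls.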