From 47b20bb6544de9cdd4ca7ab870d6ad257c0ce4ff Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 24 Aug 2017 14:29:28 -0700 Subject: [PATCH] Change opt.metadata_thp to [disabled,auto,always]. To avoid the high RSS caused by THP + low usage arena (i.e. THP becomes a significant percentage), added a new "auto" option which will only start using THP after a base allocator used up the first THP region. Starting from the second hugepage (in a single arena), "auto" behaves the same as "always", i.e. madvise hugepage right away. --- doc/jemalloc.xml.in | 12 ++++--- include/jemalloc/internal/base_externs.h | 3 +- include/jemalloc/internal/base_inlines.h | 4 +++ include/jemalloc/internal/base_types.h | 17 ++++++++- src/base.c | 46 +++++++++++++++++------- src/ctl.c | 3 +- src/jemalloc.c | 18 +++++++++- src/pages.c | 2 +- src/stats.c | 2 +- test/unit/mallctl.c | 2 +- 10 files changed, 84 insertions(+), 25 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index f1712f05..0c956040 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -919,13 +919,15 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", opt.metadata_thp - (bool) + (const char *) r- - If true, allow jemalloc to use transparent huge page - (THP) for internal metadata (see stats.metadata for details). This - option is disabled by default. + Controls whether to allow jemalloc to use transparent + huge page (THP) for internal metadata (see stats.metadata). always + allows such usage. auto uses no THP initially, but may + begin to do so when metadata usage reaches a certain level. The default + is disabled. 
diff --git a/include/jemalloc/internal/base_externs.h b/include/jemalloc/internal/base_externs.h index a5cb8a8d..6cd11877 100644 --- a/include/jemalloc/internal/base_externs.h +++ b/include/jemalloc/internal/base_externs.h @@ -1,7 +1,8 @@ #ifndef JEMALLOC_INTERNAL_BASE_EXTERNS_H #define JEMALLOC_INTERNAL_BASE_EXTERNS_H -extern bool opt_metadata_thp; +extern metadata_thp_mode_t opt_metadata_thp; +extern const char *metadata_thp_mode_names[]; base_t *b0get(void); base_t *base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks); diff --git a/include/jemalloc/internal/base_inlines.h b/include/jemalloc/internal/base_inlines.h index 931560bf..aec0e2e1 100644 --- a/include/jemalloc/internal/base_inlines.h +++ b/include/jemalloc/internal/base_inlines.h @@ -6,4 +6,8 @@ base_ind_get(const base_t *base) { return base->ind; } +static inline bool +metadata_thp_enabled(void) { + return (opt_metadata_thp != metadata_thp_disabled); +} #endif /* JEMALLOC_INTERNAL_BASE_INLINES_H */ diff --git a/include/jemalloc/internal/base_types.h b/include/jemalloc/internal/base_types.h index 6e710334..97e38a97 100644 --- a/include/jemalloc/internal/base_types.h +++ b/include/jemalloc/internal/base_types.h @@ -4,6 +4,21 @@ typedef struct base_block_s base_block_t; typedef struct base_s base_t; -#define METADATA_THP_DEFAULT false +#define METADATA_THP_DEFAULT metadata_thp_disabled + +typedef enum { + metadata_thp_disabled = 0, + /* + * Lazily enable hugepage for metadata. To avoid high RSS caused by THP + * + low usage arena (i.e. THP becomes a significant percentage), the + * "auto" option only starts using THP after a base allocator used up + * the first THP region. Starting from the second hugepage (in a single + * arena), "auto" behaves the same as "always", i.e. madvise hugepage + * right away. 
+ */ + metadata_thp_auto = 1, + metadata_thp_always = 2, + metadata_thp_mode_limit = 3 +} metadata_thp_mode_t; #endif /* JEMALLOC_INTERNAL_BASE_TYPES_H */ diff --git a/src/base.c b/src/base.c index 99259783..9cb02b63 100644 --- a/src/base.c +++ b/src/base.c @@ -12,7 +12,13 @@ static base_t *b0; -bool opt_metadata_thp = METADATA_THP_DEFAULT; +metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT; + +const char *metadata_thp_mode_names[] = { + "disabled", + "auto", + "always" +}; /******************************************************************************/ @@ -24,7 +30,7 @@ base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size) /* We use hugepage sizes regardless of opt_metadata_thp. */ assert(size == HUGEPAGE_CEILING(size)); - size_t alignment = opt_metadata_thp ? HUGEPAGE : PAGE; + size_t alignment = metadata_thp_enabled() ? HUGEPAGE : PAGE; if (extent_hooks == &extent_hooks_default) { addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit); } else { @@ -36,12 +42,6 @@ base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size) post_reentrancy(tsd); } - if (addr != NULL && opt_metadata_thp && thp_state_madvise) { - assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && - (size & HUGEPAGE_MASK) == 0); - pages_huge(addr, size); - } - return addr; } @@ -101,7 +101,7 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr, post_reentrancy(tsd); } label_done: - if (opt_metadata_thp && thp_state_madvise) { + if (metadata_thp_enabled() && thp_state_madvise) { /* Set NOHUGEPAGE after unmap to avoid kernel defrag. */ assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && (size & HUGEPAGE_MASK) == 0); @@ -181,8 +181,8 @@ base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent, * On success a pointer to the initialized base_block_t header is returned. 
*/ static base_block_t * -base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, - pszind_t *pind_last, size_t *extent_sn_next, size_t size, +base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks, + unsigned ind, pszind_t *pind_last, size_t *extent_sn_next, size_t size, size_t alignment) { alignment = ALIGNMENT_CEILING(alignment, QUANTUM); size_t usize = ALIGNMENT_CEILING(size, alignment); @@ -208,6 +208,26 @@ base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, if (block == NULL) { return NULL; } + + if (metadata_thp_enabled() && thp_state_madvise) { + void *addr = (void *)block; + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (block_size & HUGEPAGE_MASK) == 0); + /* base == NULL indicates this is a new base. */ + if (base != NULL || opt_metadata_thp == metadata_thp_always) { + /* Use hugepage for the new block. */ + pages_huge(addr, block_size); + } + if (base != NULL && opt_metadata_thp == metadata_thp_auto) { + /* Make the first block THP lazily. */ + base_block_t *first_block = base->blocks; + if (first_block->next == NULL) { + assert((first_block->size & HUGEPAGE_MASK) == 0); + pages_huge(first_block, first_block->size); + } + } + } + *pind_last = sz_psz2ind(block_size); block->size = block_size; block->next = NULL; @@ -231,7 +251,7 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { * called. 
*/ malloc_mutex_unlock(tsdn, &base->mtx); - base_block_t *block = base_block_alloc(tsdn, extent_hooks, + base_block_t *block = base_block_alloc(tsdn, base, extent_hooks, base_ind_get(base), &base->pind_last, &base->extent_sn_next, size, alignment); malloc_mutex_lock(tsdn, &base->mtx); @@ -259,7 +279,7 @@ base_t * base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { pszind_t pind_last = 0; size_t extent_sn_next = 0; - base_block_t *block = base_block_alloc(tsdn, extent_hooks, ind, + base_block_t *block = base_block_alloc(tsdn, NULL, extent_hooks, ind, &pind_last, &extent_sn_next, sizeof(base_t), QUANTUM); if (block == NULL) { return NULL; diff --git a/src/ctl.c b/src/ctl.c index c2991036..ace10b02 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1570,7 +1570,8 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool) CTL_RO_NL_GEN(opt_abort, opt_abort, bool) CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool) -CTL_RO_NL_GEN(opt_metadata_thp, opt_metadata_thp, bool) +CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp], + const char *) CTL_RO_NL_GEN(opt_retain, opt_retain, bool) CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) diff --git a/src/jemalloc.c b/src/jemalloc.c index cbae259d..3c0ea7d4 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1055,7 +1055,23 @@ malloc_conf_init(void) { if (opt_abort_conf && had_conf_error) { malloc_abort_invalid_conf(); } - CONF_HANDLE_BOOL(opt_metadata_thp, "metadata_thp") + if (strncmp("metadata_thp", k, klen) == 0) { + int i; + bool match = false; + for (i = 0; i < metadata_thp_mode_limit; i++) { + if (strncmp(metadata_thp_mode_names[i], + v, vlen) == 0) { + opt_metadata_thp = i; + match = true; + break; + } + } + if (!match) { + malloc_conf_error("Invalid conf value", + k, klen, v, vlen); + } + continue; + } CONF_HANDLE_BOOL(opt_retain, "retain") if (strncmp("dss", k, klen) == 0) { int i; diff --git a/src/pages.c b/src/pages.c index 70f1fd33..4ca3107d 100644 --- 
a/src/pages.c +++ b/src/pages.c @@ -418,7 +418,7 @@ os_overcommits_proc(void) { static void init_thp_state(void) { if (!have_madvise_huge) { - if (opt_metadata_thp && opt_abort) { + if (metadata_thp_enabled() && opt_abort) { malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n"); abort(); } diff --git a/src/stats.c b/src/stats.c index 746cc426..e1a3f8cf 100644 --- a/src/stats.c +++ b/src/stats.c @@ -802,11 +802,11 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque, } OPT_WRITE_BOOL(abort, ",") OPT_WRITE_BOOL(abort_conf, ",") - OPT_WRITE_BOOL(metadata_thp, ",") OPT_WRITE_BOOL(retain, ",") OPT_WRITE_CHAR_P(dss, ",") OPT_WRITE_UNSIGNED(narenas, ",") OPT_WRITE_CHAR_P(percpu_arena, ",") + OPT_WRITE_CHAR_P(metadata_thp, ",") OPT_WRITE_BOOL_MUTABLE(background_thread, background_thread, ",") OPT_WRITE_SSIZE_T_MUTABLE(dirty_decay_ms, arenas.dirty_decay_ms, ",") OPT_WRITE_SSIZE_T_MUTABLE(muzzy_decay_ms, arenas.muzzy_decay_ms, ",") diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 0b14e78f..5612cce5 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -158,7 +158,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(bool, abort, always); TEST_MALLCTL_OPT(bool, abort_conf, always); - TEST_MALLCTL_OPT(bool, metadata_thp, always); + TEST_MALLCTL_OPT(const char *, metadata_thp, always); TEST_MALLCTL_OPT(bool, retain, always); TEST_MALLCTL_OPT(const char *, dss, always); TEST_MALLCTL_OPT(unsigned, narenas, always);