Add runtime detection for MADV_DONTNEED zeroes pages (mostly for qemu)

qemu does not support this, yet [1], and you can get very tricky assert
if you will run program with jemalloc in use under qemu:

    <jemalloc>: ../contrib/jemalloc/src/extent.c:1195: Failed assertion: "p[i] == 0"

  [1]: https://patchwork.kernel.org/patch/10576637/

Here is a simple example that shows the problem [2]:

    // Gist to check possible issues with MADV_DONTNEED
    // For example it does not supported by qemu user
    // There is a patch for this [1], but it hasn't been applied.
    //   [1]: https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg05422.html

    #include <sys/mman.h>
    #include <stdio.h>
    #include <stddef.h>
    #include <assert.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
        void *addr = mmap(NULL, 1<<16, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        memset(addr, 'A', 1<<16);

        if (!madvise(addr, 1<<16, MADV_DONTNEED)) {
            puts("MADV_DONTNEED does not return error. Check memory.");
            for (int i = 0; i < 1<<16; ++i) {
                assert(((unsigned char *)addr)[i] == 0);
            }
        } else {
            perror("madvise");
        }

        if (munmap(addr, 1<<16)) {
            perror("munmap");
            return 1;
        }

        return 0;
    }

  ### unpatched qemu

      $ qemu-x86_64-static /tmp/test-MADV_DONTNEED
      MADV_DONTNEED does not return error. Check memory.
      test-MADV_DONTNEED: /tmp/test-MADV_DONTNEED.c:19: main: Assertion `((unsigned char *)addr)[i] == 0' failed.
      qemu: uncaught target signal 6 (Aborted) - core dumped
      Aborted (core dumped)

  ### patched qemu (by returning ENOSYS error)

      $ qemu-x86_64 /tmp/test-MADV_DONTNEED
      madvise: Success

  ### patch for qemu to return ENOSYS

      diff --git a/linux-user/syscall.c b/linux-user/syscall.c
      index 897d20c076..5540792e0e 100644
      --- a/linux-user/syscall.c
      +++ b/linux-user/syscall.c
      @@ -11775,7 +11775,7 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
                  turns private file-backed mappings into anonymous mappings.
                  This will break MADV_DONTNEED.
                  This is a hint, so ignoring and returning success is ok.  */
      -        return 0;
      +        return ENOSYS;
       #endif
       #ifdef TARGET_NR_fcntl64
           case TARGET_NR_fcntl64:

  [2]: https://gist.github.com/azat/12ba2c825b710653ece34dba7f926ece

v2:
- review fixes
- add opt_dont_trust_madvise
v3:
- review fixes
- rename opt_dont_trust_madvise to opt_trust_madvise
This commit is contained in:
Azat Khuzhin 2020-12-18 22:23:35 +03:00 committed by David Goldblatt
parent 2e3104ba07
commit a943172b73
6 changed files with 94 additions and 2 deletions

View File

@ -950,6 +950,18 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
is <quote>disabled</quote>.</para></listitem> is <quote>disabled</quote>.</para></listitem>
</varlistentry> </varlistentry>
<varlistentry id="opt.trust_madvise">
<term>
<mallctl>opt.trust_madvise</mallctl>
(<type>bool</type>)
<literal>r-</literal>
</term>
<listitem><para>Do not perform runtime check for MADV_DONTNEED, to
check that it actually zeros pages. The default is
<quote>disabled</quote> on linux and <quote>enabled</quote> elsewhere.
</para></listitem>
</varlistentry>
<varlistentry id="opt.retain"> <varlistentry id="opt.retain">
<term> <term>
<mallctl>opt.retain</mallctl> <mallctl>opt.retain</mallctl>

View File

@ -11,6 +11,7 @@ extern bool malloc_slow;
/* Run-time options. */ /* Run-time options. */
extern bool opt_abort; extern bool opt_abort;
extern bool opt_abort_conf; extern bool opt_abort_conf;
extern bool opt_trust_madvise;
extern bool opt_confirm_conf; extern bool opt_confirm_conf;
extern bool opt_hpa; extern bool opt_hpa;
extern size_t opt_hpa_slab_max_alloc; extern size_t opt_hpa_slab_max_alloc;

View File

@ -90,6 +90,7 @@ CTL_PROTO(config_utrace)
CTL_PROTO(config_xmalloc) CTL_PROTO(config_xmalloc)
CTL_PROTO(opt_abort) CTL_PROTO(opt_abort)
CTL_PROTO(opt_abort_conf) CTL_PROTO(opt_abort_conf)
CTL_PROTO(opt_trust_madvise)
CTL_PROTO(opt_confirm_conf) CTL_PROTO(opt_confirm_conf)
CTL_PROTO(opt_hpa) CTL_PROTO(opt_hpa)
CTL_PROTO(opt_hpa_slab_max_alloc) CTL_PROTO(opt_hpa_slab_max_alloc)
@ -372,6 +373,7 @@ static const ctl_named_node_t config_node[] = {
static const ctl_named_node_t opt_node[] = { static const ctl_named_node_t opt_node[] = {
{NAME("abort"), CTL(opt_abort)}, {NAME("abort"), CTL(opt_abort)},
{NAME("abort_conf"), CTL(opt_abort_conf)}, {NAME("abort_conf"), CTL(opt_abort_conf)},
{NAME("trust_madvise"), CTL(opt_trust_madvise)},
{NAME("confirm_conf"), CTL(opt_confirm_conf)}, {NAME("confirm_conf"), CTL(opt_confirm_conf)},
{NAME("hpa"), CTL(opt_hpa)}, {NAME("hpa"), CTL(opt_hpa)},
{NAME("hpa_slab_max_alloc"), CTL(opt_hpa_slab_max_alloc)}, {NAME("hpa_slab_max_alloc"), CTL(opt_hpa_slab_max_alloc)},
@ -2045,6 +2047,7 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool)
CTL_RO_NL_GEN(opt_abort, opt_abort, bool) CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool) CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool)
CTL_RO_NL_GEN(opt_trust_madvise, opt_trust_madvise, bool)
CTL_RO_NL_GEN(opt_confirm_conf, opt_confirm_conf, bool) CTL_RO_NL_GEN(opt_confirm_conf, opt_confirm_conf, bool)
CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool) CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool)
CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_slab_max_alloc, size_t) CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_slab_max_alloc, size_t)

View File

@ -94,6 +94,13 @@ bool opt_junk_free =
false false
#endif #endif
; ;
bool opt_trust_madvise =
#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
false
#else
true
#endif
;
zero_realloc_action_t opt_zero_realloc_action = zero_realloc_action_t opt_zero_realloc_action =
zero_realloc_action_strict; zero_realloc_action_strict;
@ -1256,6 +1263,7 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
CONF_HANDLE_BOOL(opt_abort, "abort") CONF_HANDLE_BOOL(opt_abort, "abort")
CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf") CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf")
CONF_HANDLE_BOOL(opt_trust_madvise, "trust_madvise")
if (strncmp("metadata_thp", k, klen) == 0) { if (strncmp("metadata_thp", k, klen) == 0) {
int i; int i;
bool match = false; bool match = false;

View File

@ -42,6 +42,57 @@ thp_mode_t init_system_thp_mode;
/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */ /* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true; static bool pages_can_purge_lazy_runtime = true;
#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
static int madvise_dont_need_zeros_is_faulty = -1;
/**
* Check that MADV_DONTNEED will actually zero pages on subsequent access.
*
* Since qemu does not support this, yet [1], and you can get very tricky
* assert if you will run program with jemalloc in use under qemu:
*
* <jemalloc>: ../contrib/jemalloc/src/extent.c:1195: Failed assertion: "p[i] == 0"
*
* [1]: https://patchwork.kernel.org/patch/10576637/
*/
static int madvise_MADV_DONTNEED_zeroes_pages()
{
int works = -1;
size_t size = PAGE;
void * addr = mmap(NULL, size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED) {
malloc_write("<jemalloc>: Cannot allocate memory for "
"MADV_DONTNEED check\n");
if (opt_abort) {
abort();
}
}
memset(addr, 'A', size);
if (madvise(addr, size, MADV_DONTNEED) == 0) {
works = memchr(addr, 'A', size) == NULL;
} else {
/*
* If madvise() does not support MADV_DONTNEED, then we can
* call it anyway, and use it's return code.
*/
works = 1;
}
if (munmap(addr, size) != 0) {
malloc_write("<jemalloc>: Cannot deallocate memory for "
"MADV_DONTNEED check\n");
if (opt_abort) {
abort();
}
}
return works;
}
#endif
/******************************************************************************/ /******************************************************************************/
/* /*
* Function prototypes for static functions that are referenced prior to * Function prototypes for static functions that are referenced prior to
@ -351,10 +402,12 @@ pages_purge_forced(void *addr, size_t size) {
#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ #if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS) defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
return (madvise(addr, size, MADV_DONTNEED) != 0); return (unlikely(madvise_dont_need_zeros_is_faulty) ||
madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \ #elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \
defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS) defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS)
return (posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0); return (unlikely(madvise_dont_need_zeros_is_faulty) ||
posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE) #elif defined(JEMALLOC_MAPS_COALESCE)
/* Try to overlay a new demand-zeroed mapping. */ /* Try to overlay a new demand-zeroed mapping. */
return pages_commit(addr, size); return pages_commit(addr, size);
@ -642,6 +695,20 @@ pages_boot(void) {
return true; return true;
} }
#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
if (!opt_trust_madvise) {
madvise_dont_need_zeros_is_faulty = !madvise_MADV_DONTNEED_zeroes_pages();
if (madvise_dont_need_zeros_is_faulty) {
malloc_write("<jemalloc>: MADV_DONTNEED does not work (memset will be used instead)\n");
malloc_write("<jemalloc>: (This is the expected behaviour if you are running under QEMU)\n");
}
} else {
/* In case opt_trust_madvise is disable,
* do not do runtime check */
madvise_dont_need_zeros_is_faulty = 0;
}
#endif
#ifndef _WIN32 #ifndef _WIN32
mmap_flags = MAP_PRIVATE | MAP_ANON; mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif #endif

View File

@ -279,6 +279,7 @@ TEST_BEGIN(test_mallctl_opt) {
TEST_MALLCTL_OPT(bool, abort, always); TEST_MALLCTL_OPT(bool, abort, always);
TEST_MALLCTL_OPT(bool, abort_conf, always); TEST_MALLCTL_OPT(bool, abort_conf, always);
TEST_MALLCTL_OPT(bool, trust_madvise, always);
TEST_MALLCTL_OPT(bool, confirm_conf, always); TEST_MALLCTL_OPT(bool, confirm_conf, always);
TEST_MALLCTL_OPT(const char *, metadata_thp, always); TEST_MALLCTL_OPT(const char *, metadata_thp, always);
TEST_MALLCTL_OPT(bool, retain, always); TEST_MALLCTL_OPT(bool, retain, always);