From cafe9a315879b357ac3c6d00f3b7f9ad52c33087 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 17 Dec 2021 21:00:21 +0300 Subject: [PATCH] Disable percpu arena in case of non deterministic CPU count A deterministic number of CPUs is important for percpu arena to work correctly, since it uses the CPU index (sched_getcpu()), and if that index is greater than the number of CPUs, bad things will happen, or an assertion will fail in a debug build: : ../contrib/jemalloc/src/jemalloc.c:321: Failed assertion: "ind <= narenas_total_get()" Aborted (core dumped) The number of CPUs can be obtained from the following places: - sched_getaffinity() - sysconf(_SC_NPROCESSORS_ONLN) - sysconf(_SC_NPROCESSORS_CONF) For sched_getaffinity() you may simply use taskset(1) to run the program on a different CPU, and if it is not the first one, percpu will work incorrectly, i.e.: $ taskset --cpu-list $(( $(getconf _NPROCESSORS_ONLN)-1 )) _SC_NPROCESSORS_ONLN uses /sys/devices/system/cpu/online; LXD/LXC virtualizes the /sys/devices/system/cpu/online file [1], so when you run a container with limits.cpus restricted, it will bind a randomly selected CPU to it. [1]: https://github.com/lxc/lxcfs/issues/301 _SC_NPROCESSORS_CONF uses /sys/devices/system/cpu/cpu*, and AFAIK nobody is playing with the dentries there. So if all three of these are equal, percpu arenas should work correctly. And a small note regarding _SC_NPROCESSORS_ONLN/_SC_NPROCESSORS_CONF: musl uses sched_getaffinity() for both. So this will also increase the entropy. Also note that you can check whether percpu arena is really applied using abort_conf:true. 
Refs: https://github.com/jemalloc/jemalloc/pull/1939 Refs: https://github.com/ClickHouse/ClickHouse/issues/32806 v2: move malloc_cpu_count_is_deterministic() into malloc_init_hard_recursible() since _SC_NPROCESSORS_CONF does allocations for readdir() v3: - mark cpu_count_is_deterministic static - check only if percpu arena is enabled - check narenas --- src/jemalloc.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index e707f9f9..38f70367 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -148,6 +148,8 @@ unsigned opt_narenas = 0; fxp_t opt_narenas_ratio = FXP_INIT_INT(4); unsigned ncpus; +/* ncpus is deterministic, see malloc_cpu_count_is_deterministic() */ +static int cpu_count_is_deterministic = -1; /* Protects arenas initialization. */ malloc_mutex_t arenas_lock; @@ -741,6 +743,42 @@ malloc_ncpus(void) { return ((result == -1) ? 1 : (unsigned)result); } +/* + * Ensure that number of CPUs is deterministic, i.e. it is the same based on: + * - sched_getaffinity() + * - _SC_NPROCESSORS_ONLN + * - _SC_NPROCESSORS_CONF + * Since otherwise tricky things are possible with percpu arenas in use. 
+ */ +static bool +malloc_cpu_count_is_deterministic() +{ +#ifdef _WIN32 + return true; +#else + long cpu_onln = sysconf(_SC_NPROCESSORS_ONLN); + long cpu_conf = sysconf(_SC_NPROCESSORS_CONF); + if (cpu_onln != cpu_conf) + return false; +# if defined(CPU_COUNT) +# if defined(__FreeBSD__) + cpuset_t set; +# else + cpu_set_t set; +# endif /* __FreeBSD__ */ +# if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + sched_getaffinity(0, sizeof(set), &set); +# else /* !JEMALLOC_HAVE_SCHED_SETAFFINITY */ + pthread_getaffinity_np(pthread_self(), sizeof(set), &set); +# endif /* JEMALLOC_HAVE_SCHED_SETAFFINITY */ + long cpu_affinity = CPU_COUNT(&set); + if (cpu_affinity != cpu_conf) + return false; +# endif /* CPU_COUNT */ + return true; +#endif +} + static void init_opt_stats_opts(const char *v, size_t vlen, char *dest) { size_t opts_len = strlen(dest); @@ -1833,6 +1871,7 @@ malloc_init_hard_recursible(void) { malloc_init_state = malloc_init_recursible; ncpus = malloc_ncpus(); + cpu_count_is_deterministic = malloc_cpu_count_is_deterministic(); #if (defined(JEMALLOC_HAVE_PTHREAD_ATFORK) && !defined(JEMALLOC_MUTEX_INIT_CB) \ && !defined(JEMALLOC_ZONE) && !defined(_WIN32) && \ @@ -1892,7 +1931,22 @@ malloc_init_narenas(void) { assert(ncpus > 0); if (opt_percpu_arena != percpu_arena_disabled) { - if (!have_percpu_arena || malloc_getcpu() < 0) { + if (!cpu_count_is_deterministic) { + if (opt_narenas) { + malloc_write(": Number of CPUs is not deterministic, " + "but narenas is set. Hope you not what you are doing and " + "you have set narenas to largest possible CPU ID.\n"); + if (opt_abort) { + abort(); + } + } else { + opt_percpu_arena = percpu_arena_disabled; + if (opt_abort_conf) { + malloc_write(": Number of CPUs is not deterministic\n"); + malloc_abort_invalid_conf(); + } + } + } else if (!have_percpu_arena || malloc_getcpu() < 0) { opt_percpu_arena = percpu_arena_disabled; malloc_printf(": perCPU arena getcpu() not " "available. Setting narenas to %u.\n", opt_narenas ?