From cd9a1346e96f71bdecdc654ea50fc62d76371e74 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 21 Mar 2012 18:33:03 -0700 Subject: [PATCH] Implement tsd. Implement tsd, which is a TLS/TSD abstraction that uses one or both internally. Modify bootstrapping such that no tsd's are utilized until allocation is safe. Remove malloc_[v]tprintf(), and use malloc_snprintf() instead. Fix %p argument size handling in malloc_vsnprintf(). Fix a long-standing statistics-related bug in the "thread.arena" mallctl that could cause crashes due to linked list corruption. --- Makefile.in | 3 +- configure.ac | 20 ++ include/jemalloc/internal/arena.h | 16 +- include/jemalloc/internal/chunk.h | 3 +- .../jemalloc/internal/jemalloc_internal.h.in | 82 ++--- include/jemalloc/internal/private_namespace.h | 2 - include/jemalloc/internal/prof.h | 42 +-- include/jemalloc/internal/tcache.h | 35 +- include/jemalloc/internal/tsd.h | 319 ++++++++++++++++++ include/jemalloc/internal/util.h | 17 +- include/jemalloc/jemalloc_defs.h.in | 9 + src/chunk.c | 14 +- src/chunk_mmap.c | 40 +-- src/ctl.c | 42 ++- src/jemalloc.c | 171 ++++------ src/prof.c | 45 ++- src/tcache.c | 103 +++--- src/tsd.c | 72 ++++ src/util.c | 33 +- 19 files changed, 705 insertions(+), 363 deletions(-) create mode 100644 include/jemalloc/internal/tsd.h create mode 100644 src/tsd.c diff --git a/Makefile.in b/Makefile.in index 01ed083c..494ac9a6 100644 --- a/Makefile.in +++ b/Makefile.in @@ -50,7 +50,8 @@ CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/atomic.c \ @srcroot@src/ckh.c @srcroot@src/ctl.c @srcroot@src/extent.c \ @srcroot@src/hash.c @srcroot@src/huge.c @srcroot@src/mb.c \ @srcroot@src/mutex.c @srcroot@src/prof.c @srcroot@src/rtree.c \ - @srcroot@src/stats.c @srcroot@src/tcache.c @srcroot@src/util.c + @srcroot@src/stats.c @srcroot@src/tcache.c @srcroot@src/util.c \ + @srcroot@src/tsd.c ifeq (macho, @abi@) CSRCS += @srcroot@src/zone.c endif diff --git a/configure.ac b/configure.ac index 02d4f536..44ff6eec 100644 --- a/configure.ac +++ b/configure.ac @@ -763,6 +763,20 @@ AC_CHECK_LIB([pthread], [pthread_create], [LIBS="$LIBS -lpthread"], CPPFLAGS="$CPPFLAGS -D_REENTRANT" +dnl Check whether the BSD-specific _malloc_thread_cleanup() exists. If so, use +dnl it rather than pthreads TSD cleanup functions to support cleanup during +dnl thread exit, in order to avoid pthreads library recursion during +dnl bootstrapping. +force_tls="0" +AC_CHECK_FUNC([_malloc_thread_cleanup], + [have__malloc_thread_cleanup="1"], + [have__malloc_thread_cleanup="0"] + ) +if test "x$have__malloc_thread_cleanup" = "x1" ; then + AC_DEFINE([JEMALLOC_MALLOC_THREAD_CLEANUP], [ ]) + force_tls="1" +fi + dnl Disable lazy locking by default. 
AC_ARG_ENABLE([lazy_lock], [AS_HELP_STRING([--enable-lazy-lock], @@ -795,6 +809,10 @@ fi , enable_tls="1" ) +if test "x${enable_tls}" = "x0" -a "x${force_tls}" = "x1" ; then + AC_MSG_RESULT([Forcing TLS to avoid allocator/threading bootstrap issues]) + enable_tls="1" +fi if test "x${enable_tls}" = "x1" ; then AC_MSG_CHECKING([for TLS]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM( @@ -812,6 +830,8 @@ fi AC_SUBST([enable_tls]) if test "x${enable_tls}" = "x1" ; then AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ]) +elif test "x${force_tls}" = "x1" ; then + AC_MSG_ERROR([Failed to configure TLS, which is mandatory for correct function]) fi dnl ============================================================================ diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 1609adcf..c5214893 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -391,6 +391,7 @@ unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, prof_ctx_t *arena_prof_ctx_get(const void *ptr); void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); void *arena_malloc(size_t size, bool zero); +void *arena_malloc_prechosen(arena_t *arena, size_t size, bool zero); void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif @@ -552,7 +553,7 @@ arena_malloc(size_t size, bool zero) tcache_t *tcache; assert(size != 0); - assert(QUANTUM_CEILING(size) <= arena_maxclass); + assert(size <= arena_maxclass); if (size <= SMALL_MAXCLASS) { if ((tcache = tcache_get()) != NULL) @@ -571,6 +572,19 @@ arena_malloc(size_t size, bool zero) } } +JEMALLOC_INLINE void * +arena_malloc_prechosen(arena_t *arena, size_t size, bool zero) +{ + + assert(size != 0); + assert(size <= arena_maxclass); + + if (size <= SMALL_MAXCLASS) + return (arena_malloc_small(arena, size, zero)); + else + return (arena_malloc_large(arena, size, zero)); +} + JEMALLOC_INLINE void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) { diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 9a62ba18..8e24e8f3 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -44,7 +44,8 @@ extern size_t arena_maxclass; /* Max size class for arenas. 
*/ void *chunk_alloc(size_t size, bool base, bool *zero); void chunk_dealloc(void *chunk, size_t size, bool unmap); -bool chunk_boot(void); +bool chunk_boot0(void); +bool chunk_boot1(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index dbfd3fc8..387aabbe 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -289,6 +289,7 @@ static const bool config_ivsalloc = #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" @@ -316,6 +317,7 @@ static const bool config_ivsalloc = #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" @@ -335,6 +337,11 @@ typedef struct { uint64_t allocated; uint64_t deallocated; } thread_allocated_t; +/* + * The JEMALLOC_CONCAT() wrapper is necessary to pass {0, 0} via a cpp macro + * argument. + */ +#define THREAD_ALLOCATED_INITIALIZER JEMALLOC_CONCAT({0, 0}) #undef JEMALLOC_H_STRUCTS /******************************************************************************/ @@ -356,25 +363,6 @@ extern size_t lg_pagesize; extern unsigned ncpus; extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */ -extern pthread_key_t arenas_tsd; -#ifdef JEMALLOC_TLS -/* - * Map of pthread_self() --> arenas[???], used for selecting an arena to use - * for allocations. - */ -extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); -# define ARENA_GET() arenas_tls -# define ARENA_SET(v) do { \ - arenas_tls = (v); \ - pthread_setspecific(arenas_tsd, (void *)(v)); \ -} while (0) -#else -# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) -# define ARENA_SET(v) do { \ - pthread_setspecific(arenas_tsd, (void *)(v)); \ -} while (0) -#endif - /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. 
@@ -382,31 +370,8 @@ extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); extern arena_t **arenas; extern unsigned narenas; -#ifdef JEMALLOC_TLS -extern __thread thread_allocated_t thread_allocated_tls; -# define ALLOCATED_GET() (thread_allocated_tls.allocated) -# define ALLOCATEDP_GET() (&thread_allocated_tls.allocated) -# define DEALLOCATED_GET() (thread_allocated_tls.deallocated) -# define DEALLOCATEDP_GET() (&thread_allocated_tls.deallocated) -# define ALLOCATED_ADD(a, d) do { \ - thread_allocated_tls.allocated += a; \ - thread_allocated_tls.deallocated += d; \ -} while (0) -#else -# define ALLOCATED_GET() (thread_allocated_get()->allocated) -# define ALLOCATEDP_GET() (&thread_allocated_get()->allocated) -# define DEALLOCATED_GET() (thread_allocated_get()->deallocated) -# define DEALLOCATEDP_GET() (&thread_allocated_get()->deallocated) -# define ALLOCATED_ADD(a, d) do { \ - thread_allocated_t *thread_allocated = thread_allocated_get(); \ - thread_allocated->allocated += (a); \ - thread_allocated->deallocated += (d); \ -} while (0) -#endif -extern pthread_key_t thread_allocated_tsd; -thread_allocated_t *thread_allocated_get_hard(void); - arena_t *arenas_extend(unsigned ind); +void arenas_cleanup(void *arg); arena_t *choose_arena_hard(void); void jemalloc_prefork(void); void jemalloc_postfork_parent(void); @@ -420,6 +385,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" @@ -447,6 +413,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/base.h" @@ -454,13 +421,21 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/huge.h" #ifndef JEMALLOC_ENABLE_INLINE +malloc_tsd_protos(JEMALLOC_ATTR(unused), arenas, arena_t *) + size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment, size_t *run_size_p); arena_t *choose_arena(void); -thread_allocated_t *thread_allocated_get(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +/* + * Map of pthread_self() --> arenas[???], used for selecting an arena to use + * for allocations. + */ +malloc_tsd_externs(arenas, arena_t *) +malloc_tsd_funcs(JEMALLOC_INLINE, arenas, arena_t *, NULL, arenas_cleanup) + /* * Compute usable size that would result from allocating an object with the * specified size. 
@@ -572,25 +547,13 @@ choose_arena(void) { arena_t *ret; - ret = ARENA_GET(); - if (ret == NULL) { + if ((ret = *arenas_tsd_get()) == NULL) { ret = choose_arena_hard(); assert(ret != NULL); } return (ret); } - -JEMALLOC_INLINE thread_allocated_t * -thread_allocated_get(void) -{ - thread_allocated_t *thread_allocated = (thread_allocated_t *) - pthread_getspecific(thread_allocated_tsd); - - if (thread_allocated == NULL) - return (thread_allocated_get_hard()); - return (thread_allocated); -} #endif #include "jemalloc/internal/bitmap.h" @@ -611,6 +574,7 @@ size_t ivsalloc(const void *ptr); void idalloc(void *ptr); void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, bool no_move); +malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t) #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) @@ -787,6 +751,10 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, } } } + +malloc_tsd_externs(thread_allocated, thread_allocated_t) +malloc_tsd_funcs(JEMALLOC_INLINE, thread_allocated, thread_allocated_t, + THREAD_ALLOCATED_INITIALIZER, malloc_tsd_no_cleanup) #endif #include "jemalloc/internal/prof.h" diff --git a/include/jemalloc/internal/private_namespace.h b/include/jemalloc/internal/private_namespace.h index e7370fec..7103e680 100644 --- a/include/jemalloc/internal/private_namespace.h +++ b/include/jemalloc/internal/private_namespace.h @@ -155,10 +155,8 @@ #define malloc_mutex_unlock JEMALLOC_N(malloc_mutex_unlock) #define malloc_printf JEMALLOC_N(malloc_printf) #define malloc_snprintf JEMALLOC_N(malloc_snprintf) -#define malloc_tprintf JEMALLOC_N(malloc_tprintf) #define malloc_vcprintf JEMALLOC_N(malloc_vcprintf) #define malloc_vsnprintf JEMALLOC_N(malloc_vsnprintf) -#define malloc_vtprintf JEMALLOC_N(malloc_vtprintf) #define malloc_write JEMALLOC_N(malloc_write) #define mb_write JEMALLOC_N(mb_write) #define opt_abort JEMALLOC_N(opt_abort) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 48231928..231a3876 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -23,10 +23,13 @@ typedef struct prof_tdata_s prof_tdata_t; #define PROF_TCMAX 1024 /* Initial hash table size. */ -#define PROF_CKH_MINITEMS 64 +#define PROF_CKH_MINITEMS 64 /* Size of memory buffer to use when writing dump files. */ -#define PROF_DUMP_BUF_SIZE 65536 +#define PROF_DUMP_BUFSIZE 65536 + +/* Size of stack-allocated buffer used by prof_printf(). */ +#define PROF_PRINTF_BUFSIZE 128 #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ @@ -179,29 +182,6 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ -#ifdef JEMALLOC_TLS -extern __thread prof_tdata_t *prof_tdata_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define PROF_TCACHE_GET() prof_tdata_tls -# define PROF_TCACHE_SET(v) do { \ - prof_tdata_tls = (v); \ - pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ -} while (0) -#else -# define PROF_TCACHE_GET() \ - ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd)) -# define PROF_TCACHE_SET(v) do { \ - pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ -} while (0) -#endif -/* - * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that prof_tdata_tls contents can be merged, - * unlinked, and deallocated. 
- */ -extern pthread_key_t prof_tdata_tsd; - void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt, unsigned nignore); prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); @@ -209,6 +189,7 @@ void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); prof_tdata_t *prof_tdata_init(void); +void prof_tdata_cleanup(void *arg); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); @@ -223,7 +204,7 @@ bool prof_boot2(void); \ assert(size == s2u(size)); \ \ - prof_tdata = PROF_TCACHE_GET(); \ + prof_tdata = *prof_tdata_tsd_get(); \ if (prof_tdata == NULL) { \ prof_tdata = prof_tdata_init(); \ if (prof_tdata == NULL) { \ @@ -270,6 +251,8 @@ bool prof_boot2(void); } while (0) #ifndef JEMALLOC_ENABLE_INLINE +malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) + void prof_sample_threshold_update(prof_tdata_t *prof_tdata); prof_ctx_t *prof_ctx_get(const void *ptr); void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); @@ -281,6 +264,11 @@ void prof_free(const void *ptr, size_t size); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +malloc_tsd_externs(prof_tdata, prof_tdata_t *) +malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, + prof_tdata_cleanup) + JEMALLOC_INLINE void prof_sample_threshold_update(prof_tdata_t *prof_tdata) { @@ -359,7 +347,7 @@ prof_sample_accum_update(size_t size) /* Sampling logic is unnecessary if the interval is 1. */ assert(opt_lg_prof_sample != 0); - prof_tdata = PROF_TCACHE_GET(); + prof_tdata = *prof_tdata_tsd_get(); assert(prof_tdata != NULL); /* Take care to avoid integer overflow. */ diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index ed037cf9..30e63a50 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -75,23 +75,6 @@ extern ssize_t opt_lg_tcache_max; extern tcache_bin_info_t *tcache_bin_info; -/* Map of thread-specific caches. */ -#ifdef JEMALLOC_TLS -extern __thread tcache_t *tcache_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define TCACHE_GET() tcache_tls -# define TCACHE_SET(v) do { \ - tcache_tls = (tcache_t *)(v); \ - pthread_setspecific(tcache_tsd, (void *)(v)); \ -} while (0) -#else -# define TCACHE_GET() ((tcache_t *)pthread_getspecific(tcache_tsd)) -# define TCACHE_SET(v) do { \ - pthread_setspecific(tcache_tsd, (void *)(v)); \ -} while (0) -#endif -extern pthread_key_t tcache_tsd; - /* * Number of tcache bins. There are NBINS small-object bins, plus 0 or more * large-object bins. 
@@ -105,18 +88,24 @@ void tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, tcache_t *tcache); void tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, tcache_t *tcache); +void tcache_arena_associate(tcache_t *tcache, arena_t *arena); +void tcache_arena_dissociate(tcache_t *tcache); tcache_t *tcache_create(arena_t *arena); void *tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind); void tcache_destroy(tcache_t *tcache); +void tcache_thread_cleanup(void *arg); void tcache_stats_merge(tcache_t *tcache, arena_t *arena); -bool tcache_boot(void); +bool tcache_boot0(void); +bool tcache_boot1(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache, tcache_t *) + void tcache_event(tcache_t *tcache); tcache_t *tcache_get(void); void *tcache_alloc_easy(tcache_bin_t *tbin); @@ -127,6 +116,11 @@ void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_)) +/* Map of thread-specific caches. */ +malloc_tsd_externs(tcache, tcache_t *) +malloc_tsd_funcs(JEMALLOC_INLINE, tcache, tcache_t *, NULL, + tcache_thread_cleanup) + JEMALLOC_INLINE tcache_t * tcache_get(void) { @@ -139,7 +133,7 @@ tcache_get(void) else if (opt_tcache == false) return (NULL); - tcache = TCACHE_GET(); + tcache = *tcache_tsd_get(); if ((uintptr_t)tcache <= (uintptr_t)2) { if (tcache == NULL) { tcache = tcache_create(choose_arena()); @@ -152,7 +146,8 @@ tcache_get(void) * called after the tcache_thread_cleanup() was * called. */ - TCACHE_SET((uintptr_t)2); + tcache = (tcache_t *)(uintptr_t)2; + tcache_tsd_set(&tcache); } return (NULL); } diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h new file mode 100644 index 00000000..5a174acd --- /dev/null +++ b/include/jemalloc/internal/tsd.h @@ -0,0 +1,319 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +/* Maximum number of malloc_tsd users with cleanup functions. */ +#define MALLOC_TSD_CLEANUPS_MAX 8 + +typedef struct malloc_tsd_cleanup_s malloc_tsd_cleanup_t; +struct malloc_tsd_cleanup_s { + bool (*f)(void *); + void *arg; +}; + +/* + * TLS/TSD-agnostic macro-based implementation of thread-specific data. There + * are four macros that support (at least) three use cases: file-private, + * library-private, and library-private inlined. Following is an example + * library-private tsd variable: + * + * In example.h: + * typedef struct { + * int x; + * int y; + * } example_t; + * #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0}) + * malloc_tsd_protos(, example, example_t *) + * malloc_tsd_externs(example, example_t *) + * In example.c: + * malloc_tsd_data(, example, example_t *, EX_INITIALIZER) + * malloc_tsd_funcs(, example, example_t *, EX_INITIALIZER, + * example_tsd_cleanup) + * + * The result is a set of generated functions, e.g.: + * + * bool example_tsd_boot(void) {...} + * example_t **example_tsd_get() {...} + * void example_tsd_set(example_t **val) {...} + * + * Note that all of the functions deal in terms of (a_type *) rather than + * (a_type) so that it is possible to support non-pointer types (unlike + * pthreads TSD). example_tsd_cleanup() is passed an (a_type *) pointer that is + * cast to (void *). 
This means that the cleanup function needs to cast *and* + * dereference the function argument, e.g.: + * + * void + * example_tsd_cleanup(void *arg) + * { + * example_t *example = *(example_t **)arg; + * + * [...] + * if ([want the cleanup function to be called again]) { + * example_tsd_set(&example); + * } + * } + * + * If example_tsd_set() is called within example_tsd_cleanup(), it will be + * called again. This is similar to how pthreads TSD destruction works, except + * that pthreads only calls the cleanup function again if the value was set to + * non-NULL. + */ + +/* malloc_tsd_protos(). */ +#define malloc_tsd_protos(a_attr, a_name, a_type) \ +a_attr bool \ +a_name##_tsd_boot(void); \ +a_attr a_type * \ +a_name##_tsd_get(void); \ +a_attr void \ +a_name##_tsd_set(a_type *val); + +/* malloc_tsd_externs(). */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#define malloc_tsd_externs(a_name, a_type) \ +extern __thread a_type a_name##_tls; \ +extern __thread bool a_name##_initialized; \ +extern bool a_name##_booted; +#elif (defined(JEMALLOC_TLS)) +#define malloc_tsd_externs(a_name, a_type) \ +extern __thread a_type a_name##_tls; \ +extern pthread_key_t a_name##_tsd; \ +extern bool a_name##_booted; +#else +#define malloc_tsd_externs(a_name, a_type) \ +extern pthread_key_t a_name##_tsd; \ +extern bool a_name##_booted; +#endif + +/* malloc_tsd_data(). */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ +a_attr __thread a_type JEMALLOC_ATTR(tls_model("initial-exec")) \ + a_name##_tls = a_initializer; \ +a_attr __thread bool JEMALLOC_ATTR(tls_model("initial-exec")) \ + a_name##_initialized = false; \ +a_attr bool a_name##_booted = false; +#elif (defined(JEMALLOC_TLS)) +#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ +a_attr __thread a_type JEMALLOC_ATTR(tls_model("initial-exec")) \ + a_name##_tls = a_initializer; \ +a_attr pthread_key_t a_name##_tsd; \ +a_attr bool a_name##_booted = false; +#else +#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ +a_attr pthread_key_t a_name##_tsd; \ +a_attr bool a_name##_booted = false; +#endif + +/* malloc_tsd_funcs(). */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ + a_cleanup) \ +/* Initialization/cleanup. */ \ +a_attr void \ +a_name##_tsd_cleanup_wrapper(void *arg) \ +{ \ + \ +} \ +a_attr bool \ +a_name##_tsd_cleanup_pending(void *arg) \ +{ \ + void (*cleanup)(void *) = arg; \ + \ + if (a_name##_initialized) { \ + a_name##_initialized = false; \ + cleanup(&a_name##_tls); \ + } \ + return (a_name##_initialized); \ +} \ +a_attr bool \ +a_name##_tsd_boot(void) \ +{ \ + \ + if (a_cleanup != malloc_tsd_no_cleanup) { \ + malloc_tsd_cleanup_register( \ + &a_name##_tsd_cleanup_pending, a_cleanup); \ + } \ + a_name##_booted = true; \ + return (false); \ +} \ +/* Get/set. */ \ +a_attr a_type * \ +a_name##_tsd_get(void) \ +{ \ + \ + assert(a_name##_booted); \ + return (&a_name##_tls); \ +} \ +a_attr void \ +a_name##_tsd_set(a_type *val) \ +{ \ + \ + assert(a_name##_booted); \ + a_name##_tls = (*val); \ + if (a_cleanup != malloc_tsd_no_cleanup) \ + a_name##_initialized = true; \ +} +#elif (defined(JEMALLOC_TLS)) +#define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ + a_cleanup) \ +/* Initialization/cleanup. 
*/ \ +a_attr void \ +a_name##_tsd_cleanup_wrapper(void *arg) \ +{ \ + \ +} \ +a_attr bool \ +a_name##_tsd_boot(void) \ +{ \ + \ + if (a_cleanup != malloc_tsd_no_cleanup) { \ + if (pthread_key_create(&a_name##_tsd, a_cleanup) != 0) \ + return (true); \ + } \ + a_name##_booted = true; \ + return (false); \ +} \ +/* Get/set. */ \ +a_attr a_type * \ +a_name##_tsd_get(void) \ +{ \ + \ + assert(a_name##_booted); \ + return (&a_name##_tls); \ +} \ +a_attr void \ +a_name##_tsd_set(a_type *val) \ +{ \ + \ + assert(a_name##_booted); \ + a_name##_tls = (*val); \ + if (a_cleanup != malloc_tsd_no_cleanup) { \ + if (pthread_setspecific(a_name##_tsd, \ + (void *)(&a_name##_tls))) { \ + malloc_write("<jemalloc>: Error" \ + " setting TSD for "#a_name"\n"); \ + if (opt_abort) \ + abort(); \ + } \ + } \ +} +#else +#define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ + a_cleanup) \ +/* Data structure. */ \ +typedef struct { \ + bool isstatic; \ + bool initialized; \ + a_type val; \ +} a_name##_tsd_wrapper_t; \ +/* Initialization/cleanup. */ \ +a_attr void \ +a_name##_tsd_cleanup_wrapper(void *arg) \ +{ \ + a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)arg;\ + \ + if (a_cleanup != malloc_tsd_no_cleanup && \ + wrapper->initialized) { \ + wrapper->initialized = false; \ + a_cleanup(&wrapper->val); \ + if (wrapper->initialized) { \ + /* Trigger another cleanup round. */ \ + if (pthread_setspecific(a_name##_tsd, \ + (void *)wrapper)) { \ + malloc_write("<jemalloc>: Error" \ + " setting TSD for "#a_name"\n"); \ + if (opt_abort) \ + abort(); \ + } \ + return; \ + } \ + } \ + if (wrapper->isstatic == false) \ + malloc_tsd_dalloc(wrapper); \ +} \ +a_attr bool \ +a_name##_tsd_boot(void) \ +{ \ + \ + if (pthread_key_create(&a_name##_tsd, \ + a_name##_tsd_cleanup_wrapper) != 0) \ + return (true); \ + a_name##_booted = true; \ + return (false); \ +} \ +/* Get/set. 
*/ \ +a_attr a_name##_tsd_wrapper_t * \ +a_name##_tsd_get_wrapper(void) \ +{ \ + a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \ + pthread_getspecific(a_name##_tsd); \ + \ + if (wrapper == NULL) { \ + wrapper = (a_name##_tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \ + if (wrapper == NULL) { \ + static a_name##_tsd_wrapper_t \ + a_name##_tsd_static_data = \ + {true, false, a_initializer}; \ + malloc_write("<jemalloc>: Error allocating" \ + " TSD for "#a_name"\n"); \ + if (opt_abort) \ + abort(); \ + wrapper = &a_name##_tsd_static_data; \ + } else { \ + static a_type tsd_static_data = a_initializer; \ + wrapper->isstatic = false; \ + wrapper->val = tsd_static_data; \ + } \ + if (pthread_setspecific(a_name##_tsd, \ + (void *)wrapper)) { \ + malloc_write("<jemalloc>: Error setting" \ + " TSD for "#a_name"\n"); \ + if (opt_abort) \ + abort(); \ + } \ + } \ + return (wrapper); \ +} \ +a_attr a_type * \ +a_name##_tsd_get(void) \ +{ \ + a_name##_tsd_wrapper_t *wrapper; \ + \ + assert(a_name##_booted); \ + wrapper = a_name##_tsd_get_wrapper(); \ + return (&wrapper->val); \ +} \ +a_attr void \ +a_name##_tsd_set(a_type *val) \ +{ \ + a_name##_tsd_wrapper_t *wrapper; \ + \ + assert(a_name##_booted); \ + wrapper = a_name##_tsd_get_wrapper(); \ + wrapper->val = *(val); \ + if (a_cleanup != malloc_tsd_no_cleanup) \ + wrapper->initialized = true; \ +} +#endif + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +void *malloc_tsd_malloc(size_t size); +void malloc_tsd_dalloc(void *wrapper); +void malloc_tsd_no_cleanup(void *); +void malloc_tsd_cleanup_register(bool (*f)(void *), void *arg); +void malloc_tsd_boot(void); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index c5f7520c..fb354da5 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -5,11 +5,17 @@ #define BUFERROR_BUF 64 /* - * Size of static buffer used by malloc_[v]{,c,t}printf(). This must be large - * enough for all possible uses within jemalloc. + * Size of stack-allocated buffer used by malloc_{,v,vc}printf(). This must be + * large enough for all possible uses within jemalloc. */ #define MALLOC_PRINTF_BUFSIZE 4096 +/* + * Wrap a cpp argument that contains commas such that it isn't broken up into + * multiple arguments. + */ +#define JEMALLOC_CONCAT(...) __VA_ARGS__ + /* * Define a custom assert() in order to reduce the chances of deadlock during * assertion failure. @@ -77,13 +83,6 @@ int malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap); int malloc_snprintf(char *str, size_t size, const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4)); -/* - * malloc_[v]tprintf() prints to a thread-local string buffer, so the result is - * overwritten by the next call to malloc_[v]{,c,t}printf(). - */ -const char * malloc_vtprintf(const char *format, va_list ap); -const char * malloc_tprintf(const char *format, ...) 
- JEMALLOC_ATTR(format(printf, 1, 2)); void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, const char *format, va_list ap); void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in index 434dd368..838f5615 100644 --- a/include/jemalloc/jemalloc_defs.h.in +++ b/include/jemalloc/jemalloc_defs.h.in @@ -59,6 +59,15 @@ */ #undef JEMALLOC_OSSPIN +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#undef JEMALLOC_MALLOC_THREAD_CLEANUP + /* Defined if __attribute__((...)) syntax is supported. */ #undef JEMALLOC_HAVE_ATTR #ifdef JEMALLOC_HAVE_ATTR diff --git a/src/chunk.c b/src/chunk.c index b9086509..f50e8409 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -100,7 +100,7 @@ chunk_dealloc(void *chunk, size_t size, bool unmap) } bool -chunk_boot(void) +chunk_boot0(void) { /* Set variables according to the value of opt_lg_chunk. */ @@ -114,8 +114,6 @@ chunk_boot(void) return (true); memset(&stats_chunks, 0, sizeof(chunk_stats_t)); } - if (chunk_mmap_boot()) - return (true); if (config_dss && chunk_dss_boot()) return (true); if (config_ivsalloc) { @@ -127,3 +125,13 @@ chunk_boot(void) return (false); } + +bool +chunk_boot1(void) +{ + + if (chunk_mmap_boot()) + return (true); + + return (false); +} diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index 6ea21180..749a2dac 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -8,20 +8,9 @@ * Used by chunk_alloc_mmap() to decide whether to attempt the fast path and * potentially avoid some system calls. */ -#ifdef JEMALLOC_TLS -static __thread bool mmap_unaligned_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -#define MMAP_UNALIGNED_GET() mmap_unaligned_tls -#define MMAP_UNALIGNED_SET(v) do { \ - mmap_unaligned_tls = (v); \ -} while (0) -#else -static pthread_key_t mmap_unaligned_tsd; -#define MMAP_UNALIGNED_GET() ((bool)pthread_getspecific(mmap_unaligned_tsd)) -#define MMAP_UNALIGNED_SET(v) do { \ - pthread_setspecific(mmap_unaligned_tsd, (void *)(v)); \ -} while (0) -#endif +malloc_tsd_data(static, mmap_unaligned, bool, false) +malloc_tsd_funcs(JEMALLOC_INLINE, mmap_unaligned, bool, false, + malloc_tsd_no_cleanup) /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -128,8 +117,10 @@ chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve) * the next chunk_alloc_mmap() execution tries the fast allocation * method. */ - if (unaligned == false) - MMAP_UNALIGNED_SET(false); + if (unaligned == false && mmap_unaligned_booted) { + bool mu = false; + mmap_unaligned_tsd_set(&mu); + } return (ret); } @@ -167,7 +158,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) * fast method next time. */ - if (MMAP_UNALIGNED_GET() == false) { + if (mmap_unaligned_booted && *mmap_unaligned_tsd_get() == false) { size_t offset; ret = pages_map(NULL, size, noreserve); @@ -176,7 +167,8 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) offset = CHUNK_ADDR2OFFSET(ret); if (offset != 0) { - MMAP_UNALIGNED_SET(true); + bool mu = true; + mmap_unaligned_tsd_set(&mu); /* Try to extend chunk boundary. 
*/ if (pages_map((void *)((uintptr_t)ret + size), chunksize - offset, noreserve) == NULL) { @@ -225,11 +217,15 @@ bool chunk_mmap_boot(void) { -#ifndef JEMALLOC_TLS - if (pthread_key_create(&mmap_unaligned_tsd, NULL) != 0) { - malloc_write("<jemalloc>: Error in pthread_key_create()\n"); + /* + * XXX For the non-TLS implementation of tsd, the first access from + * each thread causes memory allocation. The result is a bootstrapping + * problem for this particular use case, so for now just disable it by + * leaving it in an unbooted state. + */ +#ifdef JEMALLOC_TLS + if (mmap_unaligned_tsd_boot()) return (true); - } #endif return (false); diff --git a/src/ctl.c b/src/ctl.c index 1ef84e80..e17e5034 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -978,13 +978,13 @@ thread_tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, VOID(); - tcache = TCACHE_GET(); - if (tcache == NULL) { + if ((tcache = *tcache_tsd_get()) == NULL) { ret = 0; goto RETURN; } tcache_destroy(tcache); - TCACHE_SET(NULL); + tcache = NULL; + tcache_tsd_set(&tcache); ret = 0; RETURN: @@ -1012,23 +1012,26 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, /* Initialize arena if necessary. */ malloc_mutex_lock(&arenas_lock); - if ((arena = arenas[newind]) == NULL) - arena = arenas_extend(newind); - arenas[oldind]->nthreads--; - arenas[newind]->nthreads++; - malloc_mutex_unlock(&arenas_lock); - if (arena == NULL) { + if ((arena = arenas[newind]) == NULL && (arena = + arenas_extend(newind)) == NULL) { + malloc_mutex_unlock(&arenas_lock); ret = EAGAIN; goto RETURN; } + assert(arena == arenas[newind]); + arenas[oldind]->nthreads--; + arenas[newind]->nthreads++; + malloc_mutex_unlock(&arenas_lock); /* Set new arena association. */ - ARENA_SET(arena); if (config_tcache) { - tcache_t *tcache = TCACHE_GET(); - if (tcache != NULL) - tcache->arena = arena; + tcache_t *tcache; + if ((tcache = *tcache_tsd_get()) != NULL) { + tcache_arena_dissociate(tcache); + tcache_arena_associate(tcache, arena); + } } + arenas_tsd_set(&arena); } ret = 0; @@ -1036,11 +1039,14 @@ RETURN: return (ret); } -CTL_RO_NL_CGEN(config_stats, thread_allocated, ALLOCATED_GET(), uint64_t) -CTL_RO_NL_CGEN(config_stats, thread_allocatedp, ALLOCATEDP_GET(), uint64_t *) -CTL_RO_NL_CGEN(config_stats, thread_deallocated, DEALLOCATED_GET(), uint64_t) -CTL_RO_NL_CGEN(config_stats, thread_deallocatedp, DEALLOCATEDP_GET(), - uint64_t *) +CTL_RO_NL_CGEN(config_stats, thread_allocated, + thread_allocated_tsd_get()->allocated, uint64_t) +CTL_RO_NL_CGEN(config_stats, thread_allocatedp, + &thread_allocated_tsd_get()->allocated, uint64_t *) +CTL_RO_NL_CGEN(config_stats, thread_deallocated, + thread_allocated_tsd_get()->deallocated, uint64_t) +CTL_RO_NL_CGEN(config_stats, thread_deallocatedp, + &thread_allocated_tsd_get()->deallocated, uint64_t *) /******************************************************************************/ diff --git a/src/jemalloc.c b/src/jemalloc.c index 2610452e..331e4737 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -4,36 +4,9 @@ /******************************************************************************/ /* Data. */ -malloc_mutex_t arenas_lock; -arena_t **arenas; -unsigned narenas; - -pthread_key_t arenas_tsd; -#ifdef JEMALLOC_TLS -__thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); -#endif - -#ifdef JEMALLOC_TLS -__thread thread_allocated_t thread_allocated_tls; -#endif -pthread_key_t thread_allocated_tsd; - -/* Set to true once the allocator has been initialized. 
*/ -static bool malloc_initialized = false; - -/* Used to let the initializing thread recursively allocate. */ -static pthread_t malloc_initializer = (unsigned long)0; - -/* Used to avoid initialization races. */ -static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; - -#ifdef DYNAMIC_PAGE_SHIFT -size_t pagesize; -size_t pagesize_mask; -size_t lg_pagesize; -#endif - -unsigned ncpus; +malloc_tsd_data(, arenas, arena_t *, NULL) +malloc_tsd_data(, thread_allocated, thread_allocated_t, + THREAD_ALLOCATED_INITIALIZER) /* Runtime configuration options. */ const char *je_malloc_conf JEMALLOC_ATTR(visibility("default")); @@ -52,15 +25,32 @@ bool opt_xmalloc = false; bool opt_zero = false; size_t opt_narenas = 0; +#ifdef DYNAMIC_PAGE_SHIFT +size_t pagesize; +size_t pagesize_mask; +size_t lg_pagesize; +#endif + +unsigned ncpus; + +malloc_mutex_t arenas_lock; +arena_t **arenas; +unsigned narenas; + +/* Set to true once the allocator has been initialized. */ +static bool malloc_initialized = false; + +/* Used to let the initializing thread recursively allocate. */ +static pthread_t malloc_initializer = (unsigned long)0; + +/* Used to avoid initialization races. */ +static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; + /******************************************************************************/ /* Function prototypes for non-inline static functions. */ static void stats_print_atexit(void); static unsigned malloc_ncpus(void); -static void arenas_cleanup(void *arg); -#ifndef JEMALLOC_TLS -static void thread_allocated_cleanup(void *arg); -#endif static bool malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, char const **v_p, size_t *vlen_p); static void malloc_conf_error(const char *msg, const char *k, size_t klen, @@ -156,7 +146,7 @@ choose_arena_hard(void) malloc_mutex_unlock(&arenas_lock); } - ARENA_SET(ret); + arenas_tsd_set(&ret); return (ret); } @@ -197,26 +187,6 @@ stats_print_atexit(void) je_malloc_stats_print(NULL, NULL, NULL); } -thread_allocated_t * -thread_allocated_get_hard(void) -{ - thread_allocated_t *thread_allocated = (thread_allocated_t *) - imalloc(sizeof(thread_allocated_t)); - if (thread_allocated == NULL) { - static thread_allocated_t static_thread_allocated = {0, 0}; - malloc_write("<jemalloc>: Error allocating TSD;" - " mallctl(\"thread.{de,}allocated[p]\", ...)" - " will be inaccurate\n"); - if (opt_abort) - abort(); - return (&static_thread_allocated); - } - pthread_setspecific(thread_allocated_tsd, thread_allocated); - thread_allocated->allocated = 0; - thread_allocated->deallocated = 0; - return (thread_allocated); -} - /* * End miscellaneous support functions. */ @@ -241,32 +211,16 @@ malloc_ncpus(void) return (ret); } -static void +void arenas_cleanup(void *arg) { - arena_t *arena = (arena_t *)arg; + arena_t *arena = *(arena_t **)arg; malloc_mutex_lock(&arenas_lock); arena->nthreads--; malloc_mutex_unlock(&arenas_lock); } -#ifndef JEMALLOC_TLS -static void -thread_allocated_cleanup(void *arg) -{ - uint64_t *allocated = (uint64_t *)arg; - - if (allocated != NULL) - idalloc(allocated); -} -#endif - -/* - * FreeBSD's pthreads implementation calls malloc(3), so the malloc - * implementation has to take pains to avoid infinite recursion during - * initialization. 
- */ static inline bool malloc_init(void) { @@ -604,6 +558,7 @@ malloc_init_hard(void) } #endif + malloc_tsd_boot(); if (config_prof) prof_boot0(); @@ -631,7 +586,7 @@ malloc_init_hard(void) } } - if (chunk_boot()) { + if (chunk_boot0()) { malloc_mutex_unlock(&init_lock); return (true); } @@ -646,7 +601,7 @@ malloc_init_hard(void) arena_boot(); - if (config_tcache && tcache_boot()) { + if (config_tcache && tcache_boot0()) { malloc_mutex_unlock(&init_lock); return (true); } @@ -656,23 +611,9 @@ malloc_init_hard(void) return (true); } -#ifndef JEMALLOC_TLS - /* Initialize allocation counters before any allocations can occur. */ - if (config_stats && pthread_key_create(&thread_allocated_tsd, - thread_allocated_cleanup) != 0) { - malloc_mutex_unlock(&init_lock); - return (true); - } -#endif - if (malloc_mutex_init(&arenas_lock)) return (true); - if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { - malloc_mutex_unlock(&init_lock); - return (true); - } - /* * Create enough scaffolding to allow recursive allocation in * malloc_ncpus(). @@ -691,25 +632,38 @@ malloc_init_hard(void) return (true); } - /* - * Assign the initial arena to the initial thread, in order to avoid - * spurious creation of an extra arena if the application switches to - * threaded mode. - */ - ARENA_SET(arenas[0]); - arenas[0]->nthreads++; + /* Initialize allocation counters before any allocations can occur. */ + if (config_stats && thread_allocated_tsd_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } if (config_prof && prof_boot2()) { malloc_mutex_unlock(&init_lock); return (true); } + if (arenas_tsd_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + + if (config_tcache && tcache_boot1()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + /* Get number of CPUs. 
*/ malloc_initializer = pthread_self(); malloc_mutex_unlock(&init_lock); ncpus = malloc_ncpus(); malloc_mutex_lock(&init_lock); + if (chunk_boot1()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + if (opt_narenas == 0) { /* * For SMP systems, create more than one arena per CPU by @@ -844,7 +798,7 @@ OOM: prof_malloc(ret, usize, cnt); if (config_stats && ret != NULL) { assert(usize == isalloc(ret)); - ALLOCATED_ADD(usize, 0); + thread_allocated_tsd_get()->allocated += usize; } return (ret); } @@ -939,7 +893,7 @@ imemalign(void **memptr, size_t alignment, size_t size, RETURN: if (config_stats && result != NULL) { assert(usize == isalloc(result)); - ALLOCATED_ADD(usize, 0); + thread_allocated_tsd_get()->allocated += usize; } if (config_prof && opt_prof && result != NULL) prof_malloc(result, usize, cnt); @@ -1044,7 +998,7 @@ RETURN: prof_malloc(ret, usize, cnt); if (config_stats && ret != NULL) { assert(usize == isalloc(ret)); - ALLOCATED_ADD(usize, 0); + thread_allocated_tsd_get()->allocated += usize; } return (ret); } @@ -1173,8 +1127,11 @@ RETURN: if (config_prof && opt_prof) prof_realloc(ret, usize, cnt, old_size, old_ctx); if (config_stats && ret != NULL) { + thread_allocated_t *ta; assert(usize == isalloc(ret)); - ALLOCATED_ADD(usize, old_size); + ta = thread_allocated_tsd_get(); + ta->allocated += usize; + ta->deallocated += old_size; } return (ret); } @@ -1197,7 +1154,7 @@ je_free(void *ptr) usize = isalloc(ptr); } if (config_stats) - ALLOCATED_ADD(0, usize); + thread_allocated_tsd_get()->deallocated += usize; idalloc(ptr); } } @@ -1412,7 +1369,7 @@ je_allocm(void **ptr, size_t *rsize, size_t size, int flags) *ptr = p; if (config_stats) { assert(usize == isalloc(p)); - ALLOCATED_ADD(usize, 0); + thread_allocated_tsd_get()->allocated += usize; } return (ALLOCM_SUCCESS); OOM: @@ -1502,8 +1459,12 @@ je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) } *ptr = q; - if (config_stats) - ALLOCATED_ADD(usize, old_size); + if (config_stats) { + thread_allocated_t *ta; + ta = thread_allocated_tsd_get(); + ta->allocated += usize; + ta->deallocated += old_size; + } return (ALLOCM_SUCCESS); ERR: if (no_move) @@ -1556,7 +1517,7 @@ je_dallocm(void *ptr, int flags) prof_free(ptr, usize); } if (config_stats) - ALLOCATED_ADD(0, usize); + thread_allocated_tsd_get()->deallocated += usize; idalloc(ptr); return (ALLOCM_SUCCESS); diff --git a/src/prof.c b/src/prof.c index 9c327379..ba0b64e1 100644 --- a/src/prof.c +++ b/src/prof.c @@ -14,6 +14,8 @@ /******************************************************************************/ /* Data. */ +malloc_tsd_data(, prof_tdata, prof_tdata_t *, NULL) + bool opt_prof = false; bool opt_prof_active = true; size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; @@ -26,12 +28,6 @@ char opt_prof_prefix[PATH_MAX + 1]; uint64_t prof_interval; bool prof_promote; -#ifdef JEMALLOC_TLS -__thread prof_tdata_t *prof_tdata_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -#endif -pthread_key_t prof_tdata_tsd; - /* * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data * structure that knows about all backtraces currently captured. @@ -50,7 +46,7 @@ static uint64_t prof_dump_useq; * all profile dumps. The buffer is implicitly protected by bt2ctx_mtx, since * it must be locked anyway during dumping. 
*/ -static char prof_dump_buf[PROF_DUMP_BUF_SIZE]; +static char prof_dump_buf[PROF_DUMP_BUFSIZE]; static unsigned prof_dump_buf_end; static int prof_dump_fd; @@ -91,7 +87,6 @@ static void prof_fdump(void); static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); -static void prof_tdata_cleanup(void *arg); /******************************************************************************/ @@ -439,7 +434,7 @@ prof_lookup(prof_bt_t *bt) cassert(config_prof); - prof_tdata = PROF_TCACHE_GET(); + prof_tdata = *prof_tdata_tsd_get(); if (prof_tdata == NULL) { prof_tdata = prof_tdata_init(); if (prof_tdata == NULL) @@ -599,16 +594,16 @@ prof_write(bool propagate_err, const char *s) slen = strlen(s); while (i < slen) { /* Flush the buffer if it is full. */ - if (prof_dump_buf_end == PROF_DUMP_BUF_SIZE) + if (prof_dump_buf_end == PROF_DUMP_BUFSIZE) if (prof_flush(propagate_err) && propagate_err) return (true); - if (prof_dump_buf_end + slen <= PROF_DUMP_BUF_SIZE) { + if (prof_dump_buf_end + slen <= PROF_DUMP_BUFSIZE) { /* Finish writing. */ n = slen - i; } else { /* Write as much of s as will fit. */ - n = PROF_DUMP_BUF_SIZE - prof_dump_buf_end; + n = PROF_DUMP_BUFSIZE - prof_dump_buf_end; } memcpy(&prof_dump_buf[prof_dump_buf_end], &s[i], n); prof_dump_buf_end += n; @@ -624,10 +619,12 @@ prof_printf(bool propagate_err, const char *format, ...) { bool ret; va_list ap; + char buf[PROF_PRINTF_BUFSIZE]; va_start(ap, format); - ret = prof_write(propagate_err, malloc_vtprintf(format, ap)); + malloc_vsnprintf(buf, sizeof(buf), format, ap); va_end(ap); + ret = prof_write(propagate_err, buf); return (ret); } @@ -795,11 +792,13 @@ static bool prof_dump_maps(bool propagate_err) { int mfd; + char filename[PATH_MAX + 1]; cassert(config_prof); - mfd = open(malloc_tprintf("/proc/%d/maps", (int)getpid()), - O_RDONLY); + malloc_snprintf(filename, sizeof(filename), "/proc/%d/maps", + (int)getpid()); + mfd = open(filename, O_RDONLY); if (mfd != -1) { ssize_t nread; @@ -809,13 +808,13 @@ prof_dump_maps(bool propagate_err) nread = 0; do { prof_dump_buf_end += nread; - if (prof_dump_buf_end == PROF_DUMP_BUF_SIZE) { + if (prof_dump_buf_end == PROF_DUMP_BUFSIZE) { /* Make space in prof_dump_buf before read(). 
*/ if (prof_flush(propagate_err) && propagate_err) return (true); } nread = read(mfd, &prof_dump_buf[prof_dump_buf_end], - PROF_DUMP_BUF_SIZE - prof_dump_buf_end); + PROF_DUMP_BUFSIZE - prof_dump_buf_end); } while (nread > 0); close(mfd); } else @@ -1098,16 +1097,16 @@ prof_tdata_init(void) prof_tdata->threshold = 0; prof_tdata->accum = 0; - PROF_TCACHE_SET(prof_tdata); + prof_tdata_tsd_set(&prof_tdata); return (prof_tdata); } -static void +void prof_tdata_cleanup(void *arg) { prof_thr_cnt_t *cnt; - prof_tdata_t *prof_tdata = (prof_tdata_t *)arg; + prof_tdata_t *prof_tdata = *(prof_tdata_t **)arg; cassert(config_prof); @@ -1127,7 +1126,8 @@ prof_tdata_cleanup(void *arg) idalloc(prof_tdata->vec); idalloc(prof_tdata); - PROF_TCACHE_SET(NULL); + prof_tdata = NULL; + prof_tdata_tsd_set(&prof_tdata); } void @@ -1182,8 +1182,7 @@ prof_boot2(void) return (true); if (malloc_mutex_init(&bt2ctx_mtx)) return (true); - if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup) - != 0) { + if (prof_tdata_tsd_boot()) { malloc_write( "<jemalloc>: Error in pthread_key_create()\n"); abort(); diff --git a/src/tcache.c b/src/tcache.c index f90308cd..3442406d 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -4,30 +4,16 @@ /******************************************************************************/ /* Data. */ +malloc_tsd_data(, tcache, tcache_t *, NULL) + bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; tcache_bin_info_t *tcache_bin_info; static unsigned stack_nelms; /* Total stack elms per tcache. */ -/* Map of thread-specific caches. */ -#ifdef JEMALLOC_TLS -__thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); -#endif - -/* - * Same contents as tcache, but initialized such that the TSD destructor is - * called when a thread exits, so that the cache can be cleaned up. - */ -pthread_key_t tcache_tsd; - -size_t nhbins; -size_t tcache_maxclass; - -/******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static void tcache_thread_cleanup(void *arg); +size_t nhbins; +size_t tcache_maxclass; /******************************************************************************/ @@ -196,6 +182,33 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, tbin->low_water = tbin->ncached; } +void +tcache_arena_associate(tcache_t *tcache, arena_t *arena) +{ + + if (config_stats) { + /* Link into list of extant tcaches. */ + malloc_mutex_lock(&arena->lock); + ql_elm_new(tcache, link); + ql_tail_insert(&arena->tcache_ql, tcache, link); + malloc_mutex_unlock(&arena->lock); + } + tcache->arena = arena; +} + +void +tcache_arena_dissociate(tcache_t *tcache) +{ + + if (config_stats) { + /* Unlink from list of extant tcaches. */ + malloc_mutex_lock(&tcache->arena->lock); + ql_remove(&tcache->arena->tcache_ql, tcache, link); + malloc_mutex_unlock(&tcache->arena->lock); + tcache_stats_merge(tcache, tcache->arena); + } +} + tcache_t * tcache_create(arena_t *arena) { @@ -228,15 +241,8 @@ tcache_create(arena_t *arena) if (tcache == NULL) return (NULL); - if (config_stats) { - /* Link into list of extant tcaches. 
*/ - malloc_mutex_lock(&arena->lock); - ql_elm_new(tcache, link); - ql_tail_insert(&arena->tcache_ql, tcache, link); - malloc_mutex_unlock(&arena->lock); - } + tcache_arena_associate(tcache, arena); - tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nhbins; i++) { tcache->tbins[i].lg_fill_div = 1; @@ -245,7 +251,7 @@ tcache_create(arena_t *arena) stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } - TCACHE_SET(tcache); + tcache_tsd_set(&tcache); return (tcache); } @@ -256,13 +262,7 @@ tcache_destroy(tcache_t *tcache) unsigned i; size_t tcache_size; - if (config_stats) { - /* Unlink from list of extant tcaches. */ - malloc_mutex_lock(&tcache->arena->lock); - ql_remove(&tcache->arena->tcache_ql, tcache, link); - malloc_mutex_unlock(&tcache->arena->lock); - tcache_stats_merge(tcache, tcache->arena); - } + tcache_arena_dissociate(tcache); for (i = 0; i < NBINS; i++) { tcache_bin_t *tbin = &tcache->tbins[i]; @@ -323,10 +323,10 @@ tcache_destroy(tcache_t *tcache) idalloc(tcache); } -static void +void tcache_thread_cleanup(void *arg) { - tcache_t *tcache = (tcache_t *)arg; + tcache_t *tcache = *(tcache_t **)arg; if (tcache == (void *)(uintptr_t)1) { /* @@ -341,11 +341,13 @@ tcache_thread_cleanup(void *arg) * destructor was called. Reset tcache to 1 in order to * receive another callback. */ - TCACHE_SET((uintptr_t)1); + tcache = (tcache_t *)(uintptr_t)1; + tcache_tsd_set(&tcache); } else if (tcache != NULL) { assert(tcache != (void *)(uintptr_t)1); tcache_destroy(tcache); - TCACHE_SET((uintptr_t)1); + tcache = (tcache_t *)(uintptr_t)1; + tcache_tsd_set(&tcache); } } @@ -374,7 +376,7 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } bool -tcache_boot(void) +tcache_boot0(void) { if (opt_tcache) { @@ -385,8 +387,8 @@ tcache_boot(void) * SMALL_MAXCLASS and arena_maxclass are known. * XXX Can this be done earlier? */ - if (opt_lg_tcache_max < 0 || (1U << - opt_lg_tcache_max) < SMALL_MAXCLASS) + if (opt_lg_tcache_max < 0 || (1U << opt_lg_tcache_max) < + SMALL_MAXCLASS) tcache_maxclass = SMALL_MAXCLASS; else if ((1U << opt_lg_tcache_max) > arena_maxclass) tcache_maxclass = arena_maxclass; @@ -416,13 +418,18 @@ tcache_boot(void) tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE; stack_nelms += tcache_bin_info[i].ncached_max; } - - if (pthread_key_create(&tcache_tsd, tcache_thread_cleanup) != - 0) { - malloc_write( - "<jemalloc>: Error in pthread_key_create()\n"); - abort(); - } + } + + return (false); +} + +bool +tcache_boot1(void) +{ + + if (opt_tcache) { + if (tcache_tsd_boot()) + return (true); } return (false); diff --git a/src/tsd.c b/src/tsd.c new file mode 100644 index 00000000..669ea8fc --- /dev/null +++ b/src/tsd.c @@ -0,0 +1,72 @@ +#define JEMALLOC_TSD_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +/******************************************************************************/ +/* Data. */ + +static unsigned ncleanups; +static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX]; + +/******************************************************************************/ + +void * +malloc_tsd_malloc(size_t size) +{ + + /* Avoid choose_arena() in order to dodge bootstrapping issues. 
*/ + return (arena_malloc_prechosen(arenas[0], size, false)); +} + +void +malloc_tsd_dalloc(void *wrapper) +{ + + idalloc(wrapper); +} + +void +malloc_tsd_no_cleanup(void *arg) +{ + + not_reached(); +} + +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +void +_malloc_thread_cleanup(void) +{ + bool pending[MALLOC_TSD_CLEANUPS_MAX], again; + unsigned i; + + for (i = 0; i < ncleanups; i++) + pending[i] = true; + + do { + again = false; + for (i = 0; i < ncleanups; i++) { + if (pending[i]) { + pending[i] = cleanups[i].f(cleanups[i].arg); + if (pending[i]) + again = true; + } + } + } while (again); } +#endif + +void +malloc_tsd_cleanup_register(bool (*f)(void *), void *arg) +{ + + assert(ncleanups < MALLOC_TSD_CLEANUPS_MAX); + cleanups[ncleanups].f = f; + cleanups[ncleanups].arg = arg; + ncleanups++; +} + +void +malloc_tsd_boot(void) +{ + + ncleanups = 0; +} diff --git a/src/util.c b/src/util.c index 47e7b66e..96c87f78 100644 --- a/src/util.c +++ b/src/util.c @@ -222,6 +222,9 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) case 'z': \ val = va_arg(ap, size_t); \ break; \ + case 'p': /* Synthetic; used for %p. */ \ + val = va_arg(ap, uintptr_t); \ + break; \ default: not_reached(); \ } \ } while (0) @@ -410,7 +413,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) uintmax_t val; char buf[X2S_BUFSIZE]; - GET_ARG_NUMERIC(val, len); + GET_ARG_NUMERIC(val, 'p'); s = x2s(val, true, false, buf, &slen); APPEND_PADDED_S(s, slen, width, left_justify); f++; @@ -466,34 +469,11 @@ malloc_snprintf(char *str, size_t size, const char *format, ...) return (ret); } -const char * -malloc_vtprintf(const char *format, va_list ap) -{ - static __thread char buf[MALLOC_PRINTF_BUFSIZE]; - - malloc_vsnprintf(buf, sizeof(buf), format, ap); - - return (buf); -} - -JEMALLOC_ATTR(format(printf, 1, 2)) -const char * -malloc_tprintf(const char *format, ...) -{ - const char *ret; - va_list ap; - - va_start(ap, format); - ret = malloc_vtprintf(format, ap); - va_end(ap); - - return (ret); -} - void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, const char *format, va_list ap) { + char buf[MALLOC_PRINTF_BUFSIZE]; if (write_cb == NULL) { /* @@ -505,7 +485,8 @@ malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, cbopaque = NULL; } - write_cb(cbopaque, malloc_vtprintf(format, ap)); + malloc_vsnprintf(buf, sizeof(buf), format, ap); + write_cb(cbopaque, buf); } /*
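Usage sketch (editor's note, not part of the commit): the four malloc_tsd macros are designed to be used together, and every tsd in this patch (arenas, thread_allocated, tcache, prof_tdata, mmap_unaligned) follows the same pattern. Below is a minimal, hypothetical file-private example modeled on the mmap_unaligned tsd in src/chunk_mmap.c; the "counter" name and both helper functions are invented for illustration, and the sketch assumes the jemalloc internal headers are in scope.

	malloc_tsd_data(static, counter, unsigned, 0)
	malloc_tsd_funcs(JEMALLOC_INLINE, counter, unsigned, 0,
	    malloc_tsd_no_cleanup)

	static bool
	counter_boot(void)
	{

		/*
		 * Must run once allocation is safe: the pthreads-TSD fallback
		 * allocates a wrapper struct on first access from each thread.
		 */
		return (counter_tsd_boot());
	}

	static void
	counter_bump(void)
	{
		unsigned c;

		/* Get/set deal in (a_type *), so non-pointer types work. */
		c = *counter_tsd_get();
		c++;
		counter_tsd_set(&c);
	}

A pointer-typed tsd with a destructor (tcache, prof_tdata) takes the same shape, with a NULL initializer and a real cleanup function in place of malloc_tsd_no_cleanup.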