Convert thread-specific caching from magazines, and implement incremental GC.

Add the 'G'/'g' and 'H'/'h' MALLOC_OPTIONS flags.

Add the malloc_tcache_flush() function.

Disable thread-specific caching until the application goes multi-threaded.
This commit is contained in:
Jason Evans 2009-12-29 00:09:15 -08:00
parent b2378168a4
commit 84cbbcb90a
8 changed files with 861 additions and 435 deletions

View File

@ -46,10 +46,10 @@ any of the following arguments (not a definitive list) to 'configure':
practice it never causes any problems if, for example, 4-byte allocations
are 4-byte-aligned.
--disable-mag --disable-tcache
Disable thread-specific caches for sub-page-sized objects. Objects are Disable thread-specific caches for small and medium objects. Objects are
cached and released in bulk using "magazines" -- a term coined by the cached and released in bulk, thus reducing the total number of mutex
developers of Solaris's umem allocator. operations. Use the 'H' and 'G' options to control thread-specific caching.
--enable-dss --enable-dss
Enable support for page allocation/deallocation via sbrk(2), in addition to Enable support for page allocation/deallocation via sbrk(2), in addition to
@ -82,7 +82,7 @@ any of the following arguments (not a definitive list) to 'configure':
switches from single-threaded to multi-threaded mode, so that it can avoid switches from single-threaded to multi-threaded mode, so that it can avoid
mutex locking/unlocking operations while in single-threaded mode. In mutex locking/unlocking operations while in single-threaded mode. In
practice, this feature usually has little impact on performance unless practice, this feature usually has little impact on performance unless
magazines are disabled. thread-specific caching is disabled.
The following environment variables (not a definitive list) impact configure's
behavior:

View File

@ -254,6 +254,7 @@ if test "x$JEMALLOC_PREFIX" != "x" ; then
AC_DEFINE_UNQUOTED([realloc], [${JEMALLOC_PREFIX}realloc]) AC_DEFINE_UNQUOTED([realloc], [${JEMALLOC_PREFIX}realloc])
AC_DEFINE_UNQUOTED([free], [${JEMALLOC_PREFIX}free]) AC_DEFINE_UNQUOTED([free], [${JEMALLOC_PREFIX}free])
AC_DEFINE_UNQUOTED([malloc_usable_size], [${JEMALLOC_PREFIX}malloc_usable_size]) AC_DEFINE_UNQUOTED([malloc_usable_size], [${JEMALLOC_PREFIX}malloc_usable_size])
AC_DEFINE_UNQUOTED([malloc_tcache_flush], [${JEMALLOC_PREFIX}malloc_tcache_flush])
AC_DEFINE_UNQUOTED([malloc_options], [${JEMALLOC_PREFIX}malloc_options]) AC_DEFINE_UNQUOTED([malloc_options], [${JEMALLOC_PREFIX}malloc_options])
AC_DEFINE_UNQUOTED([malloc_message], [${JEMALLOC_PREFIX}malloc_message]) AC_DEFINE_UNQUOTED([malloc_message], [${JEMALLOC_PREFIX}malloc_message])
fi fi
@ -359,27 +360,18 @@ fi
AC_SUBST([roff_tiny]) AC_SUBST([roff_tiny])
AC_SUBST([roff_no_tiny]) AC_SUBST([roff_no_tiny])
dnl Enable magazines by default. dnl Enable thread-specific caching by default.
AC_ARG_ENABLE([mag], AC_ARG_ENABLE([tcache],
[AS_HELP_STRING([--disable-mag], [Disable magazines (per thread caches)])], [AS_HELP_STRING([--disable-tcache], [Disable per thread caches])],
[if test "x$enable_mag" = "xno" ; then [if test "x$enable_tcache" = "xno" ; then
enable_mag="0" enable_tcache="0"
else else
enable_mag="1" enable_tcache="1"
fi fi
], ],
[enable_mag="1"] [enable_tcache="1"]
) )
if test "x$enable_mag" = "x1" ; then dnl Finish tcache-related definitions below, once TLS configuration is done.
AC_DEFINE([JEMALLOC_MAG], [ ])
fi
AC_SUBST([enable_mag])
if test "x$enable_mag" = "x0" ; then
roff_mag=".\\\" "
else
roff_mag=""
fi
AC_SUBST([roff_mag])
dnl Do not enable allocation from DSS by default. dnl Do not enable allocation from DSS by default.
AC_ARG_ENABLE([dss], AC_ARG_ENABLE([dss],
@ -539,11 +531,28 @@ AC_RUN_IFELSE([AC_LANG_PROGRAM(
return 0; return 0;
]])], ]])],
AC_MSG_RESULT([yes]) AC_MSG_RESULT([yes])
roff_tls="", have_tls="1",
AC_MSG_RESULT([no]) AC_MSG_RESULT([no])
roff_tls=".\\\" " have_tls="0"
AC_DEFINE_UNQUOTED([NO_TLS], [ ])) AC_DEFINE_UNQUOTED([NO_TLS], [ ]))
AC_SUBST([roff_tls])
dnl Finish tcache-related definitions, now that TLS configuration is done.
if test "x$have_tls" = "x0" ; then
enable_tcache="0"
fi
if test "x$enable_tcache" = "x1" ; then
AC_DEFINE([JEMALLOC_TCACHE], [ ])
fi
AC_SUBST([enable_tcache])
if test "x$enable_tcache" = "x0" ; then
roff_tcache=".\\\" "
roff_no_tcache=""
else
roff_tcache=""
roff_no_tcache=".\\\" "
fi
AC_SUBST([roff_tcache])
AC_SUBST([roff_no_tcache])
dnl Enable lazy locking by default. dnl Enable lazy locking by default.
AC_ARG_ENABLE([lazy_lock], AC_ARG_ENABLE([lazy_lock],
@ -631,7 +640,7 @@ AC_MSG_RESULT([debug : ${enable_debug}])
AC_MSG_RESULT([stats : ${enable_stats}]) AC_MSG_RESULT([stats : ${enable_stats}])
AC_MSG_RESULT([trace : ${enable_trace}]) AC_MSG_RESULT([trace : ${enable_trace}])
AC_MSG_RESULT([tiny : ${enable_tiny}]) AC_MSG_RESULT([tiny : ${enable_tiny}])
AC_MSG_RESULT([mag : ${enable_mag}]) AC_MSG_RESULT([tcache : ${enable_tcache}])
AC_MSG_RESULT([fill : ${enable_fill}]) AC_MSG_RESULT([fill : ${enable_fill}])
AC_MSG_RESULT([xmalloc : ${enable_xmalloc}]) AC_MSG_RESULT([xmalloc : ${enable_xmalloc}])
AC_MSG_RESULT([sysv : ${enable_sysv}]) AC_MSG_RESULT([sysv : ${enable_sysv}])

View File

@ -35,7 +35,7 @@
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
.\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $
.\" .\"
.Dd November 19, 2009 .Dd December 3, 2009
.Dt JEMALLOC 3 .Dt JEMALLOC 3
.Os .Os
.Sh NAME .Sh NAME
@ -58,6 +58,8 @@
.Fn @jemalloc_prefix@free "void *ptr" .Fn @jemalloc_prefix@free "void *ptr"
.Ft size_t .Ft size_t
.Fn @jemalloc_prefix@malloc_usable_size "const void *ptr" .Fn @jemalloc_prefix@malloc_usable_size "const void *ptr"
@roff_tcache@.Ft void
@roff_tcache@.Fn @jemalloc_prefix@malloc_tcache_flush "void"
.Ft const char * .Ft const char *
.Va @jemalloc_prefix@malloc_options ; .Va @jemalloc_prefix@malloc_options ;
.Ft void .Ft void
@ -156,6 +158,18 @@ Any discrepancy between the requested allocation size and the size reported by
.Fn @jemalloc_prefix@malloc_usable_size .Fn @jemalloc_prefix@malloc_usable_size
should not be depended on, since such behavior is entirely should not be depended on, since such behavior is entirely
implementation-dependent. implementation-dependent.
@roff_tcache@.Pp
@roff_tcache@The
@roff_tcache@.Fn @jemalloc_prefix@malloc_tcache_flush
@roff_tcache@function releases all cached objects and internal data structures
@roff_tcache@associated with the calling thread's thread-specific cache.
@roff_tcache@Ordinarily, this function need not be called, since automatic
@roff_tcache@periodic incremental garbage collection occurs, and the thread
@roff_tcache@cache is automatically discarded when a thread exits.
@roff_tcache@However, garbage collection is triggered by allocation activity,
@roff_tcache@so it is possible for a thread that stops allocating/deallocating
@roff_tcache@to retain its cache indefinitely, in which case the developer may
@roff_tcache@find this function useful.
.Sh TUNING .Sh TUNING
Once, when the first call is made to one of these memory allocation Once, when the first call is made to one of these memory allocation
routines, various flags will be set or reset, which affects the routines, various flags will be set or reset, which affects the
@ -203,16 +217,20 @@ physical memory becomes scarce and the pages remain unused.
The default is 512 pages per arena; The default is 512 pages per arena;
.Ev JEMALLOC_OPTIONS=10f .Ev JEMALLOC_OPTIONS=10f
will prevent any dirty unused pages from accumulating. will prevent any dirty unused pages from accumulating.
@roff_mag@@roff_tls@.It G @roff_tcache@.It G
@roff_mag@@roff_tls@When there are multiple threads, use thread-specific caching @roff_tcache@Enable/disable incremental garbage collection of unused objects
@roff_mag@@roff_tls@for objects that are smaller than one page. @roff_tcache@stored in thread-specific caches.
@roff_mag@@roff_tls@This option is enabled by default. @roff_tcache@This option is enabled by default.
@roff_mag@@roff_tls@Thread-specific caching allows many allocations to be @roff_tcache@.It H
@roff_mag@@roff_tls@satisfied without performing any thread synchronization, at @roff_tcache@When there are multiple threads, use thread-specific caching for
@roff_mag@@roff_tls@the cost of increased memory use. @roff_tcache@small and medium objects.
@roff_mag@@roff_tls@See the @roff_tcache@This option is enabled by default.
@roff_mag@@roff_tls@.Dq R @roff_tcache@Thread-specific caching allows many allocations to be satisfied
@roff_mag@@roff_tls@option for related tuning information. @roff_tcache@without performing any thread synchronization, at the cost of
@roff_tcache@increased memory use.
@roff_tcache@See the
@roff_tcache@.Dq G
@roff_tcache@option for related tuning information.
@roff_fill@.It J @roff_fill@.It J
@roff_fill@Each byte of new memory allocated by @roff_fill@Each byte of new memory allocated by
@roff_fill@.Fn @jemalloc_prefix@malloc @roff_fill@.Fn @jemalloc_prefix@malloc
@ -235,8 +253,10 @@ The valid range is from one page to one half chunk.
The default value is 32 KiB. The default value is 32 KiB.
.It N .It N
Double/halve the number of arenas. Double/halve the number of arenas.
The default number of arenas is two times the number of CPUs, or one if there The default number of arenas is
is a single CPU. @roff_tcache@two
@roff_no_tcache@four
times the number of CPUs, or one if there is a single CPU.
.It P .It P
Various statistics are printed at program exit via an Various statistics are printed at program exit via an
.Xr atexit 3 .Xr atexit 3
@ -250,13 +270,6 @@ Double/halve the size of the maximum size class that is a multiple of the
quantum (8 or 16 bytes, depending on architecture). quantum (8 or 16 bytes, depending on architecture).
Above this size, cacheline spacing is used for size classes. Above this size, cacheline spacing is used for size classes.
The default value is 128 bytes. The default value is 128 bytes.
@roff_mag@@roff_tls@.It R
@roff_mag@@roff_tls@Double/halve magazine size, which approximately
@roff_mag@@roff_tls@doubles/halves the number of rounds in each magazine.
@roff_mag@@roff_tls@Magazines are used by the thread-specific caching machinery
@roff_mag@@roff_tls@to acquire and release objects in bulk.
@roff_mag@@roff_tls@Increasing the magazine size decreases locking overhead, at
@roff_mag@@roff_tls@the expense of increased memory usage.
@roff_trace@.It T @roff_trace@.It T
@roff_trace@Write a verbose trace log to a set of files named according to the @roff_trace@Write a verbose trace log to a set of files named according to the
@roff_trace@pattern @roff_trace@pattern
@ -338,14 +351,14 @@ improve performance, mainly due to reduced cache performance.
However, it may make sense to reduce the number of arenas if an application However, it may make sense to reduce the number of arenas if an application
does not make much use of the allocation functions. does not make much use of the allocation functions.
.Pp .Pp
@roff_mag@In addition to multiple arenas, this allocator supports @roff_tcache@In addition to multiple arenas, this allocator supports
@roff_mag@thread-specific caching for small and medium objects, in order to make @roff_tcache@thread-specific caching for small and medium objects, in order to
@roff_mag@it possible to completely avoid synchronization for most small and @roff_tcache@make it possible to completely avoid synchronization for most small
@roff_mag@medium allocation requests. @roff_tcache@and medium allocation requests.
@roff_mag@Such caching allows very fast allocation in the common case, but it @roff_tcache@Such caching allows very fast allocation in the common case, but it
@roff_mag@increases memory usage and fragmentation, since a bounded number of @roff_tcache@increases memory usage and fragmentation, since a bounded number of
@roff_mag@objects can remain allocated in each thread cache. @roff_tcache@objects can remain allocated in each thread cache.
@roff_mag@.Pp @roff_tcache@.Pp
Memory is conceptually broken into equal-sized chunks, where the chunk size is Memory is conceptually broken into equal-sized chunks, where the chunk size is
a power of two that is greater than the page size. a power of two that is greater than the page size.
Chunks are always aligned to multiples of the chunk size. Chunks are always aligned to multiples of the chunk size.

File diff suppressed because it is too large Load Diff

View File

@ -38,6 +38,10 @@ extern "C" {
size_t malloc_usable_size(const void *ptr); size_t malloc_usable_size(const void *ptr);
#ifdef JEMALLOC_TCACHE
void malloc_tcache_flush(void);
#endif
extern const char *malloc_options; extern const char *malloc_options;
extern void (*malloc_message)(const char *p1, extern void (*malloc_message)(const char *p1,
const char *p2, const char *p3, const char *p4); const char *p2, const char *p3, const char *p4);

View File

@ -55,6 +55,7 @@
#undef realloc #undef realloc
#undef free #undef free
#undef malloc_usable_size #undef malloc_usable_size
#undef malloc_tcache_flush
#undef malloc_options #undef malloc_options
#undef malloc_message #undef malloc_message
#endif #endif
@ -91,11 +92,11 @@
#undef JEMALLOC_TINY #undef JEMALLOC_TINY
/* /*
* JEMALLOC_MAG enables a magazine-based thread-specific caching layer for small * JEMALLOC_TCACHE enables a thread-specific caching layer for small and medium
* objects. This makes it possible to allocate/deallocate objects without any * objects. This makes it possible to allocate/deallocate objects without any
* locking when the cache is in the steady state. * locking when the cache is in the steady state.
*/ */
#undef JEMALLOC_MAG #undef JEMALLOC_TCACHE
/* /*
* JEMALLOC_DSS enables use of sbrk(2) to allocate chunks from the data storage * JEMALLOC_DSS enables use of sbrk(2) to allocate chunks from the data storage

114
jemalloc/src/ql.h Normal file
View File

@ -0,0 +1,114 @@
/******************************************************************************
*
* Copyright (C) 2002 Jason Evans <jasone@canonware.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice(s), this list of conditions and the following disclaimer
* unmodified other than the allowable addition of one or more
* copyright notices.
* 2. Redistributions in binary form must reproduce the above copyright
* notice(s), this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/*
* List definitions.
*/
/*
 * Doubly-linked list layered on top of the qr ring primitives: a list is a
 * ring of elements plus a head pointer to the first element, with NULL
 * meaning an empty list.  All operations are O(1) except traversal.
 * NOTE(review): depends on the qr(a_type) / qr_* macros (qr.h) being in
 * scope before this header is used.
 */
#define ql_head(a_type) \
struct { \
a_type *qlh_first; \
}
/* Static initializer for an empty list head. */
#define ql_head_initializer(a_head) {NULL}
/* Embed this in a_type to make it linkable into a ql list. */
#define ql_elm(a_type) qr(a_type)
/* List functions. */
/* Reset a list head to the empty state. */
#define ql_new(a_head) do { \
(a_head)->qlh_first = NULL; \
} while (0)
/* Initialize an element's linkage (a detached one-element ring). */
#define ql_elm_new(a_elm, a_field) qr_new((a_elm), a_field)
/* First element, or NULL if the list is empty. */
#define ql_first(a_head) ((a_head)->qlh_first)
/* Last element: the ring predecessor of the first, or NULL if empty. */
#define ql_last(a_head, a_field) \
((ql_first(a_head) != NULL) \
? qr_prev(ql_first(a_head), a_field) : NULL)
/* Successor of a_elm, or NULL if a_elm is the last element. */
#define ql_next(a_head, a_elm, a_field) \
((ql_last(a_head, a_field) != (a_elm)) \
? qr_next((a_elm), a_field) : NULL)
/* Predecessor of a_elm, or NULL if a_elm is the first element. */
#define ql_prev(a_head, a_elm, a_field) \
((ql_first(a_head) != (a_elm)) ? qr_prev((a_elm), a_field) \
: NULL)
/* Insert a_elm immediately before a_qlelm, updating the head if needed. */
#define ql_before_insert(a_head, a_qlelm, a_elm, a_field) do { \
qr_before_insert((a_qlelm), (a_elm), a_field); \
if (ql_first(a_head) == (a_qlelm)) { \
ql_first(a_head) = (a_elm); \
} \
} while (0)
/* Insert a_elm immediately after a_qlelm (the head cannot change). */
#define ql_after_insert(a_qlelm, a_elm, a_field) \
qr_after_insert((a_qlelm), (a_elm), a_field)
/* Insert a_elm at the front of the list. */
#define ql_head_insert(a_head, a_elm, a_field) do { \
if (ql_first(a_head) != NULL) { \
qr_before_insert(ql_first(a_head), (a_elm), a_field); \
} \
ql_first(a_head) = (a_elm); \
} while (0)
/*
 * Insert a_elm at the tail: link it into the ring before the current first
 * element, then restore the head to a_elm's ring successor so the first
 * element is unchanged and a_elm ends up last.
 */
#define ql_tail_insert(a_head, a_elm, a_field) do { \
if (ql_first(a_head) != NULL) { \
qr_before_insert(ql_first(a_head), (a_elm), a_field); \
} \
ql_first(a_head) = qr_next((a_elm), a_field); \
} while (0)
/*
 * Remove a_elm from the list.  If a_elm is the head, advance the head
 * first; if the head still equals a_elm afterward the list had exactly one
 * element (the ring successor was a_elm itself), so empty the list.
 * Otherwise splice a_elm out of the ring.
 */
#define ql_remove(a_head, a_elm, a_field) do { \
if (ql_first(a_head) == (a_elm)) { \
ql_first(a_head) = qr_next(ql_first(a_head), a_field); \
} \
if (ql_first(a_head) != (a_elm)) { \
qr_remove((a_elm), a_field); \
} else { \
ql_first(a_head) = NULL; \
} \
} while (0)
/* Remove the first element; the list must be non-empty. */
#define ql_head_remove(a_head, a_type, a_field) do { \
a_type *t = ql_first(a_head); \
ql_remove((a_head), t, a_field); \
} while (0)
/* Remove the last element; the list must be non-empty. */
#define ql_tail_remove(a_head, a_type, a_field) do { \
a_type *t = ql_last(a_head, a_field); \
ql_remove((a_head), t, a_field); \
} while (0)
/* Iterate forward over the list; a_var is the cursor variable. */
#define ql_foreach(a_var, a_head, a_field) \
qr_foreach((a_var), ql_first(a_head), a_field)
/* Iterate backward over the list; a_var is the cursor variable. */
#define ql_reverse_foreach(a_var, a_head, a_field) \
qr_reverse_foreach((a_var), ql_first(a_head), a_field)

98
jemalloc/src/qr.h Normal file
View File

@ -0,0 +1,98 @@
/******************************************************************************
*
* Copyright (C) 2002 Jason Evans <jasone@canonware.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice(s), this list of conditions and the following disclaimer
* unmodified other than the allowable addition of one or more
* copyright notices.
* 2. Redistributions in binary form must reproduce the above copyright
* notice(s), this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/* Ring definitions. */
#define qr(a_type) \
struct { \
a_type *qre_next; \
a_type *qre_prev; \
}
/* Ring functions. */
#define qr_new(a_qr, a_field) do { \
(a_qr)->a_field.qre_next = (a_qr); \
(a_qr)->a_field.qre_prev = (a_qr); \
} while (0)
#define qr_next(a_qr, a_field) ((a_qr)->a_field.qre_next)
#define qr_prev(a_qr, a_field) ((a_qr)->a_field.qre_prev)
#define qr_before_insert(a_qrelm, a_qr, a_field) do { \
(a_qr)->a_field.qre_prev = (a_qrelm)->a_field.qre_prev; \
(a_qr)->a_field.qre_next = (a_qrelm); \
(a_qr)->a_field.qre_prev->a_field.qre_next = (a_qr); \
(a_qrelm)->a_field.qre_prev = (a_qr); \
} while (0)
#define qr_after_insert(a_qrelm, a_qr, a_field) \
do \
{ \
(a_qr)->a_field.qre_next = (a_qrelm)->a_field.qre_next; \
(a_qr)->a_field.qre_prev = (a_qrelm); \
(a_qr)->a_field.qre_next->a_field.qre_prev = (a_qr); \
(a_qrelm)->a_field.qre_next = (a_qr); \
} while (0)
#define qr_meld(a_qr_a, a_qr_b, a_field) do { \
void *t; \
(a_qr_a)->a_field.qre_prev->a_field.qre_next = (a_qr_b); \
(a_qr_b)->a_field.qre_prev->a_field.qre_next = (a_qr_a); \
t = (a_qr_a)->a_field.qre_prev; \
(a_qr_a)->a_field.qre_prev = (a_qr_b)->a_field.qre_prev; \
(a_qr_b)->a_field.qre_prev = t; \
} while (0)
/* qr_meld() and qr_split() are functionally equivalent, so there's no need to
* have two copies of the code. */
#define qr_split(a_qr_a, a_qr_b, a_field) \
qr_meld((a_qr_a), (a_qr_b), a_field)
#define qr_remove(a_qr, a_field) do { \
(a_qr)->a_field.qre_prev->a_field.qre_next \
= (a_qr)->a_field.qre_next; \
(a_qr)->a_field.qre_next->a_field.qre_prev \
= (a_qr)->a_field.qre_prev; \
(a_qr)->a_field.qre_next = (a_qr); \
(a_qr)->a_field.qre_prev = (a_qr); \
} while (0)
#define qr_foreach(var, a_qr, a_field) \
for ((var) = (a_qr); \
(var) != NULL; \
(var) = (((var)->a_field.qre_next != (a_qr)) \
? (var)->a_field.qre_next : NULL))
#define qr_reverse_foreach(var, a_qr, a_field) \
for ((var) = ((a_qr) != NULL) ? qr_prev(a_qr, a_field) : NULL; \
(var) != NULL; \
(var) = (((var) != (a_qr)) \
? (var)->a_field.qre_prev : NULL))