Convert thread-specific caching from magazines to the new tcache implementation, and implement incremental GC.

Add the 'G'/'g' and 'H'/'h' MALLOC_OPTIONS flags.

Add the malloc_tcache_flush() function.

Disable thread-specific caching until the application goes multi-threaded.
Jason Evans 2009-12-29 00:09:15 -08:00
parent b2378168a4
commit 84cbbcb90a
8 changed files with 861 additions and 435 deletions
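
The new 'G'/'g' and 'H'/'h' flags can be exercised as in the following sketch. It is not part of this commit; it assumes the default (unprefixed) symbol names and that the malloc_options string is consulted on the first call into the allocator, as the man page changes below describe (the JEMALLOC_OPTIONS environment variable is the equivalent run-time mechanism).

    /*
     * Hedged sketch: turn off incremental GC ('g') and thread-specific
     * caching ('h') before the first allocation.  Lowercase letters are
     * assumed to disable and uppercase to enable, per the allocator's
     * usual option convention.
     */
    #include <stdlib.h>

    extern const char *malloc_options;

    int
    main(void)
    {
        malloc_options = "gh";  /* "GH" would enable both instead. */

        void *p = malloc(64);   /* First allocation parses the options. */
        free(p);
        return (0);
    }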


@ -46,10 +46,10 @@ any of the following arguments (not a definitive list) to 'configure':
practice it never causes any problems if, for example, 4-byte allocations
are 4-byte-aligned.
--disable-mag
Disable thread-specific caches for sub-page-sized objects. Objects are
cached and released in bulk using "magazines" -- a term coined by the
developers of Solaris's umem allocator.
--disable-tcache
Disable thread-specific caches for small and medium objects. Objects are
cached and released in bulk, thus reducing the total number of mutex
operations. Use the 'H' and 'G' options to control thread-specific caching.
--enable-dss
Enable support for page allocation/deallocation via sbrk(2), in addition to
@ -82,7 +82,7 @@ any of the following arguments (not a definitive list) to 'configure':
switches from single-threaded to multi-threaded mode, so that it can avoid
mutex locking/unlocking operations while in single-threaded mode. In
practice, this feature usually has little impact on performance unless
magazines are disabled.
thread-specific caching is disabled.
The following environment variables (not a definitive list) impact configure's
behavior:


@ -254,6 +254,7 @@ if test "x$JEMALLOC_PREFIX" != "x" ; then
AC_DEFINE_UNQUOTED([realloc], [${JEMALLOC_PREFIX}realloc])
AC_DEFINE_UNQUOTED([free], [${JEMALLOC_PREFIX}free])
AC_DEFINE_UNQUOTED([malloc_usable_size], [${JEMALLOC_PREFIX}malloc_usable_size])
AC_DEFINE_UNQUOTED([malloc_tcache_flush], [${JEMALLOC_PREFIX}malloc_tcache_flush])
AC_DEFINE_UNQUOTED([malloc_options], [${JEMALLOC_PREFIX}malloc_options])
AC_DEFINE_UNQUOTED([malloc_message], [${JEMALLOC_PREFIX}malloc_message])
fi
@ -359,27 +360,18 @@ fi
AC_SUBST([roff_tiny])
AC_SUBST([roff_no_tiny])
dnl Enable magazines by default.
AC_ARG_ENABLE([mag],
[AS_HELP_STRING([--disable-mag], [Disable magazines (per thread caches)])],
[if test "x$enable_mag" = "xno" ; then
enable_mag="0"
dnl Enable thread-specific caching by default.
AC_ARG_ENABLE([tcache],
[AS_HELP_STRING([--disable-tcache], [Disable per thread caches])],
[if test "x$enable_tcache" = "xno" ; then
enable_tcache="0"
else
enable_mag="1"
enable_tcache="1"
fi
],
[enable_mag="1"]
[enable_tcache="1"]
)
if test "x$enable_mag" = "x1" ; then
AC_DEFINE([JEMALLOC_MAG], [ ])
fi
AC_SUBST([enable_mag])
if test "x$enable_mag" = "x0" ; then
roff_mag=".\\\" "
else
roff_mag=""
fi
AC_SUBST([roff_mag])
dnl Finish tcache-related definitions below, once TLS configuration is done.
dnl Do not enable allocation from DSS by default.
AC_ARG_ENABLE([dss],
@ -539,11 +531,28 @@ AC_RUN_IFELSE([AC_LANG_PROGRAM(
return 0;
]])],
AC_MSG_RESULT([yes])
roff_tls="",
have_tls="1",
AC_MSG_RESULT([no])
roff_tls=".\\\" "
have_tls="0"
AC_DEFINE_UNQUOTED([NO_TLS], [ ]))
AC_SUBST([roff_tls])
dnl Finish tcache-related definitions, now that TLS configuration is done.
if test "x$have_tls" = "x0" ; then
enable_tcache="0"
fi
if test "x$enable_tcache" = "x1" ; then
AC_DEFINE([JEMALLOC_TCACHE], [ ])
fi
AC_SUBST([enable_tcache])
if test "x$enable_tcache" = "x0" ; then
roff_tcache=".\\\" "
roff_no_tcache=""
else
roff_tcache=""
roff_no_tcache=".\\\" "
fi
AC_SUBST([roff_tcache])
AC_SUBST([roff_no_tcache])
dnl Enable lazy locking by default.
AC_ARG_ENABLE([lazy_lock],
@ -631,7 +640,7 @@ AC_MSG_RESULT([debug : ${enable_debug}])
AC_MSG_RESULT([stats : ${enable_stats}])
AC_MSG_RESULT([trace : ${enable_trace}])
AC_MSG_RESULT([tiny : ${enable_tiny}])
AC_MSG_RESULT([mag : ${enable_mag}])
AC_MSG_RESULT([tcache : ${enable_tcache}])
AC_MSG_RESULT([fill : ${enable_fill}])
AC_MSG_RESULT([xmalloc : ${enable_xmalloc}])
AC_MSG_RESULT([sysv : ${enable_sysv}])


@ -35,7 +35,7 @@
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
.\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $
.\"
.Dd November 19, 2009
.Dd December 3, 2009
.Dt JEMALLOC 3
.Os
.Sh NAME
@ -58,6 +58,8 @@
.Fn @jemalloc_prefix@free "void *ptr"
.Ft size_t
.Fn @jemalloc_prefix@malloc_usable_size "const void *ptr"
@roff_tcache@.Ft void
@roff_tcache@.Fn @jemalloc_prefix@malloc_tcache_flush "void"
.Ft const char *
.Va @jemalloc_prefix@malloc_options ;
.Ft void
@ -156,6 +158,18 @@ Any discrepancy between the requested allocation size and the size reported by
.Fn @jemalloc_prefix@malloc_usable_size
should not be depended on, since such behavior is entirely
implementation-dependent.
@roff_tcache@.Pp
@roff_tcache@The
@roff_tcache@.Fn @jemalloc_prefix@malloc_tcache_flush
@roff_tcache@function releases all cached objects and internal data structures
@roff_tcache@associated with the calling thread's thread-specific cache.
@roff_tcache@Ordinarily, this function need not be called, since automatic
@roff_tcache@periodic incremental garbage collection occurs, and the thread
@roff_tcache@cache is automatically discarded when a thread exits.
@roff_tcache@However, garbage collection is triggered by allocation activity,
@roff_tcache@so it is possible for a thread that stops allocating/deallocating
@roff_tcache@to retain its cache indefinitely, in which case the developer may
@roff_tcache@find this function useful.
.Sh TUNING
Once, when the first call is made to one of these memory allocation
routines, various flags will be set or reset, which affects the
@ -203,16 +217,20 @@ physical memory becomes scarce and the pages remain unused.
The default is 512 pages per arena;
.Ev JEMALLOC_OPTIONS=10f
will prevent any dirty unused pages from accumulating.
@roff_mag@@roff_tls@.It G
@roff_mag@@roff_tls@When there are multiple threads, use thread-specific caching
@roff_mag@@roff_tls@for objects that are smaller than one page.
@roff_mag@@roff_tls@This option is enabled by default.
@roff_mag@@roff_tls@Thread-specific caching allows many allocations to be
@roff_mag@@roff_tls@satisfied without performing any thread synchronization, at
@roff_mag@@roff_tls@the cost of increased memory use.
@roff_mag@@roff_tls@See the
@roff_mag@@roff_tls@.Dq R
@roff_mag@@roff_tls@option for related tuning information.
@roff_tcache@.It G
@roff_tcache@Enable/disable incremental garbage collection of unused objects
@roff_tcache@stored in thread-specific caches.
@roff_tcache@This option is enabled by default.
@roff_tcache@.It H
@roff_tcache@When there are multiple threads, use thread-specific caching for
@roff_tcache@small and medium objects.
@roff_tcache@This option is enabled by default.
@roff_tcache@Thread-specific caching allows many allocations to be satisfied
@roff_tcache@without performing any thread synchronization, at the cost of
@roff_tcache@increased memory use.
@roff_tcache@See the
@roff_tcache@.Dq G
@roff_tcache@option for related tuning information.
@roff_fill@.It J
@roff_fill@Each byte of new memory allocated by
@roff_fill@.Fn @jemalloc_prefix@malloc
@ -235,8 +253,10 @@ The valid range is from one page to one half chunk.
The default value is 32 KiB.
.It N
Double/halve the number of arenas.
The default number of arenas is two times the number of CPUs, or one if there
is a single CPU.
The default number of arenas is
@roff_tcache@two
@roff_no_tcache@four
times the number of CPUs, or one if there is a single CPU.
.It P
Various statistics are printed at program exit via an
.Xr atexit 3
@ -250,13 +270,6 @@ Double/halve the size of the maximum size class that is a multiple of the
quantum (8 or 16 bytes, depending on architecture).
Above this size, cacheline spacing is used for size classes.
The default value is 128 bytes.
@roff_mag@@roff_tls@.It R
@roff_mag@@roff_tls@Double/halve magazine size, which approximately
@roff_mag@@roff_tls@doubles/halves the number of rounds in each magazine.
@roff_mag@@roff_tls@Magazines are used by the thread-specific caching machinery
@roff_mag@@roff_tls@to acquire and release objects in bulk.
@roff_mag@@roff_tls@Increasing the magazine size decreases locking overhead, at
@roff_mag@@roff_tls@the expense of increased memory usage.
@roff_trace@.It T
@roff_trace@Write a verbose trace log to a set of files named according to the
@roff_trace@pattern
@ -338,14 +351,14 @@ improve performance, mainly due to reduced cache performance.
However, it may make sense to reduce the number of arenas if an application
does not make much use of the allocation functions.
.Pp
@roff_mag@In addition to multiple arenas, this allocator supports
@roff_mag@thread-specific caching for small and medium objects, in order to make
@roff_mag@it possible to completely avoid synchronization for most small and
@roff_mag@medium allocation requests.
@roff_mag@Such caching allows very fast allocation in the common case, but it
@roff_mag@increases memory usage and fragmentation, since a bounded number of
@roff_mag@objects can remain allocated in each thread cache.
@roff_mag@.Pp
@roff_tcache@In addition to multiple arenas, this allocator supports
@roff_tcache@thread-specific caching for small and medium objects, in order to
@roff_tcache@make it possible to completely avoid synchronization for most small
@roff_tcache@and medium allocation requests.
@roff_tcache@Such caching allows very fast allocation in the common case, but it
@roff_tcache@increases memory usage and fragmentation, since a bounded number of
@roff_tcache@objects can remain allocated in each thread cache.
@roff_tcache@.Pp
Memory is conceptually broken into equal-sized chunks, where the chunk size is
a power of two that is greater than the page size.
Chunks are always aligned to multiples of the chunk size.
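
The man page text above notes that garbage collection is driven only by allocation activity, so a thread that stops allocating may retain its cache indefinitely. A hedged sketch of that use case follows; the worker/main structure, header path, and unprefixed names are illustrative, not from the jemalloc sources.

    #include <pthread.h>
    #include <stdlib.h>
    #include <unistd.h>

    #include "jemalloc.h"   /* Assumed path; declares malloc_tcache_flush()
                             * only when jemalloc is built with tcache support
                             * (the default). */

    static void *
    worker(void *arg)
    {
        /* Allocation-heavy phase. */
        for (int i = 0; i < 100000; i++) {
            void *p = malloc(128);
            free(p);
        }

        /*
         * No further allocation happens on this thread, so incremental GC
         * will never run for it again; release its cache explicitly.
         */
        malloc_tcache_flush();

        for (;;)
            pause();        /* Idle without touching the heap. */
        return (arg);
    }

    int
    main(void)
    {
        pthread_t thd;

        pthread_create(&thd, NULL, worker, NULL);
        pthread_join(thd, NULL);  /* Never returns in this sketch. */
        return (0);
    }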

File diff suppressed because it is too large.


@ -38,6 +38,10 @@ extern "C" {
size_t malloc_usable_size(const void *ptr);
#ifdef JEMALLOC_TCACHE
void malloc_tcache_flush(void);
#endif
extern const char *malloc_options;
extern void (*malloc_message)(const char *p1,
const char *p2, const char *p3, const char *p4);


@ -55,6 +55,7 @@
#undef realloc
#undef free
#undef malloc_usable_size
#undef malloc_tcache_flush
#undef malloc_options
#undef malloc_message
#endif
@ -91,11 +92,11 @@
#undef JEMALLOC_TINY
/*
* JEMALLOC_MAG enables a magazine-based thread-specific caching layer for small
* JEMALLOC_TCACHE enables a thread-specific caching layer for small and medium
* objects. This makes it possible to allocate/deallocate objects without any
* locking when the cache is in the steady state.
*/
#undef JEMALLOC_MAG
#undef JEMALLOC_TCACHE
/*
* JEMALLOC_DSS enables use of sbrk(2) to allocate chunks from the data storage

jemalloc/src/ql.h (new file)

@ -0,0 +1,114 @@
/******************************************************************************
*
* Copyright (C) 2002 Jason Evans <jasone@canonware.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice(s), this list of conditions and the following disclaimer
* unmodified other than the allowable addition of one or more
* copyright notices.
* 2. Redistributions in binary form must reproduce the above copyright
* notice(s), this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/*
* List definitions.
*/
#define ql_head(a_type) \
struct { \
a_type *qlh_first; \
}
#define ql_head_initializer(a_head) {NULL}
#define ql_elm(a_type) qr(a_type)
/* List functions. */
#define ql_new(a_head) do { \
(a_head)->qlh_first = NULL; \
} while (0)
#define ql_elm_new(a_elm, a_field) qr_new((a_elm), a_field)
#define ql_first(a_head) ((a_head)->qlh_first)
#define ql_last(a_head, a_field) \
((ql_first(a_head) != NULL) \
? qr_prev(ql_first(a_head), a_field) : NULL)
#define ql_next(a_head, a_elm, a_field) \
((ql_last(a_head, a_field) != (a_elm)) \
? qr_next((a_elm), a_field) : NULL)
#define ql_prev(a_head, a_elm, a_field) \
((ql_first(a_head) != (a_elm)) ? qr_prev((a_elm), a_field) \
: NULL)
#define ql_before_insert(a_head, a_qlelm, a_elm, a_field) do { \
qr_before_insert((a_qlelm), (a_elm), a_field); \
if (ql_first(a_head) == (a_qlelm)) { \
ql_first(a_head) = (a_elm); \
} \
} while (0)
#define ql_after_insert(a_qlelm, a_elm, a_field) \
qr_after_insert((a_qlelm), (a_elm), a_field)
#define ql_head_insert(a_head, a_elm, a_field) do { \
if (ql_first(a_head) != NULL) { \
qr_before_insert(ql_first(a_head), (a_elm), a_field); \
} \
ql_first(a_head) = (a_elm); \
} while (0)
#define ql_tail_insert(a_head, a_elm, a_field) do { \
if (ql_first(a_head) != NULL) { \
qr_before_insert(ql_first(a_head), (a_elm), a_field); \
} \
ql_first(a_head) = qr_next((a_elm), a_field); \
} while (0)
#define ql_remove(a_head, a_elm, a_field) do { \
if (ql_first(a_head) == (a_elm)) { \
ql_first(a_head) = qr_next(ql_first(a_head), a_field); \
} \
if (ql_first(a_head) != (a_elm)) { \
qr_remove((a_elm), a_field); \
} else { \
ql_first(a_head) = NULL; \
} \
} while (0)
#define ql_head_remove(a_head, a_type, a_field) do { \
a_type *t = ql_first(a_head); \
ql_remove((a_head), t, a_field); \
} while (0)
#define ql_tail_remove(a_head, a_type, a_field) do { \
a_type *t = ql_last(a_head, a_field); \
ql_remove((a_head), t, a_field); \
} while (0)
#define ql_foreach(a_var, a_head, a_field) \
qr_foreach((a_var), ql_first(a_head), a_field)
#define ql_reverse_foreach(a_var, a_head, a_field) \
qr_reverse_foreach((a_var), ql_first(a_head), a_field)

jemalloc/src/qr.h (new file)

@ -0,0 +1,98 @@
/******************************************************************************
*
* Copyright (C) 2002 Jason Evans <jasone@canonware.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice(s), this list of conditions and the following disclaimer
* unmodified other than the allowable addition of one or more
* copyright notices.
* 2. Redistributions in binary form must reproduce the above copyright
* notice(s), this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/* Ring definitions. */
#define qr(a_type) \
struct { \
a_type *qre_next; \
a_type *qre_prev; \
}
/* Ring functions. */
#define qr_new(a_qr, a_field) do { \
(a_qr)->a_field.qre_next = (a_qr); \
(a_qr)->a_field.qre_prev = (a_qr); \
} while (0)
#define qr_next(a_qr, a_field) ((a_qr)->a_field.qre_next)
#define qr_prev(a_qr, a_field) ((a_qr)->a_field.qre_prev)
#define qr_before_insert(a_qrelm, a_qr, a_field) do { \
(a_qr)->a_field.qre_prev = (a_qrelm)->a_field.qre_prev; \
(a_qr)->a_field.qre_next = (a_qrelm); \
(a_qr)->a_field.qre_prev->a_field.qre_next = (a_qr); \
(a_qrelm)->a_field.qre_prev = (a_qr); \
} while (0)
#define qr_after_insert(a_qrelm, a_qr, a_field) \
do \
{ \
(a_qr)->a_field.qre_next = (a_qrelm)->a_field.qre_next; \
(a_qr)->a_field.qre_prev = (a_qrelm); \
(a_qr)->a_field.qre_next->a_field.qre_prev = (a_qr); \
(a_qrelm)->a_field.qre_next = (a_qr); \
} while (0)
#define qr_meld(a_qr_a, a_qr_b, a_field) do { \
void *t; \
(a_qr_a)->a_field.qre_prev->a_field.qre_next = (a_qr_b); \
(a_qr_b)->a_field.qre_prev->a_field.qre_next = (a_qr_a); \
t = (a_qr_a)->a_field.qre_prev; \
(a_qr_a)->a_field.qre_prev = (a_qr_b)->a_field.qre_prev; \
(a_qr_b)->a_field.qre_prev = t; \
} while (0)
/*
 * qr_meld() and qr_split() are functionally equivalent, so there's no need to
 * have two copies of the code.
 */
#define qr_split(a_qr_a, a_qr_b, a_field) \
qr_meld((a_qr_a), (a_qr_b), a_field)
#define qr_remove(a_qr, a_field) do { \
(a_qr)->a_field.qre_prev->a_field.qre_next \
= (a_qr)->a_field.qre_next; \
(a_qr)->a_field.qre_next->a_field.qre_prev \
= (a_qr)->a_field.qre_prev; \
(a_qr)->a_field.qre_next = (a_qr); \
(a_qr)->a_field.qre_prev = (a_qr); \
} while (0)
#define qr_foreach(var, a_qr, a_field) \
for ((var) = (a_qr); \
(var) != NULL; \
(var) = (((var)->a_field.qre_next != (a_qr)) \
? (var)->a_field.qre_next : NULL))
#define qr_reverse_foreach(var, a_qr, a_field) \
for ((var) = ((a_qr) != NULL) ? qr_prev(a_qr, a_field) : NULL; \
(var) != NULL; \
(var) = (((var) != (a_qr)) \
? (var)->a_field.qre_prev : NULL))
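
A minimal usage sketch for the qr_*() ring macros (not part of this commit; the node type, field names, and include path are illustrative).

    #include <stdio.h>

    #include "qr.h"

    typedef struct node_s node_t;
    struct node_s {
        int val;
        qr(node_t) link;        /* Embedded ring linkage. */
    };

    int
    main(void)
    {
        node_t a, b, c, *n;

        a.val = 0;
        b.val = 1;
        c.val = 2;
        qr_new(&a, link);       /* Each node starts as a one-element ring. */
        qr_new(&b, link);
        qr_new(&c, link);

        qr_after_insert(&a, &b, link);  /* Ring: a, b. */
        qr_after_insert(&b, &c, link);  /* Ring: a, b, c. */

        qr_foreach(n, &a, link) {
            printf("node %d\n", n->val);    /* Prints 0, 1, 2. */
        }

        qr_remove(&b, link);    /* Ring: a, c; b is a singleton again. */
        return (0);
    }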