From dafde14e08ddfda747aabb2045b350848b601b2e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 17 Mar 2010 16:27:39 -0700 Subject: [PATCH] Remove medium size classes. Remove medium size classes, because concurrent dirty page purging is no longer capable of purging inactive dirty pages inside active runs (due to recent arena/bin locking changes). Enhance tcache to support caching large objects, so that the same range of size classes is still cached, despite the removal of medium size class support. --- jemalloc/INSTALL | 6 +- jemalloc/doc/jemalloc.3.in | 131 +++++------ jemalloc/include/jemalloc/internal/arena.h | 63 +++--- jemalloc/include/jemalloc/internal/ctl.h | 7 +- jemalloc/include/jemalloc/internal/stats.h | 15 +- jemalloc/include/jemalloc/internal/tcache.h | 181 ++++++++++++--- jemalloc/include/jemalloc/jemalloc_defs.h.in | 6 +- jemalloc/src/arena.c | 218 +++++-------------- jemalloc/src/ctl.c | 89 +++----- jemalloc/src/jemalloc.c | 70 ++---- jemalloc/src/stats.c | 68 +++--- jemalloc/src/tcache.c | 143 +++++++++++- 12 files changed, 514 insertions(+), 483 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index 6157ae52..7ce7e793 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -71,9 +71,9 @@ any of the following arguments (not a definitive list) to 'configure': are 4-byte-aligned. --disable-tcache - Disable thread-specific caches for small and medium objects. Objects are - cached and released in bulk, thus reducing the total number of mutex - operations. Use the 'H' and 'G' options to control thread-specific caching. + Disable thread-specific caches for small objects. Objects are cached and + released in bulk, thus reducing the total number of mutex operations. Use + the 'H', 'G', and 'M' options to control thread-specific caching. --enable-swap Enable mmap()ed swap file support. When this feature is built in, it is diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 321e95f3..1ea93bf5 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,7 +38,7 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd March 14, 2010 +.Dd March 17, 2010 .Dt JEMALLOC 3 .Os .Sh NAME @@ -378,13 +378,15 @@ will disable dirty page purging. @roff_tcache@.It H @roff_tcache@Enable/disable thread-specific caching. @roff_tcache@When there are multiple threads, each thread uses a -@roff_tcache@thread-specific cache for small and medium objects. +@roff_tcache@thread-specific cache for objects up to a certain size. @roff_tcache@Thread-specific caching allows many allocations to be satisfied @roff_tcache@without performing any thread synchronization, at the cost of @roff_tcache@increased memory use. @roff_tcache@See the @roff_tcache@.Dq G -@roff_tcache@option for related tuning information. +@roff_tcache@and +@roff_tcache@.Dq M +@roff_tcache@options for related tuning information. @roff_tcache@This option is enabled by default. @roff_prof@.It I @roff_prof@Double/halve the average interval between memory profile dumps, as @@ -426,16 +428,15 @@ The default chunk size is 4 MiB. @roff_prof@See the @roff_prof@.Dq F option for information on analyzing heap profile output. @roff_prof@This option is disabled by default. -.It M -Double/halve the size of the maximum medium size class. -The valid range is from one page to one half chunk. -The default value is 32 KiB. +@roff_tcache@.It M +@roff_tcache@Double/halve the maximum size class to cache. 
+@roff_tcache@At a minimum, all small size classes are cached, and at a maximum +@roff_tcache@all large size classes are cached. +@roff_tcache@The default maximum is 32 KiB. .It N Double/halve the number of arenas. -The default number of arenas is -@roff_tcache@two -@roff_no_tcache@four -times the number of CPUs, or one if there is a single CPU. +The default number of arenas is four times the number of CPUs, or one if there +is a single CPU. @roff_swap@.It O @roff_swap@Over-commit memory as a side effect of using anonymous @roff_swap@.Xr mmap 2 @@ -550,9 +551,9 @@ However, it may make sense to reduce the number of arenas if an application does not make much use of the allocation functions. .Pp @roff_tcache@In addition to multiple arenas, this allocator supports -@roff_tcache@thread-specific caching for small and medium objects, in order to -@roff_tcache@make it possible to completely avoid synchronization for most small -@roff_tcache@and medium allocation requests. +@roff_tcache@thread-specific caching for small objects, in order to make it +@roff_tcache@possible to completely avoid synchronization for most small +@roff_tcache@allocation requests. @roff_tcache@Such caching allows very fast allocation in the common case, but it @roff_tcache@increases memory usage and fragmentation, since a bounded number of @roff_tcache@objects can remain allocated in each thread cache. @@ -563,27 +564,23 @@ Chunks are always aligned to multiples of the chunk size. This alignment makes it possible to find metadata for user objects very quickly. .Pp -User objects are broken into four categories according to size: small, medium, -large, and huge. +User objects are broken into three categories according to size: small, large, +and huge. Small objects are smaller than one page. -Medium objects range from one page to an upper limit determined at run time (see -the -.Dq M -option). Large objects are smaller than the chunk size. Huge objects are a multiple of the chunk size. -Small, medium, and large objects are managed by arenas; huge objects are managed +Small and large objects are managed by arenas; huge objects are managed separately in a single data structure that is shared by all threads. Huge objects are used by applications infrequently enough that this single data structure is not a scalability issue. .Pp Each chunk that is managed by an arena tracks its contents as runs of -contiguous pages (unused, backing a set of small or medium objects, or backing -one large object). +contiguous pages (unused, backing a set of small objects, or backing one large +object). The combination of chunk alignment and chunk page maps makes it possible to determine all metadata regarding small and large allocations in constant time. .Pp -Small and medium objects are managed in groups by page runs. +Small objects are managed in groups by page runs. Each run maintains a bitmap that tracks which regions are in use. @roff_tiny@Allocation requests that are no more than half the quantum (8 or 16, @roff_tiny@depending on architecture) are rounded up to the nearest power of @@ -603,13 +600,7 @@ Allocation requests that are more than the minimum subpage-multiple size class, but no more than the maximum subpage-multiple size class are rounded up to the nearest multiple of the subpage size (256). 
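As an aside, the rounding ladder described here can be illustrated with a minimal sketch, assuming QUANTUM == 16, CACHELINE == 64, SUBPAGE == 256, and the default quantum/cacheline maxima of 128 and 512 bytes; tiny classes are ignored and the helper name is hypothetical, so this is illustrative only and not part of the patch:

/*
 * Hypothetical sketch of the small-object rounding described above; the real
 * size classes are built in arena_boot().  Valid for sizes up to
 * PAGE_SIZE - SUBPAGE (3840 with 4 KiB pages).
 */
#include <stddef.h>

static size_t
ex_small_size_class(size_t size)
{

	if (size <= 128)	/* Quantum-spaced: multiples of 16. */
		return ((size + 15) & ~(size_t)15);
	else if (size <= 512)	/* Cacheline-spaced: multiples of 64. */
		return ((size + 63) & ~(size_t)63);
	else			/* Subpage-spaced: multiples of 256. */
		return ((size + 255) & ~(size_t)255);
}
/* For example: 100 -> 112, 300 -> 320, 2000 -> 2048. */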
Allocation requests that are more than the maximum subpage-multiple size class, -but no more than the maximum medium size class (see the -.Dq M -option) are rounded up to the nearest medium size class; spacing is an -automatically determined power of two and ranges from the subpage size to the -page size. -Allocation requests that are more than the maximum medium size class, but small -enough to fit in an arena-managed chunk (see the +but small enough to fit in an arena-managed chunk (see the .Dq K option), are rounded up to the nearest run size. Allocation requests that are too large to fit in an arena-managed chunk are @@ -838,13 +829,6 @@ See the option. .Ed .\"----------------------------------------------------------------------------- -.It Sy "opt.lg_medium_max (size_t) r-" -.Bd -ragged -offset indent -compact -See the -.Dq M -option. -.Ed -.\"----------------------------------------------------------------------------- .It Sy "opt.lg_dirty_mult (ssize_t) r-" .Bd -ragged -offset indent -compact See the @@ -900,11 +884,6 @@ Subpage size class interval. Page size. .Ed .\"----------------------------------------------------------------------------- -.It Sy "arenas.medium (size_t) r-" -.Bd -ragged -offset indent -compact -Medium size class interval. -.Ed -.\"----------------------------------------------------------------------------- .It Sy "arenas.chunksize (size_t) r-" .Bd -ragged -offset indent -compact Chunk size. @@ -952,15 +931,10 @@ Minimum subpage-spaced size class. Maximum subpage-spaced size class. .Ed .\"----------------------------------------------------------------------------- -.It Sy "arenas.medium_min (size_t) r-" -.Bd -ragged -offset indent -compact -Minimum medium-spaced size class. -.Ed -.\"----------------------------------------------------------------------------- -.It Sy "arenas.medium_max (size_t) r-" -.Bd -ragged -offset indent -compact -Maximum medium-spaced size class. -.Ed +@roff_tcache@.It Sy "arenas.tcache_max (size_t) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Maximum thread-cached size class. +@roff_tcache@.Ed .\"----------------------------------------------------------------------------- .It Sy "arenas.ntbins (unsigned) r-" .Bd -ragged -offset indent -compact @@ -982,16 +956,16 @@ Number of cacheline-spaced bin size classes. Number of subpage-spaced bin size classes. .Ed .\"----------------------------------------------------------------------------- -.It Sy "arenas.nmbins (unsigned) r-" -.Bd -ragged -offset indent -compact -Number of medium-spaced bin size classes. -.Ed -.\"----------------------------------------------------------------------------- .It Sy "arenas.nbins (unsigned) r-" .Bd -ragged -offset indent -compact Total number of bin size classes. .Ed .\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "arenas.nhbins (unsigned) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Total number of thread cache bin size classes. +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- .It Sy "arenas.bin..size (size_t) r-" .Bd -ragged -offset indent -compact Maximum size supported by size class. @@ -1147,26 +1121,6 @@ has not been called. @roff_stats@Cumulative number of small allocation requests. 
@roff_stats@.Ed .\"----------------------------------------------------------------------------- -@roff_stats@.It Sy "stats.arenas..medium.allocated (size_t) r-" -@roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Number of bytes currently allocated by medium objects. -@roff_stats@.Ed -.\"----------------------------------------------------------------------------- -@roff_stats@.It Sy "stats.arenas..medium.nmalloc (uint64_t) r-" -@roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Cumulative number of allocation requests served by medium bins. -@roff_stats@.Ed -.\"----------------------------------------------------------------------------- -@roff_stats@.It Sy "stats.arenas..medium.ndalloc (uint64_t) r-" -@roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Cumulative number of medium objects returned to bins. -@roff_stats@.Ed -.\"----------------------------------------------------------------------------- -@roff_stats@.It Sy "stats.arenas..medium.nrequests (uint64_t) r-" -@roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Cumulative number of medium allocation requests. -@roff_stats@.Ed -.\"----------------------------------------------------------------------------- @roff_stats@.It Sy "stats.arenas..large.allocated (size_t) r-" @roff_stats@.Bd -ragged -offset indent -compact @roff_stats@Number of bytes currently allocated by large objects. @@ -1174,12 +1128,19 @@ has not been called. .\"----------------------------------------------------------------------------- @roff_stats@.It Sy "stats.arenas..large.nmalloc (uint64_t) r-" @roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Cumulative number of large allocation requests. +@roff_stats@Cumulative number of large allocation requests served directly by +@roff_stats@the arena. @roff_stats@.Ed .\"----------------------------------------------------------------------------- @roff_stats@.It Sy "stats.arenas..large.ndalloc (uint64_t) r-" @roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Cumulative number of large deallocation requests. +@roff_stats@Cumulative number of large deallocation requests served directly by +@roff_stats@the arena. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "stats.arenas..large.nrequests (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Cumulative number of large allocation requests. @roff_stats@.Ed .\"----------------------------------------------------------------------------- @roff_stats@.It Sy "stats.arenas..bins..allocated (size_t) r-" @@ -1233,6 +1194,18 @@ has not been called. @roff_stats@Current number of runs. @roff_stats@.Ed .\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "stats.arenas..lruns..nmalloc (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Cumulative number of allocation requests for this size class served +@roff_stats@directly by the arena. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "stats.arenas..lruns..ndalloc (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Cumulative number of deallocation requests for this size class +@roff_stats@served directly by the arena. 
+@roff_stats@.Ed +.\"----------------------------------------------------------------------------- @roff_stats@.It Sy "stats.arenas..lruns..nrequests (uint64_t) r-" @roff_stats@.Bd -ragged -offset indent -compact @roff_stats@Cumulative number of allocation requests for this size class. diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index c58f5364..fc663806 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -35,23 +35,6 @@ */ #define LG_CSPACE_MAX_DEFAULT 9 -/* - * Maximum medium size class. This must not be more than 1/4 of a chunk - * (LG_MEDIUM_MAX_DEFAULT <= LG_CHUNK_DEFAULT - 2). - */ -#define LG_MEDIUM_MAX_DEFAULT 15 - -/* Return the smallest medium size class that is >= s. */ -#define MEDIUM_CEILING(s) \ - (((s) + mspace_mask) & ~mspace_mask) - -/* - * Soft limit on the number of medium size classes. Spacing between medium - * size classes never exceeds pagesize, which can force more than NBINS_MAX - * medium size classes. - */ -#define NMBINS_MAX 16 - /* * RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized * as small as possible such that this setting is still honored, without @@ -126,7 +109,7 @@ struct arena_chunk_map_s { * * ? : Unallocated: Run address for first/last pages, unset for internal * pages. - * Small/medium: Don't care. + * Small: Don't care. * Large: Run size for first page, unset for trailing pages. * - : Unused. * d : dirty? @@ -147,7 +130,7 @@ struct arena_chunk_map_s { * xxxxxxxx xxxxxxxx xxxx---- ----d--- * ssssssss ssssssss ssss---- -----z-- * - * Small/medium: + * Small: * pppppppp pppppppp pppp---- -------a * pppppppp pppppppp pppp---- -------a * pppppppp pppppppp pppp---- -------a @@ -386,7 +369,6 @@ struct arena_s { extern size_t opt_lg_qspace_max; extern size_t opt_lg_cspace_max; -extern size_t opt_lg_medium_max; extern ssize_t opt_lg_dirty_mult; extern uint8_t const *small_size2bin; @@ -399,9 +381,7 @@ extern uint8_t const *small_size2bin; extern unsigned nqbins; /* Number of quantum-spaced bins. */ extern unsigned ncbins; /* Number of cacheline-spaced bins. */ extern unsigned nsbins; /* Number of subpage-spaced bins. */ -extern unsigned nmbins; /* Number of medium bins. */ extern unsigned nbins; -extern unsigned mbin0; /* mbin offset (nbins - nmbins). */ #ifdef JEMALLOC_TINY # define tspace_max ((size_t)(QUANTUM >> 1)) #endif @@ -412,18 +392,12 @@ extern size_t cspace_max; extern size_t sspace_min; extern size_t sspace_max; #define small_maxclass sspace_max -#define medium_min PAGE_SIZE -extern size_t medium_max; -#define bin_maxclass medium_max -/* Spacing between medium size classes. 
*/ -extern size_t lg_mspace; -extern size_t mspace_mask; - -#define nlclasses ((chunksize - PAGE_SIZE) >> PAGE_SHIFT) +#define nlclasses (chunk_npages - arena_chunk_header_npages) #ifdef JEMALLOC_TCACHE -void arena_tcache_fill(arena_t *arena, tcache_bin_t *tbin, size_t binind +void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, + size_t binind # ifdef JEMALLOC_PROF , uint64_t prof_accumbytes # endif @@ -433,7 +407,7 @@ void arena_tcache_fill(arena_t *arena, tcache_bin_t *tbin, size_t binind void arena_prof_accum(arena_t *arena, uint64_t accumbytes); #endif void *arena_malloc_small(arena_t *arena, size_t size, bool zero); -void *arena_malloc_medium(arena_t *arena, size_t size, bool zero); +void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_malloc(size_t size, bool zero); void *arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size); @@ -484,7 +458,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) tcache_t *tcache; if ((tcache = tcache_get()) != NULL) - tcache_dalloc(tcache, ptr); + tcache_dalloc_small(tcache, ptr); else { #endif arena_run_t *run; @@ -506,8 +480,31 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) } #endif } else { +#ifdef JEMALLOC_TCACHE + size_t size = mapelm->bits & ~PAGE_MASK; + assert(((uintptr_t)ptr & PAGE_MASK) == 0); + if (size <= tcache_maxclass) { + tcache_t *tcache; + + if ((tcache = tcache_get()) != NULL) + tcache_dalloc_large(tcache, ptr, size); + else { + malloc_mutex_lock(&arena->lock); + arena_dalloc_large(arena, chunk, ptr); + malloc_mutex_unlock(&arena->lock); + } + } else { + malloc_mutex_lock(&arena->lock); + arena_dalloc_large(arena, chunk, ptr); + malloc_mutex_unlock(&arena->lock); + } +#else + assert(((uintptr_t)ptr & PAGE_MASK) == 0); + malloc_mutex_lock(&arena->lock); arena_dalloc_large(arena, chunk, ptr); + malloc_mutex_unlock(&arena->lock); +#endif } } #endif diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h index 9a39e14f..7bbf21e0 100644 --- a/jemalloc/include/jemalloc/internal/ctl.h +++ b/jemalloc/include/jemalloc/internal/ctl.h @@ -34,17 +34,12 @@ struct ctl_arena_stats_s { #ifdef JEMALLOC_STATS arena_stats_t astats; - /* Aggregate stats for small/medium size classes, based on bin stats. */ + /* Aggregate stats for small size classes, based on bin stats. */ size_t allocated_small; uint64_t nmalloc_small; uint64_t ndalloc_small; uint64_t nrequests_small; - size_t allocated_medium; - uint64_t nmalloc_medium; - uint64_t ndalloc_medium; - uint64_t nrequests_medium; - malloc_bin_stats_t *bstats; /* nbins elements. */ malloc_large_stats_t *lstats; /* nlclasses elements. */ #endif diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h index 47701e61..cbf035ff 100644 --- a/jemalloc/include/jemalloc/internal/stats.h +++ b/jemalloc/include/jemalloc/internal/stats.h @@ -35,6 +35,7 @@ struct malloc_bin_stats_s { * cached by tcache. */ size_t allocated; + /* * Total number of allocation/deallocation requests served directly by * the bin. Note that tcache may allocate an object, then recycle it @@ -77,7 +78,18 @@ struct malloc_bin_stats_s { struct malloc_large_stats_s { /* - * Number of allocation requests that corresponded to this size class. + * Total number of allocation/deallocation requests served directly by + * the arena. 
Note that tcache may allocate an object, then recycle it + * many times, resulting many increments to nrequests, but only one + * each to nmalloc and ndalloc. + */ + uint64_t nmalloc; + uint64_t ndalloc; + + /* + * Number of allocation requests that correspond to this size class. + * This includes requests served by tcache, though tcache only + * periodically merges into this counter. */ uint64_t nrequests; @@ -105,6 +117,7 @@ struct arena_stats_s { size_t allocated_large; uint64_t nmalloc_large; uint64_t ndalloc_large; + uint64_t nrequests_large; /* * One element for each possible size class, including sizes that diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index afad7098..96e9cf14 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -6,20 +6,27 @@ typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; /* - * Absolute maximum number of cache slots for each bin in the thread cache. - * This is an additional constraint beyond that imposed as: twice the number of - * regions per run for this size class. + * Absolute maximum number of cache slots for each small bin in the thread + * cache. This is an additional constraint beyond that imposed as: twice the + * number of regions per run for this size class. * * This constant must be an even number. */ -#define TCACHE_NSLOTS_MAX 200 - /* - * (1U << opt_lg_tcache_gc_sweep) is the approximate number of allocation - * events between full GC sweeps (-1: disabled). Integer rounding may cause - * the actual number to be slightly higher, since GC is performed - * incrementally. - */ -#define LG_TCACHE_GC_SWEEP_DEFAULT 13 +#define TCACHE_NSLOTS_SMALL_MAX 200 + +/* Number of cache slots for large size classes. */ +#define TCACHE_NSLOTS_LARGE 20 + +/* (1U << opt_lg_tcache_maxclass) is used to compute tcache_maxclass. */ +#define LG_TCACHE_MAXCLASS_DEFAULT 15 + +/* + * (1U << opt_lg_tcache_gc_sweep) is the approximate number of allocation + * events between full GC sweeps (-1: disabled). Integer rounding may cause + * the actual number to be slightly higher, since GC is performed + * incrementally. + */ +#define LG_TCACHE_GC_SWEEP_DEFAULT 13 #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ @@ -54,22 +61,38 @@ struct tcache_s { #ifdef JEMALLOC_H_EXTERNS extern bool opt_tcache; +extern ssize_t opt_lg_tcache_maxclass; extern ssize_t opt_lg_tcache_gc_sweep; /* Map of thread-specific caches. */ extern __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); +/* + * Number of tcache bins. There are nbins small-object bins, plus 0 or more + * large-object bins. + */ +extern size_t nhbins; + +/* Maximum cached size class. */ +extern size_t tcache_maxclass; + /* Number of tcache allocation/deallocation events between incremental GCs. 
*/ extern unsigned tcache_gc_incr; -void tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem +void tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem +#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) + , tcache_t *tcache +#endif + ); +void tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache_t *tcache #endif ); tcache_t *tcache_create(arena_t *arena); -void *tcache_alloc_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind); +void *tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, + size_t binind); void tcache_destroy(tcache_t *tcache); #ifdef JEMALLOC_STATS void tcache_stats_merge(tcache_t *tcache, arena_t *arena); @@ -83,9 +106,11 @@ void tcache_boot(void); #ifndef JEMALLOC_ENABLE_INLINE void tcache_event(tcache_t *tcache); tcache_t *tcache_get(void); -void *tcache_bin_alloc(tcache_bin_t *tbin); -void *tcache_alloc(tcache_t *tcache, size_t size, bool zero); -void tcache_dalloc(tcache_t *tcache, void *ptr); +void *tcache_alloc_easy(tcache_bin_t *tbin); +void *tcache_alloc_small(tcache_t *tcache, size_t size, bool zero); +void *tcache_alloc_large(tcache_t *tcache, size_t size, bool zero); +void tcache_dalloc_small(tcache_t *tcache, void *ptr); +void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_)) @@ -128,25 +153,36 @@ tcache_event(tcache_t *tcache) * Flush (ceiling) 3/4 of the objects below the low * water mark. */ - tcache_bin_flush(tbin, binind, tbin->ncached - - tbin->low_water + (tbin->low_water >> 2) + if (binind < nbins) { + tcache_bin_flush_small(tbin, binind, + tbin->ncached - tbin->low_water + + (tbin->low_water >> 2) #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) - , tcache + , tcache #endif - ); + ); + } else { + tcache_bin_flush_large(tbin, binind, + tbin->ncached - tbin->low_water + + (tbin->low_water >> 2) +#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) + , tcache +#endif + ); + } } tbin->low_water = tbin->ncached; tbin->high_water = tbin->ncached; tcache->next_gc_bin++; - if (tcache->next_gc_bin == nbins) + if (tcache->next_gc_bin == nhbins) tcache->next_gc_bin = 0; tcache->ev_cnt = 0; } } JEMALLOC_INLINE void * -tcache_bin_alloc(tcache_bin_t *tbin) +tcache_alloc_easy(tcache_bin_t *tbin) { void *ret; @@ -161,23 +197,18 @@ tcache_bin_alloc(tcache_bin_t *tbin) } JEMALLOC_INLINE void * -tcache_alloc(tcache_t *tcache, size_t size, bool zero) +tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) { void *ret; size_t binind; tcache_bin_t *tbin; - if (size <= small_maxclass) - binind = small_size2bin[size]; - else { - binind = mbin0 + ((MEDIUM_CEILING(size) - medium_min) >> - lg_mspace); - } + binind = small_size2bin[size]; assert(binind < nbins); tbin = &tcache->tbins[binind]; - ret = tcache_bin_alloc(tbin); + ret = tcache_alloc_easy(tbin); if (ret == NULL) { - ret = tcache_alloc_hard(tcache, tbin, binind); + ret = tcache_alloc_small_hard(tcache, tbin, binind); if (ret == NULL) return (NULL); } @@ -203,8 +234,52 @@ tcache_alloc(tcache_t *tcache, size_t size, bool zero) return (ret); } +JEMALLOC_INLINE void * +tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) +{ + void *ret; + size_t binind; + tcache_bin_t *tbin; + + size = PAGE_CEILING(size); + assert(size <= tcache_maxclass); + binind = nbins + (size >> PAGE_SHIFT) - 1; + assert(binind < nhbins); + tbin = &tcache->tbins[binind]; + ret = tcache_alloc_easy(tbin); + if 
(ret == NULL) { + /* + * Only allocate one large object at a time, because it's quite + * expensive to create one and not use it. + */ + ret = arena_malloc_large(tcache->arena, size, zero); + if (ret == NULL) + return (NULL); + } else { + if (zero == false) { +#ifdef JEMALLOC_FILL + if (opt_junk) + memset(ret, 0xa5, size); + else if (opt_zero) + memset(ret, 0, size); +#endif + } else + memset(ret, 0, size); + +#ifdef JEMALLOC_STATS + tbin->tstats.nrequests++; +#endif +#ifdef JEMALLOC_PROF + tcache->prof_accumbytes += size; +#endif + } + + tcache_event(tcache); + return (ret); +} + JEMALLOC_INLINE void -tcache_dalloc(tcache_t *tcache, void *ptr) +tcache_dalloc_small(tcache_t *tcache, void *ptr) { arena_t *arena; arena_chunk_t *chunk; @@ -234,7 +309,47 @@ tcache_dalloc(tcache_t *tcache, void *ptr) tbin = &tcache->tbins[binind]; if (tbin->ncached == tbin->ncached_max) { - tcache_bin_flush(tbin, binind, (tbin->ncached_max >> 1) + tcache_bin_flush_small(tbin, binind, (tbin->ncached_max >> 1) +#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) + , tcache +#endif + ); + } + assert(tbin->ncached < tbin->ncached_max); + *(void **)ptr = tbin->avail; + tbin->avail = ptr; + tbin->ncached++; + if (tbin->ncached > tbin->high_water) + tbin->high_water = tbin->ncached; + + tcache_event(tcache); +} + +JEMALLOC_INLINE void +tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) +{ + arena_t *arena; + arena_chunk_t *chunk; + size_t pageind, binind; + tcache_bin_t *tbin; + arena_chunk_map_t *mapelm; + + assert((size & PAGE_MASK) == 0); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + arena = chunk->arena; + pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); + mapelm = &chunk->map[pageind]; + binind = nbins + (size >> PAGE_SHIFT) - 1; + +#ifdef JEMALLOC_FILL + if (opt_junk) + memset(ptr, 0x5a, bin->reg_size); +#endif + + tbin = &tcache->tbins[binind]; + if (tbin->ncached == tbin->ncached_max) { + tcache_bin_flush_large(tbin, binind, (tbin->ncached_max >> 1) #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache #endif diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 4b4ea7d0..ddf960d5 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -61,9 +61,9 @@ #undef JEMALLOC_TINY /* - * JEMALLOC_TCACHE enables a thread-specific caching layer for small and medium - * objects. This makes it possible to allocate/deallocate objects without any - * locking when the cache is in the steady state. + * JEMALLOC_TCACHE enables a thread-specific caching layer for small objects. + * This makes it possible to allocate/deallocate objects without any locking + * when the cache is in the steady state. 
*/ #undef JEMALLOC_TCACHE diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index a3d66545..435cf695 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -6,7 +6,6 @@ size_t opt_lg_qspace_max = LG_QSPACE_MAX_DEFAULT; size_t opt_lg_cspace_max = LG_CSPACE_MAX_DEFAULT; -size_t opt_lg_medium_max = LG_MEDIUM_MAX_DEFAULT; ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; uint8_t const *small_size2bin; @@ -14,15 +13,12 @@ uint8_t const *small_size2bin; unsigned nqbins; unsigned ncbins; unsigned nsbins; -unsigned nmbins; unsigned nbins; -unsigned mbin0; size_t qspace_max; size_t cspace_min; size_t cspace_max; size_t sspace_min; size_t sspace_max; -size_t medium_max; size_t lg_mspace; size_t mspace_mask; @@ -178,8 +174,6 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); -static void *arena_malloc_large(arena_t *arena, size_t size, bool zero); -static bool arena_is_large(const void *ptr); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, @@ -1077,7 +1071,7 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes) #ifdef JEMALLOC_TCACHE void -arena_tcache_fill(arena_t *arena, tcache_bin_t *tbin, size_t binind +arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind # ifdef JEMALLOC_PROF , uint64_t prof_accumbytes # endif @@ -1239,7 +1233,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) size_t binind; binind = small_size2bin[size]; - assert(binind < mbin0); + assert(binind < nbins); bin = &arena->bins[binind]; size = bin->reg_size; @@ -1282,58 +1276,6 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) } void * -arena_malloc_medium(arena_t *arena, size_t size, bool zero) -{ - void *ret; - arena_bin_t *bin; - arena_run_t *run; - size_t binind; - - size = MEDIUM_CEILING(size); - binind = mbin0 + ((size - medium_min) >> lg_mspace); - assert(binind < nbins); - bin = &arena->bins[binind]; - assert(bin->reg_size == size); - - malloc_mutex_lock(&bin->lock); - if ((run = bin->runcur) != NULL && run->nfree > 0) - ret = arena_run_reg_alloc(run, bin); - else - ret = arena_bin_malloc_hard(arena, bin); - - if (ret == NULL) { - malloc_mutex_unlock(&bin->lock); - return (NULL); - } - -#ifdef JEMALLOC_STATS - bin->stats.allocated += size; - bin->stats.nmalloc++; - bin->stats.nrequests++; -#endif - malloc_mutex_unlock(&bin->lock); -#ifdef JEMALLOC_PROF - if (isthreaded == false) { - malloc_mutex_lock(&arena->lock); - arena_prof_accum(arena, size); - malloc_mutex_unlock(&arena->lock); - } -#endif - - if (zero == false) { -#ifdef JEMALLOC_FILL - if (opt_junk) - memset(ret, 0xa5, size); - else if (opt_zero) - memset(ret, 0, size); -#endif - } else - memset(ret, 0, size); - - return (ret); -} - -static void * arena_malloc_large(arena_t *arena, size_t size, bool zero) { void *ret; @@ -1348,7 +1290,9 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) } #ifdef JEMALLOC_STATS arena->stats.nmalloc_large++; + arena->stats.nrequests_large++; arena->stats.allocated_large += size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; if (arena->stats.lstats[(size 
>> PAGE_SHIFT) - 1].curruns > @@ -1381,21 +1325,31 @@ arena_malloc(size_t size, bool zero) assert(size != 0); assert(QUANTUM_CEILING(size) <= arena_maxclass); - if (size <= bin_maxclass) { + if (size <= small_maxclass) { #ifdef JEMALLOC_TCACHE tcache_t *tcache; if ((tcache = tcache_get()) != NULL) - return (tcache_alloc(tcache, size, zero)); + return (tcache_alloc_small(tcache, size, zero)); + else + #endif - if (size <= small_maxclass) return (arena_malloc_small(choose_arena(), size, zero)); - else { - return (arena_malloc_medium(choose_arena(), size, - zero)); - } - } else - return (arena_malloc_large(choose_arena(), size, zero)); + } else { +#ifdef JEMALLOC_TCACHE + if (size <= tcache_maxclass) { + tcache_t *tcache; + + if ((tcache = tcache_get()) != NULL) + return (tcache_alloc_large(tcache, size, zero)); + else { + return (arena_malloc_large(choose_arena(), + size, zero)); + } + } else +#endif + return (arena_malloc_large(choose_arena(), size, zero)); + } } /* Only handles large allocations that require more than page alignment. */ @@ -1444,7 +1398,9 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) #ifdef JEMALLOC_STATS arena->stats.nmalloc_large++; + arena->stats.nrequests_large++; arena->stats.allocated_large += size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > @@ -1464,22 +1420,6 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) return (ret); } -static bool -arena_is_large(const void *ptr) -{ - arena_chunk_t *chunk; - size_t pageind, mapbits; - - assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - return ((mapbits & CHUNK_MAP_LARGE) != 0); -} - /* Return the size of the allocation pointed to by ptr. */ size_t arena_salloc(const void *ptr) @@ -1781,8 +1721,11 @@ arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty, astats->allocated_large += arena->stats.allocated_large; astats->nmalloc_large += arena->stats.nmalloc_large; astats->ndalloc_large += arena->stats.ndalloc_large; + astats->nrequests_large += arena->stats.nrequests_large; for (i = 0; i < nlclasses; i++) { + lstats[i].nmalloc += arena->stats.lstats[i].nmalloc; + lstats[i].ndalloc += arena->stats.lstats[i].ndalloc; lstats[i].nrequests += arena->stats.lstats[i].nrequests; lstats[i].highruns += arena->stats.lstats[i].highruns; lstats[i].curruns += arena->stats.lstats[i].curruns; @@ -1815,8 +1758,6 @@ arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) { /* Large allocation. 
*/ - malloc_mutex_lock(&arena->lock); - #ifdef JEMALLOC_FILL # ifndef JEMALLOC_STATS if (opt_junk) @@ -1838,12 +1779,12 @@ arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) #ifdef JEMALLOC_STATS arena->stats.ndalloc_large++; arena->stats.allocated_large -= size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].ndalloc++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns--; #endif } arena_run_dalloc(arena, (arena_run_t *)ptr, true); - malloc_mutex_unlock(&arena->lock); } static void @@ -1863,10 +1804,13 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, #ifdef JEMALLOC_STATS arena->stats.ndalloc_large++; arena->stats.allocated_large -= oldsize; + arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++; arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--; arena->stats.nmalloc_large++; + arena->stats.nrequests_large++; arena->stats.allocated_large += size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > @@ -1910,10 +1854,13 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, #ifdef JEMALLOC_STATS arena->stats.ndalloc_large++; arena->stats.allocated_large -= oldsize; + arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++; arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--; arena->stats.nmalloc_large++; + arena->stats.nrequests_large++; arena->stats.allocated_large += size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > @@ -1988,30 +1935,15 @@ arena_ralloc(void *ptr, size_t size, size_t oldsize) void *ret; size_t copysize; - /* - * Try to avoid moving the allocation. - * - * posix_memalign() can cause allocation of "large" objects that are - * smaller than bin_maxclass (in order to meet alignment requirements). - * Therefore, do not assume that (oldsize <= bin_maxclass) indicates - * ptr refers to a bin-allocated object. - */ + /* Try to avoid moving the allocation. */ if (oldsize <= arena_maxclass) { - if (arena_is_large(ptr) == false ) { - if (size <= small_maxclass) { - if (oldsize <= small_maxclass && - small_size2bin[size] == - small_size2bin[oldsize]) - goto IN_PLACE; - } else if (size <= bin_maxclass) { - if (small_maxclass < oldsize && oldsize <= - bin_maxclass && MEDIUM_CEILING(size) == - MEDIUM_CEILING(oldsize)) - goto IN_PLACE; - } + if (oldsize <= small_maxclass) { + if (size <= small_maxclass && small_size2bin[size] == + small_size2bin[oldsize]) + goto IN_PLACE; } else { assert(size <= arena_maxclass); - if (size > bin_maxclass) { + if (size > small_maxclass) { if (arena_ralloc_large(ptr, size, oldsize) == false) return (ptr); @@ -2019,23 +1951,6 @@ arena_ralloc(void *ptr, size_t size, size_t oldsize) } } - /* Try to avoid moving the allocation. 
*/ - if (size <= small_maxclass) { - if (oldsize <= small_maxclass && small_size2bin[size] == - small_size2bin[oldsize]) - goto IN_PLACE; - } else if (size <= bin_maxclass) { - if (small_maxclass < oldsize && oldsize <= bin_maxclass && - MEDIUM_CEILING(size) == MEDIUM_CEILING(oldsize)) - goto IN_PLACE; - } else { - if (bin_maxclass < oldsize && oldsize <= arena_maxclass) { - assert(size > bin_maxclass); - if (arena_ralloc_large(ptr, size, oldsize) == false) - return (ptr); - } - } - /* * If we get here, then size and oldsize are different enough that we * need to move the object. In that case, fall back to allocating new @@ -2074,13 +1989,12 @@ arena_new(arena_t *arena, unsigned ind) #ifdef JEMALLOC_STATS memset(&arena->stats, 0, sizeof(arena_stats_t)); - arena->stats.lstats = (malloc_large_stats_t *)base_alloc( - sizeof(malloc_large_stats_t) * ((chunksize - PAGE_SIZE) >> - PAGE_SHIFT)); + arena->stats.lstats = (malloc_large_stats_t *)base_alloc(nlclasses * + sizeof(malloc_large_stats_t)); if (arena->stats.lstats == NULL) return (true); - memset(arena->stats.lstats, 0, sizeof(malloc_large_stats_t) * - ((chunksize - PAGE_SIZE) >> PAGE_SHIFT)); + memset(arena->stats.lstats, 0, nlclasses * + sizeof(malloc_large_stats_t)); # ifdef JEMALLOC_TCACHE ql_new(&arena->tcache_ql); # endif @@ -2159,7 +2073,7 @@ arena_new(arena_t *arena, unsigned ind) } /* Subpage-spaced bins. */ - for (; i < ntbins + nqbins + ncbins + nsbins; i++) { + for (; i < nbins; i++) { bin = &arena->bins[i]; if (malloc_mutex_init(&bin->lock)) return (true); @@ -2171,24 +2085,6 @@ arena_new(arena_t *arena, unsigned ind) prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); -#ifdef JEMALLOC_STATS - memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); -#endif - } - - /* Medium bins. */ - for (; i < nbins; i++) { - bin = &arena->bins[i]; - if (malloc_mutex_init(&bin->lock)) - return (true); - bin->runcur = NULL; - arena_run_tree_new(&bin->runs); - - bin->reg_size = medium_min + ((i - (ntbins + nqbins + ncbins + - nsbins)) << lg_mspace); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2355,7 +2251,6 @@ arena_boot(void) sspace_min += SUBPAGE; assert(sspace_min < PAGE_SIZE); sspace_max = PAGE_SIZE - SUBPAGE; - medium_max = (1U << opt_lg_medium_max); #ifdef JEMALLOC_TINY assert(LG_QUANTUM >= LG_TINY_MIN); @@ -2364,23 +2259,8 @@ arena_boot(void) nqbins = qspace_max >> LG_QUANTUM; ncbins = ((cspace_max - cspace_min) >> LG_CACHELINE) + 1; nsbins = ((sspace_max - sspace_min) >> LG_SUBPAGE) + 1; + nbins = ntbins + nqbins + ncbins + nsbins; - /* - * Compute medium size class spacing and the number of medium size - * classes. Limit spacing to no more than pagesize, but if possible - * use the smallest spacing that does not exceed NMBINS_MAX medium size - * classes. - */ - lg_mspace = LG_SUBPAGE; - nmbins = ((medium_max - medium_min) >> lg_mspace) + 1; - while (lg_mspace < PAGE_SHIFT && nmbins > NMBINS_MAX) { - lg_mspace = lg_mspace + 1; - nmbins = ((medium_max - medium_min) >> lg_mspace) + 1; - } - mspace_mask = (1U << lg_mspace) - 1U; - - mbin0 = ntbins + nqbins + ncbins + nsbins; - nbins = mbin0 + nmbins; /* * The small_size2bin lookup table uses uint8_t to encode each bin * index, so we cannot support more than 256 small size classes. This @@ -2389,10 +2269,10 @@ arena_boot(void) * nonetheless we need to protect against this case in order to avoid * undefined behavior. 
*/ - if (mbin0 > 256) { + if (nbins > 256) { char line_buf[UMAX2S_BUFSIZE]; malloc_write(": Too many small size classes ("); - malloc_write(umax2s(mbin0, 10, line_buf)); + malloc_write(umax2s(nbins, 10, line_buf)); malloc_write(" > max 256)\n"); abort(); } diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 1644f73e..2249102f 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -84,7 +84,6 @@ CTL_PROTO(opt_prof_leak) CTL_PROTO(opt_stats_print) CTL_PROTO(opt_lg_qspace_max) CTL_PROTO(opt_lg_cspace_max) -CTL_PROTO(opt_lg_medium_max) CTL_PROTO(opt_lg_dirty_mult) CTL_PROTO(opt_lg_chunk) #ifdef JEMALLOC_SWAP @@ -102,7 +101,6 @@ CTL_PROTO(arenas_quantum) CTL_PROTO(arenas_cacheline) CTL_PROTO(arenas_subpage) CTL_PROTO(arenas_pagesize) -CTL_PROTO(arenas_medium) CTL_PROTO(arenas_chunksize) #ifdef JEMALLOC_TINY CTL_PROTO(arenas_tspace_min) @@ -114,14 +112,17 @@ CTL_PROTO(arenas_cspace_min) CTL_PROTO(arenas_cspace_max) CTL_PROTO(arenas_sspace_min) CTL_PROTO(arenas_sspace_max) -CTL_PROTO(arenas_medium_min) -CTL_PROTO(arenas_medium_max) +#ifdef JEMALLOC_TCACHE +CTL_PROTO(arenas_tcache_max) +#endif CTL_PROTO(arenas_ntbins) CTL_PROTO(arenas_nqbins) CTL_PROTO(arenas_ncbins) CTL_PROTO(arenas_nsbins) -CTL_PROTO(arenas_nmbins) CTL_PROTO(arenas_nbins) +#ifdef JEMALLOC_TCACHE +CTL_PROTO(arenas_nhbins) +#endif CTL_PROTO(arenas_nlruns) #ifdef JEMALLOC_PROF CTL_PROTO(prof_dump) @@ -138,13 +139,10 @@ CTL_PROTO(stats_arenas_i_small_allocated) CTL_PROTO(stats_arenas_i_small_nmalloc) CTL_PROTO(stats_arenas_i_small_ndalloc) CTL_PROTO(stats_arenas_i_small_nrequests) -CTL_PROTO(stats_arenas_i_medium_allocated) -CTL_PROTO(stats_arenas_i_medium_nmalloc) -CTL_PROTO(stats_arenas_i_medium_ndalloc) -CTL_PROTO(stats_arenas_i_medium_nrequests) CTL_PROTO(stats_arenas_i_large_allocated) CTL_PROTO(stats_arenas_i_large_nmalloc) CTL_PROTO(stats_arenas_i_large_ndalloc) +CTL_PROTO(stats_arenas_i_large_nrequests) CTL_PROTO(stats_arenas_i_bins_j_allocated) CTL_PROTO(stats_arenas_i_bins_j_nmalloc) CTL_PROTO(stats_arenas_i_bins_j_ndalloc) @@ -158,6 +156,8 @@ CTL_PROTO(stats_arenas_i_bins_j_nreruns) CTL_PROTO(stats_arenas_i_bins_j_highruns) CTL_PROTO(stats_arenas_i_bins_j_curruns) INDEX_PROTO(stats_arenas_i_bins_j) +CTL_PROTO(stats_arenas_i_lruns_j_nmalloc) +CTL_PROTO(stats_arenas_i_lruns_j_ndalloc) CTL_PROTO(stats_arenas_i_lruns_j_nrequests) CTL_PROTO(stats_arenas_i_lruns_j_highruns) CTL_PROTO(stats_arenas_i_lruns_j_curruns) @@ -255,7 +255,6 @@ static const ctl_node_t opt_node[] = { {NAME("stats_print"), CTL(opt_stats_print)}, {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, {NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)}, - {NAME("lg_medium_max"), CTL(opt_lg_medium_max)}, {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, {NAME("lg_chunk"), CTL(opt_lg_chunk)} #ifdef JEMALLOC_SWAP @@ -295,7 +294,6 @@ static const ctl_node_t arenas_node[] = { {NAME("cacheline"), CTL(arenas_cacheline)}, {NAME("subpage"), CTL(arenas_subpage)}, {NAME("pagesize"), CTL(arenas_pagesize)}, - {NAME("medium"), CTL(arenas_medium)}, {NAME("chunksize"), CTL(arenas_chunksize)}, #ifdef JEMALLOC_TINY {NAME("tspace_min"), CTL(arenas_tspace_min)}, @@ -307,14 +305,17 @@ static const ctl_node_t arenas_node[] = { {NAME("cspace_max"), CTL(arenas_cspace_max)}, {NAME("sspace_min"), CTL(arenas_sspace_min)}, {NAME("sspace_max"), CTL(arenas_sspace_max)}, - {NAME("medium_min"), CTL(arenas_medium_min)}, - {NAME("medium_max"), CTL(arenas_medium_max)}, +#ifdef JEMALLOC_TCACHE + {NAME("tcache_max"), CTL(arenas_tcache_max)}, +#endif {NAME("ntbins"), CTL(arenas_ntbins)}, 
{NAME("nqbins"), CTL(arenas_nqbins)}, {NAME("ncbins"), CTL(arenas_ncbins)}, {NAME("nsbins"), CTL(arenas_nsbins)}, - {NAME("nmbins"), CTL(arenas_nmbins)}, {NAME("nbins"), CTL(arenas_nbins)}, +#ifdef JEMALLOC_TCACHE + {NAME("nhbins"), CTL(arenas_nhbins)}, +#endif {NAME("bin"), CHILD(arenas_bin)}, {NAME("nlruns"), CTL(arenas_nlruns)}, {NAME("lrun"), CHILD(arenas_lrun)} @@ -347,17 +348,11 @@ static const ctl_node_t stats_arenas_i_small_node[] = { {NAME("nrequests"), CTL(stats_arenas_i_small_nrequests)} }; -static const ctl_node_t stats_arenas_i_medium_node[] = { - {NAME("allocated"), CTL(stats_arenas_i_medium_allocated)}, - {NAME("nmalloc"), CTL(stats_arenas_i_medium_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_medium_ndalloc)}, - {NAME("nrequests"), CTL(stats_arenas_i_medium_nrequests)} -}; - static const ctl_node_t stats_arenas_i_large_node[] = { {NAME("allocated"), CTL(stats_arenas_i_large_allocated)}, {NAME("nmalloc"), CTL(stats_arenas_i_large_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_large_ndalloc)} + {NAME("ndalloc"), CTL(stats_arenas_i_large_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)} }; static const ctl_node_t stats_arenas_i_bins_j_node[] = { @@ -383,6 +378,8 @@ static const ctl_node_t stats_arenas_i_bins_node[] = { }; static const ctl_node_t stats_arenas_i_lruns_j_node[] = { + {NAME("nmalloc"), CTL(stats_arenas_i_lruns_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_lruns_j_ndalloc)}, {NAME("nrequests"), CTL(stats_arenas_i_lruns_j_nrequests)}, {NAME("highruns"), CTL(stats_arenas_i_lruns_j_highruns)}, {NAME("curruns"), CTL(stats_arenas_i_lruns_j_curruns)} @@ -406,7 +403,6 @@ static const ctl_node_t stats_arenas_i_node[] = { {NAME("nmadvise"), CTL(stats_arenas_i_nmadvise)}, {NAME("purged"), CTL(stats_arenas_i_purged)}, {NAME("small"), CHILD(stats_arenas_i_small)}, - {NAME("medium"), CHILD(stats_arenas_i_medium)}, {NAME("large"), CHILD(stats_arenas_i_large)}, {NAME("bins"), CHILD(stats_arenas_i_bins)}, {NAME("lruns"), CHILD(stats_arenas_i_lruns)} @@ -505,10 +501,6 @@ ctl_arena_clear(ctl_arena_stats_t *astats) astats->nmalloc_small = 0; astats->ndalloc_small = 0; astats->nrequests_small = 0; - astats->allocated_medium = 0; - astats->nmalloc_medium = 0; - astats->ndalloc_medium = 0; - astats->nrequests_medium = 0; memset(astats->bstats, 0, nbins * sizeof(malloc_bin_stats_t)); memset(astats->lstats, 0, nlclasses * sizeof(malloc_large_stats_t)); #endif @@ -523,19 +515,12 @@ ctl_arena_stats_amerge(ctl_arena_stats_t *cstats, arena_t *arena) arena_stats_merge(arena, &cstats->pactive, &cstats->pdirty, &cstats->astats, cstats->bstats, cstats->lstats); - for (i = 0; i < mbin0; i++) { + for (i = 0; i < nbins; i++) { cstats->allocated_small += cstats->bstats[i].allocated; cstats->nmalloc_small += cstats->bstats[i].nmalloc; cstats->ndalloc_small += cstats->bstats[i].ndalloc; cstats->nrequests_small += cstats->bstats[i].nrequests; } - - for (; i < nbins; i++) { - cstats->allocated_medium += cstats->bstats[i].allocated; - cstats->nmalloc_medium += cstats->bstats[i].nmalloc; - cstats->ndalloc_medium += cstats->bstats[i].ndalloc; - cstats->nrequests_medium += cstats->bstats[i].nrequests; - } } static void @@ -556,16 +541,14 @@ ctl_arena_stats_smerge(ctl_arena_stats_t *sstats, ctl_arena_stats_t *astats) sstats->ndalloc_small += astats->ndalloc_small; sstats->nrequests_small += astats->nrequests_small; - sstats->allocated_medium += astats->allocated_medium; - sstats->nmalloc_medium += astats->nmalloc_medium; - sstats->ndalloc_medium += astats->ndalloc_medium; - 
sstats->nrequests_medium += astats->nrequests_medium; - sstats->astats.allocated_large += astats->astats.allocated_large; sstats->astats.nmalloc_large += astats->astats.nmalloc_large; sstats->astats.ndalloc_large += astats->astats.ndalloc_large; + sstats->astats.nrequests_large += astats->astats.nrequests_large; for (i = 0; i < nlclasses; i++) { + sstats->lstats[i].nmalloc += astats->lstats[i].nmalloc; + sstats->lstats[i].ndalloc += astats->lstats[i].ndalloc; sstats->lstats[i].nrequests += astats->lstats[i].nrequests; sstats->lstats[i].highruns += astats->lstats[i].highruns; sstats->lstats[i].curruns += astats->lstats[i].curruns; @@ -648,7 +631,6 @@ ctl_refresh(void) #ifdef JEMALLOC_STATS ctl_stats.allocated = ctl_stats.arenas[narenas].allocated_small - + ctl_stats.arenas[narenas].allocated_medium + ctl_stats.arenas[narenas].astats.allocated_large + ctl_stats.huge.allocated; ctl_stats.active = (ctl_stats.arenas[narenas].pactive << PAGE_SHIFT) @@ -1178,7 +1160,6 @@ CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t) -CTL_RO_GEN(opt_lg_medium_max, opt_lg_medium_max, size_t) CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t) #ifdef JEMALLOC_SWAP @@ -1239,7 +1220,6 @@ CTL_RO_GEN(arenas_quantum, QUANTUM, size_t) CTL_RO_GEN(arenas_cacheline, CACHELINE, size_t) CTL_RO_GEN(arenas_subpage, SUBPAGE, size_t) CTL_RO_GEN(arenas_pagesize, PAGE_SIZE, size_t) -CTL_RO_GEN(arenas_medium, (1U << lg_mspace), size_t) CTL_RO_GEN(arenas_chunksize, chunksize, size_t) #ifdef JEMALLOC_TINY CTL_RO_GEN(arenas_tspace_min, (1U << LG_TINY_MIN), size_t) @@ -1251,14 +1231,17 @@ CTL_RO_GEN(arenas_cspace_min, cspace_min, size_t) CTL_RO_GEN(arenas_cspace_max, cspace_max, size_t) CTL_RO_GEN(arenas_sspace_min, sspace_min, size_t) CTL_RO_GEN(arenas_sspace_max, sspace_max, size_t) -CTL_RO_GEN(arenas_medium_min, medium_min, size_t) -CTL_RO_GEN(arenas_medium_max, medium_max, size_t) +#ifdef JEMALLOC_TCACHE +CTL_RO_GEN(arenas_tcache_max, tcache_maxclass, size_t) +#endif CTL_RO_GEN(arenas_ntbins, ntbins, unsigned) CTL_RO_GEN(arenas_nqbins, nqbins, unsigned) CTL_RO_GEN(arenas_ncbins, ncbins, unsigned) CTL_RO_GEN(arenas_nsbins, nsbins, unsigned) -CTL_RO_GEN(arenas_nmbins, nmbins, unsigned) CTL_RO_GEN(arenas_nbins, nbins, unsigned) +#ifdef JEMALLOC_TCACHE +CTL_RO_GEN(arenas_nhbins, nhbins, unsigned) +#endif CTL_RO_GEN(arenas_nlruns, nlclasses, size_t) /******************************************************************************/ @@ -1304,20 +1287,14 @@ CTL_RO_GEN(stats_arenas_i_small_ndalloc, ctl_stats.arenas[mib[2]].ndalloc_small, uint64_t) CTL_RO_GEN(stats_arenas_i_small_nrequests, ctl_stats.arenas[mib[2]].nrequests_small, uint64_t) -CTL_RO_GEN(stats_arenas_i_medium_allocated, - ctl_stats.arenas[mib[2]].allocated_medium, size_t) -CTL_RO_GEN(stats_arenas_i_medium_nmalloc, - ctl_stats.arenas[mib[2]].nmalloc_medium, uint64_t) -CTL_RO_GEN(stats_arenas_i_medium_ndalloc, - ctl_stats.arenas[mib[2]].ndalloc_medium, uint64_t) -CTL_RO_GEN(stats_arenas_i_medium_nrequests, - ctl_stats.arenas[mib[2]].nrequests_medium, uint64_t) CTL_RO_GEN(stats_arenas_i_large_allocated, ctl_stats.arenas[mib[2]].astats.allocated_large, size_t) CTL_RO_GEN(stats_arenas_i_large_nmalloc, ctl_stats.arenas[mib[2]].astats.nmalloc_large, uint64_t) CTL_RO_GEN(stats_arenas_i_large_ndalloc, ctl_stats.arenas[mib[2]].astats.ndalloc_large, uint64_t) 
+CTL_RO_GEN(stats_arenas_i_large_nrequests, + ctl_stats.arenas[mib[2]].astats.nrequests_large, uint64_t) CTL_RO_GEN(stats_arenas_i_bins_j_allocated, ctl_stats.arenas[mib[2]].bstats[mib[4]].allocated, size_t) @@ -1351,6 +1328,10 @@ stats_arenas_i_bins_j_index(const size_t *mib, size_t miblen, size_t j) return (super_stats_arenas_i_bins_j_node); } +CTL_RO_GEN(stats_arenas_i_lruns_j_nmalloc, + ctl_stats.arenas[mib[2]].lstats[mib[4]].nmalloc, uint64_t) +CTL_RO_GEN(stats_arenas_i_lruns_j_ndalloc, + ctl_stats.arenas[mib[2]].lstats[mib[4]].ndalloc, uint64_t) CTL_RO_GEN(stats_arenas_i_lruns_j_nrequests, ctl_stats.arenas[mib[2]].lstats[mib[4]].nrequests, uint64_t) CTL_RO_GEN(stats_arenas_i_lruns_j_curruns, diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 49c2b0b0..d880769d 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -52,17 +52,9 @@ * | | | 3584 | * | | | 3840 | * |========================================| - * | Medium | 4 KiB | - * | | 6 KiB | + * | Large | 4 KiB | * | | 8 KiB | - * | | ... | - * | | 28 KiB | - * | | 30 KiB | - * | | 32 KiB | - * |========================================| - * | Large | 36 KiB | - * | | 40 KiB | - * | | 44 KiB | + * | | 12 KiB | * | | ... | * | | 1012 KiB | * | | 1016 KiB | @@ -76,9 +68,8 @@ * * Different mechanisms are used accoding to category: * - * Small/medium : Each size class is segregated into its own set of runs. - * Each run maintains a bitmap of which regions are - * free/allocated. + * Small: Each size class is segregated into its own set of runs. Each run + * maintains a bitmap of which regions are free/allocated. * * Large : Each allocation is backed by a dedicated run. Metadata are stored * in the associated arena chunk header maps. @@ -252,6 +243,11 @@ stats_print_atexit(void) if (arena != NULL) { tcache_t *tcache; + /* + * tcache_stats_merge() locks bins, so if any code is + * introduced that acquires both arena and bin locks in + * the opposite order, deadlocks may result. + */ malloc_mutex_lock(&arena->lock); ql_foreach(tcache, &arena->tcache_ql, link) { tcache_stats_merge(tcache, arena); @@ -510,14 +506,10 @@ MALLOC_OUT: case 'k': /* * Chunks always require at least one - * header page, plus enough room to - * hold a run for the largest medium - * size class (one page more than the - * size). + * header page, plus one data page. */ if ((1U << (opt_lg_chunk - 1)) >= - (2U << PAGE_SHIFT) + (1U << - opt_lg_medium_max)) + (2U << PAGE_SHIFT)) opt_lg_chunk--; break; case 'K': @@ -533,15 +525,17 @@ MALLOC_OUT: opt_prof_leak = true; break; #endif +#ifdef JEMALLOC_TCACHE case 'm': - if (opt_lg_medium_max > PAGE_SHIFT) - opt_lg_medium_max--; + if (opt_lg_tcache_maxclass >= 0) + opt_lg_tcache_maxclass--; break; case 'M': - if (opt_lg_medium_max + 1 < - opt_lg_chunk) - opt_lg_medium_max++; + if (opt_lg_tcache_maxclass + 1 < + (sizeof(size_t) << 3)) + opt_lg_tcache_maxclass++; break; +#endif case 'n': opt_narenas_lshift--; break; @@ -725,33 +719,7 @@ MALLOC_OUT: * For SMP systems, create more than one arena per CPU by * default. */ -#ifdef JEMALLOC_TCACHE - if (opt_tcache -# ifdef JEMALLOC_PROF - /* - * Profile data storage concurrency is directly linked to - * the number of arenas, so only drop the number of arenas - * on behalf of enabled tcache if profiling is disabled. - */ - && opt_prof == false -# endif - ) { - /* - * Only large object allocation/deallocation is - * guaranteed to acquire an arena mutex, so we can get - * away with fewer arenas than without thread caching. 
- */ - opt_narenas_lshift += 1; - } else { -#endif - /* - * All allocations must acquire an arena mutex, so use - * plenty of arenas. - */ - opt_narenas_lshift += 2; -#ifdef JEMALLOC_TCACHE - } -#endif + opt_narenas_lshift += 2; } /* Determine how many arenas to use. */ diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index c57db47b..a5ec1f1f 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -234,8 +234,7 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, j, j < ntbins_ ? "T" : j < ntbins_ + nqbins ? "Q" : j < ntbins_ + nqbins + ncbins ? "C" : - j < ntbins_ + nqbins + ncbins + nsbins ? "S" - : "M", + "S", reg_size, nregs, run_size / pagesize, allocated, nmalloc, ndalloc, nrequests, nfills, nflushes, nruns, reruns, highruns, @@ -248,8 +247,7 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, j, j < ntbins_ ? "T" : j < ntbins_ + nqbins ? "Q" : j < ntbins_ + nqbins + ncbins ? "C" : - j < ntbins_ + nqbins + ncbins + nsbins ? "S" - : "M", + "S", reg_size, nregs, run_size / pagesize, allocated, nmalloc, ndalloc, nruns, reruns, highruns, curruns); @@ -278,12 +276,17 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.pagesize", &pagesize, size_t); malloc_cprintf(write_cb, cbopaque, - "large: size pages nrequests maxruns curruns\n"); + "large: size pages nmalloc ndalloc nrequests" + " maxruns curruns\n"); CTL_GET("arenas.nlruns", &nlruns, size_t); for (j = 0, gap_start = -1; j < nlruns; j++) { - uint64_t nrequests; + uint64_t nmalloc, ndalloc, nrequests; size_t run_size, highruns, curruns; + CTL_IJ_GET("stats.arenas.0.lruns.0.nmalloc", &nmalloc, + uint64_t); + CTL_IJ_GET("stats.arenas.0.lruns.0.ndalloc", &ndalloc, + uint64_t); CTL_IJ_GET("stats.arenas.0.lruns.0.nrequests", &nrequests, uint64_t); if (nrequests == 0) { @@ -301,9 +304,10 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, gap_start = -1; } malloc_cprintf(write_cb, cbopaque, - "%13zu %5zu %12"PRIu64" %12zu %12zu\n", - run_size, run_size / pagesize, nrequests, highruns, - curruns); + "%13zu %5zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + " %12zu %12zu\n", + run_size, run_size / pagesize, nmalloc, ndalloc, + nrequests, highruns, curruns); } } if (gap_start != -1) @@ -318,10 +322,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, uint64_t npurge, nmadvise, purged; size_t small_allocated; uint64_t small_nmalloc, small_ndalloc, small_nrequests; - size_t medium_allocated; - uint64_t medium_nmalloc, medium_ndalloc, medium_nrequests; size_t large_allocated; - uint64_t large_nmalloc, large_ndalloc; + uint64_t large_nmalloc, large_ndalloc, large_nrequests; CTL_GET("arenas.pagesize", &pagesize, size_t); @@ -345,26 +347,19 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", small_allocated, small_nmalloc, small_ndalloc, small_nrequests); - CTL_I_GET("stats.arenas.0.medium.allocated", &medium_allocated, size_t); - CTL_I_GET("stats.arenas.0.medium.nmalloc", &medium_nmalloc, uint64_t); - CTL_I_GET("stats.arenas.0.medium.ndalloc", &medium_ndalloc, uint64_t); - CTL_I_GET("stats.arenas.0.medium.nrequests", &medium_nrequests, - uint64_t); - malloc_cprintf(write_cb, cbopaque, - "medium: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", - medium_allocated, medium_nmalloc, medium_ndalloc, medium_nrequests); CTL_I_GET("stats.arenas.0.large.allocated", &large_allocated, 
size_t); CTL_I_GET("stats.arenas.0.large.nmalloc", &large_nmalloc, uint64_t); CTL_I_GET("stats.arenas.0.large.ndalloc", &large_ndalloc, uint64_t); + CTL_I_GET("stats.arenas.0.large.nrequests", &large_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, "large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", - large_allocated, large_nmalloc, large_ndalloc, large_nmalloc); + large_allocated, large_nmalloc, large_ndalloc, large_nrequests); malloc_cprintf(write_cb, cbopaque, "total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", - small_allocated + medium_allocated + large_allocated, - small_nmalloc + medium_nmalloc + large_nmalloc, - small_ndalloc + medium_ndalloc + large_ndalloc, - small_nrequests + medium_nrequests + large_nmalloc); + small_allocated + large_allocated, + small_nmalloc + large_nmalloc, + small_ndalloc + large_ndalloc, + small_nrequests + large_nrequests); malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", pactive * pagesize ); CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t); @@ -511,11 +506,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, umax2s(sv, 10, s)); write_cb(cbopaque, "\n"); - CTL_GET("arenas.medium", &sv, size_t); - write_cb(cbopaque, "Medium spacing: "); - write_cb(cbopaque, umax2s(sv, 10, s)); - write_cb(cbopaque, "\n"); - if ((err = JEMALLOC_P(mallctl)("arenas.tspace_min", &sv, &ssz, NULL, 0)) == 0) { write_cb(cbopaque, "Tiny 2^n-spaced sizes: ["); @@ -551,14 +541,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, umax2s(sv, 10, s)); write_cb(cbopaque, "]\n"); - CTL_GET("arenas.medium_min", &sv, size_t); - write_cb(cbopaque, "Medium sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); - write_cb(cbopaque, ".."); - CTL_GET("arenas.medium_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); - write_cb(cbopaque, "]\n"); - CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t); if (ssv >= 0) { write_cb(cbopaque, @@ -569,6 +551,13 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, "Min active:dirty page ratio per arena: N/A\n"); } + if ((err = JEMALLOC_P(mallctl)("arenas.tcache_max", &sv, + &ssz, NULL, 0)) == 0) { + write_cb(cbopaque, + "Maximum thread-cached size class: "); + write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, "\n"); + } if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv, &ssz, NULL, 0)) == 0) { size_t tcache_gc_sweep = (1U << ssv); @@ -705,7 +694,8 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, for (i = 0; i < narenas; i++) { if (initialized[i]) { - malloc_cprintf(write_cb, cbopaque, + malloc_cprintf(write_cb, + cbopaque, "\narenas[%u]:\n", i); stats_arena_print(write_cb, cbopaque, i); diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index feba61b4..6113dec7 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -5,6 +5,7 @@ /* Data. */ bool opt_tcache = true; +ssize_t opt_lg_tcache_maxclass = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; /* Map of thread-specific caches. 
*/ @@ -16,6 +17,8 @@ __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); */ static pthread_key_t tcache_tsd; +size_t nhbins; +size_t tcache_maxclass; unsigned tcache_gc_incr; /******************************************************************************/ @@ -26,11 +29,11 @@ static void tcache_thread_cleanup(void *arg); /******************************************************************************/ void * -tcache_alloc_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind) +tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind) { void *ret; - arena_tcache_fill(tcache->arena, tbin, binind + arena_tcache_fill_small(tcache->arena, tbin, binind #ifdef JEMALLOC_PROF , tcache->prof_accumbytes #endif @@ -38,13 +41,13 @@ tcache_alloc_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind) #ifdef JEMALLOC_PROF tcache->prof_accumbytes = 0; #endif - ret = tcache_bin_alloc(tbin); + ret = tcache_alloc_easy(tbin); return (ret); } void -tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem +tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache_t *tcache #endif @@ -53,6 +56,7 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem void *flush, *deferred, *ptr; unsigned i, nflush, ndeferred; + assert(binind < nbins); assert(rem <= tbin->ncached); for (flush = tbin->avail, nflush = tbin->ncached - rem; flush != NULL; @@ -120,6 +124,79 @@ tcache_bin_flush(tcache_bin_t *tbin, size_t binind, unsigned rem tbin->low_water = tbin->ncached; } +void +tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem +#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) + , tcache_t *tcache +#endif + ) +{ + void *flush, *deferred, *ptr; + unsigned i, nflush, ndeferred; + + assert(binind < nhbins); + assert(rem <= tbin->ncached); + + for (flush = tbin->avail, nflush = tbin->ncached - rem; flush != NULL; + flush = deferred, nflush = ndeferred) { + /* Lock the arena associated with the first object. */ + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_t *arena = chunk->arena; + + malloc_mutex_lock(&arena->lock); +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + if (arena == tcache->arena) { +#endif +#ifdef JEMALLOC_PROF + arena_prof_accum(arena, tcache->prof_accumbytes); + tcache->prof_accumbytes = 0; +#endif +#ifdef JEMALLOC_STATS + arena->stats.nrequests_large += tbin->tstats.nrequests; + arena->stats.lstats[binind - nbins].nrequests += + tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; +#endif +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + } +#endif + deferred = NULL; + ndeferred = 0; + for (i = 0; i < nflush; i++) { + ptr = flush; + assert(ptr != NULL); + flush = *(void **)ptr; + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk->arena == arena) + arena_dalloc_large(arena, chunk, ptr); + else { + /* + * This object was allocated via a different + * arena than the one that is currently locked. + * Stash the object, so that it can be handled + * in a future pass. + */ + *(void **)ptr = deferred; + deferred = ptr; + ndeferred++; + } + } + malloc_mutex_unlock(&arena->lock); + + if (flush != NULL) { + /* + * This was the first pass, and rem cached objects + * remain. 
+ */ + tbin->avail = flush; + } + } + + tbin->ncached = rem; + if (tbin->ncached < tbin->low_water) + tbin->low_water = tbin->ncached; +} + tcache_t * tcache_create(arena_t *arena) { @@ -127,7 +204,7 @@ tcache_create(arena_t *arena) size_t size; unsigned i; - size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nbins - 1)); + size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nhbins - 1)); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. @@ -138,8 +215,6 @@ tcache_create(arena_t *arena) if (size <= small_maxclass) tcache = (tcache_t *)arena_malloc_small(arena, size, true); - else if (size <= bin_maxclass) - tcache = (tcache_t *)arena_malloc_medium(arena, size, true); else tcache = (tcache_t *)icalloc(size); @@ -155,14 +230,16 @@ tcache_create(arena_t *arena) #endif tcache->arena = arena; - assert((TCACHE_NSLOTS_MAX & 1U) == 0); + assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nbins; i++) { - if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_MAX) { + if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { tcache->tbins[i].ncached_max = (arena->bins[i].nregs << 1); } else - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_MAX; + tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; } + for (; i < nhbins; i++) + tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; tcache_tls = tcache; pthread_setspecific(tcache_tsd, tcache); @@ -185,7 +262,7 @@ tcache_destroy(tcache_t *tcache) for (i = 0; i < nbins; i++) { tcache_bin_t *tbin = &tcache->tbins[i]; - tcache_bin_flush(tbin, i, 0 + tcache_bin_flush_small(tbin, i, 0 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache #endif @@ -202,6 +279,26 @@ tcache_destroy(tcache_t *tcache) #endif } + for (; i < nhbins; i++) { + tcache_bin_t *tbin = &tcache->tbins[i]; + tcache_bin_flush_large(tbin, i, 0 +#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) + , tcache +#endif + ); + +#ifdef JEMALLOC_STATS + if (tbin->tstats.nrequests != 0) { + arena_t *arena = tcache->arena; + malloc_mutex_lock(&arena->lock); + arena->stats.nrequests_large += tbin->tstats.nrequests; + arena->stats.lstats[i - nbins].nrequests += + tbin->tstats.nrequests; + malloc_mutex_unlock(&arena->lock); + } +#endif + } + #ifdef JEMALLOC_PROF if (tcache->prof_accumbytes > 0) { malloc_mutex_lock(&tcache->arena->lock); @@ -210,7 +307,7 @@ tcache_destroy(tcache_t *tcache) } #endif - if (arena_salloc(tcache) <= bin_maxclass) { + if (arena_salloc(tcache) <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; size_t pageind = (((uintptr_t)tcache - (uintptr_t)chunk) >> @@ -256,6 +353,14 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) malloc_mutex_unlock(&bin->lock); tbin->tstats.nrequests = 0; } + + for (; i < nhbins; i++) { + malloc_large_stats_t *lstats = &arena->stats.lstats[i - nbins]; + tcache_bin_t *tbin = &tcache->tbins[i]; + arena->stats.nrequests_large += tbin->tstats.nrequests; + lstats->nrequests += tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; + } } #endif @@ -264,6 +369,20 @@ tcache_boot(void) { if (opt_tcache) { + /* + * If necessary, clamp opt_lg_tcache_maxclass, now that + * small_maxclass and arena_maxclass are known. 
+ */ + if (opt_lg_tcache_maxclass < 0 || (1U << + opt_lg_tcache_maxclass) < small_maxclass) + tcache_maxclass = small_maxclass; + else if ((1U << opt_lg_tcache_maxclass) > arena_maxclass) + tcache_maxclass = arena_maxclass; + else + tcache_maxclass = (1U << opt_lg_tcache_maxclass); + + nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); + /* Compute incremental GC event threshold. */ if (opt_lg_tcache_gc_sweep >= 0) { tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) /