2010-01-17 01:53:50 +08:00
|
|
|
#define JEMALLOC_TCACHE_C_
|
2010-02-12 06:45:59 +08:00
|
|
|
#include "jemalloc/internal/jemalloc_internal.h"
|
2010-01-17 01:53:50 +08:00
|
|
|
#ifdef JEMALLOC_TCACHE
|
|
|
|
/******************************************************************************/
|
|
|
|
/* Data. */
|
|
|
|
|
2010-03-08 07:34:14 +08:00
|
|
|
bool opt_tcache = true;
|
2010-10-24 09:37:06 +08:00
|
|
|
ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
|
2010-01-17 01:53:50 +08:00
|
|
|
ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT;
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tcache_bin_info_t *tcache_bin_info;
|
|
|
|
static unsigned stack_nelms; /* Total stack elms per tcache. */
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
/* Map of thread-specific caches. */
|
2010-09-06 01:35:13 +08:00
|
|
|
#ifndef NO_TLS
|
2010-01-17 01:53:50 +08:00
|
|
|
__thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec"));
|
2010-09-06 01:35:13 +08:00
|
|
|
#endif
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Same contents as tcache, but initialized such that the TSD destructor is
|
|
|
|
* called when a thread exits, so that the cache can be cleaned up.
|
|
|
|
*/
|
2010-09-06 01:35:13 +08:00
|
|
|
pthread_key_t tcache_tsd;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
size_t nhbins;
|
|
|
|
size_t tcache_maxclass;
|
2010-01-17 01:53:50 +08:00
|
|
|
unsigned tcache_gc_incr;
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
/* Function prototypes for non-inline static functions. */
|
|
|
|
|
|
|
|
static void tcache_thread_cleanup(void *arg);
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
|
|
|
|
void *
|
2010-03-18 07:27:39 +08:00
|
|
|
tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
arena_tcache_fill_small(tcache->arena, tbin, binind
|
2010-02-12 05:19:21 +08:00
|
|
|
#ifdef JEMALLOC_PROF
|
|
|
|
, tcache->prof_accumbytes
|
|
|
|
#endif
|
|
|
|
);
|
|
|
|
#ifdef JEMALLOC_PROF
|
|
|
|
tcache->prof_accumbytes = 0;
|
|
|
|
#endif
|
2010-03-18 07:27:39 +08:00
|
|
|
ret = tcache_alloc_easy(tbin);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2010-03-18 07:27:39 +08:00
|
|
|
tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
|
2010-03-14 12:32:56 +08:00
|
|
|
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
|
2010-02-12 05:19:21 +08:00
|
|
|
, tcache_t *tcache
|
|
|
|
#endif
|
|
|
|
)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
void *ptr;
|
2010-03-08 07:34:14 +08:00
|
|
|
unsigned i, nflush, ndeferred;
|
2011-03-15 03:56:51 +08:00
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
bool merged_stats = false;
|
|
|
|
#endif
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
assert(binind < nbins);
|
2010-03-14 12:32:56 +08:00
|
|
|
assert(rem <= tbin->ncached);
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
|
2010-03-14 12:32:56 +08:00
|
|
|
/* Lock the arena bin associated with the first object. */
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
|
|
|
|
tbin->avail[0]);
|
2010-03-08 07:34:14 +08:00
|
|
|
arena_t *arena = chunk->arena;
|
2010-03-14 12:32:56 +08:00
|
|
|
arena_bin_t *bin = &arena->bins[binind];
|
|
|
|
|
2010-03-16 13:25:23 +08:00
|
|
|
#ifdef JEMALLOC_PROF
|
2010-02-12 05:19:21 +08:00
|
|
|
if (arena == tcache->arena) {
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_lock(&arena->lock);
|
2010-02-12 05:19:21 +08:00
|
|
|
arena_prof_accum(arena, tcache->prof_accumbytes);
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&arena->lock);
|
2010-02-12 05:19:21 +08:00
|
|
|
tcache->prof_accumbytes = 0;
|
2010-03-16 13:25:23 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
malloc_mutex_lock(&bin->lock);
|
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
if (arena == tcache->arena) {
|
2011-03-15 03:56:51 +08:00
|
|
|
assert(merged_stats == false);
|
|
|
|
merged_stats = true;
|
2010-03-14 12:32:56 +08:00
|
|
|
bin->stats.nflushes++;
|
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
2010-02-12 05:19:21 +08:00
|
|
|
}
|
|
|
|
#endif
|
2010-03-08 07:34:14 +08:00
|
|
|
ndeferred = 0;
|
|
|
|
for (i = 0; i < nflush; i++) {
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
ptr = tbin->avail[i];
|
2010-03-08 07:34:14 +08:00
|
|
|
assert(ptr != NULL);
|
2010-01-17 01:53:50 +08:00
|
|
|
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
|
|
|
|
if (chunk->arena == arena) {
|
2010-10-02 08:35:43 +08:00
|
|
|
size_t pageind = ((uintptr_t)ptr -
|
|
|
|
(uintptr_t)chunk) >> PAGE_SHIFT;
|
2010-01-17 01:53:50 +08:00
|
|
|
arena_chunk_map_t *mapelm =
|
2010-10-02 08:35:43 +08:00
|
|
|
&chunk->map[pageind-map_bias];
|
2010-01-17 01:53:50 +08:00
|
|
|
arena_dalloc_bin(arena, chunk, ptr, mapelm);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This object was allocated via a different
|
2010-03-14 12:32:56 +08:00
|
|
|
* arena bin than the one that is currently
|
|
|
|
* locked. Stash the object, so that it can be
|
|
|
|
* handled in a future pass.
|
2010-01-17 01:53:50 +08:00
|
|
|
*/
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tbin->avail[ndeferred] = ptr;
|
2010-01-17 01:53:50 +08:00
|
|
|
ndeferred++;
|
|
|
|
}
|
|
|
|
}
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
2011-03-15 03:56:51 +08:00
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
if (merged_stats == false) {
|
|
|
|
/*
|
|
|
|
* The flush loop didn't happen to flush to this thread's
|
|
|
|
* arena, so the stats didn't get merged. Manually do so now.
|
|
|
|
*/
|
|
|
|
arena_bin_t *bin = &tcache->arena->bins[binind];
|
|
|
|
malloc_mutex_lock(&bin->lock);
|
|
|
|
bin->stats.nflushes++;
|
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
malloc_mutex_unlock(&bin->lock);
|
|
|
|
}
|
|
|
|
#endif
|
2010-01-17 01:53:50 +08:00
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
|
|
|
|
rem * sizeof(void *));
|
2010-03-08 07:34:14 +08:00
|
|
|
tbin->ncached = rem;
|
2011-03-21 15:18:17 +08:00
|
|
|
if ((int)tbin->ncached < tbin->low_water)
|
2010-03-14 12:32:56 +08:00
|
|
|
tbin->low_water = tbin->ncached;
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
void
|
|
|
|
tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
|
|
|
|
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
|
|
|
|
, tcache_t *tcache
|
|
|
|
#endif
|
|
|
|
)
|
|
|
|
{
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
void *ptr;
|
2010-03-18 07:27:39 +08:00
|
|
|
unsigned i, nflush, ndeferred;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
bool merged_stats = false;
|
|
|
|
#endif
|
2010-03-18 07:27:39 +08:00
|
|
|
|
|
|
|
assert(binind < nhbins);
|
|
|
|
assert(rem <= tbin->ncached);
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
|
2010-03-18 07:27:39 +08:00
|
|
|
/* Lock the arena associated with the first object. */
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
|
|
|
|
tbin->avail[0]);
|
2010-03-18 07:27:39 +08:00
|
|
|
arena_t *arena = chunk->arena;
|
|
|
|
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
|
|
|
|
if (arena == tcache->arena) {
|
|
|
|
#endif
|
|
|
|
#ifdef JEMALLOC_PROF
|
|
|
|
arena_prof_accum(arena, tcache->prof_accumbytes);
|
|
|
|
tcache->prof_accumbytes = 0;
|
|
|
|
#endif
|
|
|
|
#ifdef JEMALLOC_STATS
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
merged_stats = true;
|
2010-03-18 07:27:39 +08:00
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
|
|
|
arena->stats.lstats[binind - nbins].nrequests +=
|
|
|
|
tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
#endif
|
|
|
|
#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
ndeferred = 0;
|
|
|
|
for (i = 0; i < nflush; i++) {
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
ptr = tbin->avail[i];
|
2010-03-18 07:27:39 +08:00
|
|
|
assert(ptr != NULL);
|
|
|
|
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
|
|
|
|
if (chunk->arena == arena)
|
|
|
|
arena_dalloc_large(arena, chunk, ptr);
|
|
|
|
else {
|
|
|
|
/*
|
|
|
|
* This object was allocated via a different
|
|
|
|
* arena than the one that is currently locked.
|
|
|
|
* Stash the object, so that it can be handled
|
|
|
|
* in a future pass.
|
|
|
|
*/
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tbin->avail[ndeferred] = ptr;
|
2010-03-18 07:27:39 +08:00
|
|
|
ndeferred++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
}
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
if (merged_stats == false) {
|
|
|
|
/*
|
|
|
|
* The flush loop didn't happen to flush to this thread's
|
|
|
|
* arena, so the stats didn't get merged. Manually do so now.
|
|
|
|
*/
|
|
|
|
arena_t *arena = tcache->arena;
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
|
|
|
arena->stats.lstats[binind - nbins].nrequests +=
|
|
|
|
tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
}
|
|
|
|
#endif
|
2010-03-18 07:27:39 +08:00
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
|
|
|
|
rem * sizeof(void *));
|
2010-03-18 07:27:39 +08:00
|
|
|
tbin->ncached = rem;
|
2011-03-21 15:18:17 +08:00
|
|
|
if ((int)tbin->ncached < tbin->low_water)
|
2010-03-18 07:27:39 +08:00
|
|
|
tbin->low_water = tbin->ncached;
|
|
|
|
}
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
tcache_t *
|
|
|
|
tcache_create(arena_t *arena)
|
|
|
|
{
|
|
|
|
tcache_t *tcache;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
size_t size, stack_offset;
|
2010-03-08 07:34:14 +08:00
|
|
|
unsigned i;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-10-02 09:02:43 +08:00
|
|
|
size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
/* Naturally align the pointer stacks. */
|
|
|
|
size = PTR_CEILING(size);
|
|
|
|
stack_offset = size;
|
|
|
|
size += stack_nelms * sizeof(void *);
|
2010-03-08 07:34:14 +08:00
|
|
|
/*
|
|
|
|
* Round up to the nearest multiple of the cacheline size, in order to
|
|
|
|
* avoid the possibility of false cacheline sharing.
|
|
|
|
*
|
Add {,r,s,d}allocm().
Add allocm(), rallocm(), sallocm(), and dallocm(), which are a
functional superset of malloc(), calloc(), posix_memalign(),
malloc_usable_size(), and free().
2010-09-18 06:46:18 +08:00
|
|
|
* That this works relies on the same logic as in ipalloc(), but we
|
|
|
|
* cannot directly call ipalloc() here due to tcache bootstrapping
|
|
|
|
* issues.
|
2010-03-08 07:34:14 +08:00
|
|
|
*/
|
|
|
|
size = (size + CACHELINE_MASK) & (-CACHELINE);
|
|
|
|
|
|
|
|
if (size <= small_maxclass)
|
|
|
|
tcache = (tcache_t *)arena_malloc_small(arena, size, true);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
else if (size <= tcache_maxclass)
|
|
|
|
tcache = (tcache_t *)arena_malloc_large(arena, size, true);
|
2010-03-08 07:34:14 +08:00
|
|
|
else
|
|
|
|
tcache = (tcache_t *)icalloc(size);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
if (tcache == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
/* Link into list of extant tcaches. */
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
ql_elm_new(tcache, link);
|
|
|
|
ql_tail_insert(&arena->tcache_ql, tcache, link);
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
tcache->arena = arena;
|
2010-03-18 07:27:39 +08:00
|
|
|
assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
for (i = 0; i < nhbins; i++) {
|
2011-03-21 15:18:17 +08:00
|
|
|
tcache->tbins[i].lg_fill_div = 1;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
|
|
|
|
(uintptr_t)stack_offset);
|
|
|
|
stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
|
2010-03-08 07:34:14 +08:00
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-09-06 01:35:13 +08:00
|
|
|
TCACHE_SET(tcache);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
return (tcache);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
tcache_destroy(tcache_t *tcache)
|
|
|
|
{
|
|
|
|
unsigned i;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
size_t tcache_size;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
/* Unlink from list of extant tcaches. */
|
|
|
|
malloc_mutex_lock(&tcache->arena->lock);
|
|
|
|
ql_remove(&tcache->arena->tcache_ql, tcache, link);
|
|
|
|
malloc_mutex_unlock(&tcache->arena->lock);
|
2010-03-14 12:32:56 +08:00
|
|
|
tcache_stats_merge(tcache, tcache->arena);
|
2010-01-17 01:53:50 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
for (i = 0; i < nbins; i++) {
|
2010-03-08 07:34:14 +08:00
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
2010-03-18 07:27:39 +08:00
|
|
|
tcache_bin_flush_small(tbin, i, 0
|
2010-03-14 12:32:56 +08:00
|
|
|
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
|
2010-03-08 07:34:14 +08:00
|
|
|
, tcache
|
2010-02-12 05:19:21 +08:00
|
|
|
#endif
|
2010-03-08 07:34:14 +08:00
|
|
|
);
|
|
|
|
|
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
if (tbin->tstats.nrequests != 0) {
|
|
|
|
arena_t *arena = tcache->arena;
|
|
|
|
arena_bin_t *bin = &arena->bins[i];
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_lock(&bin->lock);
|
2010-03-08 07:34:14 +08:00
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
2010-03-08 07:34:14 +08:00
|
|
|
#endif
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
for (; i < nhbins; i++) {
|
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
|
|
|
tcache_bin_flush_large(tbin, i, 0
|
|
|
|
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
|
|
|
|
, tcache
|
|
|
|
#endif
|
|
|
|
);
|
|
|
|
|
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
if (tbin->tstats.nrequests != 0) {
|
|
|
|
arena_t *arena = tcache->arena;
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
|
|
|
arena->stats.lstats[i - nbins].nrequests +=
|
|
|
|
tbin->tstats.nrequests;
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2010-02-12 05:19:21 +08:00
|
|
|
#ifdef JEMALLOC_PROF
|
|
|
|
if (tcache->prof_accumbytes > 0) {
|
|
|
|
malloc_mutex_lock(&tcache->arena->lock);
|
|
|
|
arena_prof_accum(tcache->arena, tcache->prof_accumbytes);
|
|
|
|
malloc_mutex_unlock(&tcache->arena->lock);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tcache_size = arena_salloc(tcache);
|
|
|
|
if (tcache_size <= small_maxclass) {
|
2010-01-17 01:53:50 +08:00
|
|
|
arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
|
|
|
|
arena_t *arena = chunk->arena;
|
2010-10-02 08:35:43 +08:00
|
|
|
size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
|
|
|
|
PAGE_SHIFT;
|
|
|
|
arena_chunk_map_t *mapelm = &chunk->map[pageind-map_bias];
|
2010-03-14 12:32:56 +08:00
|
|
|
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
|
2010-03-19 11:36:40 +08:00
|
|
|
(uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) <<
|
|
|
|
PAGE_SHIFT));
|
2010-03-14 12:32:56 +08:00
|
|
|
arena_bin_t *bin = run->bin;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_lock(&bin->lock);
|
2010-01-17 01:53:50 +08:00
|
|
|
arena_dalloc_bin(arena, chunk, tcache, mapelm);
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
} else if (tcache_size <= tcache_maxclass) {
|
|
|
|
arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
|
|
|
|
arena_t *arena = chunk->arena;
|
|
|
|
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
arena_dalloc_large(arena, chunk, tcache);
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
2010-01-17 01:53:50 +08:00
|
|
|
} else
|
|
|
|
idalloc(tcache);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
tcache_thread_cleanup(void *arg)
|
|
|
|
{
|
|
|
|
tcache_t *tcache = (tcache_t *)arg;
|
|
|
|
|
2010-09-06 01:35:13 +08:00
|
|
|
if (tcache == (void *)(uintptr_t)1) {
|
|
|
|
/*
|
|
|
|
* The previous time this destructor was called, we set the key
|
|
|
|
* to 1 so that other destructors wouldn't cause re-creation of
|
|
|
|
* the tcache. This time, do nothing, so that the destructor
|
|
|
|
* will not be called again.
|
|
|
|
*/
|
|
|
|
} else if (tcache == (void *)(uintptr_t)2) {
|
|
|
|
/*
|
|
|
|
* Another destructor called an allocator function after this
|
|
|
|
* destructor was called. Reset tcache to 1 in order to
|
|
|
|
* receive another callback.
|
|
|
|
*/
|
|
|
|
TCACHE_SET((uintptr_t)1);
|
|
|
|
} else if (tcache != NULL) {
|
2010-01-17 01:53:50 +08:00
|
|
|
assert(tcache != (void *)(uintptr_t)1);
|
|
|
|
tcache_destroy(tcache);
|
2010-09-06 01:35:13 +08:00
|
|
|
TCACHE_SET((uintptr_t)1);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef JEMALLOC_STATS
|
|
|
|
void
|
|
|
|
tcache_stats_merge(tcache_t *tcache, arena_t *arena)
|
|
|
|
{
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
/* Merge and reset tcache stats. */
|
2010-03-14 12:32:56 +08:00
|
|
|
for (i = 0; i < nbins; i++) {
|
2010-01-17 01:53:50 +08:00
|
|
|
arena_bin_t *bin = &arena->bins[i];
|
2010-03-08 07:34:14 +08:00
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_lock(&bin->lock);
|
2010-03-08 07:34:14 +08:00
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
2010-03-08 07:34:14 +08:00
|
|
|
tbin->tstats.nrequests = 0;
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
2010-03-18 07:27:39 +08:00
|
|
|
|
|
|
|
for (; i < nhbins; i++) {
|
|
|
|
malloc_large_stats_t *lstats = &arena->stats.lstats[i - nbins];
|
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
|
|
|
lstats->nrequests += tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
bool
|
2010-01-17 01:53:50 +08:00
|
|
|
tcache_boot(void)
|
|
|
|
{
|
|
|
|
|
2010-03-08 07:34:14 +08:00
|
|
|
if (opt_tcache) {
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
unsigned i;
|
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
/*
|
2010-10-24 09:37:06 +08:00
|
|
|
* If necessary, clamp opt_lg_tcache_max, now that
|
2010-03-18 07:27:39 +08:00
|
|
|
* small_maxclass and arena_maxclass are known.
|
|
|
|
*/
|
2010-10-24 09:37:06 +08:00
|
|
|
if (opt_lg_tcache_max < 0 || (1U <<
|
|
|
|
opt_lg_tcache_max) < small_maxclass)
|
2010-03-18 07:27:39 +08:00
|
|
|
tcache_maxclass = small_maxclass;
|
2010-10-24 09:37:06 +08:00
|
|
|
else if ((1U << opt_lg_tcache_max) > arena_maxclass)
|
2010-03-18 07:27:39 +08:00
|
|
|
tcache_maxclass = arena_maxclass;
|
|
|
|
else
|
2010-10-24 09:37:06 +08:00
|
|
|
tcache_maxclass = (1U << opt_lg_tcache_max);
|
2010-03-18 07:27:39 +08:00
|
|
|
|
|
|
|
nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT);
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
/* Initialize tcache_bin_info. */
|
|
|
|
tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins *
|
|
|
|
sizeof(tcache_bin_info_t));
|
|
|
|
if (tcache_bin_info == NULL)
|
|
|
|
return (true);
|
|
|
|
stack_nelms = 0;
|
|
|
|
for (i = 0; i < nbins; i++) {
|
|
|
|
if ((arena_bin_info[i].nregs << 1) <=
|
|
|
|
TCACHE_NSLOTS_SMALL_MAX) {
|
|
|
|
tcache_bin_info[i].ncached_max =
|
|
|
|
(arena_bin_info[i].nregs << 1);
|
|
|
|
} else {
|
|
|
|
tcache_bin_info[i].ncached_max =
|
|
|
|
TCACHE_NSLOTS_SMALL_MAX;
|
|
|
|
}
|
|
|
|
stack_nelms += tcache_bin_info[i].ncached_max;
|
|
|
|
}
|
|
|
|
for (; i < nhbins; i++) {
|
|
|
|
tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
|
|
|
|
stack_nelms += tcache_bin_info[i].ncached_max;
|
|
|
|
}
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
/* Compute incremental GC event threshold. */
|
|
|
|
if (opt_lg_tcache_gc_sweep >= 0) {
|
|
|
|
tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) /
|
|
|
|
nbins) + (((1U << opt_lg_tcache_gc_sweep) % nbins ==
|
|
|
|
0) ? 0 : 1);
|
|
|
|
} else
|
|
|
|
tcache_gc_incr = 0;
|
|
|
|
|
|
|
|
if (pthread_key_create(&tcache_tsd, tcache_thread_cleanup) !=
|
|
|
|
0) {
|
2010-03-04 09:45:38 +08:00
|
|
|
malloc_write(
|
|
|
|
"<jemalloc>: Error in pthread_key_create()\n");
|
2010-01-17 01:53:50 +08:00
|
|
|
abort();
|
|
|
|
}
|
|
|
|
}
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
|
|
|
|
return (false);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
/******************************************************************************/
|
|
|
|
#endif /* JEMALLOC_TCACHE */
|