2010-01-17 01:53:50 +08:00
|
|
|
#define JEMALLOC_TCACHE_C_
|
2010-02-12 06:45:59 +08:00
|
|
|
#include "jemalloc/internal/jemalloc_internal.h"
|
2012-02-14 04:29:49 +08:00
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
/******************************************************************************/
|
|
|
|
/* Data. */
|
|
|
|
|
2010-03-08 07:34:14 +08:00
|
|
|
bool opt_tcache = true;
|
2010-10-24 09:37:06 +08:00
|
|
|
ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tcache_bin_info_t *tcache_bin_info;
|
|
|
|
static unsigned stack_nelms; /* Total stack elms per tcache. */
|
|
|
|
|
2012-03-22 09:33:03 +08:00
|
|
|
size_t nhbins;
|
|
|
|
size_t tcache_maxclass;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
tcaches_t *tcaches;
|
|
|
|
|
|
|
|
/* Index of first element within tcaches that has never been used. */
|
|
|
|
static unsigned tcaches_past;
|
|
|
|
|
|
|
|
/* Head of singly linked list tracking available tcaches elements. */
|
|
|
|
static tcaches_t *tcaches_avail;
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
/******************************************************************************/
|
|
|
|
|
2012-04-20 09:28:03 +08:00
|
|
|
/*
 * Return whatever arena_salloc() reports for ptr, with its second argument
 * fixed to false.
 */
size_t
tcache_salloc(const void *ptr)
{
	size_t usize = arena_salloc(ptr, false);

	return (usize);
}
|
|
|
|
|
2012-05-02 15:30:36 +08:00
|
|
|
void
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_event_hard(tsd_t *tsd, tcache_t *tcache)
|
2012-05-02 15:30:36 +08:00
|
|
|
{
|
2014-10-06 08:54:10 +08:00
|
|
|
index_t binind = tcache->next_gc_bin;
|
2012-05-02 15:30:36 +08:00
|
|
|
tcache_bin_t *tbin = &tcache->tbins[binind];
|
|
|
|
tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
|
|
|
|
|
|
|
|
if (tbin->low_water > 0) {
|
|
|
|
/*
|
|
|
|
* Flush (ceiling) 3/4 of the objects below the low water mark.
|
|
|
|
*/
|
|
|
|
if (binind < NBINS) {
|
2015-02-14 07:28:56 +08:00
|
|
|
tcache_bin_flush_small(tsd, tcache, tbin, binind,
|
|
|
|
tbin->ncached - tbin->low_water + (tbin->low_water
|
|
|
|
>> 2));
|
2012-05-02 15:30:36 +08:00
|
|
|
} else {
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_bin_flush_large(tsd, tbin, binind, tbin->ncached
|
|
|
|
- tbin->low_water + (tbin->low_water >> 2), tcache);
|
2012-05-02 15:30:36 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Reduce fill count by 2X. Limit lg_fill_div such that the
|
|
|
|
* fill count is always at least 1.
|
|
|
|
*/
|
|
|
|
if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1)) >= 1)
|
|
|
|
tbin->lg_fill_div++;
|
|
|
|
} else if (tbin->low_water < 0) {
|
|
|
|
/*
|
|
|
|
* Increase fill count by 2X. Make sure lg_fill_div stays
|
|
|
|
* greater than 0.
|
|
|
|
*/
|
|
|
|
if (tbin->lg_fill_div > 1)
|
|
|
|
tbin->lg_fill_div--;
|
|
|
|
}
|
|
|
|
tbin->low_water = tbin->ncached;
|
|
|
|
|
|
|
|
tcache->next_gc_bin++;
|
|
|
|
if (tcache->next_gc_bin == nhbins)
|
|
|
|
tcache->next_gc_bin = 0;
|
|
|
|
tcache->ev_cnt = 0;
|
|
|
|
}
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
void *
|
2015-02-14 07:28:56 +08:00
|
|
|
tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
|
|
|
|
tcache_bin_t *tbin, index_t binind)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
|
2015-02-14 07:28:56 +08:00
|
|
|
arena_tcache_fill_small(arena, tbin, binind, config_prof ?
|
|
|
|
tcache->prof_accumbytes : 0);
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_prof)
|
|
|
|
tcache->prof_accumbytes = 0;
|
2010-03-18 07:27:39 +08:00
|
|
|
ret = tcache_alloc_easy(tbin);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2015-02-14 07:28:56 +08:00
|
|
|
tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
|
|
|
|
index_t binind, unsigned rem)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_t *arena;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
void *ptr;
|
2010-03-08 07:34:14 +08:00
|
|
|
unsigned i, nflush, ndeferred;
|
2011-03-15 03:56:51 +08:00
|
|
|
bool merged_stats = false;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2012-02-29 08:50:47 +08:00
|
|
|
assert(binind < NBINS);
|
2010-03-14 12:32:56 +08:00
|
|
|
assert(rem <= tbin->ncached);
|
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
arena = arena_choose(tsd, NULL);
|
|
|
|
assert(arena != NULL);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
|
2010-03-14 12:32:56 +08:00
|
|
|
/* Lock the arena bin associated with the first object. */
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
|
|
|
|
tbin->avail[0]);
|
2015-02-16 10:04:46 +08:00
|
|
|
arena_t *bin_arena = extent_node_arena_get(&chunk->node);
|
2015-02-12 16:09:37 +08:00
|
|
|
arena_bin_t *bin = &bin_arena->bins[binind];
|
2010-03-14 12:32:56 +08:00
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
if (config_prof && bin_arena == arena) {
|
2013-02-07 03:59:30 +08:00
|
|
|
if (arena_prof_accum(arena, tcache->prof_accumbytes))
|
|
|
|
prof_idump();
|
2010-02-12 05:19:21 +08:00
|
|
|
tcache->prof_accumbytes = 0;
|
2010-03-16 13:25:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
malloc_mutex_lock(&bin->lock);
|
2015-01-30 07:30:47 +08:00
|
|
|
if (config_stats && bin_arena == arena) {
|
2014-10-04 01:16:09 +08:00
|
|
|
assert(!merged_stats);
|
2011-03-15 03:56:51 +08:00
|
|
|
merged_stats = true;
|
2010-03-14 12:32:56 +08:00
|
|
|
bin->stats.nflushes++;
|
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
2010-02-12 05:19:21 +08:00
|
|
|
}
|
2010-03-08 07:34:14 +08:00
|
|
|
ndeferred = 0;
|
|
|
|
for (i = 0; i < nflush; i++) {
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
ptr = tbin->avail[i];
|
2010-03-08 07:34:14 +08:00
|
|
|
assert(ptr != NULL);
|
2010-01-17 01:53:50 +08:00
|
|
|
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
|
2015-02-16 10:04:46 +08:00
|
|
|
if (extent_node_arena_get(&chunk->node) == bin_arena) {
|
2010-10-02 08:35:43 +08:00
|
|
|
size_t pageind = ((uintptr_t)ptr -
|
2012-04-02 22:04:34 +08:00
|
|
|
(uintptr_t)chunk) >> LG_PAGE;
|
2014-08-30 04:34:40 +08:00
|
|
|
arena_chunk_map_bits_t *bitselm =
|
|
|
|
arena_bitselm_get(chunk, pageind);
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_dalloc_bin_junked_locked(bin_arena, chunk,
|
2014-10-10 08:54:06 +08:00
|
|
|
ptr, bitselm);
|
2010-01-17 01:53:50 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This object was allocated via a different
|
2010-03-14 12:32:56 +08:00
|
|
|
* arena bin than the one that is currently
|
|
|
|
* locked. Stash the object, so that it can be
|
|
|
|
* handled in a future pass.
|
2010-01-17 01:53:50 +08:00
|
|
|
*/
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tbin->avail[ndeferred] = ptr;
|
2010-01-17 01:53:50 +08:00
|
|
|
ndeferred++;
|
|
|
|
}
|
|
|
|
}
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
2014-10-04 01:16:09 +08:00
|
|
|
if (config_stats && !merged_stats) {
|
2011-03-15 03:56:51 +08:00
|
|
|
/*
|
|
|
|
* The flush loop didn't happen to flush to this thread's
|
|
|
|
* arena, so the stats didn't get merged. Manually do so now.
|
|
|
|
*/
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_bin_t *bin = &arena->bins[binind];
|
2011-03-15 03:56:51 +08:00
|
|
|
malloc_mutex_lock(&bin->lock);
|
|
|
|
bin->stats.nflushes++;
|
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
malloc_mutex_unlock(&bin->lock);
|
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
|
|
|
|
rem * sizeof(void *));
|
2010-03-08 07:34:14 +08:00
|
|
|
tbin->ncached = rem;
|
2011-03-21 15:18:17 +08:00
|
|
|
if ((int)tbin->ncached < tbin->low_water)
|
2010-03-14 12:32:56 +08:00
|
|
|
tbin->low_water = tbin->ncached;
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
void
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind,
|
|
|
|
unsigned rem, tcache_t *tcache)
|
2010-03-18 07:27:39 +08:00
|
|
|
{
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_t *arena;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
void *ptr;
|
2010-03-18 07:27:39 +08:00
|
|
|
unsigned i, nflush, ndeferred;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
bool merged_stats = false;
|
2010-03-18 07:27:39 +08:00
|
|
|
|
|
|
|
assert(binind < nhbins);
|
|
|
|
assert(rem <= tbin->ncached);
|
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
arena = arena_choose(tsd, NULL);
|
|
|
|
assert(arena != NULL);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
|
2010-03-18 07:27:39 +08:00
|
|
|
/* Lock the arena associated with the first object. */
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
|
|
|
|
tbin->avail[0]);
|
2015-02-16 10:04:46 +08:00
|
|
|
arena_t *locked_arena = extent_node_arena_get(&chunk->node);
|
2013-02-07 03:59:30 +08:00
|
|
|
UNUSED bool idump;
|
2010-03-18 07:27:39 +08:00
|
|
|
|
2013-02-07 03:59:30 +08:00
|
|
|
if (config_prof)
|
|
|
|
idump = false;
|
2015-01-30 07:30:47 +08:00
|
|
|
malloc_mutex_lock(&locked_arena->lock);
|
|
|
|
if ((config_prof || config_stats) && locked_arena == arena) {
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_prof) {
|
2013-02-07 03:59:30 +08:00
|
|
|
idump = arena_prof_accum_locked(arena,
|
2012-02-11 12:22:09 +08:00
|
|
|
tcache->prof_accumbytes);
|
|
|
|
tcache->prof_accumbytes = 0;
|
|
|
|
}
|
|
|
|
if (config_stats) {
|
|
|
|
merged_stats = true;
|
|
|
|
arena->stats.nrequests_large +=
|
|
|
|
tbin->tstats.nrequests;
|
2012-02-29 08:50:47 +08:00
|
|
|
arena->stats.lstats[binind - NBINS].nrequests +=
|
2012-02-11 12:22:09 +08:00
|
|
|
tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
}
|
2010-03-18 07:27:39 +08:00
|
|
|
}
|
|
|
|
ndeferred = 0;
|
|
|
|
for (i = 0; i < nflush; i++) {
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
ptr = tbin->avail[i];
|
2010-03-18 07:27:39 +08:00
|
|
|
assert(ptr != NULL);
|
|
|
|
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
|
2015-02-16 10:04:46 +08:00
|
|
|
if (extent_node_arena_get(&chunk->node) ==
|
|
|
|
locked_arena) {
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_dalloc_large_junked_locked(locked_arena,
|
|
|
|
chunk, ptr);
|
2014-10-10 08:54:06 +08:00
|
|
|
} else {
|
2010-03-18 07:27:39 +08:00
|
|
|
/*
|
|
|
|
* This object was allocated via a different
|
|
|
|
* arena than the one that is currently locked.
|
|
|
|
* Stash the object, so that it can be handled
|
|
|
|
* in a future pass.
|
|
|
|
*/
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tbin->avail[ndeferred] = ptr;
|
2010-03-18 07:27:39 +08:00
|
|
|
ndeferred++;
|
|
|
|
}
|
|
|
|
}
|
2015-01-30 07:30:47 +08:00
|
|
|
malloc_mutex_unlock(&locked_arena->lock);
|
2013-02-07 03:59:30 +08:00
|
|
|
if (config_prof && idump)
|
|
|
|
prof_idump();
|
2010-03-18 07:27:39 +08:00
|
|
|
}
|
2014-10-04 01:16:09 +08:00
|
|
|
if (config_stats && !merged_stats) {
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
/*
|
|
|
|
* The flush loop didn't happen to flush to this thread's
|
|
|
|
* arena, so the stats didn't get merged. Manually do so now.
|
|
|
|
*/
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
2012-02-29 08:50:47 +08:00
|
|
|
arena->stats.lstats[binind - NBINS].nrequests +=
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
}
|
2010-03-18 07:27:39 +08:00
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
|
|
|
|
rem * sizeof(void *));
|
2010-03-18 07:27:39 +08:00
|
|
|
tbin->ncached = rem;
|
2011-03-21 15:18:17 +08:00
|
|
|
if ((int)tbin->ncached < tbin->low_water)
|
2010-03-18 07:27:39 +08:00
|
|
|
tbin->low_water = tbin->ncached;
|
|
|
|
}
|
|
|
|
|
2012-03-22 09:33:03 +08:00
|
|
|
void
|
|
|
|
tcache_arena_associate(tcache_t *tcache, arena_t *arena)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (config_stats) {
|
|
|
|
/* Link into list of extant tcaches. */
|
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
ql_elm_new(tcache, link);
|
|
|
|
ql_tail_insert(&arena->tcache_ql, tcache, link);
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
void
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_arena_reassociate(tcache_t *tcache, arena_t *oldarena, arena_t *newarena)
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
{
|
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_arena_dissociate(tcache, oldarena);
|
|
|
|
tcache_arena_associate(tcache, newarena);
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
}
|
|
|
|
|
2012-03-22 09:33:03 +08:00
|
|
|
void
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_arena_dissociate(tcache_t *tcache, arena_t *arena)
|
2012-03-22 09:33:03 +08:00
|
|
|
{
|
|
|
|
|
|
|
|
if (config_stats) {
|
|
|
|
/* Unlink from list of extant tcaches. */
|
2015-01-30 07:30:47 +08:00
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
if (config_debug) {
|
|
|
|
bool in_ql = false;
|
|
|
|
tcache_t *iter;
|
|
|
|
ql_foreach(iter, &arena->tcache_ql, link) {
|
|
|
|
if (iter == tcache) {
|
|
|
|
in_ql = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(in_ql);
|
|
|
|
}
|
|
|
|
ql_remove(&arena->tcache_ql, tcache, link);
|
|
|
|
tcache_stats_merge(tcache, arena);
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
2012-03-22 09:33:03 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-16 04:28:37 +08:00
|
|
|
tcache_t *
|
2014-09-23 12:09:23 +08:00
|
|
|
tcache_get_hard(tsd_t *tsd)
|
2014-04-16 04:28:37 +08:00
|
|
|
{
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
arena_t *arena;
|
2014-04-16 04:28:37 +08:00
|
|
|
|
2014-10-04 01:16:09 +08:00
|
|
|
if (!tcache_enabled_get()) {
|
2014-10-05 02:12:53 +08:00
|
|
|
if (tsd_nominal(tsd))
|
|
|
|
tcache_enabled_set(false); /* Memoize. */
|
2014-04-16 04:28:37 +08:00
|
|
|
return (NULL);
|
|
|
|
}
|
Refactor/fix arenas manipulation.
Abstract arenas access to use arena_get() (or a0get() where appropriate)
rather than directly reading e.g. arenas[ind]. Prior to the addition of
the arenas.extend mallctl, the worst possible outcome of directly
accessing arenas was a stale read, but arenas.extend may allocate and
assign a new array to arenas.
Add a tsd-based arenas_cache, which amortizes arenas reads. This
introduces some subtle bootstrapping issues, with tsd_boot() now being
split into tsd_boot[01]() to support tsd wrapper allocation
bootstrapping, as well as an arenas_cache_bypass tsd variable which
dynamically terminates allocation of arenas_cache itself.
Promote a0malloc(), a0calloc(), and a0free() to be generally useful for
internal allocation, and use them in several places (more may be
appropriate).
Abstract arena->nthreads management and fix a missing decrement during
thread destruction (recent tsd refactoring left arenas_cleanup()
unused).
Change arena_choose() to propagate OOM, and handle OOM in all callers.
This is important for providing consistent allocation behavior when the
MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible
for an OOM to result in allocation silently allocating from a different
arena than the one specified.
2014-10-08 14:14:57 +08:00
|
|
|
arena = arena_choose(tsd, NULL);
|
|
|
|
if (unlikely(arena == NULL))
|
|
|
|
return (NULL);
|
2014-10-10 08:54:06 +08:00
|
|
|
return (tcache_create(tsd, arena));
|
2014-04-16 04:28:37 +08:00
|
|
|
}
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
tcache_t *
|
2014-10-10 08:54:06 +08:00
|
|
|
tcache_create(tsd_t *tsd, arena_t *arena)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
|
|
|
tcache_t *tcache;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
size_t size, stack_offset;
|
2010-03-08 07:34:14 +08:00
|
|
|
unsigned i;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-10-02 09:02:43 +08:00
|
|
|
size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
/* Naturally align the pointer stacks. */
|
|
|
|
size = PTR_CEILING(size);
|
|
|
|
stack_offset = size;
|
|
|
|
size += stack_nelms * sizeof(void *);
|
2014-10-10 08:54:06 +08:00
|
|
|
/* Avoid false cacheline sharing. */
|
|
|
|
size = sa2u(size, CACHELINE);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, a0get());
|
2010-01-17 01:53:50 +08:00
|
|
|
if (tcache == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
2012-03-22 09:33:03 +08:00
|
|
|
tcache_arena_associate(tcache, arena);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
for (i = 0; i < nhbins; i++) {
|
2011-03-21 15:18:17 +08:00
|
|
|
tcache->tbins[i].lg_fill_div = 1;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
|
|
|
|
(uintptr_t)stack_offset);
|
|
|
|
stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
|
2010-03-08 07:34:14 +08:00
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
|
|
|
|
return (tcache);
|
|
|
|
}
|
|
|
|
|
2014-09-23 12:09:23 +08:00
|
|
|
static void
|
|
|
|
tcache_destroy(tsd_t *tsd, tcache_t *tcache)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_t *arena;
|
2010-01-17 01:53:50 +08:00
|
|
|
unsigned i;
|
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
arena = arena_choose(tsd, NULL);
|
|
|
|
tcache_arena_dissociate(tcache, arena);
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2012-02-29 08:50:47 +08:00
|
|
|
for (i = 0; i < NBINS; i++) {
|
2010-03-08 07:34:14 +08:00
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
2015-02-14 07:28:56 +08:00
|
|
|
tcache_bin_flush_small(tsd, tcache, tbin, i, 0);
|
2010-03-08 07:34:14 +08:00
|
|
|
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_stats && tbin->tstats.nrequests != 0) {
|
2010-03-08 07:34:14 +08:00
|
|
|
arena_bin_t *bin = &arena->bins[i];
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_lock(&bin->lock);
|
2010-03-08 07:34:14 +08:00
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-03-18 07:27:39 +08:00
|
|
|
for (; i < nhbins; i++) {
|
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
2015-01-30 07:30:47 +08:00
|
|
|
tcache_bin_flush_large(tsd, tbin, i, 0, tcache);
|
2010-03-18 07:27:39 +08:00
|
|
|
|
2012-02-11 12:22:09 +08:00
|
|
|
if (config_stats && tbin->tstats.nrequests != 0) {
|
2010-03-18 07:27:39 +08:00
|
|
|
malloc_mutex_lock(&arena->lock);
|
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
2012-02-29 08:50:47 +08:00
|
|
|
arena->stats.lstats[i - NBINS].nrequests +=
|
2010-03-18 07:27:39 +08:00
|
|
|
tbin->tstats.nrequests;
|
|
|
|
malloc_mutex_unlock(&arena->lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-02-07 03:59:30 +08:00
|
|
|
if (config_prof && tcache->prof_accumbytes > 0 &&
|
2015-01-30 07:30:47 +08:00
|
|
|
arena_prof_accum(arena, tcache->prof_accumbytes))
|
2013-02-07 03:59:30 +08:00
|
|
|
prof_idump();
|
2010-02-12 05:19:21 +08:00
|
|
|
|
2014-11-28 03:22:36 +08:00
|
|
|
idalloctm(tsd, tcache, false, true);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
|
2012-03-22 09:33:03 +08:00
|
|
|
void
|
2014-09-23 12:09:23 +08:00
|
|
|
tcache_cleanup(tsd_t *tsd)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
2014-09-23 12:09:23 +08:00
|
|
|
tcache_t *tcache;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2014-09-23 12:09:23 +08:00
|
|
|
if (!config_tcache)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if ((tcache = tsd_tcache_get(tsd)) != NULL) {
|
|
|
|
tcache_destroy(tsd, tcache);
|
|
|
|
tsd_tcache_set(tsd, NULL);
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-23 12:09:23 +08:00
|
|
|
void
|
|
|
|
tcache_enabled_cleanup(tsd_t *tsd)
|
|
|
|
{
|
|
|
|
|
|
|
|
/* Do nothing. */
|
|
|
|
}
|
|
|
|
|
2013-10-22 06:00:06 +08:00
|
|
|
/* Caller must own arena->lock. */
|
2010-01-17 01:53:50 +08:00
|
|
|
void
|
|
|
|
tcache_stats_merge(tcache_t *tcache, arena_t *arena)
|
|
|
|
{
|
|
|
|
unsigned i;
|
|
|
|
|
2013-10-22 06:00:06 +08:00
|
|
|
cassert(config_stats);
|
|
|
|
|
2010-01-17 01:53:50 +08:00
|
|
|
/* Merge and reset tcache stats. */
|
2012-02-29 08:50:47 +08:00
|
|
|
for (i = 0; i < NBINS; i++) {
|
2010-01-17 01:53:50 +08:00
|
|
|
arena_bin_t *bin = &arena->bins[i];
|
2010-03-08 07:34:14 +08:00
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_lock(&bin->lock);
|
2010-03-08 07:34:14 +08:00
|
|
|
bin->stats.nrequests += tbin->tstats.nrequests;
|
2010-03-14 12:32:56 +08:00
|
|
|
malloc_mutex_unlock(&bin->lock);
|
2010-03-08 07:34:14 +08:00
|
|
|
tbin->tstats.nrequests = 0;
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
2010-03-18 07:27:39 +08:00
|
|
|
|
|
|
|
for (; i < nhbins; i++) {
|
2012-02-29 08:50:47 +08:00
|
|
|
malloc_large_stats_t *lstats = &arena->stats.lstats[i - NBINS];
|
2010-03-18 07:27:39 +08:00
|
|
|
tcache_bin_t *tbin = &tcache->tbins[i];
|
|
|
|
arena->stats.nrequests_large += tbin->tstats.nrequests;
|
|
|
|
lstats->nrequests += tbin->tstats.nrequests;
|
|
|
|
tbin->tstats.nrequests = 0;
|
|
|
|
}
|
2010-01-17 01:53:50 +08:00
|
|
|
}
|
|
|
|
|
2015-01-30 07:30:47 +08:00
|
|
|
bool
|
|
|
|
tcaches_create(tsd_t *tsd, unsigned *r_ind)
|
|
|
|
{
|
|
|
|
tcache_t *tcache;
|
|
|
|
tcaches_t *elm;
|
|
|
|
|
|
|
|
if (tcaches == NULL) {
|
|
|
|
tcaches = base_alloc(sizeof(tcache_t *) *
|
|
|
|
(MALLOCX_TCACHE_MAX+1));
|
|
|
|
if (tcaches == NULL)
|
|
|
|
return (true);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX)
|
|
|
|
return (true);
|
|
|
|
tcache = tcache_create(tsd, a0get());
|
|
|
|
if (tcache == NULL)
|
|
|
|
return (true);
|
|
|
|
|
|
|
|
if (tcaches_avail != NULL) {
|
|
|
|
elm = tcaches_avail;
|
|
|
|
tcaches_avail = tcaches_avail->next;
|
|
|
|
elm->tcache = tcache;
|
2015-02-11 01:03:48 +08:00
|
|
|
*r_ind = elm - tcaches;
|
2015-01-30 07:30:47 +08:00
|
|
|
} else {
|
|
|
|
elm = &tcaches[tcaches_past];
|
|
|
|
elm->tcache = tcache;
|
|
|
|
*r_ind = tcaches_past;
|
|
|
|
tcaches_past++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (false);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
tcaches_elm_flush(tsd_t *tsd, tcaches_t *elm)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (elm->tcache == NULL)
|
|
|
|
return;
|
|
|
|
tcache_destroy(tsd, elm->tcache);
|
|
|
|
elm->tcache = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
tcaches_flush(tsd_t *tsd, unsigned ind)
|
|
|
|
{
|
|
|
|
|
|
|
|
tcaches_elm_flush(tsd, &tcaches[ind]);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
tcaches_destroy(tsd_t *tsd, unsigned ind)
|
|
|
|
{
|
|
|
|
tcaches_t *elm = &tcaches[ind];
|
|
|
|
tcaches_elm_flush(tsd, elm);
|
|
|
|
elm->next = tcaches_avail;
|
|
|
|
tcaches_avail = elm;
|
|
|
|
}
|
|
|
|
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
bool
|
2014-09-23 12:09:23 +08:00
|
|
|
tcache_boot(void)
|
2010-01-17 01:53:50 +08:00
|
|
|
{
|
2012-04-07 03:41:55 +08:00
|
|
|
unsigned i;
|
2010-01-17 01:53:50 +08:00
|
|
|
|
2012-04-07 03:41:55 +08:00
|
|
|
/*
|
|
|
|
* If necessary, clamp opt_lg_tcache_max, now that arena_maxclass is
|
|
|
|
* known.
|
|
|
|
*/
|
|
|
|
if (opt_lg_tcache_max < 0 || (1U << opt_lg_tcache_max) < SMALL_MAXCLASS)
|
|
|
|
tcache_maxclass = SMALL_MAXCLASS;
|
|
|
|
else if ((1U << opt_lg_tcache_max) > arena_maxclass)
|
|
|
|
tcache_maxclass = arena_maxclass;
|
|
|
|
else
|
|
|
|
tcache_maxclass = (1U << opt_lg_tcache_max);
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
|
2012-04-07 03:41:55 +08:00
|
|
|
nhbins = NBINS + (tcache_maxclass >> LG_PAGE);
|
|
|
|
|
|
|
|
/* Initialize tcache_bin_info. */
|
|
|
|
tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins *
|
|
|
|
sizeof(tcache_bin_info_t));
|
|
|
|
if (tcache_bin_info == NULL)
|
|
|
|
return (true);
|
|
|
|
stack_nelms = 0;
|
|
|
|
for (i = 0; i < NBINS; i++) {
|
|
|
|
if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) {
|
|
|
|
tcache_bin_info[i].ncached_max =
|
|
|
|
(arena_bin_info[i].nregs << 1);
|
|
|
|
} else {
|
|
|
|
tcache_bin_info[i].ncached_max =
|
|
|
|
TCACHE_NSLOTS_SMALL_MAX;
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
}
|
2012-04-07 03:41:55 +08:00
|
|
|
stack_nelms += tcache_bin_info[i].ncached_max;
|
|
|
|
}
|
|
|
|
for (; i < nhbins; i++) {
|
|
|
|
tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
|
|
|
|
stack_nelms += tcache_bin_info[i].ncached_max;
|
2012-03-22 09:33:03 +08:00
|
|
|
}
|
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions
are available. Furthermore, revert to preferring the
lowest available region (as jemalloc did with its old
bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into
tcache_bin_info_t, and add a contiguous array of pointers
to tcache_t in order to track cached objects. This
substantially increases the size of tcache_t, but results
in much higher data locality for common tcache operations.
As a side benefit, it is again possible to efficiently
flush the least recently used cached objects, so this
change changes flushing from MRU to LRU.
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
2011-03-17 01:30:13 +08:00
|
|
|
|
2012-03-22 09:33:03 +08:00
|
|
|
return (false);
|
|
|
|
}
|