#ifdef JEMALLOC_TCACHE
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct tcache_bin_info_s tcache_bin_info_t;
typedef struct tcache_bin_s tcache_bin_t;
typedef struct tcache_s tcache_t;

/*
 * Absolute maximum number of cache slots for each small bin in the thread
 * cache.  This is an additional constraint beyond the per-size-class limit of
 * twice the number of regions per run.
 *
 * This constant must be an even number.
 */
#define	TCACHE_NSLOTS_SMALL_MAX		200

/* Number of cache slots for large size classes. */
#define	TCACHE_NSLOTS_LARGE		20

/* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. */
#define	LG_TCACHE_MAXCLASS_DEFAULT	15

/*
 * (1U << opt_lg_tcache_gc_sweep) is the approximate number of allocation
 * events between full GC sweeps (-1: disabled).  Integer rounding may cause
 * the actual number to be slightly higher, since GC is performed
 * incrementally.
 */
#define	LG_TCACHE_GC_SWEEP_DEFAULT	13
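
/*
 * Worked example (illustrative): with the default value above, a full GC
 * sweep corresponds to roughly 1U << 13 == 8192 allocation/deallocation
 * events.  tcache_event() below visits one bin every tcache_gc_incr events
 * and wraps after nhbins bins, so a full sweep actually takes
 * nhbins * tcache_gc_incr events.  If, hypothetically, nhbins were 61 and the
 * boot code derived tcache_gc_incr by dividing the sweep interval across the
 * bins with rounding up, tcache_gc_incr would be 135 and a full sweep would
 * take 61 * 135 == 8235 events -- slightly more than 8192, which is the
 * integer-rounding effect noted above.
 */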

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

/*
 * Read-only information associated with each element of tcache_t's tbins array
 * is stored separately, mainly to reduce memory usage.
 */
struct tcache_bin_info_s {
	unsigned	ncached_max;	/* Upper limit on ncached. */
};

struct tcache_bin_s {
# ifdef JEMALLOC_STATS
	tcache_bin_stats_t tstats;
# endif
	int		low_water;	/* Min # cached since last GC. */
	unsigned	lg_fill_div;	/* Fill (ncached_max >> lg_fill_div). */
	unsigned	ncached;	/* # of cached objects. */
	void		**avail;	/* Stack of available objects. */
};

struct tcache_s {
# ifdef JEMALLOC_STATS
	ql_elm(tcache_t) link;		/* Used for aggregating stats. */
# endif
# ifdef JEMALLOC_PROF
	uint64_t	prof_accumbytes; /* Cleared after arena_prof_accum(). */
# endif
	arena_t		*arena;		/* This thread's arena. */
	unsigned	ev_cnt;		/* Event count since incremental GC. */
	unsigned	next_gc_bin;	/* Next bin to GC. */
	tcache_bin_t	tbins[1];	/* Dynamically sized. */
	/*
	 * The pointer stacks associated with tbins follow as a contiguous
	 * array.  During tcache initialization, the avail pointer in each
	 * element of tbins is initialized to point to the proper offset within
	 * this array.
	 */
};
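
/*
 * Illustrative sketch (hypothetical helper, not part of jemalloc): one way
 * the avail stacks described above could be wired up, assuming the backing
 * allocation places the full tbins[] array immediately after the struct and
 * the pointer stacks immediately after that (offsetof() is from <stddef.h>).
 * The authoritative layout and initialization live in tcache_create(); the
 * offset arithmetic below is an assumption, not a transcription of it.
 */
#if 0	/* Example only; never compiled. */
static void
tcache_avail_stacks_init_sketch(tcache_t *tcache)	/* Hypothetical name. */
{
	size_t i, stack_offset;

	/* Pointer stacks begin after the dynamically sized tbins[] array. */
	stack_offset = offsetof(tcache_t, tbins) +
	    nhbins * sizeof(tcache_bin_t);
	for (i = 0; i < nhbins; i++) {
		tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
		    stack_offset);
		/* Reserve ncached_max pointer slots for this bin's stack. */
		stack_offset += tcache_bin_info[i].ncached_max *
		    sizeof(void *);
	}
}
#endif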

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_tcache;
extern ssize_t	opt_lg_tcache_max;
extern ssize_t	opt_lg_tcache_gc_sweep;

extern tcache_bin_info_t	*tcache_bin_info;

/* Map of thread-specific caches. */
#ifndef NO_TLS
extern __thread tcache_t	*tcache_tls
    JEMALLOC_ATTR(tls_model("initial-exec"));
# define TCACHE_GET()	tcache_tls
# define TCACHE_SET(v)	do {						\
	tcache_tls = (tcache_t *)(v);					\
	pthread_setspecific(tcache_tsd, (void *)(v));			\
} while (0)
#else
# define TCACHE_GET()	((tcache_t *)pthread_getspecific(tcache_tsd))
# define TCACHE_SET(v)	do {						\
	pthread_setspecific(tcache_tsd, (void *)(v));			\
} while (0)
#endif
extern pthread_key_t		tcache_tsd;

/*
 * Number of tcache bins.  There are nbins small-object bins, plus 0 or more
 * large-object bins.
 */
extern size_t			nhbins;

/* Maximum cached size class. */
extern size_t			tcache_maxclass;

/* Number of tcache allocation/deallocation events between incremental GCs. */
extern unsigned			tcache_gc_incr;

void	tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
    , tcache_t *tcache
#endif
    );
void	tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
    , tcache_t *tcache
#endif
    );
tcache_t *tcache_create(arena_t *arena);
void	*tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin,
    size_t binind);
void	tcache_destroy(tcache_t *tcache);
#ifdef JEMALLOC_STATS
void	tcache_stats_merge(tcache_t *tcache, arena_t *arena);
#endif
bool	tcache_boot(void);

#endif /* JEMALLOC_H_EXTERNS */

/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#ifndef JEMALLOC_ENABLE_INLINE
void	tcache_event(tcache_t *tcache);
tcache_t *tcache_get(void);
void	*tcache_alloc_easy(tcache_bin_t *tbin);
void	*tcache_alloc_small(tcache_t *tcache, size_t size, bool zero);
void	*tcache_alloc_large(tcache_t *tcache, size_t size, bool zero);
void	tcache_dalloc_small(tcache_t *tcache, void *ptr);
void	tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_))
JEMALLOC_INLINE tcache_t *
tcache_get(void)
{
	tcache_t *tcache;

	if ((isthreaded & opt_tcache) == false)
		return (NULL);

	tcache = TCACHE_GET();
	if ((uintptr_t)tcache <= (uintptr_t)2) {
		if (tcache == NULL) {
			tcache = tcache_create(choose_arena());
			if (tcache == NULL)
				return (NULL);
		} else {
			if (tcache == (void *)(uintptr_t)1) {
				/*
				 * Make a note that an allocator function was
				 * called after tcache_thread_cleanup() was
				 * called.
				 */
				TCACHE_SET((uintptr_t)2);
			}
			return (NULL);
		}
	}

	return (tcache);
}
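
/*
 * Note on the sentinel values above: tcache_tls/tcache_tsd hold either a real
 * tcache_t pointer or one of three small sentinels.  NULL means no cache has
 * been created for this thread yet, (uintptr_t)1 means tcache_thread_cleanup()
 * has already run, and (uintptr_t)2 records that an allocator function was
 * called after cleanup.  Any value <= 2 therefore means "no usable cache".
 */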

JEMALLOC_INLINE void
tcache_event(tcache_t *tcache)
{

	if (tcache_gc_incr == 0)
		return;

	tcache->ev_cnt++;
	assert(tcache->ev_cnt <= tcache_gc_incr);
	if (tcache->ev_cnt == tcache_gc_incr) {
		size_t binind = tcache->next_gc_bin;
		tcache_bin_t *tbin = &tcache->tbins[binind];
		tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];

		if (tbin->low_water > 0) {
			/*
			 * Flush (ceiling) 3/4 of the objects below the low
			 * water mark.
			 */
			if (binind < nbins) {
				tcache_bin_flush_small(tbin, binind,
				    tbin->ncached - tbin->low_water +
				    (tbin->low_water >> 2)
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
				    , tcache
#endif
				    );
			} else {
				tcache_bin_flush_large(tbin, binind,
				    tbin->ncached - tbin->low_water +
				    (tbin->low_water >> 2)
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
				    , tcache
#endif
				    );
			}
			/*
			 * Reduce fill count by 2X.  Limit lg_fill_div such that
			 * the fill count is always at least 1.
			 */
			if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1))
			    >= 1)
				tbin->lg_fill_div++;
		} else if (tbin->low_water < 0) {
			/*
			 * Increase fill count by 2X.  Make sure lg_fill_div
			 * stays greater than 0.
			 */
			if (tbin->lg_fill_div > 1)
				tbin->lg_fill_div--;
		}
		tbin->low_water = tbin->ncached;

		tcache->next_gc_bin++;
		if (tcache->next_gc_bin == nhbins)
			tcache->next_gc_bin = 0;
		tcache->ev_cnt = 0;
	}
}
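
/*
 * Worked example for the GC path above (illustrative numbers): suppose a bin
 * has ncached_max == 200, ncached == 120, and low_water == 40 at sweep time.
 * The flush call keeps rem == 120 - 40 + (40 >> 2) == 90 objects, i.e. it
 * flushes 30 of the 40 objects that sat unused below the low-water mark
 * (the "ceiling 3/4" mentioned in the comment).  If lg_fill_div was 1, the
 * check 200 >> (1+1) >= 1 passes, so lg_fill_div becomes 2 and, per the
 * lg_fill_div comment, subsequent fills request ncached_max >> 2 == 50
 * objects instead of 100.
 */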

JEMALLOC_INLINE void *
tcache_alloc_easy(tcache_bin_t *tbin)
{
	void *ret;

	if (tbin->ncached == 0) {
		tbin->low_water = -1;
		return (NULL);
	}
	tbin->ncached--;
	if ((int)tbin->ncached < tbin->low_water)
		tbin->low_water = tbin->ncached;
	ret = tbin->avail[tbin->ncached];
	return (ret);
}

JEMALLOC_INLINE void *
tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
{
	void *ret;
	size_t binind;
	tcache_bin_t *tbin;

	binind = SMALL_SIZE2BIN(size);
	assert(binind < nbins);
	tbin = &tcache->tbins[binind];
	ret = tcache_alloc_easy(tbin);
	if (ret == NULL) {
		ret = tcache_alloc_small_hard(tcache, tbin, binind);
		if (ret == NULL)
			return (NULL);
	}
	assert(arena_salloc(ret) == arena_bin_info[binind].reg_size);

	if (zero == false) {
#ifdef JEMALLOC_FILL
		if (opt_junk)
			memset(ret, 0xa5, size);
		else if (opt_zero)
			memset(ret, 0, size);
#endif
	} else
		memset(ret, 0, size);

#ifdef JEMALLOC_STATS
	tbin->tstats.nrequests++;
#endif
#ifdef JEMALLOC_PROF
	tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
#endif
	tcache_event(tcache);
	return (ret);
}

JEMALLOC_INLINE void *
tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
{
	void *ret;
	size_t binind;
	tcache_bin_t *tbin;

	size = PAGE_CEILING(size);
	assert(size <= tcache_maxclass);
	binind = nbins + (size >> PAGE_SHIFT) - 1;
	assert(binind < nhbins);
	tbin = &tcache->tbins[binind];
	ret = tcache_alloc_easy(tbin);
	if (ret == NULL) {
		/*
		 * Only allocate one large object at a time, because it's quite
		 * expensive to create one and not use it.
		 */
		ret = arena_malloc_large(tcache->arena, size, zero);
		if (ret == NULL)
			return (NULL);
	} else {
#ifdef JEMALLOC_PROF
		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret);
		size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >>
		    PAGE_SHIFT);
		chunk->map[pageind-map_bias].bits &= ~CHUNK_MAP_CLASS_MASK;
#endif
		if (zero == false) {
#ifdef JEMALLOC_FILL
			if (opt_junk)
				memset(ret, 0xa5, size);
			else if (opt_zero)
				memset(ret, 0, size);
#endif
		} else
			memset(ret, 0, size);

#ifdef JEMALLOC_STATS
		tbin->tstats.nrequests++;
#endif
#ifdef JEMALLOC_PROF
		tcache->prof_accumbytes += size;
#endif
	}

	tcache_event(tcache);
	return (ret);
}
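
/*
 * Mapping example for the large path above (assuming 4 KiB pages, i.e.
 * PAGE_SHIFT == 12): a request for 5000 bytes is rounded by PAGE_CEILING() to
 * 8192, giving binind == nbins + (8192 >> 12) - 1 == nbins + 1, while a
 * 4096-byte request maps to binind == nbins.  Large bins thus follow the
 * small bins in tbins[], one per page-multiple size class up to
 * tcache_maxclass.
 */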

JEMALLOC_INLINE void
tcache_dalloc_small(tcache_t *tcache, void *ptr)
{
	arena_t *arena;
	arena_chunk_t *chunk;
	arena_run_t *run;
	arena_bin_t *bin;
	tcache_bin_t *tbin;
	tcache_bin_info_t *tbin_info;
	size_t pageind, binind;
	arena_chunk_map_t *mapelm;

	assert(arena_salloc(ptr) <= small_maxclass);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	arena = chunk->arena;
	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
	mapelm = &chunk->map[pageind-map_bias];
	run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
	    (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT));
	dassert(run->magic == ARENA_RUN_MAGIC);
	bin = run->bin;
	binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) /
	    sizeof(arena_bin_t);
	assert(binind < nbins);

#ifdef JEMALLOC_FILL
	if (opt_junk)
		memset(ptr, 0x5a, arena_bin_info[binind].reg_size);
#endif

	tbin = &tcache->tbins[binind];
	tbin_info = &tcache_bin_info[binind];
	if (tbin->ncached == tbin_info->ncached_max) {
		tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >>
		    1)
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
		    , tcache
#endif
		    );
	}
	assert(tbin->ncached < tbin_info->ncached_max);
	tbin->avail[tbin->ncached] = ptr;
	tbin->ncached++;

	tcache_event(tcache);
}

JEMALLOC_INLINE void
tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
{
	arena_t *arena;
	arena_chunk_t *chunk;
	size_t pageind, binind;
	tcache_bin_t *tbin;
	tcache_bin_info_t *tbin_info;

	assert((size & PAGE_MASK) == 0);
	assert(arena_salloc(ptr) > small_maxclass);
	assert(arena_salloc(ptr) <= tcache_maxclass);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	arena = chunk->arena;
	pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
	binind = nbins + (size >> PAGE_SHIFT) - 1;

#ifdef JEMALLOC_FILL
	if (opt_junk)
		memset(ptr, 0x5a, size);
#endif

	tbin = &tcache->tbins[binind];
	tbin_info = &tcache_bin_info[binind];
	if (tbin->ncached == tbin_info->ncached_max) {
		tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >>
		    1)
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
		    , tcache
#endif
		    );
	}
	assert(tbin->ncached < tbin_info->ncached_max);
	tbin->avail[tbin->ncached] = ptr;
	tbin->ncached++;

	tcache_event(tcache);
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
#endif /* JEMALLOC_TCACHE */