Purge dirty pages without arena->lock.

This commit is contained in:
Jason Evans 2010-03-14 17:36:10 -07:00
parent 86815df9dc
commit 05b21be347
2 changed files with 255 additions and 76 deletions

View File

@ -95,14 +95,23 @@ typedef struct arena_s arena_t;
/* Each element of the chunk map corresponds to one page within the chunk. */ /* Each element of the chunk map corresponds to one page within the chunk. */
struct arena_chunk_map_s { struct arena_chunk_map_s {
/* union {
* Linkage for run trees. There are two disjoint uses: /*
* * Linkage for run trees. There are two disjoint uses:
* 1) arena_t's runs_avail tree. *
* 2) arena_run_t conceptually uses this linkage for in-use non-full * 1) arena_t's runs_avail tree.
* runs, rather than directly embedding linkage. * 2) arena_run_t conceptually uses this linkage for in-use
*/ * non-full runs, rather than directly embedding linkage.
rb_node(arena_chunk_map_t) link; */
rb_node(arena_chunk_map_t) rb_link;
/*
* List of runs currently in purgatory. arena_chunk_purge()
* temporarily allocates runs that contain dirty pages while
* purging, so that other threads cannot use the runs while the
* purging thread is operating without the arena lock held.
*/
ql_elm(arena_chunk_map_t) ql_link;
} u;
#ifdef JEMALLOC_PROF #ifdef JEMALLOC_PROF
/* Profile counters, used for large object runs. */ /* Profile counters, used for large object runs. */
@ -311,6 +320,14 @@ struct arena_s {
*/ */
size_t ndirty; size_t ndirty;
/*
* Approximate number of pages being purged. It is possible for
* multiple threads to purge dirty pages concurrently, and they use
* npurgatory to indicate the total number of pages all threads are
* attempting to purge.
*/
size_t npurgatory;
/* /*
* Size/address-ordered tree of this arena's available runs. This tree * Size/address-ordered tree of this arena's available runs. This tree
* is used for first-best-fit run allocation. * is used for first-best-fit run allocation.

View File

@ -27,9 +27,6 @@ size_t medium_max;
size_t lg_mspace; size_t lg_mspace;
size_t mspace_mask; size_t mspace_mask;
/* Used to prevent threads from concurrently calling madvise(2). */
static malloc_mutex_t purge_lock;
/* /*
* const_small_size2bin is a static constant lookup table that in the common * const_small_size2bin is a static constant lookup table that in the common
* case can be used as-is for small_size2bin. For dynamically linked programs, * case can be used as-is for small_size2bin. For dynamically linked programs,
@ -215,7 +212,7 @@ arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
/* Generate red-black tree functions. */ /* Generate red-black tree functions. */
rb_gen(static JEMALLOC_ATTR(unused), arena_run_tree_, arena_run_tree_t, rb_gen(static JEMALLOC_ATTR(unused), arena_run_tree_, arena_run_tree_t,
arena_chunk_map_t, link, arena_run_comp) arena_chunk_map_t, u.rb_link, arena_run_comp)
static inline int static inline int
arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
@ -247,7 +244,7 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
/* Generate red-black tree functions. */ /* Generate red-black tree functions. */
rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t, rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t,
arena_chunk_map_t, link, arena_avail_comp) arena_chunk_map_t, u.rb_link, arena_avail_comp)
static inline void * static inline void *
arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin)
@ -480,11 +477,183 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero)
return (run); return (run);
} }
/*
 * Enforce opt_lg_dirty_mult: call arena_purge() when the arena's dirty pages
 * that are not already accounted for in npurgatory both exceed chunk_npages
 * and exceed (nactive >> opt_lg_dirty_mult).
 */
static inline void
arena_maybe_purge(arena_t *arena)
{
	size_t npurgeable;

	/* Nothing is enforced when opt_lg_dirty_mult is negative. */
	if (opt_lg_dirty_mult < 0)
		return;

	/* Every dirty page is already claimed via npurgatory. */
	if (arena->ndirty <= arena->npurgatory)
		return;

	/* Dirty pages not yet counted in npurgatory. */
	npurgeable = arena->ndirty - arena->npurgatory;

	if (npurgeable <= chunk_npages)
		return;
	if ((arena->nactive >> opt_lg_dirty_mult) >= npurgeable)
		return;

	arena_purge(arena);
}
/*
 * Purge the dirty pages of chunk's free runs.
 *
 * The work is done in three phases:
 *   1) Every unallocated run in chunk that contains at least one dirty page
 *      is removed from arena->runs_avail, marked as an allocated large run,
 *      and appended to a local list (mapelms).
 *   2) arena->lock is unlocked, madvise(..., MADV_DONTNEED) is called on each
 *      contiguous range of dirty pages in the listed runs, and arena->lock is
 *      locked again.
 *   3) Each listed run is handed to arena_run_dalloc().
 *
 * Because this function unlocks and then re-locks arena->lock, the caller is
 * expected to hold arena->lock on entry; it is held again on return.
 */
static inline void
arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk)
{
/* Runs temporarily allocated by this call, linked through u.ql_link. */
ql_head(arena_chunk_map_t) mapelms;
arena_chunk_map_t *mapelm;
size_t pageind;
#ifdef JEMALLOC_DEBUG
/* Debug-only count of dirty pages that remain to be purged. */
size_t ndirty;
#endif
#ifdef JEMALLOC_STATS
/* Number of madvise() calls made while the lock is dropped. */
size_t nmadvise;
#endif
ql_new(&mapelms);
/*
* If chunk is the spare, temporarily re-allocate it, 1) so that its
* run is reinserted into runs_avail, and 2) so that it cannot be
* completely discarded by another thread while arena->lock is dropped
* by this thread. Note that the arena_run_dalloc() call will
* implicitly deallocate the chunk, so no explicit action is required
* in this function to deallocate the chunk.
*/
if (chunk == arena->spare)
arena_chunk_alloc(arena);
/* Temporarily allocate all free runs that contain dirty pages. */
for (pageind = arena_chunk_header_npages; pageind < chunk_npages;) {
mapelm = &chunk->map[pageind];
if ((mapelm->bits & CHUNK_MAP_ALLOCATED) == 0) {
size_t npages, i;
/* For an unallocated run, this field is used as the run length in pages. */
npages = (mapelm->bits & CHUNK_MAP_PG_MASK) >>
CHUNK_MAP_PG_SHIFT;
/* Scan for the first dirty page; one is enough to claim the run. */
for (i = 0; i < npages; i++) {
if (chunk->map[pageind + i].bits &
CHUNK_MAP_DIRTY) {
/*
* This run contains at least one dirty
* page. Temporarily allocate it.
* Don't bother setting the
* CHUNK_MAP_ALLOCATED bit for interior
* pages yet. The bit will be set
* during the loop that actually does
* the madvise() calls, so that the run
* looks like a normal large run by the
* time it is passed to
* arena_run_dalloc().
*/
arena_avail_tree_remove(
&arena->runs_avail, mapelm);
/* Mark the first and last pages of the run as large/allocated. */
mapelm->bits |= (CHUNK_MAP_LARGE |
CHUNK_MAP_ALLOCATED);
chunk->map[pageind + npages - 1].bits |=
(CHUNK_MAP_LARGE |
CHUNK_MAP_ALLOCATED);
/*
* Append to list for later processing.
*/
ql_elm_new(mapelm, u.ql_link);
ql_tail_insert(&mapelms, mapelm,
u.ql_link);
break;
}
}
/* Advance past the run whether or not it was claimed. */
pageind += npages;
} else {
/* Skip allocated run. */
if (mapelm->bits & CHUNK_MAP_LARGE) {
/* Large run: the same field is used as the run length in pages. */
pageind += (mapelm->bits & CHUNK_MAP_PG_MASK)
>> CHUNK_MAP_PG_SHIFT;
} else {
/*
* Non-large run: the field is subtracted from pageind to
* locate the run header (presumably it holds this page's
* offset within the run -- confirm against the map bits
* definition), and the run length comes from the bin.
*/
arena_run_t *run = (arena_run_t *)((uintptr_t)
chunk + (uintptr_t)((pageind -
((mapelm->bits & CHUNK_MAP_PG_MASK) >>
CHUNK_MAP_PG_SHIFT)) << PAGE_SHIFT));
pageind += run->bin->run_size >> PAGE_SHIFT;
}
}
}
#ifdef JEMALLOC_DEBUG
/* Snapshot the count before it is zeroed, for the assertions below. */
ndirty = chunk->ndirty;
#endif
#ifdef JEMALLOC_STATS
arena->stats.purged += chunk->ndirty;
#endif
/*
* Account for all of the chunk's dirty pages as purged up front, and take
* the chunk off the arena's dirty list, while arena->lock is still held.
*/
arena->ndirty -= chunk->ndirty;
chunk->ndirty = 0;
ql_remove(&arena->chunks_dirty, chunk, link_dirty);
chunk->dirtied = false;
/* Drop the lock for the duration of the madvise() calls. */
malloc_mutex_unlock(&arena->lock);
#ifdef JEMALLOC_STATS
nmadvise = 0;
#endif
ql_foreach(mapelm, &mapelms, u.ql_link) {
size_t i, j;
/*
* Index of mapelm within chunk->map, i.e. the run's first page.
* NOTE: this declaration shadows the function-scope pageind.
*/
size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) /
sizeof(arena_chunk_map_t);
size_t npages = (mapelm->bits & CHUNK_MAP_PG_MASK) >>
CHUNK_MAP_PG_SHIFT;
for (i = 0; i < npages;) {
if (chunk->map[pageind + i].bits & CHUNK_MAP_DIRTY) {
/* The dirty bit is known to be set, so XOR clears it. */
chunk->map[pageind + i].bits ^= CHUNK_MAP_DIRTY;
chunk->map[pageind + i].bits |=
(CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED);
/* Find adjacent dirty page(s). */
for (j = 1; i + j < npages; j++) {
if ((chunk->map[pageind + i + j].bits &
CHUNK_MAP_DIRTY) == 0)
break;
chunk->map[pageind + i + j].bits ^=
CHUNK_MAP_DIRTY;
chunk->map[pageind + i + j].bits |=
(CHUNK_MAP_LARGE |
CHUNK_MAP_ALLOCATED);
}
#ifdef JEMALLOC_DEBUG
assert(ndirty >= j);
ndirty -= j;
#endif
/* Purge the j contiguous dirty pages with a single call. */
madvise((void *)((uintptr_t)chunk + ((pageind +
i) << PAGE_SHIFT)), (j << PAGE_SHIFT),
MADV_DONTNEED);
#ifdef JEMALLOC_STATS
nmadvise++;
#endif
i += j;
} else {
/* Clean page: only mark it as part of the large allocated run. */
chunk->map[pageind + i].bits |=
(CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED);
i++;
}
}
}
#ifdef JEMALLOC_DEBUG
/* Every dirty page counted in the snapshot must have been visited. */
assert(ndirty == 0);
#endif
malloc_mutex_lock(&arena->lock);
#ifdef JEMALLOC_STATS
/* Fold the locally counted calls into the arena stats under the lock. */
arena->stats.nmadvise += nmadvise;
#endif
/* Deallocate runs. */
/*
* The list head is re-read on every iteration because each element is
* removed from the list before its run is deallocated.
*/
for (mapelm = ql_first(&mapelms); mapelm != NULL;
mapelm = ql_first(&mapelms)) {
/* NOTE: shadows the function-scope pageind, as in the loop above. */
size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) /
sizeof(arena_chunk_map_t);
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)(pageind << PAGE_SHIFT));
ql_remove(&mapelms, mapelm, u.ql_link);
/* Passed with dirty == false: the dirty bits were cleared above. */
arena_run_dalloc(arena, run, false);
}
}
static void static void
arena_purge(arena_t *arena) arena_purge(arena_t *arena)
{ {
arena_chunk_t *chunk; arena_chunk_t *chunk;
size_t i, j, npages; size_t npurgatory;
#ifdef JEMALLOC_DEBUG #ifdef JEMALLOC_DEBUG
size_t ndirty = 0; size_t ndirty = 0;
@ -494,6 +663,7 @@ arena_purge(arena_t *arena)
} }
assert(ndirty == arena->ndirty); assert(ndirty == arena->ndirty);
#endif #endif
assert(arena->ndirty > arena->npurgatory);
assert(arena->ndirty > chunk_npages); assert(arena->ndirty > chunk_npages);
assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty); assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty);
@ -502,64 +672,65 @@ arena_purge(arena_t *arena)
#endif #endif
/* /*
* Only allow one thread at a time to purge dirty pages. madvise(2) * Compute the minimum number of pages that this thread should try to
* causes the kernel to modify virtual memory data structures that are * purge, and add the result to arena->npurgatory. This will keep
* typically protected by a lock, and purging isn't important enough to * multiple threads from racing to reduce ndirty below the threshold.
* suffer lock contention in the kernel. The result of failing to
* acquire purge_lock here is that this arena will operate with ndirty
* above the threshold until some dirty pages are re-used, or the
* creation of more dirty pages causes this function to be called
* again.
*/ */
if (malloc_mutex_trylock(&purge_lock)) npurgatory = (arena->ndirty - arena->npurgatory) - (arena->nactive >>
return; opt_lg_dirty_mult);
arena->npurgatory += npurgatory;
/* while (npurgatory > 0) {
* Iterate through chunks until enough dirty memory has been /* Get next chunk with dirty pages. */
* purged (all dirty pages in one chunk, or enough pages to drop to
* threshold, whichever is greater). Terminate as soon as possible in
* order to minimize the number of system calls, even if a chunk has
* only been partially purged.
*/
for (i = 0; (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty;
i++) {
chunk = ql_first(&arena->chunks_dirty); chunk = ql_first(&arena->chunks_dirty);
assert(chunk != NULL); if (chunk == NULL) {
/*
/* Purge pages from high to low within each chunk. */ * This thread was unable to purge as many pages as
for (j = chunk_npages - 1; chunk->ndirty > 0; j--) { * originally intended, due to races with other threads
assert(j >= arena_chunk_header_npages); * that either did some of the purging work, or re-used
if (chunk->map[j].bits & CHUNK_MAP_DIRTY) { * dirty pages.
chunk->map[j].bits ^= CHUNK_MAP_DIRTY; */
/* Find adjacent dirty run(s). */ arena->npurgatory -= npurgatory;
for (npages = 1; j > arena_chunk_header_npages return;
&& (chunk->map[j - 1].bits & }
CHUNK_MAP_DIRTY); npages++) { while (chunk->ndirty == 0) {
j--; ql_remove(&arena->chunks_dirty, chunk, link_dirty);
chunk->map[j].bits ^= CHUNK_MAP_DIRTY; chunk->dirtied = false;
} chunk = ql_first(&arena->chunks_dirty);
chunk->ndirty -= npages; if (chunk == NULL) {
arena->ndirty -= npages; /* Same logic as for above. */
arena->npurgatory -= npurgatory;
madvise((void *)((uintptr_t)chunk + (j << return;
PAGE_SHIFT)), (npages << PAGE_SHIFT),
MADV_DONTNEED);
#ifdef JEMALLOC_STATS
arena->stats.nmadvise++;
arena->stats.purged += npages;
#endif
if ((arena->nactive >> opt_lg_dirty_mult) >=
arena->ndirty && i > 0)
break;
} }
} }
if (chunk->ndirty == 0) { if (chunk->ndirty > npurgatory) {
ql_remove(&arena->chunks_dirty, chunk, link_dirty); /*
chunk->dirtied = false; * This thread will, at a minimum, purge all the dirty
* pages in chunk, so set npurgatory to reflect this
* thread's commitment to purge the pages. This tends
* to reduce the chances of the following scenario:
*
* 1) This thread sets arena->npurgatory such that
* (arena->ndirty - arena->npurgatory) is at the
* threshold.
* 2) This thread drops arena->lock.
* 3) Another thread causes one or more pages to be
* dirtied, and immediately determines that it must
* purge dirty pages.
*
* If this scenario *does* play out, that's okay,
* because all of the purging work being done really
* needs to happen.
*/
arena->npurgatory += chunk->ndirty - npurgatory;
npurgatory = chunk->ndirty;
} }
arena->npurgatory -= chunk->ndirty;
npurgatory -= chunk->ndirty;
arena_chunk_purge(arena, chunk);
} }
malloc_mutex_unlock(&purge_lock);
} }
static void static void
@ -685,11 +856,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty)
ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty); ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty);
chunk->dirtied = true; chunk->dirtied = true;
} }
arena_maybe_purge(arena);
/* Enforce opt_lg_dirty_mult. */
if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages &&
(arena->nactive >> opt_lg_dirty_mult) < arena->ndirty)
arena_purge(arena);
} }
} }
@ -1426,10 +1593,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty); ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty);
chunk->dirtied = true; chunk->dirtied = true;
} }
/* Enforce opt_lg_dirty_mult. */ arena_maybe_purge(arena);
if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages &&
(arena->nactive >> opt_lg_dirty_mult) < arena->ndirty)
arena_purge(arena);
malloc_mutex_unlock(&arena->lock); malloc_mutex_unlock(&arena->lock);
#ifdef JEMALLOC_STATS #ifdef JEMALLOC_STATS
@ -1842,6 +2006,7 @@ arena_new(arena_t *arena, unsigned ind)
arena->nactive = 0; arena->nactive = 0;
arena->ndirty = 0; arena->ndirty = 0;
arena->npurgatory = 0;
arena_avail_tree_new(&arena->runs_avail); arena_avail_tree_new(&arena->runs_avail);
@ -2155,8 +2320,5 @@ arena_boot(void)
((header_size & PAGE_MASK) != 0); ((header_size & PAGE_MASK) != 0);
arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT); arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT);
if (malloc_mutex_init(&purge_lock))
return (true);
return (false); return (false);
} }