From 05b21be347d66b4afd5147ecd26e8cb22f6ea3fe Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 14 Mar 2010 17:36:10 -0700 Subject: [PATCH] Purge dirty pages without arena->lock. --- jemalloc/include/jemalloc/internal/arena.h | 33 ++- jemalloc/src/arena.c | 298 ++++++++++++++++----- 2 files changed, 255 insertions(+), 76 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 06b60cab..f4186a35 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -95,14 +95,23 @@ typedef struct arena_s arena_t; /* Each element of the chunk map corresponds to one page within the chunk. */ struct arena_chunk_map_s { - /* - * Linkage for run trees. There are two disjoint uses: - * - * 1) arena_t's runs_avail tree. - * 2) arena_run_t conceptually uses this linkage for in-use non-full - * runs, rather than directly embedding linkage. - */ - rb_node(arena_chunk_map_t) link; + union { + /* + * Linkage for run trees. There are two disjoint uses: + * + * 1) arena_t's runs_avail tree. + * 2) arena_run_t conceptually uses this linkage for in-use + * non-full runs, rather than directly embedding linkage. + */ + rb_node(arena_chunk_map_t) rb_link; + /* + * List of runs currently in purgatory. arena_chunk_purge() + * temporarily allocates runs that contain dirty pages while + * purging, so that other threads cannot use the runs while the + * purging thread is operating without the arena lock held. + */ + ql_elm(arena_chunk_map_t) ql_link; + } u; #ifdef JEMALLOC_PROF /* Profile counters, used for large object runs. */ @@ -311,6 +320,14 @@ struct arena_s { */ size_t ndirty; + /* + * Approximate number of pages being purged. It is possible for + * multiple threads to purge dirty pages concurrently, and they use + * npurgatory to indicate the total number of pages all threads are + * attempting to purge. + */ + size_t npurgatory; + /* * Size/address-ordered tree of this arena's available runs. This tree * is used for first-best-fit run allocation. diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 3981cbb3..98e41d0e 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -27,9 +27,6 @@ size_t medium_max; size_t lg_mspace; size_t mspace_mask; -/* Used to prevent threads from concurrently calling madvise(2). */ -static malloc_mutex_t purge_lock; - /* * const_small_size2bin is a static constant lookup table that in the common * case can be used as-is for small_size2bin. For dynamically linked programs, @@ -215,7 +212,7 @@ arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) /* Generate red-black tree functions. */ rb_gen(static JEMALLOC_ATTR(unused), arena_run_tree_, arena_run_tree_t, - arena_chunk_map_t, link, arena_run_comp) + arena_chunk_map_t, u.rb_link, arena_run_comp) static inline int arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) @@ -247,7 +244,7 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) /* Generate red-black tree functions. */ rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t, - arena_chunk_map_t, link, arena_avail_comp) + arena_chunk_map_t, u.rb_link, arena_avail_comp) static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) @@ -480,11 +477,183 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) return (run); } +static inline void +arena_maybe_purge(arena_t *arena) +{ + + /* Enforce opt_lg_dirty_mult. 
*/ + if (opt_lg_dirty_mult >= 0 && arena->ndirty > arena->npurgatory && + (arena->ndirty - arena->npurgatory) > chunk_npages && + (arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty - + arena->npurgatory)) + arena_purge(arena); +} + +static inline void +arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) +{ + ql_head(arena_chunk_map_t) mapelms; + arena_chunk_map_t *mapelm; + size_t pageind; +#ifdef JEMALLOC_DEBUG + size_t ndirty; +#endif +#ifdef JEMALLOC_STATS + size_t nmadvise; +#endif + + ql_new(&mapelms); + + /* + * If chunk is the spare, temporarily re-allocate it, 1) so that its + * run is reinserted into runs_avail, and 2) so that it cannot be + * completely discarded by another thread while arena->lock is dropped + * by this thread. Note that the arena_run_dalloc() call will + * implicitly deallocate the chunk, so no explicit action is required + * in this function to deallocate the chunk. + */ + if (chunk == arena->spare) + arena_chunk_alloc(arena); + + /* Temporarily allocate all free runs that contain dirty pages. */ + for (pageind = arena_chunk_header_npages; pageind < chunk_npages;) { + mapelm = &chunk->map[pageind]; + if ((mapelm->bits & CHUNK_MAP_ALLOCATED) == 0) { + size_t npages, i; + + npages = (mapelm->bits & CHUNK_MAP_PG_MASK) >> + CHUNK_MAP_PG_SHIFT; + for (i = 0; i < npages; i++) { + if (chunk->map[pageind + i].bits & + CHUNK_MAP_DIRTY) { + /* + * This run contains at least one dirty + * page. Temporarily allocate it. + * Don't bother setting the + * CHUNK_MAP_ALLOCATED bit for interior + * pages yet. The bit will be set + * during the loop that actually does + * the madvise() calls, so that the run + * looks like a normal large run by the + * time it is passed to + * arena_run_dalloc(). + */ + arena_avail_tree_remove( + &arena->runs_avail, mapelm); + mapelm->bits |= (CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); + chunk->map[pageind + npages - 1].bits |= + (CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); + /* + * Append to list for later processing. + */ + ql_elm_new(mapelm, u.ql_link); + ql_tail_insert(&mapelms, mapelm, + u.ql_link); + break; + } + } + pageind += npages; + } else { + /* Skip allocated run. */ + if (mapelm->bits & CHUNK_MAP_LARGE) { + pageind += (mapelm->bits & CHUNK_MAP_PG_MASK) + >> CHUNK_MAP_PG_SHIFT; + } else { + arena_run_t *run = (arena_run_t *)((uintptr_t) + chunk + (uintptr_t)((pageind - + ((mapelm->bits & CHUNK_MAP_PG_MASK) >> + CHUNK_MAP_PG_SHIFT)) << PAGE_SHIFT)); + pageind += run->bin->run_size >> PAGE_SHIFT; + } + } + } + +#ifdef JEMALLOC_DEBUG + ndirty = chunk->ndirty; +#endif +#ifdef JEMALLOC_STATS + arena->stats.purged += chunk->ndirty; +#endif + arena->ndirty -= chunk->ndirty; + chunk->ndirty = 0; + ql_remove(&arena->chunks_dirty, chunk, link_dirty); + chunk->dirtied = false; + + malloc_mutex_unlock(&arena->lock); +#ifdef JEMALLOC_STATS + nmadvise = 0; +#endif + ql_foreach(mapelm, &mapelms, u.ql_link) { + size_t i, j; + size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t); + size_t npages = (mapelm->bits & CHUNK_MAP_PG_MASK) >> + CHUNK_MAP_PG_SHIFT; + + for (i = 0; i < npages;) { + if (chunk->map[pageind + i].bits & CHUNK_MAP_DIRTY) { + chunk->map[pageind + i].bits ^= CHUNK_MAP_DIRTY; + chunk->map[pageind + i].bits |= + (CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); + + /* Find adjacent dirty page(s). 
*/ + for (j = 1; i + j < npages; j++) { + if ((chunk->map[pageind + i + j].bits & + CHUNK_MAP_DIRTY) == 0) + break; + chunk->map[pageind + i + j].bits ^= + CHUNK_MAP_DIRTY; + chunk->map[pageind + i + j].bits |= + (CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); + } +#ifdef JEMALLOC_DEBUG + assert(ndirty >= j); + ndirty -= j; +#endif + + madvise((void *)((uintptr_t)chunk + ((pageind + + i) << PAGE_SHIFT)), (j << PAGE_SHIFT), + MADV_DONTNEED); +#ifdef JEMALLOC_STATS + nmadvise++; +#endif + i += j; + } else { + chunk->map[pageind + i].bits |= + (CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); + i++; + } + } + } +#ifdef JEMALLOC_DEBUG + assert(ndirty == 0); +#endif + malloc_mutex_lock(&arena->lock); +#ifdef JEMALLOC_STATS + arena->stats.nmadvise += nmadvise; +#endif + + /* Deallocate runs. */ + for (mapelm = ql_first(&mapelms); mapelm != NULL; + mapelm = ql_first(&mapelms)) { + size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t); + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)(pageind << PAGE_SHIFT)); + + ql_remove(&mapelms, mapelm, u.ql_link); + arena_run_dalloc(arena, run, false); + } +} + static void arena_purge(arena_t *arena) { arena_chunk_t *chunk; - size_t i, j, npages; + size_t npurgatory; #ifdef JEMALLOC_DEBUG size_t ndirty = 0; @@ -494,6 +663,7 @@ arena_purge(arena_t *arena) } assert(ndirty == arena->ndirty); #endif + assert(arena->ndirty > arena->npurgatory); assert(arena->ndirty > chunk_npages); assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty); @@ -502,64 +672,65 @@ arena_purge(arena_t *arena) #endif /* - * Only allow one thread at a time to purge dirty pages. madvise(2) - * causes the kernel to modify virtual memory data structures that are - * typically protected by a lock, and purging isn't important enough to - * suffer lock contention in the kernel. The result of failing to - * acquire purge_lock here is that this arena will operate with ndirty - * above the threshold until some dirty pages are re-used, or the - * creation of more dirty pages causes this function to be called - * again. + * Compute the minimum number of pages that this thread should try to + * purge, and add the result to arena->npurgatory. This will keep + * multiple threads from racing to reduce ndirty below the threshold. */ - if (malloc_mutex_trylock(&purge_lock)) - return; + npurgatory = (arena->ndirty - arena->npurgatory) - (arena->nactive >> + opt_lg_dirty_mult); + arena->npurgatory += npurgatory; - /* - * Iterate through chunks until enough dirty memory has been - * purged (all dirty pages in one chunk, or enough pages to drop to - * threshold, whichever is greater). Terminate as soon as possible in - * order to minimize the number of system calls, even if a chunk has - * only been partially purged. - */ - for (i = 0; (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty; - i++) { + while (npurgatory > 0) { + /* Get next chunk with dirty pages. */ chunk = ql_first(&arena->chunks_dirty); - assert(chunk != NULL); - - /* Purge pages from high to low within each chunk. */ - for (j = chunk_npages - 1; chunk->ndirty > 0; j--) { - assert(j >= arena_chunk_header_npages); - if (chunk->map[j].bits & CHUNK_MAP_DIRTY) { - chunk->map[j].bits ^= CHUNK_MAP_DIRTY; - /* Find adjacent dirty run(s). 
*/ - for (npages = 1; j > arena_chunk_header_npages - && (chunk->map[j - 1].bits & - CHUNK_MAP_DIRTY); npages++) { - j--; - chunk->map[j].bits ^= CHUNK_MAP_DIRTY; - } - chunk->ndirty -= npages; - arena->ndirty -= npages; - - madvise((void *)((uintptr_t)chunk + (j << - PAGE_SHIFT)), (npages << PAGE_SHIFT), - MADV_DONTNEED); -#ifdef JEMALLOC_STATS - arena->stats.nmadvise++; - arena->stats.purged += npages; -#endif - if ((arena->nactive >> opt_lg_dirty_mult) >= - arena->ndirty && i > 0) - break; + if (chunk == NULL) { + /* + * This thread was unable to purge as many pages as + * originally intended, due to races with other threads + * that either did some of the purging work, or re-used + * dirty pages. + */ + arena->npurgatory -= npurgatory; + return; + } + while (chunk->ndirty == 0) { + ql_remove(&arena->chunks_dirty, chunk, link_dirty); + chunk->dirtied = false; + chunk = ql_first(&arena->chunks_dirty); + if (chunk == NULL) { + /* Same logic as for above. */ + arena->npurgatory -= npurgatory; + return; } } - if (chunk->ndirty == 0) { - ql_remove(&arena->chunks_dirty, chunk, link_dirty); - chunk->dirtied = false; + if (chunk->ndirty > npurgatory) { + /* + * This thread will, at a minimum, purge all the dirty + * pages in chunk, so set npurgatory to reflect this + * thread's commitment to purge the pages. This tends + * to reduce the chances of the following scenario: + * + * 1) This thread sets arena->npurgatory such that + * (arena->ndirty - arena->npurgatory) is at the + * threshold. + * 2) This thread drops arena->lock. + * 3) Another thread causes one or more pages to be + * dirtied, and immediately determines that it must + * purge dirty pages. + * + * If this scenario *does* play out, that's okay, + * because all of the purging work being done really + * needs to happen. + */ + arena->npurgatory += chunk->ndirty - npurgatory; + npurgatory = chunk->ndirty; } + + arena->npurgatory -= chunk->ndirty; + npurgatory -= chunk->ndirty; + arena_chunk_purge(arena, chunk); } - malloc_mutex_unlock(&purge_lock); } static void @@ -685,11 +856,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty); chunk->dirtied = true; } - - /* Enforce opt_lg_dirty_mult. */ - if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages && - (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty) - arena_purge(arena); + arena_maybe_purge(arena); } } @@ -1426,10 +1593,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty); chunk->dirtied = true; } - /* Enforce opt_lg_dirty_mult. */ - if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages && - (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty) - arena_purge(arena); + arena_maybe_purge(arena); malloc_mutex_unlock(&arena->lock); #ifdef JEMALLOC_STATS @@ -1842,6 +2006,7 @@ arena_new(arena_t *arena, unsigned ind) arena->nactive = 0; arena->ndirty = 0; + arena->npurgatory = 0; arena_avail_tree_new(&arena->runs_avail); @@ -2155,8 +2320,5 @@ arena_boot(void) ((header_size & PAGE_MASK) != 0); arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT); - if (malloc_mutex_init(&purge_lock)) - return (true); - return (false); }
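
For readers following the new policy, here is a minimal standalone sketch of the dirty-page threshold that arena_maybe_purge() enforces. The toy_arena_t struct, the CHUNK_NPAGES constant, and the opt_lg_dirty_mult default below are illustrative assumptions rather than jemalloc's definitions; only the shape of the comparison mirrors the patch: the dirty pages that no thread has already committed to purging must exceed both one chunk's worth of pages and (nactive >> opt_lg_dirty_mult) before a purge is triggered.

/* sketch.c -- illustrative only; the field names mirror the patch, but the
 * struct layout and the constants are simplified assumptions. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define	CHUNK_NPAGES	256		/* assumed chunk size, in pages */
static int opt_lg_dirty_mult = 5;	/* allow ndirty up to nactive/32 */

typedef struct {
	size_t nactive;		/* pages in active runs */
	size_t ndirty;		/* dirty, purgeable pages */
	size_t npurgatory;	/* pages other threads promised to purge */
} toy_arena_t;

/* Mirrors the condition in arena_maybe_purge(): only dirty pages that no
 * thread has committed to purging count against the threshold. */
static bool
toy_should_purge(const toy_arena_t *a)
{
	size_t unclaimed = a->ndirty - a->npurgatory;

	return (opt_lg_dirty_mult >= 0 && a->ndirty > a->npurgatory &&
	    unclaimed > CHUNK_NPAGES &&
	    (a->nactive >> opt_lg_dirty_mult) < unclaimed);
}

int
main(void)
{
	toy_arena_t a = {1 << 16, 3000, 0};

	printf("purge? %d\n", toy_should_purge(&a)); /* 1: 3000 > 65536/32 */
	a.npurgatory = 1500;	/* another thread is already purging */
	printf("purge? %d\n", toy_should_purge(&a)); /* 0: remainder is below threshold */
	return (0);
}

The second check prints 0 because the pages another thread has already claimed via npurgatory no longer count toward the threshold, which is what keeps concurrent callers from piling onto the same purge work.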
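
The point of the purgatory machinery is that madvise(2) now runs without arena->lock held. The schematic below, again with made-up globals and no real run bookkeeping, shows the claim / unlock / purge / relock sequence that arena_purge() and arena_chunk_purge() implement between them; it is a sketch of the pattern, not of the actual data structures.

/* purge_sketch.c -- a schematic of the lock-drop pattern introduced by this
 * patch.  The globals and the single hard-coded claim are placeholders; the
 * real code tracks per-run state and walks a purgatory list of map elements. */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

static pthread_mutex_t arena_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t ndirty = 3000;
static size_t npurgatory = 0;

static void
toy_purge(void)
{
	size_t claim;

	pthread_mutex_lock(&arena_lock);
	/* 1) Claim work while the lock is held, so that concurrent threshold
	 * checks see these pages as already spoken for. */
	claim = ndirty;		/* simplified: claim everything */
	npurgatory += claim;
	/* 2) The real code also "allocates" the dirty runs into a purgatory
	 * list here, so no other thread can hand them out meanwhile. */
	pthread_mutex_unlock(&arena_lock);

	/* 3) madvise(..., MADV_DONTNEED) happens here, with the arena lock
	 * dropped, so the kernel work no longer serializes the arena. */

	pthread_mutex_lock(&arena_lock);
	/* 4) Retire the claim and return the now-clean runs. */
	ndirty -= claim;
	npurgatory -= claim;
	pthread_mutex_unlock(&arena_lock);
}

int
main(void)
{

	toy_purge();
	printf("ndirty=%zu npurgatory=%zu\n", ndirty, npurgatory);
	return (0);
}

Run from several threads, the same pattern lets one thread's madvise() calls overlap with other threads allocating from the arena, which is exactly the contention the old global purge_lock could not avoid.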
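
The quota adjustment in arena_purge() is easier to see with concrete numbers. In the loop below the chunk dirty counts are invented, but the bookkeeping mirrors the patch: when a chunk holds more dirty pages than the thread's remaining quota, the thread raises both its quota and the arena-wide commitment to cover the whole chunk, because arena_chunk_purge() always purges every dirty page in the chunk it is handed.

/* quota_sketch.c -- models the per-chunk bookkeeping in arena_purge() with
 * invented chunk dirty counts. */
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	size_t chunk_ndirty[] = {120, 900, 40};	/* fake chunks_dirty list */
	size_t arena_npurgatory = 0;
	size_t quota = 1000;	/* pages this thread set out to purge */
	size_t i;

	arena_npurgatory += quota;
	for (i = 0; i < 3 && quota > 0; i++) {
		size_t nd = chunk_ndirty[i];

		if (nd > quota) {
			/* Purging this chunk overshoots the quota; grow the
			 * arena-wide commitment so the threshold check stays
			 * conservative while the lock is dropped. */
			arena_npurgatory += nd - quota;
			quota = nd;
		}
		arena_npurgatory -= nd;
		quota -= nd;
		printf("chunk %zu: purged %zu, quota left %zu\n", i, nd,
		    quota);
	}
	return (0);
}

The third chunk is left alone: once the commitment is met the thread stops, and any remaining dirty pages wait for the next caller that trips the threshold in arena_maybe_purge().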