From 05b21be347d66b4afd5147ecd26e8cb22f6ea3fe Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 14 Mar 2010 17:36:10 -0700 Subject: [PATCH] Purge dirty pages without arena->lock. --- jemalloc/include/jemalloc/internal/arena.h | 33 ++- jemalloc/src/arena.c | 298 ++++++++++++++++----- 2 files changed, 255 insertions(+), 76 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 06b60cab..f4186a35 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -95,14 +95,23 @@ typedef struct arena_s arena_t; /* Each element of the chunk map corresponds to one page within the chunk. */ struct arena_chunk_map_s { - /* - * Linkage for run trees. There are two disjoint uses: - * - * 1) arena_t's runs_avail tree. - * 2) arena_run_t conceptually uses this linkage for in-use non-full - * runs, rather than directly embedding linkage. - */ - rb_node(arena_chunk_map_t) link; + union { + /* + * Linkage for run trees. There are two disjoint uses: + * + * 1) arena_t's runs_avail tree. + * 2) arena_run_t conceptually uses this linkage for in-use + * non-full runs, rather than directly embedding linkage. + */ + rb_node(arena_chunk_map_t) rb_link; + /* + * List of runs currently in purgatory. arena_chunk_purge() + * temporarily allocates runs that contain dirty pages while + * purging, so that other threads cannot use the runs while the + * purging thread is operating without the arena lock held. + */ + ql_elm(arena_chunk_map_t) ql_link; + } u; #ifdef JEMALLOC_PROF /* Profile counters, used for large object runs. */ @@ -311,6 +320,14 @@ struct arena_s { */ size_t ndirty; + /* + * Approximate number of pages being purged. It is possible for + * multiple threads to purge dirty pages concurrently, and they use + * npurgatory to indicate the total number of pages all threads are + * attempting to purge. + */ + size_t npurgatory; + /* * Size/address-ordered tree of this arena's available runs. This tree * is used for first-best-fit run allocation. diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 3981cbb3..98e41d0e 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -27,9 +27,6 @@ size_t medium_max; size_t lg_mspace; size_t mspace_mask; -/* Used to prevent threads from concurrently calling madvise(2). */ -static malloc_mutex_t purge_lock; - /* * const_small_size2bin is a static constant lookup table that in the common * case can be used as-is for small_size2bin. For dynamically linked programs, @@ -215,7 +212,7 @@ arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) /* Generate red-black tree functions. */ rb_gen(static JEMALLOC_ATTR(unused), arena_run_tree_, arena_run_tree_t, - arena_chunk_map_t, link, arena_run_comp) + arena_chunk_map_t, u.rb_link, arena_run_comp) static inline int arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) @@ -247,7 +244,7 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) /* Generate red-black tree functions. */ rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t, - arena_chunk_map_t, link, arena_avail_comp) + arena_chunk_map_t, u.rb_link, arena_avail_comp) static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) @@ -480,11 +477,183 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) return (run); } +static inline void +arena_maybe_purge(arena_t *arena) +{ + + /* Enforce opt_lg_dirty_mult. 
*/ + if (opt_lg_dirty_mult >= 0 && arena->ndirty > arena->npurgatory && + (arena->ndirty - arena->npurgatory) > chunk_npages && + (arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty - + arena->npurgatory)) + arena_purge(arena); +} + +static inline void +arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) +{ + ql_head(arena_chunk_map_t) mapelms; + arena_chunk_map_t *mapelm; + size_t pageind; +#ifdef JEMALLOC_DEBUG + size_t ndirty; +#endif +#ifdef JEMALLOC_STATS + size_t nmadvise; +#endif + + ql_new(&mapelms); + + /* + * If chunk is the spare, temporarily re-allocate it, 1) so that its + * run is reinserted into runs_avail, and 2) so that it cannot be + * completely discarded by another thread while arena->lock is dropped + * by this thread. Note that the arena_run_dalloc() call will + * implicitly deallocate the chunk, so no explicit action is required + * in this function to deallocate the chunk. + */ + if (chunk == arena->spare) + arena_chunk_alloc(arena); + + /* Temporarily allocate all free runs that contain dirty pages. */ + for (pageind = arena_chunk_header_npages; pageind < chunk_npages;) { + mapelm = &chunk->map[pageind]; + if ((mapelm->bits & CHUNK_MAP_ALLOCATED) == 0) { + size_t npages, i; + + npages = (mapelm->bits & CHUNK_MAP_PG_MASK) >> + CHUNK_MAP_PG_SHIFT; + for (i = 0; i < npages; i++) { + if (chunk->map[pageind + i].bits & + CHUNK_MAP_DIRTY) { + /* + * This run contains at least one dirty + * page. Temporarily allocate it. + * Don't bother setting the + * CHUNK_MAP_ALLOCATED bit for interior + * pages yet. The bit will be set + * during the loop that actually does + * the madvise() calls, so that the run + * looks like a normal large run by the + * time it is passed to + * arena_run_dalloc(). + */ + arena_avail_tree_remove( + &arena->runs_avail, mapelm); + mapelm->bits |= (CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); + chunk->map[pageind + npages - 1].bits |= + (CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); + /* + * Append to list for later processing. + */ + ql_elm_new(mapelm, u.ql_link); + ql_tail_insert(&mapelms, mapelm, + u.ql_link); + break; + } + } + pageind += npages; + } else { + /* Skip allocated run. */ + if (mapelm->bits & CHUNK_MAP_LARGE) { + pageind += (mapelm->bits & CHUNK_MAP_PG_MASK) + >> CHUNK_MAP_PG_SHIFT; + } else { + arena_run_t *run = (arena_run_t *)((uintptr_t) + chunk + (uintptr_t)((pageind - + ((mapelm->bits & CHUNK_MAP_PG_MASK) >> + CHUNK_MAP_PG_SHIFT)) << PAGE_SHIFT)); + pageind += run->bin->run_size >> PAGE_SHIFT; + } + } + } + +#ifdef JEMALLOC_DEBUG + ndirty = chunk->ndirty; +#endif +#ifdef JEMALLOC_STATS + arena->stats.purged += chunk->ndirty; +#endif + arena->ndirty -= chunk->ndirty; + chunk->ndirty = 0; + ql_remove(&arena->chunks_dirty, chunk, link_dirty); + chunk->dirtied = false; + + malloc_mutex_unlock(&arena->lock); +#ifdef JEMALLOC_STATS + nmadvise = 0; +#endif + ql_foreach(mapelm, &mapelms, u.ql_link) { + size_t i, j; + size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t); + size_t npages = (mapelm->bits & CHUNK_MAP_PG_MASK) >> + CHUNK_MAP_PG_SHIFT; + + for (i = 0; i < npages;) { + if (chunk->map[pageind + i].bits & CHUNK_MAP_DIRTY) { + chunk->map[pageind + i].bits ^= CHUNK_MAP_DIRTY; + chunk->map[pageind + i].bits |= + (CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); + + /* Find adjacent dirty page(s). 
*/ + for (j = 1; i + j < npages; j++) { + if ((chunk->map[pageind + i + j].bits & + CHUNK_MAP_DIRTY) == 0) + break; + chunk->map[pageind + i + j].bits ^= + CHUNK_MAP_DIRTY; + chunk->map[pageind + i + j].bits |= + (CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); + } +#ifdef JEMALLOC_DEBUG + assert(ndirty >= j); + ndirty -= j; +#endif + + madvise((void *)((uintptr_t)chunk + ((pageind + + i) << PAGE_SHIFT)), (j << PAGE_SHIFT), + MADV_DONTNEED); +#ifdef JEMALLOC_STATS + nmadvise++; +#endif + i += j; + } else { + chunk->map[pageind + i].bits |= + (CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); + i++; + } + } + } +#ifdef JEMALLOC_DEBUG + assert(ndirty == 0); +#endif + malloc_mutex_lock(&arena->lock); +#ifdef JEMALLOC_STATS + arena->stats.nmadvise += nmadvise; +#endif + + /* Deallocate runs. */ + for (mapelm = ql_first(&mapelms); mapelm != NULL; + mapelm = ql_first(&mapelms)) { + size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t); + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)(pageind << PAGE_SHIFT)); + + ql_remove(&mapelms, mapelm, u.ql_link); + arena_run_dalloc(arena, run, false); + } +} + static void arena_purge(arena_t *arena) { arena_chunk_t *chunk; - size_t i, j, npages; + size_t npurgatory; #ifdef JEMALLOC_DEBUG size_t ndirty = 0; @@ -494,6 +663,7 @@ arena_purge(arena_t *arena) } assert(ndirty == arena->ndirty); #endif + assert(arena->ndirty > arena->npurgatory); assert(arena->ndirty > chunk_npages); assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty); @@ -502,64 +672,65 @@ arena_purge(arena_t *arena) #endif /* - * Only allow one thread at a time to purge dirty pages. madvise(2) - * causes the kernel to modify virtual memory data structures that are - * typically protected by a lock, and purging isn't important enough to - * suffer lock contention in the kernel. The result of failing to - * acquire purge_lock here is that this arena will operate with ndirty - * above the threshold until some dirty pages are re-used, or the - * creation of more dirty pages causes this function to be called - * again. + * Compute the minimum number of pages that this thread should try to + * purge, and add the result to arena->npurgatory. This will keep + * multiple threads from racing to reduce ndirty below the threshold. */ - if (malloc_mutex_trylock(&purge_lock)) - return; + npurgatory = (arena->ndirty - arena->npurgatory) - (arena->nactive >> + opt_lg_dirty_mult); + arena->npurgatory += npurgatory; - /* - * Iterate through chunks until enough dirty memory has been - * purged (all dirty pages in one chunk, or enough pages to drop to - * threshold, whichever is greater). Terminate as soon as possible in - * order to minimize the number of system calls, even if a chunk has - * only been partially purged. - */ - for (i = 0; (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty; - i++) { + while (npurgatory > 0) { + /* Get next chunk with dirty pages. */ chunk = ql_first(&arena->chunks_dirty); - assert(chunk != NULL); - - /* Purge pages from high to low within each chunk. */ - for (j = chunk_npages - 1; chunk->ndirty > 0; j--) { - assert(j >= arena_chunk_header_npages); - if (chunk->map[j].bits & CHUNK_MAP_DIRTY) { - chunk->map[j].bits ^= CHUNK_MAP_DIRTY; - /* Find adjacent dirty run(s). 
*/ - for (npages = 1; j > arena_chunk_header_npages - && (chunk->map[j - 1].bits & - CHUNK_MAP_DIRTY); npages++) { - j--; - chunk->map[j].bits ^= CHUNK_MAP_DIRTY; - } - chunk->ndirty -= npages; - arena->ndirty -= npages; - - madvise((void *)((uintptr_t)chunk + (j << - PAGE_SHIFT)), (npages << PAGE_SHIFT), - MADV_DONTNEED); -#ifdef JEMALLOC_STATS - arena->stats.nmadvise++; - arena->stats.purged += npages; -#endif - if ((arena->nactive >> opt_lg_dirty_mult) >= - arena->ndirty && i > 0) - break; + if (chunk == NULL) { + /* + * This thread was unable to purge as many pages as + * originally intended, due to races with other threads + * that either did some of the purging work, or re-used + * dirty pages. + */ + arena->npurgatory -= npurgatory; + return; + } + while (chunk->ndirty == 0) { + ql_remove(&arena->chunks_dirty, chunk, link_dirty); + chunk->dirtied = false; + chunk = ql_first(&arena->chunks_dirty); + if (chunk == NULL) { + /* Same logic as for above. */ + arena->npurgatory -= npurgatory; + return; } } - if (chunk->ndirty == 0) { - ql_remove(&arena->chunks_dirty, chunk, link_dirty); - chunk->dirtied = false; + if (chunk->ndirty > npurgatory) { + /* + * This thread will, at a minimum, purge all the dirty + * pages in chunk, so set npurgatory to reflect this + * thread's commitment to purge the pages. This tends + * to reduce the chances of the following scenario: + * + * 1) This thread sets arena->npurgatory such that + * (arena->ndirty - arena->npurgatory) is at the + * threshold. + * 2) This thread drops arena->lock. + * 3) Another thread causes one or more pages to be + * dirtied, and immediately determines that it must + * purge dirty pages. + * + * If this scenario *does* play out, that's okay, + * because all of the purging work being done really + * needs to happen. + */ + arena->npurgatory += chunk->ndirty - npurgatory; + npurgatory = chunk->ndirty; } + + arena->npurgatory -= chunk->ndirty; + npurgatory -= chunk->ndirty; + arena_chunk_purge(arena, chunk); } - malloc_mutex_unlock(&purge_lock); } static void @@ -685,11 +856,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty); chunk->dirtied = true; } - - /* Enforce opt_lg_dirty_mult. */ - if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages && - (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty) - arena_purge(arena); + arena_maybe_purge(arena); } } @@ -1426,10 +1593,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty); chunk->dirtied = true; } - /* Enforce opt_lg_dirty_mult. */ - if (opt_lg_dirty_mult >= 0 && arena->ndirty > chunk_npages && - (arena->nactive >> opt_lg_dirty_mult) < arena->ndirty) - arena_purge(arena); + arena_maybe_purge(arena); malloc_mutex_unlock(&arena->lock); #ifdef JEMALLOC_STATS @@ -1842,6 +2006,7 @@ arena_new(arena_t *arena, unsigned ind) arena->nactive = 0; arena->ndirty = 0; + arena->npurgatory = 0; arena_avail_tree_new(&arena->runs_avail); @@ -2155,8 +2320,5 @@ arena_boot(void) ((header_size & PAGE_MASK) != 0); arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT); - if (malloc_mutex_init(&purge_lock)) - return (true); - return (false); }
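
For readers following the new policy, here is a minimal standalone sketch of the dirty-page threshold that arena_maybe_purge() enforces. The toy_arena_t struct, the CHUNK_NPAGES constant, and the opt_lg_dirty_mult default below are illustrative assumptions rather than jemalloc's definitions; only the shape of the comparison mirrors the patch: the dirty pages that no thread has already committed to purging must exceed both one chunk's worth of pages and (nactive >> opt_lg_dirty_mult) before a purge is triggered.

/* sketch.c -- illustrative only; the field names mirror the patch, but the
 * struct layout and the constants are simplified assumptions. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define	CHUNK_NPAGES	256		/* assumed chunk size, in pages */
static int opt_lg_dirty_mult = 5;	/* allow ndirty up to nactive/32 */

typedef struct {
	size_t nactive;		/* pages in active runs */
	size_t ndirty;		/* dirty, purgeable pages */
	size_t npurgatory;	/* pages other threads promised to purge */
} toy_arena_t;

/* Mirrors the condition in arena_maybe_purge(): only dirty pages that no
 * thread has committed to purging count against the threshold. */
static bool
toy_should_purge(const toy_arena_t *a)
{
	size_t unclaimed = a->ndirty - a->npurgatory;

	return (opt_lg_dirty_mult >= 0 && a->ndirty > a->npurgatory &&
	    unclaimed > CHUNK_NPAGES &&
	    (a->nactive >> opt_lg_dirty_mult) < unclaimed);
}

int
main(void)
{
	toy_arena_t a = {1 << 16, 3000, 0};

	printf("purge? %d\n", toy_should_purge(&a)); /* 1: 3000 > 65536/32 */
	a.npurgatory = 1500;	/* another thread is already purging */
	printf("purge? %d\n", toy_should_purge(&a)); /* 0: remainder is below threshold */
	return (0);
}

The second check prints 0 because the pages another thread has already claimed via npurgatory no longer count toward the threshold, which is what keeps concurrent callers from piling onto the same purge work.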
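
The point of the purgatory machinery is that madvise(2) now runs without arena->lock held. The schematic below, again with made-up globals and no real run bookkeeping, shows the claim / unlock / purge / relock sequence that arena_purge() and arena_chunk_purge() implement between them; it is a sketch of the pattern, not of the actual data structures.

/* purge_sketch.c -- a schematic of the lock-drop pattern introduced by this
 * patch.  The globals and the single hard-coded claim are placeholders; the
 * real code tracks per-run state and walks a purgatory list of map elements. */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

static pthread_mutex_t arena_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t ndirty = 3000;
static size_t npurgatory = 0;

static void
toy_purge(void)
{
	size_t claim;

	pthread_mutex_lock(&arena_lock);
	/* 1) Claim work while the lock is held, so that concurrent threshold
	 * checks see these pages as already spoken for. */
	claim = ndirty;		/* simplified: claim everything */
	npurgatory += claim;
	/* 2) The real code also "allocates" the dirty runs into a purgatory
	 * list here, so no other thread can hand them out meanwhile. */
	pthread_mutex_unlock(&arena_lock);

	/* 3) madvise(..., MADV_DONTNEED) happens here, with the arena lock
	 * dropped, so the kernel work no longer serializes the arena. */

	pthread_mutex_lock(&arena_lock);
	/* 4) Retire the claim and return the now-clean runs. */
	ndirty -= claim;
	npurgatory -= claim;
	pthread_mutex_unlock(&arena_lock);
}

int
main(void)
{

	toy_purge();
	printf("ndirty=%zu npurgatory=%zu\n", ndirty, npurgatory);
	return (0);
}

Run from several threads, the same pattern lets one thread's madvise() calls overlap with other threads allocating from the arena, which is exactly the contention the old global purge_lock could not avoid.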
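
The quota adjustment in arena_purge() is easier to see with concrete numbers. In the loop below the chunk dirty counts are invented, but the bookkeeping mirrors the patch: when a chunk holds more dirty pages than the thread's remaining quota, the thread raises both its quota and the arena-wide commitment to cover the whole chunk, because arena_chunk_purge() always purges every dirty page in the chunk it is handed.

/* quota_sketch.c -- models the per-chunk bookkeeping in arena_purge() with
 * invented chunk dirty counts. */
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	size_t chunk_ndirty[] = {120, 900, 40};	/* fake chunks_dirty list */
	size_t arena_npurgatory = 0;
	size_t quota = 1000;	/* pages this thread set out to purge */
	size_t i;

	arena_npurgatory += quota;
	for (i = 0; i < 3 && quota > 0; i++) {
		size_t nd = chunk_ndirty[i];

		if (nd > quota) {
			/* Purging this chunk overshoots the quota; grow the
			 * arena-wide commitment so the threshold check stays
			 * conservative while the lock is dropped. */
			arena_npurgatory += nd - quota;
			quota = nd;
		}
		arena_npurgatory -= nd;
		quota -= nd;
		printf("chunk %zu: purged %zu, quota left %zu\n", i, nd,
		    quota);
	}
	return (0);
}

The third chunk is left alone: once the commitment is met the thread stops, and any remaining dirty pages wait for the next caller that trips the threshold in arena_maybe_purge().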