diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index c9fd4ecb..e199a037 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -71,9 +71,6 @@ bool chunk_purge_wrapper(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, size_t size, size_t offset, size_t length); bool chunk_boot(void); -void chunk_prefork(tsdn_t *tsdn); -void chunk_postfork_parent(tsdn_t *tsdn); -void chunk_postfork_child(tsdn_t *tsdn); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/chunk_dss.h b/include/jemalloc/internal/chunk_dss.h index 724fa579..da8511ba 100644 --- a/include/jemalloc/internal/chunk_dss.h +++ b/include/jemalloc/internal/chunk_dss.h @@ -21,15 +21,13 @@ extern const char *dss_prec_names[]; /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -dss_prec_t chunk_dss_prec_get(tsdn_t *tsdn); -bool chunk_dss_prec_set(tsdn_t *tsdn, dss_prec_t dss_prec); +dss_prec_t chunk_dss_prec_get(void); +bool chunk_dss_prec_set(dss_prec_t dss_prec); void *chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit); -bool chunk_in_dss(tsdn_t *tsdn, void *chunk); -bool chunk_dss_boot(void); -void chunk_dss_prefork(tsdn_t *tsdn); -void chunk_dss_postfork_parent(tsdn_t *tsdn); -void chunk_dss_postfork_child(tsdn_t *tsdn); +bool chunk_in_dss(void *chunk); +bool chunk_dss_mergeable(void *chunk_a, void *chunk_b); +void chunk_dss_boot(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index b5fa9e63..22184d9b 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -17,7 +17,7 @@ bool huge_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t usize, size_t alignment, bool zero, tcache_t *tcache); #ifdef JEMALLOC_JET -typedef void (huge_dalloc_junk_t)(tsdn_t *, void *, size_t); +typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif void huge_dalloc(tsdn_t *tsdn, void *ptr); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index cd6681c8..642c3de7 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -167,20 +167,15 @@ chunk_dalloc_mmap chunk_dalloc_wrapper chunk_deregister chunk_dss_boot -chunk_dss_postfork_child -chunk_dss_postfork_parent +chunk_dss_mergeable chunk_dss_prec_get chunk_dss_prec_set -chunk_dss_prefork chunk_hooks_default chunk_hooks_get chunk_hooks_set chunk_in_dss chunk_lookup chunk_npages -chunk_postfork_child -chunk_postfork_parent -chunk_prefork chunk_purge_wrapper chunk_register chunks_rtree diff --git a/src/arena.c b/src/arena.c index 90b9d822..76514955 100644 --- a/src/arena.c +++ b/src/arena.c @@ -3499,7 +3499,7 @@ arena_new(tsdn_t *tsdn, unsigned ind) (uint64_t)(uintptr_t)arena; } - arena->dss_prec = chunk_dss_prec_get(tsdn); + arena->dss_prec = chunk_dss_prec_get(); ql_new(&arena->achunks); diff --git a/src/chunk.c b/src/chunk.c index dff537f5..302b98cb 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -611,10 +611,10 @@ chunk_dalloc_cache(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t 
*chunk_hooks, } static bool -chunk_dalloc_default_impl(tsdn_t *tsdn, void *chunk, size_t size) +chunk_dalloc_default_impl(void *chunk, size_t size) { - if (!have_dss || !chunk_in_dss(tsdn, chunk)) + if (!have_dss || !chunk_in_dss(chunk)) return (chunk_dalloc_mmap(chunk, size)); return (true); } @@ -623,11 +623,8 @@ static bool chunk_dalloc_default(void *chunk, size_t size, bool committed, unsigned arena_ind) { - tsdn_t *tsdn; - tsdn = tsdn_fetch(); - - return (chunk_dalloc_default_impl(tsdn, chunk, size)); + return (chunk_dalloc_default_impl(chunk, size)); } void @@ -645,7 +642,7 @@ chunk_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, /* Try to deallocate. */ if (chunk_hooks->dalloc == chunk_dalloc_default) { /* Call directly to propagate tsdn. */ - err = chunk_dalloc_default_impl(tsdn, chunk, size); + err = chunk_dalloc_default_impl(chunk, size); } else err = chunk_hooks->dalloc(chunk, size, committed, arena->ind); @@ -718,13 +715,12 @@ chunk_split_default(void *chunk, size_t size, size_t size_a, size_t size_b, } static bool -chunk_merge_default_impl(tsdn_t *tsdn, void *chunk_a, void *chunk_b) +chunk_merge_default_impl(void *chunk_a, void *chunk_b) { if (!maps_coalesce) return (true); - if (have_dss && chunk_in_dss(tsdn, chunk_a) != chunk_in_dss(tsdn, - chunk_b)) + if (have_dss && !chunk_dss_mergeable(chunk_a, chunk_b)) return (true); return (false); @@ -734,11 +730,8 @@ static bool chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b, bool committed, unsigned arena_ind) { - tsdn_t *tsdn; - tsdn = tsdn_fetch(); - - return (chunk_merge_default_impl(tsdn, chunk_a, chunk_b)); + return (chunk_merge_default_impl(chunk_a, chunk_b)); } static rtree_node_elm_t * @@ -782,32 +775,11 @@ chunk_boot(void) chunksize_mask = chunksize - 1; chunk_npages = (chunksize >> LG_PAGE); - if (have_dss && chunk_dss_boot()) - return (true); + if (have_dss) + chunk_dss_boot(); if (rtree_new(&chunks_rtree, (unsigned)((ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk), chunks_rtree_node_alloc, NULL)) return (true); return (false); } - -void -chunk_prefork(tsdn_t *tsdn) -{ - - chunk_dss_prefork(tsdn); -} - -void -chunk_postfork_parent(tsdn_t *tsdn) -{ - - chunk_dss_postfork_parent(tsdn); -} - -void -chunk_postfork_child(tsdn_t *tsdn) -{ - - chunk_dss_postfork_child(tsdn); -} diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 0b1f82bd..85a13548 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -10,20 +10,19 @@ const char *dss_prec_names[] = { "N/A" }; -/* Current dss precedence default, used when creating new arenas. */ -static dss_prec_t dss_prec_default = DSS_PREC_DEFAULT; - /* - * Protects sbrk() calls. This avoids malloc races among threads, though it - * does not protect against races with threads that call sbrk() directly. + * Current dss precedence default, used when creating new arenas. NB: This is + * stored as unsigned rather than dss_prec_t because in principle there's no + * guarantee that sizeof(dss_prec_t) is the same as sizeof(unsigned), and we use + * atomic operations to synchronize the setting. */ -static malloc_mutex_t dss_mtx; +static unsigned dss_prec_default = (unsigned)DSS_PREC_DEFAULT; /* Base address of the DSS. */ static void *dss_base; -/* Current end of the DSS, or ((void *)-1) if the DSS is exhausted. */ -static void *dss_prev; -/* Current upper limit on DSS addresses. */ +/* Atomic boolean indicating whether the DSS is exhausted. */ +static unsigned dss_exhausted; +/* Atomic current upper limit on DSS addresses. 
*/ static void *dss_max; /******************************************************************************/ @@ -41,30 +40,59 @@ chunk_dss_sbrk(intptr_t increment) } dss_prec_t -chunk_dss_prec_get(tsdn_t *tsdn) +chunk_dss_prec_get(void) { dss_prec_t ret; if (!have_dss) return (dss_prec_disabled); - malloc_mutex_lock(tsdn, &dss_mtx); - ret = dss_prec_default; - malloc_mutex_unlock(tsdn, &dss_mtx); + ret = (dss_prec_t)atomic_read_u(&dss_prec_default); return (ret); } bool -chunk_dss_prec_set(tsdn_t *tsdn, dss_prec_t dss_prec) +chunk_dss_prec_set(dss_prec_t dss_prec) { if (!have_dss) return (dss_prec != dss_prec_disabled); - malloc_mutex_lock(tsdn, &dss_mtx); - dss_prec_default = dss_prec; - malloc_mutex_unlock(tsdn, &dss_mtx); + atomic_write_u(&dss_prec_default, (unsigned)dss_prec); return (false); } +static void * +chunk_dss_max_update(void *new_addr) +{ + void *max_cur; + spin_t spinner; + + /* + * Get the current end of the DSS as max_cur and assure that dss_max is + * up to date. + */ + spin_init(&spinner); + while (true) { + void *max_prev = atomic_read_p(&dss_max); + + max_cur = chunk_dss_sbrk(0); + if ((uintptr_t)max_prev > (uintptr_t)max_cur) { + /* + * Another thread optimistically updated dss_max. Wait + * for it to finish. + */ + spin_adaptive(&spinner); + continue; + } + if (!atomic_cas_p(&dss_max, max_prev, max_cur)) + break; + } + /* Fixed new_addr can only be supported if it is at the edge of DSS. */ + if (new_addr != NULL && max_cur != new_addr) + return (NULL); + + return (max_cur); +} + void * chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit) @@ -80,28 +108,20 @@ chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, if ((intptr_t)size < 0) return (NULL); - malloc_mutex_lock(tsdn, &dss_mtx); - if (dss_prev != (void *)-1) { - + if (!atomic_read_u(&dss_exhausted)) { /* * The loop is necessary to recover from races with other * threads that are using the DSS for something other than * malloc. */ - do { - void *ret, *cpad, *dss_next; + while (true) { + void *ret, *cpad, *max_cur, *dss_next, *dss_prev; size_t gap_size, cpad_size; intptr_t incr; - /* Avoid an unnecessary system call. */ - if (new_addr != NULL && dss_max != new_addr) - break; - /* Get the current end of the DSS. */ - dss_max = chunk_dss_sbrk(0); - - /* Make sure the earlier condition still holds. */ - if (new_addr != NULL && dss_max != new_addr) - break; + max_cur = chunk_dss_max_update(new_addr); + if (max_cur == NULL) + goto label_oom; /* * Calculate how much padding is necessary to @@ -120,17 +140,23 @@ chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, cpad_size = (uintptr_t)ret - (uintptr_t)cpad; dss_next = (void *)((uintptr_t)ret + size); if ((uintptr_t)ret < (uintptr_t)dss_max || - (uintptr_t)dss_next < (uintptr_t)dss_max) { - /* Wrap-around. */ - malloc_mutex_unlock(tsdn, &dss_mtx); - return (NULL); - } + (uintptr_t)dss_next < (uintptr_t)dss_max) + goto label_oom; /* Wrap-around. */ incr = gap_size + cpad_size + size; + + /* + * Optimistically update dss_max, and roll back below if + * sbrk() fails. No other thread will try to extend the + * DSS while dss_max is greater than the current DSS + * max reported by sbrk(0). + */ + if (atomic_cas_p(&dss_max, max_cur, dss_next)) + continue; + + /* Try to allocate. */ dss_prev = chunk_dss_sbrk(incr); - if (dss_prev == dss_max) { + if (dss_prev == max_cur) { /* Success. 
*/ - dss_max = dss_next; - malloc_mutex_unlock(tsdn, &dss_mtx); if (cpad_size != 0) { chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; @@ -147,68 +173,65 @@ chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, *commit = pages_decommit(ret, size); return (ret); } - } while (dss_prev != (void *)-1); - } - malloc_mutex_unlock(tsdn, &dss_mtx); + /* + * Failure, whether due to OOM or a race with a raw + * sbrk() call from outside the allocator. Try to roll + * back optimistic dss_max update; if rollback fails, + * it's due to another caller of this function having + * succeeded since this invocation started, in which + * case rollback is not necessary. + */ + atomic_cas_p(&dss_max, dss_next, max_cur); + if (dss_prev == (void *)-1) { + /* OOM. */ + atomic_write_u(&dss_exhausted, (unsigned)true); + goto label_oom; + } + } + } +label_oom: return (NULL); } -bool -chunk_in_dss(tsdn_t *tsdn, void *chunk) +static bool +chunk_in_dss_helper(void *chunk, void *max) { - bool ret; - cassert(have_dss); - - malloc_mutex_lock(tsdn, &dss_mtx); - if ((uintptr_t)chunk >= (uintptr_t)dss_base - && (uintptr_t)chunk < (uintptr_t)dss_max) - ret = true; - else - ret = false; - malloc_mutex_unlock(tsdn, &dss_mtx); - - return (ret); + return ((uintptr_t)chunk >= (uintptr_t)dss_base && (uintptr_t)chunk < + (uintptr_t)max); } bool +chunk_in_dss(void *chunk) +{ + + cassert(have_dss); + + return (chunk_in_dss_helper(chunk, atomic_read_p(&dss_max))); +} + +bool +chunk_dss_mergeable(void *chunk_a, void *chunk_b) +{ + void *max; + + cassert(have_dss); + + max = atomic_read_p(&dss_max); + return (chunk_in_dss_helper(chunk_a, max) == + chunk_in_dss_helper(chunk_b, max)); +} + +void chunk_dss_boot(void) { cassert(have_dss); - if (malloc_mutex_init(&dss_mtx, "dss", WITNESS_RANK_DSS)) - return (true); dss_base = chunk_dss_sbrk(0); - dss_prev = dss_base; + dss_exhausted = (unsigned)(dss_base == (void *)-1); dss_max = dss_base; - - return (false); -} - -void -chunk_dss_prefork(tsdn_t *tsdn) -{ - - if (have_dss) - malloc_mutex_prefork(tsdn, &dss_mtx); -} - -void -chunk_dss_postfork_parent(tsdn_t *tsdn) -{ - - if (have_dss) - malloc_mutex_postfork_parent(tsdn, &dss_mtx); -} - -void -chunk_dss_postfork_child(tsdn_t *tsdn) -{ - - if (have_dss) - malloc_mutex_postfork_child(tsdn, &dss_mtx); } /******************************************************************************/ diff --git a/src/ctl.c b/src/ctl.c index dad80086..5d2c8db4 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1685,11 +1685,11 @@ arena_i_dss_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, dss_prec_old = arena_dss_prec_get(tsd_tsdn(tsd), arena); } else { if (dss_prec != dss_prec_limit && - chunk_dss_prec_set(tsd_tsdn(tsd), dss_prec)) { + chunk_dss_prec_set(dss_prec)) { ret = EFAULT; goto label_return; } - dss_prec_old = chunk_dss_prec_get(tsd_tsdn(tsd)); + dss_prec_old = chunk_dss_prec_get(); } dss = dss_prec_names[dss_prec_old]; diff --git a/src/huge.c b/src/huge.c index 3a2877ca..19ca3f03 100644 --- a/src/huge.c +++ b/src/huge.c @@ -114,7 +114,7 @@ huge_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, #define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) #endif static void -huge_dalloc_junk(tsdn_t *tsdn, void *ptr, size_t usize) +huge_dalloc_junk(void *ptr, size_t usize) { if (config_fill && have_dss && unlikely(opt_junk_free)) { @@ -122,7 +122,7 @@ huge_dalloc_junk(tsdn_t *tsdn, void *ptr, size_t usize) * Only bother junk filling if the chunk isn't about to be * unmapped. 
*/ - if (!config_munmap || (have_dss && chunk_in_dss(tsdn, ptr))) + if (!config_munmap || (have_dss && chunk_in_dss(ptr))) memset(ptr, JEMALLOC_FREE_JUNK, usize); } } @@ -221,7 +221,7 @@ huge_ralloc_no_move_shrink(tsdn_t *tsdn, void *ptr, size_t oldsize, if (oldsize > usize) { size_t sdiff = oldsize - usize; if (config_fill && unlikely(opt_junk_free)) { - huge_dalloc_junk(tsdn, (void *)((uintptr_t)ptr + usize), + huge_dalloc_junk((void *)((uintptr_t)ptr + usize), sdiff); post_zeroed = false; } else { @@ -402,7 +402,7 @@ huge_dalloc(tsdn_t *tsdn, void *ptr) ql_remove(&arena->huge, node, ql_link); malloc_mutex_unlock(tsdn, &arena->huge_mtx); - huge_dalloc_junk(tsdn, extent_node_addr_get(node), + huge_dalloc_junk(extent_node_addr_get(node), extent_node_size_get(node)); arena_chunk_dalloc_huge(tsdn, extent_node_arena_get(node), extent_node_addr_get(node), extent_node_size_get(node)); diff --git a/src/jemalloc.c b/src/jemalloc.c index aec2a5eb..b0ebf810 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1115,8 +1115,7 @@ malloc_conf_init(void) for (i = 0; i < dss_prec_limit; i++) { if (strncmp(dss_prec_names[i], v, vlen) == 0) { - if (chunk_dss_prec_set(NULL, - i)) { + if (chunk_dss_prec_set(i)) { malloc_conf_error( "Error setting dss", k, klen, v, vlen); @@ -2783,7 +2782,6 @@ _malloc_prefork(void) } } base_prefork(tsd_tsdn(tsd)); - chunk_prefork(tsd_tsdn(tsd)); for (i = 0; i < narenas; i++) { if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) arena_prefork3(tsd_tsdn(tsd), arena); @@ -2812,7 +2810,6 @@ _malloc_postfork(void) witness_postfork_parent(tsd); /* Release all mutexes, now that fork() has completed. */ - chunk_postfork_parent(tsd_tsdn(tsd)); base_postfork_parent(tsd_tsdn(tsd)); for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena; @@ -2837,7 +2834,6 @@ jemalloc_postfork_child(void) witness_postfork_child(tsd); /* Release all mutexes, now that fork() has completed. */ - chunk_postfork_child(tsd_tsdn(tsd)); base_postfork_child(tsd_tsdn(tsd)); for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena; diff --git a/test/unit/junk.c b/test/unit/junk.c index acddc601..460bd524 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -53,10 +53,10 @@ arena_dalloc_junk_large_intercept(void *ptr, size_t usize) } static void -huge_dalloc_junk_intercept(tsdn_t *tsdn, void *ptr, size_t usize) +huge_dalloc_junk_intercept(void *ptr, size_t usize) { - huge_dalloc_junk_orig(tsdn, ptr, usize); + huge_dalloc_junk_orig(ptr, usize); /* * The conditions under which junk filling actually occurs are nuanced * enough that it doesn't make sense to duplicate the decision logic in
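
Reviewer note, not part of the patch: the heart of this change is the lock-free protocol chunk_alloc_dss() now uses to grow the DSS, namely publish a new dss_max optimistically with a CAS, call sbrk(), and roll the CAS back if sbrk() fails or the break moved underneath us. The sketch below is a minimal standalone illustration of that protocol under stated assumptions, not jemalloc code: the names dss_extend() and dss_max_update() are invented for this example, C11 <stdatomic.h> and raw sbrk() stand in for jemalloc's atomic_cas_p()/atomic_read_p() and chunk_dss_sbrk(), and the adaptive spin, gap/alignment padding, commit/zero handling, and chunk-hook plumbing are all omitted.

#define _DEFAULT_SOURCE	/* for sbrk() on glibc */

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Claimed end of the DSS; may briefly run ahead of the real break. */
static _Atomic(void *) dss_max;
/* Set once sbrk() reports that the DSS cannot grow any further. */
static atomic_bool dss_exhausted;

/*
 * Refresh dss_max from sbrk(0), in the spirit of chunk_dss_max_update().  If
 * dss_max is ahead of the real break, another thread has an extension in
 * flight; retry (the patch uses spin_adaptive() here rather than a bare busy
 * loop).
 */
static void *
dss_max_update(void)
{
	while (true) {
		void *max_prev = atomic_load(&dss_max);
		void *max_cur = sbrk(0);

		if ((uintptr_t)max_prev > (uintptr_t)max_cur)
			continue;
		if (atomic_compare_exchange_strong(&dss_max, &max_prev,
		    max_cur))
			return max_cur;
	}
}

/* Claim size bytes at the end of the DSS; return their base or NULL. */
static void *
dss_extend(size_t size)
{
	if (atomic_load(&dss_exhausted))
		return NULL;

	while (true) {
		void *max_cur = dss_max_update();
		void *next = (void *)((uintptr_t)max_cur + size);

		if ((uintptr_t)next < (uintptr_t)max_cur)
			return NULL;		/* Address wrap-around. */

		/*
		 * Optimistically publish the new end of the DSS.  Any thread
		 * that now sees dss_max ahead of sbrk(0) waits in
		 * dss_max_update() until this extension resolves.
		 */
		if (!atomic_compare_exchange_strong(&dss_max, &max_cur, next))
			continue;		/* Lost a race; start over. */

		void *prev = sbrk((intptr_t)size);
		if (prev == max_cur)
			return max_cur;		/* The claim stuck. */

		/*
		 * Failure: either OOM or a raw sbrk() caller moved the break.
		 * Roll back the optimistic update; if this CAS fails, a later
		 * extension already succeeded and no rollback is needed.
		 */
		atomic_compare_exchange_strong(&dss_max, &next, max_cur);
		if (prev == (void *)-1) {
			atomic_store(&dss_exhausted, true);
			return NULL;		/* The DSS cannot grow. */
		}
		/*
		 * Outside interference; retry from the new break.  Any bytes
		 * the failed sbrk() did obtain are abandoned in this sketch.
		 */
	}
}

int
main(void)
{
	void *p = dss_extend(1U << 12);

	if (p != NULL)
		printf("claimed 4096 bytes of DSS at %p\n", p);
	else
		printf("DSS unavailable or exhausted\n");
	return 0;
}

Design consequence visible throughout the patch: because all DSS state (dss_max, dss_exhausted, dss_prec_default) is now touched only through atomics, there is no dss_mtx left to seize around fork(), which is why chunk_prefork()/chunk_postfork_*() and the chunk_dss_*fork() hooks are deleted outright rather than updated, and why chunk_dss_boot() can no longer fail.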