diff --git a/.cirrus.yml b/.cirrus.yml
new file mode 100644
index 00000000..019d2c38
--- /dev/null
+++ b/.cirrus.yml
@@ -0,0 +1,21 @@
+env:
+  CIRRUS_CLONE_DEPTH: 1  # presumably requests a depth-1 (shallow) clone; confirm in Cirrus docs
+  ARCH: amd64
+
+task:
+  freebsd_instance:
+    matrix:  # sequence form: one entry per image, no duplicate mapping keys
+      - image: freebsd-12-0-release-amd64
+      - image: freebsd-11-2-release-amd64
+  install_script:
+    - sed -i.bak -e 's,pkg+http://pkg.FreeBSD.org/\${ABI}/quarterly,pkg+http://pkg.FreeBSD.org/\${ABI}/latest,' /etc/pkg/FreeBSD.conf
+    - pkg upgrade -y  # runs after the sed above repoints pkg from "quarterly" to "latest"
+    - pkg install -y autoconf gmake
+  script:
+    - autoconf
+    #- ./configure ${COMPILER_FLAGS:+       CC="$CC $COMPILER_FLAGS"       CXX="$CXX $COMPILER_FLAGS" }       $CONFIGURE_FLAGS
+    - ./configure
+    - export JFLAG=`sysctl -n kern.smp.cpus`  # gmake -j value read from sysctl
+    - gmake -j${JFLAG}
+    - gmake -j${JFLAG} tests
+    - gmake check
diff --git a/.gitignore b/.gitignore
index 19199ccb..5ca0ad1d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,7 +30,6 @@
 /include/jemalloc/internal/public_namespace.h
 /include/jemalloc/internal/public_symbols.txt
 /include/jemalloc/internal/public_unnamespace.h
-/include/jemalloc/internal/size_classes.h
 /include/jemalloc/jemalloc.h
 /include/jemalloc/jemalloc_defs.h
 /include/jemalloc/jemalloc_macros.h
diff --git a/.travis.yml b/.travis.yml
index 4cc116e5..40b2eb5f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,7 @@ matrix:
       env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
+      addons: &gcc_multilib
         apt:
           packages:
             - gcc-multilib
@@ -21,6 +21,8 @@ matrix:
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
@@ -37,20 +39,21 @@ matrix:
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: osx
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: osx
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: osx
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=clang CXX=clang++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
@@ -61,50 +64,34 @@ matrix:
       env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
-      addons:
-        apt:
-          packages:
-            - gcc-multilib
+      addons: *gcc_multilib
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
@@ -115,6 +102,8 @@ matrix:
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
@@ -123,6 +112,8 @@ matrix:
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
@@ -131,6 +122,14 @@ matrix:
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
@@ -143,10 +142,25 @@ matrix:
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
     - os: linux
       env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    # Development build
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    # --enable-expermental-smallocx:
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+
+    # Valgrind
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" JEMALLOC_TEST_PREFIX="valgrind"
+      addons:
+        apt:
+          packages:
+            - valgrind
 
 
 before_script:
   - autoconf
+  - scripts/gen_travis.py > travis_script && diff .travis.yml travis_script
   - ./configure ${COMPILER_FLAGS:+       CC="$CC $COMPILER_FLAGS"       CXX="$CXX $COMPILER_FLAGS" }       $CONFIGURE_FLAGS
   - make -j3
   - make -j3 tests
diff --git a/COPYING b/COPYING
index 98458d97..3b7fd358 100644
--- a/COPYING
+++ b/COPYING
@@ -1,10 +1,10 @@
 Unless otherwise specified, files in the jemalloc source distribution are
 subject to the following license:
 --------------------------------------------------------------------------------
-Copyright (C) 2002-2018 Jason Evans <jasone@canonware.com>.
+Copyright (C) 2002-present Jason Evans <jasone@canonware.com>.
 All rights reserved.
 Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
-Copyright (C) 2009-2018 Facebook, Inc.  All rights reserved.
+Copyright (C) 2009-present Facebook, Inc.  All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/ChangeLog b/ChangeLog
index 29a00fb7..7c73a8f2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,7 +4,110 @@ brevity.  Much more detail can be found in the git revision history:
 
     https://github.com/jemalloc/jemalloc
 
-* 5.1.0 (May 4th, 2018)
+* 5.2.0 (April 2, 2019)
+
+  This release includes a few notable improvements, which are summarized below:
+  1) improved fast-path performance from the optimizations by @djwatson; 2)
+  reduced virtual memory fragmentation and metadata usage; and 3) bug fixes on
+  setting the number of background threads.  In addition, peak / spike memory
+  usage is improved with certain allocation patterns.  As usual, the release and
+  prior dev versions have gone through large-scale production testing.
+
+  New features:
+  - Implement oversize_threshold, which uses a dedicated arena for allocations
+    crossing the specified threshold to reduce fragmentation.  (@interwq)
+  - Add extents usage information to stats.  (@tyleretzel)
+  - Log time information for sampled allocations.  (@tyleretzel)
+  - Support 0 size in sdallocx.  (@djwatson)
+  - Output rate for certain counters in malloc_stats.  (@zinoale)
+  - Add configure option --enable-readlinkat, which allows the use of readlinkat
+    over readlink.  (@davidtgoldblatt)
+  - Add configure options --{enable,disable}-{static,shared} to allow not
+    building unwanted libraries.  (@Ericson2314)
+  - Add configure option --disable-libdl to enable fully static builds.
+    (@interwq)
+  - Add mallctl interfaces:
+	+ opt.oversize_threshold (@interwq)
+	+ stats.arenas.<i>.extent_avail (@tyleretzel)
+	+ stats.arenas.<i>.extents.<j>.n{dirty,muzzy,retained} (@tyleretzel)
+	+ stats.arenas.<i>.extents.<j>.{dirty,muzzy,retained}_bytes
+	  (@tyleretzel)
+
+  Portability improvements:
+  - Update MSVC builds.  (@maksqwe, @rustyx)
+  - Work around a compiler optimizer bug on s390x.  (@rkmisra)
+  - Make use of pthread_set_name_np(3) on FreeBSD.  (@trasz)
+  - Implement malloc_getcpu() to enable percpu_arena for windows.  (@santagada)
+  - Link against -pthread instead of -lpthread.  (@paravoid)
+  - Make background_thread not dependent on libdl.  (@interwq)
+  - Add stringify to fix a linker directive issue on MSVC.  (@daverigby)
+  - Detect and fall back when 8-bit atomics are unavailable.  (@interwq)
+  - Fall back to the default pthread_create if dlsym(3) fails.  (@interwq)
+
+  Optimizations and refactors:
+  - Refactor the TSD module.  (@davidtgoldblatt)
+  - Avoid taking extents_muzzy mutex when muzzy is disabled.  (@interwq)
+  - Avoid taking large_mtx for auto arenas on the tcache flush path.  (@interwq)
+  - Optimize ixalloc by avoiding a size lookup.  (@interwq)
+  - Implement opt.oversize_threshold, which uses a dedicated arena for requests
+    crossing the threshold and eagerly purges the oversize extents.  The
+    threshold defaults to 8 MiB.  (@interwq)
+  - Clean compilation with -Wextra.  (@gnzlbg, @jasone)
+  - Refactor the size class module.  (@davidtgoldblatt)
+  - Refactor the stats emitter.  (@tyleretzel)
+  - Optimize pow2_ceil.  (@rkmisra)
+  - Avoid runtime detection of lazy purging on FreeBSD.  (@trasz)
+  - Optimize mmap(2) alignment handling on FreeBSD.  (@trasz)
+  - Improve error handling for THP state initialization.  (@jsteemann)
+  - Rework the malloc() fast path.  (@djwatson)
+  - Rework the free() fast path.  (@djwatson)
+  - Refactor and optimize the tcache fill / flush paths.  (@djwatson)
+  - Optimize sync / lwsync on PowerPC.  (@chmeeedalf)
+  - Bypass extent_dalloc() when retain is enabled.  (@interwq)
+  - Optimize the locking on large deallocation.  (@interwq)
+  - Reduce the number of pages committed from sanity checking in debug build.
+    (@trasz, @interwq)
+  - Deprecate OSSpinLock.  (@interwq)
+  - Lower the default number of background threads to 4 (when the feature
+    is enabled).  (@interwq)
+  - Optimize the trylock spin wait.  (@djwatson)
+  - Use arena index for arena-matching checks.  (@interwq)
+  - Avoid forced decay on thread termination when using background threads.
+    (@interwq)
+  - Disable muzzy decay by default.  (@djwatson, @interwq)
+  - Only initialize libgcc unwinder when profiling is enabled.  (@paravoid,
+    @interwq)
+
+  Bug fixes (all only relevant to jemalloc 5.x):
+  - Fix background thread index issues with max_background_threads.  (@djwatson,
+    @interwq)
+  - Fix stats output for opt.lg_extent_max_active_fit.  (@interwq)
+  - Fix opt.prof_prefix initialization.  (@davidtgoldblatt)
+  - Properly trigger decay on tcache destroy.  (@interwq, @amosbird)
+  - Fix tcache.flush.  (@interwq)
+  - Detect whether explicit extent zero out is necessary with huge pages or
+    custom extent hooks, which may change the purge semantics.  (@interwq)
+  - Fix a side effect caused by extent_max_active_fit combined with decay-based
+    purging, where freed extents can accumulate and not be reused for an
+    extended period of time.  (@interwq, @mpghf)
+  - Fix a missing unlock on extent register error handling.  (@zoulasc)
+
+  Testing:
+  - Simplify the Travis script output.  (@gnzlbg)
+  - Update the test scripts for FreeBSD.  (@devnexen)
+  - Add unit tests for the producer-consumer pattern.  (@interwq)
+  - Add Cirrus-CI config for FreeBSD builds.  (@jasone)
+  - Add size-matching sanity checks on tcache flush.  (@davidtgoldblatt,
+    @interwq)
+
+  Incompatible changes:
+  - Remove --with-lg-page-sizes.  (@davidtgoldblatt)
+
+  Documentation:
+  - Attempt to build docs by default, but skip doc building when xsltproc is
+    missing.  (@interwq, @cmuellner)
+
+* 5.1.0 (May 4, 2018)
 
   This release is primarily about fine-tuning, ranging from several new features
   to numerous notable performance and portability enhancements.  The release and
diff --git a/INSTALL.md b/INSTALL.md
index ef328c60..b8f729b0 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -221,13 +221,6 @@ any of the following arguments (not a definitive list) to 'configure':
     system page size may change between configuration and execution, e.g. when
     cross compiling.
 
-* `--with-lg-page-sizes=<lg-page-sizes>`
-
-    Specify the comma-separated base 2 logs of the page sizes to support.  This
-    option may be useful when cross compiling in combination with
-    `--with-lg-page`, but its primary use case is for integration with FreeBSD's
-    libc, wherein jemalloc is embedded.
-
 * `--with-lg-hugepage=<lg-hugepage>`
 
     Specify the base 2 log of the system huge page size.  This option is useful
@@ -276,6 +269,11 @@ any of the following arguments (not a definitive list) to 'configure':
     in the same process, which will almost certainly result in confusing runtime
     crashes if pointers leak from one implementation to the other.
 
+* `--disable-libdl`
+
+    Disable the use of libdl, namely dlsym(3), which is required by the lazy
+    lock option.  This can allow building static binaries.
+
 The following environment variables (not a definitive list) impact configure's
 behavior:
 
diff --git a/Makefile.in b/Makefile.in
index 9b9347ff..0777f6a8 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -47,6 +47,7 @@ REV := @rev@
 install_suffix := @install_suffix@
 ABI := @abi@
 XSLTPROC := @XSLTPROC@
+XSLROOT := @XSLROOT@
 AUTOCONF := @AUTOCONF@
 _RPATH = @RPATH@
 RPATH = $(if $(1),$(call _RPATH,$(1)))
@@ -55,8 +56,11 @@ cfghdrs_out := @cfghdrs_out@
 cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@)
 cfgoutputs_out := @cfgoutputs_out@
 enable_autogen := @enable_autogen@
+enable_shared := @enable_shared@
+enable_static := @enable_static@
 enable_prof := @enable_prof@
 enable_zone_allocator := @enable_zone_allocator@
+enable_experimental_smallocx := @enable_experimental_smallocx@
 MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF
 link_whole_archive := @link_whole_archive@
 DSO_LDFLAGS = @DSO_LDFLAGS@
@@ -102,7 +106,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/extent_dss.c \
 	$(srcroot)src/extent_mmap.c \
 	$(srcroot)src/hash.c \
-	$(srcroot)src/hooks.c \
+	$(srcroot)src/hook.c \
 	$(srcroot)src/large.c \
 	$(srcroot)src/log.c \
 	$(srcroot)src/malloc_io.c \
@@ -114,8 +118,10 @@ C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/prof.c \
 	$(srcroot)src/rtree.c \
 	$(srcroot)src/stats.c \
+	$(srcroot)src/sc.c \
 	$(srcroot)src/sz.c \
 	$(srcroot)src/tcache.c \
+	$(srcroot)src/test_hooks.c \
 	$(srcroot)src/ticker.c \
 	$(srcroot)src/tsd.c \
 	$(srcroot)src/witness.c
@@ -165,6 +171,8 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/background_thread_enable.c \
 	$(srcroot)test/unit/base.c \
 	$(srcroot)test/unit/bitmap.c \
+	$(srcroot)test/unit/bit_util.c \
+	$(srcroot)test/unit/binshard.c \
 	$(srcroot)test/unit/ckh.c \
 	$(srcroot)test/unit/decay.c \
 	$(srcroot)test/unit/div.c \
@@ -172,7 +180,8 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/extent_quantize.c \
 	$(srcroot)test/unit/fork.c \
 	$(srcroot)test/unit/hash.c \
-	$(srcroot)test/unit/hooks.c \
+	$(srcroot)test/unit/hook.c \
+	$(srcroot)test/unit/huge.c \
 	$(srcroot)test/unit/junk.c \
 	$(srcroot)test/unit/junk_alloc.c \
 	$(srcroot)test/unit/junk_free.c \
@@ -190,6 +199,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/prof_active.c \
 	$(srcroot)test/unit/prof_gdump.c \
 	$(srcroot)test/unit/prof_idump.c \
+	$(srcroot)test/unit/prof_log.c \
 	$(srcroot)test/unit/prof_reset.c \
 	$(srcroot)test/unit/prof_tctx.c \
 	$(srcroot)test/unit/prof_thread_name.c \
@@ -198,13 +208,16 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/rb.c \
 	$(srcroot)test/unit/retained.c \
 	$(srcroot)test/unit/rtree.c \
+	$(srcroot)test/unit/seq.c \
 	$(srcroot)test/unit/SFMT.c \
+	$(srcroot)test/unit/sc.c \
 	$(srcroot)test/unit/size_classes.c \
 	$(srcroot)test/unit/slab.c \
 	$(srcroot)test/unit/smoothstep.c \
 	$(srcroot)test/unit/spin.c \
 	$(srcroot)test/unit/stats.c \
 	$(srcroot)test/unit/stats_print.c \
+	$(srcroot)test/unit/test_hooks.c \
 	$(srcroot)test/unit/ticker.c \
 	$(srcroot)test/unit/nstime.c \
 	$(srcroot)test/unit/tsd.c \
@@ -217,15 +230,21 @@ endif
 TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
 	$(srcroot)test/integration/allocated.c \
 	$(srcroot)test/integration/extent.c \
+	$(srcroot)test/integration/malloc.c \
 	$(srcroot)test/integration/mallocx.c \
 	$(srcroot)test/integration/MALLOCX_ARENA.c \
 	$(srcroot)test/integration/overflow.c \
 	$(srcroot)test/integration/posix_memalign.c \
 	$(srcroot)test/integration/rallocx.c \
 	$(srcroot)test/integration/sdallocx.c \
+	$(srcroot)test/integration/slab_sizes.c \
 	$(srcroot)test/integration/thread_arena.c \
 	$(srcroot)test/integration/thread_tcache_enabled.c \
 	$(srcroot)test/integration/xallocx.c
+ifeq (@enable_experimental_smallocx@, 1)
+TESTS_INTEGRATION += \
+  $(srcroot)test/integration/smallocx.c
+endif
 ifeq (@enable_cxx@, 1)
 CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp
 TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp
@@ -233,7 +252,9 @@ else
 CPP_SRCS :=
 TESTS_INTEGRATION_CPP :=
 endif
-TESTS_STRESS := $(srcroot)test/stress/microbench.c
+TESTS_STRESS := $(srcroot)test/stress/microbench.c \
+	$(srcroot)test/stress/hookbench.c
+
 
 TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) $(TESTS_STRESS)
 
@@ -274,10 +295,24 @@ all: build_lib
 dist: build_doc
 
 $(objroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl
+ifneq ($(XSLROOT),)
 	$(XSLTPROC) -o $@ $(objroot)doc/html.xsl $<
+else
+ifeq ($(wildcard $(DOCS_HTML)),)
+	@echo "<p>Missing xsltproc.  Doc not built.</p>" > $@
+endif
+	@echo "Missing xsltproc.  "$@" not (re)built."
+endif
 
 $(objroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl
+ifneq ($(XSLROOT),)
 	$(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $<
+else
+ifeq ($(wildcard $(DOCS_MAN3)),)
+	@echo "Missing xsltproc.  Doc not built." > $@
+endif
+	@echo "Missing xsltproc.  "$@" not (re)built."
+endif
 
 build_doc_html: $(DOCS_HTML)
 build_doc_man: $(DOCS_MAN3)
@@ -400,7 +435,7 @@ $(objroot)test/unit/%$(EXE): $(objroot)test/unit/%.$(O) $(C_JET_OBJS) $(C_TESTLI
 
 $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)
-	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -lpthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
+	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
 
 $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)
@@ -412,7 +447,12 @@ $(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TE
 
 build_lib_shared: $(DSOS)
 build_lib_static: $(STATIC_LIBS)
-build_lib: build_lib_shared build_lib_static
+ifeq ($(enable_shared), 1)
+build_lib: build_lib_shared
+endif
+ifeq ($(enable_static), 1)
+build_lib: build_lib_static
+endif
 
 install_bin:
 	$(INSTALL) -d $(BINDIR)
@@ -449,7 +489,13 @@ install_lib_pc: $(PC)
 	$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig; \
 done
 
-install_lib: install_lib_shared install_lib_static install_lib_pc
+ifeq ($(enable_shared), 1)
+install_lib: install_lib_shared
+endif
+ifeq ($(enable_static), 1)
+install_lib: install_lib_static
+endif
+install_lib: install_lib_pc
 
 install_doc_html:
 	$(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix)
@@ -465,7 +511,7 @@ install_doc_man:
 	$(INSTALL) -m 644 $$d $(MANDIR)/man3; \
 done
 
-install_doc: install_doc_html install_doc_man
+install_doc: build_doc install_doc_html install_doc_man
 
 install: install_bin install_include install_lib install_doc
 
diff --git a/configure.ac b/configure.ac
index a6a08db0..96f76d35 100644
--- a/configure.ac
+++ b/configure.ac
@@ -175,6 +175,9 @@ fi
 ],
   XSLROOT="${DEFAULT_XSLROOT}"
 )
+if test "x$XSLTPROC" = "xfalse" ; then
+  XSLROOT=""
+fi
 AC_SUBST([XSLROOT])
 
 dnl If CFLAGS isn't defined, set CFLAGS to something reasonable.  Otherwise,
@@ -242,6 +245,7 @@ if test "x$GCC" = "xyes" ; then
     fi
   fi
   JE_CFLAGS_ADD([-Wall])
+  JE_CFLAGS_ADD([-Wextra])
   JE_CFLAGS_ADD([-Wshorten-64-to-32])
   JE_CFLAGS_ADD([-Wsign-compare])
   JE_CFLAGS_ADD([-Wundef])
@@ -289,6 +293,7 @@ if test "x$enable_cxx" = "x1" ; then
   AX_CXX_COMPILE_STDCXX([14], [noext], [optional])
   if test "x${HAVE_CXX14}" = "x1" ; then
     JE_CXXFLAGS_ADD([-Wall])
+    JE_CXXFLAGS_ADD([-Wextra])
     JE_CXXFLAGS_ADD([-g3])
 
     SAVED_LIBS="${LIBS}"
@@ -536,6 +541,66 @@ AC_PROG_NM
 
 AC_PROG_AWK
 
+dnl ============================================================================
+dnl jemalloc version.
+dnl Sources, in order: --with-version, git describe, an existing VERSION file.
+
+AC_ARG_WITH([version],
+  [AS_HELP_STRING([--with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid>],
+   [Version string])],
+  [
+    echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null
+    if test $? -eq 0 ; then
+      echo "$with_version" > "${objroot}VERSION"
+    else
+      echo "${with_version}" | grep ['^VERSION$'] 2>&1 1>/dev/null
+      if test $? -ne 0 ; then
+        AC_MSG_ERROR([${with_version} does not match <major>.<minor>.<bugfix>-<nrev>-g<gid> or VERSION])
+      fi
+    fi
+  ], [
+    dnl Set VERSION if source directory is inside a git repository.
+    if test "x`test -n \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then
+      dnl Pattern globs aren't powerful enough to match both single- and
+      dnl double-digit version numbers, so iterate over patterns to support up
+      dnl to version 99.99.99 without any accidental matches.
+      for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \
+                     '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \
+                     '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \
+                     '[0-9][0-9].[0-9][0-9].[0-9]' \
+                     '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do
+        (test -n "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null
+        if test $? -eq 0 ; then
+          mv "${objroot}VERSION.tmp" "${objroot}VERSION"
+          break
+        fi
+      done
+    fi
+    rm -f "${objroot}VERSION.tmp"
+  ])
+
+if test ! -e "${objroot}VERSION" ; then
+  if test ! -e "${srcroot}VERSION" ; then
+    AC_MSG_RESULT(
+      [Missing VERSION file, and unable to generate it; creating bogus VERSION])
+    echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${objroot}VERSION"
+  else
+    cp "${srcroot}VERSION" "${objroot}VERSION"
+  fi
+fi
+jemalloc_version=`cat "${objroot}VERSION"`
+jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
+jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'`
+jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'`
+jemalloc_version_nrev=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]4}'`
+jemalloc_version_gid=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]5}'`
+AC_SUBST([jemalloc_version])
+AC_SUBST([jemalloc_version_major])
+AC_SUBST([jemalloc_version_minor])
+AC_SUBST([jemalloc_version_bugfix])
+AC_SUBST([jemalloc_version_nrev])
+AC_SUBST([jemalloc_version_gid])
+
 dnl Platform-specific settings.  abi and RPATH can probably be determined
 dnl programmatically, but doing so is error-prone, which makes it generally
 dnl not worth the trouble.
@@ -816,6 +881,36 @@ AC_PROG_RANLIB
 AC_PATH_PROG([LD], [ld], [false], [$PATH])
 AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH])
 
+dnl Shared libs are built unless --disable-shared (--enable-shared=no) is given.
+AC_ARG_ENABLE([shared],
+  [AS_HELP_STRING([--enable-shared], [Build shared libraries])],
+[if test "x$enable_shared" = "xno" ; then
+  enable_shared="0"
+else
+  enable_shared="1"
+fi
+],
+[enable_shared="1"]
+)
+AC_SUBST([enable_shared])
+
+dnl Static libs are built unless --disable-static (--enable-static=no) is given.
+AC_ARG_ENABLE([static],
+  [AS_HELP_STRING([--enable-static], [Build static libraries])],
+[if test "x$enable_static" = "xno" ; then
+  enable_static="0"
+else
+  enable_static="1"
+fi
+],
+[enable_static="1"]
+)
+AC_SUBST([enable_static])
+
+if test "$enable_shared$enable_static" = "00" ; then
+  AC_MSG_ERROR([Please enable one of shared or static builds])
+fi
+
 dnl Perform no name mangling by default.
 AC_ARG_WITH([mangling],
   [AS_HELP_STRING([--with-mangling=<map>], [Mangle symbols in <map>])],
@@ -848,7 +943,7 @@ AC_ARG_WITH([export],
 fi]
 )
 
-public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_message malloc_stats_print malloc_usable_size mallocx nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
+public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
 dnl Check for additional platform-specific public API functions.
 AC_CHECK_FUNC([memalign],
 	      [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ])
@@ -966,7 +1061,6 @@ cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.sh"
 cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh"
 cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh"
 cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/size_classes.sh"
 cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh"
 cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh"
 cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh"
@@ -979,7 +1073,6 @@ cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_symbols_jet.awk"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_symbols.txt"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_namespace.h"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_unnamespace.h"
-cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/size_classes.h"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_protos_jet.h"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_rename.h"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle.h"
@@ -991,6 +1084,10 @@ cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.i
 cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in"
 cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in"
 
+dnl ============================================================================
+dnl jemalloc build options.
+dnl
+
 dnl Do not compile with debugging by default.
 AC_ARG_ENABLE([debug],
   [AS_HELP_STRING([--enable-debug],
@@ -1043,6 +1140,22 @@ if test "x$enable_stats" = "x1" ; then
 fi
 AC_SUBST([enable_stats])
 
+dnl Do not enable smallocx by default.
+AC_ARG_ENABLE([experimental_smallocx],
+  [AS_HELP_STRING([--enable-experimental-smallocx], [Enable experimental smallocx API])],
+[if test "x$enable_experimental_smallocx" = "xno" ; then
+enable_experimental_smallocx="0"
+else
+enable_experimental_smallocx="1"
+fi
+],
+[enable_experimental_smallocx="0"]
+)
+if test "x$enable_experimental_smallocx" = "x1" ; then
+  AC_DEFINE([JEMALLOC_EXPERIMENTAL_SMALLOCX_API])
+fi
+AC_SUBST([enable_experimental_smallocx])
+
 dnl Do not enable profiling by default.
 AC_ARG_ENABLE([prof],
   [AS_HELP_STRING([--enable-prof], [Enable allocation profiling])],
@@ -1277,6 +1390,38 @@ if test "x$enable_log" = "x1" ; then
 fi
 AC_SUBST([enable_log])
 
+dnl Do not use readlinkat by default
+AC_ARG_ENABLE([readlinkat],
+  [AS_HELP_STRING([--enable-readlinkat], [Use readlinkat over readlink])],
+[if test "x$enable_readlinkat" = "xno" ; then
+  enable_readlinkat="0"
+else
+  enable_readlinkat="1"
+fi
+],
+[enable_readlinkat="0"]
+)
+if test "x$enable_readlinkat" = "x1" ; then
+  AC_DEFINE([JEMALLOC_READLINKAT], [ ])
+fi
+AC_SUBST([enable_readlinkat])
+
+dnl Extra size checking is disabled by default; opt in with --enable-extra-size-check.
+AC_ARG_ENABLE([extra-size-check],
+  [AS_HELP_STRING([--enable-extra-size-check],
+  [Perform additional size related sanity checks])],
+[if test "x$enable_extra_size_check" = "xno" ; then
+  enable_extra_size_check="0"
+else
+  enable_extra_size_check="1"
+fi
+],
+[enable_extra_size_check="0"]
+)
+if test "x$enable_extra_size_check" = "x1" ; then
+  AC_DEFINE([JEMALLOC_EXTRA_SIZE_CHECK], [ ])
+fi
+AC_SUBST([enable_extra_size_check])
 
 JE_COMPILABLE([a program using __builtin_unreachable], [
 void foo (void) {
@@ -1333,6 +1478,21 @@ else
   fi
 fi
 
+JE_COMPILABLE([a program using __builtin_popcountl], [
+#include <stdio.h>
+#include <strings.h>
+#include <string.h>
+], [
+	{
+		int rv = __builtin_popcountl(0x08);
+		printf("%d\n", rv);
+	}
+], [je_cv_gcc_builtin_popcountl])
+if test "x${je_cv_gcc_builtin_popcountl}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNT], [__builtin_popcount])
+  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTL], [__builtin_popcountl])
+fi
+
 AC_ARG_WITH([lg_quantum],
   [AS_HELP_STRING([--with-lg-quantum=<lg-quantum>],
    [Base 2 log of minimum allocation alignment])],
@@ -1430,70 +1590,20 @@ if test "x${LG_PAGE}" != "xundefined" -a \
 fi
 AC_DEFINE_UNQUOTED([LG_HUGEPAGE], [${je_cv_lg_hugepage}])
 
-AC_ARG_WITH([lg_page_sizes],
-  [AS_HELP_STRING([--with-lg-page-sizes=<lg-page-sizes>],
-   [Base 2 logs of system page sizes to support])],
-  [LG_PAGE_SIZES="$with_lg_page_sizes"], [LG_PAGE_SIZES="$LG_PAGE"])
-
 dnl ============================================================================
-dnl jemalloc configuration.
-dnl
-
-AC_ARG_WITH([version],
-  [AS_HELP_STRING([--with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid>],
-   [Version string])],
-  [
-    echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null
-    if test $? -eq 0 ; then
-      echo "$with_version" > "${objroot}VERSION"
-    else
-      echo "${with_version}" | grep ['^VERSION$'] 2>&1 1>/dev/null
-      if test $? -ne 0 ; then
-        AC_MSG_ERROR([${with_version} does not match <major>.<minor>.<bugfix>-<nrev>-g<gid> or VERSION])
-      fi
-    fi
-  ], [
-    dnl Set VERSION if source directory is inside a git repository.
-    if test "x`test ! \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then
-      dnl Pattern globs aren't powerful enough to match both single- and
-      dnl double-digit version numbers, so iterate over patterns to support up
-      dnl to version 99.99.99 without any accidental matches.
-      for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \
-                     '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \
-                     '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \
-                     '[0-9][0-9].[0-9][0-9].[0-9]' \
-                     '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do
-        (test ! "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null
-        if test $? -eq 0 ; then
-          mv "${objroot}VERSION.tmp" "${objroot}VERSION"
-          break
-        fi
-      done
-    fi
-    rm -f "${objroot}VERSION.tmp"
-  ])
-
-if test ! -e "${objroot}VERSION" ; then
-  if test ! -e "${srcroot}VERSION" ; then
-    AC_MSG_RESULT(
-      [Missing VERSION file, and unable to generate it; creating bogus VERSION])
-    echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${objroot}VERSION"
-  else
-    cp ${srcroot}VERSION ${objroot}VERSION
-  fi
+dnl Enable libdl by default.
+AC_ARG_ENABLE([libdl],
+  [AS_HELP_STRING([--disable-libdl],
+  [Do not use libdl])],
+[if test "x$enable_libdl" = "xno" ; then
+  enable_libdl="0"
+else
+  enable_libdl="1"
 fi
-jemalloc_version=`cat "${objroot}VERSION"`
-jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
-jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'`
-jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'`
-jemalloc_version_nrev=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]4}'`
-jemalloc_version_gid=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]5}'`
-AC_SUBST([jemalloc_version])
-AC_SUBST([jemalloc_version_major])
-AC_SUBST([jemalloc_version_minor])
-AC_SUBST([jemalloc_version_bugfix])
-AC_SUBST([jemalloc_version_nrev])
-AC_SUBST([jemalloc_version_gid])
+],
+[enable_libdl="1"]
+)
+AC_SUBST([enable_libdl])
 
 dnl ============================================================================
 dnl Configure pthreads.
@@ -1503,20 +1613,26 @@ if test "x$abi" != "xpecoff" ; then
   AC_CHECK_HEADERS([pthread.h], , [AC_MSG_ERROR([pthread.h is missing])])
   dnl Some systems may embed pthreads functionality in libc; check for libpthread
   dnl first, but try libc too before failing.
-  AC_CHECK_LIB([pthread], [pthread_create], [JE_APPEND_VS(LIBS, -lpthread)],
+  AC_CHECK_LIB([pthread], [pthread_create], [JE_APPEND_VS(LIBS, -pthread)],
                [AC_SEARCH_LIBS([pthread_create], , ,
                                AC_MSG_ERROR([libpthread is missing]))])
   wrap_syms="${wrap_syms} pthread_create"
   have_pthread="1"
-  dnl Check if we have dlsym support.
-  have_dlsym="1"
-  AC_CHECK_HEADERS([dlfcn.h],
-    AC_CHECK_FUNC([dlsym], [],
-      [AC_CHECK_LIB([dl], [dlsym], [LIBS="$LIBS -ldl"], [have_dlsym="0"])]),
-    [have_dlsym="0"])
-  if test "x$have_dlsym" = "x1" ; then
-    AC_DEFINE([JEMALLOC_HAVE_DLSYM], [ ])
+
+dnl Check if we have dlsym support.
+  if test "x$enable_libdl" = "x1" ; then
+    have_dlsym="1"
+    AC_CHECK_HEADERS([dlfcn.h],
+      AC_CHECK_FUNC([dlsym], [],
+        [AC_CHECK_LIB([dl], [dlsym], [LIBS="$LIBS -ldl"], [have_dlsym="0"])]),
+      [have_dlsym="0"])
+    if test "x$have_dlsym" = "x1" ; then
+      AC_DEFINE([JEMALLOC_HAVE_DLSYM], [ ])
+    fi
+  else
+    have_dlsym="0"
   fi
+
   JE_COMPILABLE([pthread_atfork(3)], [
 #include <pthread.h>
 ], [
@@ -1780,6 +1896,19 @@ JE_COMPILABLE([GCC __atomic atomics], [
 ], [je_cv_gcc_atomic_atomics])
 if test "x${je_cv_gcc_atomic_atomics}" = "xyes" ; then
   AC_DEFINE([JEMALLOC_GCC_ATOMIC_ATOMICS])
+
+  dnl check for 8-bit atomic support
+  JE_COMPILABLE([GCC 8-bit __atomic atomics], [
+  ], [
+      unsigned char x = 0;
+      int val = 1;
+      int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED);
+      int after_add = (int)x;
+      return after_add == 1;
+  ], [je_cv_gcc_u8_atomic_atomics])
+  if test "x${je_cv_gcc_u8_atomic_atomics}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_GCC_U8_ATOMIC_ATOMICS])
+  fi
 fi
 
 dnl ============================================================================
@@ -1794,12 +1923,24 @@ JE_COMPILABLE([GCC __sync atomics], [
 ], [je_cv_gcc_sync_atomics])
 if test "x${je_cv_gcc_sync_atomics}" = "xyes" ; then
   AC_DEFINE([JEMALLOC_GCC_SYNC_ATOMICS])
+
+  dnl check for 8-bit atomic support
+  JE_COMPILABLE([GCC 8-bit __sync atomics], [
+  ], [
+      unsigned char x = 0;
+      int before_add = __sync_fetch_and_add(&x, 1);
+      int after_add = (int)x;
+      return (before_add == 0) && (after_add == 1);
+  ], [je_cv_gcc_u8_sync_atomics])
+  if test "x${je_cv_gcc_u8_sync_atomics}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_GCC_U8_SYNC_ATOMICS])
+  fi
 fi
 
 dnl ============================================================================
 dnl Check for atomic(3) operations as provided on Darwin.
 dnl We need this not for the atomic operations (which are provided above), but
-dnl rather for the OSSpinLock type it exposes.
+dnl rather for the OS_unfair_lock type it exposes.
 
 JE_COMPILABLE([Darwin OSAtomic*()], [
 #include <libkern/OSAtomic.h>
@@ -1870,7 +2011,7 @@ if test "x${je_cv_madvise}" = "xyes" ; then
   if test "x${je_cv_madv_dontdump}" = "xyes" ; then
     AC_DEFINE([JEMALLOC_MADVISE_DONTDUMP], [ ])
   fi
- 
+
   dnl Check for madvise(..., MADV_[NO]HUGEPAGE).
   JE_COMPILABLE([madvise(..., MADV_[[NO]]HUGEPAGE)], [
 #include <sys/mman.h>
@@ -1889,40 +2030,6 @@ case "${host_cpu}" in
 esac
 fi
 
-dnl ============================================================================
-dnl Check whether __sync_{add,sub}_and_fetch() are available despite
-dnl __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n macros being undefined.
-
-AC_DEFUN([JE_SYNC_COMPARE_AND_SWAP_CHECK],[
-  AC_CACHE_CHECK([whether to force $1-bit __sync_{add,sub}_and_fetch()],
-               [je_cv_sync_compare_and_swap_$2],
-               [AC_LINK_IFELSE([AC_LANG_PROGRAM([
-                                                 #include <stdint.h>
-                                                ],
-                                                [
-                                                 #ifndef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_$2
-                                                 {
-                                                    uint$1_t x$1 = 0;
-                                                    __sync_add_and_fetch(&x$1, 42);
-                                                    __sync_sub_and_fetch(&x$1, 1);
-                                                 }
-                                                 #else
-                                                 #error __GCC_HAVE_SYNC_COMPARE_AND_SWAP_$2 is defined, no need to force
-                                                 #endif
-                                                ])],
-                               [je_cv_sync_compare_and_swap_$2=yes],
-                               [je_cv_sync_compare_and_swap_$2=no])])
-
-  if test "x${je_cv_sync_compare_and_swap_$2}" = "xyes" ; then
-    AC_DEFINE([JE_FORCE_SYNC_COMPARE_AND_SWAP_$2], [ ])
-  fi
-])
-
-if test "x${je_cv_atomic9}" != "xyes" -a "x${je_cv_osatomic}" != "xyes" ; then
-  JE_SYNC_COMPARE_AND_SWAP_CHECK(32, 4)
-  JE_SYNC_COMPARE_AND_SWAP_CHECK(64, 8)
-fi
-
 dnl ============================================================================
 dnl Check for __builtin_clz() and __builtin_clzl().
 
@@ -1965,21 +2072,6 @@ if test "x${je_cv_os_unfair_lock}" = "xyes" ; then
   AC_DEFINE([JEMALLOC_OS_UNFAIR_LOCK], [ ])
 fi
 
-dnl ============================================================================
-dnl Check for spinlock(3) operations as provided on Darwin.
-
-JE_COMPILABLE([Darwin OSSpin*()], [
-#include <libkern/OSAtomic.h>
-#include <inttypes.h>
-], [
-	OSSpinLock lock = 0;
-	OSSpinLockLock(&lock);
-	OSSpinLockUnlock(&lock);
-], [je_cv_osspin])
-if test "x${je_cv_osspin}" = "xyes" ; then
-  AC_DEFINE([JEMALLOC_OSSPIN], [ ])
-fi
-
 dnl ============================================================================
 dnl Darwin-related configuration.
 
@@ -2032,9 +2124,7 @@ fi
 dnl ============================================================================
 dnl Enable background threads if possible.
 
-if test "x${have_pthread}" = "x1" -a "x${have_dlsym}" = "x1" \
-    -a "x${je_cv_os_unfair_lock}" != "xyes" \
-    -a "x${je_cv_osspin}" != "xyes" ; then
+if test "x${have_pthread}" = "x1" -a "x${je_cv_os_unfair_lock}" != "xyes" ; then
   AC_DEFINE([JEMALLOC_BACKGROUND_THREAD])
 fi
 
@@ -2175,16 +2265,6 @@ AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [
   srcdir="${srcdir}"
   objroot="${objroot}"
 ])
-AC_CONFIG_COMMANDS([include/jemalloc/internal/size_classes.h], [
-  mkdir -p "${objroot}include/jemalloc/internal"
-  "${SHELL}" "${srcdir}/include/jemalloc/internal/size_classes.sh" "${LG_QUANTA}" 3 "${LG_PAGE_SIZES}" 2 > "${objroot}include/jemalloc/internal/size_classes.h"
-], [
-  SHELL="${SHELL}"
-  srcdir="${srcdir}"
-  objroot="${objroot}"
-  LG_QUANTA="${LG_QUANTA}"
-  LG_PAGE_SIZES="${LG_PAGE_SIZES}"
-])
 AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [
   mkdir -p "${objroot}include/jemalloc"
   cat "${srcdir}/include/jemalloc/jemalloc_protos.h.in" | sed -e 's/@je_@/jet_/g' > "${objroot}include/jemalloc/jemalloc_protos_jet.h"
@@ -2277,9 +2357,12 @@ AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE])
 AC_MSG_RESULT([                   : ${JEMALLOC_PRIVATE_NAMESPACE}])
 AC_MSG_RESULT([install_suffix     : ${install_suffix}])
 AC_MSG_RESULT([malloc_conf        : ${config_malloc_conf}])
+AC_MSG_RESULT([shared libs        : ${enable_shared}])
+AC_MSG_RESULT([static libs        : ${enable_static}])
 AC_MSG_RESULT([autogen            : ${enable_autogen}])
 AC_MSG_RESULT([debug              : ${enable_debug}])
 AC_MSG_RESULT([stats              : ${enable_stats}])
+AC_MSG_RESULT([experimental_smallocx : ${enable_experimental_smallocx}])
 AC_MSG_RESULT([prof               : ${enable_prof}])
 AC_MSG_RESULT([prof-libunwind     : ${enable_prof_libunwind}])
 AC_MSG_RESULT([prof-libgcc        : ${enable_prof_libgcc}])
diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in
index 1e12fd3a..fd0edb30 100644
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
@@ -433,10 +433,11 @@ for (i = 0; i < nbins; i++) {
       arena statistics, respectively; <quote>b</quote> and <quote>l</quote> can
       be specified to omit per size class statistics for bins and large objects,
       respectively; <quote>x</quote> can be specified to omit all mutex
-      statistics.  Unrecognized characters are silently ignored.  Note that
-      thread caching may prevent some statistics from being completely up to
-      date, since extra locking would be required to merge counters that track
-      thread cache operations.</para>
+      statistics; <quote>e</quote> can be used to omit extent statistics.
+      Unrecognized characters are silently ignored.  Note that thread caching
+      may prevent some statistics from being completely up to date, since extra
+      locking would be required to merge counters that track thread cache
+      operations.</para>
 
       <para>The <function>malloc_usable_size()</function> function
       returns the usable size of the allocation pointed to by
@@ -943,6 +944,9 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
         <citerefentry><refentrytitle>munmap</refentrytitle>
         <manvolnum>2</manvolnum></citerefentry> or equivalent (see <link
         linkend="stats.retained">stats.retained</link> for related details).
+        It also makes jemalloc use <citerefentry>
+        <refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum>
+        </citerefentry> in a more greedy way, mapping larger chunks in one go.
         This option is disabled by default unless discarding virtual memory is
         known to trigger
         platform-specific performance problems, e.g. for [64-bit] Linux, which
@@ -988,6 +992,24 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
         number of CPUs, or one if there is a single CPU.</para></listitem>
       </varlistentry>
 
+      <varlistentry id="opt.oversize_threshold">
+        <term>
+          <mallctl>opt.oversize_threshold</mallctl>
+          (<type>size_t</type>)
+          <literal>r-</literal>
+        </term>
+        <listitem><para>The threshold in bytes above which requests are considered
+        oversize.  Allocation requests with greater sizes are fulfilled from a
+        dedicated arena (automatically managed, however not within
+        <literal>narenas</literal>), in order to reduce fragmentation by not
+        mixing huge allocations with small ones.  In addition, the decay API
+        guarantees on the extents greater than the specified threshold may be
+        overridden.  Note that requests with arena index specified via
+        <constant>MALLOCX_ARENA</constant>, or threads associated with explicit
+        arenas will not be considered.  The default threshold is 8MiB.  Values
+        not within large size classes disable this feature.</para></listitem>
+      </varlistentry>
+
       <varlistentry id="opt.percpu_arena">
         <term>
           <mallctl>opt.percpu_arena</mallctl>
@@ -1009,7 +1031,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
       <varlistentry id="opt.background_thread">
         <term>
           <mallctl>opt.background_thread</mallctl>
-          (<type>const bool</type>)
+          (<type>bool</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Internal background worker threads enabled/disabled.
@@ -1024,7 +1046,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
       <varlistentry id="opt.max_background_threads">
         <term>
           <mallctl>opt.max_background_threads</mallctl>
-          (<type>const size_t</type>)
+          (<type>size_t</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Maximum number of background threads that will be created
@@ -1055,7 +1077,11 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
         linkend="arena.i.dirty_decay_ms"><mallctl>arena.&lt;i&gt;.dirty_decay_ms</mallctl></link>
         for related dynamic control options.  See <link
         linkend="opt.muzzy_decay_ms"><mallctl>opt.muzzy_decay_ms</mallctl></link>
-        for a description of muzzy pages.</para></listitem>
+        for a description of muzzy pages.  Note
+        that when the <link
+        linkend="opt.oversize_threshold"><mallctl>oversize_threshold</mallctl></link>
+        feature is enabled, the arenas reserved for oversize requests may have
+        their own default decay settings.</para></listitem>
       </varlistentry>
 
       <varlistentry id="opt.muzzy_decay_ms">
@@ -1763,10 +1789,11 @@ malloc_conf = "xmalloc:true";]]></programlisting>
         to control allocation for arenas explicitly created via <link
         linkend="arenas.create"><mallctl>arenas.create</mallctl></link> such
         that all extents originate from an application-supplied extent allocator
-        (by specifying the custom extent hook functions during arena creation),
-        but the automatically created arenas will have already created extents
-        prior to the application having an opportunity to take over extent
-        allocation.</para>
+        (by specifying the custom extent hook functions during arena creation).
+        However, the API guarantees for the automatically created arenas may be
+        relaxed -- hooks set there may be called in a "best effort" fashion; in
+        addition there may be extents created prior to the application having an
+        opportunity to take over extent allocation.</para>
 
         <programlisting language="C"><![CDATA[
 typedef extent_hooks_s extent_hooks_t;
@@ -2593,6 +2620,17 @@ struct extent_hooks_s {
         details.</para></listitem>
       </varlistentry>
 
+      <varlistentry id="stats.arenas.i.extent_avail">
+        <term>
+          <mallctl>stats.arenas.&lt;i&gt;.extent_avail</mallctl>
+          (<type>size_t</type>)
+          <literal>r-</literal>
+          [<option>--enable-stats</option>]
+        </term>
+        <listitem><para>Number of allocated (but unused) extent structs in this
+	arena.</para></listitem>
+      </varlistentry>
+
       <varlistentry id="stats.arenas.i.base">
         <term>
           <mallctl>stats.arenas.&lt;i&gt;.base</mallctl>
@@ -2922,6 +2960,30 @@ struct extent_hooks_s {
         counters</link>.</para></listitem>
       </varlistentry>
 
+      <varlistentry id="stats.arenas.i.extents.n">
+        <term>
+          <mallctl>stats.arenas.&lt;i&gt;.extents.&lt;j&gt;.n{extent_type}</mallctl>
+          (<type>size_t</type>)
+          <literal>r-</literal>
+          [<option>--enable-stats</option>]
+        </term>
+        <listitem><para> Number of extents of the given type in this arena in
+	the bucket corresponding to page size index &lt;j&gt;. The extent type
+	is one of dirty, muzzy, or retained.</para></listitem>
+      </varlistentry>
+
+      <varlistentry id="stats.arenas.i.extents.bytes">
+        <term>
+          <mallctl>stats.arenas.&lt;i&gt;.extents.&lt;j&gt;.{extent_type}_bytes</mallctl>
+          (<type>size_t</type>)
+          <literal>r-</literal>
+          [<option>--enable-stats</option>]
+        </term>
+	<listitem><para> Sum of the bytes managed by extents of the given type
+	in this arena in the bucket corresponding to page size index &lt;j&gt;.
+	The extent type is one of dirty, muzzy, or retained.</para></listitem>
+      </varlistentry>
+
       <varlistentry id="stats.arenas.i.lextents.j.nmalloc">
         <term>
           <mallctl>stats.arenas.&lt;i&gt;.lextents.&lt;j&gt;.nmalloc</mallctl>
diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index 4b3732b4..2bdddb77 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -3,8 +3,8 @@
 
 #include "jemalloc/internal/bin.h"
 #include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/pages.h"
-#include "jemalloc/internal/size_classes.h"
 #include "jemalloc/internal/stats.h"
 
 extern ssize_t opt_dirty_decay_ms;
@@ -16,13 +16,17 @@ extern const char *percpu_arena_mode_names[];
 extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
 extern malloc_mutex_t arenas_lock;
 
+extern size_t opt_oversize_threshold;
+extern size_t oversize_threshold;
+
 void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena,
     unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms,
     ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy);
 void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
-    bin_stats_t *bstats, arena_stats_large_t *lstats);
+    bin_stats_t *bstats, arena_stats_large_t *lstats,
+    arena_stats_extents_t *estats);
 void arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *extent);
 #ifdef JEMALLOC_JET
@@ -59,13 +63,14 @@ void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
 void arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize);
 void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
     bool slow_path);
-void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena,
-    extent_t *extent, void *ptr);
+void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, extent_t *extent, void *ptr);
 void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
 bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
-    size_t extra, bool zero);
+    size_t extra, bool zero, size_t *newsize);
 void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
-    size_t size, size_t alignment, bool zero, tcache_t *tcache);
+    size_t size, size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args);
 dss_prec_t arena_dss_prec_get(arena_t *arena);
 bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
 ssize_t arena_dirty_decay_ms_default_get(void);
@@ -79,7 +84,12 @@ void arena_nthreads_inc(arena_t *arena, bool internal);
 void arena_nthreads_dec(arena_t *arena, bool internal);
 size_t arena_extent_sn_next(arena_t *arena);
 arena_t *arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks);
-void arena_boot(void);
+bool arena_init_huge(void);
+bool arena_is_huge(unsigned arena_ind);
+arena_t *arena_choose_huge(tsd_t *tsd);
+bin_t *arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    unsigned *binshard);
+void arena_boot(sc_data_t *sc_data);
 void arena_prefork0(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork1(tsdn_t *tsdn, arena_t *arena);
 void arena_prefork2(tsdn_t *tsdn, arena_t *arena);
diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h
index 2b7e77e7..614deddd 100644
--- a/include/jemalloc/internal/arena_inlines_b.h
+++ b/include/jemalloc/internal/arena_inlines_b.h
@@ -4,10 +4,36 @@
 #include "jemalloc/internal/jemalloc_internal_types.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/rtree.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/sz.h"
 #include "jemalloc/internal/ticker.h"
 
+JEMALLOC_ALWAYS_INLINE bool
+arena_has_default_hooks(arena_t *arena) {
+	return (extent_hooks_get(arena) == &extent_hooks_default);
+}
+
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
+	if (arena != NULL) {
+		return arena;
+	}
+
+	/*
+	 * For huge allocations, use the dedicated huge arena if both are true:
+	 * 1) is using auto arena selection (i.e. arena == NULL), and 2) the
+	 * thread is not assigned to a manual arena.
+	 */
+	if (unlikely(size >= oversize_threshold)) {
+		arena_t *tsd_arena = tsd_arena_get(tsd);
+		if (tsd_arena == NULL || arena_is_auto(tsd_arena)) {
+			return arena_choose_huge(tsd);
+		}
+	}
+
+	return arena_choose(tsd, NULL);
+}
+
 JEMALLOC_ALWAYS_INLINE prof_tctx_t *
 arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
 	cassert(config_prof);
@@ -28,7 +54,7 @@ arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
 }
 
 JEMALLOC_ALWAYS_INLINE void
-arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize,
+arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, size_t usize,
     alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) {
 	cassert(config_prof);
 	assert(ptr != NULL);
@@ -47,7 +73,7 @@ arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize,
 }
 
 static inline void
-arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, UNUSED prof_tctx_t *tctx) {
+arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) {
 	cassert(config_prof);
 	assert(ptr != NULL);
 
@@ -57,6 +83,32 @@ arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, UNUSED prof_tctx_t *tctx) {
 	large_prof_tctx_reset(tsdn, extent);
 }
 
+JEMALLOC_ALWAYS_INLINE nstime_t
+arena_prof_alloc_time_get(tsdn_t *tsdn, const void *ptr,
+    alloc_ctx_t *alloc_ctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	extent_t *extent = iealloc(tsdn, ptr);
+	/*
+	 * Unlike arena_prof_tctx_{get, set}, we only call this once we're
+	 * sure we have a sampled allocation.
+	 */
+	assert(!extent_slab_get(extent));
+	return large_prof_alloc_time_get(extent);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx,
+    nstime_t t) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	extent_t *extent = iealloc(tsdn, ptr);
+	assert(!extent_slab_get(extent));
+	large_prof_alloc_time_set(extent, t);
+}
+
 JEMALLOC_ALWAYS_INLINE void
 arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
 	tsd_t *tsd;
@@ -83,14 +135,33 @@ arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
 	arena_decay_ticks(tsdn, arena, 1);
 }
 
+/* Purge a single extent to retained / unmapped directly. */
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_extent(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, extent_t *extent) {
+	size_t extent_size = extent_size_get(extent);
+	extent_dalloc_wrapper(tsdn, arena,
+	    r_extent_hooks, extent);
+	if (config_stats) {
+		/* Update stats accordingly. */
+		arena_stats_lock(tsdn, &arena->stats);
+		arena_stats_add_u64(tsdn, &arena->stats,
+		    &arena->decay_dirty.stats->nmadvise, 1);
+		arena_stats_add_u64(tsdn, &arena->stats,
+		    &arena->decay_dirty.stats->purged, extent_size >> LG_PAGE);
+		arena_stats_sub_zu(tsdn, &arena->stats, &arena->stats.mapped,
+		    extent_size);
+		arena_stats_unlock(tsdn, &arena->stats);
+	}
+}
+
 JEMALLOC_ALWAYS_INLINE void *
 arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
     tcache_t *tcache, bool slow_path) {
 	assert(!tsdn_null(tsdn) || tcache == NULL);
-	assert(size != 0);
 
 	if (likely(tcache != NULL)) {
-		if (likely(size <= SMALL_MAXCLASS)) {
+		if (likely(size <= SC_SMALL_MAXCLASS)) {
 			return tcache_alloc_small(tsdn_tsd(tsdn), arena,
 			    tcache, size, ind, zero, slow_path);
 		}
@@ -119,7 +190,7 @@ arena_salloc(tsdn_t *tsdn, const void *ptr) {
 
 	szind_t szind = rtree_szind_read(tsdn, &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true);
-	assert(szind != NSIZES);
+	assert(szind != SC_NSIZES);
 
 	return sz_index2size(szind);
 }
@@ -152,7 +223,7 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
 	/* Only slab members should be looked up via interior pointers. */
 	assert(extent_addr_get(extent) == ptr || extent_slab_get(extent));
 
-	assert(szind != NSIZES);
+	assert(szind != SC_NSIZES);
 
 	return sz_index2size(szind);
 }
@@ -173,7 +244,7 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
 		extent_t *extent = rtree_extent_read(tsdn, &extents_rtree,
 		    rtree_ctx, (uintptr_t)ptr, true);
 		assert(szind == extent_szind_get(extent));
-		assert(szind < NSIZES);
+		assert(szind < SC_NSIZES);
 		assert(slab == extent_slab_get(extent));
 	}
 
@@ -203,7 +274,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
 	if (alloc_ctx != NULL) {
 		szind = alloc_ctx->szind;
 		slab = alloc_ctx->slab;
-		assert(szind != NSIZES);
+		assert(szind != SC_NSIZES);
 	} else {
 		rtree_ctx = tsd_rtree_ctx(tsdn_tsd(tsdn));
 		rtree_szind_slab_read(tsdn, &extents_rtree, rtree_ctx,
@@ -215,7 +286,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
 		extent_t *extent = rtree_extent_read(tsdn, &extents_rtree,
 		    rtree_ctx, (uintptr_t)ptr, true);
 		assert(szind == extent_szind_get(extent));
-		assert(szind < NSIZES);
+		assert(szind < SC_NSIZES);
 		assert(slab == extent_slab_get(extent));
 	}
 
@@ -225,7 +296,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
 		    slow_path);
 	} else {
 		if (szind < nhbins) {
-			if (config_prof && unlikely(szind < NBINS)) {
+			if (config_prof && unlikely(szind < SC_NBINS)) {
 				arena_dalloc_promoted(tsdn, ptr, tcache,
 				    slow_path);
 			} else {
@@ -242,7 +313,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
 static inline void
 arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
 	assert(ptr != NULL);
-	assert(size <= LARGE_MAXCLASS);
+	assert(size <= SC_LARGE_MAXCLASS);
 
 	szind_t szind;
 	bool slab;
@@ -252,7 +323,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
 		 * object, so base szind and slab on the given size.
 		 */
 		szind = sz_size2index(size);
-		slab = (szind < NBINS);
+		slab = (szind < SC_NBINS);
 	}
 
 	if ((config_prof && opt_prof) || config_debug) {
@@ -264,7 +335,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
 		    (uintptr_t)ptr, true, &szind, &slab);
 
 		assert(szind == sz_size2index(size));
-		assert((config_prof && opt_prof) || slab == (szind < NBINS));
+		assert((config_prof && opt_prof) || slab == (szind < SC_NBINS));
 
 		if (config_debug) {
 			extent_t *extent = rtree_extent_read(tsdn,
@@ -288,7 +359,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
     alloc_ctx_t *alloc_ctx, bool slow_path) {
 	assert(!tsdn_null(tsdn) || tcache == NULL);
 	assert(ptr != NULL);
-	assert(size <= LARGE_MAXCLASS);
+	assert(size <= SC_LARGE_MAXCLASS);
 
 	if (unlikely(tcache == NULL)) {
 		arena_sdalloc_no_tcache(tsdn, ptr, size);
@@ -297,7 +368,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
 
 	szind_t szind;
 	bool slab;
-	UNUSED alloc_ctx_t local_ctx;
+	alloc_ctx_t local_ctx;
 	if (config_prof && opt_prof) {
 		if (alloc_ctx == NULL) {
 			/* Uncommon case and should be a static check. */
@@ -318,7 +389,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
 		 * object, so base szind and slab on the given size.
 		 */
 		szind = sz_size2index(size);
-		slab = (szind < NBINS);
+		slab = (szind < SC_NBINS);
 	}
 
 	if (config_debug) {
@@ -337,7 +408,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
 		    slow_path);
 	} else {
 		if (szind < nhbins) {
-			if (config_prof && unlikely(szind < NBINS)) {
+			if (config_prof && unlikely(szind < SC_NBINS)) {
 				arena_dalloc_promoted(tsdn, ptr, tcache,
 				    slow_path);
 			} else {
diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h
index 5f3dca8b..ef1e25b3 100644
--- a/include/jemalloc/internal/arena_stats.h
+++ b/include/jemalloc/internal/arena_stats.h
@@ -4,7 +4,9 @@
 #include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mutex_prof.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
+
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
 
 /*
  * In those architectures that support 64-bit atomics, we use atomic updates for
@@ -48,6 +50,22 @@ struct arena_stats_decay_s {
 	arena_stats_u64_t	purged;
 };
 
+typedef struct arena_stats_extents_s arena_stats_extents_t;
+struct arena_stats_extents_s {
+	/*
+	 * Stats for a given index in the range [0, SC_NPSIZES] in an extents_t.
+	 * We track both bytes and # of extents: two extents in the same bucket
+	 * may have different sizes if adjacent size classes differ by more than
+	 * a page, so bytes cannot always be derived from # of extents.
+	 */
+	atomic_zu_t ndirty;
+	atomic_zu_t dirty_bytes;
+	atomic_zu_t nmuzzy;
+	atomic_zu_t muzzy_bytes;
+	atomic_zu_t nretained;
+	atomic_zu_t retained_bytes;
+};
+
 /*
  * Arena stats.  Note that fields marked "derived" are not directly maintained
  * within the arena code; rather their values are derived during stats merge
@@ -69,6 +87,9 @@ struct arena_stats_s {
 	 */
 	atomic_zu_t		retained; /* Derived. */
 
+	/* Number of extent_t structs allocated by base, but not being used. */
+	atomic_zu_t		extent_avail;
+
 	arena_stats_decay_t	decay_dirty;
 	arena_stats_decay_t	decay_muzzy;
 
@@ -88,14 +109,14 @@ struct arena_stats_s {
 	mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
 
 	/* One element for each large size class. */
-	arena_stats_large_t	lstats[NSIZES - NBINS];
+	arena_stats_large_t	lstats[SC_NSIZES - SC_NBINS];
 
 	/* Arena uptime. */
 	nstime_t		uptime;
 };
 
 static inline bool
-arena_stats_init(UNUSED tsdn_t *tsdn, arena_stats_t *arena_stats) {
+arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
 	if (config_debug) {
 		for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
 			assert(((char *)arena_stats)[i] == 0);
@@ -147,11 +168,11 @@ arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
 #endif
 }
 
-UNUSED static inline void
+static inline void
 arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
     arena_stats_u64_t *p, uint64_t x) {
 #ifdef JEMALLOC_ATOMIC_U64
-	UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
+	uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
 	assert(r - x <= r);
 #else
 	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
@@ -176,7 +197,8 @@ arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) {
 }
 
 static inline size_t
-arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
+arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    atomic_zu_t *p) {
 #ifdef JEMALLOC_ATOMIC_U64
 	return atomic_load_zu(p, ATOMIC_RELAXED);
 #else
@@ -186,8 +208,8 @@ arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
 }
 
 static inline void
-arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
-    size_t x) {
+arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    atomic_zu_t *p, size_t x) {
 #ifdef JEMALLOC_ATOMIC_U64
 	atomic_fetch_add_zu(p, x, ATOMIC_RELAXED);
 #else
@@ -198,10 +220,10 @@ arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
 }
 
 static inline void
-arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
-    size_t x) {
+arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    atomic_zu_t *p, size_t x) {
 #ifdef JEMALLOC_ATOMIC_U64
-	UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
+	size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
 	assert(r - x <= r);
 #else
 	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
@@ -222,7 +244,7 @@ arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
     szind_t szind, uint64_t nrequests) {
 	arena_stats_lock(tsdn, arena_stats);
 	arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind -
-	    NBINS].nrequests, nrequests);
+	    SC_NBINS].nrequests, nrequests);
 	arena_stats_unlock(tsdn, arena_stats);
 }
 
@@ -233,5 +255,4 @@ arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) {
 	arena_stats_unlock(tsdn, arena_stats);
 }
 
-
 #endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */
diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h
index 38bc9596..950bd13c 100644
--- a/include/jemalloc/internal/arena_structs_b.h
+++ b/include/jemalloc/internal/arena_structs_b.h
@@ -10,7 +10,7 @@
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/nstime.h"
 #include "jemalloc/internal/ql.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/smoothstep.h"
 #include "jemalloc/internal/ticker.h"
 
@@ -90,6 +90,9 @@ struct arena_s {
 	 */
 	atomic_u_t		nthreads[2];
 
+	/* Next bin shard for binding new threads. Synchronization: atomic. */
+	atomic_u_t		binshard_next;
+
 	/*
 	 * When percpu_arena is enabled, to amortize the cost of reading /
 	 * updating the current CPU id, track the most recent thread accessing
@@ -196,6 +199,7 @@ struct arena_s {
 	 * Synchronization: extent_avail_mtx.
 	 */
 	extent_tree_t		extent_avail;
+	atomic_zu_t		extent_avail_cnt;
 	malloc_mutex_t		extent_avail_mtx;
 
 	/*
@@ -203,7 +207,7 @@ struct arena_s {
 	 *
 	 * Synchronization: internal.
 	 */
-	bin_t			bins[NBINS];
+	bins_t			bins[SC_NBINS];
 
 	/*
 	 * Base allocator, from which arena metadata are allocated.
diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h
index 70001b5f..624937e4 100644
--- a/include/jemalloc/internal/arena_types.h
+++ b/include/jemalloc/internal/arena_types.h
@@ -1,13 +1,15 @@
 #ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H
 #define JEMALLOC_INTERNAL_ARENA_TYPES_H
 
+#include "jemalloc/internal/sc.h"
+
 /* Maximum number of regions in one slab. */
-#define LG_SLAB_MAXREGS		(LG_PAGE - LG_TINY_MIN)
+#define LG_SLAB_MAXREGS		(LG_PAGE - SC_LG_TINY_MIN)
 #define SLAB_MAXREGS		(1U << LG_SLAB_MAXREGS)
 
 /* Default decay times in milliseconds. */
 #define DIRTY_DECAY_MS_DEFAULT	ZD(10 * 1000)
-#define MUZZY_DECAY_MS_DEFAULT	ZD(10 * 1000)
+#define MUZZY_DECAY_MS_DEFAULT	(0)
 /* Number of event ticks between time checks. */
 #define DECAY_NTICKS_PER_UPDATE	1000
 
@@ -40,4 +42,10 @@ typedef enum {
 #define PERCPU_ARENA_ENABLED(m)	((m) >= percpu_arena_mode_enabled_base)
 #define PERCPU_ARENA_DEFAULT	percpu_arena_disabled
 
+/*
+ * When allocation_size >= oversize_threshold, use the dedicated huge arena
+ * (unless an arena index is explicitly specified).  0 disables the feature.
+ */
+#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20)
+
 #endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */
diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h
index adadb1a3..a76f54ce 100644
--- a/include/jemalloc/internal/atomic.h
+++ b/include/jemalloc/internal/atomic.h
@@ -1,12 +1,19 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_H
 #define JEMALLOC_INTERNAL_ATOMIC_H
 
-#define ATOMIC_INLINE static inline
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
 
+#define JEMALLOC_U8_ATOMICS
 #if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
 #  include "jemalloc/internal/atomic_gcc_atomic.h"
+#  if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS)
+#    undef JEMALLOC_U8_ATOMICS
+#  endif
 #elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
 #  include "jemalloc/internal/atomic_gcc_sync.h"
+#  if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS)
+#    undef JEMALLOC_U8_ATOMICS
+#  endif
 #elif defined(_MSC_VER)
 #  include "jemalloc/internal/atomic_msvc.h"
 #elif defined(JEMALLOC_C11_ATOMICS)
@@ -66,6 +73,8 @@ JEMALLOC_GENERATE_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)
 
 JEMALLOC_GENERATE_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR)
 
+JEMALLOC_GENERATE_INT_ATOMICS(uint8_t, u8, 0)
+
 JEMALLOC_GENERATE_INT_ATOMICS(uint32_t, u32, 2)
 
 #ifdef JEMALLOC_ATOMIC_U64
diff --git a/include/jemalloc/internal/atomic_gcc_sync.h b/include/jemalloc/internal/atomic_gcc_sync.h
index 30846e4d..e02b7cbe 100644
--- a/include/jemalloc/internal/atomic_gcc_sync.h
+++ b/include/jemalloc/internal/atomic_gcc_sync.h
@@ -27,8 +27,10 @@ atomic_fence(atomic_memory_order_t mo) {
 	asm volatile("" ::: "memory");
 #  if defined(__i386__) || defined(__x86_64__)
 	/* This is implicit on x86. */
-#  elif defined(__ppc__)
+#  elif defined(__ppc64__)
 	asm volatile("lwsync");
+#  elif defined(__ppc__)
+	asm volatile("sync");
 #  elif defined(__sparc__) && defined(__arch64__)
 	if (mo == atomic_memory_order_acquire) {
 		asm volatile("membar #LoadLoad | #LoadStore");
@@ -113,8 +115,8 @@ atomic_store_##short_type(atomic_##short_type##_t *a,			\
 }									\
 									\
 ATOMIC_INLINE type							\
-atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
-    atomic_memory_order_t mo) {						\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
+    atomic_memory_order_t mo) {                  					 \
 	/*								\
 	 * Because of FreeBSD, we care about gcc 4.2, which doesn't have\
 	 * an atomic exchange builtin.  We fake it with a CAS loop.	\
@@ -129,8 +131,9 @@ atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
-    type *expected, type desired, atomic_memory_order_t success_mo,	\
-    atomic_memory_order_t failure_mo) {					\
+    type *expected, type desired,                                     \
+    atomic_memory_order_t success_mo,                          \
+    atomic_memory_order_t failure_mo) {				                \
 	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
 	    desired);							\
 	if (prev == *expected) {					\
@@ -142,8 +145,9 @@ atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
 }									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
-    type *expected, type desired, atomic_memory_order_t success_mo,	\
-    atomic_memory_order_t failure_mo) {					\
+    type *expected, type desired,                                       \
+    atomic_memory_order_t success_mo,                            \
+    atomic_memory_order_t failure_mo) {                          \
 	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
 	    desired);							\
 	if (prev == *expected) {					\
diff --git a/include/jemalloc/internal/background_thread_externs.h b/include/jemalloc/internal/background_thread_externs.h
index 3209aa49..0f997e18 100644
--- a/include/jemalloc/internal/background_thread_externs.h
+++ b/include/jemalloc/internal/background_thread_externs.h
@@ -8,7 +8,6 @@ extern atomic_b_t background_thread_enabled_state;
 extern size_t n_background_threads;
 extern size_t max_background_threads;
 extern background_thread_info_t *background_thread_info;
-extern bool can_enable_background_thread;
 
 bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
 bool background_threads_enable(tsd_t *tsd);
diff --git a/include/jemalloc/internal/background_thread_inlines.h b/include/jemalloc/internal/background_thread_inlines.h
index ef50231e..f85e86fa 100644
--- a/include/jemalloc/internal/background_thread_inlines.h
+++ b/include/jemalloc/internal/background_thread_inlines.h
@@ -15,7 +15,12 @@ background_thread_enabled_set(tsdn_t *tsdn, bool state) {
 JEMALLOC_ALWAYS_INLINE background_thread_info_t *
 arena_background_thread_info_get(arena_t *arena) {
 	unsigned arena_ind = arena_ind_get(arena);
-	return &background_thread_info[arena_ind % ncpus];
+	return &background_thread_info[arena_ind % max_background_threads];
+}
+
+JEMALLOC_ALWAYS_INLINE background_thread_info_t *
+background_thread_info_get(size_t ind) {
+	return &background_thread_info[ind % max_background_threads];
 }
 
 JEMALLOC_ALWAYS_INLINE uint64_t
diff --git a/include/jemalloc/internal/background_thread_structs.h b/include/jemalloc/internal/background_thread_structs.h
index c1107dfe..c02aa434 100644
--- a/include/jemalloc/internal/background_thread_structs.h
+++ b/include/jemalloc/internal/background_thread_structs.h
@@ -9,6 +9,7 @@
 
 #define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX
 #define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT
+#define DEFAULT_NUM_BACKGROUND_THREAD 4
 
 typedef enum {
 	background_thread_stopped,
diff --git a/include/jemalloc/internal/base_structs.h b/include/jemalloc/internal/base_structs.h
index 2102247a..07f214eb 100644
--- a/include/jemalloc/internal/base_structs.h
+++ b/include/jemalloc/internal/base_structs.h
@@ -3,7 +3,7 @@
 
 #include "jemalloc/internal/jemalloc_internal_types.h"
 #include "jemalloc/internal/mutex.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 
 /* Embedded at the beginning of every block of base-managed virtual memory. */
 struct base_block_s {
@@ -46,7 +46,7 @@ struct base_s {
 	base_block_t	*blocks;
 
 	/* Heap of extents that track unused trailing space within blocks. */
-	extent_heap_t	avail[NSIZES];
+	extent_heap_t	avail[SC_NSIZES];
 
 	/* Stats, only maintained if config_stats. */
 	size_t		allocated;
diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h
index 9b416ada..f542c882 100644
--- a/include/jemalloc/internal/bin.h
+++ b/include/jemalloc/internal/bin.h
@@ -1,10 +1,12 @@
 #ifndef JEMALLOC_INTERNAL_BIN_H
 #define JEMALLOC_INTERNAL_BIN_H
 
+#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/bin_types.h"
 #include "jemalloc/internal/extent_types.h"
 #include "jemalloc/internal/extent_structs.h"
 #include "jemalloc/internal/mutex.h"
-#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/sc.h"
 
 /*
  * A bin contains a set of extents that are currently being used for slab
@@ -41,6 +43,9 @@ struct bin_info_s {
 	/* Total number of regions in a slab for this bin's size class. */
 	uint32_t		nregs;
 
+	/* Number of sharded bins in each arena for this size class. */
+	uint32_t		n_shards;
+
 	/*
 	 * Metadata used to manipulate bitmaps for slabs associated with this
 	 * bin.
@@ -48,8 +53,7 @@ struct bin_info_s {
 	bitmap_info_t		bitmap_info;
 };
 
-extern const bin_info_t bin_infos[NBINS];
-
+extern bin_info_t bin_infos[SC_NBINS];
 
 typedef struct bin_s bin_t;
 struct bin_s {
@@ -78,6 +82,18 @@ struct bin_s {
 	bin_stats_t	stats;
 };
 
+/* A set of sharded bins of the same size class. */
+typedef struct bins_s bins_t;
+struct bins_s {
+	/* Sharded bins.  Dynamically sized. */
+	bin_t *bin_shards;
+};
+
+void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]);
+bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
+    size_t end_size, size_t nshards);
+void bin_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]);
+
 /* Initializes a bin to empty.  Returns true on error. */
 bool bin_init(bin_t *bin);
 
@@ -90,7 +106,7 @@ void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
 static inline void
 bin_stats_merge(tsdn_t *tsdn, bin_stats_t *dst_bin_stats, bin_t *bin) {
 	malloc_mutex_lock(tsdn, &bin->lock);
-	malloc_mutex_prof_read(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
+	malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
 	dst_bin_stats->nmalloc += bin->stats.nmalloc;
 	dst_bin_stats->ndalloc += bin->stats.ndalloc;
 	dst_bin_stats->nrequests += bin->stats.nrequests;
diff --git a/include/jemalloc/internal/bin_types.h b/include/jemalloc/internal/bin_types.h
new file mode 100644
index 00000000..3533606b
--- /dev/null
+++ b/include/jemalloc/internal/bin_types.h
@@ -0,0 +1,17 @@
+#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H
+#define JEMALLOC_INTERNAL_BIN_TYPES_H
+
+#include "jemalloc/internal/sc.h"
+
+#define BIN_SHARDS_MAX (1 << EXTENT_BITS_BINSHARD_WIDTH)
+#define N_BIN_SHARDS_DEFAULT 1
+
+/* Used in TSD static initializer only. Real init in arena_bind(). */
+#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}}
+
+typedef struct tsd_binshards_s tsd_binshards_t;
+struct tsd_binshards_s {
+	uint8_t binshard[SC_NBINS];
+};
+
+#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */
diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h
index 8d078a8a..c045eb86 100644
--- a/include/jemalloc/internal/bit_util.h
+++ b/include/jemalloc/internal/bit_util.h
@@ -27,6 +27,25 @@ ffs_u(unsigned bitmap) {
 	return JEMALLOC_INTERNAL_FFS(bitmap);
 }
 
+#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+BIT_UTIL_INLINE unsigned
+popcount_lu(unsigned long bitmap) {
+  return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+}
+#endif
+
+/*
+ * Clears the first (lowest) set bit in bitmap, and returns the
+ * index of that bit.  bitmap *must not* be 0.
+ */
+
+BIT_UTIL_INLINE size_t
+cfs_lu(unsigned long* bitmap) {
+	size_t bit = ffs_lu(*bitmap) - 1;
+	*bitmap ^= ZU(1) << bit;
+	return bit;
+}
+
 BIT_UTIL_INLINE unsigned
 ffs_zu(size_t bitmap) {
 #if LG_SIZEOF_PTR == LG_SIZEOF_INT
@@ -63,6 +82,22 @@ ffs_u32(uint32_t bitmap) {
 
 BIT_UTIL_INLINE uint64_t
 pow2_ceil_u64(uint64_t x) {
+#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+	if(unlikely(x <= 1)) {
+		return x;
+	}
+	size_t msb_on_index;
+#if (defined(__amd64__) || defined(__x86_64__))
+	asm ("bsrq %1, %0"
+			: "=r"(msb_on_index) // Outputs.
+			: "r"(x-1)           // Inputs.
+		);
+#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+	msb_on_index = (63 ^ __builtin_clzll(x - 1));
+#endif
+	assert(msb_on_index < 63);
+	return 1ULL << (msb_on_index + 1);
+#else
 	x--;
 	x |= x >> 1;
 	x |= x >> 2;
@@ -72,10 +107,27 @@ pow2_ceil_u64(uint64_t x) {
 	x |= x >> 32;
 	x++;
 	return x;
+#endif
 }
 
 BIT_UTIL_INLINE uint32_t
 pow2_ceil_u32(uint32_t x) {
+#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__)))
+	if(unlikely(x <= 1)) {
+		return x;
+	}
+	size_t msb_on_index;
+#if (defined(__i386__))
+	asm ("bsr %1, %0"
+			: "=r"(msb_on_index) // Outputs.
+			: "r"(x-1)           // Inputs.
+		);
+#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+	msb_on_index = (31 ^ __builtin_clz(x - 1));
+#endif
+	assert(msb_on_index < 31);
+	return 1U << (msb_on_index + 1);
+#else
 	x--;
 	x |= x >> 1;
 	x |= x >> 2;
@@ -84,6 +136,7 @@ pow2_ceil_u32(uint32_t x) {
 	x |= x >> 16;
 	x++;
 	return x;
+#endif
 }
 
 /* Compute the smallest power of 2 that is >= x. */
@@ -160,6 +213,27 @@ lg_floor(size_t x) {
 }
 #endif
 
+BIT_UTIL_INLINE unsigned
+lg_ceil(size_t x) {
+	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
+}
+
 #undef BIT_UTIL_INLINE
 
+/* A compile-time version of lg_floor and lg_ceil. */
+#define LG_FLOOR_1(x) 0
+#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
+#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
+#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
+#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
+#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
+#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
+#if LG_SIZEOF_PTR == 2
+#  define LG_FLOOR(x) LG_FLOOR_32((x))
+#else
+#  define LG_FLOOR(x) LG_FLOOR_64((x))
+#endif
+
+#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
+
 #endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */
diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h
index ac990290..c3f9cb49 100644
--- a/include/jemalloc/internal/bitmap.h
+++ b/include/jemalloc/internal/bitmap.h
@@ -3,18 +3,18 @@
 
 #include "jemalloc/internal/arena_types.h"
 #include "jemalloc/internal/bit_util.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 
 typedef unsigned long bitmap_t;
 #define LG_SIZEOF_BITMAP	LG_SIZEOF_LONG
 
 /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
-#if LG_SLAB_MAXREGS > LG_CEIL_NSIZES
+#if LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES)
 /* Maximum bitmap bit count is determined by maximum regions per slab. */
 #  define LG_BITMAP_MAXBITS	LG_SLAB_MAXREGS
 #else
 /* Maximum bitmap bit count is determined by number of extent size classes. */
-#  define LG_BITMAP_MAXBITS	LG_CEIL_NSIZES
+#  define LG_BITMAP_MAXBITS	LG_CEIL(SC_NSIZES)
 #endif
 #define BITMAP_MAXBITS		(ZU(1) << LG_BITMAP_MAXBITS)
 
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index 12f3ef2d..d14556a3 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -88,11 +88,21 @@ JEMALLOC_ALWAYS_INLINE void *
 cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
 	void *ret;
 
-	if (unlikely(bin->ncached == 0)) {
-		bin->low_water = -1;
-		*success = false;
-		return NULL;
+	bin->ncached--;
+
+	/*
+	 * Check for both bin->ncached == 0 and ncached < low_water
+	 * in a single branch.
+	 */
+	if (unlikely(bin->ncached <= bin->low_water)) {
+		bin->low_water = bin->ncached;
+		if (bin->ncached == -1) {
+			bin->ncached = 0;
+			*success = false;
+			return NULL;
+		}
 	}
+
 	/*
 	 * success (instead of ret) should be checked upon the return of this
 	 * function.  We avoid checking (ret == NULL) because there is never a
@@ -101,14 +111,21 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
 	 * cacheline).
 	 */
 	*success = true;
-	ret = *(bin->avail - bin->ncached);
-	bin->ncached--;
-
-	if (unlikely(bin->ncached < bin->low_water)) {
-		bin->low_water = bin->ncached;
-	}
+	ret = *(bin->avail - (bin->ncached + 1));
 
 	return ret;
 }
 
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_dalloc_easy(cache_bin_t *bin, cache_bin_info_t *bin_info, void *ptr) {
+	if (unlikely(bin->ncached == bin_info->ncached_max)) {
+		return false;
+	}
+	assert(bin->ncached < bin_info->ncached_max);
+	bin->ncached++;
+	*(bin->avail - bin->ncached) = ptr;
+
+	return true;
+}
+
 #endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h
index d927d948..775fdec0 100644
--- a/include/jemalloc/internal/ctl.h
+++ b/include/jemalloc/internal/ctl.h
@@ -5,7 +5,7 @@
 #include "jemalloc/internal/malloc_io.h"
 #include "jemalloc/internal/mutex_prof.h"
 #include "jemalloc/internal/ql.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/stats.h"
 
 /* Maximum ctl tree depth. */
@@ -40,8 +40,9 @@ typedef struct ctl_arena_stats_s {
 	uint64_t ndalloc_small;
 	uint64_t nrequests_small;
 
-	bin_stats_t bstats[NBINS];
-	arena_stats_large_t lstats[NSIZES - NBINS];
+	bin_stats_t bstats[SC_NBINS];
+	arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
+	arena_stats_extents_t estats[SC_NPSIZES];
 } ctl_arena_stats_t;
 
 typedef struct ctl_stats_s {
diff --git a/include/jemalloc/internal/emitter.h b/include/jemalloc/internal/emitter.h
index 3a2b2f7f..0a8bc2c0 100644
--- a/include/jemalloc/internal/emitter.h
+++ b/include/jemalloc/internal/emitter.h
@@ -45,7 +45,9 @@ struct emitter_col_s {
 		int int_val;
 		unsigned unsigned_val;
 		uint32_t uint32_val;
+		uint32_t uint32_t_val;
 		uint64_t uint64_val;
+		uint64_t uint64_t_val;
 		size_t size_val;
 		ssize_t ssize_val;
 		const char *str_val;
@@ -60,17 +62,6 @@ struct emitter_row_s {
 	ql_head(emitter_col_t) cols;
 };
 
-static inline void
-emitter_row_init(emitter_row_t *row) {
-	ql_new(&row->cols);
-}
-
-static inline void
-emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
-	ql_elm_new(col, link);
-	ql_tail_insert(&row->cols, col, link);
-}
-
 typedef struct emitter_s emitter_t;
 struct emitter_s {
 	emitter_output_t output;
@@ -80,18 +71,10 @@ struct emitter_s {
 	int nesting_depth;
 	/* True if we've already emitted a value at the given depth. */
 	bool item_at_depth;
+	/* True if we emitted a key and will emit corresponding value next. */
+	bool emitted_key;
 };
 
-static inline void
-emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
-    void (*write_cb)(void *, const char *), void *cbopaque) {
-	emitter->output = emitter_output;
-	emitter->write_cb = write_cb;
-	emitter->cbopaque = cbopaque;
-	emitter->item_at_depth = false;
-	emitter->nesting_depth = 0;
-}
-
 /* Internal convenience function.  Write to the emitter the given string. */
 JEMALLOC_FORMAT_PRINTF(2, 3)
 static inline void
@@ -103,18 +86,6 @@ emitter_printf(emitter_t *emitter, const char *format, ...) {
 	va_end(ap);
 }
 
-/* Write to the emitter the given string, but only in table mode. */
-JEMALLOC_FORMAT_PRINTF(2, 3)
-static inline void
-emitter_table_printf(emitter_t *emitter, const char *format, ...) {
-	if (emitter->output == emitter_output_table) {
-		va_list ap;
-		va_start(ap, format);
-		malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
-		va_end(ap);
-	}
-}
-
 static inline void
 emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
     emitter_justify_t justify, int width) {
@@ -235,47 +206,143 @@ emitter_indent(emitter_t *emitter) {
 
 static inline void
 emitter_json_key_prefix(emitter_t *emitter) {
+	if (emitter->emitted_key) {
+		emitter->emitted_key = false;
+		return;
+	}
 	emitter_printf(emitter, "%s\n", emitter->item_at_depth ? "," : "");
 	emitter_indent(emitter);
 }
 
-static inline void
-emitter_begin(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		assert(emitter->nesting_depth == 0);
-		emitter_printf(emitter, "{");
-		emitter_nest_inc(emitter);
-	} else {
-		// tabular init
-		emitter_printf(emitter, "%s", "");
-	}
-}
+/******************************************************************************/
+/* Public functions for emitter_t. */
 
 static inline void
-emitter_end(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		assert(emitter->nesting_depth == 1);
-		emitter_nest_dec(emitter);
-		emitter_printf(emitter, "\n}\n");
-	}
+emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
+    void (*write_cb)(void *, const char *), void *cbopaque) {
+	emitter->output = emitter_output;
+	emitter->write_cb = write_cb;
+	emitter->cbopaque = cbopaque;
+	emitter->item_at_depth = false;
+	emitter->emitted_key = false; 
+	emitter->nesting_depth = 0;
 }
 
-/*
- * Note emits a different kv pair as well, but only in table mode.  Omits the
- * note if table_note_key is NULL.
+/******************************************************************************/
+/* JSON public API. */
+
+/* 
+ * Emits a key (e.g. as appears in an object). The next json entity emitted will
+ * be the corresponding value.
  */
 static inline void
-emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
+emitter_json_key(emitter_t *emitter, const char *json_key) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_key_prefix(emitter);
+		emitter_printf(emitter, "\"%s\": ", json_key);
+		emitter->emitted_key = true;
+	}
+}
+
+static inline void
+emitter_json_value(emitter_t *emitter, emitter_type_t value_type,
+    const void *value) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_key_prefix(emitter);
+		emitter_print_value(emitter, emitter_justify_none, -1,
+		    value_type, value);
+		emitter->item_at_depth = true;
+	}
+}
+
+/* Shorthand for calling emitter_json_key and then emitter_json_value. */
+static inline void
+emitter_json_kv(emitter_t *emitter, const char *json_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_value(emitter, value_type, value);
+}
+
+static inline void
+emitter_json_array_begin(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_key_prefix(emitter);
+		emitter_printf(emitter, "[");
+		emitter_nest_inc(emitter);
+	}
+}
+
+/* Shorthand for calling emitter_json_key and then emitter_json_array_begin. */
+static inline void
+emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_array_begin(emitter);
+}
+
+static inline void
+emitter_json_array_end(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		assert(emitter->nesting_depth > 0);
+		emitter_nest_dec(emitter);
+		emitter_printf(emitter, "\n");
+		emitter_indent(emitter);
+		emitter_printf(emitter, "]");
+	}
+}
+
+static inline void
+emitter_json_object_begin(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_key_prefix(emitter);
+		emitter_printf(emitter, "{");
+		emitter_nest_inc(emitter);
+	}
+}
+
+/* Shorthand for calling emitter_json_key and then emitter_json_object_begin. */
+static inline void
+emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_object_begin(emitter);
+}
+
+static inline void
+emitter_json_object_end(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		assert(emitter->nesting_depth > 0);
+		emitter_nest_dec(emitter);
+		emitter_printf(emitter, "\n");
+		emitter_indent(emitter);
+		emitter_printf(emitter, "}");
+	}
+}
+
+
+/******************************************************************************/
+/* Table public API. */
+
+static inline void
+emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
+	if (emitter->output == emitter_output_table) {
+		emitter_indent(emitter);
+		emitter_printf(emitter, "%s\n", table_key);
+		emitter_nest_inc(emitter);
+	}
+}
+
+static inline void
+emitter_table_dict_end(emitter_t *emitter) {
+	if (emitter->output == emitter_output_table) {
+		emitter_nest_dec(emitter);
+	}
+}
+
+static inline void
+emitter_table_kv_note(emitter_t *emitter, const char *table_key,
     emitter_type_t value_type, const void *value,
     const char *table_note_key, emitter_type_t table_note_value_type,
     const void *table_note_value) {
-	if (emitter->output == emitter_output_json) {
-		assert(emitter->nesting_depth > 0);
-		emitter_json_key_prefix(emitter);
-		emitter_printf(emitter, "\"%s\": ", json_key);
-		emitter_print_value(emitter, emitter_justify_none, -1,
-		    value_type, value);
-	} else {
+	if (emitter->output == emitter_output_table) {
 		emitter_indent(emitter);
 		emitter_printf(emitter, "%s: ", table_key);
 		emitter_print_value(emitter, emitter_justify_none, -1,
@@ -292,130 +359,22 @@ emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
 }
 
 static inline void
-emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
+emitter_table_kv(emitter_t *emitter, const char *table_key,
     emitter_type_t value_type, const void *value) {
-	emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
+	emitter_table_kv_note(emitter, table_key, value_type, value, NULL,
 	    emitter_type_bool, NULL);
 }
 
-static inline void
-emitter_json_kv(emitter_t *emitter, const char *json_key,
-    emitter_type_t value_type, const void *value) {
-	if (emitter->output == emitter_output_json) {
-		emitter_kv(emitter, json_key, NULL, value_type, value);
-	}
-}
 
+/* Write to the emitter the given string, but only in table mode. */
+JEMALLOC_FORMAT_PRINTF(2, 3)
 static inline void
-emitter_table_kv(emitter_t *emitter, const char *table_key,
-    emitter_type_t value_type, const void *value) {
+emitter_table_printf(emitter_t *emitter, const char *format, ...) {
 	if (emitter->output == emitter_output_table) {
-		emitter_kv(emitter, NULL, table_key, value_type, value);
-	}
-}
-
-static inline void
-emitter_dict_begin(emitter_t *emitter, const char *json_key,
-    const char *table_header) {
-	if (emitter->output == emitter_output_json) {
-		emitter_json_key_prefix(emitter);
-		emitter_printf(emitter, "\"%s\": {", json_key);
-		emitter_nest_inc(emitter);
-	} else {
-		emitter_indent(emitter);
-		emitter_printf(emitter, "%s\n", table_header);
-		emitter_nest_inc(emitter);
-	}
-}
-
-static inline void
-emitter_dict_end(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		assert(emitter->nesting_depth > 0);
-		emitter_nest_dec(emitter);
-		emitter_printf(emitter, "\n");
-		emitter_indent(emitter);
-		emitter_printf(emitter, "}");
-	} else {
-		emitter_nest_dec(emitter);
-	}
-}
-
-static inline void
-emitter_json_dict_begin(emitter_t *emitter, const char *json_key) {
-	if (emitter->output == emitter_output_json) {
-		emitter_dict_begin(emitter, json_key, NULL);
-	}
-}
-
-static inline void
-emitter_json_dict_end(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		emitter_dict_end(emitter);
-	}
-}
-
-static inline void
-emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
-	if (emitter->output == emitter_output_table) {
-		emitter_dict_begin(emitter, NULL, table_key);
-	}
-}
-
-static inline void
-emitter_table_dict_end(emitter_t *emitter) {
-	if (emitter->output == emitter_output_table) {
-		emitter_dict_end(emitter);
-	}
-}
-
-static inline void
-emitter_json_arr_begin(emitter_t *emitter, const char *json_key) {
-	if (emitter->output == emitter_output_json) {
-		emitter_json_key_prefix(emitter);
-		emitter_printf(emitter, "\"%s\": [", json_key);
-		emitter_nest_inc(emitter);
-	}
-}
-
-static inline void
-emitter_json_arr_end(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		assert(emitter->nesting_depth > 0);
-		emitter_nest_dec(emitter);
-		emitter_printf(emitter, "\n");
-		emitter_indent(emitter);
-		emitter_printf(emitter, "]");
-	}
-}
-
-static inline void
-emitter_json_arr_obj_begin(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		emitter_json_key_prefix(emitter);
-		emitter_printf(emitter, "{");
-		emitter_nest_inc(emitter);
-	}
-}
-
-static inline void
-emitter_json_arr_obj_end(emitter_t *emitter) {
-	if (emitter->output == emitter_output_json) {
-		assert(emitter->nesting_depth > 0);
-		emitter_nest_dec(emitter);
-		emitter_printf(emitter, "\n");
-		emitter_indent(emitter);
-		emitter_printf(emitter, "}");
-	}
-}
-
-static inline void
-emitter_json_arr_value(emitter_t *emitter, emitter_type_t value_type,
-    const void *value) {
-	if (emitter->output == emitter_output_json) {
-		emitter_json_key_prefix(emitter);
-		emitter_print_value(emitter, emitter_justify_none, -1,
-		    value_type, value);
+		va_list ap;
+		va_start(ap, format);
+		malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
+		va_end(ap);
 	}
 }
 
@@ -432,4 +391,93 @@ emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
 	emitter_table_printf(emitter, "\n");
 }
 
+static inline void
+emitter_row_init(emitter_row_t *row) {
+	ql_new(&row->cols);
+}
+
+static inline void
+emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
+	ql_elm_new(col, link);
+	ql_tail_insert(&row->cols, col, link);
+}
+
+
+/******************************************************************************/
+/*
+ * Generalized public API. Emits using either JSON or table, according to
+ * settings in the emitter_t. */
+
+/*
+ * Note emits a different kv pair as well, but only in table mode.  Omits the
+ * note if table_note_key is NULL.
+ */
+static inline void
+emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
+    emitter_type_t value_type, const void *value,
+    const char *table_note_key, emitter_type_t table_note_value_type,
+    const void *table_note_value) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_key(emitter, json_key);
+		emitter_json_value(emitter, value_type, value);
+	} else {
+		emitter_table_kv_note(emitter, table_key, value_type, value,
+		    table_note_key, table_note_value_type, table_note_value);
+	}
+	emitter->item_at_depth = true;
+}
+
+static inline void
+emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
+	    emitter_type_bool, NULL);
+}
+
+static inline void
+emitter_dict_begin(emitter_t *emitter, const char *json_key,
+    const char *table_header) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_key(emitter, json_key);
+		emitter_json_object_begin(emitter);
+	} else {
+		emitter_table_dict_begin(emitter, table_header);
+	}
+}
+
+static inline void
+emitter_dict_end(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		emitter_json_object_end(emitter);
+	} else {
+		emitter_table_dict_end(emitter);
+	}
+}
+
+static inline void
+emitter_begin(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		assert(emitter->nesting_depth == 0);
+		emitter_printf(emitter, "{");
+		emitter_nest_inc(emitter);
+	} else {
+		/*
+		 * This guarantees that we always call write_cb at least once.
+		 * This is useful if some invariant is established by each call
+		 * to write_cb, but doesn't hold initially: e.g., some buffer
+		 * holds a null-terminated string.
+		 */
+		emitter_printf(emitter, "%s", "");
+	}
+}
+
+static inline void
+emitter_end(emitter_t *emitter) {
+	if (emitter->output == emitter_output_json) {
+		assert(emitter->nesting_depth == 1);
+		emitter_nest_dec(emitter);
+		emitter_printf(emitter, "\n}\n");
+	}
+}
+
 #endif /* JEMALLOC_INTERNAL_EMITTER_H */
diff --git a/include/jemalloc/internal/extent_externs.h b/include/jemalloc/internal/extent_externs.h
index b8a4d026..8680251a 100644
--- a/include/jemalloc/internal/extent_externs.h
+++ b/include/jemalloc/internal/extent_externs.h
@@ -31,6 +31,10 @@ bool extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state,
     bool delay_coalesce);
 extent_state_t extents_state_get(const extents_t *extents);
 size_t extents_npages_get(extents_t *extents);
+/* Get the number of extents in the given page size index. */
+size_t extents_nextents_get(extents_t *extents, pszind_t ind);
+/* Get the sum total bytes of the extents in the given page size index. */
+size_t extents_nbytes_get(extents_t *extents, pszind_t ind);
 extent_t *extents_alloc(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extents_t *extents, void *new_addr,
     size_t size, size_t pad, size_t alignment, bool slab, szind_t szind,
diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h
index 77181df8..63b710dc 100644
--- a/include/jemalloc/internal/extent_inlines.h
+++ b/include/jemalloc/internal/extent_inlines.h
@@ -6,6 +6,7 @@
 #include "jemalloc/internal/pages.h"
 #include "jemalloc/internal/prng.h"
 #include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/sz.h"
 
 static inline void
@@ -34,18 +35,19 @@ extent_unlock2(tsdn_t *tsdn, extent_t *extent1, extent_t *extent2) {
 	    (uintptr_t)extent2);
 }
 
-static inline arena_t *
-extent_arena_get(const extent_t *extent) {
+static inline unsigned
+extent_arena_ind_get(const extent_t *extent) {
 	unsigned arena_ind = (unsigned)((extent->e_bits &
 	    EXTENT_BITS_ARENA_MASK) >> EXTENT_BITS_ARENA_SHIFT);
-	/*
-	 * The following check is omitted because we should never actually read
-	 * a NULL arena pointer.
-	 */
-	if (false && arena_ind >= MALLOCX_ARENA_LIMIT) {
-		return NULL;
-	}
 	assert(arena_ind < MALLOCX_ARENA_LIMIT);
+
+	return arena_ind;
+}
+
+static inline arena_t *
+extent_arena_get(const extent_t *extent) {
+	unsigned arena_ind = extent_arena_ind_get(extent);
+
 	return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_ACQUIRE);
 }
 
@@ -53,14 +55,14 @@ static inline szind_t
 extent_szind_get_maybe_invalid(const extent_t *extent) {
 	szind_t szind = (szind_t)((extent->e_bits & EXTENT_BITS_SZIND_MASK) >>
 	    EXTENT_BITS_SZIND_SHIFT);
-	assert(szind <= NSIZES);
+	assert(szind <= SC_NSIZES);
 	return szind;
 }
 
 static inline szind_t
 extent_szind_get(const extent_t *extent) {
 	szind_t szind = extent_szind_get_maybe_invalid(extent);
-	assert(szind < NSIZES); /* Never call when "invalid". */
+	assert(szind < SC_NSIZES); /* Never call when "invalid". */
 	return szind;
 }
 
@@ -69,6 +71,14 @@ extent_usize_get(const extent_t *extent) {
 	return sz_index2size(extent_szind_get(extent));
 }
 
+static inline unsigned
+extent_binshard_get(const extent_t *extent) {
+	unsigned binshard = (unsigned)((extent->e_bits &
+	    EXTENT_BITS_BINSHARD_MASK) >> EXTENT_BITS_BINSHARD_SHIFT);
+	assert(binshard < bin_infos[extent_szind_get(extent)].n_shards);
+	return binshard;
+}
+
 static inline size_t
 extent_sn_get(const extent_t *extent) {
 	return (size_t)((extent->e_bits & EXTENT_BITS_SN_MASK) >>
@@ -176,6 +186,11 @@ extent_prof_tctx_get(const extent_t *extent) {
 	    ATOMIC_ACQUIRE);
 }
 
+static inline nstime_t
+extent_prof_alloc_time_get(const extent_t *extent) {
+	return extent->e_alloc_time;
+}
+
 static inline void
 extent_arena_set(extent_t *extent, arena_t *arena) {
 	unsigned arena_ind = (arena != NULL) ? arena_ind_get(arena) : ((1U <<
@@ -184,13 +199,21 @@ extent_arena_set(extent_t *extent, arena_t *arena) {
 	    ((uint64_t)arena_ind << EXTENT_BITS_ARENA_SHIFT);
 }
 
+static inline void
+extent_binshard_set(extent_t *extent, unsigned binshard) {
+	/* The assertion assumes szind is set already. */
+	assert(binshard < bin_infos[extent_szind_get(extent)].n_shards);
+	extent->e_bits = (extent->e_bits & ~EXTENT_BITS_BINSHARD_MASK) |
+	    ((uint64_t)binshard << EXTENT_BITS_BINSHARD_SHIFT);
+}
+
 static inline void
 extent_addr_set(extent_t *extent, void *addr) {
 	extent->e_addr = addr;
 }
 
 static inline void
-extent_addr_randomize(UNUSED tsdn_t *tsdn, extent_t *extent, size_t alignment) {
+extent_addr_randomize(tsdn_t *tsdn, extent_t *extent, size_t alignment) {
 	assert(extent_base_get(extent) == extent_addr_get(extent));
 
 	if (alignment < PAGE) {
@@ -234,7 +257,7 @@ extent_bsize_set(extent_t *extent, size_t bsize) {
 
 static inline void
 extent_szind_set(extent_t *extent, szind_t szind) {
-	assert(szind <= NSIZES); /* NSIZES means "invalid". */
+	assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */
 	extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SZIND_MASK) |
 	    ((uint64_t)szind << EXTENT_BITS_SZIND_SHIFT);
 }
@@ -246,6 +269,16 @@ extent_nfree_set(extent_t *extent, unsigned nfree) {
 	    ((uint64_t)nfree << EXTENT_BITS_NFREE_SHIFT);
 }
 
+static inline void
+extent_nfree_binshard_set(extent_t *extent, unsigned nfree, unsigned binshard) {
+	/* The assertion assumes szind is set already. */
+	assert(binshard < bin_infos[extent_szind_get(extent)].n_shards);
+	extent->e_bits = (extent->e_bits &
+	    (~EXTENT_BITS_NFREE_MASK & ~EXTENT_BITS_BINSHARD_MASK)) |
+	    ((uint64_t)binshard << EXTENT_BITS_BINSHARD_SHIFT) |
+	    ((uint64_t)nfree << EXTENT_BITS_NFREE_SHIFT);
+}
+
 static inline void
 extent_nfree_inc(extent_t *extent) {
 	assert(extent_slab_get(extent));
@@ -258,6 +291,12 @@ extent_nfree_dec(extent_t *extent) {
 	extent->e_bits -= ((uint64_t)1U << EXTENT_BITS_NFREE_SHIFT);
 }
 
+static inline void
+extent_nfree_sub(extent_t *extent, uint64_t n) {
+	assert(extent_slab_get(extent));
+	extent->e_bits -= (n << EXTENT_BITS_NFREE_SHIFT);
+}
+
 static inline void
 extent_sn_set(extent_t *extent, size_t sn) {
 	extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SN_MASK) |
@@ -299,6 +338,11 @@ extent_prof_tctx_set(extent_t *extent, prof_tctx_t *tctx) {
 	atomic_store_p(&extent->e_prof_tctx, tctx, ATOMIC_RELEASE);
 }
 
+static inline void
+extent_prof_alloc_time_set(extent_t *extent, nstime_t t) {
+	nstime_copy(&extent->e_alloc_time, &t);
+}
+
 static inline void
 extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size,
     bool slab, szind_t szind, size_t sn, extent_state_t state, bool zeroed,
@@ -327,7 +371,7 @@ extent_binit(extent_t *extent, void *addr, size_t bsize, size_t sn) {
 	extent_addr_set(extent, addr);
 	extent_bsize_set(extent, bsize);
 	extent_slab_set(extent, false);
-	extent_szind_set(extent, NSIZES);
+	extent_szind_set(extent, SC_NSIZES);
 	extent_sn_set(extent, sn);
 	extent_state_set(extent, extent_state_active);
 	extent_zeroed_set(extent, true);
diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h
index 4873b9e9..ceb18979 100644
--- a/include/jemalloc/internal/extent_structs.h
+++ b/include/jemalloc/internal/extent_structs.h
@@ -2,11 +2,12 @@
 #define JEMALLOC_INTERNAL_EXTENT_STRUCTS_H
 
 #include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bit_util.h"
 #include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/ql.h"
 #include "jemalloc/internal/ph.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 
 typedef enum {
 	extent_state_active   = 0,
@@ -28,9 +29,10 @@ struct extent_s {
 	 * t: state
 	 * i: szind
 	 * f: nfree
+	 * s: bin_shard
 	 * n: sn
 	 *
-	 * nnnnnnnn ... nnnnffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa
+	 * nnnnnnnn ... nnnnnnss ssssffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa
 	 *
 	 * arena_ind: Arena from which this extent came, or all 1 bits if
 	 *            unassociated.
@@ -75,6 +77,8 @@ struct extent_s {
 	 *
 	 * nfree: Number of free regions in slab.
 	 *
+	 * bin_shard: the shard of the bin from which this extent came.
+	 *
 	 * sn: Serial number (potentially non-unique).
 	 *
 	 *     Serial numbers may wrap around if !opt_retain, but as long as
@@ -112,7 +116,7 @@ struct extent_s {
 #define EXTENT_BITS_STATE_SHIFT  (EXTENT_BITS_ZEROED_WIDTH + EXTENT_BITS_ZEROED_SHIFT)
 #define EXTENT_BITS_STATE_MASK  MASK(EXTENT_BITS_STATE_WIDTH, EXTENT_BITS_STATE_SHIFT)
 
-#define EXTENT_BITS_SZIND_WIDTH  LG_CEIL_NSIZES
+#define EXTENT_BITS_SZIND_WIDTH  LG_CEIL(SC_NSIZES)
 #define EXTENT_BITS_SZIND_SHIFT  (EXTENT_BITS_STATE_WIDTH + EXTENT_BITS_STATE_SHIFT)
 #define EXTENT_BITS_SZIND_MASK  MASK(EXTENT_BITS_SZIND_WIDTH, EXTENT_BITS_SZIND_SHIFT)
 
@@ -120,7 +124,11 @@ struct extent_s {
 #define EXTENT_BITS_NFREE_SHIFT  (EXTENT_BITS_SZIND_WIDTH + EXTENT_BITS_SZIND_SHIFT)
 #define EXTENT_BITS_NFREE_MASK  MASK(EXTENT_BITS_NFREE_WIDTH, EXTENT_BITS_NFREE_SHIFT)
 
-#define EXTENT_BITS_SN_SHIFT  (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT)
+#define EXTENT_BITS_BINSHARD_WIDTH  6
+#define EXTENT_BITS_BINSHARD_SHIFT  (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT)
+#define EXTENT_BITS_BINSHARD_MASK  MASK(EXTENT_BITS_BINSHARD_WIDTH, EXTENT_BITS_BINSHARD_SHIFT)
+
+#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_BINSHARD_WIDTH + EXTENT_BITS_BINSHARD_SHIFT)
 #define EXTENT_BITS_SN_MASK  (UINT64_MAX << EXTENT_BITS_SN_SHIFT)
 
 	/* Pointer to the extent that this structure is responsible for. */
@@ -160,11 +168,13 @@ struct extent_s {
 		/* Small region slab metadata. */
 		arena_slab_data_t	e_slab_data;
 
-		/*
-		 * Profile counters, used for large objects.  Points to a
-		 * prof_tctx_t.
-		 */
-		atomic_p_t		e_prof_tctx;
+		/* Profiling data, used for large objects. */
+		struct {
+			/* Time when this was allocated. */
+			nstime_t		e_alloc_time;
+			/* Points to a prof_tctx_t. */
+			atomic_p_t		e_prof_tctx;
+		};
 	};
 };
 typedef ql_head(extent_t) extent_list_t;
@@ -180,14 +190,16 @@ struct extents_s {
 	 *
 	 * Synchronization: mtx.
 	 */
-	extent_heap_t		heaps[NPSIZES+1];
+	extent_heap_t		heaps[SC_NPSIZES + 1];
+	atomic_zu_t		nextents[SC_NPSIZES + 1];
+	atomic_zu_t		nbytes[SC_NPSIZES + 1];
 
 	/*
 	 * Bitmap for which set bits correspond to non-empty heaps.
 	 *
 	 * Synchronization: mtx.
 	 */
-	bitmap_t		bitmap[BITMAP_GROUPS(NPSIZES+1)];
+	bitmap_t		bitmap[BITMAP_GROUPS(SC_NPSIZES + 1)];
 
 	/*
 	 * LRU of all extents in heaps.
diff --git a/include/jemalloc/internal/extent_types.h b/include/jemalloc/internal/extent_types.h
index c0561d99..acbcf27b 100644
--- a/include/jemalloc/internal/extent_types.h
+++ b/include/jemalloc/internal/extent_types.h
@@ -6,8 +6,6 @@ typedef struct extents_s extents_t;
 
 #define EXTENT_HOOKS_INITIALIZER	NULL
 
-#define EXTENT_GROW_MAX_PIND (NPSIZES - 1)
-
 /*
  * When reuse (and split) an active extent, (1U << opt_lg_extent_max_active_fit)
  * is the max ratio between the size of the active extent and the new extent.
diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h
index dcfc992d..0270034e 100644
--- a/include/jemalloc/internal/hash.h
+++ b/include/jemalloc/internal/hash.h
@@ -104,8 +104,8 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
 		uint32_t k1 = 0;
 
 		switch (len & 3) {
-		case 3: k1 ^= tail[2] << 16;
-		case 2: k1 ^= tail[1] << 8;
+		case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH
+		case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH
 		case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15);
 			k1 *= c2; h1 ^= k1;
 		}
@@ -119,7 +119,7 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
 	return h1;
 }
 
-UNUSED static inline void
+static inline void
 hash_x86_128(const void *key, const int len, uint32_t seed,
     uint64_t r_out[2]) {
 	const uint8_t * data = (const uint8_t *) key;
@@ -177,28 +177,29 @@ hash_x86_128(const void *key, const int len, uint32_t seed,
 		uint32_t k4 = 0;
 
 		switch (len & 15) {
-		case 15: k4 ^= tail[14] << 16;
-		case 14: k4 ^= tail[13] << 8;
+		case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH
+		case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH
 		case 13: k4 ^= tail[12] << 0;
 			k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
-
-		case 12: k3 ^= tail[11] << 24;
-		case 11: k3 ^= tail[10] << 16;
-		case 10: k3 ^= tail[ 9] << 8;
+      JEMALLOC_FALLTHROUGH
+		case 12: k3 ^= tail[11] << 24; JEMALLOC_FALLTHROUGH
+		case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH
+		case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH
 		case  9: k3 ^= tail[ 8] << 0;
 		     k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
-
-		case  8: k2 ^= tail[ 7] << 24;
-		case  7: k2 ^= tail[ 6] << 16;
-		case  6: k2 ^= tail[ 5] << 8;
+         JEMALLOC_FALLTHROUGH
+		case  8: k2 ^= tail[ 7] << 24; JEMALLOC_FALLTHROUGH
+		case  7: k2 ^= tail[ 6] << 16; JEMALLOC_FALLTHROUGH
+		case  6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH
 		case  5: k2 ^= tail[ 4] << 0;
 			k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
-
-		case  4: k1 ^= tail[ 3] << 24;
-		case  3: k1 ^= tail[ 2] << 16;
-		case  2: k1 ^= tail[ 1] << 8;
+      JEMALLOC_FALLTHROUGH
+		case  4: k1 ^= tail[ 3] << 24; JEMALLOC_FALLTHROUGH
+		case  3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH
+		case  2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH
 		case  1: k1 ^= tail[ 0] << 0;
 			k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
+			break; /* Final case; nothing to fall through to. */
 		}
 	}
 
@@ -220,7 +221,7 @@ hash_x86_128(const void *key, const int len, uint32_t seed,
 	r_out[1] = (((uint64_t) h4) << 32) | h3;
 }
 
-UNUSED static inline void
+static inline void
 hash_x64_128(const void *key, const int len, const uint32_t seed,
     uint64_t r_out[2]) {
 	const uint8_t *data = (const uint8_t *) key;
@@ -260,22 +261,22 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
 		uint64_t k2 = 0;
 
 		switch (len & 15) {
-		case 15: k2 ^= ((uint64_t)(tail[14])) << 48; /* falls through */
-		case 14: k2 ^= ((uint64_t)(tail[13])) << 40; /* falls through */
-		case 13: k2 ^= ((uint64_t)(tail[12])) << 32; /* falls through */
-		case 12: k2 ^= ((uint64_t)(tail[11])) << 24; /* falls through */
-		case 11: k2 ^= ((uint64_t)(tail[10])) << 16; /* falls through */
-		case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8;  /* falls through */
+		case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH
+		case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH
+		case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH
+		case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH
+		case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH
+		case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8;  JEMALLOC_FALLTHROUGH
 		case  9: k2 ^= ((uint64_t)(tail[ 8])) << 0;
 			k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
-			/* falls through */
-		case  8: k1 ^= ((uint64_t)(tail[ 7])) << 56; /* falls through */
-		case  7: k1 ^= ((uint64_t)(tail[ 6])) << 48; /* falls through */
-		case  6: k1 ^= ((uint64_t)(tail[ 5])) << 40; /* falls through */
-		case  5: k1 ^= ((uint64_t)(tail[ 4])) << 32; /* falls through */
-		case  4: k1 ^= ((uint64_t)(tail[ 3])) << 24; /* falls through */
-		case  3: k1 ^= ((uint64_t)(tail[ 2])) << 16; /* falls through */
-		case  2: k1 ^= ((uint64_t)(tail[ 1])) << 8;  /* falls through */
+			JEMALLOC_FALLTHROUGH
+		case  8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH
+		case  7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH
+		case  6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH
+		case  5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH
+		case  4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH
+		case  3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH
+		case  2: k1 ^= ((uint64_t)(tail[ 1])) << 8;  JEMALLOC_FALLTHROUGH
 		case  1: k1 ^= ((uint64_t)(tail[ 0])) << 0;
 			k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
 		}
diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h
new file mode 100644
index 00000000..ee246b1e
--- /dev/null
+++ b/include/jemalloc/internal/hook.h
@@ -0,0 +1,163 @@
+#ifndef JEMALLOC_INTERNAL_HOOK_H
+#define JEMALLOC_INTERNAL_HOOK_H
+
+#include "jemalloc/internal/tsd.h"
+
+/*
+ * This API is *extremely* experimental, and may get ripped out, changed in API-
+ * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc.
+ *
+ * It allows hooking the stateful parts of the API to see changes as they
+ * happen.
+ *
+ * Allocation hooks are called after the allocation is done, free hooks are
+ * called before the free is done, and expand hooks are called after the
+ * allocation is expanded.
+ *
+ * For realloc and rallocx, if the expansion happens in place, the expansion
+ * hook is called.  If it is moved, then the alloc hook is called on the new
+ * location, and then the free hook is called on the old location (i.e. both
+ * hooks are invoked in between the alloc and the dalloc).
+ *
+ * If we return NULL from OOM, then usize might not be trustworthy.  Calling
+ * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0)
+ * only calls the free hook.  (Calling realloc(NULL, 0) is treated as malloc(0),
+ * and only calls the alloc hook).
+ *
+ * Reentrancy:
+ *   Reentrancy is guarded against from within the hook implementation.  If you
+ *   call allocator functions from within a hook, the hooks will not be invoked
+ *   again.
+ * Threading:
+ *   The installation of a hook synchronizes with all its uses.  If you can
+ *   prove the installation of a hook happens-before a jemalloc entry point,
+ *   then the hook will get invoked (unless there's a racing removal).
+ *
+ *   Hook insertion appears to be atomic at a per-thread level (i.e. if a thread
+ *   allocates and has the alloc hook invoked, then a subsequent free on the
+ *   same thread will also have the free hook invoked).
+ *
+ *   The *removal* of a hook does *not* block until all threads are done with
+ *   the hook.  Hook authors have to be resilient to this, and need some
+ *   out-of-band mechanism for cleaning up any dynamically allocated memory
+ *   associated with their hook.
+ * Ordering:
+ *   Order of hook execution is unspecified, and may be different than insertion
+ *   order.
+ */
+
+#define HOOK_MAX 4
+
+enum hook_alloc_e {
+	hook_alloc_malloc,
+	hook_alloc_posix_memalign,
+	hook_alloc_aligned_alloc,
+	hook_alloc_calloc,
+	hook_alloc_memalign,
+	hook_alloc_valloc,
+	hook_alloc_mallocx,
+
+	/* The reallocating functions have both alloc and dalloc variants */
+	hook_alloc_realloc,
+	hook_alloc_rallocx,
+};
+/*
+ * We put the enum typedef after the enum, since this file may get included by
+ * jemalloc_cpp.cpp, and C++ disallows enum forward declarations.
+ */
+typedef enum hook_alloc_e hook_alloc_t;
+
+enum hook_dalloc_e {
+	hook_dalloc_free,
+	hook_dalloc_dallocx,
+	hook_dalloc_sdallocx,
+
+	/*
+	 * The dalloc halves of reallocation (not called if in-place expansion
+	 * happens).
+	 */
+	hook_dalloc_realloc,
+	hook_dalloc_rallocx,
+};
+typedef enum hook_dalloc_e hook_dalloc_t;
+
+
+enum hook_expand_e {
+	hook_expand_realloc,
+	hook_expand_rallocx,
+	hook_expand_xallocx,
+};
+typedef enum hook_expand_e hook_expand_t;
+
+typedef void (*hook_alloc)(
+    void *extra, hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]);
+
+typedef void (*hook_dalloc)(
+    void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
+
+typedef void (*hook_expand)(
+    void *extra, hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+typedef struct hooks_s hooks_t;
+struct hooks_s {
+	hook_alloc alloc_hook;
+	hook_dalloc dalloc_hook;
+	hook_expand expand_hook;
+	void *extra;
+};
+
+/*
+ * Begin implementation details; everything above this point might one day live
+ * in a public API.  Everything below this point never will.
+ */
+
+/*
+ * The realloc pathways haven't gotten any refactoring love in a while, and it's
+ * fairly difficult to pass information from the entry point to the hooks.  We
+ * put the information the hooks will need into a struct to encapsulate
+ * everything.
+ *
+ * Much of these pathways are force-inlined, so that the compiler can avoid
+ * materializing this struct until we hit an extern arena function.  For fairly
+ * goofy reasons, *many* of the realloc paths hit an extern arena function.
+ * These paths are cold enough that it doesn't matter; eventually, we should
+ * rewrite the realloc code to make the expand-in-place and the
+ * free-then-realloc paths more orthogonal, at which point we don't need to
+ * spread the hook logic all over the place.
+ */
+typedef struct hook_ralloc_args_s hook_ralloc_args_t;
+struct hook_ralloc_args_s {
+	/* I.e. as opposed to rallocx. */
+	bool is_realloc;
+	/*
+	 * The expand hook takes 4 arguments, even if only 3 are actually used;
+	 * we add an extra one in case the user decides to memcpy without
+	 * looking too closely at the hooked function.
+	 */
+	uintptr_t args[4];
+};
+
+bool hook_boot(void);
+
+/*
+ * Returns an opaque handle to be used when removing the hook.  NULL means that
+ * we couldn't install the hook.
+ */
+void *hook_install(tsdn_t *tsdn, hooks_t *hooks);
+/* Uninstalls the hook with the handle previously returned from hook_install. */
+void hook_remove(tsdn_t *tsdn, void *opaque);
+
+/* Hooks */
+
+void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]);
+
+void hook_invoke_dalloc(hook_dalloc_t type, void *address,
+    uintptr_t args_raw[3]);
+
+void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+#endif /* JEMALLOC_INTERNAL_HOOK_H */
diff --git a/include/jemalloc/internal/hooks.h b/include/jemalloc/internal/hooks.h
deleted file mode 100644
index cd49afcb..00000000
--- a/include/jemalloc/internal/hooks.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_HOOKS_H
-#define JEMALLOC_INTERNAL_HOOKS_H
-
-extern JEMALLOC_EXPORT void (*hooks_arena_new_hook)();
-extern JEMALLOC_EXPORT void (*hooks_libc_hook)();
-
-#define JEMALLOC_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn)
-
-#define open JEMALLOC_HOOK(open, hooks_libc_hook)
-#define read JEMALLOC_HOOK(read, hooks_libc_hook)
-#define write JEMALLOC_HOOK(write, hooks_libc_hook)
-#define readlink JEMALLOC_HOOK(readlink, hooks_libc_hook)
-#define close JEMALLOC_HOOK(close, hooks_libc_hook)
-#define creat JEMALLOC_HOOK(creat, hooks_libc_hook)
-#define secure_getenv JEMALLOC_HOOK(secure_getenv, hooks_libc_hook)
-/* Note that this is undef'd and re-define'd in src/prof.c. */
-#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook)
-
-#endif /* JEMALLOC_INTERNAL_HOOKS_H */
diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h
index be70df51..7d6053e2 100644
--- a/include/jemalloc/internal/jemalloc_internal_decls.h
+++ b/include/jemalloc/internal/jemalloc_internal_decls.h
@@ -31,6 +31,9 @@
 #    include <sys/uio.h>
 #  endif
 #  include <pthread.h>
+#  ifdef __FreeBSD__
+#  include <pthread_np.h>
+#  endif
 #  include <signal.h>
 #  ifdef JEMALLOC_OS_UNFAIR_LOCK
 #    include <os/lock.h>
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index 8dad9a1d..21b65147 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -48,25 +48,13 @@
 
 /* Defined if GCC __atomic atomics are available. */
 #undef JEMALLOC_GCC_ATOMIC_ATOMICS
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS
 
 /* Defined if GCC __sync atomics are available. */
 #undef JEMALLOC_GCC_SYNC_ATOMICS
-
-/*
- * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
- * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite
- * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the
- * functions are defined in libgcc instead of being inlines).
- */
-#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4
-
-/*
- * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and
- * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite
- * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the
- * functions are defined in libgcc instead of being inlines).
- */
-#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_SYNC_ATOMICS
 
 /*
  * Defined if __builtin_clz() and __builtin_clzl() are available.
@@ -78,12 +66,6 @@
  */
 #undef JEMALLOC_OS_UNFAIR_LOCK
 
-/*
- * Defined if OSSpin*() functions are available, as provided by Darwin, and
- * documented in the spinlock(3) manual page.
- */
-#undef JEMALLOC_OSSPIN
-
 /* Defined if syscall(2) is usable. */
 #undef JEMALLOC_USE_SYSCALL
 
@@ -153,6 +135,9 @@
 /* JEMALLOC_STATS enables statistics calculation. */
 #undef JEMALLOC_STATS
 
+/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
+#undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
+
 /* JEMALLOC_PROF enables allocation profiling. */
 #undef JEMALLOC_PROF
 
@@ -233,6 +218,12 @@
 #undef JEMALLOC_INTERNAL_FFSL
 #undef JEMALLOC_INTERNAL_FFS
 
+/*
+ * popcount*() functions to use for bitmapping.
+ */
+#undef JEMALLOC_INTERNAL_POPCOUNTL
+#undef JEMALLOC_INTERNAL_POPCOUNT
+
 /*
  * If defined, explicitly attempt to more uniformly distribute large allocation
  * pointer alignments across all cache indices.
@@ -245,6 +236,12 @@
  */
 #undef JEMALLOC_LOG
 
+/*
+ * If defined, use readlinkat() (instead of readlink()) to follow
+ * /etc/malloc_conf.
+ */
+#undef JEMALLOC_READLINKAT
+
 /*
  * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
  */
@@ -363,4 +360,7 @@
  */
 #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE
 
+/* Performs additional size-matching sanity checks when defined. */
+#undef JEMALLOC_EXTRA_SIZE_CHECK
+
 #endif /* JEMALLOC_INTERNAL_DEFS_H_ */
diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h
index e10fb275..b7843623 100644
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -2,7 +2,6 @@
 #define JEMALLOC_INTERNAL_EXTERNS_H
 
 #include "jemalloc/internal/atomic.h"
-#include "jemalloc/internal/size_classes.h"
 #include "jemalloc/internal/tsd_types.h"
 
 /* TSD checks this to set thread local slow state accordingly. */
@@ -25,6 +24,9 @@ extern unsigned ncpus;
 /* Number of arenas used for automatic multiplexing of threads and arenas. */
 extern unsigned narenas_auto;
 
+/* Base index for manual arenas. */
+extern unsigned manual_arena_base;
+
 /*
  * Arenas that are used to service external requests.  Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
index c6a1f7eb..ddde9b4e 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
@@ -4,13 +4,15 @@
 #include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/bit_util.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/ticker.h"
 
 JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
 malloc_getcpu(void) {
 	assert(have_percpu_arena);
-#if defined(JEMALLOC_HAVE_SCHED_GETCPU)
+#if defined(_WIN32)
+	return GetCurrentProcessorNumber();
+#elif defined(JEMALLOC_HAVE_SCHED_GETCPU)
 	return (malloc_cpuid_t)sched_getcpu();
 #else
 	not_reached();
@@ -108,14 +110,14 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
 
 JEMALLOC_ALWAYS_INLINE cache_bin_t *
 tcache_small_bin_get(tcache_t *tcache, szind_t binind) {
-	assert(binind < NBINS);
+	assert(binind < SC_NBINS);
 	return &tcache->bins_small[binind];
 }
 
 JEMALLOC_ALWAYS_INLINE cache_bin_t *
 tcache_large_bin_get(tcache_t *tcache, szind_t binind) {
-	assert(binind >= NBINS &&binind < nhbins);
-	return &tcache->bins_large[binind - NBINS];
+	assert(binind >= SC_NBINS &&binind < nhbins);
+	return &tcache->bins_large[binind - SC_NBINS];
 }
 
 JEMALLOC_ALWAYS_INLINE bool
@@ -156,7 +158,7 @@ pre_reentrancy(tsd_t *tsd, arena_t *arena) {
 	if (fast) {
 		/* Prepare slow path for reentrancy. */
 		tsd_slow_update(tsd);
-		assert(tsd->state == tsd_state_nominal_slow);
+		assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
 	}
 }
 
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_b.h b/include/jemalloc/internal/jemalloc_internal_inlines_b.h
index 2e76e5d8..70d6e578 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_b.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_b.h
@@ -71,7 +71,8 @@ arena_ichoose(tsd_t *tsd, arena_t *arena) {
 static inline bool
 arena_is_auto(arena_t *arena) {
 	assert(narenas_auto > 0);
-	return (arena_ind_get(arena) < narenas_auto);
+
+	return (arena_ind_get(arena) < manual_arena_base);
 }
 
 JEMALLOC_ALWAYS_INLINE extent_t *
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
index c829ac60..cdb10eb2 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@@ -1,6 +1,7 @@
 #ifndef JEMALLOC_INTERNAL_INLINES_C_H
 #define JEMALLOC_INTERNAL_INLINES_C_H
 
+#include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
 #include "jemalloc/internal/sz.h"
 #include "jemalloc/internal/witness.h"
@@ -42,7 +43,6 @@ iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
     bool is_internal, arena_t *arena, bool slow_path) {
 	void *ret;
 
-	assert(size != 0);
 	assert(!is_internal || tcache == NULL);
 	assert(!is_internal || arena == NULL || arena_is_auto(arena));
 	if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
@@ -133,31 +133,20 @@ isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
 
 JEMALLOC_ALWAYS_INLINE void *
 iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
-    size_t extra, size_t alignment, bool zero, tcache_t *tcache,
-    arena_t *arena) {
+    size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
+    hook_ralloc_args_t *hook_args) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 	void *p;
 	size_t usize, copysize;
 
-	usize = sz_sa2u(size + extra, alignment);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	usize = sz_sa2u(size, alignment);
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 	p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
 	if (p == NULL) {
-		if (extra == 0) {
-			return NULL;
-		}
-		/* Try again, without extra this time. */
-		usize = sz_sa2u(size, alignment);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
-			return NULL;
-		}
-		p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
-		if (p == NULL) {
-			return NULL;
-		}
+		return NULL;
 	}
 	/*
 	 * Copy at most size bytes (not size+extra), since the caller has no
@@ -165,13 +154,26 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
 	 */
 	copysize = (size < oldsize) ? size : oldsize;
 	memcpy(p, ptr, copysize);
+	hook_invoke_alloc(hook_args->is_realloc
+	    ? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p,
+	    hook_args->args);
+	hook_invoke_dalloc(hook_args->is_realloc
+	    ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
 	isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
 	return p;
 }
 
+/*
+ * hook_args->is_realloc threads through the knowledge of whether or not this
+ * call comes from je_realloc (as opposed to je_rallocx); this ensures that we
+ * pass the correct entry point into any hooks.
+ * Note that these functions are all force-inlined, so no actual bool gets
+ * passed around anywhere.
+ */
 JEMALLOC_ALWAYS_INLINE void *
 iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
-    bool zero, tcache_t *tcache, arena_t *arena) {
+    bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args)
+{
 	assert(ptr != NULL);
 	assert(size != 0);
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
@@ -183,24 +185,24 @@ iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
 		 * Existing object alignment is inadequate; allocate new space
 		 * and copy.
 		 */
-		return iralloct_realign(tsdn, ptr, oldsize, size, 0, alignment,
-		    zero, tcache, arena);
+		return iralloct_realign(tsdn, ptr, oldsize, size, alignment,
+		    zero, tcache, arena, hook_args);
 	}
 
 	return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero,
-	    tcache);
+	    tcache, hook_args);
 }
 
 JEMALLOC_ALWAYS_INLINE void *
 iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment,
-    bool zero) {
+    bool zero, hook_ralloc_args_t *hook_args) {
 	return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero,
-	    tcache_get(tsd), NULL);
+	    tcache_get(tsd), NULL, hook_args);
 }
 
 JEMALLOC_ALWAYS_INLINE bool
 ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero) {
+    size_t alignment, bool zero, size_t *newsize) {
 	assert(ptr != NULL);
 	assert(size != 0);
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
@@ -209,10 +211,12 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
 	if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
 	    != 0) {
 		/* Existing object alignment is inadequate. */
+		*newsize = oldsize;
 		return true;
 	}
 
-	return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero);
+	return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero,
+	    newsize);
 }
 
 #endif /* JEMALLOC_INTERNAL_INLINES_C_H */
diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h
index ed75d376..d8ea06f6 100644
--- a/include/jemalloc/internal/jemalloc_internal_macros.h
+++ b/include/jemalloc/internal/jemalloc_internal_macros.h
@@ -30,7 +30,7 @@
 #  define restrict
 #endif
 
-/* Various function pointers are statick and immutable except during testing. */
+/* Various function pointers are static and immutable except during testing. */
 #ifdef JEMALLOC_JET
 #  define JET_MUTABLE
 #else
@@ -40,4 +40,75 @@
 #define JEMALLOC_VA_ARGS_HEAD(head, ...) head
 #define JEMALLOC_VA_ARGS_TAIL(head, ...) __VA_ARGS__
 
+#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__) \
+  && defined(JEMALLOC_HAVE_ATTR) && (__GNUC__ >= 7)
+#define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough);
+#else
+#define JEMALLOC_FALLTHROUGH /* falls through */
+#endif
+
+/* Diagnostic suppression macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#  define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push))
+#  define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
+#  define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W))
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#  define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+/* #pragma GCC diagnostic first appeared in gcc 4.6. */
+#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \
+  (__GNUC_MINOR__ > 5)))) || defined(__clang__)
+/*
+ * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang
+ * diagnostic suppression macros and should not be used anywhere else.
+ */
+#  define JEMALLOC_PRAGMA__(X) _Pragma(#X)
+#  define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push)
+#  define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop)
+#  define JEMALLOC_DIAGNOSTIC_IGNORE(W) \
+     JEMALLOC_PRAGMA__(GCC diagnostic ignored W)
+
+/*
+ * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and
+ * all clang versions up to version 7 (currently trunk, unreleased).  This macro
+ * suppresses the warning for the affected compiler versions only.
+ */
+#  if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \
+     defined(__clang__)
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS  \
+          JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers")
+#  else
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#  endif
+
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS  \
+     JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \
+     JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter")
+#  if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \
+       JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=")
+#  else
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#  endif
+#  define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \
+  JEMALLOC_DIAGNOSTIC_PUSH \
+  JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
+#else
+#  define JEMALLOC_DIAGNOSTIC_PUSH
+#  define JEMALLOC_DIAGNOSTIC_POP
+#  define JEMALLOC_DIAGNOSTIC_IGNORE(W)
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#  define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+#endif
+
+/*
+ * Disables spurious diagnostics for all headers.  Since these headers are not
+ * included by users directly, it does not affect their diagnostic settings.
+ */
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
 #endif /* JEMALLOC_INTERNAL_MACROS_H */
diff --git a/include/jemalloc/internal/jemalloc_internal_types.h b/include/jemalloc/internal/jemalloc_internal_types.h
index 1b750b12..e296c5a7 100644
--- a/include/jemalloc/internal/jemalloc_internal_types.h
+++ b/include/jemalloc/internal/jemalloc_internal_types.h
@@ -1,6 +1,8 @@
 #ifndef JEMALLOC_INTERNAL_TYPES_H
 #define JEMALLOC_INTERNAL_TYPES_H
 
+#include "jemalloc/internal/quantum.h"
+
 /* Page size index type. */
 typedef unsigned pszind_t;
 
@@ -50,79 +52,6 @@ typedef int malloc_cpuid_t;
 /* Smallest size class to support. */
 #define TINY_MIN		(1U << LG_TINY_MIN)
 
-/*
- * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
- * classes).
- */
-#ifndef LG_QUANTUM
-#  if (defined(__i386__) || defined(_M_IX86))
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __ia64__
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __alpha__
-#    define LG_QUANTUM		4
-#  endif
-#  if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__))
-#    define LG_QUANTUM		4
-#  endif
-#  if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64))
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __arm__
-#    define LG_QUANTUM		3
-#  endif
-#  ifdef __aarch64__
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __hppa__
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __m68k__
-#    define LG_QUANTUM		3
-#  endif
-#  ifdef __mips__
-#    define LG_QUANTUM		3
-#  endif
-#  ifdef __nios2__
-#    define LG_QUANTUM		3
-#  endif
-#  ifdef __or1k__
-#    define LG_QUANTUM		3
-#  endif
-#  ifdef __powerpc__
-#    define LG_QUANTUM		4
-#  endif
-#  if defined(__riscv) || defined(__riscv__)
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __s390__
-#    define LG_QUANTUM		4
-#  endif
-#  if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \
-	defined(__SH4_SINGLE_ONLY__))
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __tile__
-#    define LG_QUANTUM		4
-#  endif
-#  ifdef __le32__
-#    define LG_QUANTUM		4
-#  endif
-#  ifndef LG_QUANTUM
-#    error "Unknown minimum alignment for architecture; specify via "
-	 "--with-lg-quantum"
-#  endif
-#endif
-
-#define QUANTUM			((size_t)(1U << LG_QUANTUM))
-#define QUANTUM_MASK		(QUANTUM - 1)
-
-/* Return the smallest quantum multiple that is >= a. */
-#define QUANTUM_CEILING(a)						\
-	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
-
 #define LONG			((size_t)(1U << LG_SIZEOF_LONG))
 #define LONG_MASK		(LONG - 1)
 
diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in
index e621fbc8..4bfdb32c 100644
--- a/include/jemalloc/internal/jemalloc_preamble.h.in
+++ b/include/jemalloc/internal/jemalloc_preamble.h.in
@@ -21,7 +21,7 @@
 #  include "../jemalloc@install_suffix@.h"
 #endif
 
-#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
+#if defined(JEMALLOC_OSATOMIC)
 #include <libkern/OSAtomic.h>
 #endif
 
@@ -45,7 +45,7 @@
 #    include "jemalloc/internal/private_namespace_jet.h"
 #  endif
 #endif
-#include "jemalloc/internal/hooks.h"
+#include "jemalloc/internal/test_hooks.h"
 
 #ifdef JEMALLOC_DEFINE_MADVISE_FREE
 #  define JEMALLOC_MADV_FREE 8
@@ -161,7 +161,7 @@ static const bool config_log =
     false
 #endif
     ;
-#ifdef JEMALLOC_HAVE_SCHED_GETCPU
+#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
 /* Currently percpu_arena depends on sched_getcpu. */
 #define JEMALLOC_PERCPU_ARENA
 #endif
diff --git a/include/jemalloc/internal/large_externs.h b/include/jemalloc/internal/large_externs.h
index 3f36282c..a05019e8 100644
--- a/include/jemalloc/internal/large_externs.h
+++ b/include/jemalloc/internal/large_externs.h
@@ -1,13 +1,16 @@
 #ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H
 #define JEMALLOC_INTERNAL_LARGE_EXTERNS_H
 
+#include "jemalloc/internal/hook.h"
+
 void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero);
 void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
     bool zero);
 bool large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min,
     size_t usize_max, bool zero);
-void *large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize,
-    size_t alignment, bool zero, tcache_t *tcache);
+void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize,
+    size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args);
 
 typedef void (large_dalloc_junk_t)(void *, size_t);
 extern large_dalloc_junk_t *JET_MUTABLE large_dalloc_junk;
@@ -23,4 +26,7 @@ prof_tctx_t *large_prof_tctx_get(tsdn_t *tsdn, const extent_t *extent);
 void large_prof_tctx_set(tsdn_t *tsdn, extent_t *extent, prof_tctx_t *tctx);
 void large_prof_tctx_reset(tsdn_t *tsdn, extent_t *extent);
 
+nstime_t large_prof_alloc_time_get(const extent_t *extent);
+void large_prof_alloc_time_set(extent_t *extent, nstime_t time);
+
 #endif /* JEMALLOC_INTERNAL_LARGE_EXTERNS_H */
diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h
index 6520c251..7c24f072 100644
--- a/include/jemalloc/internal/mutex.h
+++ b/include/jemalloc/internal/mutex.h
@@ -37,14 +37,17 @@ struct malloc_mutex_s {
 #  endif
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 			os_unfair_lock		lock;
-#elif (defined(JEMALLOC_OSSPIN))
-			OSSpinLock		lock;
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 			pthread_mutex_t		lock;
 			malloc_mutex_t		*postponed_next;
 #else
 			pthread_mutex_t		lock;
 #endif
+			/*
+			 * Hint flag to avoid exclusive cache line contention
+			 * during spin waiting.
+			 */
+			atomic_b_t		locked;
 		};
 		/*
 		 * We only touch witness when configured w/ debug.  However we
@@ -84,10 +87,6 @@ struct malloc_mutex_s {
 #    define MALLOC_MUTEX_LOCK(m)    os_unfair_lock_lock(&(m)->lock)
 #    define MALLOC_MUTEX_UNLOCK(m)  os_unfair_lock_unlock(&(m)->lock)
 #    define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
-#elif (defined(JEMALLOC_OSSPIN))
-#    define MALLOC_MUTEX_LOCK(m)    OSSpinLockLock(&(m)->lock)
-#    define MALLOC_MUTEX_UNLOCK(m)  OSSpinLockUnlock(&(m)->lock)
-#    define MALLOC_MUTEX_TRYLOCK(m) (!OSSpinLockTry(&(m)->lock))
 #else
 #    define MALLOC_MUTEX_LOCK(m)    pthread_mutex_lock(&(m)->lock)
 #    define MALLOC_MUTEX_UNLOCK(m)  pthread_mutex_unlock(&(m)->lock)
@@ -101,22 +100,37 @@ struct malloc_mutex_s {
 #ifdef _WIN32
 #  define MALLOC_MUTEX_INITIALIZER
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-#  define MALLOC_MUTEX_INITIALIZER					\
-     {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}},		\
-      WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
-#elif (defined(JEMALLOC_OSSPIN))
-#  define MALLOC_MUTEX_INITIALIZER					\
-     {{{LOCK_PROF_DATA_INITIALIZER, 0}},				\
+#  if defined(JEMALLOC_DEBUG)
+#    define MALLOC_MUTEX_INITIALIZER					\
+  {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
+         WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+#  else
+#    define MALLOC_MUTEX_INITIALIZER                      \
+  {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}},  \
       WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  endif
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
-#  define MALLOC_MUTEX_INITIALIZER					\
-     {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}},	\
-      WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  if (defined(JEMALLOC_DEBUG))
+#     define MALLOC_MUTEX_INITIALIZER					\
+      {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}},	\
+           WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+#  else
+#     define MALLOC_MUTEX_INITIALIZER					\
+      {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}},	\
+           WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  endif
+
 #else
 #    define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
+#  if defined(JEMALLOC_DEBUG)
 #    define MALLOC_MUTEX_INITIALIZER					\
-       {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}},	\
-        WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+     {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
+           WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+#  else
+#    define MALLOC_MUTEX_INITIALIZER                          \
+     {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}},	\
+      WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  endif
 #endif
 
 #ifdef JEMALLOC_LAZY_LOCK
@@ -139,6 +153,7 @@ void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
 static inline void
 malloc_mutex_lock_final(malloc_mutex_t *mutex) {
 	MALLOC_MUTEX_LOCK(mutex);
+	atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
 }
 
 static inline bool
@@ -164,6 +179,7 @@ malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 	witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
 	if (isthreaded) {
 		if (malloc_mutex_trylock_final(mutex)) {
+			atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
 			return true;
 		}
 		mutex_owner_stats_update(tsdn, mutex);
@@ -203,6 +219,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 	if (isthreaded) {
 		if (malloc_mutex_trylock_final(mutex)) {
 			malloc_mutex_lock_slow(mutex);
+			atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
 		}
 		mutex_owner_stats_update(tsdn, mutex);
 	}
@@ -211,6 +228,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
 
 static inline void
 malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
 	witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
 	if (isthreaded) {
 		MALLOC_MUTEX_UNLOCK(mutex);
@@ -245,4 +263,26 @@ malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data,
 	atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED);
 }
 
+static inline void
+malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data,
+    malloc_mutex_t *mutex) {
+	mutex_prof_data_t *source = &mutex->prof_data;
+	/* Can only read holding the mutex. */
+	malloc_mutex_assert_owner(tsdn, mutex);
+	/* Fold source into data: sum the totals, keep the larger maxima. */
+	nstime_add(&data->tot_wait_time, &source->tot_wait_time);
+	if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) {
+		nstime_copy(&data->max_wait_time, &source->max_wait_time);
+	}
+	data->n_wait_times += source->n_wait_times;
+	data->n_spin_acquired += source->n_spin_acquired;
+	if (data->max_n_thds < source->max_n_thds) {
+		data->max_n_thds = source->max_n_thds;
+	}
+	/* n_waiting_thds is not reported; zero it in the accumulator. */
+	atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED);
+	data->n_owner_switches += source->n_owner_switches;
+	data->n_lock_ops += source->n_lock_ops;
+}
+
 #endif /* JEMALLOC_INTERNAL_MUTEX_H */
diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h
index ce183d33..2cb8fb0c 100644
--- a/include/jemalloc/internal/mutex_prof.h
+++ b/include/jemalloc/internal/mutex_prof.h
@@ -35,22 +35,31 @@ typedef enum {
 	mutex_prof_num_arena_mutexes
 } mutex_prof_arena_ind_t;
 
+/*
+ * The fourth parameter is a boolean value that is true for derived rate
+ * counters and false for real ones.
+ */
 #define MUTEX_PROF_UINT64_COUNTERS					\
-    OP(num_ops, uint64_t, "n_lock_ops")					\
-    OP(num_wait, uint64_t, "n_waiting")					\
-    OP(num_spin_acq, uint64_t, "n_spin_acq")				\
-    OP(num_owner_switch, uint64_t, "n_owner_switch")			\
-    OP(total_wait_time, uint64_t, "total_wait_ns")			\
-    OP(max_wait_time, uint64_t, "max_wait_ns")
+    OP(num_ops, uint64_t, "n_lock_ops", false, num_ops)					\
+    OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops)				\
+    OP(num_wait, uint64_t, "n_waiting", false, num_wait)				\
+    OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait)				\
+    OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq)			\
+    OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq)			\
+    OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch)		\
+    OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch)	\
+    OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time)		\
+    OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time)		\
+    OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time)
 
 #define MUTEX_PROF_UINT32_COUNTERS					\
-    OP(max_num_thds, uint32_t, "max_n_thds")
+    OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds)
 
 #define MUTEX_PROF_COUNTERS						\
 		MUTEX_PROF_UINT64_COUNTERS				\
 		MUTEX_PROF_UINT32_COUNTERS
 
-#define OP(counter, type, human) mutex_counter_##counter,
+#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter,
 
 #define COUNTER_ENUM(counter_list, t)					\
 		typedef enum {						\
diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h
index 04348696..094f3e17 100644
--- a/include/jemalloc/internal/prof_externs.h
+++ b/include/jemalloc/internal/prof_externs.h
@@ -14,6 +14,7 @@ extern bool	opt_prof_gdump;       /* High-water memory dumping. */
 extern bool	opt_prof_final;       /* Final profile dumping. */
 extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
 extern bool	opt_prof_accum;       /* Report cumulative bytes. */
+extern bool	opt_prof_log;	      /* Turn logging on at boot. */
 extern char	opt_prof_prefix[
     /* Minimize memory bloat for non-prof builds. */
 #ifdef JEMALLOC_PROF
@@ -45,7 +46,8 @@ extern size_t	lg_prof_sample;
 void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
 void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
     prof_tctx_t *tctx);
-void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx);
+void prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize,
+    prof_tctx_t *tctx);
 void bt_init(prof_bt_t *bt, void **vec);
 void prof_backtrace(prof_bt_t *bt);
 prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt);
@@ -89,4 +91,15 @@ void prof_postfork_parent(tsdn_t *tsdn);
 void prof_postfork_child(tsdn_t *tsdn);
 void prof_sample_threshold_update(prof_tdata_t *tdata);
 
+bool prof_log_start(tsdn_t *tsdn, const char *filename);
+bool prof_log_stop(tsdn_t *tsdn);
+#ifdef JEMALLOC_JET
+size_t prof_log_bt_count(void);
+size_t prof_log_alloc_count(void);
+size_t prof_log_thr_count(void);
+bool prof_log_is_logging(void);
+bool prof_log_rep_check(void);
+void prof_log_dummy_set(bool new_value);
+#endif
+
 #endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */
diff --git a/include/jemalloc/internal/prof_inlines_a.h b/include/jemalloc/internal/prof_inlines_a.h
index a6efb485..471d9853 100644
--- a/include/jemalloc/internal/prof_inlines_a.h
+++ b/include/jemalloc/internal/prof_inlines_a.h
@@ -4,7 +4,8 @@
 #include "jemalloc/internal/mutex.h"
 
 static inline bool
-prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) {
+prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
+    uint64_t accumbytes) {
 	cassert(config_prof);
 
 	bool overflow;
@@ -42,7 +43,8 @@ prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) {
 }
 
 static inline void
-prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) {
+prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum,
+    size_t usize) {
 	cassert(config_prof);
 
 	/*
@@ -55,15 +57,15 @@ prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) {
 #ifdef JEMALLOC_ATOMIC_U64
 	a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
 	do {
-		a1 = (a0 >= LARGE_MINCLASS - usize) ?  a0 - (LARGE_MINCLASS -
-		    usize) : 0;
+		a1 = (a0 >= SC_LARGE_MINCLASS - usize)
+		    ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
 	} while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
 	    a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
 #else
 	malloc_mutex_lock(tsdn, &prof_accum->mtx);
 	a0 = prof_accum->accumbytes;
-	a1 = (a0 >= LARGE_MINCLASS - usize) ?  a0 - (LARGE_MINCLASS - usize) :
-	    0;
+	a1 = (a0 >= SC_LARGE_MINCLASS - usize)
+	    ?  a0 - (SC_LARGE_MINCLASS - usize) : 0;
 	prof_accum->accumbytes = a1;
 	malloc_mutex_unlock(tsdn, &prof_accum->mtx);
 #endif
diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h
index 6ff465ad..8358bffb 100644
--- a/include/jemalloc/internal/prof_inlines_b.h
+++ b/include/jemalloc/internal/prof_inlines_b.h
@@ -61,13 +61,54 @@ prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) {
 	arena_prof_tctx_reset(tsdn, ptr, tctx);
 }
 
+JEMALLOC_ALWAYS_INLINE nstime_t
+prof_alloc_time_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	/* Pure pass-through to the arena-level accessor. */
+	return arena_prof_alloc_time_get(tsdn, ptr, alloc_ctx);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx,
+    nstime_t t) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	/* Pure pass-through to the arena-level setter. */
+	arena_prof_alloc_time_set(tsdn, ptr, alloc_ctx, t);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_check(tsd_t *tsd, size_t usize, bool update) {
+	/* Compare against 0 once usize has been subtracted, else usize. */
+	ssize_t threshold = update ? 0 : usize;
+	int64_t remaining = tsd_bytes_until_sample_get(tsd);
+
+	if (update) {
+		remaining -= usize;
+		/* The counter is only written back for a nominal tsd. */
+		if (tsd_nominal(tsd)) {
+			tsd_bytes_until_sample_set(tsd, remaining);
+		}
+	}
+
+	/* true while the remaining byte budget is at least the threshold. */
+	return likely(remaining >= threshold);
+}
+
 JEMALLOC_ALWAYS_INLINE bool
 prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
-    prof_tdata_t **tdata_out) {
+			 prof_tdata_t **tdata_out) {
 	prof_tdata_t *tdata;
 
 	cassert(config_prof);
 
+	/* Fastpath: no need to load tdata */
+	if (likely(prof_sample_check(tsd, usize, update))) {
+		return true;
+	}
+
+	bool booted = tsd_prof_tdata_get(tsd);
 	tdata = prof_tdata_get(tsd, true);
 	if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) {
 		tdata = NULL;
@@ -81,21 +122,23 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
 		return true;
 	}
 
-	if (likely(tdata->bytes_until_sample >= usize)) {
-		if (update) {
-			tdata->bytes_until_sample -= usize;
-		}
+	/*
+	 * If this was the first creation of tdata, then
+	 * prof_tdata_get() reset bytes_until_sample, so decrement and
+	 * check it again
+	 */
+	if (!booted && prof_sample_check(tsd, usize, update)) {
 		return true;
-	} else {
-		if (tsd_reentrancy_level_get(tsd) > 0) {
-			return true;
-		}
-		/* Compute new sample threshold. */
-		if (update) {
-			prof_sample_threshold_update(tdata);
-		}
-		return !tdata->active;
 	}
+
+	if (tsd_reentrancy_level_get(tsd) > 0) {
+		return true;
+	}
+	/* Compute new sample threshold. */
+	if (update) {
+		prof_sample_threshold_update(tdata);
+	}
+	return !tdata->active;
 }
 
 JEMALLOC_ALWAYS_INLINE prof_tctx_t *
@@ -187,7 +230,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx,
 	 * counters.
 	 */
 	if (unlikely(old_sampled)) {
-		prof_free_sampled_object(tsd, old_usize, old_tctx);
+		prof_free_sampled_object(tsd, ptr, old_usize, old_tctx);
 	}
 }
 
@@ -199,7 +242,7 @@ prof_free(tsd_t *tsd, const void *ptr, size_t usize, alloc_ctx_t *alloc_ctx) {
 	assert(usize == isalloc(tsd_tsdn(tsd), ptr));
 
 	if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
-		prof_free_sampled_object(tsd, usize, tctx);
+		prof_free_sampled_object(tsd, ptr, usize, tctx);
 	}
 }
 
diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h
index 0d58ae10..34ed4822 100644
--- a/include/jemalloc/internal/prof_structs.h
+++ b/include/jemalloc/internal/prof_structs.h
@@ -169,7 +169,6 @@ struct prof_tdata_s {
 
 	/* Sampling state. */
 	uint64_t		prng_state;
-	uint64_t		bytes_until_sample;
 
 	/* State used to avoid dumping while operating on prof internals. */
 	bool			enq;
diff --git a/include/jemalloc/internal/quantum.h b/include/jemalloc/internal/quantum.h
new file mode 100644
index 00000000..821086e9
--- /dev/null
+++ b/include/jemalloc/internal/quantum.h
@@ -0,0 +1,77 @@
+#ifndef JEMALLOC_INTERNAL_QUANTUM_H
+#define JEMALLOC_INTERNAL_QUANTUM_H
+
+/*
+ * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
+ * classes).
+ */
+#ifndef LG_QUANTUM
+#  if (defined(__i386__) || defined(_M_IX86))
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __ia64__
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __alpha__
+#    define LG_QUANTUM		4
+#  endif
+#  if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__))
+#    define LG_QUANTUM		4
+#  endif
+#  if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64))
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __arm__
+#    define LG_QUANTUM		3
+#  endif
+#  ifdef __aarch64__
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __hppa__
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __m68k__
+#    define LG_QUANTUM		3
+#  endif
+#  ifdef __mips__
+#    define LG_QUANTUM		3
+#  endif
+#  ifdef __nios2__
+#    define LG_QUANTUM		3
+#  endif
+#  ifdef __or1k__
+#    define LG_QUANTUM		3
+#  endif
+#  ifdef __powerpc__
+#    define LG_QUANTUM		4
+#  endif
+#  if defined(__riscv) || defined(__riscv__)
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __s390__
+#    define LG_QUANTUM		4
+#  endif
+#  if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \
+	defined(__SH4_SINGLE_ONLY__))
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __tile__
+#    define LG_QUANTUM		4
+#  endif
+#  ifdef __le32__
+#    define LG_QUANTUM		4
+#  endif
+#  ifndef LG_QUANTUM /* No architecture check above matched. */
+#    error "Unknown minimum alignment for architecture; specify via " \
+	 "--with-lg-quantum"
+#  endif
+#endif
+
+#define QUANTUM			((size_t)(1U << LG_QUANTUM))
+#define QUANTUM_MASK		(QUANTUM - 1)
+
+/* Return the smallest quantum multiple that is >= a. */
+#define QUANTUM_CEILING(a)						\
+	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
+
+#endif /* JEMALLOC_INTERNAL_QUANTUM_H */
diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h
index b59d33a8..16ccbebe 100644
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@@ -4,7 +4,7 @@
 #include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/rtree_tsd.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/tsd.h"
 
 /*
@@ -31,7 +31,7 @@
 #  error Unsupported number of significant virtual address bits
 #endif
 /* Use compact leaf representation if virtual address encoding allows. */
-#if RTREE_NHIB >= LG_CEIL_NSIZES
+#if RTREE_NHIB >= LG_CEIL(SC_NSIZES)
 #  define RTREE_LEAF_COMPACT
 #endif
 
@@ -170,8 +170,8 @@ rtree_subkey(uintptr_t key, unsigned level) {
  */
 #  ifdef RTREE_LEAF_COMPACT
 JEMALLOC_ALWAYS_INLINE uintptr_t
-rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
-    bool dependent) {
+rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree,
+    rtree_leaf_elm_t *elm, bool dependent) {
 	return (uintptr_t)atomic_load_p(&elm->le_bits, dependent
 	    ? ATOMIC_RELAXED : ATOMIC_ACQUIRE);
 }
@@ -208,7 +208,7 @@ rtree_leaf_elm_bits_slab_get(uintptr_t bits) {
 #  endif
 
 JEMALLOC_ALWAYS_INLINE extent_t *
-rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, bool dependent) {
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
@@ -221,7 +221,7 @@ rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
 }
 
 JEMALLOC_ALWAYS_INLINE szind_t
-rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, bool dependent) {
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
@@ -233,7 +233,7 @@ rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
 }
 
 JEMALLOC_ALWAYS_INLINE bool
-rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, bool dependent) {
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
@@ -245,7 +245,7 @@ rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
 }
 
 static inline void
-rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, extent_t *extent) {
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, true);
@@ -259,9 +259,9 @@ rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
 }
 
 static inline void
-rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, szind_t szind) {
-	assert(szind <= NSIZES);
+	assert(szind <= SC_NSIZES);
 
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
@@ -277,7 +277,7 @@ rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
 }
 
 static inline void
-rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_slab_write(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, bool slab) {
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
@@ -292,8 +292,8 @@ rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
 }
 
 static inline void
-rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
-    extent_t *extent, szind_t szind, bool slab) {
+rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree,
+    rtree_leaf_elm_t *elm, extent_t *extent, szind_t szind, bool slab) {
 #ifdef RTREE_LEAF_COMPACT
 	uintptr_t bits = ((uintptr_t)szind << LG_VADDR) |
 	    ((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1)) |
@@ -313,7 +313,7 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
 static inline void
 rtree_leaf_elm_szind_slab_update(tsdn_t *tsdn, rtree_t *rtree,
     rtree_leaf_elm_t *elm, szind_t szind, bool slab) {
-	assert(!slab || szind < NBINS);
+	assert(!slab || szind < SC_NBINS);
 
 	/*
 	 * The caller implicitly assures that it is the only writer to the szind
@@ -429,7 +429,7 @@ rtree_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 	rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key,
 	    dependent);
 	if (!dependent && elm == NULL) {
-		return NSIZES;
+		return SC_NSIZES;
 	}
 	return rtree_leaf_elm_szind_read(tsdn, rtree, elm, dependent);
 }
@@ -452,6 +452,42 @@ rtree_extent_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 	return false;
 }
 
+/*
+ * Try to read szind and slab from the L1 cache (rtree_ctx->cache).  Returns
+ * true on a hit and fills in *r_szind and *r_slab; otherwise returns false.
+ *
+ * Key is allowed to be NULL in order to save an extra branch on the
+ * fastpath.  Returns false in this case.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+rtree_szind_slab_read_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+			    uintptr_t key, szind_t *r_szind, bool *r_slab) {
+	rtree_leaf_elm_t *elm;
+
+	size_t slot = rtree_cache_direct_map(key);
+	uintptr_t leafkey = rtree_leafkey(key);
+	assert(leafkey != RTREE_LEAFKEY_INVALID);
+
+	if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) {
+		rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf;
+		assert(leaf != NULL);
+		uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1);
+		elm = &leaf[subkey];
+
+#ifdef RTREE_LEAF_COMPACT
+		uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree,
+							  elm, true);
+		*r_szind = rtree_leaf_elm_bits_szind_get(bits);
+		*r_slab = rtree_leaf_elm_bits_slab_get(bits);
+#else
+		*r_szind = rtree_leaf_elm_szind_read(tsdn, rtree, elm, true);
+		*r_slab = rtree_leaf_elm_slab_read(tsdn, rtree, elm, true);
+#endif
+		return true;
+	} else { /* Cache miss: *r_szind and *r_slab are not written. */
+		return false;
+	}
+}
 JEMALLOC_ALWAYS_INLINE bool
 rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
     uintptr_t key, bool dependent, szind_t *r_szind, bool *r_slab) {
@@ -474,7 +510,7 @@ rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 static inline void
 rtree_szind_slab_update(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
     uintptr_t key, szind_t szind, bool slab) {
-	assert(!slab || szind < NBINS);
+	assert(!slab || szind < SC_NBINS);
 
 	rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, true);
 	rtree_leaf_elm_szind_slab_update(tsdn, rtree, elm, szind, slab);
@@ -486,7 +522,7 @@ rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
 	rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, true);
 	assert(rtree_leaf_elm_extent_read(tsdn, rtree, elm, false) !=
 	    NULL);
-	rtree_leaf_elm_write(tsdn, rtree, elm, NULL, NSIZES, false);
+	rtree_leaf_elm_write(tsdn, rtree, elm, NULL, SC_NSIZES, false);
 }
 
 #endif /* JEMALLOC_INTERNAL_RTREE_H */
diff --git a/include/jemalloc/internal/rtree_tsd.h b/include/jemalloc/internal/rtree_tsd.h
index 93a75173..562e2929 100644
--- a/include/jemalloc/internal/rtree_tsd.h
+++ b/include/jemalloc/internal/rtree_tsd.h
@@ -26,7 +26,7 @@
  * Zero initializer required for tsd initialization only.  Proper initialization
  * done via rtree_ctx_data_init().
  */
-#define RTREE_CTX_ZERO_INITIALIZER {{{0}}, {{0}}}
+#define RTREE_CTX_ZERO_INITIALIZER {{{0, 0}}, {{0, 0}}}
 
 
 typedef struct rtree_leaf_elm_s rtree_leaf_elm_t;
diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h
new file mode 100644
index 00000000..ef0a4512
--- /dev/null
+++ b/include/jemalloc/internal/sc.h
@@ -0,0 +1,320 @@
+#ifndef JEMALLOC_INTERNAL_SC_H
+#define JEMALLOC_INTERNAL_SC_H
+
+#include "jemalloc/internal/jemalloc_internal_types.h"
+
+/*
+ * Size class computations:
+ *
+ * These are a little tricky; we'll first start by describing how things
+ * generally work, and then describe some of the details.
+ *
+ * Ignore the first few size classes for a moment. We can then split all the
+ * remaining size classes into groups. The size classes in a group are spaced
+ * such that they cover allocation request sizes in a power-of-2 range. The
+ * power of two is called the base of the group, and the size classes in it
+ * satisfy allocations in the half-open range (base, base * 2]. There are
+ * SC_NGROUP size classes in each group, equally spaced in the range, so that
+ * each one covers allocations for base / SC_NGROUP possible allocation sizes.
+ * We call that value (base / SC_NGROUP) the delta of the group. Each size class
+ * is delta larger than the one before it (including the initial size class in a
+ * group, which is delta larger than base, the largest size class in the
+ * previous group).
+ * To make the math all work out nicely, we require that SC_NGROUP is a power of
+ * two, and define it in terms of SC_LG_NGROUP. We'll often talk in terms of
+ * lg_base and lg_delta. For each of these groups then, we have that
+ * lg_delta == lg_base - SC_LG_NGROUP.
+ * The size classes in a group with a given lg_base and lg_delta (which, recall,
+ * can be computed from lg_base for these groups) are therefore:
+ *   base + 1 * delta
+ *     which covers allocations in (base, base + 1 * delta]
+ *   base + 2 * delta
+ *     which covers allocations in (base + 1 * delta, base + 2 * delta].
+ *   base + 3 * delta
+ *     which covers allocations in (base + 2 * delta, base + 3 * delta].
+ *   ...
+ *   base + SC_NGROUP * delta ( == 2 * base)
+ *     which covers allocations in (base + (SC_NGROUP - 1) * delta, 2 * base].
+ * (Note that currently SC_NGROUP is always 4, so the "..." is empty in
+ * practice.)
+ * Note that the last size class in the group is the next power of two (after
+ * base), so that we've set up the induction correctly for the next group's
+ * selection of delta.
+ *
+ * Now, let's start considering the first few size classes. Two extra constants
+ * come into play here: LG_QUANTUM and SC_LG_TINY_MIN. LG_QUANTUM ensures
+ * correct platform alignment; all objects of size (1 << LG_QUANTUM) or larger
+ * are at least (1 << LG_QUANTUM) aligned; this can be used to ensure that we
+ * never return improperly aligned memory, by making (1 << LG_QUANTUM) equal the
+ * highest required alignment of a platform. For allocation sizes smaller than
+ * (1 << LG_QUANTUM) though, we can be more relaxed (since we don't support
+ * platforms with types with alignment larger than their size). To allow such
+ * allocations (without wasting space unnecessarily), we introduce tiny size
+ * classes; one per power of two, up until we hit the quantum size. There are
+ * therefore LG_QUANTUM - SC_LG_TINY_MIN such size classes.
+ *
+ * Next is a size class of size (1 << LG_QUANTUM). This can't be the start of a
+ * group in the sense we described above (covering a power of two range) since,
+ * if we divided into it to pick a value of delta, we'd get a delta smaller than
+ * (1 << LG_QUANTUM) for sizes >= (1 << LG_QUANTUM), which is against the rules.
+ *
+ * The first base we can divide by SC_NGROUP while still being at least
+ * (1 << LG_QUANTUM) is SC_NGROUP * (1 << LG_QUANTUM). We can get there by
+ * having SC_NGROUP size classes, spaced (1 << LG_QUANTUM) apart. These size
+ * classes are:
+ *   1 * (1 << LG_QUANTUM)
+ *   2 * (1 << LG_QUANTUM)
+ *   3 * (1 << LG_QUANTUM)
+ *   ... (although, as above, this "..." is empty in practice)
+ *   SC_NGROUP * (1 << LG_QUANTUM).
+ *
+ * There are SC_NGROUP of these size classes, so we can regard it as a sort of
+ * pseudo-group, even though it spans multiple powers of 2, is divided
+ * differently, and both starts and ends on a power of 2 (as opposed to just
+ * ending). SC_NGROUP is itself a power of two, so the first group after the
+ * pseudo-group has the power-of-two base SC_NGROUP * (1 << LG_QUANTUM), for a
+ * lg_base of LG_QUANTUM + SC_LG_NGROUP. We can divide this base into SC_NGROUP
+ * sizes without violating our LG_QUANTUM requirements, so we can safely set
+ * lg_delta = lg_base - SC_LG_NGROUP (== LG_QUANTUM).
+ *
+ * So, in order, the size classes are:
+ *
+ * Tiny size classes:
+ * - Count: LG_QUANTUM - SC_LG_TINY_MIN.
+ * - Sizes:
+ *     1 << SC_LG_TINY_MIN
+ *     1 << (SC_LG_TINY_MIN + 1)
+ *     1 << (SC_LG_TINY_MIN + 2)
+ *     ...
+ *     1 << (LG_QUANTUM - 1)
+ *
+ * Initial pseudo-group:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *     1 * (1 << LG_QUANTUM)
+ *     2 * (1 << LG_QUANTUM)
+ *     3 * (1 << LG_QUANTUM)
+ *     ...
+ *     SC_NGROUP * (1 << LG_QUANTUM)
+ *
+ * Regular group 0:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *   (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP and lg_delta of
+ *   lg_base - SC_LG_NGROUP)
+ *     (1 << lg_base) + 1 * (1 << lg_delta)
+ *     (1 << lg_base) + 2 * (1 << lg_delta)
+ *     (1 << lg_base) + 3 * (1 << lg_delta)
+ *     ...
+ *     (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ * Regular group 1:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *   (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + 1 and lg_delta of
+ *   lg_base - SC_LG_NGROUP)
+ *     (1 << lg_base) + 1 * (1 << lg_delta)
+ *     (1 << lg_base) + 2 * (1 << lg_delta)
+ *     (1 << lg_base) + 3 * (1 << lg_delta)
+ *     ...
+ *     (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ * ...
+ *
+ * Regular group N:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *   (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + N and lg_delta of
+ *   lg_base - SC_LG_NGROUP)
+ *     (1 << lg_base) + 1 * (1 << lg_delta)
+ *     (1 << lg_base) + 2 * (1 << lg_delta)
+ *     (1 << lg_base) + 3 * (1 << lg_delta)
+ *     ...
+ *     (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ *
+ * Representation of metadata:
+ * To make the math easy, we'll mostly work in lg quantities. We record lg_base,
+ * lg_delta, and ndelta (i.e. number of deltas above the base) on a
+ * per-size-class basis, and maintain the invariant that, across all size
+ * classes, size == (1 << lg_base) + ndelta * (1 << lg_delta).
+ *
+ * For regular groups (i.e. those with lg_base >= LG_QUANTUM + SC_LG_NGROUP),
+ * lg_delta is lg_base - SC_LG_NGROUP, and ndelta goes from 1 to SC_NGROUP.
+ *
+ * For the initial tiny size classes (if any), lg_base is lg(size class size).
+ * lg_delta is lg_base for the first size class, and lg_base - 1 for all
+ * subsequent ones. ndelta is always 0.
+ *
+ * For the pseudo-group, if there are no tiny size classes, then we set
+ * lg_base == LG_QUANTUM, lg_delta == LG_QUANTUM, and have ndelta range from 0
+ * to SC_NGROUP - 1. (Note that delta == base, so base + (SC_NGROUP - 1) * delta
+ * is just SC_NGROUP * base, or (1 << (SC_LG_NGROUP + LG_QUANTUM)), so we do
+ * indeed get a power of two that way). If there *are* tiny size classes, then
+ * the first size class needs to have lg_delta relative to the largest tiny size
+ * class. We therefore set lg_base == LG_QUANTUM - 1,
+ * lg_delta == LG_QUANTUM - 1, and ndelta == 1, keeping the rest of the
+ * pseudo-group the same.
+ *
+ *
+ * Other terminology:
+ * "Small" size classes mean those that are allocated out of bins, which is the
+ * same as those that are slab allocated.
+ * "Large" size classes are those that are not small. The cutoff for counting as
+ * large is page size * group size.
+ */
+
+/*
+ * Size class N + (1 << SC_LG_NGROUP) is twice the size of size class N.
+ */
+#define SC_LG_NGROUP 2
+#define SC_LG_TINY_MIN 3
+
+#if SC_LG_TINY_MIN == 0
+/* The div module doesn't support division by 1, which this would require. */
+#error "Unsupported LG_TINY_MIN"
+#endif
+
+/*
+ * The definitions below are all determined by the above settings and system
+ * characteristics.
+ */
+#define SC_NGROUP (1ULL << SC_LG_NGROUP)
+#define SC_PTR_BITS ((1ULL << LG_SIZEOF_PTR) * 8)
+#define SC_NTINY (LG_QUANTUM - SC_LG_TINY_MIN)
+#define SC_LG_TINY_MAXCLASS (LG_QUANTUM > SC_LG_TINY_MIN ? LG_QUANTUM - 1 : -1)
+#define SC_NPSEUDO SC_NGROUP
+#define SC_LG_FIRST_REGULAR_BASE (LG_QUANTUM + SC_LG_NGROUP)
+/*
+ * We cap allocations to be less than 2 ** (ptr_bits - 1), so the highest base
+ * we need is 2 ** (ptr_bits - 2). (This also means that the last group is 1
+ * size class shorter than the others).
+ * We could probably save some space in arenas by capping this at LG_VADDR size.
+ */
+#define SC_LG_BASE_MAX (SC_PTR_BITS - 2)
+#define SC_NREGULAR (SC_NGROUP * 					\
+    (SC_LG_BASE_MAX - SC_LG_FIRST_REGULAR_BASE + 1) - 1)
+#define SC_NSIZES (SC_NTINY + SC_NPSEUDO + SC_NREGULAR)
+
+ /* The number of size classes that are a multiple of the page size. */
+#define SC_NPSIZES (							\
+    /* Start with all the size classes. */				\
+    SC_NSIZES								\
+    /* Subtract out those groups with too small a base. */		\
+    - (LG_PAGE - 1 - SC_LG_FIRST_REGULAR_BASE) * SC_NGROUP		\
+    /* And the pseudo-group. */						\
+    - SC_NPSEUDO							\
+    /* And the tiny group. */						\
+    - SC_NTINY								\
+    /* Groups where ndelta*delta is not a multiple of the page size. */	\
+    - (2 * (SC_NGROUP)))
+
+/*
+ * We declare a size class is binnable if size < page size * group. Or, in other
+ * words, lg(size) < lg(page size) + lg(group size).
+ */
+#define SC_NBINS (							\
+    /* Sub-regular size classes. */					\
+    SC_NTINY + SC_NPSEUDO						\
+    /* Groups with lg_regular_min_base <= lg_base <= lg_base_max */	\
+    + SC_NGROUP * (LG_PAGE + SC_LG_NGROUP - SC_LG_FIRST_REGULAR_BASE)	\
+    /* Last SC of the last group hits the bound exactly; exclude it. */	\
+    - 1)
+
+/*
+ * The size2index_tab lookup table uses uint8_t to encode each bin index, so we
+ * cannot support more than 256 small size classes.
+ */
+#if (SC_NBINS > 256)
+#  error "Too many small size classes"
+#endif
+
+/* The largest size class in the lookup table. */
+#define SC_LOOKUP_MAXCLASS ((size_t)1 << 12)
+
+/* Internal, only used for the definition of SC_SMALL_MAXCLASS. */
+#define SC_SMALL_MAX_BASE ((size_t)1 << (LG_PAGE + SC_LG_NGROUP - 1))
+#define SC_SMALL_MAX_DELTA ((size_t)1 << (LG_PAGE - 1))
+
+/* The largest size class allocated out of a slab. */
+#define SC_SMALL_MAXCLASS (SC_SMALL_MAX_BASE				\
+    + (SC_NGROUP - 1) * SC_SMALL_MAX_DELTA)
+
+/* The smallest size class not allocated out of a slab. */
+#define SC_LARGE_MINCLASS ((size_t)1ULL << (LG_PAGE + SC_LG_NGROUP))
+#define SC_LG_LARGE_MINCLASS (LG_PAGE + SC_LG_NGROUP)
+
+/* Internal; only used for the definition of SC_LARGE_MAXCLASS. */
+#define SC_MAX_BASE ((size_t)1 << (SC_PTR_BITS - 2))
+#define SC_MAX_DELTA ((size_t)1 << (SC_PTR_BITS - 2 - SC_LG_NGROUP))
+
+/* The largest size class supported. */
+#define SC_LARGE_MAXCLASS (SC_MAX_BASE + (SC_NGROUP - 1) * SC_MAX_DELTA)
+
+typedef struct sc_s sc_t;
+struct sc_s {
+	/* Size class index, or -1 if not a valid size class. */
+	int index;
+	/* Lg group base size (no deltas added). */
+	int lg_base;
+	/* Lg delta to previous size class. */
+	int lg_delta;
+	/* Delta multiplier.  size == (1<<lg_base) + (ndelta<<lg_delta) */
+	int ndelta;
+	/*
+	 * True if the size class is a multiple of the page size, false
+	 * otherwise.
+	 */
+	bool psz;
+	/*
+	 * True if the size class is a small, bin, size class. False otherwise.
+	 */
+	bool bin;
+	/* The slab page count if a small bin size class, 0 otherwise. */
+	int pgs;
+	/* Same as lg_delta if a lookup table size class, 0 otherwise. */
+	int lg_delta_lookup;
+};
+
+typedef struct sc_data_s sc_data_t;
+struct sc_data_s {
+	/* Number of tiny size classes. */
+	unsigned ntiny;
+	/* Number of bins supported by the lookup table. */
+	int nlbins;
+	/* Number of small size class bins. */
+	int nbins;
+	/* Number of size classes. */
+	int nsizes;
+	/* Number of bits required to store NSIZES. */
+	int lg_ceil_nsizes;
+	/* Number of size classes that are a multiple of (1U << LG_PAGE). */
+	unsigned npsizes;
+	/* Lg of maximum tiny size class (or -1, if none). */
+	int lg_tiny_maxclass;
+	/* Maximum size class included in lookup table. */
+	size_t lookup_maxclass;
+	/* Maximum small size class. */
+	size_t small_maxclass;
+	/* Lg of minimum large size class. */
+	int lg_large_minclass;
+	/* The minimum large size class. */
+	size_t large_minclass;
+	/* Maximum (large) size class. */
+	size_t large_maxclass;
+	/* True if the sc_data_t has been initialized (for debugging only). */
+	bool initialized;
+
+	sc_t sc[SC_NSIZES];
+};
+
+void sc_data_init(sc_data_t *data);
+/*
+ * Updates slab sizes in [begin, end] to be pgs pages in length, if possible.
+ * Otherwise, does its best to accommodate the request.
+ */
+void sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end,
+    int pgs);
+void sc_boot(sc_data_t *data);
+
+#endif /* JEMALLOC_INTERNAL_SC_H */
diff --git a/include/jemalloc/internal/seq.h b/include/jemalloc/internal/seq.h
new file mode 100644
index 00000000..ef2df4c6
--- /dev/null
+++ b/include/jemalloc/internal/seq.h
@@ -0,0 +1,55 @@
+#ifndef JEMALLOC_INTERNAL_SEQ_H
+#define JEMALLOC_INTERNAL_SEQ_H
+
+#include "jemalloc/internal/atomic.h"
+
+/*
+ * A simple seqlock implementation.
+ */
+
+#define seq_define(type, short_type)					\
+typedef struct {							\
+	atomic_zu_t seq;						\
+	atomic_zu_t data[						\
+	    (sizeof(type) + sizeof(size_t) - 1) / sizeof(size_t)];	\
+} seq_##short_type##_t;							\
+									\
+/*									\
+ * No internal synchronization -- the caller must ensure that there's	\
+ * only a single writer at a time.					\
+ */									\
+static inline void							\
+seq_store_##short_type(seq_##short_type##_t *dst, type *src) {		\
+	size_t buf[sizeof(dst->data) / sizeof(size_t)];			\
+	buf[sizeof(buf) / sizeof(size_t) - 1] = 0;			\
+	memcpy(buf, src, sizeof(type));					\
+	size_t old_seq = atomic_load_zu(&dst->seq, ATOMIC_RELAXED);	\
+	atomic_store_zu(&dst->seq, old_seq + 1, ATOMIC_RELAXED);	\
+	atomic_fence(ATOMIC_RELEASE);					\
+	for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) {	\
+		atomic_store_zu(&dst->data[i], buf[i], ATOMIC_RELAXED);	\
+	}								\
+	atomic_store_zu(&dst->seq, old_seq + 2, ATOMIC_RELEASE);	\
+}									\
+									\
+/* Returns whether or not the read was consistent. */			\
+static inline bool							\
+seq_try_load_##short_type(type *dst, seq_##short_type##_t *src) {	\
+	size_t buf[sizeof(src->data) / sizeof(size_t)];			\
+	size_t seq1 = atomic_load_zu(&src->seq, ATOMIC_ACQUIRE);	\
+	if (seq1 % 2 != 0) {						\
+		return false;						\
+	}								\
+	for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) {	\
+		buf[i] = atomic_load_zu(&src->data[i], ATOMIC_RELAXED);	\
+	}								\
+	atomic_fence(ATOMIC_ACQUIRE);					\
+	size_t seq2 = atomic_load_zu(&src->seq, ATOMIC_RELAXED);	\
+	if (seq1 != seq2) {						\
+		return false;						\
+	}								\
+	memcpy(dst, buf, sizeof(type));					\
+	return true;							\
+}
+
+#endif /* JEMALLOC_INTERNAL_SEQ_H */
diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh
deleted file mode 100755
index 998994d0..00000000
--- a/include/jemalloc/internal/size_classes.sh
+++ /dev/null
@@ -1,361 +0,0 @@
-#!/bin/sh
-#
-# Usage: size_classes.sh <lg_qarr> <lg_tmin> <lg_parr> <lg_g>
-
-# The following limits are chosen such that they cover all supported platforms.
-
-# Pointer sizes.
-lg_zarr="2 3"
-
-# Quanta.
-lg_qarr=$1
-
-# The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)].
-lg_tmin=$2
-
-# Maximum lookup size.
-lg_kmax=12
-
-# Page sizes.
-lg_parr=`echo $3 | tr ',' ' '`
-
-# Size class group size (number of size classes for each size doubling).
-lg_g=$4
-
-pow2() {
-  e=$1
-  pow2_result=1
-  while [ ${e} -gt 0 ] ; do
-    pow2_result=$((${pow2_result} + ${pow2_result}))
-    e=$((${e} - 1))
-  done
-}
-
-lg() {
-  x=$1
-  lg_result=0
-  while [ ${x} -gt 1 ] ; do
-    lg_result=$((${lg_result} + 1))
-    x=$((${x} / 2))
-  done
-}
-
-lg_ceil() {
-  y=$1
-  lg ${y}; lg_floor=${lg_result}
-  pow2 ${lg_floor}; pow2_floor=${pow2_result}
-  if [ ${pow2_floor} -lt ${y} ] ; then
-    lg_ceil_result=$((${lg_floor} + 1))
-  else
-    lg_ceil_result=${lg_floor}
-  fi
-}
-
-reg_size_compute() {
-  lg_grp=$1
-  lg_delta=$2
-  ndelta=$3
-
-  pow2 ${lg_grp}; grp=${pow2_result}
-  pow2 ${lg_delta}; delta=${pow2_result}
-  reg_size=$((${grp} + ${delta}*${ndelta}))
-}
-
-slab_size() {
-  lg_p=$1
-  lg_grp=$2
-  lg_delta=$3
-  ndelta=$4
-
-  pow2 ${lg_p}; p=${pow2_result}
-  reg_size_compute ${lg_grp} ${lg_delta} ${ndelta}
-
-  # Compute smallest slab size that is an integer multiple of reg_size.
-  try_slab_size=${p}
-  try_nregs=$((${try_slab_size} / ${reg_size}))
-  perfect=0
-  while [ ${perfect} -eq 0 ] ; do
-    perfect_slab_size=${try_slab_size}
-    perfect_nregs=${try_nregs}
-
-    try_slab_size=$((${try_slab_size} + ${p}))
-    try_nregs=$((${try_slab_size} / ${reg_size}))
-    if [ ${perfect_slab_size} -eq $((${perfect_nregs} * ${reg_size})) ] ; then
-      perfect=1
-    fi
-  done
-
-  slab_size_pgs=$((${perfect_slab_size} / ${p}))
-}
-
-size_class() {
-  index=$1
-  lg_grp=$2
-  lg_delta=$3
-  ndelta=$4
-  lg_p=$5
-  lg_kmax=$6
-
-  if [ ${lg_delta} -ge ${lg_p} ] ; then
-    psz="yes"
-  else
-    pow2 ${lg_p}; p=${pow2_result}
-    pow2 ${lg_grp}; grp=${pow2_result}
-    pow2 ${lg_delta}; delta=${pow2_result}
-    sz=$((${grp} + ${delta} * ${ndelta}))
-    npgs=$((${sz} / ${p}))
-    if [ ${sz} -eq $((${npgs} * ${p})) ] ; then
-      psz="yes"
-    else
-      psz="no"
-    fi
-  fi
-
-  lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta}
-  if [ ${pow2_result} -lt ${ndelta} ] ; then
-    rem="yes"
-  else
-    rem="no"
-  fi
-
-  lg_size=${lg_grp}
-  if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then
-    lg_size=$((${lg_grp} + 1))
-  else
-    lg_size=${lg_grp}
-    rem="yes"
-  fi
-
-  if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then
-    bin="yes"
-    slab_size ${lg_p} ${lg_grp} ${lg_delta} ${ndelta}; pgs=${slab_size_pgs}
-  else
-    bin="no"
-    pgs=0
-  fi
-  if [ ${lg_size} -lt ${lg_kmax} \
-      -o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then
-    lg_delta_lookup=${lg_delta}
-  else
-    lg_delta_lookup="no"
-  fi
-  printf '    SC(%3d, %6d, %8d, %6d, %3s, %3s, %3d, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${psz} ${bin} ${pgs} ${lg_delta_lookup}
-  # Defined upon return:
-  # - psz ("yes" or "no")
-  # - bin ("yes" or "no")
-  # - pgs
-  # - lg_delta_lookup (${lg_delta} or "no")
-}
-
-sep_line() {
-  echo "                                                         \\"
-}
-
-size_classes() {
-  lg_z=$1
-  lg_q=$2
-  lg_t=$3
-  lg_p=$4
-  lg_g=$5
-
-  pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result}
-  pow2 ${lg_g}; g=${pow2_result}
-
-  echo "#define SIZE_CLASSES \\"
-  echo "  /* index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup */ \\"
-
-  ntbins=0
-  nlbins=0
-  lg_tiny_maxclass='"NA"'
-  nbins=0
-  npsizes=0
-
-  # Tiny size classes.
-  ndelta=0
-  index=0
-  lg_grp=${lg_t}
-  lg_delta=${lg_grp}
-  while [ ${lg_grp} -lt ${lg_q} ] ; do
-    size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
-    if [ ${lg_delta_lookup} != "no" ] ; then
-      nlbins=$((${index} + 1))
-    fi
-    if [ ${psz} = "yes" ] ; then
-      npsizes=$((${npsizes} + 1))
-    fi
-    if [ ${bin} != "no" ] ; then
-      nbins=$((${index} + 1))
-    fi
-    ntbins=$((${ntbins} + 1))
-    lg_tiny_maxclass=${lg_grp} # Final written value is correct.
-    index=$((${index} + 1))
-    lg_delta=${lg_grp}
-    lg_grp=$((${lg_grp} + 1))
-  done
-
-  # First non-tiny group.
-  if [ ${ntbins} -gt 0 ] ; then
-    sep_line
-    # The first size class has an unusual encoding, because the size has to be
-    # split between grp and delta*ndelta.
-    lg_grp=$((${lg_grp} - 1))
-    ndelta=1
-    size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
-    index=$((${index} + 1))
-    lg_grp=$((${lg_grp} + 1))
-    lg_delta=$((${lg_delta} + 1))
-    if [ ${psz} = "yes" ] ; then
-      npsizes=$((${npsizes} + 1))
-    fi
-  fi
-  while [ ${ndelta} -lt ${g} ] ; do
-    size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
-    index=$((${index} + 1))
-    ndelta=$((${ndelta} + 1))
-    if [ ${psz} = "yes" ] ; then
-      npsizes=$((${npsizes} + 1))
-    fi
-  done
-
-  # All remaining groups.
-  lg_grp=$((${lg_grp} + ${lg_g}))
-  while [ ${lg_grp} -lt $((${ptr_bits} - 1)) ] ; do
-    sep_line
-    ndelta=1
-    if [ ${lg_grp} -eq $((${ptr_bits} - 2)) ] ; then
-      ndelta_limit=$((${g} - 1))
-    else
-      ndelta_limit=${g}
-    fi
-    while [ ${ndelta} -le ${ndelta_limit} ] ; do
-      size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
-      if [ ${lg_delta_lookup} != "no" ] ; then
-        nlbins=$((${index} + 1))
-        # Final written value is correct:
-        lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
-      fi
-      if [ ${psz} = "yes" ] ; then
-        npsizes=$((${npsizes} + 1))
-      fi
-      if [ ${bin} != "no" ] ; then
-        nbins=$((${index} + 1))
-        # Final written value is correct:
-        small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
-        if [ ${lg_g} -gt 0 ] ; then
-          lg_large_minclass=$((${lg_grp} + 1))
-        else
-          lg_large_minclass=$((${lg_grp} + 2))
-        fi
-      fi
-      # Final written value is correct:
-      large_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
-      index=$((${index} + 1))
-      ndelta=$((${ndelta} + 1))
-    done
-    lg_grp=$((${lg_grp} + 1))
-    lg_delta=$((${lg_delta} + 1))
-  done
-  echo
-  nsizes=${index}
-  lg_ceil ${nsizes}; lg_ceil_nsizes=${lg_ceil_result}
-
-  # Defined upon completion:
-  # - ntbins
-  # - nlbins
-  # - nbins
-  # - nsizes
-  # - lg_ceil_nsizes
-  # - npsizes
-  # - lg_tiny_maxclass
-  # - lookup_maxclass
-  # - small_maxclass
-  # - lg_large_minclass
-  # - large_maxclass
-}
-
-cat <<EOF
-#ifndef JEMALLOC_INTERNAL_SIZE_CLASSES_H
-#define JEMALLOC_INTERNAL_SIZE_CLASSES_H
-
-/* This file was automatically generated by size_classes.sh. */
-
-#include "jemalloc/internal/jemalloc_internal_types.h"
-
-/*
- * This header file defines:
- *
- *   LG_SIZE_CLASS_GROUP: Lg of size class count for each size doubling.
- *   LG_TINY_MIN: Lg of minimum size class to support.
- *   SIZE_CLASSES: Complete table of SC(index, lg_grp, lg_delta, ndelta, psz,
- *                 bin, pgs, lg_delta_lookup) tuples.
- *     index: Size class index.
- *     lg_grp: Lg group base size (no deltas added).
- *     lg_delta: Lg delta to previous size class.
- *     ndelta: Delta multiplier.  size == 1<<lg_grp + ndelta<<lg_delta
- *     psz: 'yes' if a multiple of the page size, 'no' otherwise.
- *     bin: 'yes' if a small bin size class, 'no' otherwise.
- *     pgs: Slab page count if a small bin size class, 0 otherwise.
- *     lg_delta_lookup: Same as lg_delta if a lookup table size class, 'no'
- *                      otherwise.
- *   NTBINS: Number of tiny bins.
- *   NLBINS: Number of bins supported by the lookup table.
- *   NBINS: Number of small size class bins.
- *   NSIZES: Number of size classes.
- *   LG_CEIL_NSIZES: Number of bits required to store NSIZES.
- *   NPSIZES: Number of size classes that are a multiple of (1U << LG_PAGE).
- *   LG_TINY_MAXCLASS: Lg of maximum tiny size class.
- *   LOOKUP_MAXCLASS: Maximum size class included in lookup table.
- *   SMALL_MAXCLASS: Maximum small size class.
- *   LG_LARGE_MINCLASS: Lg of minimum large size class.
- *   LARGE_MAXCLASS: Maximum (large) size class.
- */
-
-#define LG_SIZE_CLASS_GROUP	${lg_g}
-#define LG_TINY_MIN		${lg_tmin}
-
-EOF
-
-for lg_z in ${lg_zarr} ; do
-  for lg_q in ${lg_qarr} ; do
-    lg_t=${lg_tmin}
-    while [ ${lg_t} -le ${lg_q} ] ; do
-      # Iterate through page sizes and compute how many bins there are.
-      for lg_p in ${lg_parr} ; do
-        echo "#if (LG_SIZEOF_PTR == ${lg_z} && LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})"
-        size_classes ${lg_z} ${lg_q} ${lg_t} ${lg_p} ${lg_g}
-        echo "#define SIZE_CLASSES_DEFINED"
-        echo "#define NTBINS			${ntbins}"
-        echo "#define NLBINS			${nlbins}"
-        echo "#define NBINS			${nbins}"
-        echo "#define NSIZES			${nsizes}"
-        echo "#define LG_CEIL_NSIZES		${lg_ceil_nsizes}"
-        echo "#define NPSIZES			${npsizes}"
-        echo "#define LG_TINY_MAXCLASS	${lg_tiny_maxclass}"
-        echo "#define LOOKUP_MAXCLASS		${lookup_maxclass}"
-        echo "#define SMALL_MAXCLASS		${small_maxclass}"
-        echo "#define LG_LARGE_MINCLASS	${lg_large_minclass}"
-        echo "#define LARGE_MINCLASS		(ZU(1) << LG_LARGE_MINCLASS)"
-        echo "#define LARGE_MAXCLASS		${large_maxclass}"
-        echo "#endif"
-        echo
-      done
-      lg_t=$((${lg_t} + 1))
-    done
-  done
-done
-
-cat <<EOF
-#ifndef SIZE_CLASSES_DEFINED
-#  error "No size class definitions match configuration"
-#endif
-#undef SIZE_CLASSES_DEFINED
-/*
- * The size2index_tab lookup table uses uint8_t to encode each bin index, so we
- * cannot support more than 256 small size classes.
- */
-#if (NBINS > 256)
-#  error "Too many small size classes"
-#endif
-
-#endif /* JEMALLOC_INTERNAL_SIZE_CLASSES_H */
-EOF
diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h
index 852e3426..3b9e0eac 100644
--- a/include/jemalloc/internal/stats.h
+++ b/include/jemalloc/internal/stats.h
@@ -10,7 +10,8 @@
     OPTION('a',		unmerged,	config_stats,	false)		\
     OPTION('b',		bins,		true,		false)		\
     OPTION('l',		large,		true,		false)		\
-    OPTION('x',		mutex,		true,		false)
+    OPTION('x',		mutex,		true,		false)		\
+    OPTION('e',		extents,	true,		false)
 
 enum {
 #define OPTION(o, v, d, s) stats_print_option_num_##v,
diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h
index 97946289..68e558ab 100644
--- a/include/jemalloc/internal/sz.h
+++ b/include/jemalloc/internal/sz.h
@@ -3,7 +3,7 @@
 
 #include "jemalloc/internal/bit_util.h"
 #include "jemalloc/internal/pages.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/util.h"
 
 /*
@@ -26,18 +26,18 @@
  * sz_pind2sz_tab encodes the same information as could be computed by
  * sz_pind2sz_compute().
  */
-extern size_t const sz_pind2sz_tab[NPSIZES+1];
+extern size_t sz_pind2sz_tab[SC_NPSIZES + 1];
 /*
  * sz_index2size_tab encodes the same information as could be computed (at
  * unacceptable cost in some code paths) by sz_index2size_compute().
  */
-extern size_t const sz_index2size_tab[NSIZES];
+extern size_t sz_index2size_tab[SC_NSIZES];
 /*
  * sz_size2index_tab is a compact lookup table that rounds request sizes up to
  * size classes.  In order to reduce cache footprint, the table is compressed,
  * and all accesses are via sz_size2index().
  */
-extern uint8_t const sz_size2index_tab[];
+extern uint8_t sz_size2index_tab[];
 
 static const size_t sz_large_pad =
 #ifdef JEMALLOC_CACHE_OBLIVIOUS
@@ -47,49 +47,47 @@ static const size_t sz_large_pad =
 #endif
     ;
 
+extern void sz_boot(const sc_data_t *sc_data);
+
 JEMALLOC_ALWAYS_INLINE pszind_t
 sz_psz2ind(size_t psz) {
-	if (unlikely(psz > LARGE_MAXCLASS)) {
-		return NPSIZES;
+	if (unlikely(psz > SC_LARGE_MAXCLASS)) {
+		return SC_NPSIZES;
 	}
-	{
-		pszind_t x = lg_floor((psz<<1)-1);
-		pszind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_PAGE) ? 0 : x -
-		    (LG_SIZE_CLASS_GROUP + LG_PAGE);
-		pszind_t grp = shift << LG_SIZE_CLASS_GROUP;
+	pszind_t x = lg_floor((psz<<1)-1);
+	pszind_t shift = (x < SC_LG_NGROUP + LG_PAGE) ?
+	    0 : x - (SC_LG_NGROUP + LG_PAGE);
+	pszind_t grp = shift << SC_LG_NGROUP;
 
-		pszind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ?
-		    LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1;
+	pszind_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ?
+	    LG_PAGE : x - SC_LG_NGROUP - 1;
 
-		size_t delta_inverse_mask = ZU(-1) << lg_delta;
-		pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) &
-		    ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
+	size_t delta_inverse_mask = ZU(-1) << lg_delta;
+	pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) &
+	    ((ZU(1) << SC_LG_NGROUP) - 1);
 
-		pszind_t ind = grp + mod;
-		return ind;
-	}
+	pszind_t ind = grp + mod;
+	return ind;
 }
 
 static inline size_t
 sz_pind2sz_compute(pszind_t pind) {
-	if (unlikely(pind == NPSIZES)) {
-		return LARGE_MAXCLASS + PAGE;
+	if (unlikely(pind == SC_NPSIZES)) {
+		return SC_LARGE_MAXCLASS + PAGE;
 	}
-	{
-		size_t grp = pind >> LG_SIZE_CLASS_GROUP;
-		size_t mod = pind & ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
+	size_t grp = pind >> SC_LG_NGROUP;
+	size_t mod = pind & ((ZU(1) << SC_LG_NGROUP) - 1);
 
-		size_t grp_size_mask = ~((!!grp)-1);
-		size_t grp_size = ((ZU(1) << (LG_PAGE +
-		    (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask;
+	size_t grp_size_mask = ~((!!grp)-1);
+	size_t grp_size = ((ZU(1) << (LG_PAGE + (SC_LG_NGROUP-1))) << grp)
+	    & grp_size_mask;
 
-		size_t shift = (grp == 0) ? 1 : grp;
-		size_t lg_delta = shift + (LG_PAGE-1);
-		size_t mod_size = (mod+1) << lg_delta;
+	size_t shift = (grp == 0) ? 1 : grp;
+	size_t lg_delta = shift + (LG_PAGE-1);
+	size_t mod_size = (mod+1) << lg_delta;
 
-		size_t sz = grp_size + mod_size;
-		return sz;
-	}
+	size_t sz = grp_size + mod_size;
+	return sz;
 }
 
 static inline size_t
@@ -101,70 +99,70 @@ sz_pind2sz_lookup(pszind_t pind) {
 
 static inline size_t
 sz_pind2sz(pszind_t pind) {
-	assert(pind < NPSIZES+1);
+	assert(pind < SC_NPSIZES + 1);
 	return sz_pind2sz_lookup(pind);
 }
 
 static inline size_t
 sz_psz2u(size_t psz) {
-	if (unlikely(psz > LARGE_MAXCLASS)) {
-		return LARGE_MAXCLASS + PAGE;
-	}
-	{
-		size_t x = lg_floor((psz<<1)-1);
-		size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ?
-		    LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1;
-		size_t delta = ZU(1) << lg_delta;
-		size_t delta_mask = delta - 1;
-		size_t usize = (psz + delta_mask) & ~delta_mask;
-		return usize;
+	if (unlikely(psz > SC_LARGE_MAXCLASS)) {
+		return SC_LARGE_MAXCLASS + PAGE;
 	}
+	size_t x = lg_floor((psz<<1)-1);
+	size_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ?
+	    LG_PAGE : x - SC_LG_NGROUP - 1;
+	size_t delta = ZU(1) << lg_delta;
+	size_t delta_mask = delta - 1;
+	size_t usize = (psz + delta_mask) & ~delta_mask;
+	return usize;
 }
 
 static inline szind_t
 sz_size2index_compute(size_t size) {
-	if (unlikely(size > LARGE_MAXCLASS)) {
-		return NSIZES;
+	if (unlikely(size > SC_LARGE_MAXCLASS)) {
+		return SC_NSIZES;
 	}
-#if (NTBINS != 0)
-	if (size <= (ZU(1) << LG_TINY_MAXCLASS)) {
-		szind_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1;
+
+	if (size == 0) {
+		return 0;
+	}
+#if (SC_NTINY != 0)
+	if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) {
+		szind_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1;
 		szind_t lg_ceil = lg_floor(pow2_ceil_zu(size));
 		return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin);
 	}
 #endif
 	{
 		szind_t x = lg_floor((size<<1)-1);
-		szind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 :
-		    x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM);
-		szind_t grp = shift << LG_SIZE_CLASS_GROUP;
+		szind_t shift = (x < SC_LG_NGROUP + LG_QUANTUM) ? 0 :
+		    x - (SC_LG_NGROUP + LG_QUANTUM);
+		szind_t grp = shift << SC_LG_NGROUP;
 
-		szind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1)
-		    ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1;
+		szind_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1)
+		    ? LG_QUANTUM : x - SC_LG_NGROUP - 1;
 
 		size_t delta_inverse_mask = ZU(-1) << lg_delta;
 		szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) &
-		    ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
+		    ((ZU(1) << SC_LG_NGROUP) - 1);
 
-		szind_t index = NTBINS + grp + mod;
+		szind_t index = SC_NTINY + grp + mod;
 		return index;
 	}
 }
 
 JEMALLOC_ALWAYS_INLINE szind_t
 sz_size2index_lookup(size_t size) {
-	assert(size <= LOOKUP_MAXCLASS);
-	{
-		szind_t ret = (sz_size2index_tab[(size-1) >> LG_TINY_MIN]);
-		assert(ret == sz_size2index_compute(size));
-		return ret;
-	}
+	assert(size <= SC_LOOKUP_MAXCLASS);
+	szind_t ret = (sz_size2index_tab[(size + (ZU(1) << SC_LG_TINY_MIN) - 1)
+					 >> SC_LG_TINY_MIN]);
+	assert(ret == sz_size2index_compute(size));
+	return ret;
 }
 
 JEMALLOC_ALWAYS_INLINE szind_t
 sz_size2index(size_t size) {
-	assert(size > 0);
-	if (likely(size <= LOOKUP_MAXCLASS)) {
+	if (likely(size <= SC_LOOKUP_MAXCLASS)) {
 		return sz_size2index_lookup(size);
 	}
 	return sz_size2index_compute(size);
@@ -172,20 +170,20 @@ sz_size2index(size_t size) {
 
 static inline size_t
 sz_index2size_compute(szind_t index) {
-#if (NTBINS > 0)
-	if (index < NTBINS) {
-		return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + index));
+#if (SC_NTINY > 0)
+	if (index < SC_NTINY) {
+		return (ZU(1) << (SC_LG_TINY_MAXCLASS - SC_NTINY + 1 + index));
 	}
 #endif
 	{
-		size_t reduced_index = index - NTBINS;
-		size_t grp = reduced_index >> LG_SIZE_CLASS_GROUP;
-		size_t mod = reduced_index & ((ZU(1) << LG_SIZE_CLASS_GROUP) -
+		size_t reduced_index = index - SC_NTINY;
+		size_t grp = reduced_index >> SC_LG_NGROUP;
+		size_t mod = reduced_index & ((ZU(1) << SC_LG_NGROUP) -
 		    1);
 
 		size_t grp_size_mask = ~((!!grp)-1);
 		size_t grp_size = ((ZU(1) << (LG_QUANTUM +
-		    (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask;
+		    (SC_LG_NGROUP-1))) << grp) & grp_size_mask;
 
 		size_t shift = (grp == 0) ? 1 : grp;
 		size_t lg_delta = shift + (LG_QUANTUM-1);
@@ -205,18 +203,22 @@ sz_index2size_lookup(szind_t index) {
 
 JEMALLOC_ALWAYS_INLINE size_t
 sz_index2size(szind_t index) {
-	assert(index < NSIZES);
+	assert(index < SC_NSIZES);
 	return sz_index2size_lookup(index);
 }
 
 JEMALLOC_ALWAYS_INLINE size_t
 sz_s2u_compute(size_t size) {
-	if (unlikely(size > LARGE_MAXCLASS)) {
+	if (unlikely(size > SC_LARGE_MAXCLASS)) {
 		return 0;
 	}
-#if (NTBINS > 0)
-	if (size <= (ZU(1) << LG_TINY_MAXCLASS)) {
-		size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1;
+
+	if (size == 0) {
+		size++;
+	}
+#if (SC_NTINY > 0)
+	if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) {
+		size_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1;
 		size_t lg_ceil = lg_floor(pow2_ceil_zu(size));
 		return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) :
 		    (ZU(1) << lg_ceil));
@@ -224,8 +226,8 @@ sz_s2u_compute(size_t size) {
 #endif
 	{
 		size_t x = lg_floor((size<<1)-1);
-		size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1)
-		    ?  LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1;
+		size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1)
+		    ?  LG_QUANTUM : x - SC_LG_NGROUP - 1;
 		size_t delta = ZU(1) << lg_delta;
 		size_t delta_mask = delta - 1;
 		size_t usize = (size + delta_mask) & ~delta_mask;
@@ -247,8 +249,7 @@ sz_s2u_lookup(size_t size) {
  */
 JEMALLOC_ALWAYS_INLINE size_t
 sz_s2u(size_t size) {
-	assert(size > 0);
-	if (likely(size <= LOOKUP_MAXCLASS)) {
+	if (likely(size <= SC_LOOKUP_MAXCLASS)) {
 		return sz_s2u_lookup(size);
 	}
 	return sz_s2u_compute(size);
@@ -265,7 +266,7 @@ sz_sa2u(size_t size, size_t alignment) {
 	assert(alignment != 0 && ((alignment - 1) & alignment) == 0);
 
 	/* Try for a small size class. */
-	if (size <= SMALL_MAXCLASS && alignment < PAGE) {
+	if (size <= SC_SMALL_MAXCLASS && alignment < PAGE) {
 		/*
 		 * Round size up to the nearest multiple of alignment.
 		 *
@@ -281,20 +282,20 @@ sz_sa2u(size_t size, size_t alignment) {
 		 *    192 | 11000000 |  64
 		 */
 		usize = sz_s2u(ALIGNMENT_CEILING(size, alignment));
-		if (usize < LARGE_MINCLASS) {
+		if (usize < SC_LARGE_MINCLASS) {
 			return usize;
 		}
 	}
 
 	/* Large size class.  Beware of overflow. */
 
-	if (unlikely(alignment > LARGE_MAXCLASS)) {
+	if (unlikely(alignment > SC_LARGE_MAXCLASS)) {
 		return 0;
 	}
 
 	/* Make sure result is a large size class. */
-	if (size <= LARGE_MINCLASS) {
-		usize = LARGE_MINCLASS;
+	if (size <= SC_LARGE_MINCLASS) {
+		usize = SC_LARGE_MINCLASS;
 	} else {
 		usize = sz_s2u(size);
 		if (usize < size) {
diff --git a/include/jemalloc/internal/tcache_externs.h b/include/jemalloc/internal/tcache_externs.h
index 790367bd..d63eafde 100644
--- a/include/jemalloc/internal/tcache_externs.h
+++ b/include/jemalloc/internal/tcache_externs.h
@@ -1,15 +1,13 @@
 #ifndef JEMALLOC_INTERNAL_TCACHE_EXTERNS_H
 #define JEMALLOC_INTERNAL_TCACHE_EXTERNS_H
 
-#include "jemalloc/internal/size_classes.h"
-
 extern bool	opt_tcache;
 extern ssize_t	opt_lg_tcache_max;
 
 extern cache_bin_info_t	*tcache_bin_info;
 
 /*
- * Number of tcache bins.  There are NBINS small-object bins, plus 0 or more
+ * Number of tcache bins.  There are SC_NBINS small-object bins, plus 0 or more
  * large-object bins.
  */
 extern unsigned	nhbins;
diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index 0f6ab8cb..5eca20e8 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -3,7 +3,7 @@
 
 #include "jemalloc/internal/bin.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/sz.h"
 #include "jemalloc/internal/ticker.h"
 #include "jemalloc/internal/util.h"
@@ -40,13 +40,13 @@ tcache_event(tsd_t *tsd, tcache_t *tcache) {
 
 JEMALLOC_ALWAYS_INLINE void *
 tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
-    UNUSED size_t size, szind_t binind, bool zero, bool slow_path) {
+    size_t size, szind_t binind, bool zero, bool slow_path) {
 	void *ret;
 	cache_bin_t *bin;
 	bool tcache_success;
 	size_t usize JEMALLOC_CC_SILENCE_INIT(0);
 
-	assert(binind < NBINS);
+	assert(binind < SC_NBINS);
 	bin = tcache_small_bin_get(tcache, binind);
 	ret = cache_bin_alloc_easy(bin, &tcache_success);
 	assert(tcache_success == (ret != NULL));
@@ -107,7 +107,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
 	cache_bin_t *bin;
 	bool tcache_success;
 
-	assert(binind >= NBINS &&binind < nhbins);
+	assert(binind >= SC_NBINS &&binind < nhbins);
 	bin = tcache_large_bin_get(tcache, binind);
 	ret = cache_bin_alloc_easy(bin, &tcache_success);
 	assert(tcache_success == (ret != NULL));
@@ -166,7 +166,8 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 	cache_bin_t *bin;
 	cache_bin_info_t *bin_info;
 
-	assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SMALL_MAXCLASS);
+	assert(tcache_salloc(tsd_tsdn(tsd), ptr)
+	    <= SC_SMALL_MAXCLASS);
 
 	if (slow_path && config_fill && unlikely(opt_junk_free)) {
 		arena_dalloc_junk_small(ptr, &bin_infos[binind]);
@@ -174,13 +175,12 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 
 	bin = tcache_small_bin_get(tcache, binind);
 	bin_info = &tcache_bin_info[binind];
-	if (unlikely(bin->ncached == bin_info->ncached_max)) {
+	if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
 		tcache_bin_flush_small(tsd, tcache, bin, binind,
 		    (bin_info->ncached_max >> 1));
+		bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
+		assert(ret);
 	}
-	assert(bin->ncached < bin_info->ncached_max);
-	bin->ncached++;
-	*(bin->avail - bin->ncached) = ptr;
 
 	tcache_event(tsd, tcache);
 }
@@ -191,7 +191,8 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
 	cache_bin_t *bin;
 	cache_bin_info_t *bin_info;
 
-	assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SMALL_MAXCLASS);
+	assert(tcache_salloc(tsd_tsdn(tsd), ptr)
+	    > SC_SMALL_MAXCLASS);
 	assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass);
 
 	if (slow_path && config_fill && unlikely(opt_junk_free)) {
@@ -215,6 +216,9 @@ JEMALLOC_ALWAYS_INLINE tcache_t *
 tcaches_get(tsd_t *tsd, unsigned ind) {
 	tcaches_t *elm = &tcaches[ind];
 	if (unlikely(elm->tcache == NULL)) {
+		malloc_printf("<jemalloc>: invalid tcache id (%u).\n", ind);
+		abort();
+	} else if (unlikely(elm->tcache == TCACHES_ELM_NEED_REINIT)) {
 		elm->tcache = tcache_create_explicit(tsd);
 	}
 	return elm->tcache;
diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h
index 07b73870..172ef904 100644
--- a/include/jemalloc/internal/tcache_structs.h
+++ b/include/jemalloc/internal/tcache_structs.h
@@ -1,10 +1,14 @@
 #ifndef JEMALLOC_INTERNAL_TCACHE_STRUCTS_H
 #define JEMALLOC_INTERNAL_TCACHE_STRUCTS_H
 
-#include "jemalloc/internal/ql.h"
-#include "jemalloc/internal/size_classes.h"
 #include "jemalloc/internal/cache_bin.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/ticker.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/* Various uses of this struct need it to be a named type. */
+typedef ql_elm(tsd_t) tsd_link_t;
 
 struct tcache_s {
 	/*
@@ -21,7 +25,7 @@ struct tcache_s {
 	 * During tcache initialization, the avail pointer in each element of
 	 * tbins is initialized to point to the proper offset within this array.
 	 */
-	cache_bin_t	bins_small[NBINS];
+	cache_bin_t	bins_small[SC_NBINS];
 
 	/*
 	 * This data is less hot; we can be a little less careful with our
@@ -29,6 +33,11 @@ struct tcache_s {
 	 */
 	/* Lets us track all the tcaches in an arena. */
 	ql_elm(tcache_t) link;
+
+	/* Logically scoped to tsd, but put here for cache layout reasons. */
+	ql_elm(tsd_t) tsd_link;
+	bool in_hook;
+
 	/*
 	 * The descriptor lets the arena find our cache bins without seeing the
 	 * tcache definition.  This enables arenas to aggregate stats across
@@ -41,13 +50,13 @@ struct tcache_s {
 	/* Next bin to GC. */
 	szind_t		next_gc_bin;
 	/* For small bins, fill (ncached_max >> lg_fill_div). */
-	uint8_t		lg_fill_div[NBINS];
+	uint8_t		lg_fill_div[SC_NBINS];
 	/*
 	 * We put the cache bins for large size classes at the end of the
 	 * struct, since some of them might not get used.  This might end up
 	 * letting us avoid touching an extra page if we don't have to.
 	 */
-	cache_bin_t	bins_large[NSIZES-NBINS];
+	cache_bin_t	bins_large[SC_NSIZES-SC_NBINS];
 };
 
 /* Linkage for list of available (previously used) explicit tcache IDs. */
diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h
index e49bc9d7..dce69382 100644
--- a/include/jemalloc/internal/tcache_types.h
+++ b/include/jemalloc/internal/tcache_types.h
@@ -1,7 +1,7 @@
 #ifndef JEMALLOC_INTERNAL_TCACHE_TYPES_H
 #define JEMALLOC_INTERNAL_TCACHE_TYPES_H
 
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 
 typedef struct tcache_s tcache_t;
 typedef struct tcaches_s tcaches_t;
@@ -45,7 +45,7 @@ typedef struct tcaches_s tcaches_t;
 
 /* Number of tcache allocation/deallocation events between incremental GCs. */
 #define TCACHE_GC_INCR							\
-    ((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1))
+    ((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))
 
 /* Used in TSD static initializer only. Real init in tcache_data_init(). */
 #define TCACHE_ZERO_INITIALIZER {0}
@@ -53,4 +53,7 @@ typedef struct tcaches_s tcaches_t;
 /* Used in TSD static initializer only. Will be initialized to opt_tcache. */
 #define TCACHE_ENABLED_ZERO_INITIALIZER false
 
+/* Used for explicit tcache only. Means flushed but not destroyed. */
+#define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1)
+
 #endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */
diff --git a/include/jemalloc/internal/test_hooks.h b/include/jemalloc/internal/test_hooks.h
new file mode 100644
index 00000000..a6351e59
--- /dev/null
+++ b/include/jemalloc/internal/test_hooks.h
@@ -0,0 +1,19 @@
+#ifndef JEMALLOC_INTERNAL_TEST_HOOKS_H
+#define JEMALLOC_INTERNAL_TEST_HOOKS_H
+
+extern JEMALLOC_EXPORT void (*test_hooks_arena_new_hook)();
+extern JEMALLOC_EXPORT void (*test_hooks_libc_hook)();
+
+#define JEMALLOC_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn)
+
+#define open JEMALLOC_HOOK(open, test_hooks_libc_hook)
+#define read JEMALLOC_HOOK(read, test_hooks_libc_hook)
+#define write JEMALLOC_HOOK(write, test_hooks_libc_hook)
+#define readlink JEMALLOC_HOOK(readlink, test_hooks_libc_hook)
+#define close JEMALLOC_HOOK(close, test_hooks_libc_hook)
+#define creat JEMALLOC_HOOK(creat, test_hooks_libc_hook)
+#define secure_getenv JEMALLOC_HOOK(secure_getenv, test_hooks_libc_hook)
+/* Note that this is undef'd and re-define'd in src/prof.c. */
+#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, test_hooks_libc_hook)
+
+#endif /* JEMALLOC_INTERNAL_TEST_HOOKS_H */
diff --git a/include/jemalloc/internal/ticker.h b/include/jemalloc/internal/ticker.h
index 4b360470..52d0db4c 100644
--- a/include/jemalloc/internal/ticker.h
+++ b/include/jemalloc/internal/ticker.h
@@ -75,4 +75,17 @@ ticker_tick(ticker_t *ticker) {
 	return ticker_ticks(ticker, 1);
 }
 
+/*
+ * Try to tick.  If ticker would fire, return true, but rely on
+ * slowpath to reset ticker.
+ */
+static inline bool
+ticker_trytick(ticker_t *ticker) {
+	--ticker->tick;
+	if (unlikely(ticker->tick < 0)) {
+		return true;
+	}
+	return false;
+}
+
 #endif /* JEMALLOC_INTERNAL_TICKER_H */
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 0b9841aa..9ba26004 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -3,6 +3,7 @@
 
 #include "jemalloc/internal/arena_types.h"
 #include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/bin_types.h"
 #include "jemalloc/internal/jemalloc_internal_externs.h"
 #include "jemalloc/internal/prof_types.h"
 #include "jemalloc/internal/ql.h"
@@ -68,17 +69,19 @@ typedef void (*test_callback_t)(int *);
     O(offset_state,		uint64_t,		uint64_t)	\
     O(thread_allocated,		uint64_t,		uint64_t)	\
     O(thread_deallocated,	uint64_t,		uint64_t)	\
+    O(bytes_until_sample,	int64_t,		int64_t)	\
     O(prof_tdata,		prof_tdata_t *,		prof_tdata_t *)	\
     O(rtree_ctx,		rtree_ctx_t,		rtree_ctx_t)	\
     O(iarena,			arena_t *,		arena_t *)	\
     O(arena,			arena_t *,		arena_t *)	\
     O(arenas_tdata,		arena_tdata_t *,	arena_tdata_t *)\
+    O(binshards,		tsd_binshards_t,	tsd_binshards_t)\
     O(tcache,			tcache_t,		tcache_t)	\
     O(witness_tsd,              witness_tsd_t,		witness_tsdn_t)	\
     MALLOC_TEST_TSD
 
 #define TSD_INITIALIZER {						\
-    tsd_state_uninitialized,						\
+    ATOMIC_INIT(tsd_state_uninitialized),				\
     TCACHE_ENABLED_ZERO_INITIALIZER,					\
     false,								\
     0,									\
@@ -86,29 +89,97 @@ typedef void (*test_callback_t)(int *);
     0,									\
     0,									\
     0,									\
+    0,									\
     NULL,								\
     RTREE_CTX_ZERO_INITIALIZER,						\
     NULL,								\
     NULL,								\
     NULL,								\
+    TSD_BINSHARDS_ZERO_INITIALIZER,					\
     TCACHE_ZERO_INITIALIZER,						\
     WITNESS_TSD_INITIALIZER						\
     MALLOC_TEST_TSD_INITIALIZER						\
 }
 
+void *malloc_tsd_malloc(size_t size);
+void malloc_tsd_dalloc(void *wrapper);
+void malloc_tsd_cleanup_register(bool (*f)(void));
+tsd_t *malloc_tsd_boot0(void);
+void malloc_tsd_boot1(void);
+void tsd_cleanup(void *arg);
+tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
+void tsd_state_set(tsd_t *tsd, uint8_t new_state);
+void tsd_slow_update(tsd_t *tsd);
+void tsd_prefork(tsd_t *tsd);
+void tsd_postfork_parent(tsd_t *tsd);
+void tsd_postfork_child(tsd_t *tsd);
+
+/*
+ * Call ..._inc when your module wants to take all threads down the slow paths,
+ * and ..._dec when it no longer needs to.
+ */
+void tsd_global_slow_inc(tsdn_t *tsdn);
+void tsd_global_slow_dec(tsdn_t *tsdn);
+bool tsd_global_slow();
+
 enum {
-	tsd_state_nominal = 0, /* Common case --> jnz. */
-	tsd_state_nominal_slow = 1, /* Initialized but on slow path. */
-	/* the above 2 nominal states should be lower values. */
-	tsd_state_nominal_max = 1, /* used for comparison only. */
-	tsd_state_minimal_initialized = 2,
-	tsd_state_purgatory = 3,
-	tsd_state_reincarnated = 4,
-	tsd_state_uninitialized = 5
+	/* Common case --> jnz. */
+	tsd_state_nominal = 0,
+	/* Initialized but on slow path. */
+	tsd_state_nominal_slow = 1,
+	/*
+	 * Some thread has changed global state in such a way that all nominal
+	 * threads need to recompute their fast / slow status the next time they
+	 * get a chance.
+	 *
+	 * Any thread can change another thread's status *to* recompute, but
+	 * threads are the only ones who can change their status *from*
+	 * recompute.
+	 */
+	tsd_state_nominal_recompute = 2,
+	/*
+	 * The above nominal states should be lower values.  We use
+	 * tsd_state_nominal_max to separate nominal states from threads
+	 * in the process of being born / dying.
+	 */
+	tsd_state_nominal_max = 2,
+
+	/*
+	 * A thread might free() during its death as its only allocator action;
+	 * in such scenarios, we need tsd, but set up in such a way that no
+	 * cleanup is necessary.
+	 */
+	tsd_state_minimal_initialized = 3,
+	/* States during which we know we're in thread death. */
+	tsd_state_purgatory = 4,
+	tsd_state_reincarnated = 5,
+	/*
+	 * What it says on the tin; tsd that hasn't been initialized.  Note
+	 * that even when the tsd struct lives in TLS, we need to keep track
+	 * of stuff like whether or not our pthread destructors have been
+	 * scheduled, so this really truly is different than the nominal state.
+	 */
+	tsd_state_uninitialized = 6
 };
 
-/* Manually limit tsd_state_t to a single byte. */
-typedef uint8_t tsd_state_t;
+/*
+ * Some TSD accesses can only be done in a nominal state.  To enforce this, we
+ * wrap TSD member access in a function that asserts on TSD state, and mangle
+ * field names to prevent touching them accidentally.
+ */
+#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
+
+#ifdef JEMALLOC_U8_ATOMICS
+#  define tsd_state_t atomic_u8_t
+#  define tsd_atomic_load atomic_load_u8
+#  define tsd_atomic_store atomic_store_u8
+#  define tsd_atomic_exchange atomic_exchange_u8
+#else
+#  define tsd_state_t atomic_u32_t
+#  define tsd_atomic_load atomic_load_u32
+#  define tsd_atomic_store atomic_store_u32
+#  define tsd_atomic_exchange atomic_exchange_u32
+#endif
 
 /* The actual tsd. */
 struct tsd_s {
@@ -117,13 +188,29 @@ struct tsd_s {
 	 * module.  Access any thread-local state through the getters and
 	 * setters below.
 	 */
-	tsd_state_t	state;
+
+	/*
+	 * We manually limit the state to just a single byte, unless the 8-bit
+	 * atomics are unavailable (which is rare).
+	 */
+	tsd_state_t state;
 #define O(n, t, nt)							\
-	t use_a_getter_or_setter_instead_##n;
+	t TSD_MANGLE(n);
 MALLOC_TSD
 #undef O
 };
 
+JEMALLOC_ALWAYS_INLINE uint8_t
+tsd_state_get(tsd_t *tsd) {
+	/*
+	 * This should be atomic.  Unfortunately, compilers right now can't tell
+	 * that this can be done as a memory comparison, and force a load into
+	 * a register that hurts fast-path performance.
+	 */
+	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
+	return *(uint8_t *)&tsd->state;
+}
+
 /*
  * Wrapper around tsd_t that makes it possible to avoid implicit conversion
  * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
@@ -150,15 +237,6 @@ tsdn_tsd(tsdn_t *tsdn) {
 	return &tsdn->tsd;
 }
 
-void *malloc_tsd_malloc(size_t size);
-void malloc_tsd_dalloc(void *wrapper);
-void malloc_tsd_cleanup_register(bool (*f)(void));
-tsd_t *malloc_tsd_boot0(void);
-void malloc_tsd_boot1(void);
-void tsd_cleanup(void *arg);
-tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
-void tsd_slow_update(tsd_t *tsd);
-
 /*
  * We put the platform-specific data declarations and inlines into their own
  * header files to avoid cluttering this file.  They define tsd_boot0,
@@ -182,7 +260,7 @@ void tsd_slow_update(tsd_t *tsd);
 #define O(n, t, nt)							\
 JEMALLOC_ALWAYS_INLINE t *						\
 tsd_##n##p_get_unsafe(tsd_t *tsd) {					\
-	return &tsd->use_a_getter_or_setter_instead_##n;		\
+	return &tsd->TSD_MANGLE(n);					\
 }
 MALLOC_TSD
 #undef O
@@ -191,10 +269,16 @@ MALLOC_TSD
 #define O(n, t, nt)							\
 JEMALLOC_ALWAYS_INLINE t *						\
 tsd_##n##p_get(tsd_t *tsd) {						\
-	assert(tsd->state == tsd_state_nominal ||			\
-	    tsd->state == tsd_state_nominal_slow ||			\
-	    tsd->state == tsd_state_reincarnated ||			\
-	    tsd->state == tsd_state_minimal_initialized);		\
+	/*								\
+	 * Because the state might change asynchronously if it's	\
+	 * nominal, we need to make sure that we only read it once.	\
+	 */								\
+	uint8_t state = tsd_state_get(tsd);				\
+	assert(state == tsd_state_nominal ||				\
+	    state == tsd_state_nominal_slow ||				\
+	    state == tsd_state_nominal_recompute ||			\
+	    state == tsd_state_reincarnated ||				\
+	    state == tsd_state_minimal_initialized);			\
 	return tsd_##n##p_get_unsafe(tsd);				\
 }
 MALLOC_TSD
@@ -229,8 +313,8 @@ MALLOC_TSD
 #define O(n, t, nt)							\
 JEMALLOC_ALWAYS_INLINE void						\
 tsd_##n##_set(tsd_t *tsd, t val) {					\
-	assert(tsd->state != tsd_state_reincarnated &&			\
-	    tsd->state != tsd_state_minimal_initialized);		\
+	assert(tsd_state_get(tsd) != tsd_state_reincarnated &&		\
+	    tsd_state_get(tsd) != tsd_state_minimal_initialized);	\
 	*tsd_##n##p_get(tsd) = val;					\
 }
 MALLOC_TSD
@@ -238,13 +322,18 @@ MALLOC_TSD
 
 JEMALLOC_ALWAYS_INLINE void
 tsd_assert_fast(tsd_t *tsd) {
+	/*
+	 * Note that our fastness assertion does *not* include global slowness
+	 * counters; it's not in general possible to ensure that they won't
+	 * change asynchronously from underneath us.
+	 */
 	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
 	    tsd_reentrancy_level_get(tsd) == 0);
 }
 
 JEMALLOC_ALWAYS_INLINE bool
 tsd_fast(tsd_t *tsd) {
-	bool fast = (tsd->state == tsd_state_nominal);
+	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
 	if (fast) {
 		tsd_assert_fast(tsd);
 	}
@@ -261,7 +350,7 @@ tsd_fetch_impl(bool init, bool minimal) {
 	}
 	assert(tsd != NULL);
 
-	if (unlikely(tsd->state != tsd_state_nominal)) {
+	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
 		return tsd_fetch_slow(tsd, minimal);
 	}
 	assert(tsd_fast(tsd));
@@ -281,7 +370,7 @@ JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_internal_fetch(void) {
 	tsd_t *tsd = tsd_fetch_min();
 	/* Use reincarnated state to prevent full initialization. */
-	tsd->state = tsd_state_reincarnated;
+	tsd_state_set(tsd, tsd_state_reincarnated);
 
 	return tsd;
 }
@@ -293,7 +382,7 @@ tsd_fetch(void) {
 
 static inline bool
 tsd_nominal(tsd_t *tsd) {
-	return (tsd->state <= tsd_state_nominal_max);
+	return (tsd_state_get(tsd) <= tsd_state_nominal_max);
 }
 
 JEMALLOC_ALWAYS_INLINE tsdn_t *
diff --git a/include/jemalloc/internal/tsd_generic.h b/include/jemalloc/internal/tsd_generic.h
index 1e52ef76..cf73c0c7 100644
--- a/include/jemalloc/internal/tsd_generic.h
+++ b/include/jemalloc/internal/tsd_generic.h
@@ -77,7 +77,10 @@ tsd_wrapper_get(bool init) {
 			abort();
 		} else {
 			wrapper->initialized = false;
+			JEMALLOC_DIAGNOSTIC_PUSH
+			JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
 			tsd_t initializer = TSD_INITIALIZER;
+			JEMALLOC_DIAGNOSTIC_POP
 			wrapper->val = initializer;
 		}
 		tsd_wrapper_set(wrapper);
@@ -107,7 +110,10 @@ tsd_boot1(void) {
 	tsd_boot_wrapper.initialized = false;
 	tsd_cleanup(&tsd_boot_wrapper.val);
 	wrapper->initialized = false;
+	JEMALLOC_DIAGNOSTIC_PUSH
+	JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
 	tsd_t initializer = TSD_INITIALIZER;
+	JEMALLOC_DIAGNOSTIC_POP
 	wrapper->val = initializer;
 	tsd_wrapper_set(wrapper);
 }
diff --git a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
index beb467a6..bf8801ef 100644
--- a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
+++ b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
@@ -47,7 +47,6 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	assert(tsd_booted);
 	return &tsd_tls;
 }
 JEMALLOC_ALWAYS_INLINE void
diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h
index 0de64b7b..f4f165c7 100644
--- a/include/jemalloc/internal/tsd_tls.h
+++ b/include/jemalloc/internal/tsd_tls.h
@@ -39,8 +39,7 @@ tsd_get_allocates(void) {
 
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
-tsd_get(UNUSED bool init) {
-	assert(tsd_booted);
+tsd_get(bool init) {
 	return &tsd_tls;
 }
 
diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h
index 7ace8ae4..fff9e98c 100644
--- a/include/jemalloc/internal/witness.h
+++ b/include/jemalloc/internal/witness.h
@@ -27,9 +27,9 @@
 #define WITNESS_RANK_PROF_BT2GCTX	6U
 #define WITNESS_RANK_PROF_TDATAS	7U
 #define WITNESS_RANK_PROF_TDATA		8U
-#define WITNESS_RANK_PROF_GCTX		9U
-
-#define WITNESS_RANK_BACKGROUND_THREAD	10U
+#define WITNESS_RANK_PROF_LOG		9U
+#define WITNESS_RANK_PROF_GCTX		10U
+#define WITNESS_RANK_BACKGROUND_THREAD	11U
 
 /*
  * Used as an argument to witness_assert_depth_to_rank() in order to validate
@@ -37,18 +37,19 @@
  * witness_assert_depth_to_rank() is inclusive rather than exclusive, this
  * definition can have the same value as the minimally ranked core lock.
  */
-#define WITNESS_RANK_CORE		11U
+#define WITNESS_RANK_CORE		12U
 
-#define WITNESS_RANK_DECAY		11U
-#define WITNESS_RANK_TCACHE_QL		12U
-#define WITNESS_RANK_EXTENT_GROW	13U
-#define WITNESS_RANK_EXTENTS		14U
-#define WITNESS_RANK_EXTENT_AVAIL	15U
+#define WITNESS_RANK_DECAY		12U
+#define WITNESS_RANK_TCACHE_QL		13U
+#define WITNESS_RANK_EXTENT_GROW	14U
+#define WITNESS_RANK_EXTENTS		15U
+#define WITNESS_RANK_EXTENT_AVAIL	16U
 
-#define WITNESS_RANK_EXTENT_POOL	16U
-#define WITNESS_RANK_RTREE		17U
-#define WITNESS_RANK_BASE		18U
-#define WITNESS_RANK_ARENA_LARGE	19U
+#define WITNESS_RANK_EXTENT_POOL	17U
+#define WITNESS_RANK_RTREE		18U
+#define WITNESS_RANK_BASE		19U
+#define WITNESS_RANK_ARENA_LARGE	20U
+#define WITNESS_RANK_HOOK		21U
 
 #define WITNESS_RANK_LEAF		0xffffffffU
 #define WITNESS_RANK_BIN		WITNESS_RANK_LEAF
diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in
index aee55438..a00ce11a 100644
--- a/include/jemalloc/jemalloc_macros.h.in
+++ b/include/jemalloc/jemalloc_macros.h.in
@@ -10,6 +10,7 @@
 #define JEMALLOC_VERSION_BUGFIX @jemalloc_version_bugfix@
 #define JEMALLOC_VERSION_NREV @jemalloc_version_nrev@
 #define JEMALLOC_VERSION_GID "@jemalloc_version_gid@"
+#define JEMALLOC_VERSION_GID_IDENT @jemalloc_version_gid@
 
 #define MALLOCX_LG_ALIGN(la)	((int)(la))
 #if LG_SIZEOF_PTR == 2
diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj
index f7b175b0..ddc6781c 100644
--- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj
+++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj
@@ -47,7 +47,7 @@
     <ClCompile Include="..\..\..\..\src\extent_dss.c" />
     <ClCompile Include="..\..\..\..\src\extent_mmap.c" />
     <ClCompile Include="..\..\..\..\src\hash.c" />
-    <ClCompile Include="..\..\..\..\src\hooks.c" />
+    <ClCompile Include="..\..\..\..\src\hook.c" />
     <ClCompile Include="..\..\..\..\src\jemalloc.c" />
     <ClCompile Include="..\..\..\..\src\large.c" />
     <ClCompile Include="..\..\..\..\src\log.c" />
@@ -59,6 +59,7 @@
     <ClCompile Include="..\..\..\..\src\prng.c" />
     <ClCompile Include="..\..\..\..\src\prof.c" />
     <ClCompile Include="..\..\..\..\src\rtree.c" />
+    <ClCompile Include="..\..\..\..\src\sc.c" />
     <ClCompile Include="..\..\..\..\src\stats.c" />
     <ClCompile Include="..\..\..\..\src\sz.c" />
     <ClCompile Include="..\..\..\..\src\tcache.c" />
diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters
index 11cfcd0b..1dcf4ed5 100644
--- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters
+++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters
@@ -37,7 +37,7 @@
     <ClCompile Include="..\..\..\..\src\hash.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\..\src\hooks.c">
+    <ClCompile Include="..\..\..\..\src\hook.c">
       <Filter>Source Files</Filter>
     </ClCompile>
     <ClCompile Include="..\..\..\..\src\jemalloc.c">
@@ -70,6 +70,9 @@
     <ClCompile Include="..\..\..\..\src\rtree.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\..\src\sc.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\..\src\stats.c">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj
index ed71de8a..21481d5e 100644
--- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj
+++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj
@@ -47,7 +47,7 @@
     <ClCompile Include="..\..\..\..\src\extent_dss.c" />
     <ClCompile Include="..\..\..\..\src\extent_mmap.c" />
     <ClCompile Include="..\..\..\..\src\hash.c" />
-    <ClCompile Include="..\..\..\..\src\hooks.c" />
+    <ClCompile Include="..\..\..\..\src\hook.c" />
     <ClCompile Include="..\..\..\..\src\jemalloc.c" />
     <ClCompile Include="..\..\..\..\src\large.c" />
     <ClCompile Include="..\..\..\..\src\log.c" />
@@ -59,9 +59,11 @@
     <ClCompile Include="..\..\..\..\src\prng.c" />
     <ClCompile Include="..\..\..\..\src\prof.c" />
     <ClCompile Include="..\..\..\..\src\rtree.c" />
+    <ClCompile Include="..\..\..\..\src\sc.c" />
     <ClCompile Include="..\..\..\..\src\stats.c" />
     <ClCompile Include="..\..\..\..\src\sz.c" />
     <ClCompile Include="..\..\..\..\src\tcache.c" />
+    <ClCompile Include="..\..\..\..\src\test_hooks.c" />
     <ClCompile Include="..\..\..\..\src\ticker.c" />
     <ClCompile Include="..\..\..\..\src\tsd.c" />
     <ClCompile Include="..\..\..\..\src\witness.c" />
diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters
index 11cfcd0b..466dc63f 100644
--- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters
+++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters
@@ -37,7 +37,7 @@
     <ClCompile Include="..\..\..\..\src\hash.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\..\src\hooks.c">
+    <ClCompile Include="..\..\..\..\src\hook.c">
       <Filter>Source Files</Filter>
     </ClCompile>
     <ClCompile Include="..\..\..\..\src\jemalloc.c">
@@ -70,6 +70,9 @@
     <ClCompile Include="..\..\..\..\src\rtree.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\..\src\sc.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\..\src\stats.c">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -97,5 +100,8 @@
     <ClCompile Include="..\..\..\..\src\div.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\..\src\test_hooks.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/scripts/gen_run_tests.py b/scripts/gen_run_tests.py
index a87ecffb..5052b3e0 100755
--- a/scripts/gen_run_tests.py
+++ b/scripts/gen_run_tests.py
@@ -4,6 +4,7 @@ import sys
 from itertools import combinations
 from os import uname
 from multiprocessing import cpu_count
+from subprocess import call
 
 # Later, we want to test extended vaddr support.  Apparently, the "real" way of
 # checking this is flaky on OS X.
@@ -13,13 +14,25 @@ nparallel = cpu_count() * 2
 
 uname = uname()[0]
 
+if "BSD" in uname:
+    make_cmd = 'gmake'
+else:
+    make_cmd = 'make'
+
 def powerset(items):
     result = []
     for i in xrange(len(items) + 1):
         result += combinations(items, i)
     return result
 
-possible_compilers = [('gcc', 'g++'), ('clang', 'clang++')]
+possible_compilers = []  # filled with the (cc, cxx) pairs whose cc runs here
+for cc, cxx in (['gcc', 'g++'], ['clang', 'clang++']):
+    try:
+        cmd_ret = call([cc, "-v"])
+        if cmd_ret == 0:
+            possible_compilers.append((cc, cxx))
+    except OSError:  # call() could not execute cc (e.g. not installed)
+        pass
 possible_compiler_opts = [
     '-m32',
 ]
@@ -39,7 +52,7 @@ possible_malloc_conf_opts = [
 ]
 
 print 'set -e'
-print 'if [ -f Makefile ] ; then make relclean ; fi'
+print 'if [ -f Makefile ] ; then %(make_cmd)s relclean ; fi' % {'make_cmd': make_cmd}
 print 'autoconf'
 print 'rm -rf run_tests.out'
 print 'mkdir run_tests.out'
@@ -102,11 +115,11 @@ cd run_test_%(ind)d.out
 echo "==> %(config_line)s" >> run_test.log
 %(config_line)s >> run_test.log 2>&1 || abort
 
-run_cmd make all tests
-run_cmd make check
-run_cmd make distclean
+run_cmd %(make_cmd)s all tests
+run_cmd %(make_cmd)s check
+run_cmd %(make_cmd)s distclean
 EOF
-chmod 755 run_test_%(ind)d.sh""" % {'ind': ind, 'config_line': config_line}
+chmod 755 run_test_%(ind)d.sh""" % {'ind': ind, 'config_line': config_line, 'make_cmd': make_cmd}
                     ind += 1
 
 print 'for i in `seq 0 %(last_ind)d` ; do echo run_test_${i}.sh ; done | xargs -P %(nparallel)d -n 1 sh' % {'last_ind': ind-1, 'nparallel': nparallel}
diff --git a/scripts/gen_travis.py b/scripts/gen_travis.py
index 6dd39290..65b0b67c 100755
--- a/scripts/gen_travis.py
+++ b/scripts/gen_travis.py
@@ -4,6 +4,7 @@ from itertools import combinations
 
 travis_template = """\
 language: generic
+dist: precise
 
 matrix:
   include:
@@ -11,6 +12,7 @@ matrix:
 
 before_script:
   - autoconf
+  - scripts/gen_travis.py > travis_script && diff .travis.yml travis_script
   - ./configure ${COMPILER_FLAGS:+ \
       CC="$CC $COMPILER_FLAGS" \
       CXX="$CXX $COMPILER_FLAGS" } \
@@ -43,6 +45,7 @@ configure_flag_unusuals = [
     '--enable-debug',
     '--enable-prof',
     '--disable-stats',
+    '--disable-libdl',
 ]
 
 malloc_conf_unusuals = [
@@ -61,47 +64,85 @@ unusual_combinations_to_test = []
 for i in xrange(MAX_UNUSUAL_OPTIONS + 1):
     unusual_combinations_to_test += combinations(all_unusuals, i)
 
-include_rows = ""
-for unusual_combination in unusual_combinations_to_test:
-    os = os_default
-    if os_unusual in unusual_combination:
-        os = os_unusual
+gcc_multilib_set = False
+# Formats a job from a combination of flags
+def format_job(combination):
+    global gcc_multilib_set
 
-    compilers = compilers_default
-    if compilers_unusual in unusual_combination:
-        compilers = compilers_unusual
+    os = os_unusual if os_unusual in combination else os_default
+    compilers = compilers_unusual if compilers_unusual in combination else compilers_default
 
-    compiler_flags = [
-        x for x in unusual_combination if x in compiler_flag_unusuals]
+    compiler_flags = [x for x in combination if x in compiler_flag_unusuals]
+    configure_flags = [x for x in combination if x in configure_flag_unusuals]
+    malloc_conf = [x for x in combination if x in malloc_conf_unusuals]
 
-    configure_flags = [
-        x for x in unusual_combination if x in configure_flag_unusuals]
-
-    malloc_conf = [
-        x for x in unusual_combination if x in malloc_conf_unusuals]
     # Filter out unsupported configurations on OS X.
     if os == 'osx' and ('dss:primary' in malloc_conf or \
       'percpu_arena:percpu' in malloc_conf or 'background_thread:true' \
       in malloc_conf):
-        continue
+        return ""
     if len(malloc_conf) > 0:
         configure_flags.append('--with-malloc-conf=' + ",".join(malloc_conf))
 
     # Filter out an unsupported configuration - heap profiling on OS X.
     if os == 'osx' and '--enable-prof' in configure_flags:
-        continue
+        return ""
 
     # We get some spurious errors when -Warray-bounds is enabled.
     env_string = ('{} COMPILER_FLAGS="{}" CONFIGURE_FLAGS="{}" '
 	'EXTRA_CFLAGS="-Werror -Wno-array-bounds"').format(
         compilers, " ".join(compiler_flags), " ".join(configure_flags))
 
-    include_rows += '    - os: %s\n' % os
-    include_rows += '      env: %s\n' % env_string
-    if '-m32' in unusual_combination and os == 'linux':
-        include_rows += '      addons:\n'
-	include_rows += '        apt:\n'
-	include_rows += '          packages:\n'
-	include_rows += '            - gcc-multilib\n'
+    job = ""
+    job += '    - os: %s\n' % os
+    job += '      env: %s\n' % env_string
+    if '-m32' in combination and os == 'linux':
+        job += '      addons:'
+        if gcc_multilib_set:
+            job += ' *gcc_multilib\n'
+        else:
+            job += ' &gcc_multilib\n'
+            job += '        apt:\n'
+            job += '          packages:\n'
+            job += '            - gcc-multilib\n'
+            gcc_multilib_set = True
+    return job
+
+include_rows = ""
+for combination in unusual_combinations_to_test:
+    include_rows += format_job(combination)
+
+# Development build
+include_rows += '''\
+    # Development build
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+'''
+
+# Enable-experimental-smallocx
+include_rows += '''\
+    # --enable-expermental-smallocx:
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+'''
+
+# Valgrind build bots
+include_rows += '''
+    # Valgrind
+    - os: linux
+      env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" JEMALLOC_TEST_PREFIX="valgrind"
+      addons:
+        apt:
+          packages:
+            - valgrind
+'''
+
+# To enable valgrind on macosx add:
+#
+#  - os: osx
+#    env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" JEMALLOC_TEST_PREFIX="valgrind"
+#    install: brew install valgrind
+#
+# It currently fails due to: https://github.com/jemalloc/jemalloc/issues/1274
 
 print travis_template % include_rows
diff --git a/src/arena.c b/src/arena.c
index 5d55bf1a..60eac232 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -8,9 +8,10 @@
 #include "jemalloc/internal/extent_mmap.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/rtree.h"
-#include "jemalloc/internal/size_classes.h"
 #include "jemalloc/internal/util.h"
 
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
 /******************************************************************************/
 /* Data. */
 
@@ -40,7 +41,11 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
 #undef STEP
 };
 
-static div_info_t arena_binind_div_info[NBINS];
+static div_info_t arena_binind_div_info[SC_NBINS];
+
+size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
+size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
+static unsigned huge_arena_ind;
 
 /******************************************************************************/
 /*
@@ -61,7 +66,7 @@ static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 /******************************************************************************/
 
 void
-arena_basic_stats_merge(UNUSED tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
+arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy) {
 	*nthreads += arena_nthreads_get(arena, false);
@@ -77,7 +82,8 @@ void
 arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
-    bin_stats_t *bstats, arena_stats_large_t *lstats) {
+    bin_stats_t *bstats, arena_stats_large_t *lstats,
+    arena_stats_extents_t *estats) {
 	cassert(config_stats);
 
 	arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms,
@@ -94,6 +100,10 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	arena_stats_accum_zu(&astats->retained,
 	    extents_npages_get(&arena->extents_retained) << LG_PAGE);
 
+	atomic_store_zu(&astats->extent_avail,
+	    atomic_load_zu(&arena->extent_avail_cnt, ATOMIC_RELAXED),
+	    ATOMIC_RELAXED);
+
 	arena_stats_accum_u64(&astats->decay_dirty.npurge,
 	    arena_stats_read_u64(tsdn, &arena->stats,
 	    &arena->stats.decay_dirty.npurge));
@@ -122,7 +132,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	    extents_npages_get(&arena->extents_dirty) +
 	    extents_npages_get(&arena->extents_muzzy)) << LG_PAGE)));
 
-	for (szind_t i = 0; i < NSIZES - NBINS; i++) {
+	for (szind_t i = 0; i < SC_NSIZES - SC_NBINS; i++) {
 		uint64_t nmalloc = arena_stats_read_u64(tsdn, &arena->stats,
 		    &arena->stats.lstats[i].nmalloc);
 		arena_stats_accum_u64(&lstats[i].nmalloc, nmalloc);
@@ -145,7 +155,29 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 		size_t curlextents = (size_t)(nmalloc - ndalloc);
 		lstats[i].curlextents += curlextents;
 		arena_stats_accum_zu(&astats->allocated_large,
-		    curlextents * sz_index2size(NBINS + i));
+		    curlextents * sz_index2size(SC_NBINS + i));
+	}
+
+	for (pszind_t i = 0; i < SC_NPSIZES; i++) {
+		size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes,
+		    retained_bytes;
+		dirty = extents_nextents_get(&arena->extents_dirty, i);
+		muzzy = extents_nextents_get(&arena->extents_muzzy, i);
+		retained = extents_nextents_get(&arena->extents_retained, i);
+		dirty_bytes = extents_nbytes_get(&arena->extents_dirty, i);
+		muzzy_bytes = extents_nbytes_get(&arena->extents_muzzy, i);
+		retained_bytes =
+		    extents_nbytes_get(&arena->extents_retained, i);
+
+		atomic_store_zu(&estats[i].ndirty, dirty, ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].nmuzzy, muzzy, ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].nretained, retained, ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].dirty_bytes, dirty_bytes,
+		    ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].muzzy_bytes, muzzy_bytes,
+		    ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].retained_bytes, retained_bytes,
+		    ATOMIC_RELAXED);
 	}
 
 	arena_stats_unlock(tsdn, &arena->stats);
@@ -156,7 +188,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	cache_bin_array_descriptor_t *descriptor;
 	ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) {
 		szind_t i = 0;
-		for (; i < NBINS; i++) {
+		for (; i < SC_NBINS; i++) {
 			cache_bin_t *tbin = &descriptor->bins_small[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
 			    tbin->ncached * sz_index2size(i));
@@ -200,8 +232,11 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	nstime_update(&astats->uptime);
 	nstime_subtract(&astats->uptime, &arena->create_time);
 
-	for (szind_t i = 0; i < NBINS; i++) {
-		bin_stats_merge(tsdn, &bstats[i], &arena->bins[i]);
+	for (szind_t i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_stats_merge(tsdn, &bstats[i],
+			    &arena->bins[i].bin_shards[j]);
+		}
 	}
 }
 
@@ -236,6 +271,54 @@ arena_slab_reg_alloc(extent_t *slab, const bin_info_t *bin_info) {
 	return ret;
 }
 
+static void
+arena_slab_reg_alloc_batch(extent_t *slab, const bin_info_t *bin_info,
+			   unsigned cnt, void** ptrs) {
+	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
+
+	assert(extent_nfree_get(slab) >= cnt);
+	assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info));
+
+#if (! defined JEMALLOC_INTERNAL_POPCOUNTL) || (defined BITMAP_USE_TREE)
+	for (unsigned i = 0; i < cnt; i++) {
+		size_t regind = bitmap_sfu(slab_data->bitmap,
+					   &bin_info->bitmap_info);
+		*(ptrs + i) = (void *)((uintptr_t)extent_addr_get(slab) +
+		    (uintptr_t)(bin_info->reg_size * regind));
+	}
+#else
+	unsigned group = 0;
+	bitmap_t g = slab_data->bitmap[group];
+	unsigned i = 0;
+	while (i < cnt) {
+		while (g == 0) {
+			g = slab_data->bitmap[++group];
+		}
+		size_t shift = group << LG_BITMAP_GROUP_NBITS;
+		size_t pop = popcount_lu(g);
+		if (pop > (cnt - i)) {
+			pop = cnt - i;
+		}
+
+		/*
+		 * Load from memory locations only once, outside the
+		 * hot loop below.
+		 */
+		uintptr_t base = (uintptr_t)extent_addr_get(slab);
+		uintptr_t regsize = (uintptr_t)bin_info->reg_size;
+		while (pop--) {
+			size_t bit = cfs_lu(&g);
+			size_t regind = shift + bit;
+			*(ptrs + i) = (void *)(base + regsize * regind);
+
+			i++;
+		}
+		slab_data->bitmap[group] = g;
+	}
+#endif
+	extent_nfree_sub(slab, cnt);
+}
+
 #ifndef JEMALLOC_JET
 static
 #endif
@@ -291,11 +374,11 @@ arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
 
 	cassert(config_stats);
 
-	if (usize < LARGE_MINCLASS) {
-		usize = LARGE_MINCLASS;
+	if (usize < SC_LARGE_MINCLASS) {
+		usize = SC_LARGE_MINCLASS;
 	}
 	index = sz_size2index(usize);
-	hindex = (index >= NBINS) ? index - NBINS : 0;
+	hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0;
 
 	arena_stats_add_u64(tsdn, &arena->stats,
 	    &arena->stats.lstats[hindex].nmalloc, 1);
@@ -307,11 +390,11 @@ arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
 
 	cassert(config_stats);
 
-	if (usize < LARGE_MINCLASS) {
-		usize = LARGE_MINCLASS;
+	if (usize < SC_LARGE_MINCLASS) {
+		usize = SC_LARGE_MINCLASS;
 	}
 	index = sz_size2index(usize);
-	hindex = (index >= NBINS) ? index - NBINS : 0;
+	hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0;
 
 	arena_stats_add_u64(tsdn, &arena->stats,
 	    &arena->stats.lstats[hindex].ndalloc, 1);
@@ -324,6 +407,11 @@ arena_large_ralloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t oldusize,
 	arena_large_malloc_stats_update(tsdn, arena, usize);
 }
 
+static bool
+arena_may_have_muzzy(arena_t *arena) {
+	return (pages_can_purge_lazy && (arena_muzzy_decay_ms_get(arena) != 0));
+}
+
 extent_t *
 arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize,
     size_t alignment, bool *zero) {
@@ -338,7 +426,7 @@ arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize,
 	extent_t *extent = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_dirty, NULL, usize, sz_large_pad, alignment, false,
 	    szind, zero, &commit);
-	if (extent == NULL) {
+	if (extent == NULL && arena_may_have_muzzy(arena)) {
 		extent = extents_alloc(tsdn, arena, &extent_hooks,
 		    &arena->extents_muzzy, NULL, usize, sz_large_pad, alignment,
 		    false, szind, zero, &commit);
@@ -743,7 +831,7 @@ static size_t
 arena_decay_stashed(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, arena_decay_t *decay, extents_t *extents,
     bool all, extent_list_t *decay_extents, bool is_background_thread) {
-	UNUSED size_t nmadvise, nunmapped;
+	size_t nmadvise, nunmapped;
 	size_t npurged;
 
 	if (config_stats) {
@@ -834,7 +922,7 @@ arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 	size_t npurge = arena_stash_decayed(tsdn, arena, &extent_hooks, extents,
 	    npages_limit, npages_decay_max, &decay_extents);
 	if (npurge != 0) {
-		UNUSED size_t npurged = arena_decay_stashed(tsdn, arena,
+		size_t npurged = arena_decay_stashed(tsdn, arena,
 		    &extent_hooks, decay, extents, all, &decay_extents,
 		    is_background_thread);
 		assert(npurged == npurge);
@@ -863,7 +951,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 
 	bool epoch_advanced = arena_maybe_decay(tsdn, arena, decay, extents,
 	    is_background_thread);
-	UNUSED size_t npages_new;
+	size_t npages_new;
 	if (epoch_advanced) {
 		/* Backlog is updated on epoch advance. */
 		npages_new = decay->backlog[SMOOTHSTEP_NSTEPS-1];
@@ -954,6 +1042,37 @@ arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, extent_t *slab) {
 	extent_list_remove(&bin->slabs_full, slab);
 }
 
+static void
+arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
+	extent_t *slab;
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	if (bin->slabcur != NULL) {
+		slab = bin->slabcur;
+		bin->slabcur = NULL;
+		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+		arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
+		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	}
+	while ((slab = extent_heap_remove_first(&bin->slabs_nonfull)) != NULL) {
+		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+		arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
+		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	}
+	for (slab = extent_list_first(&bin->slabs_full); slab != NULL;
+	     slab = extent_list_first(&bin->slabs_full)) {
+		arena_bin_slabs_full_remove(arena, bin, slab);
+		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+		arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
+		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	}
+	if (config_stats) {
+		bin->stats.curregs = 0;
+		bin->stats.curslabs = 0;
+	}
+	malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+}
+
 void
 arena_reset(tsd_t *tsd, arena_t *arena) {
 	/*
@@ -983,7 +1102,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
 		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 		rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 		    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-		assert(alloc_ctx.szind != NSIZES);
+		assert(alloc_ctx.szind != SC_NSIZES);
 
 		if (config_stats || (config_prof && opt_prof)) {
 			usize = sz_index2size(alloc_ctx.szind);
@@ -999,35 +1118,11 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
 	malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx);
 
 	/* Bins. */
-	for (unsigned i = 0; i < NBINS; i++) {
-		extent_t *slab;
-		bin_t *bin = &arena->bins[i];
-		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		if (bin->slabcur != NULL) {
-			slab = bin->slabcur;
-			bin->slabcur = NULL;
-			malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
-			arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
-			malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			arena_bin_reset(tsd, arena,
+			    &arena->bins[i].bin_shards[j]);
 		}
-		while ((slab = extent_heap_remove_first(&bin->slabs_nonfull)) !=
-		    NULL) {
-			malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
-			arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
-			malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		}
-		for (slab = extent_list_first(&bin->slabs_full); slab != NULL;
-		    slab = extent_list_first(&bin->slabs_full)) {
-			arena_bin_slabs_full_remove(arena, bin, slab);
-			malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
-			arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
-			malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		}
-		if (config_stats) {
-			bin->stats.curregs = 0;
-			bin->stats.curslabs = 0;
-		}
-		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
 	}
 
 	atomic_store_zu(&arena->nactive, 0, ATOMIC_RELAXED);
@@ -1112,7 +1207,7 @@ arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena,
 }
 
 static extent_t *
-arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned binshard,
     const bin_info_t *bin_info) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
@@ -1124,7 +1219,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
 	extent_t *slab = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_dirty, NULL, bin_info->slab_size, 0, PAGE, true,
 	    binind, &zero, &commit);
-	if (slab == NULL) {
+	if (slab == NULL && arena_may_have_muzzy(arena)) {
 		slab = extents_alloc(tsdn, arena, &extent_hooks,
 		    &arena->extents_muzzy, NULL, bin_info->slab_size, 0, PAGE,
 		    true, binind, &zero, &commit);
@@ -1140,7 +1235,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
 
 	/* Initialize slab internals. */
 	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
-	extent_nfree_set(slab, bin_info->nregs);
+	extent_nfree_binshard_set(slab, bin_info->nregs, binshard);
 	bitmap_init(slab_data->bitmap, &bin_info->bitmap_info, false);
 
 	arena_nactive_add(arena, extent_size_get(slab) >> LG_PAGE);
@@ -1150,7 +1245,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
 
 static extent_t *
 arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
-    szind_t binind) {
+    szind_t binind, unsigned binshard) {
 	extent_t *slab;
 	const bin_info_t *bin_info;
 
@@ -1166,7 +1261,7 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
 	/* Allocate a new slab. */
 	malloc_mutex_unlock(tsdn, &bin->lock);
 	/******************************/
-	slab = arena_slab_alloc(tsdn, arena, binind, bin_info);
+	slab = arena_slab_alloc(tsdn, arena, binind, binshard, bin_info);
 	/********************************/
 	malloc_mutex_lock(tsdn, &bin->lock);
 	if (slab != NULL) {
@@ -1193,7 +1288,7 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
 /* Re-fill bin->slabcur, then call arena_slab_reg_alloc(). */
 static void *
 arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
-    szind_t binind) {
+    szind_t binind, unsigned binshard) {
 	const bin_info_t *bin_info;
 	extent_t *slab;
 
@@ -1202,7 +1297,7 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
 		arena_bin_slabs_full_insert(arena, bin, bin->slabcur);
 		bin->slabcur = NULL;
 	}
-	slab = arena_bin_nonfull_slab_get(tsdn, arena, bin, binind);
+	slab = arena_bin_nonfull_slab_get(tsdn, arena, bin, binind, binshard);
 	if (bin->slabcur != NULL) {
 		/*
 		 * Another thread updated slabcur while this one ran without the
@@ -1246,46 +1341,75 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
 	return arena_slab_reg_alloc(slab, bin_info);
 }
 
+/* Choose a bin shard (stored in *binshard); return that bin, locked. */
+bin_t *
+arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    unsigned *binshard) {
+	bin_t *bin;
+	if (tsdn_null(tsdn) || tsd_arena_get(tsdn_tsd(tsdn)) == NULL) {
+		*binshard = 0;	/* No tsd, or the tsd has no arena. */
+	} else {
+		*binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
+	}
+	assert(*binshard < bin_infos[binind].n_shards);
+	bin = &arena->bins[binind].bin_shards[*binshard];
+	malloc_mutex_lock(tsdn, &bin->lock);	/* Still held on return. */
+
+	return bin;
+}
+
 void
 arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
-	unsigned i, nfill;
-	bin_t *bin;
+	unsigned i, nfill, cnt;
 
 	assert(tbin->ncached == 0);
 
 	if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
 		prof_idump(tsdn);
 	}
-	bin = &arena->bins[binind];
-	malloc_mutex_lock(tsdn, &bin->lock);
+
+	unsigned binshard;
+	bin_t *bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
+
 	for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
-	    tcache->lg_fill_div[binind]); i < nfill; i++) {
+	    tcache->lg_fill_div[binind]); i < nfill; i += cnt) {
 		extent_t *slab;
-		void *ptr;
 		if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) >
 		    0) {
-			ptr = arena_slab_reg_alloc(slab, &bin_infos[binind]);
+			unsigned tofill = nfill - i;
+			cnt = tofill < extent_nfree_get(slab) ?
+				tofill : extent_nfree_get(slab);
+			arena_slab_reg_alloc_batch(
+			   slab, &bin_infos[binind], cnt,
+			   tbin->avail - nfill + i);
 		} else {
-			ptr = arena_bin_malloc_hard(tsdn, arena, bin, binind);
-		}
-		if (ptr == NULL) {
+			cnt = 1;
+			void *ptr = arena_bin_malloc_hard(tsdn, arena, bin,
+			    binind, binshard);
 			/*
 			 * OOM.  tbin->avail isn't yet filled down to its first
 			 * element, so the successful allocations (if any) must
 			 * be moved just before tbin->avail before bailing out.
 			 */
-			if (i > 0) {
-				memmove(tbin->avail - i, tbin->avail - nfill,
-				    i * sizeof(void *));
+			if (ptr == NULL) {
+				if (i > 0) {
+					memmove(tbin->avail - i,
+						tbin->avail - nfill,
+						i * sizeof(void *));
+				}
+				break;
 			}
-			break;
+			/* Insert such that low regions get used first. */
+			*(tbin->avail - nfill + i) = ptr;
 		}
 		if (config_fill && unlikely(opt_junk_alloc)) {
-			arena_alloc_junk_small(ptr, &bin_infos[binind], true);
+			for (unsigned j = 0; j < cnt; j++) {
+				void* ptr = *(tbin->avail - nfill + i + j);
+				arena_alloc_junk_small(ptr, &bin_infos[binind],
+							true);
+			}
 		}
-		/* Insert such that low regions get used first. */
-		*(tbin->avail - nfill + i) = ptr;
 	}
 	if (config_stats) {
 		bin->stats.nmalloc += i;
@@ -1320,15 +1444,15 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
 	size_t usize;
 	extent_t *slab;
 
-	assert(binind < NBINS);
-	bin = &arena->bins[binind];
+	assert(binind < SC_NBINS);
 	usize = sz_index2size(binind);
+	unsigned binshard;
+	bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
 
-	malloc_mutex_lock(tsdn, &bin->lock);
 	if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) {
 		ret = arena_slab_reg_alloc(slab, &bin_infos[binind]);
 	} else {
-		ret = arena_bin_malloc_hard(tsdn, arena, bin, binind);
+		ret = arena_bin_malloc_hard(tsdn, arena, bin, binind, binshard);
 	}
 
 	if (ret == NULL) {
@@ -1373,13 +1497,13 @@ arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind,
 	assert(!tsdn_null(tsdn) || arena != NULL);
 
 	if (likely(!tsdn_null(tsdn))) {
-		arena = arena_choose(tsdn_tsd(tsdn), arena);
+		arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, size);
 	}
 	if (unlikely(arena == NULL)) {
 		return NULL;
 	}
 
-	if (likely(size <= SMALL_MAXCLASS)) {
+	if (likely(size <= SC_SMALL_MAXCLASS)) {
 		return arena_malloc_small(tsdn, arena, ind, zero);
 	}
 	return large_malloc(tsdn, arena, sz_index2size(ind), zero);
@@ -1390,8 +1514,9 @@ arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
     bool zero, tcache_t *tcache) {
 	void *ret;
 
-	if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE
-	    && (usize & PAGE_MASK) == 0))) {
+	if (usize <= SC_SMALL_MAXCLASS
+	    && (alignment < PAGE
+	    || (alignment == PAGE && (usize & PAGE_MASK) == 0))) {
 		/* Small; alignment doesn't require special slab placement. */
 		ret = arena_malloc(tsdn, arena, usize, sz_size2index(usize),
 		    zero, tcache, true);
@@ -1409,8 +1534,8 @@ void
 arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize) {
 	cassert(config_prof);
 	assert(ptr != NULL);
-	assert(isalloc(tsdn, ptr) == LARGE_MINCLASS);
-	assert(usize <= SMALL_MAXCLASS);
+	assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS);
+	assert(usize <= SC_SMALL_MAXCLASS);
 
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -1434,15 +1559,15 @@ arena_prof_demote(tsdn_t *tsdn, extent_t *extent, const void *ptr) {
 	cassert(config_prof);
 	assert(ptr != NULL);
 
-	extent_szind_set(extent, NBINS);
+	extent_szind_set(extent, SC_NBINS);
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr,
-	    NBINS, false);
+	    SC_NBINS, false);
 
-	assert(isalloc(tsdn, ptr) == LARGE_MINCLASS);
+	assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS);
 
-	return LARGE_MINCLASS;
+	return SC_LARGE_MINCLASS;
 }
 
 void
@@ -1499,7 +1624,7 @@ arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 }
 
 static void
-arena_bin_lower_slab(UNUSED tsdn_t *tsdn, arena_t *arena, extent_t *slab,
+arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
     bin_t *bin) {
 	assert(extent_nfree_get(slab) > 0);
 
@@ -1526,11 +1651,9 @@ arena_bin_lower_slab(UNUSED tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 }
 
 static void
-arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
-    void *ptr, bool junked) {
+arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, extent_t *slab, void *ptr, bool junked) {
 	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
-	szind_t binind = extent_szind_get(slab);
-	bin_t *bin = &arena->bins[binind];
 	const bin_info_t *bin_info = &bin_infos[binind];
 
 	if (!junked && config_fill && unlikely(opt_junk_free)) {
@@ -1554,18 +1677,21 @@ arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 }
 
 void
-arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, extent_t *extent,
-    void *ptr) {
-	arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, true);
+arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, extent_t *extent, void *ptr) {
+	arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, extent, ptr,
+	    true);
 }
 
 static void
 arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, extent_t *extent, void *ptr) {
 	szind_t binind = extent_szind_get(extent);
-	bin_t *bin = &arena->bins[binind];
+	unsigned binshard = extent_binshard_get(extent);
+	bin_t *bin = &arena->bins[binind].bin_shards[binshard];
 
 	malloc_mutex_lock(tsdn, &bin->lock);
-	arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, false);
+	arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, extent, ptr,
+	    false);
 	malloc_mutex_unlock(tsdn, &bin->lock);
 }
 
@@ -1580,38 +1706,48 @@ arena_dalloc_small(tsdn_t *tsdn, void *ptr) {
 
 bool
 arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
-    size_t extra, bool zero) {
+    size_t extra, bool zero, size_t *newsize) {
+	bool ret;
 	/* Calls with non-zero extra had to clamp extra. */
-	assert(extra == 0 || size + extra <= LARGE_MAXCLASS);
-
-	if (unlikely(size > LARGE_MAXCLASS)) {
-		return true;
-	}
+	assert(extra == 0 || size + extra <= SC_LARGE_MAXCLASS);
 
 	extent_t *extent = iealloc(tsdn, ptr);
+	if (unlikely(size > SC_LARGE_MAXCLASS)) {
+		ret = true;
+		goto done;
+	}
+
 	size_t usize_min = sz_s2u(size);
 	size_t usize_max = sz_s2u(size + extra);
-	if (likely(oldsize <= SMALL_MAXCLASS && usize_min <= SMALL_MAXCLASS)) {
+	if (likely(oldsize <= SC_SMALL_MAXCLASS && usize_min
+	    <= SC_SMALL_MAXCLASS)) {
 		/*
 		 * Avoid moving the allocation if the size class can be left the
 		 * same.
 		 */
 		assert(bin_infos[sz_size2index(oldsize)].reg_size ==
 		    oldsize);
-		if ((usize_max > SMALL_MAXCLASS || sz_size2index(usize_max) !=
-		    sz_size2index(oldsize)) && (size > oldsize || usize_max <
-		    oldsize)) {
-			return true;
+		if ((usize_max > SC_SMALL_MAXCLASS
+		    || sz_size2index(usize_max) != sz_size2index(oldsize))
+		    && (size > oldsize || usize_max < oldsize)) {
+			ret = true;
+			goto done;
 		}
 
 		arena_decay_tick(tsdn, extent_arena_get(extent));
-		return false;
-	} else if (oldsize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS) {
-		return large_ralloc_no_move(tsdn, extent, usize_min, usize_max,
+		ret = false;
+	} else if (oldsize >= SC_LARGE_MINCLASS
+	    && usize_max >= SC_LARGE_MINCLASS) {
+		ret = large_ralloc_no_move(tsdn, extent, usize_min, usize_max,
 		    zero);
+	} else {
+		ret = true;
 	}
+done:
+	assert(extent == iealloc(tsdn, ptr));
+	*newsize = extent_usize_get(extent);
 
-	return true;
+	return ret;
 }
 
 static void *
@@ -1622,7 +1758,7 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize,
 		    zero, tcache, true);
 	}
 	usize = sz_sa2u(usize, alignment);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 	return ipalloct(tsdn, usize, alignment, zero, tcache, arena);
@@ -1630,22 +1766,30 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize,
 
 void *
 arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
-    size_t size, size_t alignment, bool zero, tcache_t *tcache) {
+    size_t size, size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args) {
 	size_t usize = sz_s2u(size);
-	if (unlikely(usize == 0 || size > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || size > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 
-	if (likely(usize <= SMALL_MAXCLASS)) {
+	if (likely(usize <= SC_SMALL_MAXCLASS)) {
 		/* Try to avoid moving the allocation. */
-		if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero)) {
+		UNUSED size_t newsize;
+		if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero,
+		    &newsize)) {
+			hook_invoke_expand(hook_args->is_realloc
+			    ? hook_expand_realloc : hook_expand_rallocx,
+			    ptr, oldsize, usize, (uintptr_t)ptr,
+			    hook_args->args);
 			return ptr;
 		}
 	}
 
-	if (oldsize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS) {
-		return large_ralloc(tsdn, arena, iealloc(tsdn, ptr), usize,
-		    alignment, zero, tcache);
+	if (oldsize >= SC_LARGE_MINCLASS
+	    && usize >= SC_LARGE_MINCLASS) {
+		return large_ralloc(tsdn, arena, ptr, usize,
+		    alignment, zero, tcache, hook_args);
 	}
 
 	/*
@@ -1658,11 +1802,16 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
 		return NULL;
 	}
 
+	hook_invoke_alloc(hook_args->is_realloc
+	    ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret,
+	    hook_args->args);
+	hook_invoke_dalloc(hook_args->is_realloc
+	    ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
+
 	/*
 	 * Junk/zero-filling were already done by
 	 * ipalloc()/arena_malloc().
 	 */
-
 	size_t copysize = (usize < oldsize) ? usize : oldsize;
 	memcpy(ret, ptr, copysize);
 	isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
@@ -1720,8 +1869,7 @@ arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit,
 	if (new_limit != NULL) {
 		size_t limit = *new_limit;
 		/* Grow no more than the new limit. */
-		if ((new_ind = sz_psz2ind(limit + 1) - 1) >
-		     EXTENT_GROW_MAX_PIND) {
+		if ((new_ind = sz_psz2ind(limit + 1) - 1) >= SC_NPSIZES) {
 			return true;
 		}
 	}
@@ -1773,7 +1921,12 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 		}
 	}
 
-	arena = (arena_t *)base_alloc(tsdn, base, sizeof(arena_t), CACHELINE);
+	unsigned nbins_total = 0;
+	for (i = 0; i < SC_NBINS; i++) {
+		nbins_total += bin_infos[i].n_shards;
+	}
+	size_t arena_size = sizeof(arena_t) + sizeof(bin_t) * nbins_total;
+	arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE);
 	if (arena == NULL) {
 		goto label_error;
 	}
@@ -1865,7 +2018,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	}
 
 	arena->extent_grow_next = sz_psz2ind(HUGEPAGE);
-	arena->retain_grow_limit = EXTENT_GROW_MAX_PIND;
+	arena->retain_grow_limit = sz_psz2ind(SC_LARGE_MAXCLASS);
 	if (malloc_mutex_init(&arena->extent_grow_mtx, "extent_grow",
 	    WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) {
 		goto label_error;
@@ -1878,12 +2031,20 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	}
 
 	/* Initialize bins. */
-	for (i = 0; i < NBINS; i++) {
-		bool err = bin_init(&arena->bins[i]);
-		if (err) {
-			goto label_error;
+	uintptr_t bin_addr = (uintptr_t)arena + sizeof(arena_t);
+	atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE);
+	for (i = 0; i < SC_NBINS; i++) {
+		unsigned nshards = bin_infos[i].n_shards;
+		arena->bins[i].bin_shards = (bin_t *)bin_addr;
+		bin_addr += nshards * sizeof(bin_t);
+		for (unsigned j = 0; j < nshards; j++) {
+			bool err = bin_init(&arena->bins[i].bin_shards[j]);
+			if (err) {
+				goto label_error;
+			}
 		}
 	}
+	assert(bin_addr == (uintptr_t)arena + arena_size);
 
 	arena->base = base;
 	/* Set arena before creating background threads. */
@@ -1900,8 +2061,8 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 		 */
 		assert(!tsdn_null(tsdn));
 		pre_reentrancy(tsdn_tsd(tsdn), arena);
-		if (hooks_arena_new_hook) {
-			hooks_arena_new_hook();
+		if (test_hooks_arena_new_hook) {
+			test_hooks_arena_new_hook();
 		}
 		post_reentrancy(tsdn_tsd(tsdn));
 	}
@@ -1914,20 +2075,75 @@ label_error:
 	return NULL;
 }
 
+arena_t *
+arena_choose_huge(tsd_t *tsd) {
+	/* huge_arena_ind can be 0 during init (will use a0). */
+	if (huge_arena_ind == 0) {
+		assert(!malloc_initialized());
+	}
+
+	arena_t *huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, false);
+	if (huge_arena == NULL) {
+		/* Create the huge arena on demand. */
+		assert(huge_arena_ind != 0);
+		huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, true);
+		if (huge_arena == NULL) {
+			return NULL;
+		}
+		/*
+		 * Purge eagerly for huge allocations, because: 1) the number
+		 * of huge allocations is usually small, which means
+		 * ticker-based decay is not reliable; and 2) less immediate
+		 * reuse is expected for huge allocations.
+		 */
+		if (arena_dirty_decay_ms_default_get() > 0) {
+			arena_dirty_decay_ms_set(tsd_tsdn(tsd), huge_arena, 0);
+		}
+		if (arena_muzzy_decay_ms_default_get() > 0) {
+			arena_muzzy_decay_ms_set(tsd_tsdn(tsd), huge_arena, 0);
+		}
+	}
+
+	return huge_arena;
+}
+
+bool
+arena_init_huge(void) {
+	bool huge_enabled;
+
+	/* The threshold must be a large size class; otherwise disable. */
+	if (opt_oversize_threshold > SC_LARGE_MAXCLASS ||
+	    opt_oversize_threshold < SC_LARGE_MINCLASS) {
+		opt_oversize_threshold = 0;
+		oversize_threshold = SC_LARGE_MAXCLASS + PAGE;
+		huge_enabled = false;
+	} else {
+		/* Reserve the index for the huge arena. */
+		huge_arena_ind = narenas_total_get();
+		oversize_threshold = opt_oversize_threshold;
+		huge_enabled = true;
+	}
+
+	return huge_enabled;
+}
+
+bool
+arena_is_huge(unsigned arena_ind) {
+	if (huge_arena_ind == 0) {	/* 0: no huge arena (assumed). */
+		return false;
+	}
+	return (arena_ind == huge_arena_ind);
+}
+
 void
-arena_boot(void) {
+arena_boot(sc_data_t *sc_data) {
 	arena_dirty_decay_ms_default_set(opt_dirty_decay_ms);
 	arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms);
-#define REGIND_bin_yes(index, reg_size) 				\
-	div_init(&arena_binind_div_info[(index)], (reg_size));
-#define REGIND_bin_no(index, reg_size)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs,		\
-    lg_delta_lookup)							\
-	REGIND_bin_##bin(index, (1U<<lg_grp) + (ndelta << lg_delta))
-	SIZE_CLASSES
-#undef REGIND_bin_yes
-#undef REGIND_bin_no
-#undef SC
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		sc_t *sc = &sc_data->sc[i];
+		div_init(&arena_binind_div_info[i],
+		    (1U << sc->lg_base) + (sc->ndelta << sc->lg_delta));
+	}
 }
 
 void
@@ -1972,8 +2188,10 @@ arena_prefork6(tsdn_t *tsdn, arena_t *arena) {
 
 void
 arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
-	for (unsigned i = 0; i < NBINS; i++) {
-		bin_prefork(tsdn, &arena->bins[i]);
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_prefork(tsdn, &arena->bins[i].bin_shards[j]);
+		}
 	}
 }
 
@@ -1981,8 +2199,11 @@ void
 arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) {
 	unsigned i;
 
-	for (i = 0; i < NBINS; i++) {
-		bin_postfork_parent(tsdn, &arena->bins[i]);
+	for (i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_postfork_parent(tsdn,
+			    &arena->bins[i].bin_shards[j]);
+		}
 	}
 	malloc_mutex_postfork_parent(tsdn, &arena->large_mtx);
 	base_postfork_parent(tsdn, arena->base);
@@ -2025,8 +2246,10 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) {
 		}
 	}
 
-	for (i = 0; i < NBINS; i++) {
-		bin_postfork_child(tsdn, &arena->bins[i]);
+	for (i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_postfork_child(tsdn, &arena->bins[i].bin_shards[j]);
+		}
 	}
 	malloc_mutex_postfork_child(tsdn, &arena->large_mtx);
 	base_postfork_child(tsdn, arena->base);
diff --git a/src/background_thread.c b/src/background_thread.c
index 3517a3bb..5ed6c1c9 100644
--- a/src/background_thread.c
+++ b/src/background_thread.c
@@ -4,6 +4,8 @@
 
 #include "jemalloc/internal/assert.h"
 
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
 /******************************************************************************/
 /* Data. */
 
@@ -11,7 +13,7 @@
 #define BACKGROUND_THREAD_DEFAULT false
 /* Read-only after initialization. */
 bool opt_background_thread = BACKGROUND_THREAD_DEFAULT;
-size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT;
+size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT + 1;
 
 /* Used for thread creation, termination and stats. */
 malloc_mutex_t background_thread_lock;
@@ -22,13 +24,9 @@ size_t max_background_threads;
 /* Thread info per-index. */
 background_thread_info_t *background_thread_info;
 
-/* False if no necessary runtime support. */
-bool can_enable_background_thread;
-
 /******************************************************************************/
 
 #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
-#include <dlfcn.h>
 
 static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *,
     void *(*)(void *), void *__restrict);
@@ -81,7 +79,7 @@ background_thread_info_init(tsdn_t *tsdn, background_thread_info_t *info) {
 }
 
 static inline bool
-set_current_thread_affinity(UNUSED int cpu) {
+set_current_thread_affinity(int cpu) {
 #if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY)
 	cpu_set_t cpuset;
 	CPU_ZERO(&cpuset);
@@ -510,6 +508,8 @@ background_thread_entry(void *ind_arg) {
 	assert(thread_ind < max_background_threads);
 #ifdef JEMALLOC_HAVE_PTHREAD_SETNAME_NP
 	pthread_setname_np(pthread_self(), "jemalloc_bg_thd");
+#elif defined(__FreeBSD__)
+	pthread_set_name_np(pthread_self(), "jemalloc_bg_thd");
 #endif
 	if (opt_percpu_arena != percpu_arena_disabled) {
 		set_current_thread_affinity((int)thread_ind);
@@ -534,9 +534,8 @@ background_thread_init(tsd_t *tsd, background_thread_info_t *info) {
 	n_background_threads++;
 }
 
-/* Create a new background thread if needed. */
-bool
-background_thread_create(tsd_t *tsd, unsigned arena_ind) {
+static bool
+background_thread_create_locked(tsd_t *tsd, unsigned arena_ind) {
 	assert(have_background_thread);
 	malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock);
 
@@ -589,6 +588,19 @@ background_thread_create(tsd_t *tsd, unsigned arena_ind) {
 	return false;
 }
 
+/* Create a new background thread if needed (takes the lock itself). */
+bool
+background_thread_create(tsd_t *tsd, unsigned arena_ind) {
+	assert(have_background_thread);
+
+	bool ret;
+	malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
+	ret = background_thread_create_locked(tsd, arena_ind);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
+
+	return ret;
+}
+
 bool
 background_threads_enable(tsd_t *tsd) {
 	assert(n_background_threads == 0);
@@ -622,7 +634,7 @@ background_threads_enable(tsd_t *tsd) {
 		}
 	}
 
-	return background_thread_create(tsd, 0);
+	return background_thread_create_locked(tsd, 0);
 }
 
 bool
@@ -807,21 +819,34 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) {
 #undef BILLION
 #undef BACKGROUND_THREAD_MIN_INTERVAL_NS
 
+#ifdef JEMALLOC_HAVE_DLSYM
+#include <dlfcn.h>
+#endif
+
 static bool
 pthread_create_fptr_init(void) {
 	if (pthread_create_fptr != NULL) {
 		return false;
 	}
+	/*
+	 * Try the next symbol first, because 1) when using lazy_lock we have
+	 * a wrapper for pthread_create; and 2) the application may define its
+	 * own wrapper as well (and can call malloc within the wrapper).
+	 */
+#ifdef JEMALLOC_HAVE_DLSYM
 	pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create");
+#else
+	pthread_create_fptr = NULL;
+#endif
 	if (pthread_create_fptr == NULL) {
-		can_enable_background_thread = false;
-		if (config_lazy_lock || opt_background_thread) {
+		if (config_lazy_lock) {
 			malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, "
 			    "\"pthread_create\")\n");
 			abort();
+		} else {
+			/* Fall back to the default symbol. */
+			pthread_create_fptr = pthread_create;
 		}
-	} else {
-		can_enable_background_thread = true;
 	}
 
 	return false;
@@ -866,9 +891,8 @@ background_thread_boot1(tsdn_t *tsdn) {
 	assert(have_background_thread);
 	assert(narenas_total_get() > 0);
 
-	if (opt_max_background_threads == MAX_BACKGROUND_THREAD_LIMIT &&
-	    ncpus < MAX_BACKGROUND_THREAD_LIMIT) {
-		opt_max_background_threads = ncpus;
+	if (opt_max_background_threads > MAX_BACKGROUND_THREAD_LIMIT) {
+		opt_max_background_threads = DEFAULT_NUM_BACKGROUND_THREAD;
 	}
 	max_background_threads = opt_max_background_threads;
 
diff --git a/src/base.c b/src/base.c
index b0324b5d..f3c61661 100644
--- a/src/base.c
+++ b/src/base.c
@@ -262,8 +262,8 @@ base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks,
 	 */
 	size_t min_block_size = HUGEPAGE_CEILING(sz_psz2u(header_size + gap_size
 	    + usize));
-	pszind_t pind_next = (*pind_last + 1 < NPSIZES) ? *pind_last + 1 :
-	    *pind_last;
+	pszind_t pind_next = (*pind_last + 1 < sz_psz2ind(SC_LARGE_MAXCLASS)) ?
+	    *pind_last + 1 : *pind_last;
 	size_t next_block_size = HUGEPAGE_CEILING(sz_pind2sz(pind_next));
 	size_t block_size = (min_block_size > next_block_size) ? min_block_size
 	    : next_block_size;
@@ -372,7 +372,7 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	base->extent_sn_next = extent_sn_next;
 	base->blocks = block;
 	base->auto_thp_switched = false;
-	for (szind_t i = 0; i < NSIZES; i++) {
+	for (szind_t i = 0; i < SC_NSIZES; i++) {
 		extent_heap_new(&base->avail[i]);
 	}
 	if (config_stats) {
@@ -426,7 +426,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
 
 	extent_t *extent = NULL;
 	malloc_mutex_lock(tsdn, &base->mtx);
-	for (szind_t i = sz_size2index(asize); i < NSIZES; i++) {
+	for (szind_t i = sz_size2index(asize); i < SC_NSIZES; i++) {
 		extent = extent_heap_remove_first(&base->avail[i]);
 		if (extent != NULL) {
 			/* Use existing space. */
diff --git a/src/bin.c b/src/bin.c
index 0886bc4e..bca6b12c 100644
--- a/src/bin.c
+++ b/src/bin.c
@@ -1,23 +1,68 @@
 #include "jemalloc/internal/jemalloc_preamble.h"
 #include "jemalloc/internal/jemalloc_internal_includes.h"
 
+#include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/witness.h"
 
-const bin_info_t bin_infos[NBINS] = {
-#define BIN_INFO_bin_yes(reg_size, slab_size, nregs)			\
-	{reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)},
-#define BIN_INFO_bin_no(reg_size, slab_size, nregs)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs,		\
-    lg_delta_lookup)							\
-	BIN_INFO_bin_##bin((1U<<lg_grp) + (ndelta<<lg_delta),		\
-	    (pgs << LG_PAGE), (pgs << LG_PAGE) / ((1U<<lg_grp) +	\
-	    (ndelta<<lg_delta)))
-	SIZE_CLASSES
-#undef BIN_INFO_bin_yes
-#undef BIN_INFO_bin_no
-#undef SC
-};
+bin_info_t bin_infos[SC_NBINS];
+
+static void
+bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
+    bin_info_t bin_infos[SC_NBINS]) {
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		bin_info_t *bin_info = &bin_infos[i];
+		sc_t *sc = &sc_data->sc[i];
+		bin_info->reg_size = ((size_t)1U << sc->lg_base)
+		    + ((size_t)sc->ndelta << sc->lg_delta);
+		bin_info->slab_size = (sc->pgs << LG_PAGE);
+		bin_info->nregs =
+		    (uint32_t)(bin_info->slab_size / bin_info->reg_size);
+		bin_info->n_shards = bin_shard_sizes[i];
+		bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER(
+		    bin_info->nregs);
+		bin_info->bitmap_info = bitmap_info;
+	}
+}
+
+bool
+bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size,
+    size_t end_size, size_t nshards) {
+	if (nshards > BIN_SHARDS_MAX || nshards == 0) {
+		return true;	/* Invalid shard count. */
+	}
+
+	if (start_size > SC_SMALL_MAXCLASS) {
+		return false;	/* Starts above SC_SMALL_MAXCLASS; no-op. */
+	}
+	if (end_size > SC_SMALL_MAXCLASS) {
+		end_size = SC_SMALL_MAXCLASS;	/* Clamp the upper bound. */
+	}
+
+	/* Compute the index since this may happen before sz init. */
+	szind_t ind1 = sz_size2index_compute(start_size);
+	szind_t ind2 = sz_size2index_compute(end_size);
+	for (unsigned i = ind1; i <= ind2; i++) {
+		bin_shard_sizes[i] = (unsigned)nshards;
+	}
+
+	return false;
+}
+
+void
+bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) {
+	/* Load the default number of shards. */
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		bin_shard_sizes[i] = N_BIN_SHARDS_DEFAULT;
+	}
+}
+
+void
+bin_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) {
+	assert(sc_data->initialized);
+	bin_infos_init(sc_data, bin_shard_sizes, bin_infos);
+}
 
 bool
 bin_init(bin_t *bin) {
diff --git a/src/ckh.c b/src/ckh.c
index e95e0a3e..1bf6df5a 100644
--- a/src/ckh.c
+++ b/src/ckh.c
@@ -275,7 +275,8 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) {
 
 		lg_curcells++;
 		usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+		if (unlikely(usize == 0
+		    || usize > SC_LARGE_MAXCLASS)) {
 			ret = true;
 			goto label_return;
 		}
@@ -320,7 +321,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) {
 	lg_prevbuckets = ckh->lg_curbuckets;
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
 	usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		return;
 	}
 	tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL,
@@ -396,7 +397,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
 	ckh->keycomp = keycomp;
 
 	usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		ret = true;
 		goto label_return;
 	}
diff --git a/src/ctl.c b/src/ctl.c
index 1e713a3d..09310a9d 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -8,7 +8,7 @@
 #include "jemalloc/internal/extent_mmap.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/nstime.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/util.h"
 
 /******************************************************************************/
@@ -85,6 +85,7 @@ CTL_PROTO(opt_retain)
 CTL_PROTO(opt_dss)
 CTL_PROTO(opt_narenas)
 CTL_PROTO(opt_percpu_arena)
+CTL_PROTO(opt_oversize_threshold)
 CTL_PROTO(opt_background_thread)
 CTL_PROTO(opt_max_background_threads)
 CTL_PROTO(opt_dirty_decay_ms)
@@ -126,6 +127,7 @@ INDEX_PROTO(arena_i)
 CTL_PROTO(arenas_bin_i_size)
 CTL_PROTO(arenas_bin_i_nregs)
 CTL_PROTO(arenas_bin_i_slab_size)
+CTL_PROTO(arenas_bin_i_nshards)
 INDEX_PROTO(arenas_bin_i)
 CTL_PROTO(arenas_lextent_i_size)
 INDEX_PROTO(arenas_lextent_i)
@@ -147,6 +149,8 @@ CTL_PROTO(prof_gdump)
 CTL_PROTO(prof_reset)
 CTL_PROTO(prof_interval)
 CTL_PROTO(lg_prof_sample)
+CTL_PROTO(prof_log_start)
+CTL_PROTO(prof_log_stop)
 CTL_PROTO(stats_arenas_i_small_allocated)
 CTL_PROTO(stats_arenas_i_small_nmalloc)
 CTL_PROTO(stats_arenas_i_small_ndalloc)
@@ -170,6 +174,13 @@ CTL_PROTO(stats_arenas_i_lextents_j_ndalloc)
 CTL_PROTO(stats_arenas_i_lextents_j_nrequests)
 CTL_PROTO(stats_arenas_i_lextents_j_curlextents)
 INDEX_PROTO(stats_arenas_i_lextents_j)
+CTL_PROTO(stats_arenas_i_extents_j_ndirty)
+CTL_PROTO(stats_arenas_i_extents_j_nmuzzy)
+CTL_PROTO(stats_arenas_i_extents_j_nretained)
+CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes)
+CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes)
+CTL_PROTO(stats_arenas_i_extents_j_retained_bytes)
+INDEX_PROTO(stats_arenas_i_extents_j)
 CTL_PROTO(stats_arenas_i_nthreads)
 CTL_PROTO(stats_arenas_i_uptime)
 CTL_PROTO(stats_arenas_i_dss)
@@ -180,6 +191,7 @@ CTL_PROTO(stats_arenas_i_pdirty)
 CTL_PROTO(stats_arenas_i_pmuzzy)
 CTL_PROTO(stats_arenas_i_mapped)
 CTL_PROTO(stats_arenas_i_retained)
+CTL_PROTO(stats_arenas_i_extent_avail)
 CTL_PROTO(stats_arenas_i_dirty_npurge)
 CTL_PROTO(stats_arenas_i_dirty_nmadvise)
 CTL_PROTO(stats_arenas_i_dirty_purged)
@@ -202,6 +214,8 @@ CTL_PROTO(stats_metadata_thp)
 CTL_PROTO(stats_resident)
 CTL_PROTO(stats_mapped)
 CTL_PROTO(stats_retained)
+CTL_PROTO(experimental_hooks_install)
+CTL_PROTO(experimental_hooks_remove)
 
 #define MUTEX_STATS_CTL_PROTO_GEN(n)					\
 CTL_PROTO(stats_##n##_num_ops)						\
@@ -286,6 +300,7 @@ static const ctl_named_node_t opt_node[] = {
 	{NAME("dss"),		CTL(opt_dss)},
 	{NAME("narenas"),	CTL(opt_narenas)},
 	{NAME("percpu_arena"),	CTL(opt_percpu_arena)},
+	{NAME("oversize_threshold"),	CTL(opt_oversize_threshold)},
 	{NAME("background_thread"),	CTL(opt_background_thread)},
 	{NAME("max_background_threads"),	CTL(opt_max_background_threads)},
 	{NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)},
@@ -341,7 +356,8 @@ static const ctl_indexed_node_t arena_node[] = {
 static const ctl_named_node_t arenas_bin_i_node[] = {
 	{NAME("size"),		CTL(arenas_bin_i_size)},
 	{NAME("nregs"),		CTL(arenas_bin_i_nregs)},
-	{NAME("slab_size"),	CTL(arenas_bin_i_slab_size)}
+	{NAME("slab_size"),	CTL(arenas_bin_i_slab_size)},
+	{NAME("nshards"),	CTL(arenas_bin_i_nshards)}
 };
 static const ctl_named_node_t super_arenas_bin_i_node[] = {
 	{NAME(""),		CHILD(named, arenas_bin_i)}
@@ -385,9 +401,10 @@ static const ctl_named_node_t	prof_node[] = {
 	{NAME("gdump"),		CTL(prof_gdump)},
 	{NAME("reset"),		CTL(prof_reset)},
 	{NAME("interval"),	CTL(prof_interval)},
-	{NAME("lg_sample"),	CTL(lg_prof_sample)}
+	{NAME("lg_sample"),	CTL(lg_prof_sample)},
+	{NAME("log_start"),	CTL(prof_log_start)},
+	{NAME("log_stop"),	CTL(prof_log_stop)}
 };
-
 static const ctl_named_node_t stats_arenas_i_small_node[] = {
 	{NAME("allocated"),	CTL(stats_arenas_i_small_allocated)},
 	{NAME("nmalloc"),	CTL(stats_arenas_i_small_nmalloc)},
@@ -458,6 +475,23 @@ static const ctl_indexed_node_t stats_arenas_i_lextents_node[] = {
 	{INDEX(stats_arenas_i_lextents_j)}
 };
 
+static const ctl_named_node_t stats_arenas_i_extents_j_node[] = {
+	{NAME("ndirty"),	CTL(stats_arenas_i_extents_j_ndirty)},
+	{NAME("nmuzzy"),	CTL(stats_arenas_i_extents_j_nmuzzy)},
+	{NAME("nretained"),	CTL(stats_arenas_i_extents_j_nretained)},
+	{NAME("dirty_bytes"),	CTL(stats_arenas_i_extents_j_dirty_bytes)},
+	{NAME("muzzy_bytes"),	CTL(stats_arenas_i_extents_j_muzzy_bytes)},
+	{NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)}
+};
+
+static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = {
+	{NAME(""),		CHILD(named, stats_arenas_i_extents_j)}
+};
+
+static const ctl_indexed_node_t stats_arenas_i_extents_node[] = {
+	{INDEX(stats_arenas_i_extents_j)}
+};
+
 #define OP(mtx)  MUTEX_PROF_DATA_NODE(arenas_i_mutexes_##mtx)
 MUTEX_PROF_ARENA_MUTEXES
 #undef OP
@@ -479,6 +513,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
 	{NAME("pmuzzy"),	CTL(stats_arenas_i_pmuzzy)},
 	{NAME("mapped"),	CTL(stats_arenas_i_mapped)},
 	{NAME("retained"),	CTL(stats_arenas_i_retained)},
+	{NAME("extent_avail"),	CTL(stats_arenas_i_extent_avail)},
 	{NAME("dirty_npurge"),	CTL(stats_arenas_i_dirty_npurge)},
 	{NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)},
 	{NAME("dirty_purged"),	CTL(stats_arenas_i_dirty_purged)},
@@ -494,6 +529,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
 	{NAME("large"),		CHILD(named, stats_arenas_i_large)},
 	{NAME("bins"),		CHILD(indexed, stats_arenas_i_bins)},
 	{NAME("lextents"),	CHILD(indexed, stats_arenas_i_lextents)},
+	{NAME("extents"),	CHILD(indexed, stats_arenas_i_extents)},
 	{NAME("mutexes"),	CHILD(named, stats_arenas_i_mutexes)}
 };
 static const ctl_named_node_t super_stats_arenas_i_node[] = {
@@ -536,6 +572,15 @@ static const ctl_named_node_t stats_node[] = {
 	{NAME("arenas"),	CHILD(indexed, stats_arenas)}
 };
 
+static const ctl_named_node_t hooks_node[] = {
+	{NAME("install"),	CTL(experimental_hooks_install)},
+	{NAME("remove"),	CTL(experimental_hooks_remove)}
+};
+
+static const ctl_named_node_t experimental_node[] = {
+	{NAME("hooks"),		CHILD(named, hooks)}
+};
+
 static const ctl_named_node_t	root_node[] = {
 	{NAME("version"),	CTL(version)},
 	{NAME("epoch"),		CTL(epoch)},
@@ -548,7 +593,8 @@ static const ctl_named_node_t	root_node[] = {
 	{NAME("arena"),		CHILD(indexed, arena)},
 	{NAME("arenas"),	CHILD(named, arenas)},
 	{NAME("prof"),		CHILD(named, prof)},
-	{NAME("stats"),		CHILD(named, stats)}
+	{NAME("stats"),		CHILD(named, stats)},
+	{NAME("experimental"),	CHILD(named, experimental)}
 };
 static const ctl_named_node_t super_root_node[] = {
 	{NAME(""),		CHILD(named, root)}
@@ -696,10 +742,12 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) {
 		ctl_arena->astats->nmalloc_small = 0;
 		ctl_arena->astats->ndalloc_small = 0;
 		ctl_arena->astats->nrequests_small = 0;
-		memset(ctl_arena->astats->bstats, 0, NBINS *
+		memset(ctl_arena->astats->bstats, 0, SC_NBINS *
 		    sizeof(bin_stats_t));
-		memset(ctl_arena->astats->lstats, 0, (NSIZES - NBINS) *
+		memset(ctl_arena->astats->lstats, 0, (SC_NSIZES - SC_NBINS) *
 		    sizeof(arena_stats_large_t));
+		memset(ctl_arena->astats->estats, 0, SC_NPSIZES *
+		    sizeof(arena_stats_extents_t));
 	}
 }
 
@@ -713,9 +761,9 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) {
 		    &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive,
 		    &ctl_arena->pdirty, &ctl_arena->pmuzzy,
 		    &ctl_arena->astats->astats, ctl_arena->astats->bstats,
-		    ctl_arena->astats->lstats);
+		    ctl_arena->astats->lstats, ctl_arena->astats->estats);
 
-		for (i = 0; i < NBINS; i++) {
+		for (i = 0; i < SC_NBINS; i++) {
 			ctl_arena->astats->allocated_small +=
 			    ctl_arena->astats->bstats[i].curregs *
 			    sz_index2size(i);
@@ -760,6 +808,8 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena,
 			    &astats->astats.mapped);
 			accum_atomic_zu(&sdstats->astats.retained,
 			    &astats->astats.retained);
+			accum_atomic_zu(&sdstats->astats.extent_avail,
+			    &astats->astats.extent_avail);
 		}
 
 		ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge,
@@ -827,7 +877,8 @@ MUTEX_PROF_ARENA_MUTEXES
 			sdstats->astats.uptime = astats->astats.uptime;
 		}
 
-		for (i = 0; i < NBINS; i++) {
+		/* Merge bin stats. */
+		for (i = 0; i < SC_NBINS; i++) {
 			sdstats->bstats[i].nmalloc += astats->bstats[i].nmalloc;
 			sdstats->bstats[i].ndalloc += astats->bstats[i].ndalloc;
 			sdstats->bstats[i].nrequests +=
@@ -853,7 +904,8 @@ MUTEX_PROF_ARENA_MUTEXES
 			    &astats->bstats[i].mutex_data);
 		}
 
-		for (i = 0; i < NSIZES - NBINS; i++) {
+		/* Merge stats for large allocations. */
+		for (i = 0; i < SC_NSIZES - SC_NBINS; i++) {
 			ctl_accum_arena_stats_u64(&sdstats->lstats[i].nmalloc,
 			    &astats->lstats[i].nmalloc);
 			ctl_accum_arena_stats_u64(&sdstats->lstats[i].ndalloc,
@@ -867,6 +919,22 @@ MUTEX_PROF_ARENA_MUTEXES
 				assert(astats->lstats[i].curlextents == 0);
 			}
 		}
+
+		/* Merge extents stats. */
+		for (i = 0; i < SC_NPSIZES; i++) {
+			accum_atomic_zu(&sdstats->estats[i].ndirty,
+			    &astats->estats[i].ndirty);
+			accum_atomic_zu(&sdstats->estats[i].nmuzzy,
+			    &astats->estats[i].nmuzzy);
+			accum_atomic_zu(&sdstats->estats[i].nretained,
+			    &astats->estats[i].nretained);
+			accum_atomic_zu(&sdstats->estats[i].dirty_bytes,
+			    &astats->estats[i].dirty_bytes);
+			accum_atomic_zu(&sdstats->estats[i].muzzy_bytes,
+			    &astats->estats[i].muzzy_bytes);
+			accum_atomic_zu(&sdstats->estats[i].retained_bytes,
+			    &astats->estats[i].retained_bytes);
+		}
 	}
 }
 
@@ -1378,8 +1446,8 @@ label_return:								\
 
 #define CTL_RO_CGEN(c, n, v, t)						\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1421,8 +1489,8 @@ label_return:								\
  */
 #define CTL_RO_NL_CGEN(c, n, v, t)					\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1440,8 +1508,8 @@ label_return:								\
 
 #define CTL_RO_NL_GEN(n, v, t)						\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1475,8 +1543,8 @@ label_return:								\
 
 #define CTL_RO_CONFIG_GEN(n, t)						\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1494,8 +1562,8 @@ label_return:								\
 CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *)
 
 static int
-epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	UNUSED uint64_t newval;
 
@@ -1513,8 +1581,9 @@ label_return:
 }
 
 static int
-background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+background_thread_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -1544,13 +1613,6 @@ background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 
 		background_thread_enabled_set(tsd_tsdn(tsd), newval);
 		if (newval) {
-			if (!can_enable_background_thread) {
-				malloc_printf("<jemalloc>: Error in dlsym("
-			            "RTLD_NEXT, \"pthread_create\"). Cannot "
-				    "enable background_thread\n");
-				ret = EFAULT;
-				goto label_return;
-			}
 			if (background_threads_enable(tsd)) {
 				ret = EFAULT;
 				goto label_return;
@@ -1571,8 +1633,9 @@ label_return:
 }
 
 static int
-max_background_threads_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+max_background_threads_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	size_t oldval;
 
@@ -1605,13 +1668,6 @@ max_background_threads_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 		}
 
 		if (background_thread_enabled()) {
-			if (!can_enable_background_thread) {
-				malloc_printf("<jemalloc>: Error in dlsym("
-			            "RTLD_NEXT, \"pthread_create\"). Cannot "
-				    "enable background_thread\n");
-				ret = EFAULT;
-				goto label_return;
-			}
 			background_thread_enabled_set(tsd_tsdn(tsd), false);
 			if (background_threads_disable(tsd)) {
 				ret = EFAULT;
@@ -1660,6 +1716,7 @@ CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
 CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned)
 CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena],
     const char *)
+CTL_RO_NL_GEN(opt_oversize_threshold, opt_oversize_threshold, size_t)
 CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool)
 CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t)
 CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t)
@@ -1690,8 +1747,8 @@ CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool)
 /******************************************************************************/
 
 static int
-thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	arena_t *oldarena;
 	unsigned newind, oldind;
@@ -1755,8 +1812,9 @@ CTL_TSD_RO_NL_CGEN(config_stats, thread_deallocatedp,
     tsd_thread_deallocatedp_get, uint64_t *)
 
 static int
-thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -1776,8 +1834,9 @@ label_return:
 }
 
 static int
-thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 
 	if (!tcache_available(tsd)) {
@@ -1796,8 +1855,9 @@ label_return:
 }
 
 static int
-thread_prof_name_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+thread_prof_name_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 
 	if (!config_prof) {
@@ -1827,8 +1887,9 @@ label_return:
 }
 
 static int
-thread_prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+thread_prof_active_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -1857,8 +1918,8 @@ label_return:
 /******************************************************************************/
 
 static int
-tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned tcache_ind;
 
@@ -1875,8 +1936,8 @@ label_return:
 }
 
 static int
-tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned tcache_ind;
 
@@ -1895,8 +1956,8 @@ label_return:
 }
 
 static int
-tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned tcache_ind;
 
@@ -2044,9 +2105,8 @@ arena_reset_prepare_background_thread(tsd_t *tsd, unsigned arena_ind) {
 	if (have_background_thread) {
 		malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
 		if (background_thread_enabled()) {
-			unsigned ind = arena_ind % ncpus;
 			background_thread_info_t *info =
-			    &background_thread_info[ind];
+			    background_thread_info_get(arena_ind);
 			assert(info->state == background_thread_started);
 			malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
 			info->state = background_thread_paused;
@@ -2059,9 +2119,8 @@ static void
 arena_reset_finish_background_thread(tsd_t *tsd, unsigned arena_ind) {
 	if (have_background_thread) {
 		if (background_thread_enabled()) {
-			unsigned ind = arena_ind % ncpus;
 			background_thread_info_t *info =
-			    &background_thread_info[ind];
+			    background_thread_info_get(arena_ind);
 			assert(info->state == background_thread_paused);
 			malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
 			info->state = background_thread_started;
@@ -2217,6 +2276,17 @@ arena_i_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen,
 			ret = EINVAL;
 			goto label_return;
 		}
+		if (arena_is_huge(arena_ind) && *(ssize_t *)newp > 0) {
+			/*
+			 * By default the huge arena purges eagerly.  If it is
+			 * set to non-zero decay time afterwards, background
+			 * thread might be needed.
+			 */
+			if (background_thread_create(tsd, arena_ind)) {
+				ret = EFAULT;
+				goto label_return;
+			}
+		}
 		if (dirty ? arena_dirty_decay_ms_set(tsd_tsdn(tsd), arena,
 		    *(ssize_t *)newp) : arena_muzzy_decay_ms_set(tsd_tsdn(tsd),
 		    arena, *(ssize_t *)newp)) {
@@ -2300,8 +2370,9 @@ label_return:
 }
 
 static int
-arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	unsigned arena_ind;
 	arena_t *arena;
@@ -2336,7 +2407,8 @@ label_return:
 }
 
 static const ctl_named_node_t *
-arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
+arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
+    size_t i) {
 	const ctl_named_node_t *ret;
 
 	malloc_mutex_lock(tsdn, &ctl_mtx);
@@ -2361,8 +2433,8 @@ label_return:
 /******************************************************************************/
 
 static int
-arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned narenas;
 
@@ -2382,8 +2454,9 @@ label_return:
 }
 
 static int
-arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen, bool dirty) {
+arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen, bool dirty) {
 	int ret;
 
 	if (oldp != NULL && oldlenp != NULL) {
@@ -2425,34 +2498,36 @@ arenas_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t)
 CTL_RO_NL_GEN(arenas_page, PAGE, size_t)
 CTL_RO_NL_GEN(arenas_tcache_max, tcache_maxclass, size_t)
-CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned)
+CTL_RO_NL_GEN(arenas_nbins, SC_NBINS, unsigned)
 CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned)
 CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t)
 CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t)
 CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_nshards, bin_infos[mib[2]].n_shards, uint32_t)
 static const ctl_named_node_t *
-arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
-	if (i > NBINS) {
+arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
+	if (i >= SC_NBINS) { /* Valid bins: [0, SC_NBINS). */
 		return NULL;
 	}
 	return super_arenas_bin_i_node;
 }
 
-CTL_RO_NL_GEN(arenas_nlextents, NSIZES - NBINS, unsigned)
-CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(NBINS+(szind_t)mib[2]),
+CTL_RO_NL_GEN(arenas_nlextents, SC_NSIZES - SC_NBINS, unsigned)
+CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(SC_NBINS+(szind_t)mib[2]),
     size_t)
 static const ctl_named_node_t *
-arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
-    size_t i) {
-	if (i > NSIZES - NBINS) {
+arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
+	if (i >= SC_NSIZES - SC_NBINS) { /* i must be < arenas.nlextents. */
 		return NULL;
 	}
 	return super_arenas_lextent_i_node;
 }
 
 static int
-arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	extent_hooks_t *extent_hooks;
 	unsigned arena_ind;
@@ -2474,8 +2549,9 @@ label_return:
 }
 
 static int
-arenas_lookup_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+arenas_lookup_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	unsigned arena_ind;
 	void *ptr;
@@ -2506,8 +2582,9 @@ label_return:
 /******************************************************************************/
 
 static int
-prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -2533,8 +2610,8 @@ label_return:
 }
 
 static int
-prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -2559,8 +2636,8 @@ label_return:
 }
 
 static int
-prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	const char *filename = NULL;
 
@@ -2582,8 +2659,8 @@ label_return:
 }
 
 static int
-prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -2608,8 +2685,8 @@ label_return:
 }
 
 static int
-prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	size_t lg_sample = lg_prof_sample;
 
@@ -2633,6 +2710,44 @@ label_return:
 CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t)
 CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t)
 
+static int
+prof_log_start_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+
+	const char *filename = NULL;
+
+	if (!config_prof) {
+		return ENOENT;
+	}
+
+	WRITEONLY();
+	WRITE(filename, const char *);
+
+	if (prof_log_start(tsd_tsdn(tsd), filename)) {
+		ret = EFAULT;
+		goto label_return;
+	}
+
+	ret = 0;
+label_return:
+	return ret;
+}
+
+static int
+prof_log_stop_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen) {
+	if (!config_prof) {
+		return ENOENT;
+	}
+
+	if (prof_log_stop(tsd_tsdn(tsd))) {
+		return EFAULT;
+	}
+
+	return 0;
+}
+
 /******************************************************************************/
 
 CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t)
@@ -2667,6 +2782,10 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
 CTL_RO_CGEN(config_stats, stats_arenas_i_retained,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.retained, ATOMIC_RELAXED),
     size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail,
+    atomic_load_zu(&arenas_i(mib[2])->astats->astats.extent_avail,
+        ATOMIC_RELAXED),
+    size_t)
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge,
     ctl_arena_stats_read_u64(
@@ -2765,8 +2884,9 @@ RO_MUTEX_CTL_GEN(arenas_i_bins_j_mutex,
 
 /* Resets all mutex stats, including global, arena and bin mutexes. */
 static int
-stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen) {
 	if (!config_stats) {
 		return ENOENT;
 	}
@@ -2806,9 +2926,11 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 		MUTEX_PROF_RESET(arena->tcache_ql_mtx);
 		MUTEX_PROF_RESET(arena->base->mtx);
 
-		for (szind_t i = 0; i < NBINS; i++) {
-			bin_t *bin = &arena->bins[i];
-			MUTEX_PROF_RESET(bin->lock);
+		for (szind_t i = 0; i < SC_NBINS; i++) {
+			for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+				bin_t *bin = &arena->bins[i].bin_shards[j];
+				MUTEX_PROF_RESET(bin->lock);
+			}
 		}
 	}
 #undef MUTEX_PROF_RESET
@@ -2835,9 +2957,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs,
     arenas_i(mib[2])->astats->bstats[mib[4]].curslabs, size_t)
 
 static const ctl_named_node_t *
-stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
-    size_t j) {
-	if (j > NBINS) {
+stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t j) {
+	if (j >= SC_NBINS) { /* Valid bins: [0, SC_NBINS). */
 		return NULL;
 	}
 	return super_stats_arenas_i_bins_j_node;
@@ -2856,16 +2978,51 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents,
     arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t)
 
 static const ctl_named_node_t *
-stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
-    size_t j) {
-	if (j > NSIZES - NBINS) {
+stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t j) {
+	if (j >= SC_NSIZES - SC_NBINS) { /* j must be < arenas.nlextents. */
 		return NULL;
 	}
 	return super_stats_arenas_i_lextents_j_node;
 }
 
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_ndirty,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].ndirty,
+        ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy,
+        ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].nretained,
+        ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes,
+        ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes,
+        ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes,
+        ATOMIC_RELAXED), size_t)
+
 static const ctl_named_node_t *
-stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
+stats_arenas_i_extents_j_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t j) {
+	if (j >= SC_NPSIZES) {
+		return NULL;
+	}
+	return super_stats_arenas_i_extents_j_node;
+}
+
+static const ctl_named_node_t *
+stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
 	const ctl_named_node_t *ret;
 	size_t a;
 
@@ -2881,3 +3038,58 @@ label_return:
 	malloc_mutex_unlock(tsdn, &ctl_mtx);
 	return ret;
 }
+
+/*
+ * Handler for experimental.hooks.install.  Copies a hooks_t in via WRITE(),
+ * installs it with hook_install(), and reports the returned handle via READ().
+ * Returns EINVAL if oldp, oldlenp, or newp is NULL, and EAGAIN if
+ * hook_install() returns NULL.
+ */
+static int
+experimental_hooks_install_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+	if (oldp == NULL || oldlenp == NULL || newp == NULL) {
+		ret = EINVAL;
+		goto label_return;
+	}
+	/*
+	 * Note: this is a *private* struct.  This is an experimental interface;
+	 * forcing the user to know the jemalloc internals well enough to
+	 * extract the ABI hopefully ensures nobody gets too comfortable with
+	 * this API, which can change at a moment's notice.
+	 */
+	hooks_t hooks;
+	WRITE(hooks, hooks_t);
+	void *handle = hook_install(tsd_tsdn(tsd), &hooks);
+	if (handle == NULL) {
+		ret = EAGAIN;
+		goto label_return;
+	}
+	READ(handle, void *);
+
+	ret = 0;
+label_return:
+	return ret;
+}
+
+/*
+ * Handler for experimental.hooks.remove.  Takes a handle via WRITE() and
+ * passes it to hook_remove(); a NULL handle is rejected with EINVAL.
+ */
+static int
+experimental_hooks_remove_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+	WRITEONLY();
+	void *handle = NULL;
+	WRITE(handle, void *);
+	if (handle == NULL) {
+		ret = EINVAL;
+		goto label_return;
+	}
+	hook_remove(tsd_tsdn(tsd), handle);
+	ret = 0;
+label_return:
+	return ret;
+}
diff --git a/src/extent.c b/src/extent.c
index 09d6d771..62086c7d 100644
--- a/src/extent.c
+++ b/src/extent.c
@@ -20,7 +20,7 @@ mutex_pool_t	extent_mutex_pool;
 size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT;
 
 static const bitmap_info_t extents_bitmap_info =
-    BITMAP_INFO_INITIALIZER(NPSIZES+1);
+    BITMAP_INFO_INITIALIZER(SC_NPSIZES+1);
 
 static void *extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr,
     size_t size, size_t alignment, bool *zero, bool *commit,
@@ -119,9 +119,13 @@ static void extent_record(tsdn_t *tsdn, arena_t *arena,
 
 /******************************************************************************/
 
-ph_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, ph_link,
+#define ATTR_NONE /* does nothing */
+
+ph_gen(ATTR_NONE, extent_avail_, extent_tree_t, extent_t, ph_link,
     extent_esnead_comp)
 
+#undef ATTR_NONE
+
 typedef enum {
 	lock_result_success,
 	lock_result_failure,
@@ -130,13 +134,16 @@ typedef enum {
 
 static lock_result_t
 extent_rtree_leaf_elm_try_lock(tsdn_t *tsdn, rtree_leaf_elm_t *elm,
-    extent_t **result) {
+    extent_t **result, bool inactive_only) {
 	extent_t *extent1 = rtree_leaf_elm_extent_read(tsdn, &extents_rtree,
 	    elm, true);
 
-	if (extent1 == NULL) {
+	/* Slab implies active extents and should be skipped. */
+	if (extent1 == NULL || (inactive_only && rtree_leaf_elm_slab_read(tsdn,
+	    &extents_rtree, elm, true))) {
 		return lock_result_no_extent;
 	}
+
 	/*
 	 * It's possible that the extent changed out from under us, and with it
 	 * the leaf->extent mapping.  We have to recheck while holding the lock.
@@ -159,7 +166,8 @@ extent_rtree_leaf_elm_try_lock(tsdn_t *tsdn, rtree_leaf_elm_t *elm,
  * address, and NULL otherwise.
  */
 static extent_t *
-extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr) {
+extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr,
+    bool inactive_only) {
 	extent_t *ret = NULL;
 	rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &extents_rtree,
 	    rtree_ctx, (uintptr_t)addr, false, false);
@@ -168,7 +176,8 @@ extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr) {
 	}
 	lock_result_t lock_result;
 	do {
-		lock_result = extent_rtree_leaf_elm_try_lock(tsdn, elm, &ret);
+		lock_result = extent_rtree_leaf_elm_try_lock(tsdn, elm, &ret,
+		    inactive_only);
 	} while (lock_result == lock_result_failure);
 	return ret;
 }
@@ -182,6 +191,7 @@ extent_alloc(tsdn_t *tsdn, arena_t *arena) {
 		return base_alloc_extent(tsdn, arena->base);
 	}
 	extent_avail_remove(&arena->extent_avail, extent);
+	atomic_fetch_sub_zu(&arena->extent_avail_cnt, 1, ATOMIC_RELAXED);
 	malloc_mutex_unlock(tsdn, &arena->extent_avail_mtx);
 	return extent;
 }
@@ -190,6 +200,7 @@ void
 extent_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent) {
 	malloc_mutex_lock(tsdn, &arena->extent_avail_mtx);
 	extent_avail_insert(&arena->extent_avail, extent);
+	atomic_fetch_add_zu(&arena->extent_avail_cnt, 1, ATOMIC_RELAXED);
 	malloc_mutex_unlock(tsdn, &arena->extent_avail_mtx);
 }
 
@@ -255,7 +266,7 @@ extent_size_quantize_ceil(size_t size) {
 	size_t ret;
 
 	assert(size > 0);
-	assert(size - sz_large_pad <= LARGE_MAXCLASS);
+	assert(size - sz_large_pad <= SC_LARGE_MAXCLASS);
 	assert((size & PAGE_MASK) == 0);
 
 	ret = extent_size_quantize_floor(size);
@@ -284,7 +295,7 @@ extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state,
 	    malloc_mutex_rank_exclusive)) {
 		return true;
 	}
-	for (unsigned i = 0; i < NPSIZES+1; i++) {
+	for (unsigned i = 0; i < SC_NPSIZES + 1; i++) {
 		extent_heap_new(&extents->heaps[i]);
 	}
 	bitmap_init(extents->bitmap, &extents_bitmap_info, true);
@@ -305,6 +316,32 @@ extents_npages_get(extents_t *extents) {
 	return atomic_load_zu(&extents->npages, ATOMIC_RELAXED);
 }
 
+size_t
+extents_nextents_get(extents_t *extents, pszind_t pind) {
+	return atomic_load_zu(&extents->nextents[pind], ATOMIC_RELAXED);
+}
+
+size_t
+extents_nbytes_get(extents_t *extents, pszind_t pind) {
+	return atomic_load_zu(&extents->nbytes[pind], ATOMIC_RELAXED);
+}
+
+static void
+extents_stats_add(extents_t *extents, pszind_t pind, size_t sz) {
+	size_t cur = atomic_load_zu(&extents->nextents[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extents->nextents[pind], cur + 1, ATOMIC_RELAXED);
+	cur = atomic_load_zu(&extents->nbytes[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extents->nbytes[pind], cur + sz, ATOMIC_RELAXED);
+}
+
+static void
+extents_stats_sub(extents_t *extents, pszind_t pind, size_t sz) {
+	size_t cur = atomic_load_zu(&extents->nextents[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extents->nextents[pind], cur - 1, ATOMIC_RELAXED);
+	cur = atomic_load_zu(&extents->nbytes[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extents->nbytes[pind], cur - sz, ATOMIC_RELAXED);
+}
+
 static void
 extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
 	malloc_mutex_assert_owner(tsdn, &extents->mtx);
@@ -318,6 +355,11 @@ extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
 		    (size_t)pind);
 	}
 	extent_heap_insert(&extents->heaps[pind], extent);
+
+	if (config_stats) {
+		extents_stats_add(extents, pind, size);
+	}
+
 	extent_list_append(&extents->lru, extent);
 	size_t npages = size >> LG_PAGE;
 	/*
@@ -340,6 +382,11 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
 	size_t psz = extent_size_quantize_floor(size);
 	pszind_t pind = sz_psz2ind(psz);
 	extent_heap_remove(&extents->heaps[pind], extent);
+
+	if (config_stats) {
+		extents_stats_sub(extents, pind, size);
+	}
+
 	if (extent_heap_empty(&extents->heaps[pind])) {
 		bitmap_set(extents->bitmap, &extents_bitmap_info,
 		    (size_t)pind);
@@ -371,7 +418,7 @@ extents_fit_alignment(extents_t *extents, size_t min_size, size_t max_size,
 	    &extents_bitmap_info, (size_t)pind); i < pind_max; i =
 	    (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
 	    (size_t)i+1)) {
-		assert(i < NPSIZES);
+		assert(i < SC_NPSIZES);
 		assert(!extent_heap_empty(&extents->heaps[i]));
 		extent_t *extent = extent_heap_first(&extents->heaps[i]);
 		uintptr_t base = (uintptr_t)extent_base_get(extent);
@@ -401,7 +448,7 @@ extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
 	pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size));
 	pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
 	    (size_t)pind);
-	if (i < NPSIZES+1) {
+	if (i < SC_NPSIZES + 1) {
 		/*
 		 * In order to reduce fragmentation, avoid reusing and splitting
 		 * large extents for much smaller sizes.
@@ -429,8 +476,9 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
 
 	pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size));
 	for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap,
-	    &extents_bitmap_info, (size_t)pind); i < NPSIZES+1; i =
-	    (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
+	    &extents_bitmap_info, (size_t)pind);
+	    i < SC_NPSIZES + 1;
+	    i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
 	    (size_t)i+1)) {
 		assert(!extent_heap_empty(&extents->heaps[i]));
 		extent_t *extent = extent_heap_first(&extents->heaps[i]);
@@ -438,10 +486,10 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
 		if (ret == NULL || extent_snad_comp(extent, ret) < 0) {
 			ret = extent;
 		}
-		if (i == NPSIZES) {
+		if (i == SC_NPSIZES) {
 			break;
 		}
-		assert(i < NPSIZES);
+		assert(i < SC_NPSIZES);
 	}
 
 	return ret;
@@ -748,6 +796,7 @@ extent_register_impl(tsdn_t *tsdn, extent_t *extent, bool gdump_add) {
 
 	if (extent_rtree_leaf_elms_lookup(tsdn, rtree_ctx, extent, false, true,
 	    &elm_a, &elm_b)) {
+		extent_unlock(tsdn, extent);
 		return true;
 	}
 
@@ -817,7 +866,7 @@ extent_deregister_impl(tsdn_t *tsdn, extent_t *extent, bool gdump) {
 
 	extent_lock(tsdn, extent);
 
-	extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL, NSIZES, false);
+	extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL, SC_NSIZES, false);
 	if (extent_slab_get(extent)) {
 		extent_interior_deregister(tsdn, rtree_ctx, extent);
 		extent_slab_set(extent, false);
@@ -874,7 +923,8 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
 	extent_hooks_assure_initialized(arena, r_extent_hooks);
 	extent_t *extent;
 	if (new_addr != NULL) {
-		extent = extent_lock_from_addr(tsdn, rtree_ctx, new_addr);
+		extent = extent_lock_from_addr(tsdn, rtree_ctx, new_addr,
+		    false);
 		if (extent != NULL) {
 			/*
 			 * We might null-out extent to report an error, but we
@@ -958,7 +1008,7 @@ extent_split_interior(tsdn_t *tsdn, arena_t *arena,
 	if (leadsize != 0) {
 		*lead = *extent;
 		*extent = extent_split_impl(tsdn, arena, r_extent_hooks,
-		    *lead, leadsize, NSIZES, false, esize + trailsize, szind,
+		    *lead, leadsize, SC_NSIZES, false, esize + trailsize, szind,
 		    slab, growing_retained);
 		if (*extent == NULL) {
 			*to_leak = *lead;
@@ -970,7 +1020,7 @@ extent_split_interior(tsdn_t *tsdn, arena_t *arena,
 	/* Split the trail. */
 	if (trailsize != 0) {
 		*trail = extent_split_impl(tsdn, arena, r_extent_hooks, *extent,
-		    esize, szind, slab, trailsize, NSIZES, false,
+		    esize, szind, slab, trailsize, SC_NSIZES, false,
 		    growing_retained);
 		if (*trail == NULL) {
 			*to_leak = *extent;
@@ -987,7 +1037,7 @@ extent_split_interior(tsdn_t *tsdn, arena_t *arena,
 		 * splitting occurred.
 		 */
 		extent_szind_set(*extent, szind);
-		if (szind != NSIZES) {
+		if (szind != SC_NSIZES) {
 			rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx,
 			    (uintptr_t)extent_addr_get(*extent), szind, slab);
 			if (slab && extent_size_get(*extent) > PAGE) {
@@ -1045,14 +1095,25 @@ extent_recycle_split(tsdn_t *tsdn, arena_t *arena,
 			extent_deregister_no_gdump_sub(tsdn, to_leak);
 			extents_leak(tsdn, arena, r_extent_hooks, extents,
 			    to_leak, growing_retained);
-			assert(extent_lock_from_addr(tsdn, rtree_ctx, leak)
-			    == NULL);
+			assert(extent_lock_from_addr(tsdn, rtree_ctx, leak,
+			    false) == NULL);
 		}
 		return NULL;
 	}
 	unreachable();
 }
 
+static bool
+extent_need_manual_zero(arena_t *arena) {
+	/*
+	 * The extent must be zeroed manually when repopulated if either:
+	 * 1) non-default extent hooks are installed (in which case the purge
+	 * semantics may change); or 2) opt_thp is thp_mode_always.
+	 */
+	return (!arena_has_default_hooks(arena) ||
+		(opt_thp == thp_mode_always));
+}
+
 /*
  * Tries to satisfy the given allocation request by reusing one of the extents
  * in the given extents_t.
@@ -1092,7 +1153,9 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 			    extent, growing_retained);
 			return NULL;
 		}
-		extent_zeroed_set(extent, true);
+		if (!extent_need_manual_zero(arena)) {
+			extent_zeroed_set(extent, true);
+		}
 	}
 
 	if (extent_committed_get(extent)) {
@@ -1113,14 +1176,16 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 
 	if (*zero) {
 		void *addr = extent_base_get(extent);
-		size_t size = extent_size_get(extent);
 		if (!extent_zeroed_get(extent)) {
-			if (pages_purge_forced(addr, size)) {
+			size_t size = extent_size_get(extent);
+			if (extent_need_manual_zero(arena) ||
+			    pages_purge_forced(addr, size)) {
 				memset(addr, 0, size);
 			}
 		} else if (config_debug) {
 			size_t *p = (size_t *)(uintptr_t)addr;
-			for (size_t i = 0; i < size / sizeof(size_t); i++) {
+			/* Check the first page only. */
+			for (size_t i = 0; i < PAGE / sizeof(size_t); i++) {
 				assert(p[i] == 0);
 			}
 		}
@@ -1244,11 +1309,11 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 	size_t alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip);
 	while (alloc_size < alloc_size_min) {
 		egn_skip++;
-		if (arena->extent_grow_next + egn_skip == NPSIZES) {
+		if (arena->extent_grow_next + egn_skip >=
+		    sz_psz2ind(SC_LARGE_MAXCLASS)) {
 			/* Outside legal range. */
 			goto label_err;
 		}
-		assert(arena->extent_grow_next + egn_skip < NPSIZES);
 		alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip);
 	}
 
@@ -1271,7 +1336,7 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 		extent_hook_post_reentrancy(tsdn);
 	}
 
-	extent_init(extent, arena, ptr, alloc_size, false, NSIZES,
+	extent_init(extent, arena, ptr, alloc_size, false, SC_NSIZES,
 	    arena_extent_sn_next(arena), extent_state_active, zeroed,
 	    committed, true);
 	if (ptr == NULL) {
@@ -1341,7 +1406,9 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 			    &arena->extents_retained, extent, true);
 			goto label_err;
 		}
-		extent_zeroed_set(extent, true);
+		if (!extent_need_manual_zero(arena)) {
+			extent_zeroed_set(extent, true);
+		}
 	}
 
 	/*
@@ -1375,7 +1442,8 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 	if (*zero && !extent_zeroed_get(extent)) {
 		void *addr = extent_base_get(extent);
 		size_t size = extent_size_get(extent);
-		if (pages_purge_forced(addr, size)) {
+		if (extent_need_manual_zero(arena) ||
+		    pages_purge_forced(addr, size)) {
 			memset(addr, 0, size);
 		}
 	}
@@ -1524,9 +1592,15 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 }
 
 static extent_t *
-extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
+extent_try_coalesce_impl(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
-    extent_t *extent, bool *coalesced, bool growing_retained) {
+    extent_t *extent, bool *coalesced, bool growing_retained,
+    bool inactive_only) {
+	/*
+	 * We avoid checking / locking inactive neighbors for large size
+	 * classes, since they are eagerly coalesced on deallocation which can
+	 * cause lock contention.
+	 */
 	/*
 	 * Continue attempting to coalesce until failure, to protect against
 	 * races with other threads that are thwarted by this one.
@@ -1537,7 +1611,7 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
 
 		/* Try to coalesce forward. */
 		extent_t *next = extent_lock_from_addr(tsdn, rtree_ctx,
-		    extent_past_get(extent));
+		    extent_past_get(extent), inactive_only);
 		if (next != NULL) {
 			/*
 			 * extents->mtx only protects against races for
@@ -1563,7 +1637,7 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
 
 		/* Try to coalesce backward. */
 		extent_t *prev = extent_lock_from_addr(tsdn, rtree_ctx,
-		    extent_before_get(extent));
+		    extent_before_get(extent), inactive_only);
 		if (prev != NULL) {
 			bool can_coalesce = extent_can_coalesce(arena, extents,
 			    extent, prev);
@@ -1589,6 +1663,22 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
 	return extent;
 }
 
+static extent_t *
+extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
+    extent_t *extent, bool *coalesced, bool growing_retained) {
+	return extent_try_coalesce_impl(tsdn, arena, r_extent_hooks, rtree_ctx,
+	    extents, extent, coalesced, growing_retained, false);
+}
+
+static extent_t *
+extent_try_coalesce_large(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
+    extent_t *extent, bool *coalesced, bool growing_retained) {
+	return extent_try_coalesce_impl(tsdn, arena, r_extent_hooks, rtree_ctx,
+	    extents, extent, coalesced, growing_retained, true);
+}
+
 /*
  * Does the metadata management portions of putting an unused extent into the
  * given extents_t (coalesces, deregisters slab interiors, the heap operations).
@@ -1606,7 +1696,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	malloc_mutex_lock(tsdn, &extents->mtx);
 	extent_hooks_assure_initialized(arena, r_extent_hooks);
 
-	extent_szind_set(extent, NSIZES);
+	extent_szind_set(extent, SC_NSIZES);
 	if (extent_slab_get(extent)) {
 		extent_interior_deregister(tsdn, rtree_ctx, extent);
 		extent_slab_set(extent, false);
@@ -1618,18 +1708,22 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	if (!extents->delay_coalesce) {
 		extent = extent_try_coalesce(tsdn, arena, r_extent_hooks,
 		    rtree_ctx, extents, extent, NULL, growing_retained);
-	} else if (extent_size_get(extent) >= LARGE_MINCLASS) {
+	} else if (extent_size_get(extent) >= SC_LARGE_MINCLASS) {
+		assert(extents == &arena->extents_dirty);
 		/* Always coalesce large extents eagerly. */
 		bool coalesced;
-		size_t prev_size;
 		do {
-			prev_size = extent_size_get(extent);
 			assert(extent_state_get(extent) == extent_state_active);
-			extent = extent_try_coalesce(tsdn, arena,
+			extent = extent_try_coalesce_large(tsdn, arena,
 			    r_extent_hooks, rtree_ctx, extents, extent,
 			    &coalesced, growing_retained);
-		} while (coalesced &&
-		    extent_size_get(extent) >= prev_size + LARGE_MINCLASS);
+		} while (coalesced);
+		if (extent_size_get(extent) >= oversize_threshold) {
+			/* Shortcut to purge the oversize extent eagerly. */
+			malloc_mutex_unlock(tsdn, &extents->mtx);
+			arena_decay_extent(tsdn, arena, r_extent_hooks, extent);
+			return;
+		}
 	}
 	extent_deactivate_locked(tsdn, arena, extents, extent);
 
@@ -1651,6 +1745,12 @@ extent_dalloc_gap(tsdn_t *tsdn, arena_t *arena, extent_t *extent) {
 	extent_dalloc_wrapper(tsdn, arena, &extent_hooks, extent);
 }
 
+static bool
+extent_may_dalloc(void) {
+	/* With retain enabled, the default dalloc always fails. */
+	return !opt_retain;
+}
+
 static bool
 extent_dalloc_default_impl(void *addr, size_t size) {
 	if (!have_dss || !extent_in_dss(addr)) {
@@ -1706,16 +1806,20 @@ extent_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena,
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
-	/*
-	 * Deregister first to avoid a race with other allocating threads, and
-	 * reregister if deallocation fails.
-	 */
-	extent_deregister(tsdn, extent);
-	if (!extent_dalloc_wrapper_try(tsdn, arena, r_extent_hooks, extent)) {
-		return;
+	/* Avoid calling the default extent_dalloc unless have to. */
+	if (*r_extent_hooks != &extent_hooks_default || extent_may_dalloc()) {
+		/*
+		 * Deregister first to avoid a race with other allocating
+		 * threads, and reregister if deallocation fails.
+		 */
+		extent_deregister(tsdn, extent);
+		if (!extent_dalloc_wrapper_try(tsdn, arena, r_extent_hooks,
+		    extent)) {
+			return;
+		}
+		extent_reregister(tsdn, extent);
 	}
 
-	extent_reregister(tsdn, extent);
 	if (*r_extent_hooks != &extent_hooks_default) {
 		extent_hook_pre_reentrancy(tsdn, arena);
 	}
@@ -2128,22 +2232,23 @@ extent_merge_impl(tsdn_t *tsdn, arena_t *arena,
 
 	if (a_elm_b != NULL) {
 		rtree_leaf_elm_write(tsdn, &extents_rtree, a_elm_b, NULL,
-		    NSIZES, false);
+		    SC_NSIZES, false);
 	}
 	if (b_elm_b != NULL) {
 		rtree_leaf_elm_write(tsdn, &extents_rtree, b_elm_a, NULL,
-		    NSIZES, false);
+		    SC_NSIZES, false);
 	} else {
 		b_elm_b = b_elm_a;
 	}
 
 	extent_size_set(a, extent_size_get(a) + extent_size_get(b));
-	extent_szind_set(a, NSIZES);
+	extent_szind_set(a, SC_NSIZES);
 	extent_sn_set(a, (extent_sn_get(a) < extent_sn_get(b)) ?
 	    extent_sn_get(a) : extent_sn_get(b));
 	extent_zeroed_set(a, extent_zeroed_get(a) && extent_zeroed_get(b));
 
-	extent_rtree_write_acquired(tsdn, a_elm_a, b_elm_b, a, NSIZES, false);
+	extent_rtree_write_acquired(tsdn, a_elm_a, b_elm_b, a, SC_NSIZES,
+	    false);
 
 	extent_unlock2(tsdn, a, b);
 
diff --git a/src/extent_dss.c b/src/extent_dss.c
index 2b1ea9ca..6c56cf65 100644
--- a/src/extent_dss.c
+++ b/src/extent_dss.c
@@ -154,7 +154,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
 			    (uintptr_t)gap_addr_page;
 			if (gap_size_page != 0) {
 				extent_init(gap, arena, gap_addr_page,
-				    gap_size_page, false, NSIZES,
+				    gap_size_page, false, SC_NSIZES,
 				    arena_extent_sn_next(arena),
 				    extent_state_active, false, true, true);
 			}
@@ -198,7 +198,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
 					extent_t extent;
 
 					extent_init(&extent, arena, ret, size,
-					    size, false, NSIZES,
+					    size, false, SC_NSIZES,
 					    extent_state_active, false, true,
 					    true);
 					if (extent_purge_forced_wrapper(tsdn,
diff --git a/src/hook.c b/src/hook.c
new file mode 100644
index 00000000..9ac703cf
--- /dev/null
+++ b/src/hook.c
@@ -0,0 +1,195 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#include "jemalloc/internal/hook.h"
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/seq.h"
+
+typedef struct hooks_internal_s hooks_internal_t;
+struct hooks_internal_s {
+	hooks_t hooks;
+	bool in_use;
+};
+
+seq_define(hooks_internal_t, hooks)
+
+static atomic_u_t nhooks = ATOMIC_INIT(0);
+static seq_hooks_t hooks[HOOK_MAX];
+static malloc_mutex_t hooks_mu;
+
+bool
+hook_boot(void) {
+	return malloc_mutex_init(&hooks_mu, "hooks", WITNESS_RANK_HOOK,
+	    malloc_mutex_rank_exclusive);
+}
+
+static void *
+hook_install_locked(hooks_t *to_install) {
+	hooks_internal_t hooks_internal;
+	for (int i = 0; i < HOOK_MAX; i++) {
+		bool success = seq_try_load_hooks(&hooks_internal, &hooks[i]);
+		/* We hold mu; no concurrent access. */
+		assert(success);
+		if (!hooks_internal.in_use) {
+			hooks_internal.hooks = *to_install;
+			hooks_internal.in_use = true;
+			seq_store_hooks(&hooks[i], &hooks_internal);
+			atomic_store_u(&nhooks,
+			    atomic_load_u(&nhooks, ATOMIC_RELAXED) + 1,
+			    ATOMIC_RELAXED);
+			return &hooks[i];
+		}
+	}
+	return NULL;
+}
+
+void *
+hook_install(tsdn_t *tsdn, hooks_t *to_install) {
+	malloc_mutex_lock(tsdn, &hooks_mu);
+	void *ret = hook_install_locked(to_install);
+	if (ret != NULL) {
+		tsd_global_slow_inc(tsdn);
+	}
+	malloc_mutex_unlock(tsdn, &hooks_mu);
+	return ret;
+}
+
+static void
+hook_remove_locked(seq_hooks_t *to_remove) {
+	hooks_internal_t hooks_internal;
+	bool success = seq_try_load_hooks(&hooks_internal, to_remove);
+	/* We hold mu; no concurrent access. */
+	assert(success);
+	/* Should only remove hooks that were added. */
+	assert(hooks_internal.in_use);
+	hooks_internal.in_use = false;
+	seq_store_hooks(to_remove, &hooks_internal);
+	atomic_store_u(&nhooks, atomic_load_u(&nhooks, ATOMIC_RELAXED) - 1,
+	    ATOMIC_RELAXED);
+}
+
+void
+hook_remove(tsdn_t *tsdn, void *opaque) {
+	if (config_debug) {
+		char *hooks_begin = (char *)&hooks[0];
+		char *hooks_end = (char *)&hooks[HOOK_MAX];
+		char *hook = (char *)opaque;
+		assert(hooks_begin <= hook && hook < hooks_end
+		    && (hook - hooks_begin) % sizeof(seq_hooks_t) == 0);
+	}
+	malloc_mutex_lock(tsdn, &hooks_mu);
+	hook_remove_locked((seq_hooks_t *)opaque);
+	tsd_global_slow_dec(tsdn);
+	malloc_mutex_unlock(tsdn, &hooks_mu);
+}
+
+#define FOR_EACH_HOOK_BEGIN(hooks_internal_ptr)				\
+for (int for_each_hook_counter = 0;					\
+    for_each_hook_counter < HOOK_MAX;					\
+    for_each_hook_counter++) {						\
+	bool for_each_hook_success = seq_try_load_hooks(		\
+	    (hooks_internal_ptr), &hooks[for_each_hook_counter]);	\
+	if (!for_each_hook_success) {					\
+		continue;						\
+	}								\
+	if (!(hooks_internal_ptr)->in_use) {				\
+		continue;						\
+	}
+#define FOR_EACH_HOOK_END						\
+}
+
+static bool *
+hook_reentrantp(void) {
+	/*
+	 * We prevent user reentrancy within hooks.  This is basically just a
+	 * thread-local bool that triggers an early-exit.
+	 *
+	 * We don't fold in_hook into reentrancy.  There are two reasons for
+	 * this:
+	 * - Right now, we turn on reentrancy during things like extent hook
+	 *   execution.  Allocating during extent hooks is not officially
+	 *   supported, but we don't want to break it for the time being.  These
+	 *   sorts of allocations should probably still be hooked, though.
+	 * - If a hook allocates, we may want it to be relatively fast (after
+	 *   all, it executes on every allocator operation).  Turning on
+	 *   reentrancy is a fairly heavyweight mode (disabling tcache,
+	 *   redirecting to arena 0, etc.).  It's possible we may one day want
+	 *   to turn on reentrant mode here, if it proves too difficult to keep
+	 *   this working.  But that's fairly easy for us to see; OTOH, people
+	 *   not using hooks because they're too slow is easy for us to miss.
+	 *
+	 * The tricky part is that this code might get invoked even if we don't
+	 * have access to tsd.  This function mimics getting a pointer to
+	 * thread-local data, except that it might secretly return a pointer to
+	 * some global data if we know that the caller will take the early-exit
+	 * path.  If we return a bool that indicates that we are reentrant,
+	 * then the caller will go down the early exit path, leaving the global
+	 * untouched.  That is why in_hook_global is initialized to true and is
+	 * handed out only when no tcache is available.
+	 */
+	static bool in_hook_global = true;
+	tsdn_t *tsdn = tsdn_fetch();
+	tcache_t *tcache = tsdn_tcachep_get(tsdn);
+	if (tcache != NULL) {
+		return &tcache->in_hook;
+	}
+	return &in_hook_global;
+}
+
+#define HOOK_PROLOGUE							\
+	if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) {	\
+		return;							\
+	}								\
+	bool *in_hook = hook_reentrantp();				\
+	if (*in_hook) {							\
+		return;							\
+	}								\
+	*in_hook = true;
+
+#define HOOK_EPILOGUE							\
+	*in_hook = false;
+
+void
+hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]) {
+	HOOK_PROLOGUE
+
+	hooks_internal_t hook;
+	FOR_EACH_HOOK_BEGIN(&hook)
+		hook_alloc h = hook.hooks.alloc_hook;
+		if (h != NULL) {
+			h(hook.hooks.extra, type, result, result_raw, args_raw);
+		}
+	FOR_EACH_HOOK_END
+
+	HOOK_EPILOGUE
+}
+
+void
+hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) {
+	HOOK_PROLOGUE
+	hooks_internal_t hook;
+	FOR_EACH_HOOK_BEGIN(&hook)
+		hook_dalloc h = hook.hooks.dalloc_hook;
+		if (h != NULL) {
+			h(hook.hooks.extra, type, address, args_raw);
+		}
+	FOR_EACH_HOOK_END
+	HOOK_EPILOGUE
+}
+
+void
+hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) {
+	HOOK_PROLOGUE
+	hooks_internal_t hook;
+	FOR_EACH_HOOK_BEGIN(&hook)
+		hook_expand h = hook.hooks.expand_hook;
+		if (h != NULL) {
+			h(hook.hooks.extra, type, address, old_usize, new_usize,
+			    result_raw, args_raw);
+		}
+	FOR_EACH_HOOK_END
+	HOOK_EPILOGUE
+}
diff --git a/src/jemalloc.c b/src/jemalloc.c
index f93c16fa..c8afa9c4 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -7,12 +7,13 @@
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/extent_dss.h"
 #include "jemalloc/internal/extent_mmap.h"
+#include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
 #include "jemalloc/internal/log.h"
 #include "jemalloc/internal/malloc_io.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/rtree.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/spin.h"
 #include "jemalloc/internal/sz.h"
 #include "jemalloc/internal/ticker.h"
@@ -85,8 +86,10 @@ malloc_mutex_t arenas_lock;
 JEMALLOC_ALIGNED(CACHELINE)
 atomic_p_t		arenas[MALLOCX_ARENA_LIMIT];
 static atomic_u_t	narenas_total; /* Use narenas_total_*(). */
-static arena_t		*a0; /* arenas[0]; read-only after initialization. */
-unsigned		narenas_auto; /* Read-only after initialization. */
+/* Below three are read-only after initialization. */
+static arena_t		*a0; /* arenas[0]. */
+unsigned		narenas_auto;
+unsigned		manual_arena_base;
 
 typedef enum {
 	malloc_init_uninitialized	= 3,
@@ -326,7 +329,7 @@ arena_init_locked(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	 */
 	arena = arena_get(tsdn, ind, false);
 	if (arena != NULL) {
-		assert(ind < narenas_auto);
+		assert(arena_is_auto(arena));
 		return arena;
 	}
 
@@ -341,12 +344,12 @@ arena_new_create_background_thread(tsdn_t *tsdn, unsigned ind) {
 	if (ind == 0) {
 		return;
 	}
-	if (have_background_thread) {
-		bool err;
-		malloc_mutex_lock(tsdn, &background_thread_lock);
-		err = background_thread_create(tsdn_tsd(tsdn), ind);
-		malloc_mutex_unlock(tsdn, &background_thread_lock);
-		if (err) {
+	/*
+	 * Avoid creating a new background thread just for the huge arena, which
+	 * purges eagerly by default.
+	 */
+	if (have_background_thread && !arena_is_huge(ind)) {
+		if (background_thread_create(tsdn_tsd(tsdn), ind)) {
 			malloc_printf("<jemalloc>: error in background thread "
 				      "creation for arena %u. Abort.\n", ind);
 			abort();
@@ -376,6 +379,14 @@ arena_bind(tsd_t *tsd, unsigned ind, bool internal) {
 		tsd_iarena_set(tsd, arena);
 	} else {
 		tsd_arena_set(tsd, arena);
+		unsigned shard = atomic_fetch_add_u(&arena->binshard_next, 1,
+		    ATOMIC_RELAXED);
+		tsd_binshards_t *bins = tsd_binshardsp_get(tsd);
+		for (unsigned i = 0; i < SC_NBINS; i++) {
+			assert(bin_infos[i].n_shards > 0 &&
+			    bin_infos[i].n_shards <= BIN_SHARDS_MAX);
+			bins->binshard[i] = shard % bin_infos[i].n_shards;
+		}
 	}
 }
 
@@ -761,6 +772,50 @@ init_opt_stats_print_opts(const char *v, size_t vlen) {
 	assert(opts_len == strlen(opt_stats_print_opts));
 }
 
+/* Reads the next size pair in a multi-sized option. */
+static bool
+malloc_conf_multi_sizes_next(const char **slab_size_segment_cur,
+    size_t *vlen_left, size_t *slab_start, size_t *slab_end, size_t *new_size) {
+	const char *cur = *slab_size_segment_cur;
+	char *end;
+	uintmax_t um;
+
+	set_errno(0);
+
+	/* First number, then '-' */
+	um = malloc_strtoumax(cur, &end, 0);
+	if (get_errno() != 0 || *end != '-') {
+		return true;
+	}
+	*slab_start = (size_t)um;
+	cur = end + 1;
+
+	/* Second number, then ':' */
+	um = malloc_strtoumax(cur, &end, 0);
+	if (get_errno() != 0 || *end != ':') {
+		return true;
+	}
+	*slab_end = (size_t)um;
+	cur = end + 1;
+
+	/* Last number */
+	um = malloc_strtoumax(cur, &end, 0);
+	if (get_errno() != 0) {
+		return true;
+	}
+	*new_size = (size_t)um;
+
+	/* Consume the separator if there is one. */
+	if (*end == '|') {
+		end++;
+	}
+
+	*vlen_left -= end - *slab_size_segment_cur;
+	*slab_size_segment_cur = end;
+
+	return false;
+}
+
 static bool
 malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p,
     char const **v_p, size_t *vlen_p) {
@@ -850,6 +905,11 @@ malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v,
 	malloc_printf("<jemalloc>: %s: %.*s:%.*s\n", msg, (int)klen, k,
 	    (int)vlen, v);
 	/* If abort_conf is set, error out after processing all options. */
+	const char *experimental = "experimental_";
+	if (strncmp(k, experimental, strlen(experimental)) == 0) {
+		/* However, tolerate experimental features. */
+		return;
+	}
 	had_conf_error = true;
 }
 
@@ -869,7 +929,7 @@ malloc_slow_flag_init(void) {
 }
 
 static void
-malloc_conf_init(void) {
+malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) {
 	unsigned i;
 	char buf[PATH_MAX + 1];
 	const char *opts, *k, *v;
@@ -910,7 +970,12 @@ malloc_conf_init(void) {
 			 * Try to use the contents of the "/etc/malloc.conf"
 			 * symbolic link's name.
 			 */
+#ifndef JEMALLOC_READLINKAT
 			linklen = readlink(linkname, buf, sizeof(buf) - 1);
+#else
+			linklen = readlinkat(AT_FDCWD, linkname, buf,
+			    sizeof(buf) - 1);
+#endif
 			if (linklen == -1) {
 				/* No configuration specified. */
 				linklen = 0;
@@ -967,6 +1032,14 @@ malloc_conf_init(void) {
 				}					\
 				continue;				\
 			}
+      /*
+       * One of the CONF_MIN macros below expands, in one of the use points,
+       * to "unsigned integer < 0", which is always false, triggering the
+       * GCC -Wtype-limits warning, which we disable here and re-enable below.
+       */
+      JEMALLOC_DIAGNOSTIC_PUSH
+      JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+
 #define CONF_MIN_no(um, min)	false
 #define CONF_MIN_yes(um, min)	((um) < (min))
 #define CONF_MAX_no(um, max)	false
@@ -1094,6 +1167,28 @@ malloc_conf_init(void) {
 			}
 			CONF_HANDLE_UNSIGNED(opt_narenas, "narenas", 1,
 			    UINT_MAX, yes, no, false)
+			if (CONF_MATCH("bin_shards")) {
+				const char *bin_shards_segment_cur = v;
+				size_t vlen_left = vlen;
+				do {
+					size_t size_start;
+					size_t size_end;
+					size_t nshards;
+					bool err = malloc_conf_multi_sizes_next(
+					    &bin_shards_segment_cur, &vlen_left,
+					    &size_start, &size_end, &nshards);
+					if (err || bin_update_shard_size(
+					    bin_shard_sizes, size_start,
+					    size_end, nshards)) {
+						malloc_conf_error(
+						    "Invalid settings for "
+						    "bin_shards", k, klen, v,
+						    vlen);
+						break;
+					}
+				} while (vlen_left > 0);
+				continue;
+			}
 			CONF_HANDLE_SSIZE_T(opt_dirty_decay_ms,
 			    "dirty_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) <
 			    QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) :
@@ -1141,11 +1236,24 @@ malloc_conf_init(void) {
 				CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc")
 			}
 			CONF_HANDLE_BOOL(opt_tcache, "tcache")
-			CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit,
-			    "lg_extent_max_active_fit", 0,
-			    (sizeof(size_t) << 3), yes, yes, false)
 			CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max",
 			    -1, (sizeof(size_t) << 3) - 1)
+
+			/*
+			 * The runtime option of oversize_threshold remains
+			 * undocumented.  It may be tweaked in the next major
+			 * release (6.0).  The default value 8M is rather
+			 * conservative / safe.  Tuning it further down may
+			 * improve fragmentation a bit more, but may also cause
+			 * contention on the huge arena.
+			 */
+			CONF_HANDLE_SIZE_T(opt_oversize_threshold,
+			    "oversize_threshold", 0, SC_LARGE_MAXCLASS, no, yes,
+			    false)
+			CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit,
+			    "lg_extent_max_active_fit", 0,
+			    (sizeof(size_t) << 3), no, yes, false)
+
 			if (strncmp("percpu_arena", k, klen) == 0) {
 				bool match = false;
 				for (int i = percpu_arena_mode_names_base; i <
@@ -1174,6 +1282,31 @@ malloc_conf_init(void) {
 					   "max_background_threads", 1,
 					   opt_max_background_threads, yes, yes,
 					   true);
+			if (CONF_MATCH("slab_sizes")) {
+				bool err;
+				const char *slab_size_segment_cur = v;
+				size_t vlen_left = vlen;
+				do {
+					size_t slab_start;
+					size_t slab_end;
+					size_t pgs;
+					err = malloc_conf_multi_sizes_next(
+					    &slab_size_segment_cur,
+					    &vlen_left, &slab_start, &slab_end,
+					    &pgs);
+					if (!err) {
+						sc_data_update_slab_size(
+						    sc_data, slab_start,
+						    slab_end, (int)pgs);
+					} else {
+						malloc_conf_error(
+						    "Invalid settings for "
+						    "slab_sizes", k, klen, v,
+						    vlen);
+					}
+				} while (!err && vlen_left > 0);
+				continue;
+			}
 			if (config_prof) {
 				CONF_HANDLE_BOOL(opt_prof, "prof")
 				CONF_HANDLE_CHAR_P(opt_prof_prefix,
@@ -1191,6 +1324,7 @@ malloc_conf_init(void) {
 				CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump")
 				CONF_HANDLE_BOOL(opt_prof_final, "prof_final")
 				CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak")
+				CONF_HANDLE_BOOL(opt_prof_log, "prof_log")
 			}
 			if (config_log) {
 				if (CONF_MATCH("log")) {
@@ -1237,6 +1371,8 @@ malloc_conf_init(void) {
 #undef CONF_HANDLE_SIZE_T
 #undef CONF_HANDLE_SSIZE_T
 #undef CONF_HANDLE_CHAR_P
+    /* Re-enable diagnostic "-Wtype-limits" */
+    JEMALLOC_DIAGNOSTIC_POP
 		}
 		if (opt_abort_conf && had_conf_error) {
 			malloc_abort_invalid_conf();
@@ -1275,10 +1411,33 @@ static bool
 malloc_init_hard_a0_locked() {
 	malloc_initializer = INITIALIZER;
 
+	JEMALLOC_DIAGNOSTIC_PUSH
+	JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+	sc_data_t sc_data = {0};
+	JEMALLOC_DIAGNOSTIC_POP
+
+	/*
+	 * Ordering here is somewhat tricky; we need sc_boot() first, since that
+	 * determines what the size classes will be, and then
+	 * malloc_conf_init(), since any slab size tweaking will need to be done
+	 * before sz_boot and bin_boot, which assume that the values they read
+	 * out of sc_data_global are final.
+	 */
+	sc_boot(&sc_data);
+	unsigned bin_shard_sizes[SC_NBINS];
+	bin_shard_sizes_boot(bin_shard_sizes);
+	/*
+	 * prof_boot0 only initializes opt_prof_prefix.  We need to do it before
+	 * we parse malloc_conf options, in case malloc_conf parsing overwrites
+	 * it.
+	 */
 	if (config_prof) {
 		prof_boot0();
 	}
-	malloc_conf_init();
+	malloc_conf_init(&sc_data, bin_shard_sizes);
+	sz_boot(&sc_data);
+	bin_boot(&sc_data, bin_shard_sizes);
+
 	if (opt_stats_print) {
 		/* Print statistics at exit. */
 		if (atexit(stats_print_atexit) != 0) {
@@ -1303,7 +1462,7 @@ malloc_init_hard_a0_locked() {
 	if (config_prof) {
 		prof_boot1();
 	}
-	arena_boot();
+	arena_boot(&sc_data);
 	if (tcache_boot(TSDN_NULL)) {
 		return true;
 	}
@@ -1311,11 +1470,13 @@ malloc_init_hard_a0_locked() {
 	    malloc_mutex_rank_exclusive)) {
 		return true;
 	}
+	hook_boot();
 	/*
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
 	 */
 	narenas_auto = 1;
+	manual_arena_base = narenas_auto + 1;
 	memset(arenas, 0, sizeof(arena_t *) * narenas_auto);
 	/*
 	 * Initialize one arena here.  The rest are lazily created in
@@ -1463,6 +1624,10 @@ malloc_init_narenas(void) {
 		    narenas_auto);
 	}
 	narenas_total_set(narenas_auto);
+	if (arena_init_huge()) {
+		narenas_total_inc();
+	}
+	manual_arena_base = narenas_total_get();
 
 	return false;
 }
@@ -1560,11 +1725,7 @@ malloc_init_hard(void) {
 		 * sets isthreaded) needs to be called without holding any lock.
 		 */
 		background_thread_ctl_init(tsd_tsdn(tsd));
-
-		malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
-		bool err = background_thread_create(tsd, 0);
-		malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
-		if (err) {
+		if (background_thread_create(tsd, 0)) {
 			return true;
 		}
 	}
@@ -1587,8 +1748,7 @@ typedef struct static_opts_s static_opts_t;
 struct static_opts_s {
 	/* Whether or not allocation size may overflow. */
 	bool may_overflow;
-	/* Whether or not allocations of size 0 should be treated as size 1. */
-	bool bump_empty_alloc;
+
 	/*
 	 * Whether to assert that allocations are not of size 0 (after any
 	 * bumping).
@@ -1621,12 +1781,15 @@ struct static_opts_s {
 	 * initialization) options.
 	 */
 	bool slow;
+	/*
+	 * Whether to compute usize and report it through dopts->usize.
+	 */
+	bool usize;
 };
 
 JEMALLOC_ALWAYS_INLINE void
 static_opts_init(static_opts_t *static_opts) {
 	static_opts->may_overflow = false;
-	static_opts->bump_empty_alloc = false;
 	static_opts->assert_nonempty_alloc = false;
 	static_opts->null_out_result_on_error = false;
 	static_opts->set_errno_on_error = false;
@@ -1634,6 +1797,7 @@ static_opts_init(static_opts_t *static_opts) {
 	static_opts->oom_string = "";
 	static_opts->invalid_alignment_string = "";
 	static_opts->slow = false;
+	static_opts->usize = false;
 }
 
 /*
@@ -1648,6 +1812,7 @@ static_opts_init(static_opts_t *static_opts) {
 typedef struct dynamic_opts_s dynamic_opts_t;
 struct dynamic_opts_s {
 	void **result;
+	size_t usize;
 	size_t num_items;
 	size_t item_size;
 	size_t alignment;
@@ -1659,6 +1824,7 @@ struct dynamic_opts_s {
 JEMALLOC_ALWAYS_INLINE void
 dynamic_opts_init(dynamic_opts_t *dynamic_opts) {
 	dynamic_opts->result = NULL;
+	dynamic_opts->usize = 0;
 	dynamic_opts->num_items = 0;
 	dynamic_opts->item_size = 0;
 	dynamic_opts->alignment = 0;
@@ -1722,12 +1888,13 @@ imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd,
 	szind_t ind_large;
 	size_t bumped_usize = usize;
 
-	if (usize <= SMALL_MAXCLASS) {
-		assert(((dopts->alignment == 0) ? sz_s2u(LARGE_MINCLASS) :
-		    sz_sa2u(LARGE_MINCLASS, dopts->alignment))
-		    == LARGE_MINCLASS);
-		ind_large = sz_size2index(LARGE_MINCLASS);
-		bumped_usize = sz_s2u(LARGE_MINCLASS);
+	if (usize <= SC_SMALL_MAXCLASS) {
+		assert(((dopts->alignment == 0) ?
+		    sz_s2u(SC_LARGE_MINCLASS) :
+		    sz_sa2u(SC_LARGE_MINCLASS, dopts->alignment))
+			== SC_LARGE_MINCLASS);
+		ind_large = sz_size2index(SC_LARGE_MINCLASS);
+		bumped_usize = sz_s2u(SC_LARGE_MINCLASS);
 		ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize,
 		    bumped_usize, ind_large);
 		if (unlikely(ret == NULL)) {
@@ -1811,12 +1978,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 	}
 
 	/* Validate the user input. */
-	if (sopts->bump_empty_alloc) {
-		if (unlikely(size == 0)) {
-			size = 1;
-		}
-	}
-
 	if (sopts->assert_nonempty_alloc) {
 		assert (size != 0);
 	}
@@ -1830,16 +1991,20 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 
 	if (dopts->alignment == 0) {
 		ind = sz_size2index(size);
-		if (unlikely(ind >= NSIZES)) {
+		if (unlikely(ind >= SC_NSIZES)) {
 			goto label_oom;
 		}
-		if (config_stats || (config_prof && opt_prof)) {
+		if (config_stats || (config_prof && opt_prof) || sopts->usize) {
 			usize = sz_index2size(ind);
-			assert(usize > 0 && usize <= LARGE_MAXCLASS);
+			dopts->usize = usize;
+			assert(usize > 0 && usize
+			    <= SC_LARGE_MAXCLASS);
 		}
 	} else {
 		usize = sz_sa2u(size, dopts->alignment);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+		dopts->usize = usize;
+		if (unlikely(usize == 0
+		    || usize > SC_LARGE_MAXCLASS)) {
 			goto label_oom;
 		}
 	}
@@ -1875,7 +2040,8 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 
 		alloc_ctx_t alloc_ctx;
 		if (likely((uintptr_t)tctx == (uintptr_t)1U)) {
-			alloc_ctx.slab = (usize <= SMALL_MAXCLASS);
+			alloc_ctx.slab = (usize
+			    <= SC_SMALL_MAXCLASS);
 			allocation = imalloc_no_sample(
 			    sopts, dopts, tsd, usize, usize, ind);
 		} else if ((uintptr_t)tctx > (uintptr_t)1U) {
@@ -1980,9 +2146,8 @@ label_invalid_alignment:
 	return EINVAL;
 }
 
-/* Returns the errno-style error code of the allocation. */
-JEMALLOC_ALWAYS_INLINE int
-imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
+JEMALLOC_ALWAYS_INLINE bool
+imalloc_init_check(static_opts_t *sopts, dynamic_opts_t *dopts) {
 	if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) {
 		if (config_xmalloc && unlikely(opt_xmalloc)) {
 			malloc_write(sopts->oom_string);
@@ -1992,6 +2157,16 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
 		set_errno(ENOMEM);
 		*dopts->result = NULL;
 
+		return false;
+	}
+
+	return true;
+}
+
+/* Returns the errno-style error code of the allocation. */
+JEMALLOC_ALWAYS_INLINE int
+imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
+	if (tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) {
 		return ENOMEM;
 	}
 
@@ -2004,19 +2179,18 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
 		sopts->slow = false;
 		return imalloc_body(sopts, dopts, tsd);
 	} else {
+		if (!tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) {
+			return ENOMEM;
+		}
+
 		sopts->slow = true;
 		return imalloc_body(sopts, dopts, tsd);
 	}
 }
-/******************************************************************************/
-/*
- * Begin malloc(3)-compatible functions.
- */
 
-JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
-void JEMALLOC_NOTHROW *
-JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
-je_malloc(size_t size) {
+JEMALLOC_NOINLINE
+void *
+malloc_default(size_t size) {
 	void *ret;
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
@@ -2026,7 +2200,6 @@ je_malloc(size_t size) {
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.set_errno_on_error = true;
 	sopts.oom_string = "<jemalloc>: Error in malloc(): out of memory\n";
@@ -2036,12 +2209,107 @@ je_malloc(size_t size) {
 	dopts.item_size = size;
 
 	imalloc(&sopts, &dopts);
+	/*
+	 * Note that this branch gets optimized away -- it immediately follows
+	 * the check on tsd_fast that sets sopts.slow.
+	 */
+	if (sopts.slow) {
+		uintptr_t args[3] = {size};
+		hook_invoke_alloc(hook_alloc_malloc, ret, (uintptr_t)ret, args);
+	}
 
 	LOG("core.malloc.exit", "result: %p", ret);
 
 	return ret;
 }
 
+/******************************************************************************/
+/*
+ * Begin malloc(3)-compatible functions.
+ */
+
+/*
+ * malloc() fastpath.
+ *
+ * The fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit the
+ * tcache.  If either of these is false, we tail-call to the slowpath,
+ * malloc_default().  Tail-calling is used to avoid saving any
+ * caller-saved registers.
+ *
+ * The fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ */
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
+je_malloc(size_t size) {
+	LOG("core.malloc.entry", "size: %zu", size);
+
+	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+		return malloc_default(size);
+	}
+
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely(!tsd || !tsd_fast(tsd) || (size > SC_LOOKUP_MAXCLASS))) {
+		return malloc_default(size);
+	}
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+
+	if (unlikely(ticker_trytick(&tcache->gc_ticker))) {
+		return malloc_default(size);
+	}
+
+	szind_t ind = sz_size2index_lookup(size);
+	size_t usize;
+	if (config_stats || config_prof) {
+		usize = sz_index2size(ind);
+	}
+	/* Fastpath relies on size mapping to a bin, i.e. SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS. */
+	assert(ind < SC_NBINS);
+	assert(size <= SC_SMALL_MAXCLASS);
+
+	if (config_prof) {
+		int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd);
+		bytes_until_sample -= usize;
+		tsd_bytes_until_sample_set(tsd, bytes_until_sample);
+
+		if (unlikely(bytes_until_sample < 0)) {
+			/*
+			 * Avoid a prof_active check on the fastpath.
+			 * If prof_active is false, set bytes_until_sample to
+			 * a large value.  If prof_active is set to true,
+			 * bytes_until_sample will be reset.
+			 */
+			if (!prof_active) {
+				tsd_bytes_until_sample_set(tsd, SSIZE_MAX);
+			}
+			return malloc_default(size);
+		}
+	}
+
+	cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
+	bool tcache_success;
+	void* ret = cache_bin_alloc_easy(bin, &tcache_success);
+
+	if (tcache_success) {
+		if (config_stats) {
+			*tsd_thread_allocatedp_get(tsd) += usize;
+			bin->tstats.nrequests++;
+		}
+		if (config_prof) {
+			tcache->prof_accumbytes += usize;
+		}
+
+		LOG("core.malloc.exit", "result: %p", ret);
+
+		/* Fastpath success; stats and prof counters are updated. */
+		return ret;
+	}
+
+	return malloc_default(size);
+}
+
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
 JEMALLOC_ATTR(nonnull(1))
 je_posix_memalign(void **memptr, size_t alignment, size_t size) {
@@ -2055,7 +2323,6 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) {
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.min_alignment = sizeof(void *);
 	sopts.oom_string =
 	    "<jemalloc>: Error allocating aligned memory: out of memory\n";
@@ -2068,6 +2335,12 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) {
 	dopts.alignment = alignment;
 
 	ret = imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {(uintptr_t)memptr, (uintptr_t)alignment,
+			(uintptr_t)size};
+		hook_invoke_alloc(hook_alloc_posix_memalign, *memptr,
+		    (uintptr_t)ret, args);
+	}
 
 	LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret,
 	    *memptr);
@@ -2090,7 +2363,6 @@ je_aligned_alloc(size_t alignment, size_t size) {
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.set_errno_on_error = true;
 	sopts.min_alignment = 1;
@@ -2105,6 +2377,11 @@ je_aligned_alloc(size_t alignment, size_t size) {
 	dopts.alignment = alignment;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {(uintptr_t)alignment, (uintptr_t)size};
+		hook_invoke_alloc(hook_alloc_aligned_alloc, ret,
+		    (uintptr_t)ret, args);
+	}
 
 	LOG("core.aligned_alloc.exit", "result: %p", ret);
 
@@ -2125,7 +2402,6 @@ je_calloc(size_t num, size_t size) {
 	dynamic_opts_init(&dopts);
 
 	sopts.may_overflow = true;
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.set_errno_on_error = true;
 	sopts.oom_string = "<jemalloc>: Error in calloc(): out of memory\n";
@@ -2136,6 +2412,10 @@ je_calloc(size_t num, size_t size) {
 	dopts.zero = true;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {(uintptr_t)num, (uintptr_t)size};
+		hook_invoke_alloc(hook_alloc_calloc, ret, (uintptr_t)ret, args);
+	}
 
 	LOG("core.calloc.exit", "result: %p", ret);
 
@@ -2144,20 +2424,22 @@ je_calloc(size_t num, size_t size) {
 
 static void *
 irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
-    prof_tctx_t *tctx) {
+    prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 
 	if (tctx == NULL) {
 		return NULL;
 	}
-	if (usize <= SMALL_MAXCLASS) {
-		p = iralloc(tsd, old_ptr, old_usize, LARGE_MINCLASS, 0, false);
+	if (usize <= SC_SMALL_MAXCLASS) {
+		p = iralloc(tsd, old_ptr, old_usize,
+		    SC_LARGE_MINCLASS, 0, false, hook_args);
 		if (p == NULL) {
 			return NULL;
 		}
 		arena_prof_promote(tsd_tsdn(tsd), p, usize);
 	} else {
-		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false);
+		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false,
+		    hook_args);
 	}
 
 	return p;
@@ -2165,7 +2447,7 @@ irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
 
 JEMALLOC_ALWAYS_INLINE void *
 irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
-   alloc_ctx_t *alloc_ctx) {
+   alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 	bool prof_active;
 	prof_tctx_t *old_tctx, *tctx;
@@ -2174,9 +2456,11 @@ irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
 	old_tctx = prof_tctx_get(tsd_tsdn(tsd), old_ptr, alloc_ctx);
 	tctx = prof_alloc_prep(tsd, usize, prof_active, true);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
-		p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx);
+		p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx,
+		    hook_args);
 	} else {
-		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false);
+		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false,
+		    hook_args);
 	}
 	if (unlikely(p == NULL)) {
 		prof_alloc_rollback(tsd, tctx, true);
@@ -2205,7 +2489,7 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) {
 	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 	rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-	assert(alloc_ctx.szind != NSIZES);
+	assert(alloc_ctx.szind != SC_NSIZES);
 
 	size_t usize;
 	if (config_prof && opt_prof) {
@@ -2286,11 +2570,12 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) {
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ALLOC_SIZE(2)
-je_realloc(void *ptr, size_t size) {
+je_realloc(void *ptr, size_t arg_size) {
 	void *ret;
 	tsdn_t *tsdn JEMALLOC_CC_SILENCE_INIT(NULL);
 	size_t usize JEMALLOC_CC_SILENCE_INIT(0);
 	size_t old_usize = 0;
+	size_t size = arg_size;
 
 	LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size);
 
@@ -2305,6 +2590,10 @@ je_realloc(void *ptr, size_t size) {
 			} else {
 				tcache = NULL;
 			}
+
+			uintptr_t args[3] = {(uintptr_t)ptr, size};
+			hook_invoke_dalloc(hook_dalloc_realloc, ptr, args);
+
 			ifree(tsd, ptr, tcache, true);
 
 			LOG("core.realloc.exit", "result: %p", NULL);
@@ -2319,29 +2608,58 @@ je_realloc(void *ptr, size_t size) {
 
 		check_entry_exit_locking(tsd_tsdn(tsd));
 
+
+		hook_ralloc_args_t hook_args = {true, {(uintptr_t)ptr,
+			(uintptr_t)arg_size, 0, 0}};
+
 		alloc_ctx_t alloc_ctx;
 		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 		rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 		    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-		assert(alloc_ctx.szind != NSIZES);
+		assert(alloc_ctx.szind != SC_NSIZES);
 		old_usize = sz_index2size(alloc_ctx.szind);
 		assert(old_usize == isalloc(tsd_tsdn(tsd), ptr));
 		if (config_prof && opt_prof) {
 			usize = sz_s2u(size);
-			ret = unlikely(usize == 0 || usize > LARGE_MAXCLASS) ?
-			    NULL : irealloc_prof(tsd, ptr, old_usize, usize,
-			    &alloc_ctx);
+			if (unlikely(usize == 0
+			    || usize > SC_LARGE_MAXCLASS)) {
+				ret = NULL;
+			} else {
+				ret = irealloc_prof(tsd, ptr, old_usize, usize,
+				    &alloc_ctx, &hook_args);
+			}
 		} else {
 			if (config_stats) {
 				usize = sz_s2u(size);
 			}
-			ret = iralloc(tsd, ptr, old_usize, size, 0, false);
+			ret = iralloc(tsd, ptr, old_usize, size, 0, false,
+			    &hook_args);
 		}
 		tsdn = tsd_tsdn(tsd);
 	} else {
 		/* realloc(NULL, size) is equivalent to malloc(size). */
-		void *ret = je_malloc(size);
-		LOG("core.realloc.exit", "result: %p", ret);
+		static_opts_t sopts;
+		dynamic_opts_t dopts;
+
+		static_opts_init(&sopts);
+		dynamic_opts_init(&dopts);
+
+		sopts.null_out_result_on_error = true;
+		sopts.set_errno_on_error = true;
+		sopts.oom_string =
+		    "<jemalloc>: Error in realloc(): out of memory\n";
+
+		dopts.result = &ret;
+		dopts.num_items = 1;
+		dopts.item_size = size;
+
+		imalloc(&sopts, &dopts);
+		if (sopts.slow) {
+			uintptr_t args[3] = {(uintptr_t)ptr, arg_size};
+			hook_invoke_alloc(hook_alloc_realloc, ret,
+			    (uintptr_t)ret, args);
+		}
+
 		return ret;
 	}
 
@@ -2368,10 +2686,9 @@ je_realloc(void *ptr, size_t size) {
 	return ret;
 }
 
-JEMALLOC_EXPORT void JEMALLOC_NOTHROW
-je_free(void *ptr) {
-	LOG("core.free.entry", "ptr: %p", ptr);
-
+JEMALLOC_NOINLINE
+void
+free_default(void *ptr) {
 	UTRACE(ptr, 0, 0);
 	if (likely(ptr != NULL)) {
 		/*
@@ -2397,10 +2714,79 @@ je_free(void *ptr) {
 			} else {
 				tcache = NULL;
 			}
+			uintptr_t args_raw[3] = {(uintptr_t)ptr};
+			hook_invoke_dalloc(hook_dalloc_free, ptr, args_raw);
 			ifree(tsd, ptr, tcache, true);
 		}
 		check_entry_exit_locking(tsd_tsdn(tsd));
 	}
+}
+
+/* free()/sdallocx() fastpath; returns false if the slow path is needed. */
+JEMALLOC_ALWAYS_INLINE
+bool free_fastpath(void *ptr, size_t size, bool size_hint) {
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely(!tsd || !tsd_fast(tsd))) {
+		return false;
+	}
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+	alloc_ctx_t alloc_ctx;
+	/*
+	 * If !config_cache_oblivious, we can check PAGE alignment to detect
+	 * sampled objects.  Otherwise addresses are randomized, and we have
+	 * to look it up in the rtree anyway.  See also isfree().
+	 */
+	if (!size_hint || config_cache_oblivious) {
+		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
+		bool res = rtree_szind_slab_read_fast(tsd_tsdn(tsd),
+		    &extents_rtree, rtree_ctx, (uintptr_t)ptr,
+		    &alloc_ctx.szind, &alloc_ctx.slab);
+
+		/* Note: profiled objects will have alloc_ctx.slab set */
+		if (!res || !alloc_ctx.slab) {
+			return false;
+		}
+		/* Only trust szind after a successful lookup. */
+		assert(alloc_ctx.szind != SC_NSIZES);
+	} else {
+		/*
+		 * Check for both sizes that are too large, and for sampled
+		 * objects.  Sampled objects are always page-aligned.  The
+		 * sampled object check will also check for null ptr.
+		 */
+		if (size > SC_LOOKUP_MAXCLASS || (((uintptr_t)ptr & PAGE_MASK) == 0)) {
+			return false;
+		}
+		alloc_ctx.szind = sz_size2index_lookup(size);
+	}
+
+	if (unlikely(ticker_trytick(&tcache->gc_ticker))) {
+		return false;
+	}
+
+	cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind);
+	cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind];
+	if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) {
+		return false;
+	}
+
+	if (config_stats) {
+		size_t usize = sz_index2size(alloc_ctx.szind);
+		*tsd_thread_deallocatedp_get(tsd) += usize;
+	}
+
+	return true;
+}
+
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW
+je_free(void *ptr) {
+	LOG("core.free.entry", "ptr: %p", ptr);
+	/* Whatever the fastpath declines is handed to free_default(). */
+	bool handled = free_fastpath(ptr, 0, false);
+	if (!handled) {
+		free_default(ptr);
+	}
 	LOG("core.free.exit", "");
 }
 
@@ -2427,7 +2813,6 @@ je_memalign(size_t alignment, size_t size) {
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.min_alignment = 1;
 	sopts.oom_string =
 	    "<jemalloc>: Error allocating aligned memory: out of memory\n";
@@ -2441,6 +2826,11 @@ je_memalign(size_t alignment, size_t size) {
 	dopts.alignment = alignment;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {alignment, size};
+		hook_invoke_alloc(hook_alloc_memalign, ret, (uintptr_t)ret,
+		    args);
+	}
 
 	LOG("core.memalign.exit", "result: %p", ret);
 	return ret;
@@ -2462,7 +2852,6 @@ je_valloc(size_t size) {
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.min_alignment = PAGE;
 	sopts.oom_string =
@@ -2476,6 +2865,10 @@ je_valloc(size_t size) {
 	dopts.alignment = PAGE;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {size};
+		hook_invoke_alloc(hook_alloc_valloc, ret, (uintptr_t)ret, args);
+	}
 
 	LOG("core.valloc.exit", "result: %p\n", ret);
 	return ret;
@@ -2543,6 +2936,82 @@ int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign);
  * Begin non-standard functions.
  */
 
+#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
+
+#define JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) x ## y
+#define JEMALLOC_SMALLOCX_CONCAT_HELPER2(x, y)  \
+  JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y)
+
+/* smallocx() result: the allocation plus the usize computed for it. */
+typedef struct {
+	void *ptr;
+	size_t size;
+} smallocx_return_t;
+
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+smallocx_return_t JEMALLOC_NOTHROW
+/*
+ * The attribute JEMALLOC_ATTR(malloc) cannot be used due to:
+ *  - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86488
+ */
+JEMALLOC_SMALLOCX_CONCAT_HELPER2(je_smallocx_, JEMALLOC_VERSION_GID_IDENT)
+  (size_t size, int flags) {
+	/*
+	 * Note: the attribute JEMALLOC_ALLOC_SIZE(1) cannot be used here
+	 * because it makes writing beyond the `size` of the `ptr` undefined
+	 * behavior, but the objective of this function is to allow writing
+	 * beyond `size` up to `smallocx_return_t::size`.
+	 */
+	smallocx_return_t ret;
+	static_opts_t sopts;
+	dynamic_opts_t dopts;
+
+	LOG("core.smallocx.entry", "size: %zu, flags: %d", size, flags);
+
+	static_opts_init(&sopts);
+	dynamic_opts_init(&dopts);
+
+	sopts.assert_nonempty_alloc = true;
+	sopts.null_out_result_on_error = true;
+	sopts.oom_string = "<jemalloc>: Error in smallocx(): out of memory\n";
+	sopts.usize = true;
+
+	dopts.result = &ret.ptr;
+	dopts.num_items = 1;
+	dopts.item_size = size;
+	if (unlikely(flags != 0)) {
+		if ((flags & MALLOCX_LG_ALIGN_MASK) != 0) {
+			dopts.alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags);
+		}
+		dopts.zero = MALLOCX_ZERO_GET(flags);
+
+		if ((flags & MALLOCX_TCACHE_MASK) != 0) {
+			if ((flags & MALLOCX_TCACHE_MASK)
+			    == MALLOCX_TCACHE_NONE) {
+				dopts.tcache_ind = TCACHE_IND_NONE;
+			} else {
+				dopts.tcache_ind = MALLOCX_TCACHE_GET(flags);
+			}
+		} else {
+			dopts.tcache_ind = TCACHE_IND_AUTOMATIC;
+		}
+
+		if ((flags & MALLOCX_ARENA_MASK) != 0) {
+			dopts.arena_ind = MALLOCX_ARENA_GET(flags);
+		}
+	}
+
+	imalloc(&sopts, &dopts);
+	assert(dopts.usize == je_nallocx(size, flags));
+	ret.size = dopts.usize;
+
+	LOG("core.smallocx.exit", "result: %p, size: %zu", ret.ptr, ret.size);
+	return ret;
+}
+#undef JEMALLOC_SMALLOCX_CONCAT_HELPER
+#undef JEMALLOC_SMALLOCX_CONCAT_HELPER2
+#endif
+
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
@@ -2586,6 +3055,11 @@ je_mallocx(size_t size, int flags) {
 	}
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {size, flags};
+		hook_invoke_alloc(hook_alloc_mallocx, ret, (uintptr_t)ret,
+		    args);
+	}
 
 	LOG("core.mallocx.exit", "result: %p", ret);
 	return ret;
@@ -2594,22 +3068,23 @@ je_mallocx(size_t size, int flags) {
 static void *
 irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize,
     size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
-    prof_tctx_t *tctx) {
+    prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 
 	if (tctx == NULL) {
 		return NULL;
 	}
-	if (usize <= SMALL_MAXCLASS) {
-		p = iralloct(tsdn, old_ptr, old_usize, LARGE_MINCLASS,
-		    alignment, zero, tcache, arena);
+	if (usize <= SC_SMALL_MAXCLASS) {
+		p = iralloct(tsdn, old_ptr, old_usize,
+		    SC_LARGE_MINCLASS, alignment, zero, tcache,
+		    arena, hook_args);
 		if (p == NULL) {
 			return NULL;
 		}
 		arena_prof_promote(tsdn, p, usize);
 	} else {
 		p = iralloct(tsdn, old_ptr, old_usize, usize, alignment, zero,
-		    tcache, arena);
+		    tcache, arena, hook_args);
 	}
 
 	return p;
@@ -2618,7 +3093,7 @@ irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize,
 JEMALLOC_ALWAYS_INLINE void *
 irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size,
     size_t alignment, size_t *usize, bool zero, tcache_t *tcache,
-    arena_t *arena, alloc_ctx_t *alloc_ctx) {
+    arena_t *arena, alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 	bool prof_active;
 	prof_tctx_t *old_tctx, *tctx;
@@ -2628,10 +3103,10 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size,
 	tctx = prof_alloc_prep(tsd, *usize, prof_active, false);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
 		p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize,
-		    *usize, alignment, zero, tcache, arena, tctx);
+		    *usize, alignment, zero, tcache, arena, tctx, hook_args);
 	} else {
 		p = iralloct(tsd_tsdn(tsd), old_ptr, old_usize, size, alignment,
-		    zero, tcache, arena);
+		    zero, tcache, arena, hook_args);
 	}
 	if (unlikely(p == NULL)) {
 		prof_alloc_rollback(tsd, tctx, false);
@@ -2702,23 +3177,27 @@ je_rallocx(void *ptr, size_t size, int flags) {
 	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 	rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-	assert(alloc_ctx.szind != NSIZES);
+	assert(alloc_ctx.szind != SC_NSIZES);
 	old_usize = sz_index2size(alloc_ctx.szind);
 	assert(old_usize == isalloc(tsd_tsdn(tsd), ptr));
+
+	hook_ralloc_args_t hook_args = {false, {(uintptr_t)ptr, size, flags,
+		0}};
 	if (config_prof && opt_prof) {
 		usize = (alignment == 0) ?
 		    sz_s2u(size) : sz_sa2u(size, alignment);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+		if (unlikely(usize == 0
+		    || usize > SC_LARGE_MAXCLASS)) {
 			goto label_oom;
 		}
 		p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize,
-		    zero, tcache, arena, &alloc_ctx);
+		    zero, tcache, arena, &alloc_ctx, &hook_args);
 		if (unlikely(p == NULL)) {
 			goto label_oom;
 		}
 	} else {
 		p = iralloct(tsd_tsdn(tsd), ptr, old_usize, size, alignment,
-		    zero, tcache, arena);
+		    zero, tcache, arena, &hook_args);
 		if (unlikely(p == NULL)) {
 			goto label_oom;
 		}
@@ -2752,14 +3231,14 @@ label_oom:
 JEMALLOC_ALWAYS_INLINE size_t
 ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size,
     size_t extra, size_t alignment, bool zero) {
-	size_t usize;
+	size_t newsize;
 
-	if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero)) {
+	if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero,
+	    &newsize)) {
 		return old_usize;
 	}
-	usize = isalloc(tsdn, ptr);
 
-	return usize;
+	return newsize;
 }
 
 static size_t
@@ -2793,17 +3272,19 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size,
 	 */
 	if (alignment == 0) {
 		usize_max = sz_s2u(size+extra);
-		assert(usize_max > 0 && usize_max <= LARGE_MAXCLASS);
+		assert(usize_max > 0
+		    && usize_max <= SC_LARGE_MAXCLASS);
 	} else {
 		usize_max = sz_sa2u(size+extra, alignment);
-		if (unlikely(usize_max == 0 || usize_max > LARGE_MAXCLASS)) {
+		if (unlikely(usize_max == 0
+		    || usize_max > SC_LARGE_MAXCLASS)) {
 			/*
 			 * usize_max is out of range, and chances are that
 			 * allocation will fail, but use the maximum possible
 			 * value and carry on with prof_alloc_prep(), just in
 			 * case allocation succeeds.
 			 */
-			usize_max = LARGE_MAXCLASS;
+			usize_max = SC_LARGE_MAXCLASS;
 		}
 	}
 	tctx = prof_alloc_prep(tsd, usize_max, prof_active, false);
@@ -2846,24 +3327,24 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
 	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 	rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-	assert(alloc_ctx.szind != NSIZES);
+	assert(alloc_ctx.szind != SC_NSIZES);
 	old_usize = sz_index2size(alloc_ctx.szind);
 	assert(old_usize == isalloc(tsd_tsdn(tsd), ptr));
 	/*
 	 * The API explicitly absolves itself of protecting against (size +
 	 * extra) numerical overflow, but we may need to clamp extra to avoid
-	 * exceeding LARGE_MAXCLASS.
+	 * exceeding SC_LARGE_MAXCLASS.
 	 *
 	 * Ordinarily, size limit checking is handled deeper down, but here we
 	 * have to check as part of (size + extra) clamping, since we need the
 	 * clamped value in the above helper functions.
 	 */
-	if (unlikely(size > LARGE_MAXCLASS)) {
+	if (unlikely(size > SC_LARGE_MAXCLASS)) {
 		usize = old_usize;
 		goto label_not_resized;
 	}
-	if (unlikely(LARGE_MAXCLASS - size < extra)) {
-		extra = LARGE_MAXCLASS - size;
+	if (unlikely(SC_LARGE_MAXCLASS - size < extra)) {
+		extra = SC_LARGE_MAXCLASS - size;
 	}
 
 	if (config_prof && opt_prof) {
@@ -2882,6 +3363,12 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
 		*tsd_thread_deallocatedp_get(tsd) += old_usize;
 	}
 label_not_resized:
+	if (unlikely(!tsd_fast(tsd))) {
+		uintptr_t args[4] = {(uintptr_t)ptr, size, extra, flags};
+		hook_invoke_expand(hook_expand_xallocx, ptr, old_usize,
+		    usize, (uintptr_t)usize, args);
+	}
+
 	UTRACE(ptr, size, ptr);
 	check_entry_exit_locking(tsd_tsdn(tsd));
 
@@ -2891,7 +3378,7 @@ label_not_resized:
 
 JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
 JEMALLOC_ATTR(pure)
-je_sallocx(const void *ptr, UNUSED int flags) {
+je_sallocx(const void *ptr, int flags) {
 	size_t usize;
 	tsdn_t *tsdn;
 
@@ -2954,6 +3441,8 @@ je_dallocx(void *ptr, int flags) {
 		tsd_assert_fast(tsd);
 		ifree(tsd, ptr, tcache, false);
 	} else {
+		uintptr_t args_raw[3] = {(uintptr_t)ptr, flags};
+		hook_invoke_dalloc(hook_dalloc_dallocx, ptr, args_raw);
 		ifree(tsd, ptr, tcache, true);
 	}
 	check_entry_exit_locking(tsd_tsdn(tsd));
@@ -2975,14 +3464,11 @@ inallocx(tsdn_t *tsdn, size_t size, int flags) {
 	return usize;
 }
 
-JEMALLOC_EXPORT void JEMALLOC_NOTHROW
-je_sdallocx(void *ptr, size_t size, int flags) {
+JEMALLOC_NOINLINE void
+sdallocx_default(void *ptr, size_t size, int flags) {
 	assert(ptr != NULL);
 	assert(malloc_initialized() || IS_INITIALIZER);
 
-	LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
-	    size, flags);
-
 	tsd_t *tsd = tsd_fetch();
 	bool fast = tsd_fast(tsd);
 	size_t usize = inallocx(tsd_tsdn(tsd), size, flags);
@@ -3016,10 +3502,23 @@ je_sdallocx(void *ptr, size_t size, int flags) {
 		tsd_assert_fast(tsd);
 		isfree(tsd, ptr, usize, tcache, false);
 	} else {
+		uintptr_t args_raw[3] = {(uintptr_t)ptr, size, flags};
+		hook_invoke_dalloc(hook_dalloc_sdallocx, ptr, args_raw);
 		isfree(tsd, ptr, usize, tcache, true);
 	}
 	check_entry_exit_locking(tsd_tsdn(tsd));
 
+}
+
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW
+je_sdallocx(void *ptr, size_t size, int flags) {
+	LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
+	    size, flags);
+	/* Nonzero flags always go to sdallocx_default(). */
+	bool handled = (flags == 0) && free_fastpath(ptr, size, true);
+	if (!handled) {
+		sdallocx_default(ptr, size, flags);
+	}
 	LOG("core.sdallocx.exit", "");
 }
 
@@ -3040,7 +3539,7 @@ je_nallocx(size_t size, int flags) {
 	check_entry_exit_locking(tsdn);
 
 	usize = inallocx(tsdn, size, flags);
-	if (unlikely(usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize > SC_LARGE_MAXCLASS)) {
 		LOG("core.nallocx.exit", "result: %zu", ZU(0));
 		return 0;
 	}
@@ -3256,6 +3755,7 @@ _malloc_prefork(void)
 		}
 	}
 	prof_prefork1(tsd_tsdn(tsd));
+	tsd_prefork(tsd);
 }
 
 #ifndef JEMALLOC_MUTEX_INIT_CB
@@ -3278,6 +3778,8 @@ _malloc_postfork(void)
 
 	tsd = tsd_fetch();
 
+	tsd_postfork_parent(tsd);
+
 	witness_postfork_parent(tsd_witness_tsdp_get(tsd));
 	/* Release all mutexes, now that fork() has completed. */
 	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
@@ -3305,6 +3807,8 @@ jemalloc_postfork_child(void) {
 
 	tsd = tsd_fetch();
 
+	tsd_postfork_child(tsd);
+
 	witness_postfork_child(tsd_witness_tsdp_get(tsd));
 	/* Release all mutexes, now that fork() has completed. */
 	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
diff --git a/src/large.c b/src/large.c
index 27a2c679..8e7a781d 100644
--- a/src/large.c
+++ b/src/large.c
@@ -28,7 +28,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
 	assert(!tsdn_null(tsdn) || arena != NULL);
 
 	ausize = sz_sa2u(usize, alignment);
-	if (unlikely(ausize == 0 || ausize > LARGE_MAXCLASS)) {
+	if (unlikely(ausize == 0 || ausize > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 
@@ -42,7 +42,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
 	 */
 	is_zeroed = zero;
 	if (likely(!tsdn_null(tsdn))) {
-		arena = arena_choose(tsdn_tsd(tsdn), arena);
+		arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, usize);
 	}
 	if (unlikely(arena == NULL) || (extent = arena_extent_alloc_large(tsdn,
 	    arena, usize, alignment, &is_zeroed)) == NULL) {
@@ -109,7 +109,7 @@ large_ralloc_no_move_shrink(tsdn_t *tsdn, extent_t *extent, size_t usize) {
 	if (diff != 0) {
 		extent_t *trail = extent_split_wrapper(tsdn, arena,
 		    &extent_hooks, extent, usize + sz_large_pad,
-		    sz_size2index(usize), false, diff, NSIZES, false);
+		    sz_size2index(usize), false, diff, SC_NSIZES, false);
 		if (trail == NULL) {
 			return true;
 		}
@@ -154,17 +154,17 @@ large_ralloc_no_move_expand(tsdn_t *tsdn, extent_t *extent, size_t usize,
 	bool new_mapping;
 	if ((trail = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_dirty, extent_past_get(extent), trailsize, 0,
-	    CACHELINE, false, NSIZES, &is_zeroed_trail, &commit)) != NULL
+	    CACHELINE, false, SC_NSIZES, &is_zeroed_trail, &commit)) != NULL
 	    || (trail = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_muzzy, extent_past_get(extent), trailsize, 0,
-	    CACHELINE, false, NSIZES, &is_zeroed_trail, &commit)) != NULL) {
+	    CACHELINE, false, SC_NSIZES, &is_zeroed_trail, &commit)) != NULL) {
 		if (config_stats) {
 			new_mapping = false;
 		}
 	} else {
 		if ((trail = extent_alloc_wrapper(tsdn, arena, &extent_hooks,
 		    extent_past_get(extent), trailsize, 0, CACHELINE, false,
-		    NSIZES, &is_zeroed_trail, &commit)) == NULL) {
+		    SC_NSIZES, &is_zeroed_trail, &commit)) == NULL) {
 			return true;
 		}
 		if (config_stats) {
@@ -221,9 +221,10 @@ large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min,
 	size_t oldusize = extent_usize_get(extent);
 
 	/* The following should have been caught by callers. */
-	assert(usize_min > 0 && usize_max <= LARGE_MAXCLASS);
+	assert(usize_min > 0 && usize_max <= SC_LARGE_MAXCLASS);
 	/* Both allocation sizes must be large to avoid a move. */
-	assert(oldusize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS);
+	assert(oldusize >= SC_LARGE_MINCLASS
+	    && usize_max >= SC_LARGE_MINCLASS);
 
 	if (usize_max > oldusize) {
 		/* Attempt to expand the allocation in-place. */
@@ -270,17 +271,23 @@ large_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize,
 }
 
 void *
-large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize,
-    size_t alignment, bool zero, tcache_t *tcache) {
-	size_t oldusize = extent_usize_get(extent);
+large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize,
+    size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args) {
+	extent_t *extent = iealloc(tsdn, ptr);
 
+	size_t oldusize = extent_usize_get(extent);
 	/* The following should have been caught by callers. */
-	assert(usize > 0 && usize <= LARGE_MAXCLASS);
+	assert(usize > 0 && usize <= SC_LARGE_MAXCLASS);
 	/* Both allocation sizes must be large to avoid a move. */
-	assert(oldusize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS);
+	assert(oldusize >= SC_LARGE_MINCLASS
+	    && usize >= SC_LARGE_MINCLASS);
 
 	/* Try to avoid moving the allocation. */
 	if (!large_ralloc_no_move(tsdn, extent, usize, usize, zero)) {
+		hook_invoke_expand(hook_args->is_realloc
+		    ? hook_expand_realloc : hook_expand_rallocx, ptr, oldusize,
+		    usize, (uintptr_t)ptr, hook_args->args);
 		return extent_addr_get(extent);
 	}
 
@@ -295,6 +302,12 @@ large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize,
 		return NULL;
 	}
 
+	hook_invoke_alloc(hook_args->is_realloc
+	    ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret,
+	    hook_args->args);
+	hook_invoke_dalloc(hook_args->is_realloc
+	    ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
+
 	size_t copysize = (usize < oldusize) ? usize : oldusize;
 	memcpy(ret, extent_addr_get(extent), copysize);
 	isdalloct(tsdn, extent_addr_get(extent), oldusize, tcache, NULL, true);
@@ -318,8 +331,9 @@ large_dalloc_prep_impl(tsdn_t *tsdn, arena_t *arena, extent_t *extent,
 		large_dalloc_maybe_junk(extent_addr_get(extent),
 		    extent_usize_get(extent));
 	} else {
-		malloc_mutex_assert_owner(tsdn, &arena->large_mtx);
+		/* Only hold the large_mtx if necessary. */
 		if (!arena_is_auto(arena)) {
+			malloc_mutex_assert_owner(tsdn, &arena->large_mtx);
 			extent_list_remove(&arena->large, extent);
 		}
 	}
@@ -369,3 +383,13 @@ void
 large_prof_tctx_reset(tsdn_t *tsdn, extent_t *extent) {
 	large_prof_tctx_set(tsdn, extent, (prof_tctx_t *)(uintptr_t)1U);
 }
+
+nstime_t
+large_prof_alloc_time_get(const extent_t *extent) {
+	return extent_prof_alloc_time_get(extent);
+}
+
+void
+large_prof_alloc_time_set(extent_t *extent, nstime_t t) {
+	extent_prof_alloc_time_set(extent, t);
+}
diff --git a/src/mutex.c b/src/mutex.c
index 30222b3e..3f920f5b 100644
--- a/src/mutex.c
+++ b/src/mutex.c
@@ -46,7 +46,7 @@ JEMALLOC_EXPORT int	_pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
 void
 malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
 	mutex_prof_data_t *data = &mutex->prof_data;
-	UNUSED nstime_t before = NSTIME_ZERO_INITIALIZER;
+	nstime_t before = NSTIME_ZERO_INITIALIZER;
 
 	if (ncpus == 1) {
 		goto label_spin_done;
@@ -55,7 +55,8 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
 	int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN;
 	do {
 		spin_cpu_spinwait();
-		if (!malloc_mutex_trylock_final(mutex)) {
+		if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED)
+                    && !malloc_mutex_trylock_final(mutex)) {
 			data->n_spin_acquired++;
 			return;
 		}
@@ -144,9 +145,7 @@ malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
 	}
 #  endif
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-	mutex->lock = OS_UNFAIR_LOCK_INIT;
-#elif (defined(JEMALLOC_OSSPIN))
-	mutex->lock = 0;
+       mutex->lock = OS_UNFAIR_LOCK_INIT;
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 	if (postpone_init) {
 		mutex->postponed_next = postponed_mutexes;
diff --git a/src/pages.c b/src/pages.c
index 26002692..13de27a0 100644
--- a/src/pages.c
+++ b/src/pages.c
@@ -180,6 +180,35 @@ pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
 	assert(alignment >= PAGE);
 	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);
 
+#if defined(__FreeBSD__) && defined(MAP_EXCL)
+	/*
+	 * FreeBSD has mechanisms both to mmap at specific address without
+	 * touching existing mappings, and to mmap with specific alignment.
+	 */
+	{
+		if (os_overcommits) {
+			*commit = true;
+		}
+
+		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
+		int flags = mmap_flags;
+
+		if (addr != NULL) {
+			flags |= MAP_FIXED | MAP_EXCL;
+		} else {
+			unsigned alignment_bits = ffs_zu(alignment);
+			assert(alignment_bits > 1);
+			flags |= MAP_ALIGNED(alignment_bits - 1);
+		}
+
+		void *ret = mmap(addr, size, prot, flags, -1, 0);
+		if (ret == MAP_FAILED) {
+			ret = NULL;
+		}
+
+		return ret;
+	}
+#endif
 	/*
 	 * Ideally, there would be a way to specify alignment to mmap() (like
 	 * NetBSD has), but in the absence of such a feature, we have to work
@@ -261,7 +290,7 @@ pages_decommit(void *addr, size_t size) {
 
 bool
 pages_purge_lazy(void *addr, size_t size) {
-	assert(PAGE_ADDR2BASE(addr) == addr);
+	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
 	assert(PAGE_CEILING(size) == size);
 
 	if (!pages_can_purge_lazy) {
@@ -391,6 +420,10 @@ os_page_detect(void) {
 	GetSystemInfo(&si);
 	return si.dwPageSize;
 #elif defined(__FreeBSD__)
+	/*
+	 * This returns the value obtained from
+	 * the auxv vector, avoiding a syscall.
+	 */
 	return getpagesize();
 #else
 	long result = sysconf(_SC_PAGESIZE);
@@ -544,6 +577,10 @@ init_thp_state(void) {
 	close(fd);
 #endif
 
+        if (nread < 0) {
+		goto label_error; 
+        }
+
 	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
 		init_system_thp_mode = thp_mode_default;
 	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
@@ -588,6 +625,11 @@ pages_boot(void) {
 
 	init_thp_state();
 
+#ifdef __FreeBSD__
+	/*
+	 * FreeBSD doesn't need the check; madvise(2) is known to work.
+	 */
+#else
 	/* Detect lazy purge runtime support. */
 	if (pages_can_purge_lazy) {
 		bool committed = false;
@@ -601,6 +643,7 @@ pages_boot(void) {
 		}
 		os_pages_unmap(madv_free_page, PAGE);
 	}
+#endif
 
 	return false;
 }
diff --git a/src/prof.c b/src/prof.c
index 13df641a..4d7d65db 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -7,6 +7,7 @@
 #include "jemalloc/internal/hash.h"
 #include "jemalloc/internal/malloc_io.h"
 #include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/emitter.h"
 
 /******************************************************************************/
 
@@ -23,7 +24,7 @@
  */
 #undef _Unwind_Backtrace
 #include <unwind.h>
-#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook)
+#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, test_hooks_libc_hook)
 #endif
 
 /******************************************************************************/
@@ -38,6 +39,7 @@ bool		opt_prof_gdump = false;
 bool		opt_prof_final = false;
 bool		opt_prof_leak = false;
 bool		opt_prof_accum = false;
+bool		opt_prof_log = false;
 char		opt_prof_prefix[
     /* Minimize memory bloat for non-prof builds. */
 #ifdef JEMALLOC_PROF
@@ -70,6 +72,100 @@ uint64_t	prof_interval = 0;
 
 size_t		lg_prof_sample;
 
+typedef enum prof_logging_state_e prof_logging_state_t;
+enum prof_logging_state_e {
+	prof_logging_state_stopped,
+	prof_logging_state_started,
+	prof_logging_state_dumping
+};
+
+/*
+ * - stopped: log_start never called, or previous log_stop has completed.
+ * - started: log_start called, log_stop not called yet. Allocations are logged.
+ * - dumping: log_stop called but not finished; samples are not logged anymore.
+ */
+prof_logging_state_t prof_logging_state = prof_logging_state_stopped;
+
+#ifdef JEMALLOC_JET
+static bool prof_log_dummy = false;
+#endif
+
+/* Incremented for every log file that is output. */
+static uint64_t log_seq = 0;
+static char log_filename[
+    /* Minimize memory bloat for non-prof builds. */
+#ifdef JEMALLOC_PROF
+    PATH_MAX +
+#endif
+    1];
+
+/* Timestamp for most recent call to log_start(). */
+static nstime_t log_start_timestamp = NSTIME_ZERO_INITIALIZER;
+
+/* Increment these when adding to the log_bt and log_thr linked lists. */
+static size_t log_bt_index = 0;
+static size_t log_thr_index = 0;
+
+/* Linked list node definitions. These are only used in prof.c. */
+typedef struct prof_bt_node_s prof_bt_node_t;
+
+struct prof_bt_node_s {
+	prof_bt_node_t *next;	/* Next node in the log_bt list. */
+	size_t index;		/* Value of log_bt_index at insertion. */
+	prof_bt_t bt;		/* bt.vec is pointed at vec[] below. */
+	/* Variable size backtrace vector pointed to by bt. */
+	void *vec[1];
+};
+
+typedef struct prof_thr_node_s prof_thr_node_t;
+
+struct prof_thr_node_s {
+	prof_thr_node_t *next;	/* Next node in the log_thr list. */
+	size_t index;		/* Value of log_thr_index at insertion. */
+	uint64_t thr_uid;	/* Key used for hashing and comparison. */
+	/* Variable size: allocated to hold the whole NUL-terminated name. */
+	char name[1];
+};
+
+typedef struct prof_alloc_node_s prof_alloc_node_t;
+
+/* This is output when logging sampled allocations. */
+struct prof_alloc_node_s {
+	prof_alloc_node_t *next;
+	/* Indices into an array of thread data. */
+	size_t alloc_thr_ind;
+	size_t free_thr_ind;
+
+	/* Indices into an array of backtraces. */
+	size_t alloc_bt_ind;
+	size_t free_bt_ind;
+
+	uint64_t alloc_time_ns;
+	uint64_t free_time_ns;
+
+	size_t usize;
+};
+
+/*
+ * Created on the first call to prof_log_start and deleted on prof_log_stop.
+ * These are the backtraces and threads that have already been logged by an
+ * allocation.
+ */
+static bool log_tables_initialized = false;
+static ckh_t log_bt_node_set;
+static ckh_t log_thr_node_set;
+
+/* Store linked lists for logged data. */
+static prof_bt_node_t *log_bt_first = NULL;
+static prof_bt_node_t *log_bt_last = NULL;
+static prof_thr_node_t *log_thr_first = NULL;
+static prof_thr_node_t *log_thr_last = NULL;
+static prof_alloc_node_t *log_alloc_first = NULL;
+static prof_alloc_node_t *log_alloc_last = NULL;
+
+/* Protects the prof_logging_state and any log_{...} variable. */
+static malloc_mutex_t log_mtx;
+
 /*
  * Table of mutexes that are shared among gctx's.  These are leaf locks, so
  * there is no problem with using them for more than one gctx at the same time.
@@ -145,6 +241,12 @@ static void	prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata,
     bool even_if_attached);
 static char	*prof_thread_name_alloc(tsdn_t *tsdn, const char *thread_name);
 
+/* Hashtable functions for log_bt_node_set and log_thr_node_set. */
+static void prof_thr_node_hash(const void *key, size_t r_hash[2]);
+static bool prof_thr_node_keycomp(const void *k1, const void *k2);
+static void prof_bt_node_hash(const void *key, size_t r_hash[2]);
+static bool prof_bt_node_keycomp(const void *k1, const void *k2);
+
 /******************************************************************************/
 /* Red-black trees. */
 
@@ -242,6 +344,12 @@ prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
     prof_tctx_t *tctx) {
 	prof_tctx_set(tsdn, ptr, usize, NULL, tctx);
 
+	/* Get the current time and set this in the extent_t. We'll read this
+	 * when free() is called. */
+	nstime_t t = NSTIME_ZERO_INITIALIZER;
+	nstime_update(&t);
+	prof_alloc_time_set(tsdn, ptr, NULL, t);
+
 	malloc_mutex_lock(tsdn, tctx->tdata->lock);
 	tctx->cnts.curobjs++;
 	tctx->cnts.curbytes += usize;
@@ -253,14 +361,174 @@ prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
 	malloc_mutex_unlock(tsdn, tctx->tdata->lock);
 }
 
+/* Return bt's index in the log_bt list, adding a copy on first sight. */
+static size_t
+prof_log_bt_index(tsd_t *tsd, prof_bt_t *bt) {
+	assert(prof_logging_state == prof_logging_state_started);
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx);
+
+	prof_bt_node_t probe, *found;
+	probe.bt = *bt;
+	if (!ckh_search(&log_bt_node_set, (void *)&probe, (void **)&found,
+	    NULL)) {
+		/* Already logged; reuse the index assigned back then. */
+		return found->index;
+	}
+
+	/* First sighting: allocate a node with room for the whole vector. */
+	size_t node_sz = offsetof(prof_bt_node_t, vec)
+	    + (bt->len * sizeof(void *));
+	prof_bt_node_t *fresh = (prof_bt_node_t *)iallocztm(tsd_tsdn(tsd),
+	    node_sz, sz_size2index(node_sz), false, NULL, true,
+	    arena_get(TSDN_NULL, 0, true), true);
+
+	/*
+	 * Copy the backtrace: bt is inside a tdata or gctx, which might die
+	 * before prof_log_stop is called.
+	 */
+	fresh->bt.len = bt->len;
+	memcpy(fresh->vec, bt->vec, bt->len * sizeof(void *));
+	fresh->bt.vec = fresh->vec;
+	fresh->next = NULL;
+	fresh->index = log_bt_index++;
+
+	/* Append to the tail of the list and remember it in the set. */
+	if (log_bt_first == NULL) {
+		log_bt_first = fresh;
+	} else {
+		log_bt_last->next = fresh;
+	}
+	log_bt_last = fresh;
+	ckh_insert(tsd, &log_bt_node_set, (void *)fresh, NULL);
+	return fresh->index;
+}
+
+/* Return the thread's index in the log_thr list, adding it on first sight. */
+static size_t
+prof_log_thr_index(tsd_t *tsd, uint64_t thr_uid, const char *name) {
+	assert(prof_logging_state == prof_logging_state_started);
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx);
+
+	/* Lookups hash and compare thr_uid only, so name can stay unset. */
+	prof_thr_node_t probe, *found;
+	probe.thr_uid = thr_uid;
+	if (!ckh_search(&log_thr_node_set, (void *)&probe, (void **)&found,
+	    NULL)) {
+		/* Already logged; reuse the index assigned back then. */
+		return found->index;
+	}
+
+	/* First sighting: the node is sized to hold the name and its NUL. */
+	size_t node_sz = offsetof(prof_thr_node_t, name) + strlen(name) + 1;
+	prof_thr_node_t *fresh = (prof_thr_node_t *)iallocztm(tsd_tsdn(tsd),
+	    node_sz, sz_size2index(node_sz), false, NULL, true,
+	    arena_get(TSDN_NULL, 0, true), true);
+
+	fresh->next = NULL;
+	fresh->index = log_thr_index++;
+	fresh->thr_uid = thr_uid;
+	strcpy(fresh->name, name);
+
+	/* Append to the tail of the list and remember it in the set. */
+	if (log_thr_first == NULL) {
+		log_thr_first = fresh;
+	} else {
+		log_thr_last->next = fresh;
+	}
+	log_thr_last = fresh;
+	ckh_insert(tsd, &log_thr_node_set, (void *)fresh, NULL);
+	return fresh->index;
+}
+
+/*
+ * Log the free of a sampled allocation; a no-op unless logging is started.
+ */
+static void
+prof_try_log(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock);
+
+	prof_tdata_t *cons_tdata = prof_tdata_get(tsd, false);
+	if (cons_tdata == NULL) {
+		/*
+		 * Not logged: cons_tdata is NULL only when the current thread
+		 * is in a weird state (e.g. it's being destroyed).
+		 */
+		return;
+	}
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &log_mtx);
+	if (prof_logging_state != prof_logging_state_started) {
+		goto label_done;
+	}
+
+	if (!log_tables_initialized) {
+		bool err1 = ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS,
+		    prof_bt_node_hash, prof_bt_node_keycomp);
+		bool err2 = ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS,
+		    prof_thr_node_hash, prof_thr_node_keycomp);
+		if (err1 || err2) {
+			goto label_done;
+		}
+		log_tables_initialized = true;
+	}
+
+	nstime_t alloc_time = prof_alloc_time_get(tsd_tsdn(tsd), ptr,
+	    (alloc_ctx_t *)NULL);
+	nstime_t free_time = NSTIME_ZERO_INITIALIZER;
+	nstime_update(&free_time);
+
+	size_t sz = sizeof(prof_alloc_node_t);
+	prof_alloc_node_t *new_node = (prof_alloc_node_t *)
+	    iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, true,
+	    arena_get(TSDN_NULL, 0, true), true);
+	if (new_node == NULL) {
+		/* Out of memory: drop this entry rather than crash. */
+		goto label_done;
+	}
+
+	const char *prod_thr_name = (tctx->tdata->thread_name == NULL)?
+	    "" : tctx->tdata->thread_name;
+	const char *cons_thr_name = prof_thread_name_get(tsd);
+	/* Capture the freeing stack, using the buffer in tdata to store it. */
+	prof_bt_t cons_bt;
+	bt_init(&cons_bt, cons_tdata->vec);
+	prof_backtrace(&cons_bt);
+
+	new_node->next = NULL;
+	new_node->alloc_thr_ind = prof_log_thr_index(tsd, tctx->tdata->thr_uid,
+	    prod_thr_name);
+	new_node->free_thr_ind = prof_log_thr_index(tsd, cons_tdata->thr_uid,
+	    cons_thr_name);
+	/* We haven't destroyed tctx yet, so gctx should be good to read. */
+	new_node->alloc_bt_ind = prof_log_bt_index(tsd, &tctx->gctx->bt);
+	new_node->free_bt_ind = prof_log_bt_index(tsd, &cons_bt);
+	new_node->alloc_time_ns = nstime_ns(&alloc_time);
+	new_node->free_time_ns = nstime_ns(&free_time);
+	new_node->usize = usize;
+
+	if (log_alloc_first == NULL) {
+		log_alloc_first = new_node;
+	} else {
+		log_alloc_last->next = new_node;
+	}
+	log_alloc_last = new_node;
+
+label_done:
+	malloc_mutex_unlock(tsd_tsdn(tsd), &log_mtx);
+}
+
 void
-prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) {
+prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize, 
+    prof_tctx_t *tctx) {
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
+
 	assert(tctx->cnts.curobjs > 0);
 	assert(tctx->cnts.curbytes >= usize);
 	tctx->cnts.curobjs--;
 	tctx->cnts.curbytes -= usize;
 
+	prof_try_log(tsd, ptr, usize, tctx);
+
 	if (prof_tctx_should_destroy(tsd_tsdn(tsd), tctx)) {
 		prof_tctx_destroy(tsd, tctx);
 	} else {
@@ -871,15 +1139,12 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) {
 void
 prof_sample_threshold_update(prof_tdata_t *tdata) {
 #ifdef JEMALLOC_PROF
-	uint64_t r;
-	double u;
-
 	if (!config_prof) {
 		return;
 	}
 
 	if (lg_prof_sample == 0) {
-		tdata->bytes_until_sample = 0;
+		tsd_bytes_until_sample_set(tsd_fetch(), 0);
 		return;
 	}
 
@@ -901,11 +1166,16 @@ prof_sample_threshold_update(prof_tdata_t *tdata) {
 	 *   pp 500
 	 *   (http://luc.devroye.org/rnbookindex.html)
 	 */
-	r = prng_lg_range_u64(&tdata->prng_state, 53);
-	u = (double)r * (1.0/9007199254740992.0L);
-	tdata->bytes_until_sample = (uint64_t)(log(u) /
+	uint64_t r = prng_lg_range_u64(&tdata->prng_state, 53);
+	double u = (double)r * (1.0/9007199254740992.0L);
+	uint64_t bytes_until_sample = (uint64_t)(log(u) /
 	    log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample))))
 	    + (uint64_t)1U;
+	if (bytes_until_sample > SSIZE_MAX) {
+		bytes_until_sample = SSIZE_MAX;
+	}
+	tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample);
+
 #endif
 }
 
@@ -1887,6 +2157,33 @@ prof_bt_keycomp(const void *k1, const void *k2) {
 	return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0);
 }
 
+static void
+prof_bt_node_hash(const void *key, size_t r_hash[2]) {
+	/* A node hashes exactly like the backtrace embedded in it. */
+	prof_bt_hash((void *)&((const prof_bt_node_t *)key)->bt, r_hash);
+}
+
+static bool
+prof_bt_node_keycomp(const void *k1, const void *k2) {
+	/* Two nodes are equal when their embedded backtraces are equal. */
+	const prof_bt_t *lhs = &((const prof_bt_node_t *)k1)->bt;
+	const prof_bt_t *rhs = &((const prof_bt_node_t *)k2)->bt;
+	return prof_bt_keycomp((void *)lhs, (void *)rhs);
+}
+
+static void
+prof_thr_node_hash(const void *key, size_t r_hash[2]) {
+	const uint64_t *uid = &((const prof_thr_node_t *)key)->thr_uid;
+	hash(uid, sizeof(uint64_t), 0x94122f35U, r_hash); /* Keyed by uid. */
+}
+
+static bool
+prof_thr_node_keycomp(const void *k1, const void *k2) {
+	/* Thread nodes are equal when they carry the same thr_uid. */
+	return ((const prof_thr_node_t *)k1)->thr_uid
+	    == ((const prof_thr_node_t *)k2)->thr_uid;
+}
+
 static uint64_t
 prof_thr_uid_alloc(tsdn_t *tsdn) {
 	uint64_t thr_uid;
@@ -2119,6 +2416,368 @@ prof_active_set(tsdn_t *tsdn, bool active) {
 	return prof_active_old;
 }
 
+#ifdef JEMALLOC_JET
+/* Number of nodes currently on the log_bt list. */
+size_t
+prof_log_bt_count(void) {
+	size_t n_nodes = 0;
+	for (prof_bt_node_t *cur = log_bt_first; cur != NULL;
+	    cur = cur->next) {
+		n_nodes++;
+	}
+	return n_nodes;
+}
+
+/* Number of nodes currently on the log_alloc list. */
+size_t
+prof_log_alloc_count(void) {
+	size_t n_nodes = 0;
+	for (prof_alloc_node_t *cur = log_alloc_first; cur != NULL;
+	    cur = cur->next) {
+		n_nodes++;
+	}
+	return n_nodes;
+}
+
+/* Number of nodes currently on the log_thr list. */
+size_t
+prof_log_thr_count(void) {
+	size_t n_nodes = 0;
+	for (prof_thr_node_t *cur = log_thr_first; cur != NULL;
+	    cur = cur->next) {
+		n_nodes++;
+	}
+	return n_nodes;
+}
+
+bool
+prof_log_is_logging(void) {
+	return prof_logging_state == prof_logging_state_started;
+}
+
+/*
+ * Sanity-check the logging data structures.  Returns true if an inconsistency
+ * is found, false if everything looks coherent.
+ */
+bool
+prof_log_rep_check(void) {
+	if (prof_logging_state == prof_logging_state_stopped
+	    && log_tables_initialized) {
+		return true;
+	}
+
+	/* Each tail pointer, if set, must be the final node of its list. */
+	if (log_bt_last != NULL && log_bt_last->next != NULL) {
+		return true;
+	}
+	if (log_thr_last != NULL && log_thr_last->next != NULL) {
+		return true;
+	}
+	if (log_alloc_last != NULL && log_alloc_last->next != NULL) {
+		return true;
+	}
+
+	size_t bt_count = prof_log_bt_count();
+	size_t thr_count = prof_log_thr_count();
+	size_t alloc_count = prof_log_alloc_count();
+
+	/* Nothing may remain on any list once logging has stopped. */
+	if (prof_logging_state == prof_logging_state_stopped
+	    && (bt_count != 0 || thr_count != 0 || alloc_count != 0)) {
+		return true;
+	}
+
+	/* Every allocation must reference existing entries, in time order. */
+	prof_alloc_node_t *node = log_alloc_first;
+	while (node != NULL) {
+		if (node->alloc_bt_ind >= bt_count
+		    || node->free_bt_ind >= bt_count
+		    || node->alloc_thr_ind >= thr_count
+		    || node->free_thr_ind >= thr_count) {
+			return true;
+		}
+		/* A free cannot be stamped earlier than its allocation. */
+		if (node->alloc_time_ns > node->free_time_ns) {
+			return true;
+		}
+		node = node->next;
+	}
+
+	return false;
+}
+
+void
+prof_log_dummy_set(bool new_value) {
+	prof_log_dummy = new_value;
+}
+#endif
+
+/* Start logging to filename, or a default name if NULL; true on failure. */
+bool
+prof_log_start(tsdn_t *tsdn, const char *filename) {
+	/* Logging is only available once profiling is enabled and booted. */
+	if (!(opt_prof && prof_booted)) {
+		return true;
+	}
+
+	const size_t name_cap = PATH_MAX + 1;
+	bool failed = true;
+
+	malloc_mutex_lock(tsdn, &log_mtx);
+	if (prof_logging_state == prof_logging_state_stopped) {
+		if (filename == NULL) {
+			/* Make default name. */
+			malloc_snprintf(log_filename, name_cap,
+			    "%s.%d.%"FMTu64".json", opt_prof_prefix,
+			    prof_getpid(), log_seq);
+			log_seq++;
+			failed = false;
+		} else if (strlen(filename) < name_cap) {
+			strcpy(log_filename, filename);
+			failed = false;
+		}
+	}
+	if (!failed) {
+		/* A session was opened: flip the state and stamp its start. */
+		prof_logging_state = prof_logging_state_started;
+		nstime_update(&log_start_timestamp);
+	}
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	return failed;
+}
+
+/* atexit() handler (registered in prof_boot2): stop logging on exit. */
+static void
+prof_log_stop_final(void) {
+	/* The result of prof_log_stop is not inspected here. */
+	prof_log_stop(tsd_tsdn(tsd_fetch()));
+}
+
+struct prof_emitter_cb_arg_s {
+	int fd;		/* Descriptor prof_emitter_write_cb writes to. */
+	ssize_t ret;	/* Result of the most recent write() call. */
+};
+
+/* Emitter sink: write the text to opaque's fd and store write()'s result. */
+static void
+prof_emitter_write_cb(void *opaque, const char *to_write) {
+	struct prof_emitter_cb_arg_s *sink = opaque;
+	size_t len = strlen(to_write);
+#ifdef JEMALLOC_JET
+	if (prof_log_dummy) {
+		return;	/* Dummy mode: discard the output. */
+	}
+#endif
+	sink->ret = write(sink->fd, (void *)to_write, len);
+}
+
+/*
+ * prof_log_emit_{...} goes through the appropriate linked list, emitting each
+ * node to the json and deallocating it.
+ */
+static void
+prof_log_emit_threads(tsd_t *tsd, emitter_t *emitter) {
+	/* Emit {thr_uid, thr_name} per thread, freeing each node after use. */
+	emitter_json_array_kv_begin(emitter, "threads");
+
+	prof_thr_node_t *cur = log_thr_first;
+	while (cur != NULL) {
+		/* Fetch the successor first; cur is deallocated below. */
+		prof_thr_node_t *following = cur->next;
+		char *name_str = cur->name;
+
+		emitter_json_object_begin(emitter);
+		emitter_json_kv(emitter, "thr_uid", emitter_type_uint64,
+		    &cur->thr_uid);
+		emitter_json_kv(emitter, "thr_name", emitter_type_string,
+		    &name_str);
+		emitter_json_object_end(emitter);
+
+		idalloc(tsd, cur);
+		cur = following;
+	}
+	emitter_json_array_end(emitter);
+}
+
+static void
+prof_log_emit_traces(tsd_t *tsd, emitter_t *emitter) {
+	/*
+	 * Room for one pointer rendered by "%p": two hex digits per byte, two
+	 * characters for "0x", and the terminating '\0'.
+	 */
+	char addr_buf[2 * sizeof(intptr_t) + 3];
+	char *addr_str = addr_buf;
+
+	/* One inner array of address strings per trace; nodes are freed. */
+	emitter_json_array_kv_begin(emitter, "stack_traces");
+	prof_bt_node_t *cur = log_bt_first;
+	while (cur != NULL) {
+		prof_bt_node_t *following = cur->next;
+		emitter_json_array_begin(emitter);
+		for (size_t frame = 0; frame < cur->bt.len; frame++) {
+			malloc_snprintf(addr_buf, sizeof(addr_buf), "%p",
+			    cur->bt.vec[frame]);
+			emitter_json_value(emitter, emitter_type_string,
+			    &addr_str);
+		}
+		emitter_json_array_end(emitter);
+
+		idalloc(tsd, cur);
+		cur = following;
+	}
+	emitter_json_array_end(emitter);
+}
+
+/*
+ * Emit the "allocations" array, one object per logged allocation, and free
+ * each node once it has been written.
+ */
+static void
+prof_log_emit_allocs(tsd_t *tsd, emitter_t *emitter) {
+	emitter_json_array_kv_begin(emitter, "allocations");
+	prof_alloc_node_t *alloc_node = log_alloc_first;
+	prof_alloc_node_t *alloc_old_node;
+	while (alloc_node != NULL) {
+		emitter_json_object_begin(emitter);
+		/* Indices into the "threads" and "stack_traces" arrays. */
+		emitter_json_kv(emitter, "alloc_thread", emitter_type_size,
+		    &alloc_node->alloc_thr_ind);
+		emitter_json_kv(emitter, "free_thread", emitter_type_size,
+		    &alloc_node->free_thr_ind);
+		emitter_json_kv(emitter, "alloc_trace", emitter_type_size,
+		    &alloc_node->alloc_bt_ind);
+		emitter_json_kv(emitter, "free_trace", emitter_type_size,
+		    &alloc_node->free_bt_ind);
+
+		emitter_json_kv(emitter, "alloc_timestamp",
+		    emitter_type_uint64, &alloc_node->alloc_time_ns);
+		emitter_json_kv(emitter, "free_timestamp", emitter_type_uint64,
+		    &alloc_node->free_time_ns);
+
+		/* usize is a size_t, so it must not be read as a uint64_t. */
+		emitter_json_kv(emitter, "usize", emitter_type_size,
+		    &alloc_node->usize);
+		emitter_json_object_end(emitter);
+		/* Advance before freeing the node that was just emitted. */
+		alloc_old_node = alloc_node;
+		alloc_node = alloc_node->next;
+		idalloc(tsd, alloc_old_node);
+	}
+	emitter_json_array_end(emitter);
+}
+
+/* Emit the "info" object: duration, version, lg_sample_rate and pid. */
+static void
+prof_log_emit_metadata(emitter_t *emitter) {
+	emitter_json_object_kv_begin(emitter, "info");
+
+	/* Time elapsed since the matching prof_log_start(), in nanoseconds. */
+	nstime_t now = NSTIME_ZERO_INITIALIZER;
+	nstime_update(&now);
+	uint64_t ns = nstime_ns(&now) - nstime_ns(&log_start_timestamp);
+	emitter_json_kv(emitter, "duration", emitter_type_uint64, &ns);
+
+	const char *vers = JEMALLOC_VERSION;
+	emitter_json_kv(emitter, "version", emitter_type_string, &vers);
+
+	/* lg_prof_sample is a size_t; reading it as an int is not portable. */
+	emitter_json_kv(emitter, "lg_sample_rate", emitter_type_size,
+	    &lg_prof_sample);
+
+	int pid = prof_getpid();
+	emitter_json_kv(emitter, "pid", emitter_type_int, &pid);
+	emitter_json_object_end(emitter);
+}
+
+
+/*
+ * Stop logging and dump the log to log_filename as JSON, freeing its nodes.
+ * Returns true on failure.
+ */
+bool
+prof_log_stop(tsdn_t *tsdn) {
+	if (!opt_prof || !prof_booted) {
+		return true;
+	}
+
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	malloc_mutex_lock(tsdn, &log_mtx);
+	if (prof_logging_state != prof_logging_state_started) {
+		malloc_mutex_unlock(tsdn, &log_mtx);
+		return true;
+	}
+
+	/*
+	 * Set the state to dumping. We'll set it to stopped when we're done.
+	 * Since other threads won't be able to start/stop/log when the state is
+	 * dumping, we don't have to hold the lock during the whole method.
+	 */
+	prof_logging_state = prof_logging_state_dumping;
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	/* Create a file. */
+	int fd;
+#ifdef JEMALLOC_JET
+	fd = prof_log_dummy ? 0 : creat(log_filename, 0644);
+#else
+	fd = creat(log_filename, 0644);
+#endif
+
+	if (fd == -1) {
+		malloc_printf("<jemalloc>: creat() for log file \"%s\" "
+			      " failed with %d\n", log_filename, errno);
+		if (opt_abort) {
+			abort();
+		}
+		/* Log is intact; revert to started so stop can be retried. */
+		malloc_mutex_lock(tsdn, &log_mtx);
+		prof_logging_state = prof_logging_state_started;
+		malloc_mutex_unlock(tsdn, &log_mtx);
+		return true;
+	}
+
+	/* Emit to json. */
+	emitter_t emitter;
+	struct prof_emitter_cb_arg_s arg;
+	arg.fd = fd;
+	emitter_init(&emitter, emitter_output_json, &prof_emitter_write_cb,
+	    (void *)(&arg));
+
+	emitter_json_object_begin(&emitter);
+	prof_log_emit_metadata(&emitter);
+	prof_log_emit_threads(tsd, &emitter);
+	prof_log_emit_traces(tsd, &emitter);
+	prof_log_emit_allocs(tsd, &emitter);
+	emitter_json_object_end(&emitter);
+
+	/* Reset global state. */
+	if (log_tables_initialized) {
+		ckh_delete(tsd, &log_bt_node_set);
+		ckh_delete(tsd, &log_thr_node_set);
+	}
+	log_tables_initialized = false;
+	log_bt_index = 0;
+	log_thr_index = 0;
+	log_bt_first = NULL;
+	log_bt_last = NULL;
+	log_thr_first = NULL;
+	log_thr_last = NULL;
+	log_alloc_first = NULL;
+	log_alloc_last = NULL;
+
+	malloc_mutex_lock(tsdn, &log_mtx);
+	prof_logging_state = prof_logging_state_stopped;
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+#ifdef JEMALLOC_JET
+	if (prof_log_dummy) {
+		return false;
+	}
+#endif
+	return close(fd);
+}
+
 const char *
 prof_thread_name_get(tsd_t *tsd) {
 	prof_tdata_t *tdata;
@@ -2355,6 +3014,35 @@ prof_boot2(tsd_t *tsd) {
 			}
 		}
 
+		if (opt_prof_log) {
+			prof_log_start(tsd_tsdn(tsd), NULL);
+		}
+
+		if (atexit(prof_log_stop_final) != 0) {
+			malloc_write("<jemalloc>: Error in atexit() "
+				     "for logging\n");
+			if (opt_abort) {
+				abort();
+			}
+		}
+
+		if (malloc_mutex_init(&log_mtx, "prof_log",
+		    WITNESS_RANK_PROF_LOG, malloc_mutex_rank_exclusive)) {
+			return true;
+		}
+
+		if (ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS,
+		    prof_bt_node_hash, prof_bt_node_keycomp)) {
+			return true;
+		}
+
+		if (ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS,
+		    prof_thr_node_hash, prof_thr_node_keycomp)) {
+			return true;
+		}
+
+		log_tables_initialized = true;
+
 		gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd),
 		    b0get(), PROF_NCTX_LOCKS * sizeof(malloc_mutex_t),
 		    CACHELINE);
@@ -2382,16 +3070,14 @@ prof_boot2(tsd_t *tsd) {
 				return true;
 			}
 		}
-	}
-
 #ifdef JEMALLOC_PROF_LIBGCC
-	/*
-	 * Cause the backtracing machinery to allocate its internal state
-	 * before enabling profiling.
-	 */
-	_Unwind_Backtrace(prof_unwind_init_callback, NULL);
+		/*
+		 * Cause the backtracing machinery to allocate its internal
+		 * state before enabling profiling.
+		 */
+		_Unwind_Backtrace(prof_unwind_init_callback, NULL);
 #endif
-
+	}
 	prof_booted = true;
 
 	return false;
diff --git a/src/rtree.c b/src/rtree.c
index 53702cf7..4ae41fe2 100644
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -39,7 +39,7 @@ rtree_node_dalloc_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *node) {
 	/* Nodes are never deleted during normal operation. */
 	not_reached();
 }
-UNUSED rtree_node_dalloc_t *JET_MUTABLE rtree_node_dalloc =
+rtree_node_dalloc_t *JET_MUTABLE rtree_node_dalloc =
     rtree_node_dalloc_impl;
 
 static rtree_leaf_elm_t *
@@ -54,7 +54,7 @@ rtree_leaf_dalloc_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *leaf) {
 	/* Leaves are never deleted during normal operation. */
 	not_reached();
 }
-UNUSED rtree_leaf_dalloc_t *JET_MUTABLE rtree_leaf_dalloc =
+rtree_leaf_dalloc_t *JET_MUTABLE rtree_leaf_dalloc =
     rtree_leaf_dalloc_impl;
 
 #ifdef JEMALLOC_JET
diff --git a/src/sc.c b/src/sc.c
new file mode 100644
index 00000000..89ddb6ba
--- /dev/null
+++ b/src/sc.c
@@ -0,0 +1,313 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/bitmap.h"
+#include "jemalloc/internal/pages.h"
+#include "jemalloc/internal/sc.h"
+
+/*
+ * This module computes the size classes used to satisfy allocations.  The logic
+ * here was ported more or less line-by-line from a shell script, and because of
+ * that is not the most idiomatic C.  Eventually we should fix this, but for now
+ * at least the damage is compartmentalized to this file.
+ */
+
+sc_data_t sc_data_global;
+
+static size_t	/* 2^lg_base + ndelta * 2^lg_delta */
+reg_size_compute(int lg_base, int lg_delta, int ndelta) {
+	return (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta);
+}
+
+/*
+ * Returns the number of pages in the slab: the smallest whole number of pages
+ * that regions of this size class fill exactly.
+ */
+static int
+slab_size(int lg_page, int lg_base, int lg_delta, int ndelta) {
+	size_t page = (ZU(1) << lg_page);
+	size_t reg_size = reg_size_compute(lg_base, lg_delta, ndelta);
+	size_t npages = 1;
+
+	/*
+	 * Add one page at a time until the slab is an exact multiple of the
+	 * region size, i.e. until we find the least common multiple of the
+	 * page size and size class size; the answer is that multiple,
+	 * expressed in pages.
+	 *
+	 * Size classes are all of the form
+	 *   base + ndelta * delta == (ndelta + base/delta) * delta,
+	 * which is (ndelta + ngroup) * delta.  The way we choose slabbing
+	 * strategies means that delta is at most the page size and
+	 * ndelta < ngroup.  So the loop executes for at most 2 * ngroup - 1
+	 * iterations, which is also the bound on the number of pages in a
+	 * slab chosen by default.  With the current default settings, this is
+	 * at most 7.
+	 */
+	while ((npages * page) % reg_size != 0) {
+		npages++;
+	}
+
+	return (int)npages;
+}
+
+static void
+size_class(
+    /* Output. */
+    sc_t *sc,
+    /* Configuration decisions. */
+    int lg_max_lookup, int lg_page, int lg_ngroup,
+    /* Inputs specific to the size class. */
+    int index, int lg_base, int lg_delta, int ndelta) {
+	/* The class size, computed once and reused for every decision. */
+	size_t size = reg_size_compute(lg_base, lg_delta, ndelta);
+
+	sc->index = index;
+	sc->lg_base = lg_base;
+	sc->lg_delta = lg_delta;
+	sc->ndelta = ndelta;
+
+	/* psz is set when the size is a whole multiple of the page size. */
+	sc->psz = (size % (ZU(1) << lg_page) == 0);
+	assert(index != 0 || !sc->psz);
+
+	/* Sizes below 2^(lg_page + lg_ngroup) are bins with a slab size. */
+	sc->bin = (size < (ZU(1) << (lg_page + lg_ngroup)));
+	sc->pgs = sc->bin ? slab_size(lg_page, lg_base, lg_delta, ndelta) : 0;
+
+	/* Sizes above 2^lg_max_lookup get a lookup delta of 0. */
+	if (size <= (ZU(1) << lg_max_lookup)) {
+		sc->lg_delta_lookup = lg_delta;
+	} else {
+		sc->lg_delta_lookup = 0;
+	}
+}
+
+/* Builds every size class into sc_data->sc[] and fills the summary fields. */
+static void
+size_classes(
+    /* Output. */
+    sc_data_t *sc_data,
+    /* Determined by the system. */
+    size_t lg_ptr_size, int lg_quantum,
+    /* Configuration decisions. */
+    int lg_tiny_min, int lg_max_lookup, int lg_page, int lg_ngroup) {
+	int ptr_bits = (1 << lg_ptr_size) * 8;
+	int ngroup = (1 << lg_ngroup);
+	int ntiny = 0;
+	int nlbins = 0;
+	int lg_tiny_maxclass = -1; /* Stays -1 if there are no tiny classes. */
+	int nbins = 0;
+	int npsizes = 0;
+	int index = 0;
+
+	int ndelta = 0;
+	int lg_base = lg_tiny_min;
+	int lg_delta = lg_base;
+
+	/* Outputs that we update as we go. */
+	size_t lookup_maxclass = 0;
+	size_t small_maxclass = 0;
+	int lg_large_minclass = 0;
+	size_t large_maxclass = 0;
+
+	/* Tiny size classes. */
+	while (lg_base < lg_quantum) {
+		sc_t *sc = &sc_data->sc[index];
+		size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+		    lg_base, lg_delta, ndelta);
+		if (sc->lg_delta_lookup != 0) {
+			nlbins = index + 1;
+		}
+		if (sc->psz) {
+			npsizes++;
+		}
+		if (sc->bin) {
+			nbins++;
+		}
+		ntiny++;
+		/* Final written value is correct. */
+		lg_tiny_maxclass = lg_base;
+		index++;
+		lg_delta = lg_base;
+		lg_base++;
+	}
+
+	/* First non-tiny (pseudo) group. */
+	if (ntiny != 0) {
+		sc_t *sc = &sc_data->sc[index];
+		/*
+		 * See the note in sc.h; the first non-tiny size class has an
+		 * unusual encoding.
+		 */
+		lg_base--;
+		ndelta = 1;
+		size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+		    lg_base, lg_delta, ndelta);
+		index++;
+		lg_base++;
+		lg_delta++;
+		if (sc->psz) {
+			npsizes++;
+		}
+		if (sc->bin) {
+			nbins++;
+		}
+	}
+	while (ndelta < ngroup) {
+		sc_t *sc = &sc_data->sc[index];
+		size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+		    lg_base, lg_delta, ndelta);
+		index++;
+		ndelta++;
+		if (sc->psz) {
+			npsizes++;
+		}
+		if (sc->bin) {
+			nbins++;
+		}
+	}
+
+	/* All remaining groups. */
+	lg_base = lg_base + lg_ngroup;
+	while (lg_base < ptr_bits - 1) {
+		ndelta = 1;
+		int ndelta_limit;
+		if (lg_base == ptr_bits - 2) {
+			ndelta_limit = ngroup - 1;
+		} else {
+			ndelta_limit = ngroup;
+		}
+		while (ndelta <= ndelta_limit) {
+			sc_t *sc = &sc_data->sc[index];
+			size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+			    lg_base, lg_delta, ndelta);
+			if (sc->lg_delta_lookup != 0) {
+				nlbins = index + 1;
+				/* Final written value is correct. */
+				lookup_maxclass = (ZU(1) << lg_base)
+				    + (ZU(ndelta) << lg_delta);
+			}
+			if (sc->psz) {
+				npsizes++;
+			}
+			if (sc->bin) {
+				nbins++;
+				/* Final written value is correct. */
+				small_maxclass = (ZU(1) << lg_base)
+				    + (ZU(ndelta) << lg_delta);
+				if (lg_ngroup > 0) {
+					lg_large_minclass = lg_base + 1;
+				} else {
+					lg_large_minclass = lg_base + 2;
+				}
+			}
+			large_maxclass = (ZU(1) << lg_base)
+			    + (ZU(ndelta) << lg_delta);
+			index++;
+			ndelta++;
+		}
+		lg_base++;
+		lg_delta++;
+	}
+	/* Additional outputs. */
+	int nsizes = index;
+	unsigned lg_ceil_nsizes = lg_ceil(nsizes);
+
+	/* Fill in the output data. */
+	sc_data->ntiny = ntiny;
+	sc_data->nlbins = nlbins;
+	sc_data->nbins = nbins;
+	sc_data->nsizes = nsizes;
+	sc_data->lg_ceil_nsizes = lg_ceil_nsizes;
+	sc_data->npsizes = npsizes;
+	sc_data->lg_tiny_maxclass = lg_tiny_maxclass;
+	sc_data->lookup_maxclass = lookup_maxclass;
+	sc_data->small_maxclass = small_maxclass;
+	sc_data->lg_large_minclass = lg_large_minclass;
+	sc_data->large_minclass = (ZU(1) << lg_large_minclass);
+	sc_data->large_maxclass = large_maxclass;
+
+	/*
+	 * We compute these values in two ways:
+	 *   - Incrementally, as above.
+	 *   - In macros, in sc.h.
+	 * The computation is easier when done incrementally, but putting it in
+	 * a constant makes it available to the fast paths without having to
+	 * touch the extra global cacheline.  We assert, however, that the two
+	 * computations are equivalent.
+	 */
+	assert(sc_data->npsizes == SC_NPSIZES);
+	assert(sc_data->lg_tiny_maxclass == SC_LG_TINY_MAXCLASS);
+	assert(sc_data->small_maxclass == SC_SMALL_MAXCLASS);
+	assert(sc_data->large_minclass == SC_LARGE_MINCLASS);
+	assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS);
+	assert(sc_data->large_maxclass == SC_LARGE_MAXCLASS);
+
+	/*
+	 * In the allocation fastpath, we want to assume that we can
+	 * unconditionally subtract the requested allocation size from
+	 * a ssize_t, and detect passing through 0 correctly.  This
+	 * results in optimal generated code.  For this to work, the
+	 * maximum allocation size must be less than SSIZE_MAX.
+	 */
+	assert(SC_LARGE_MAXCLASS < SSIZE_MAX);
+}
+
+/* Computes the size class table into sc_data using the default settings. */
+void
+sc_data_init(sc_data_t *sc_data) {
+	/* Classes up to 2^12 get a lookup delta; each group has 2^2 classes. */
+	const int lg_max_lookup = 12;
+	const int lg_ngroup = 2;
+	assert(!sc_data->initialized);
+	size_classes(sc_data, LG_SIZEOF_PTR, LG_QUANTUM, SC_LG_TINY_MIN,
+	    lg_max_lookup, LG_PAGE, lg_ngroup);
+	sc_data->initialized = true;
+}
+
+/* Sets sc->pgs to pgs_guess clamped to the feasible range for reg_size. */
+static void
+sc_data_update_sc_slab_size(sc_t *sc, size_t reg_size, size_t pgs_guess) {
+	/* Fewest pages that hold one region: reg_size / PAGE, rounded up. */
+	size_t min_pgs = reg_size / PAGE + (reg_size % PAGE != 0 ? 1 : 0);
+	/*
+	 * BITMAP_MAXBITS is actually determined by putting the smallest
+	 * possible size-class on one page, so this can never be 0.
+	 */
+	size_t max_pgs = BITMAP_MAXBITS * reg_size / PAGE;
+
+	assert(min_pgs <= max_pgs);
+	assert(min_pgs > 0);
+	assert(max_pgs >= 1);
+
+	size_t pgs = pgs_guess;
+	if (pgs < min_pgs) {
+		pgs = min_pgs;
+	} else if (pgs > max_pgs) {
+		pgs = max_pgs;
+	}
+	sc->pgs = (int)pgs;
+}
+
+/* Applies pgs (clamped per class) to each bin with begin <= reg_size <= end. */
+void
+sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs) {
+	assert(data->initialized);
+	/* The scan stops at the first class that is not a bin. */
+	for (int i = 0; i < data->nsizes && data->sc[i].bin; i++) {
+		sc_t *sc = &data->sc[i];
+		size_t reg_size = reg_size_compute(sc->lg_base, sc->lg_delta,
+		    sc->ndelta);
+		if (reg_size < begin || reg_size > end) {
+			continue;
+		}
+		sc_data_update_sc_slab_size(sc, reg_size, pgs);
+	}
+}
+
+void
+sc_boot(sc_data_t *data) {
+	sc_data_init(data);	/* Booting only initializes the table. */
+}
diff --git a/src/stats.c b/src/stats.c
index 08b9507c..4c427e0d 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -52,6 +52,20 @@ char opt_stats_print_opts[stats_print_tot_num_options+1] = "";
 
 /******************************************************************************/
 
+/* Events per whole second of uptime; the raw count if under one second. */
+static uint64_t
+rate_per_second(uint64_t value, uint64_t uptime_ns) {
+	const uint64_t ns_per_sec = 1000000000;
+	if (value == 0 || uptime_ns == 0) {
+		return 0;
+	}
+	if (uptime_ns < ns_per_sec) {
+		return value;
+	}
+	/* Integer math throughout: uptime is truncated to whole seconds. */
+	return value / (uptime_ns / ns_per_sec);
+}
+
 /* Calculate x.yyy and output a string (takes a fixed sized char array). */
 static bool
 get_rate_str(uint64_t dividend, uint64_t divisor, char str[6]) {
@@ -104,24 +118,26 @@ mutex_stats_init_cols(emitter_row_t *row, const char *table_name,
 
 #define WIDTH_uint32_t 12
 #define WIDTH_uint64_t 16
-#define OP(counter, counter_type, human)				\
+#define OP(counter, counter_type, human, derived, base_counter)	\
 	col = &col_##counter_type[k_##counter_type];			\
 	++k_##counter_type;						\
 	emitter_col_init(col, row);					\
 	col->justify = emitter_justify_right;				\
-	col->width = WIDTH_##counter_type;				\
+	col->width = derived ? 8 : WIDTH_##counter_type;		\
 	col->type = emitter_type_title;					\
 	col->str_val = human;
 	MUTEX_PROF_COUNTERS
 #undef OP
 #undef WIDTH_uint32_t
 #undef WIDTH_uint64_t
+	col_uint64_t[mutex_counter_total_wait_time_ps].width = 10;
 }
 
 static void
 mutex_stats_read_global(const char *name, emitter_col_t *col_name,
     emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
-    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) {
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters],
+    uint64_t uptime) {
 	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
 
 	col_name->str_val = name;
@@ -129,12 +145,17 @@ mutex_stats_read_global(const char *name, emitter_col_t *col_name,
 	emitter_col_t *dst;
 #define EMITTER_TYPE_uint32_t emitter_type_uint32
 #define EMITTER_TYPE_uint64_t emitter_type_uint64
-#define OP(counter, counter_type, human)				\
+#define OP(counter, counter_type, human, derived, base_counter)	\
 	dst = &col_##counter_type[mutex_counter_##counter];		\
 	dst->type = EMITTER_TYPE_##counter_type;			\
-	gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,		\
-	    "mutexes", name, #counter);					\
-	CTL_GET(cmd, (counter_type *)&dst->bool_val, counter_type);
+	if (!derived) {							\
+		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,	\
+		    "mutexes", name, #counter);				\
+		CTL_GET(cmd, (counter_type *)&dst->bool_val, counter_type);	\
+	} else { \
+	    emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter];	\
+	    dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \
+	}
 	MUTEX_PROF_COUNTERS
 #undef OP
 #undef EMITTER_TYPE_uint32_t
@@ -145,7 +166,8 @@ static void
 mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind,
     const char *name, emitter_col_t *col_name,
     emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
-    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) {
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters],
+    uint64_t uptime) {
 	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
 
 	col_name->str_val = name;
@@ -153,13 +175,17 @@ mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind,
 	emitter_col_t *dst;
 #define EMITTER_TYPE_uint32_t emitter_type_uint32
 #define EMITTER_TYPE_uint64_t emitter_type_uint64
-#define OP(counter, counter_type, human)				\
+#define OP(counter, counter_type, human, derived, base_counter)	\
 	dst = &col_##counter_type[mutex_counter_##counter];		\
 	dst->type = EMITTER_TYPE_##counter_type;			\
-	gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,		\
-	    "arenas.0.mutexes",	arena_mutex_names[mutex_ind], #counter);\
-	CTL_M2_GET(cmd, arena_ind,					\
-	    (counter_type *)&dst->bool_val, counter_type);
+	if (!derived) {                                   \
+		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,        \
+		    "arenas.0.mutexes", arena_mutex_names[mutex_ind], #counter);\
+		CTL_M2_GET(cmd, arena_ind, (counter_type *)&dst->bool_val, counter_type); \
+	} else {                      \
+		emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter];	\
+		dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \
+	}
 	MUTEX_PROF_COUNTERS
 #undef OP
 #undef EMITTER_TYPE_uint32_t
@@ -169,19 +195,25 @@ mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind,
 static void
 mutex_stats_read_arena_bin(unsigned arena_ind, unsigned bin_ind,
     emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
-    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) {
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters],
+    uint64_t uptime) {
 	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
 	emitter_col_t *dst;
 
 #define EMITTER_TYPE_uint32_t emitter_type_uint32
 #define EMITTER_TYPE_uint64_t emitter_type_uint64
-#define OP(counter, counter_type, human)				\
+#define OP(counter, counter_type, human, derived, base_counter)	\
 	dst = &col_##counter_type[mutex_counter_##counter];		\
 	dst->type = EMITTER_TYPE_##counter_type;			\
-	gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,		\
-	    "arenas.0.bins.0","mutex", #counter);			\
-	CTL_M2_M4_GET(cmd, arena_ind, bin_ind,				\
-	    (counter_type *)&dst->bool_val, counter_type);
+	if (!derived) {                                   \
+		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,        \
+		    "arenas.0.bins.0","mutex", #counter);            \
+		CTL_M2_M4_GET(cmd, arena_ind, bin_ind,                \
+		    (counter_type *)&dst->bool_val, counter_type);  \
+	} else {                      \
+		emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter]; \
+		dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \
+	}
 	MUTEX_PROF_COUNTERS
 #undef OP
 #undef EMITTER_TYPE_uint32_t
@@ -204,19 +236,38 @@ mutex_stats_emit(emitter_t *emitter, emitter_row_t *row,
 
 #define EMITTER_TYPE_uint32_t emitter_type_uint32
 #define EMITTER_TYPE_uint64_t emitter_type_uint64
-#define OP(counter, type, human)					\
-	col = &col_##type[k_##type];						\
-	++k_##type;							\
-	emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type,		\
-	    (const void *)&col->bool_val);
+#define OP(counter, type, human, derived, base_counter)		\
+	if (!derived) {                    \
+		col = &col_##type[k_##type];                        \
+		++k_##type;                            \
+		emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type,        \
+		    (const void *)&col->bool_val); \
+	}
 	MUTEX_PROF_COUNTERS;
 #undef OP
 #undef EMITTER_TYPE_uint32_t
 #undef EMITTER_TYPE_uint64_t
 }
 
+/* COL: declare col_<name> in a row; COL_HDR: same plus header_<name>. */
+#define COL(row_name, column_name, left_or_right, col_width, etype)      \
+	emitter_col_t col_##column_name;                                     \
+	emitter_col_init(&col_##column_name, &row_name);                     \
+	col_##column_name.justify = emitter_justify_##left_or_right;         \
+	col_##column_name.width = col_width;                                 \
+	col_##column_name.type = emitter_type_##etype;
+
+#define COL_HDR(row_name, column_name, human, left_or_right, col_width, etype)  \
+	COL(row_name, column_name, left_or_right, col_width, etype)	         \
+	emitter_col_t header_##column_name;                                  \
+	emitter_col_init(&header_##column_name, &header_##row_name);         \
+	header_##column_name.justify = emitter_justify_##left_or_right;      \
+	header_##column_name.width = col_width;                              \
+	header_##column_name.type = emitter_type_title;                      \
+	header_##column_name.str_val = human ? human : #column_name;
+
 static void
-stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
+stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i, uint64_t uptime) {
 	size_t page;
 	bool in_gap, in_gap_prev;
 	unsigned nbins, j;
@@ -230,43 +281,36 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 
 	emitter_row_t row;
 	emitter_row_init(&row);
-#define COL(name, left_or_right, col_width, etype)			\
-	emitter_col_t col_##name;					\
-	emitter_col_init(&col_##name, &row);				\
-	col_##name.justify = emitter_justify_##left_or_right;		\
-	col_##name.width = col_width;					\
-	col_##name.type = emitter_type_##etype;				\
-	emitter_col_t header_col_##name;				\
-	emitter_col_init(&header_col_##name, &header_row);		\
-	header_col_##name.justify = emitter_justify_##left_or_right;	\
-	header_col_##name.width = col_width;				\
-	header_col_##name.type = emitter_type_title;			\
-	header_col_##name.str_val = #name;
 
-	COL(size, right, 20, size)
-	COL(ind, right, 4, unsigned)
-	COL(allocated, right, 13, uint64)
-	COL(nmalloc, right, 13, uint64)
-	COL(ndalloc, right, 13, uint64)
-	COL(nrequests, right, 13, uint64)
-	COL(curregs, right, 13, size)
-	COL(curslabs, right, 13, size)
-	COL(regs, right, 5, unsigned)
-	COL(pgs, right, 4, size)
+	COL_HDR(row, size, NULL, right, 20, size)
+	COL_HDR(row, ind, NULL, right, 4, unsigned)
+	COL_HDR(row, allocated, NULL, right, 13, uint64)
+	COL_HDR(row, nmalloc, NULL, right, 13, uint64)
+	COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, ndalloc, NULL, right, 13, uint64)
+	COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nrequests, NULL, right, 13, uint64)
+	COL_HDR(row, nrequests_ps, "(#/sec)", right, 10, uint64)
+	COL_HDR(row, nshards, NULL, right, 9, unsigned)
+	COL_HDR(row, curregs, NULL, right, 13, size)
+	COL_HDR(row, curslabs, NULL, right, 13, size)
+	COL_HDR(row, regs, NULL, right, 5, unsigned)
+	COL_HDR(row, pgs, NULL, right, 4, size)
 	/* To buffer a right- and left-justified column. */
-	COL(justify_spacer, right, 1, title)
-	COL(util, right, 6, title)
-	COL(nfills, right, 13, uint64)
-	COL(nflushes, right, 13, uint64)
-	COL(nslabs, right, 13, uint64)
-	COL(nreslabs, right, 13, uint64)
-#undef COL
+	COL_HDR(row, justify_spacer, NULL, right, 1, title)
+	COL_HDR(row, util, NULL, right, 6, title)
+	COL_HDR(row, nfills, NULL, right, 13, uint64)
+	COL_HDR(row, nfills_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nflushes, NULL, right, 13, uint64)
+	COL_HDR(row, nflushes_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nslabs, NULL, right, 13, uint64)
+	COL_HDR(row, nreslabs, NULL, right, 13, uint64)
+	COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64)
 
 	/* Don't want to actually print the name. */
-	header_col_justify_spacer.str_val = " ";
+	header_justify_spacer.str_val = " ";
 	col_justify_spacer.str_val = " ";
 
-
 	emitter_col_t col_mutex64[mutex_prof_num_uint64_t_counters];
 	emitter_col_t col_mutex32[mutex_prof_num_uint32_t_counters];
 
@@ -284,16 +328,16 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 	 * We print a "bins:" header as part of the table row; we need to adjust
 	 * the header size column to compensate.
 	 */
-	header_col_size.width -=5;
+	header_size.width -=5;
 	emitter_table_printf(emitter, "bins:");
 	emitter_table_row(emitter, &header_row);
-	emitter_json_arr_begin(emitter, "bins");
+	emitter_json_array_kv_begin(emitter, "bins");
 
 	for (j = 0, in_gap = false; j < nbins; j++) {
 		uint64_t nslabs;
 		size_t reg_size, slab_size, curregs;
 		size_t curslabs;
-		uint32_t nregs;
+		uint32_t nregs, nshards;
 		uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
 		uint64_t nreslabs;
 
@@ -310,6 +354,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 		CTL_M2_GET("arenas.bin.0.size", j, &reg_size, size_t);
 		CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t);
 		CTL_M2_GET("arenas.bin.0.slab_size", j, &slab_size, size_t);
+		CTL_M2_GET("arenas.bin.0.nshards", j, &nshards, uint32_t);
 
 		CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j, &nmalloc,
 		    uint64_t);
@@ -330,10 +375,10 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 
 		if (mutex) {
 			mutex_stats_read_arena_bin(i, j, col_mutex64,
-			    col_mutex32);
+			    col_mutex32, uptime);
 		}
 
-		emitter_json_arr_obj_begin(emitter);
+		emitter_json_object_begin(emitter);
 		emitter_json_kv(emitter, "nmalloc", emitter_type_uint64,
 		    &nmalloc);
 		emitter_json_kv(emitter, "ndalloc", emitter_type_uint64,
@@ -351,12 +396,12 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 		emitter_json_kv(emitter, "curslabs", emitter_type_size,
 		    &curslabs);
 		if (mutex) {
-			emitter_json_dict_begin(emitter, "mutex");
+			emitter_json_object_kv_begin(emitter, "mutex");
 			mutex_stats_emit(emitter, NULL, col_mutex64,
 			    col_mutex32);
-			emitter_json_dict_end(emitter);
+			emitter_json_object_end(emitter);
 		}
-		emitter_json_arr_obj_end(emitter);
+		emitter_json_object_end(emitter);
 
 		size_t availregs = nregs * curslabs;
 		char util[6];
@@ -381,17 +426,24 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 		col_ind.unsigned_val = j;
 		col_allocated.size_val = curregs * reg_size;
 		col_nmalloc.uint64_val = nmalloc;
+		col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime);
 		col_ndalloc.uint64_val = ndalloc;
+		col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime);
 		col_nrequests.uint64_val = nrequests;
+		col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime);
+		col_nshards.unsigned_val = nshards;
 		col_curregs.size_val = curregs;
 		col_curslabs.size_val = curslabs;
 		col_regs.unsigned_val = nregs;
 		col_pgs.size_val = slab_size / page;
 		col_util.str_val = util;
 		col_nfills.uint64_val = nfills;
+		col_nfills_ps.uint64_val = rate_per_second(nfills, uptime);
 		col_nflushes.uint64_val = nflushes;
+		col_nflushes_ps.uint64_val = rate_per_second(nflushes, uptime);
 		col_nslabs.uint64_val = nslabs;
 		col_nreslabs.uint64_val = nreslabs;
+		col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime);
 
 		/*
 		 * Note that mutex columns were initialized above, if mutex ==
@@ -400,7 +452,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 
 		emitter_table_row(emitter, &row);
 	}
-	emitter_json_arr_end(emitter); /* Close "bins". */
+	emitter_json_array_end(emitter); /* Close "bins". */
 
 	if (in_gap) {
 		emitter_table_printf(emitter, "                     ---\n");
@@ -408,7 +460,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) {
 }
 
 static void
-stats_arena_lextents_print(emitter_t *emitter, unsigned i) {
+stats_arena_lextents_print(emitter_t *emitter, unsigned i, uint64_t uptime) {
 	unsigned nbins, nlextents, j;
 	bool in_gap, in_gap_prev;
 
@@ -420,34 +472,22 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) {
 	emitter_row_t row;
 	emitter_row_init(&row);
 
-#define COL(name, left_or_right, col_width, etype)			\
-	emitter_col_t header_##name;					\
-	emitter_col_init(&header_##name, &header_row);			\
-	header_##name.justify = emitter_justify_##left_or_right;	\
-	header_##name.width = col_width;				\
-	header_##name.type = emitter_type_title;			\
-	header_##name.str_val = #name;					\
-									\
-	emitter_col_t col_##name;					\
-	emitter_col_init(&col_##name, &row);				\
-	col_##name.justify = emitter_justify_##left_or_right;		\
-	col_##name.width = col_width;					\
-	col_##name.type = emitter_type_##etype;
-
-	COL(size, right, 20, size)
-	COL(ind, right, 4, unsigned)
-	COL(allocated, right, 13, size)
-	COL(nmalloc, right, 13, uint64)
-	COL(ndalloc, right, 13, uint64)
-	COL(nrequests, right, 13, uint64)
-	COL(curlextents, right, 13, size)
-#undef COL
+	COL_HDR(row, size, NULL, right, 20, size)
+	COL_HDR(row, ind, NULL, right, 4, unsigned)
+	COL_HDR(row, allocated, NULL, right, 13, size)
+	COL_HDR(row, nmalloc, NULL, right, 13, uint64)
+	COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, ndalloc, NULL, right, 13, uint64)
+	COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nrequests, NULL, right, 13, uint64)
+	COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, curlextents, NULL, right, 13, size)
 
 	/* As with bins, we label the large extents table. */
 	header_size.width -= 6;
 	emitter_table_printf(emitter, "large:");
 	emitter_table_row(emitter, &header_row);
-	emitter_json_arr_begin(emitter, "lextents");
+	emitter_json_array_kv_begin(emitter, "lextents");
 
 	for (j = 0, in_gap = false; j < nlextents; j++) {
 		uint64_t nmalloc, ndalloc, nrequests;
@@ -471,31 +511,122 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) {
 		CTL_M2_M4_GET("stats.arenas.0.lextents.0.curlextents", i, j,
 		    &curlextents, size_t);
 
-		emitter_json_arr_obj_begin(emitter);
+		emitter_json_object_begin(emitter);
 		emitter_json_kv(emitter, "curlextents", emitter_type_size,
 		    &curlextents);
-		emitter_json_arr_obj_end(emitter);
+		emitter_json_object_end(emitter);
 
 		col_size.size_val = lextent_size;
 		col_ind.unsigned_val = nbins + j;
 		col_allocated.size_val = curlextents * lextent_size;
 		col_nmalloc.uint64_val = nmalloc;
+		col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime);
 		col_ndalloc.uint64_val = ndalloc;
+		col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime);
 		col_nrequests.uint64_val = nrequests;
+		col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime);
 		col_curlextents.size_val = curlextents;
 
 		if (!in_gap) {
 			emitter_table_row(emitter, &row);
 		}
 	}
-	emitter_json_arr_end(emitter); /* Close "lextents". */
+	emitter_json_array_end(emitter); /* Close "lextents". */
 	if (in_gap) {
 		emitter_table_printf(emitter, "                     ---\n");
 	}
 }
 
 static void
-stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) {
+stats_arena_extents_print(emitter_t *emitter, unsigned i) {
+	/* Emits dirty/muzzy/retained extent counts and bytes per psz index. */
+	unsigned j;
+	bool in_gap, in_gap_prev;
+	emitter_row_t header_row;
+	emitter_row_init(&header_row);
+	emitter_row_t row;
+	emitter_row_init(&row);
+
+	COL_HDR(row, size, NULL, right, 20, size)
+	COL_HDR(row, ind, NULL, right, 4, unsigned)
+	COL_HDR(row, ndirty, NULL, right, 13, size)
+	COL_HDR(row, dirty, NULL, right, 13, size)
+	COL_HDR(row, nmuzzy, NULL, right, 13, size)
+	COL_HDR(row, muzzy, NULL, right, 13, size)
+	COL_HDR(row, nretained, NULL, right, 13, size)
+	COL_HDR(row, retained, NULL, right, 13, size)
+	COL_HDR(row, ntotal, NULL, right, 13, size)
+	COL_HDR(row, total, NULL, right, 13, size)
+
+	/* Label this section. */
+	header_size.width -= 8;
+	emitter_table_printf(emitter, "extents:");
+	emitter_table_row(emitter, &header_row);
+	emitter_json_array_kv_begin(emitter, "extents");
+
+	in_gap = false;
+	for (j = 0; j < SC_NPSIZES; j++) {
+		size_t ndirty, nmuzzy, nretained, total, dirty_bytes,
+		    muzzy_bytes, retained_bytes, total_bytes;
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.ndirty", i, j,
+		    &ndirty, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.nmuzzy", i, j,
+		    &nmuzzy, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.nretained", i, j,
+		    &nretained, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.dirty_bytes", i, j,
+		    &dirty_bytes, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.muzzy_bytes", i, j,
+		    &muzzy_bytes, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.retained_bytes", i, j,
+		    &retained_bytes, size_t);
+		total = ndirty + nmuzzy + nretained;
+		total_bytes = dirty_bytes + muzzy_bytes + retained_bytes;
+
+		in_gap_prev = in_gap;
+		in_gap = (total == 0);
+
+		if (in_gap_prev && !in_gap) {
+			emitter_table_printf(emitter,
+			    "                     ---\n");
+		}
+
+		emitter_json_object_begin(emitter);
+		emitter_json_kv(emitter, "ndirty", emitter_type_size, &ndirty);
+		emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy);
+		emitter_json_kv(emitter, "nretained", emitter_type_size,
+		    &nretained);
+		emitter_json_kv(emitter, "dirty_bytes", emitter_type_size,
+		    &dirty_bytes);
+		emitter_json_kv(emitter, "muzzy_bytes", emitter_type_size,
+		    &muzzy_bytes);
+		emitter_json_kv(emitter, "retained_bytes", emitter_type_size,
+		    &retained_bytes);
+		emitter_json_object_end(emitter);
+
+		col_size.size_val = sz_pind2sz(j);
+		col_ind.unsigned_val = j; /* The ind column is typed unsigned. */
+		col_ndirty.size_val = ndirty;
+		col_dirty.size_val = dirty_bytes;
+		col_nmuzzy.size_val = nmuzzy;
+		col_muzzy.size_val = muzzy_bytes;
+		col_nretained.size_val = nretained;
+		col_retained.size_val = retained_bytes;
+		col_ntotal.size_val = total;
+		col_total.size_val = total_bytes;
+
+		if (!in_gap) {
+			emitter_table_row(emitter, &row);
+		}
+	}
+	emitter_json_array_end(emitter); /* Close "extents". */
+	if (in_gap) {
+		emitter_table_printf(emitter, "                     ---\n");
+	}
+}
+
+static void
+stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind, uint64_t uptime) {
 	emitter_row_t row;
 	emitter_col_t col_name;
 	emitter_col_t col64[mutex_prof_num_uint64_t_counters];
@@ -504,29 +635,29 @@ stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) {
 	emitter_row_init(&row);
 	mutex_stats_init_cols(&row, "", &col_name, col64, col32);
 
-	emitter_json_dict_begin(emitter, "mutexes");
+	emitter_json_object_kv_begin(emitter, "mutexes");
 	emitter_table_row(emitter, &row);
 
 	for (mutex_prof_arena_ind_t i = 0; i < mutex_prof_num_arena_mutexes;
 	    i++) {
 		const char *name = arena_mutex_names[i];
-		emitter_json_dict_begin(emitter, name);
+		emitter_json_object_kv_begin(emitter, name);
 		mutex_stats_read_arena(arena_ind, i, name, &col_name, col64,
-		    col32);
+		    col32, uptime);
 		mutex_stats_emit(emitter, &row, col64, col32);
-		emitter_json_dict_end(emitter); /* Close the mutex dict. */
+		emitter_json_object_end(emitter); /* Close the mutex dict. */
 	}
-	emitter_json_dict_end(emitter); /* End "mutexes". */
+	emitter_json_object_end(emitter); /* End "mutexes". */
 }
 
 static void
 stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
-    bool mutex) {
+    bool mutex, bool extents) {
 	unsigned nthreads;
 	const char *dss;
 	ssize_t dirty_decay_ms, muzzy_decay_ms;
 	size_t page, pactive, pdirty, pmuzzy, mapped, retained;
-	size_t base, internal, resident, metadata_thp;
+	size_t base, internal, resident, metadata_thp, extent_avail;
 	uint64_t dirty_npurge, dirty_nmadvise, dirty_purged;
 	uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged;
 	size_t small_allocated;
@@ -594,98 +725,74 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	    &muzzy_purged);
 
 	/* Table-style emission. */
-	emitter_col_t decay_type;
-	emitter_col_init(&decay_type, &decay_row);
-	decay_type.justify = emitter_justify_right;
-	decay_type.width = 9;
-	decay_type.type = emitter_type_title;
-	decay_type.str_val = "decaying:";
+	COL(decay_row, decay_type, right, 9, title);
+	col_decay_type.str_val = "decaying:";
 
-	emitter_col_t decay_time;
-	emitter_col_init(&decay_time, &decay_row);
-	decay_time.justify = emitter_justify_right;
-	decay_time.width = 6;
-	decay_time.type = emitter_type_title;
-	decay_time.str_val = "time";
+	COL(decay_row, decay_time, right, 6, title);
+	col_decay_time.str_val = "time";
 
-	emitter_col_t decay_npages;
-	emitter_col_init(&decay_npages, &decay_row);
-	decay_npages.justify = emitter_justify_right;
-	decay_npages.width = 13;
-	decay_npages.type = emitter_type_title;
-	decay_npages.str_val = "npages";
+	COL(decay_row, decay_npages, right, 13, title);
+	col_decay_npages.str_val = "npages";
 
-	emitter_col_t decay_sweeps;
-	emitter_col_init(&decay_sweeps, &decay_row);
-	decay_sweeps.justify = emitter_justify_right;
-	decay_sweeps.width = 13;
-	decay_sweeps.type = emitter_type_title;
-	decay_sweeps.str_val = "sweeps";
+	COL(decay_row, decay_sweeps, right, 13, title);
+	col_decay_sweeps.str_val = "sweeps";
 
-	emitter_col_t decay_madvises;
-	emitter_col_init(&decay_madvises, &decay_row);
-	decay_madvises.justify = emitter_justify_right;
-	decay_madvises.width = 13;
-	decay_madvises.type = emitter_type_title;
-	decay_madvises.str_val = "madvises";
+	COL(decay_row, decay_madvises, right, 13, title);
+	col_decay_madvises.str_val = "madvises";
 
-	emitter_col_t decay_purged;
-	emitter_col_init(&decay_purged, &decay_row);
-	decay_purged.justify = emitter_justify_right;
-	decay_purged.width = 13;
-	decay_purged.type = emitter_type_title;
-	decay_purged.str_val = "purged";
+	COL(decay_row, decay_purged, right, 13, title);
+	col_decay_purged.str_val = "purged";
 
 	/* Title row. */
 	emitter_table_row(emitter, &decay_row);
 
 	/* Dirty row. */
-	decay_type.str_val = "dirty:";
+	col_decay_type.str_val = "dirty:";
 
 	if (dirty_decay_ms >= 0) {
-		decay_time.type = emitter_type_ssize;
-		decay_time.ssize_val = dirty_decay_ms;
+		col_decay_time.type = emitter_type_ssize;
+		col_decay_time.ssize_val = dirty_decay_ms;
 	} else {
-		decay_time.type = emitter_type_title;
-		decay_time.str_val = "N/A";
+		col_decay_time.type = emitter_type_title;
+		col_decay_time.str_val = "N/A";
 	}
 
-	decay_npages.type = emitter_type_size;
-	decay_npages.size_val = pdirty;
+	col_decay_npages.type = emitter_type_size;
+	col_decay_npages.size_val = pdirty;
 
-	decay_sweeps.type = emitter_type_uint64;
-	decay_sweeps.uint64_val = dirty_npurge;
+	col_decay_sweeps.type = emitter_type_uint64;
+	col_decay_sweeps.uint64_val = dirty_npurge;
 
-	decay_madvises.type = emitter_type_uint64;
-	decay_madvises.uint64_val = dirty_nmadvise;
+	col_decay_madvises.type = emitter_type_uint64;
+	col_decay_madvises.uint64_val = dirty_nmadvise;
 
-	decay_purged.type = emitter_type_uint64;
-	decay_purged.uint64_val = dirty_purged;
+	col_decay_purged.type = emitter_type_uint64;
+	col_decay_purged.uint64_val = dirty_purged;
 
 	emitter_table_row(emitter, &decay_row);
 
 	/* Muzzy row. */
-	decay_type.str_val = "muzzy:";
+	col_decay_type.str_val = "muzzy:";
 
 	if (muzzy_decay_ms >= 0) {
-		decay_time.type = emitter_type_ssize;
-		decay_time.ssize_val = muzzy_decay_ms;
+		col_decay_time.type = emitter_type_ssize;
+		col_decay_time.ssize_val = muzzy_decay_ms;
 	} else {
-		decay_time.type = emitter_type_title;
-		decay_time.str_val = "N/A";
+		col_decay_time.type = emitter_type_title;
+		col_decay_time.str_val = "N/A";
 	}
 
-	decay_npages.type = emitter_type_size;
-	decay_npages.size_val = pmuzzy;
+	col_decay_npages.type = emitter_type_size;
+	col_decay_npages.size_val = pmuzzy;
 
-	decay_sweeps.type = emitter_type_uint64;
-	decay_sweeps.uint64_val = muzzy_npurge;
+	col_decay_sweeps.type = emitter_type_uint64;
+	col_decay_sweeps.uint64_val = muzzy_npurge;
 
-	decay_madvises.type = emitter_type_uint64;
-	decay_madvises.uint64_val = muzzy_nmadvise;
+	col_decay_madvises.type = emitter_type_uint64;
+	col_decay_madvises.uint64_val = muzzy_nmadvise;
 
-	decay_purged.type = emitter_type_uint64;
-	decay_purged.uint64_val = muzzy_purged;
+	col_decay_purged.type = emitter_type_uint64;
+	col_decay_purged.uint64_val = muzzy_purged;
 
 	emitter_table_row(emitter, &decay_row);
 
@@ -693,81 +800,89 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	emitter_row_t alloc_count_row;
 	emitter_row_init(&alloc_count_row);
 
-	emitter_col_t alloc_count_title;
-	emitter_col_init(&alloc_count_title, &alloc_count_row);
-	alloc_count_title.justify = emitter_justify_left;
-	alloc_count_title.width = 25;
-	alloc_count_title.type = emitter_type_title;
-	alloc_count_title.str_val = "";
+	COL(alloc_count_row, count_title, left, 21, title);
+	col_count_title.str_val = "";
 
-	emitter_col_t alloc_count_allocated;
-	emitter_col_init(&alloc_count_allocated, &alloc_count_row);
-	alloc_count_allocated.justify = emitter_justify_right;
-	alloc_count_allocated.width = 12;
-	alloc_count_allocated.type = emitter_type_title;
-	alloc_count_allocated.str_val = "allocated";
+	COL(alloc_count_row, count_allocated, right, 16, title);
+	col_count_allocated.str_val = "allocated";
 
-	emitter_col_t alloc_count_nmalloc;
-	emitter_col_init(&alloc_count_nmalloc, &alloc_count_row);
-	alloc_count_nmalloc.justify = emitter_justify_right;
-	alloc_count_nmalloc.width = 12;
-	alloc_count_nmalloc.type = emitter_type_title;
-	alloc_count_nmalloc.str_val = "nmalloc";
+	COL(alloc_count_row, count_nmalloc, right, 16, title);
+	col_count_nmalloc.str_val = "nmalloc";
+	COL(alloc_count_row, count_nmalloc_ps, right, 8, title);
+	col_count_nmalloc_ps.str_val = "(#/sec)";
 
-	emitter_col_t alloc_count_ndalloc;
-	emitter_col_init(&alloc_count_ndalloc, &alloc_count_row);
-	alloc_count_ndalloc.justify = emitter_justify_right;
-	alloc_count_ndalloc.width = 12;
-	alloc_count_ndalloc.type = emitter_type_title;
-	alloc_count_ndalloc.str_val = "ndalloc";
+	COL(alloc_count_row, count_ndalloc, right, 16, title);
+	col_count_ndalloc.str_val = "ndalloc";
+	COL(alloc_count_row, count_ndalloc_ps, right, 8, title);
+	col_count_ndalloc_ps.str_val = "(#/sec)";
 
-	emitter_col_t alloc_count_nrequests;
-	emitter_col_init(&alloc_count_nrequests, &alloc_count_row);
-	alloc_count_nrequests.justify = emitter_justify_right;
-	alloc_count_nrequests.width = 12;
-	alloc_count_nrequests.type = emitter_type_title;
-	alloc_count_nrequests.str_val = "nrequests";
+	COL(alloc_count_row, count_nrequests, right, 16, title);
+	col_count_nrequests.str_val = "nrequests";
+	COL(alloc_count_row, count_nrequests_ps, right, 10, title);
+	col_count_nrequests_ps.str_val = "(#/sec)";
 
 	emitter_table_row(emitter, &alloc_count_row);
 
+	col_count_nmalloc_ps.type = emitter_type_uint64;
+	col_count_ndalloc_ps.type = emitter_type_uint64;
+	col_count_nrequests_ps.type = emitter_type_uint64;
+
 #define GET_AND_EMIT_ALLOC_STAT(small_or_large, name, valtype)		\
 	CTL_M2_GET("stats.arenas.0." #small_or_large "." #name, i,	\
 	    &small_or_large##_##name, valtype##_t);			\
 	emitter_json_kv(emitter, #name, emitter_type_##valtype,		\
 	    &small_or_large##_##name);					\
-	alloc_count_##name.type = emitter_type_##valtype;		\
-	alloc_count_##name.valtype##_val = small_or_large##_##name;
+	col_count_##name.type = emitter_type_##valtype;		\
+	col_count_##name.valtype##_val = small_or_large##_##name;
 
-	emitter_json_dict_begin(emitter, "small");
-	alloc_count_title.str_val = "small:";
+	emitter_json_object_kv_begin(emitter, "small");
+	col_count_title.str_val = "small:";
 
 	GET_AND_EMIT_ALLOC_STAT(small, allocated, size)
 	GET_AND_EMIT_ALLOC_STAT(small, nmalloc, uint64)
+	col_count_nmalloc_ps.uint64_val =
+	    rate_per_second(col_count_nmalloc.uint64_val, uptime);
 	GET_AND_EMIT_ALLOC_STAT(small, ndalloc, uint64)
+	col_count_ndalloc_ps.uint64_val =
+	    rate_per_second(col_count_ndalloc.uint64_val, uptime);
 	GET_AND_EMIT_ALLOC_STAT(small, nrequests, uint64)
+	col_count_nrequests_ps.uint64_val =
+	    rate_per_second(col_count_nrequests.uint64_val, uptime);
 
 	emitter_table_row(emitter, &alloc_count_row);
-	emitter_json_dict_end(emitter); /* Close "small". */
+	emitter_json_object_end(emitter); /* Close "small". */
 
-	emitter_json_dict_begin(emitter, "large");
-	alloc_count_title.str_val = "large:";
+	emitter_json_object_kv_begin(emitter, "large");
+	col_count_title.str_val = "large:";
 
 	GET_AND_EMIT_ALLOC_STAT(large, allocated, size)
 	GET_AND_EMIT_ALLOC_STAT(large, nmalloc, uint64)
+	col_count_nmalloc_ps.uint64_val =
+	    rate_per_second(col_count_nmalloc.uint64_val, uptime);
 	GET_AND_EMIT_ALLOC_STAT(large, ndalloc, uint64)
+	col_count_ndalloc_ps.uint64_val =
+	    rate_per_second(col_count_ndalloc.uint64_val, uptime);
 	GET_AND_EMIT_ALLOC_STAT(large, nrequests, uint64)
+	col_count_nrequests_ps.uint64_val =
+	    rate_per_second(col_count_nrequests.uint64_val, uptime);
 
 	emitter_table_row(emitter, &alloc_count_row);
-	emitter_json_dict_end(emitter); /* Close "large". */
+	emitter_json_object_end(emitter); /* Close "large". */
 
 #undef GET_AND_EMIT_ALLOC_STAT
 
 	/* Aggregated small + large stats are emitter only in table mode. */
-	alloc_count_title.str_val = "total:";
-	alloc_count_allocated.size_val = small_allocated + large_allocated;
-	alloc_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc;
-	alloc_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc;
-	alloc_count_nrequests.uint64_val = small_nrequests + large_nrequests;
+	col_count_title.str_val = "total:";
+	col_count_allocated.size_val = small_allocated + large_allocated;
+	col_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc;
+	col_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc;
+	col_count_nrequests.uint64_val = small_nrequests + large_nrequests;
+	col_count_nmalloc_ps.uint64_val =
+	    rate_per_second(col_count_nmalloc.uint64_val, uptime);
+	col_count_ndalloc_ps.uint64_val =
+	    rate_per_second(col_count_ndalloc.uint64_val, uptime);
+	col_count_nrequests_ps.uint64_val =
+	    rate_per_second(col_count_nrequests.uint64_val, uptime);
 	emitter_table_row(emitter, &alloc_count_row);
 
 	emitter_row_t mem_count_row;
@@ -776,14 +891,14 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	emitter_col_t mem_count_title;
 	emitter_col_init(&mem_count_title, &mem_count_row);
 	mem_count_title.justify = emitter_justify_left;
-	mem_count_title.width = 25;
+	mem_count_title.width = 21;
 	mem_count_title.type = emitter_type_title;
 	mem_count_title.str_val = "";
 
 	emitter_col_t mem_count_val;
 	emitter_col_init(&mem_count_val, &mem_count_row);
 	mem_count_val.justify = emitter_justify_right;
-	mem_count_val.width = 12;
+	mem_count_val.width = 16;
 	mem_count_val.type = emitter_type_title;
 	mem_count_val.str_val = "";
 
@@ -809,16 +924,20 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	GET_AND_EMIT_MEM_STAT(metadata_thp)
 	GET_AND_EMIT_MEM_STAT(tcache_bytes)
 	GET_AND_EMIT_MEM_STAT(resident)
+	GET_AND_EMIT_MEM_STAT(extent_avail)
 #undef GET_AND_EMIT_MEM_STAT
 
 	if (mutex) {
-		stats_arena_mutexes_print(emitter, i);
+		stats_arena_mutexes_print(emitter, i, uptime);
 	}
 	if (bins) {
-		stats_arena_bins_print(emitter, mutex, i);
+		stats_arena_bins_print(emitter, mutex, i, uptime);
 	}
 	if (large) {
-		stats_arena_lextents_print(emitter, i);
+		stats_arena_lextents_print(emitter, i, uptime);
+	}
+	if (extents) {
+		stats_arena_extents_print(emitter, i);
 	}
 }
 
@@ -891,6 +1010,8 @@ stats_general_print(emitter_t *emitter) {
 #define OPT_WRITE_UNSIGNED(name)					\
 	OPT_WRITE(name, uv, usz, emitter_type_unsigned)
 
+#define OPT_WRITE_SIZE_T(name)						\
+	OPT_WRITE(name, sv, ssz, emitter_type_size)
 #define OPT_WRITE_SSIZE_T(name)						\
 	OPT_WRITE(name, ssv, sssz, emitter_type_ssize)
 #define OPT_WRITE_SSIZE_T_MUTABLE(name, altname)			\
@@ -908,11 +1029,12 @@ stats_general_print(emitter_t *emitter) {
 	OPT_WRITE_CHAR_P("dss")
 	OPT_WRITE_UNSIGNED("narenas")
 	OPT_WRITE_CHAR_P("percpu_arena")
+	OPT_WRITE_SIZE_T("oversize_threshold")
 	OPT_WRITE_CHAR_P("metadata_thp")
 	OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")
 	OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms")
 	OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms")
-	OPT_WRITE_UNSIGNED("lg_extent_max_active_fit")
+	OPT_WRITE_SIZE_T("lg_extent_max_active_fit")
 	OPT_WRITE_CHAR_P("junk")
 	OPT_WRITE_BOOL("zero")
 	OPT_WRITE_BOOL("utrace")
@@ -977,7 +1099,7 @@ stats_general_print(emitter_t *emitter) {
 	 * The json output sticks arena info into an "arenas" dict; the table
 	 * output puts them at the top-level.
 	 */
-	emitter_json_dict_begin(emitter, "arenas");
+	emitter_json_object_kv_begin(emitter, "arenas");
 
 	CTL_GET("arenas.narenas", &uv, unsigned);
 	emitter_kv(emitter, "narenas", "Arenas", emitter_type_unsigned, &uv);
@@ -1018,9 +1140,9 @@ stats_general_print(emitter_t *emitter) {
 	 * (not just omit the printing).
 	 */
 	if (emitter->output == emitter_output_json) {
-		emitter_json_arr_begin(emitter, "bin");
+		emitter_json_array_kv_begin(emitter, "bin");
 		for (unsigned i = 0; i < nbins; i++) {
-			emitter_json_arr_obj_begin(emitter);
+			emitter_json_object_begin(emitter);
 
 			CTL_M2_GET("arenas.bin.0.size", i, &sv, size_t);
 			emitter_json_kv(emitter, "size", emitter_type_size,
@@ -1034,9 +1156,13 @@ stats_general_print(emitter_t *emitter) {
 			emitter_json_kv(emitter, "slab_size", emitter_type_size,
 			    &sv);
 
-			emitter_json_arr_obj_end(emitter);
+			CTL_M2_GET("arenas.bin.0.nshards", i, &u32v, uint32_t);
+			emitter_json_kv(emitter, "nshards", emitter_type_uint32,
+			    &u32v);
+
+			emitter_json_object_end(emitter);
 		}
-		emitter_json_arr_end(emitter); /* Close "bin". */
+		emitter_json_array_end(emitter); /* Close "bin". */
 	}
 
 	unsigned nlextents;
@@ -1045,25 +1171,25 @@ stats_general_print(emitter_t *emitter) {
 	    emitter_type_unsigned, &nlextents);
 
 	if (emitter->output == emitter_output_json) {
-		emitter_json_arr_begin(emitter, "lextent");
+		emitter_json_array_kv_begin(emitter, "lextent");
 		for (unsigned i = 0; i < nlextents; i++) {
-			emitter_json_arr_obj_begin(emitter);
+			emitter_json_object_begin(emitter);
 
 			CTL_M2_GET("arenas.lextent.0.size", i, &sv, size_t);
 			emitter_json_kv(emitter, "size", emitter_type_size,
 			    &sv);
 
-			emitter_json_arr_obj_end(emitter);
+			emitter_json_object_end(emitter);
 		}
-		emitter_json_arr_end(emitter); /* Close "lextent". */
+		emitter_json_array_end(emitter); /* Close "lextent". */
 	}
 
-	emitter_json_dict_end(emitter); /* Close "arenas" */
+	emitter_json_object_end(emitter); /* Close "arenas" */
 }
 
 static void
 stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
-    bool unmerged, bool bins, bool large, bool mutex) {
+    bool unmerged, bool bins, bool large, bool mutex, bool extents) {
 	/*
 	 * These should be deleted.  We keep them around for a while, to aid in
 	 * the transition to the emitter code.
@@ -1095,7 +1221,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
 	}
 
 	/* Generic global stats. */
-	emitter_json_dict_begin(emitter, "stats");
+	emitter_json_object_kv_begin(emitter, "stats");
 	emitter_json_kv(emitter, "allocated", emitter_type_size, &allocated);
 	emitter_json_kv(emitter, "active", emitter_type_size, &active);
 	emitter_json_kv(emitter, "metadata", emitter_type_size, &metadata);
@@ -1111,14 +1237,14 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
 	    resident, mapped, retained);
 
 	/* Background thread stats. */
-	emitter_json_dict_begin(emitter, "background_thread");
+	emitter_json_object_kv_begin(emitter, "background_thread");
 	emitter_json_kv(emitter, "num_threads", emitter_type_size,
 	    &num_background_threads);
 	emitter_json_kv(emitter, "num_runs", emitter_type_uint64,
 	    &background_thread_num_runs);
 	emitter_json_kv(emitter, "run_interval", emitter_type_uint64,
 	    &background_thread_run_interval);
-	emitter_json_dict_end(emitter); /* Close "background_thread". */
+	emitter_json_object_end(emitter); /* Close "background_thread". */
 
 	emitter_table_printf(emitter, "Background threads: %zu, "
 	    "num_runs: %"FMTu64", run_interval: %"FMTu64" ns\n",
@@ -1130,30 +1256,33 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
 		emitter_col_t name;
 		emitter_col_t col64[mutex_prof_num_uint64_t_counters];
 		emitter_col_t col32[mutex_prof_num_uint32_t_counters];
+		uint64_t uptime;
 
 		emitter_row_init(&row);
 		mutex_stats_init_cols(&row, "", &name, col64, col32);
 
 		emitter_table_row(emitter, &row);
-		emitter_json_dict_begin(emitter, "mutexes");
+		emitter_json_object_kv_begin(emitter, "mutexes");
+
+		CTL_M2_GET("stats.arenas.0.uptime", 0, &uptime, uint64_t);
 
 		for (int i = 0; i < mutex_prof_num_global_mutexes; i++) {
 			mutex_stats_read_global(global_mutex_names[i], &name,
-			    col64, col32);
-			emitter_json_dict_begin(emitter, global_mutex_names[i]);
+			    col64, col32, uptime);
+			emitter_json_object_kv_begin(emitter, global_mutex_names[i]);
 			mutex_stats_emit(emitter, &row, col64, col32);
-			emitter_json_dict_end(emitter);
+			emitter_json_object_end(emitter);
 		}
 
-		emitter_json_dict_end(emitter); /* Close "mutexes". */
+		emitter_json_object_end(emitter); /* Close "mutexes". */
 	}
 
-	emitter_json_dict_end(emitter); /* Close "stats". */
+	emitter_json_object_end(emitter); /* Close "stats". */
 
 	if (merged || destroyed || unmerged) {
 		unsigned narenas;
 
-		emitter_json_dict_begin(emitter, "stats.arenas");
+		emitter_json_object_kv_begin(emitter, "stats.arenas");
 
 		CTL_GET("arenas.narenas", &narenas, unsigned);
 		size_t mib[3];
@@ -1182,10 +1311,10 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
 		if (merged && (ninitialized > 1 || !unmerged)) {
 			/* Print merged arena stats. */
 			emitter_table_printf(emitter, "Merged arenas stats:\n");
-			emitter_json_dict_begin(emitter, "merged");
+			emitter_json_object_kv_begin(emitter, "merged");
 			stats_arena_print(emitter, MALLCTL_ARENAS_ALL, bins,
-			    large, mutex);
-			emitter_json_dict_end(emitter); /* Close "merged". */
+			    large, mutex, extents);
+			emitter_json_object_end(emitter); /* Close "merged". */
 		}
 
 		/* Destroyed stats. */
@@ -1193,10 +1322,10 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
 			/* Print destroyed arena stats. */
 			emitter_table_printf(emitter,
 			    "Destroyed arenas stats:\n");
-			emitter_json_dict_begin(emitter, "destroyed");
+			emitter_json_object_kv_begin(emitter, "destroyed");
 			stats_arena_print(emitter, MALLCTL_ARENAS_DESTROYED,
-			    bins, large, mutex);
-			emitter_json_dict_end(emitter); /* Close "destroyed". */
+			    bins, large, mutex, extents);
+			emitter_json_object_end(emitter); /* Close "destroyed". */
 		}
 
 		/* Unmerged stats. */
@@ -1206,18 +1335,18 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
 					char arena_ind_str[20];
 					malloc_snprintf(arena_ind_str,
 					    sizeof(arena_ind_str), "%u", i);
-					emitter_json_dict_begin(emitter,
+					emitter_json_object_kv_begin(emitter,
 					    arena_ind_str);
 					emitter_table_printf(emitter,
 					    "arenas[%s]:\n", arena_ind_str);
 					stats_arena_print(emitter, i, bins,
-					    large, mutex);
+					    large, mutex, extents);
 					/* Close "<arena-ind>". */
-					emitter_json_dict_end(emitter);
+					emitter_json_object_end(emitter);
 				}
 			}
 		}
-		emitter_json_dict_end(emitter); /* Close "stats.arenas". */
+		emitter_json_object_end(emitter); /* Close "stats.arenas". */
 	}
 }
 
@@ -1270,17 +1399,17 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	    cbopaque);
 	emitter_begin(&emitter);
 	emitter_table_printf(&emitter, "___ Begin jemalloc statistics ___\n");
-	emitter_json_dict_begin(&emitter, "jemalloc");
+	emitter_json_object_kv_begin(&emitter, "jemalloc");
 
 	if (general) {
 		stats_general_print(&emitter);
 	}
 	if (config_stats) {
 		stats_print_helper(&emitter, merged, destroyed, unmerged,
-		    bins, large, mutex);
+		    bins, large, mutex, extents);
 	}
 
-	emitter_json_dict_end(&emitter); /* Closes the "jemalloc" dict. */
+	emitter_json_object_end(&emitter); /* Closes the "jemalloc" dict. */
 	emitter_table_printf(&emitter, "--- End jemalloc statistics ---\n");
 	emitter_end(&emitter);
 }
diff --git a/src/sz.c b/src/sz.c
index 9de77e45..8633fb05 100644
--- a/src/sz.c
+++ b/src/sz.c
@@ -2,106 +2,63 @@
 #include "jemalloc/internal/sz.h"
 
 JEMALLOC_ALIGNED(CACHELINE)
-const size_t sz_pind2sz_tab[NPSIZES+1] = {
-#define PSZ_yes(lg_grp, ndelta, lg_delta)				\
-	(((ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta))),
-#define PSZ_no(lg_grp, ndelta, lg_delta)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \
-	PSZ_##psz(lg_grp, ndelta, lg_delta)
-	SIZE_CLASSES
-#undef PSZ_yes
-#undef PSZ_no
-#undef SC
-	(LARGE_MAXCLASS + PAGE)
-};
+size_t sz_pind2sz_tab[SC_NPSIZES+1];
+
+static void
+sz_boot_pind2sz_tab(const sc_data_t *sc_data) {
+	int pind = 0; /* Next sz_pind2sz_tab slot to fill. */
+	for (unsigned i = 0; i < SC_NSIZES; i++) {
+		const sc_t *sc = &sc_data->sc[i];
+		if (sc->psz) { /* Only classes flagged psz get a slot. */
+			sz_pind2sz_tab[pind] = (ZU(1) << sc->lg_base)
+			    + (ZU(sc->ndelta) << sc->lg_delta);
+			pind++;
+		}
+	}
+	for (int i = pind; i <= (int)SC_NPSIZES; i++) { /* Fill the tail. */
+		sz_pind2sz_tab[i] = sc_data->large_maxclass + PAGE;
+	}
+}
 
 JEMALLOC_ALIGNED(CACHELINE)
-const size_t sz_index2size_tab[NSIZES] = {
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \
-	((ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta)),
-	SIZE_CLASSES
-#undef SC
-};
+size_t sz_index2size_tab[SC_NSIZES];
 
+static void
+sz_boot_index2size_tab(const sc_data_t *sc_data) {
+	for (unsigned szind = 0; szind < SC_NSIZES; szind++) {
+		const sc_t *cls = &sc_data->sc[szind]; /* Class being sized. */
+		size_t spacing = ZU(cls->ndelta) << cls->lg_delta;
+		sz_index2size_tab[szind] = (ZU(1) << cls->lg_base) + spacing;
+	}
+}
+
+/*
+ * To keep this table small, we divide sizes by the tiny min size, which gives
+ * the smallest interval for which the result can change.
+ */
 JEMALLOC_ALIGNED(CACHELINE)
-const uint8_t sz_size2index_tab[] = {
-#if LG_TINY_MIN == 0
-/* The div module doesn't support division by 1. */
-#error "Unsupported LG_TINY_MIN"
-#define S2B_0(i)	i,
-#elif LG_TINY_MIN == 1
-#warning "Dangerous LG_TINY_MIN"
-#define S2B_1(i)	i,
-#elif LG_TINY_MIN == 2
-#warning "Dangerous LG_TINY_MIN"
-#define S2B_2(i)	i,
-#elif LG_TINY_MIN == 3
-#define S2B_3(i)	i,
-#elif LG_TINY_MIN == 4
-#define S2B_4(i)	i,
-#elif LG_TINY_MIN == 5
-#define S2B_5(i)	i,
-#elif LG_TINY_MIN == 6
-#define S2B_6(i)	i,
-#elif LG_TINY_MIN == 7
-#define S2B_7(i)	i,
-#elif LG_TINY_MIN == 8
-#define S2B_8(i)	i,
-#elif LG_TINY_MIN == 9
-#define S2B_9(i)	i,
-#elif LG_TINY_MIN == 10
-#define S2B_10(i)	i,
-#elif LG_TINY_MIN == 11
-#define S2B_11(i)	i,
-#else
-#error "Unsupported LG_TINY_MIN"
-#endif
-#if LG_TINY_MIN < 1
-#define S2B_1(i)	S2B_0(i) S2B_0(i)
-#endif
-#if LG_TINY_MIN < 2
-#define S2B_2(i)	S2B_1(i) S2B_1(i)
-#endif
-#if LG_TINY_MIN < 3
-#define S2B_3(i)	S2B_2(i) S2B_2(i)
-#endif
-#if LG_TINY_MIN < 4
-#define S2B_4(i)	S2B_3(i) S2B_3(i)
-#endif
-#if LG_TINY_MIN < 5
-#define S2B_5(i)	S2B_4(i) S2B_4(i)
-#endif
-#if LG_TINY_MIN < 6
-#define S2B_6(i)	S2B_5(i) S2B_5(i)
-#endif
-#if LG_TINY_MIN < 7
-#define S2B_7(i)	S2B_6(i) S2B_6(i)
-#endif
-#if LG_TINY_MIN < 8
-#define S2B_8(i)	S2B_7(i) S2B_7(i)
-#endif
-#if LG_TINY_MIN < 9
-#define S2B_9(i)	S2B_8(i) S2B_8(i)
-#endif
-#if LG_TINY_MIN < 10
-#define S2B_10(i)	S2B_9(i) S2B_9(i)
-#endif
-#if LG_TINY_MIN < 11
-#define S2B_11(i)	S2B_10(i) S2B_10(i)
-#endif
-#define S2B_no(i)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \
-	S2B_##lg_delta_lookup(index)
-	SIZE_CLASSES
-#undef S2B_3
-#undef S2B_4
-#undef S2B_5
-#undef S2B_6
-#undef S2B_7
-#undef S2B_8
-#undef S2B_9
-#undef S2B_10
-#undef S2B_11
-#undef S2B_no
-#undef SC
-};
+uint8_t sz_size2index_tab[(SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1];
+
+static void
+sz_boot_size2index_tab(const sc_data_t *sc_data) {
+	size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1;
+	size_t dst_ind = 0; /* Next sz_size2index_tab slot to fill. */
+	for (unsigned sc_ind = 0; sc_ind < SC_NSIZES && dst_ind < dst_max;
+	    sc_ind++) { /* Stops early once every slot is written. */
+		const sc_t *sc = &sc_data->sc[sc_ind];
+		size_t sz = (ZU(1) << sc->lg_base)
+		    + (ZU(sc->ndelta) << sc->lg_delta);
+		size_t max_ind = ((sz + (ZU(1) << SC_LG_TINY_MIN) - 1)
+				   >> SC_LG_TINY_MIN); /* Rounds up. */
+		for (; dst_ind <= max_ind && dst_ind < dst_max; dst_ind++) {
+			sz_size2index_tab[dst_ind] = sc_ind;
+		}
+	}
+}
+
+void
+sz_boot(const sc_data_t *sc_data) {
+	sz_boot_pind2sz_tab(sc_data); /* Fills sz_pind2sz_tab. */
+	sz_boot_index2size_tab(sc_data); /* Fills sz_index2size_tab. */
+	sz_boot_size2index_tab(sc_data); /* Fills sz_size2index_tab. */
+}
diff --git a/src/tcache.c b/src/tcache.c
index a769a6b1..e7b970d9 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -4,7 +4,7 @@
 
 #include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/mutex.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 
 /******************************************************************************/
 /* Data. */
@@ -41,7 +41,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 	szind_t binind = tcache->next_gc_bin;
 
 	cache_bin_t *tbin;
-	if (binind < NBINS) {
+	if (binind < SC_NBINS) {
 		tbin = tcache_small_bin_get(tcache, binind);
 	} else {
 		tbin = tcache_large_bin_get(tcache, binind);
@@ -50,7 +50,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 		/*
 		 * Flush (ceiling) 3/4 of the objects below the low water mark.
 		 */
-		if (binind < NBINS) {
+		if (binind < SC_NBINS) {
 			tcache_bin_flush_small(tsd, tcache, tbin, binind,
 			    tbin->ncached - tbin->low_water + (tbin->low_water
 			    >> 2));
@@ -72,7 +72,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 		 * Increase fill count by 2X for small bins.  Make sure
 		 * lg_fill_div stays greater than 0.
 		 */
-		if (binind < NBINS && tcache->lg_fill_div[binind] > 1) {
+		if (binind < SC_NBINS && tcache->lg_fill_div[binind] > 1) {
 			tcache->lg_fill_div[binind]--;
 		}
 	}
@@ -100,28 +100,68 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 	return ret;
 }
 
+/* Enabled with --enable-extra-size-check. */
+#ifdef JEMALLOC_EXTRA_SIZE_CHECK
+static void
+tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind,
+    size_t nflush, extent_t **extents){
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
+
+	/*
+	 * Verify that the items in the tcache all have the correct size; this
+	 * is useful for catching sized deallocation bugs, also to fail early
+	 * instead of corrupting metadata.  Since this can be turned on for opt
+	 * builds, avoid the branch in the loop.
+	 */
+	szind_t szind; /* Size index read back for each item. */
+	size_t sz_sum = binind * nflush; /* Expected sum of szinds. */
+	for (unsigned i = 0 ; i < nflush; i++) {
+		rtree_extent_szind_read(tsdn, &extents_rtree,
+		    rtree_ctx, (uintptr_t)*(tbin->avail - 1 - i), true,
+		    &extents[i], &szind);
+		sz_sum -= szind; /* Ends at 0 when every szind == binind. */
+	}
+	if (sz_sum != 0) { /* NOTE: offsetting mismatches would cancel. */
+		malloc_printf("<jemalloc>: size mismatch in thread cache "
+		    "detected, likely caused by sized deallocation bugs by "
+		    "application. Abort.\n");
+		abort();
+	}
+}
+#endif
+
 void
 tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
     szind_t binind, unsigned rem) {
 	bool merged_stats = false;
 
-	assert(binind < NBINS);
+	assert(binind < SC_NBINS);
 	assert((cache_bin_sz_t)rem <= tbin->ncached);
 
 	arena_t *arena = tcache->arena;
 	assert(arena != NULL);
 	unsigned nflush = tbin->ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
+
+#ifndef JEMALLOC_EXTRA_SIZE_CHECK
 	/* Look up extent once per item. */
 	for (unsigned i = 0 ; i < nflush; i++) {
 		item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
 	}
-
+#else
+	tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush,
+	    item_extent);
+#endif
 	while (nflush > 0) {
 		/* Lock the arena bin associated with the first object. */
 		extent_t *extent = item_extent[0];
-		arena_t *bin_arena = extent_arena_get(extent);
-		bin_t *bin = &bin_arena->bins[binind];
+		unsigned bin_arena_ind = extent_arena_ind_get(extent);
+		arena_t *bin_arena = arena_get(tsd_tsdn(tsd), bin_arena_ind,
+		    false);
+		unsigned binshard = extent_binshard_get(extent);
+		assert(binshard < bin_infos[binind].n_shards);
+		bin_t *bin = &bin_arena->bins[binind].bin_shards[binshard];
 
 		if (config_prof && bin_arena == arena) {
 			if (arena_prof_accum(tsd_tsdn(tsd), arena,
@@ -132,8 +172,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		}
 
 		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		if (config_stats && bin_arena == arena) {
-			assert(!merged_stats);
+		if (config_stats && bin_arena == arena && !merged_stats) {
 			merged_stats = true;
 			bin->stats.nflushes++;
 			bin->stats.nrequests += tbin->tstats.nrequests;
@@ -145,9 +184,10 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
-			if (extent_arena_get(extent) == bin_arena) {
+			if (extent_arena_ind_get(extent) == bin_arena_ind
+			    && extent_binshard_get(extent) == binshard) {
 				arena_dalloc_bin_junked_locked(tsd_tsdn(tsd),
-				    bin_arena, extent, ptr);
+				    bin_arena, bin, binind, extent, ptr);
 			} else {
 				/*
 				 * This object was allocated via a different
@@ -169,8 +209,9 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		bin_t *bin = &arena->bins[binind];
-		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+		unsigned binshard;
+		bin_t *bin = arena_bin_choose_lock(tsd_tsdn(tsd), arena, binind,
+		    &binshard);
 		bin->stats.nflushes++;
 		bin->stats.nrequests += tbin->tstats.nrequests;
 		tbin->tstats.nrequests = 0;
@@ -193,50 +234,63 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
 	assert(binind < nhbins);
 	assert((cache_bin_sz_t)rem <= tbin->ncached);
 
-	arena_t *arena = tcache->arena;
-	assert(arena != NULL);
+	arena_t *tcache_arena = tcache->arena;
+	assert(tcache_arena != NULL);
 	unsigned nflush = tbin->ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
+
+#ifndef JEMALLOC_EXTRA_SIZE_CHECK
 	/* Look up extent once per item. */
 	for (unsigned i = 0 ; i < nflush; i++) {
 		item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
 	}
-
+#else
+	tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush,
+	    item_extent);
+#endif
 	while (nflush > 0) {
 		/* Lock the arena associated with the first object. */
 		extent_t *extent = item_extent[0];
-		arena_t *locked_arena = extent_arena_get(extent);
-		UNUSED bool idump;
+		unsigned locked_arena_ind = extent_arena_ind_get(extent);
+		arena_t *locked_arena = arena_get(tsd_tsdn(tsd),
+		    locked_arena_ind, false);
+		bool idump;
 
 		if (config_prof) {
 			idump = false;
 		}
 
-		malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		bool lock_large = !arena_is_auto(locked_arena);
+		if (lock_large) {
+			malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		}
 		for (unsigned i = 0; i < nflush; i++) {
 			void *ptr = *(tbin->avail - 1 - i);
 			assert(ptr != NULL);
 			extent = item_extent[i];
-			if (extent_arena_get(extent) == locked_arena) {
+			if (extent_arena_ind_get(extent) == locked_arena_ind) {
 				large_dalloc_prep_junked_locked(tsd_tsdn(tsd),
 				    extent);
 			}
 		}
-		if ((config_prof || config_stats) && locked_arena == arena) {
+		if ((config_prof || config_stats) &&
+		    (locked_arena == tcache_arena)) {
 			if (config_prof) {
-				idump = arena_prof_accum(tsd_tsdn(tsd), arena,
-				    tcache->prof_accumbytes);
+				idump = arena_prof_accum(tsd_tsdn(tsd),
+				    tcache_arena, tcache->prof_accumbytes);
 				tcache->prof_accumbytes = 0;
 			}
 			if (config_stats) {
 				merged_stats = true;
 				arena_stats_large_nrequests_add(tsd_tsdn(tsd),
-				    &arena->stats, binind,
+				    &tcache_arena->stats, binind,
 				    tbin->tstats.nrequests);
 				tbin->tstats.nrequests = 0;
 			}
 		}
-		malloc_mutex_unlock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		if (lock_large) {
+			malloc_mutex_unlock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		}
 
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
@@ -244,7 +298,7 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
-			if (extent_arena_get(extent) == locked_arena) {
+			if (extent_arena_ind_get(extent) == locked_arena_ind) {
 				large_dalloc_finish(tsd_tsdn(tsd), extent);
 			} else {
 				/*
@@ -270,8 +324,8 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		arena_stats_large_nrequests_add(tsd_tsdn(tsd), &arena->stats,
-		    binind, tbin->tstats.nrequests);
+		arena_stats_large_nrequests_add(tsd_tsdn(tsd),
+		    &tcache_arena->stats, binind, tbin->tstats.nrequests);
 		tbin->tstats.nrequests = 0;
 	}
 
@@ -363,10 +417,10 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
 
 	size_t stack_offset = 0;
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
-	memset(tcache->bins_small, 0, sizeof(cache_bin_t) * NBINS);
-	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - NBINS));
+	memset(tcache->bins_small, 0, sizeof(cache_bin_t) * SC_NBINS);
+	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS));
 	unsigned i = 0;
-	for (; i < NBINS; i++) {
+	for (; i < SC_NBINS; i++) {
 		tcache->lg_fill_div[i] = 1;
 		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 		/*
@@ -458,7 +512,7 @@ static void
 tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
 	assert(tcache->arena != NULL);
 
-	for (unsigned i = 0; i < NBINS; i++) {
+	for (unsigned i = 0; i < SC_NBINS; i++) {
 		cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
 		tcache_bin_flush_small(tsd, tcache, tbin, i, 0);
 
@@ -466,7 +520,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
 			assert(tbin->tstats.nrequests == 0);
 		}
 	}
-	for (unsigned i = NBINS; i < nhbins; i++) {
+	for (unsigned i = SC_NBINS; i < nhbins; i++) {
 		cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
 		tcache_bin_flush_large(tsd, tbin, i, 0, tcache);
 
@@ -491,6 +545,7 @@ tcache_flush(tsd_t *tsd) {
 static void
 tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 	tcache_flush_cache(tsd, tcache);
+	arena_t *arena = tcache->arena;
 	tcache_arena_dissociate(tsd_tsdn(tsd), tcache);
 
 	if (tsd_tcache) {
@@ -503,6 +558,23 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 		/* Release both the tcache struct and avail array. */
 		idalloctm(tsd_tsdn(tsd), tcache, NULL, NULL, true, true);
 	}
+
+	/*
+	 * The deallocation and tcache flush above may not trigger decay since
+	 * we are on the tcache shutdown path (potentially with non-nominal
+	 * tsd).  Manually trigger decay to avoid pathological cases.  Also
+	 * include arena 0 because the tcache array is allocated from it.
+	 */
+	arena_decay(tsd_tsdn(tsd), arena_get(tsd_tsdn(tsd), 0, false),
+	    false, false);
+
+	if (arena_nthreads_get(arena, false) == 0 &&
+	    !background_thread_enabled()) {
+		/* Force purging when no threads assigned to the arena anymore. */
+		arena_decay(tsd_tsdn(tsd), arena, false, true);
+	} else {
+		arena_decay(tsd_tsdn(tsd), arena, false, false);
+	}
 }
 
 /* For auto tcache (embedded in TSD) only. */
@@ -532,10 +604,10 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 	cassert(config_stats);
 
 	/* Merge and reset tcache stats. */
-	for (i = 0; i < NBINS; i++) {
-		bin_t *bin = &arena->bins[i];
+	for (i = 0; i < SC_NBINS; i++) {
 		cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
-		malloc_mutex_lock(tsdn, &bin->lock);
+		unsigned binshard;
+		bin_t *bin = arena_bin_choose_lock(tsdn, arena, i, &binshard);
 		bin->stats.nrequests += tbin->tstats.nrequests;
 		malloc_mutex_unlock(tsdn, &bin->lock);
 		tbin->tstats.nrequests = 0;
@@ -614,23 +686,32 @@ label_return:
 }
 
 static tcache_t *
-tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm) {
+tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm, bool allow_reinit) {
 	malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx);
 
 	if (elm->tcache == NULL) {
 		return NULL;
 	}
 	tcache_t *tcache = elm->tcache;
-	elm->tcache = NULL;
+	if (allow_reinit) {
+		elm->tcache = TCACHES_ELM_NEED_REINIT;
+	} else {
+		elm->tcache = NULL;
+	}
+
+	if (tcache == TCACHES_ELM_NEED_REINIT) {
+		return NULL;
+	}
 	return tcache;
 }
 
 void
 tcaches_flush(tsd_t *tsd, unsigned ind) {
 	malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx);
-	tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind]);
+	tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind], true);
 	malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx);
 	if (tcache != NULL) {
+		/* Destroy the tcache; recreate in tcaches_get() if needed. */
 		tcache_destroy(tsd, tcache, false);
 	}
 }
@@ -639,7 +720,7 @@ void
 tcaches_destroy(tsd_t *tsd, unsigned ind) {
 	malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx);
 	tcaches_t *elm = &tcaches[ind];
-	tcache_t *tcache = tcaches_elm_remove(tsd, elm);
+	tcache_t *tcache = tcaches_elm_remove(tsd, elm, false);
 	elm->next = tcaches_avail;
 	tcaches_avail = elm;
 	malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx);
@@ -652,8 +733,8 @@ bool
 tcache_boot(tsdn_t *tsdn) {
 	/* If necessary, clamp opt_lg_tcache_max. */
 	if (opt_lg_tcache_max < 0 || (ZU(1) << opt_lg_tcache_max) <
-	    SMALL_MAXCLASS) {
-		tcache_maxclass = SMALL_MAXCLASS;
+	    SC_SMALL_MAXCLASS) {
+		tcache_maxclass = SC_SMALL_MAXCLASS;
 	} else {
 		tcache_maxclass = (ZU(1) << opt_lg_tcache_max);
 	}
@@ -673,7 +754,7 @@ tcache_boot(tsdn_t *tsdn) {
 	}
 	stack_nelms = 0;
 	unsigned i;
-	for (i = 0; i < NBINS; i++) {
+	for (i = 0; i < SC_NBINS; i++) {
 		if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
 			tcache_bin_info[i].ncached_max =
 			    TCACHE_NSLOTS_SMALL_MIN;
diff --git a/src/hooks.c b/src/test_hooks.c
similarity index 79%
rename from src/hooks.c
rename to src/test_hooks.c
index 6266ecd4..ace00d9c 100644
--- a/src/hooks.c
+++ b/src/test_hooks.c
@@ -6,7 +6,7 @@
  * from outside the generated library, so that we can use them in test code.
  */
 JEMALLOC_EXPORT
-void (*hooks_arena_new_hook)() = NULL;
+void (*test_hooks_arena_new_hook)() = NULL;
 
 JEMALLOC_EXPORT
-void (*hooks_libc_hook)() = NULL;
+void (*test_hooks_libc_hook)() = NULL;
diff --git a/src/tsd.c b/src/tsd.c
index c1430682..d5fb4d6f 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -12,6 +12,10 @@
 static unsigned ncleanups;
 static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX];
 
+/* TSD_INITIALIZER triggers "-Wmissing-field-initializers" */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+
 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
 __thread tsd_t JEMALLOC_TLS_MODEL tsd_tls = TSD_INITIALIZER;
 __thread bool JEMALLOC_TLS_MODEL tsd_initialized = false;
@@ -41,6 +45,7 @@ tsd_init_head_t	tsd_init_head = {
 	ql_head_initializer(blocks),
 	MALLOC_MUTEX_INITIALIZER
 };
+
 tsd_wrapper_t tsd_boot_wrapper = {
 	false,
 	TSD_INITIALIZER
@@ -48,17 +53,164 @@ tsd_wrapper_t tsd_boot_wrapper = {
 bool tsd_booted = false;
 #endif
 
+JEMALLOC_DIAGNOSTIC_POP
 
 /******************************************************************************/
 
+/* A list of all the tsds in the nominal state. */
+typedef ql_head(tsd_t) tsd_list_t;
+static tsd_list_t tsd_nominal_tsds = ql_head_initializer(tsd_nominal_tsds);
+static malloc_mutex_t tsd_nominal_tsds_lock;
+
+/* How many slow-path-enabling features are turned on. */
+static atomic_u32_t tsd_global_slow_count = ATOMIC_INIT(0);
+
+static bool
+tsd_in_nominal_list(tsd_t *tsd) {
+	tsd_t *tsd_list;
+	bool found = false;
+	/*
+	 * We don't know that tsd is nominal; it might not be safe to get data
+	 * out of it here.
+	 */
+	malloc_mutex_lock(TSDN_NULL, &tsd_nominal_tsds_lock);
+	ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) {
+		if (tsd == tsd_list) {
+			found = true;
+			break;
+		}
+	}
+	malloc_mutex_unlock(TSDN_NULL, &tsd_nominal_tsds_lock);
+	return found;
+}
+
+static void
+tsd_add_nominal(tsd_t *tsd) {
+	assert(!tsd_in_nominal_list(tsd));
+	assert(tsd_state_get(tsd) <= tsd_state_nominal_max);
+	ql_elm_new(tsd, TSD_MANGLE(tcache).tsd_link);
+	malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+	ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(tcache).tsd_link);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+static void
+tsd_remove_nominal(tsd_t *tsd) {
+	assert(tsd_in_nominal_list(tsd));
+	assert(tsd_state_get(tsd) <= tsd_state_nominal_max);
+	malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+	ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(tcache).tsd_link);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+static void
+tsd_force_recompute(tsdn_t *tsdn) {
+	/*
+	 * The stores to tsd->state here need to synchronize with the exchange
+	 * in tsd_slow_update.
+	 */
+	atomic_fence(ATOMIC_RELEASE);
+	malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock);
+	tsd_t *remote_tsd;
+	ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) {
+		assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED)
+		    <= tsd_state_nominal_max);
+		tsd_atomic_store(&remote_tsd->state, tsd_state_nominal_recompute,
+		    ATOMIC_RELAXED);
+	}
+	malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock);
+}
+
+void
+tsd_global_slow_inc(tsdn_t *tsdn) {
+	atomic_fetch_add_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED);
+	/*
+	 * We unconditionally force a recompute, even if the global slow count
+	 * was already positive.  If we didn't, then it would be possible for us
+	 * to return to the user, have the user synchronize externally with some
+	 * other thread, and then have that other thread not have picked up the
+	 * update yet (since the original incrementing thread might still be
+	 * making its way through the tsd list).
+	 */
+	tsd_force_recompute(tsdn);
+}
+
+void tsd_global_slow_dec(tsdn_t *tsdn) {
+	atomic_fetch_sub_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED);
+	/* See the note in ..._inc(). */
+	tsd_force_recompute(tsdn);
+}
+
+static bool
+tsd_local_slow(tsd_t *tsd) {
+	return !tsd_tcache_enabled_get(tsd)
+	    || tsd_reentrancy_level_get(tsd) > 0;
+}
+
+bool
+tsd_global_slow(void) {
+	return atomic_load_u32(&tsd_global_slow_count, ATOMIC_RELAXED) > 0;
+}
+
+/******************************************************************************/
+
+static uint8_t
+tsd_state_compute(tsd_t *tsd) {
+	if (!tsd_nominal(tsd)) {
+		return tsd_state_get(tsd);
+	}
+	/* We're in *a* nominal state; but which one? */
+	if (malloc_slow || tsd_local_slow(tsd) || tsd_global_slow()) {
+		return tsd_state_nominal_slow;
+	} else {
+		return tsd_state_nominal;
+	}
+}
+
 void
 tsd_slow_update(tsd_t *tsd) {
-	if (tsd_nominal(tsd)) {
-		if (malloc_slow || !tsd_tcache_enabled_get(tsd) ||
-		    tsd_reentrancy_level_get(tsd) > 0) {
-			tsd->state = tsd_state_nominal_slow;
+	uint8_t old_state;
+	do {
+		uint8_t new_state = tsd_state_compute(tsd);
+		old_state = tsd_atomic_exchange(&tsd->state, new_state,
+		    ATOMIC_ACQUIRE);
+	} while (old_state == tsd_state_nominal_recompute);
+}
+
+void
+tsd_state_set(tsd_t *tsd, uint8_t new_state) {
+	/* Only the tsd module can change the state *to* recompute. */
+	assert(new_state != tsd_state_nominal_recompute);
+	uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED);
+	if (old_state > tsd_state_nominal_max) {
+		/*
+		 * Not currently in the nominal list, but it might need to be
+		 * inserted there.
+		 */
+		assert(!tsd_in_nominal_list(tsd));
+		tsd_atomic_store(&tsd->state, new_state, ATOMIC_RELAXED);
+		if (new_state <= tsd_state_nominal_max) {
+			tsd_add_nominal(tsd);
+		}
+	} else {
+		/*
+		 * We're currently nominal.  If the new state is non-nominal,
+		 * great; we take ourselves off the list and just enter the new
+		 * state.
+		 */
+		assert(tsd_in_nominal_list(tsd));
+		if (new_state > tsd_state_nominal_max) {
+			tsd_remove_nominal(tsd);
+			tsd_atomic_store(&tsd->state, new_state,
+			    ATOMIC_RELAXED);
 		} else {
-			tsd->state = tsd_state_nominal;
+			/*
+			 * This is the tricky case.  We're transitioning from
+			 * one nominal state to another.  The caller can't know
+			 * about any races that are occurring at the same time,
+			 * so we always have to recompute no matter what.
+			 */
+			tsd_slow_update(tsd);
 		}
 	}
 }
@@ -87,6 +239,7 @@ tsd_data_init(tsd_t *tsd) {
 static void
 assert_tsd_data_cleanup_done(tsd_t *tsd) {
 	assert(!tsd_nominal(tsd));
+	assert(!tsd_in_nominal_list(tsd));
 	assert(*tsd_arenap_get_unsafe(tsd) == NULL);
 	assert(*tsd_iarenap_get_unsafe(tsd) == NULL);
 	assert(*tsd_arenas_tdata_bypassp_get_unsafe(tsd) == true);
@@ -97,8 +250,8 @@ assert_tsd_data_cleanup_done(tsd_t *tsd) {
 
 static bool
 tsd_data_init_nocleanup(tsd_t *tsd) {
-	assert(tsd->state == tsd_state_reincarnated ||
-	    tsd->state == tsd_state_minimal_initialized);
+	assert(tsd_state_get(tsd) == tsd_state_reincarnated ||
+	    tsd_state_get(tsd) == tsd_state_minimal_initialized);
 	/*
 	 * During reincarnation, there is no guarantee that the cleanup function
 	 * will be called (deallocation may happen after all tsd destructors).
@@ -117,27 +270,33 @@ tsd_t *
 tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 	assert(!tsd_fast(tsd));
 
-	if (tsd->state == tsd_state_nominal_slow) {
-		/* On slow path but no work needed. */
-		assert(malloc_slow || !tsd_tcache_enabled_get(tsd) ||
-		    tsd_reentrancy_level_get(tsd) > 0 ||
-		    *tsd_arenas_tdata_bypassp_get(tsd));
-	} else if (tsd->state == tsd_state_uninitialized) {
+	if (tsd_state_get(tsd) == tsd_state_nominal_slow) {
+		/*
+		 * On slow path but no work needed.  Note that we can't
+		 * necessarily *assert* that we're slow, because we might be
+		 * slow because of an asynchronous modification to global state,
+		 * which might be asynchronously modified *back*.
+		 */
+	} else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) {
+		tsd_slow_update(tsd);
+	} else if (tsd_state_get(tsd) == tsd_state_uninitialized) {
 		if (!minimal) {
-			tsd->state = tsd_state_nominal;
-			tsd_slow_update(tsd);
-			/* Trigger cleanup handler registration. */
-			tsd_set(tsd);
-			tsd_data_init(tsd);
+			if (tsd_booted) {
+				tsd_state_set(tsd, tsd_state_nominal);
+				tsd_slow_update(tsd);
+				/* Trigger cleanup handler registration. */
+				tsd_set(tsd);
+				tsd_data_init(tsd);
+			}
 		} else {
-			tsd->state = tsd_state_minimal_initialized;
+			tsd_state_set(tsd, tsd_state_minimal_initialized);
 			tsd_set(tsd);
 			tsd_data_init_nocleanup(tsd);
 		}
-	} else if (tsd->state == tsd_state_minimal_initialized) {
+	} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
 		if (!minimal) {
 			/* Switch to fully initialized. */
-			tsd->state = tsd_state_nominal;
+			tsd_state_set(tsd, tsd_state_nominal);
 			assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
 			(*tsd_reentrancy_levelp_get(tsd))--;
 			tsd_slow_update(tsd);
@@ -145,12 +304,12 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 		} else {
 			assert_tsd_data_cleanup_done(tsd);
 		}
-	} else if (tsd->state == tsd_state_purgatory) {
-		tsd->state = tsd_state_reincarnated;
+	} else if (tsd_state_get(tsd) == tsd_state_purgatory) {
+		tsd_state_set(tsd, tsd_state_reincarnated);
 		tsd_set(tsd);
 		tsd_data_init_nocleanup(tsd);
 	} else {
-		assert(tsd->state == tsd_state_reincarnated);
+		assert(tsd_state_get(tsd) == tsd_state_reincarnated);
 	}
 
 	return tsd;
@@ -214,7 +373,7 @@ void
 tsd_cleanup(void *arg) {
 	tsd_t *tsd = (tsd_t *)arg;
 
-	switch (tsd->state) {
+	switch (tsd_state_get(tsd)) {
 	case tsd_state_uninitialized:
 		/* Do nothing. */
 		break;
@@ -232,7 +391,7 @@ tsd_cleanup(void *arg) {
 	case tsd_state_nominal:
 	case tsd_state_nominal_slow:
 		tsd_do_data_cleanup(tsd);
-		tsd->state = tsd_state_purgatory;
+		tsd_state_set(tsd, tsd_state_purgatory);
 		tsd_set(tsd);
 		break;
 	case tsd_state_purgatory:
@@ -260,6 +419,10 @@ malloc_tsd_boot0(void) {
 	tsd_t *tsd;
 
 	ncleanups = 0;
+	if (malloc_mutex_init(&tsd_nominal_tsds_lock, "tsd_nominal_tsds_lock",
+	    WITNESS_RANK_OMIT, malloc_mutex_rank_exclusive)) {
+		return NULL;
+	}
 	if (tsd_boot0()) {
 		return NULL;
 	}
@@ -310,7 +473,7 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) {
 #    pragma comment(linker, "/INCLUDE:_tls_callback")
 #  else
 #    pragma comment(linker, "/INCLUDE:_tls_used")
-#    pragma comment(linker, "/INCLUDE:tls_callback")
+#    pragma comment(linker, "/INCLUDE:" STRINGIFY(tls_callback) )
 #  endif
 #  pragma section(".CRT$XLY",long,read)
 #endif
@@ -349,3 +512,23 @@ tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) {
 	malloc_mutex_unlock(TSDN_NULL, &head->lock);
 }
 #endif
+
+void
+tsd_prefork(tsd_t *tsd) {
+	malloc_mutex_prefork(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+void
+tsd_postfork_parent(tsd_t *tsd) {
+	malloc_mutex_postfork_parent(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+void
+tsd_postfork_child(tsd_t *tsd) {
+	malloc_mutex_postfork_child(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+	ql_new(&tsd_nominal_tsds);
+
+	if (tsd_state_get(tsd) <= tsd_state_nominal_max) {
+		tsd_add_nominal(tsd);
+	}
+}
diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in
index 67caa86b..c46af5d9 100644
--- a/test/include/test/jemalloc_test.h.in
+++ b/test/include/test/jemalloc_test.h.in
@@ -25,7 +25,7 @@ extern "C" {
 
 #include "test/jemalloc_test_defs.h"
 
-#ifdef JEMALLOC_OSSPIN
+#if defined(JEMALLOC_OSATOMIC)
 #  include <libkern/OSAtomic.h>
 #endif
 
@@ -69,7 +69,7 @@ static const bool config_debug =
 
 #  define JEMALLOC_N(n) @private_namespace@##n
 #  include "jemalloc/internal/private_namespace.h"
-#  include "jemalloc/internal/hooks.h"
+#  include "jemalloc/internal/test_hooks.h"
 
 /* Hermetic headers. */
 #  include "jemalloc/internal/assert.h"
diff --git a/test/include/test/mtx.h b/test/include/test/mtx.h
index 58afbc3d..066a2137 100644
--- a/test/include/test/mtx.h
+++ b/test/include/test/mtx.h
@@ -10,8 +10,6 @@ typedef struct {
 	CRITICAL_SECTION	lock;
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 	os_unfair_lock		lock;
-#elif (defined(JEMALLOC_OSSPIN))
-	OSSpinLock		lock;
 #else
 	pthread_mutex_t		lock;
 #endif
diff --git a/test/integration/aligned_alloc.c b/test/integration/aligned_alloc.c
index 536b67ea..4375b172 100644
--- a/test/integration/aligned_alloc.c
+++ b/test/integration/aligned_alloc.c
@@ -34,6 +34,17 @@ TEST_BEGIN(test_alignment_errors) {
 }
 TEST_END
 
+
+/*
+ * GCC "-Walloc-size-larger-than" warning detects when one of the memory
+ * allocation functions is called with a size larger than the maximum size that
+ * they support. Here we want to explicitly test that the allocation functions
+ * do indeed fail properly when this is the case, which triggers the warning.
+ * Therefore we disable the warning for these tests.
+ */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+
 TEST_BEGIN(test_oom_errors) {
 	size_t alignment, size;
 	void *p;
@@ -78,6 +89,9 @@ TEST_BEGIN(test_oom_errors) {
 }
 TEST_END
 
+/* Re-enable the "-Walloc-size-larger-than=" warning */
+JEMALLOC_DIAGNOSTIC_POP
+
 TEST_BEGIN(test_alignment_and_size) {
 #define NITER 4
 	size_t alignment, size, total;
@@ -124,10 +138,20 @@ TEST_BEGIN(test_alignment_and_size) {
 }
 TEST_END
 
+TEST_BEGIN(test_zero_alloc) {
+	void *res = aligned_alloc(8, 0);
+	assert_ptr_not_null(res, "Unexpected aligned_alloc(8, 0) failure");
+	size_t usable = malloc_usable_size(res);
+	assert_zu_ne(usable, 0, "Zero-size allocation should be usable");
+	free(res);
+}
+TEST_END
+
 int
 main(void) {
 	return test(
 	    test_alignment_errors,
 	    test_oom_errors,
-	    test_alignment_and_size);
+	    test_alignment_and_size,
+	    test_zero_alloc);
 }
diff --git a/test/integration/malloc.c b/test/integration/malloc.c
new file mode 100644
index 00000000..8b33bc8f
--- /dev/null
+++ b/test/integration/malloc.c
@@ -0,0 +1,16 @@
+#include "test/jemalloc_test.h"
+
+TEST_BEGIN(test_zero_alloc) {
+	void *res = malloc(0);
+	assert_ptr_not_null(res, "Unexpected malloc(0) failure");
+	size_t usable = malloc_usable_size(res);
+	assert_zu_ne(usable, 0, "Zero-size allocation should be usable");
+	free(res);
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    test_zero_alloc);
+}
diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c
index fd960f30..645d4db4 100644
--- a/test/integration/mallocx.c
+++ b/test/integration/mallocx.c
@@ -51,6 +51,16 @@ purge(void) {
 	    "Unexpected mallctl error");
 }
 
+/*
+ * GCC "-Walloc-size-larger-than" warning detects when one of the memory
+ * allocation functions is called with a size larger than the maximum size that
+ * they support. Here we want to explicitly test that the allocation functions
+ * do indeed fail properly when this is the case, which triggers the warning.
+ * Therefore we disable the warning for these tests.
+ */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+
 TEST_BEGIN(test_overflow) {
 	size_t largemax;
 
@@ -71,6 +81,38 @@ TEST_BEGIN(test_overflow) {
 }
 TEST_END
 
+static void *
+remote_alloc(void *arg) {
+	unsigned arena;
+	size_t sz = sizeof(unsigned);
+	assert_d_eq(mallctl("arenas.create", (void *)&arena, &sz, NULL, 0), 0,
+	    "Unexpected mallctl() failure");
+	size_t large_sz;
+	sz = sizeof(size_t);
+	assert_d_eq(mallctl("arenas.lextent.0.size", (void *)&large_sz, &sz,
+	    NULL, 0), 0, "Unexpected mallctl failure");
+
+	void *ptr = mallocx(large_sz, MALLOCX_ARENA(arena)
+	    | MALLOCX_TCACHE_NONE);
+	void **ret = (void **)arg;
+	*ret = ptr;
+
+	return NULL;
+}
+
+TEST_BEGIN(test_remote_free) {
+	thd_t thd;
+	void *ret;
+	thd_create(&thd, remote_alloc, (void *)&ret);
+	thd_join(thd, NULL);
+	assert_ptr_not_null(ret, "Unexpected mallocx failure");
+
+	/* Avoid TCACHE_NONE to explicitly test tcache_flush(). */
+	dallocx(ret, 0);
+	mallctl("thread.tcache.flush", NULL, NULL, NULL, 0);
+}
+TEST_END
+
 TEST_BEGIN(test_oom) {
 	size_t largemax;
 	bool oom;
@@ -84,7 +126,7 @@ TEST_BEGIN(test_oom) {
 	largemax = get_large_size(get_nlarge()-1);
 	oom = false;
 	for (i = 0; i < sizeof(ptrs) / sizeof(void *); i++) {
-		ptrs[i] = mallocx(largemax, 0);
+		ptrs[i] = mallocx(largemax, MALLOCX_ARENA(0));
 		if (ptrs[i] == NULL) {
 			oom = true;
 		}
@@ -113,6 +155,9 @@ TEST_BEGIN(test_oom) {
 }
 TEST_END
 
+/* Re-enable the "-Walloc-size-larger-than=" warning */
+JEMALLOC_DIAGNOSTIC_POP
+
 TEST_BEGIN(test_basic) {
 #define MAXSZ (((size_t)1) << 23)
 	size_t sz;
@@ -178,12 +223,12 @@ TEST_BEGIN(test_alignment_and_size) {
 		    sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
 			for (i = 0; i < NITER; i++) {
 				nsz = nallocx(sz, MALLOCX_ALIGN(alignment) |
-				    MALLOCX_ZERO);
+				    MALLOCX_ZERO | MALLOCX_ARENA(0));
 				assert_zu_ne(nsz, 0,
 				    "nallocx() error for alignment=%zu, "
 				    "size=%zu (%#zx)", alignment, sz, sz);
 				ps[i] = mallocx(sz, MALLOCX_ALIGN(alignment) |
-				    MALLOCX_ZERO);
+				    MALLOCX_ZERO | MALLOCX_ARENA(0));
 				assert_ptr_not_null(ps[i],
 				    "mallocx() error for alignment=%zu, "
 				    "size=%zu (%#zx)", alignment, sz, sz);
@@ -223,6 +268,7 @@ main(void) {
 	return test(
 	    test_overflow,
 	    test_oom,
+	    test_remote_free,
 	    test_basic,
 	    test_alignment_and_size);
 }
diff --git a/test/integration/overflow.c b/test/integration/overflow.c
index 6a9785b2..748ebb67 100644
--- a/test/integration/overflow.c
+++ b/test/integration/overflow.c
@@ -1,5 +1,15 @@
 #include "test/jemalloc_test.h"
 
+/*
+ * GCC "-Walloc-size-larger-than" warning detects when one of the memory
+ * allocation functions is called with a size larger than the maximum size that
+ * they support. Here we want to explicitly test that the allocation functions
+ * do indeed fail properly when this is the case, which triggers the warning.
+ * Therefore we disable the warning for these tests.
+ */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+
 TEST_BEGIN(test_overflow) {
 	unsigned nlextents;
 	size_t mib[4];
@@ -39,6 +49,9 @@ TEST_BEGIN(test_overflow) {
 }
 TEST_END
 
+/* Re-enable the "-Walloc-size-larger-than=" warning */
+JEMALLOC_DIAGNOSTIC_POP
+
 int
 main(void) {
 	return test(
diff --git a/test/integration/rallocx.c b/test/integration/rallocx.c
index 7821ca5f..08ed08d3 100644
--- a/test/integration/rallocx.c
+++ b/test/integration/rallocx.c
@@ -208,6 +208,16 @@ TEST_BEGIN(test_lg_align_and_zero) {
 }
 TEST_END
 
+/*
+ * GCC "-Walloc-size-larger-than" warning detects when one of the memory
+ * allocation functions is called with a size larger than the maximum size that
+ * they support. Here we want to explicitly test that the allocation functions
+ * do indeed fail properly when this is the case, which triggers the warning.
+ * Therefore we disable the warning for these tests.
+ */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+
 TEST_BEGIN(test_overflow) {
 	size_t largemax;
 	void *p;
@@ -234,6 +244,9 @@ TEST_BEGIN(test_overflow) {
 }
 TEST_END
 
+/* Re-enable the "-Walloc-size-larger-than=" warning */
+JEMALLOC_DIAGNOSTIC_POP
+
 int
 main(void) {
 	return test(
diff --git a/test/integration/slab_sizes.c b/test/integration/slab_sizes.c
new file mode 100644
index 00000000..af250c3f
--- /dev/null
+++ b/test/integration/slab_sizes.c
@@ -0,0 +1,80 @@
+#include "test/jemalloc_test.h"
+
+/* Note that this test relies on the unusual slab sizes set in slab_sizes.sh. */
+
+TEST_BEGIN(test_slab_sizes) {
+	unsigned nbins;
+	size_t page;
+	size_t sizemib[4];
+	size_t slabmib[4];
+	size_t len;
+
+	len = sizeof(nbins);
+	assert_d_eq(mallctl("arenas.nbins", &nbins, &len, NULL, 0), 0,
+	    "nbins mallctl failure");
+
+	len = sizeof(page);
+	assert_d_eq(mallctl("arenas.page", &page, &len, NULL, 0), 0,
+	    "page mallctl failure");
+
+	len = 4;
+	assert_d_eq(mallctlnametomib("arenas.bin.0.size", sizemib, &len), 0,
+	    "bin size mallctlnametomib failure");
+
+	len = 4;
+	assert_d_eq(mallctlnametomib("arenas.bin.0.slab_size", slabmib, &len),
+	    0, "slab size mallctlnametomib failure");
+
+	size_t biggest_slab_seen = 0;
+
+	for (unsigned i = 0; i < nbins; i++) {
+		size_t bin_size;
+		size_t slab_size;
+		len = sizeof(size_t);
+		sizemib[2] = i;
+		slabmib[2] = i;
+		assert_d_eq(mallctlbymib(sizemib, 4, (void *)&bin_size, &len,
+		    NULL, 0), 0, "bin size mallctlbymib failure");
+
+		len = sizeof(size_t);
+		assert_d_eq(mallctlbymib(slabmib, 4, (void *)&slab_size, &len,
+		    NULL, 0), 0, "slab size mallctlbymib failure");
+
+		if (bin_size < 100) {
+			/*
+			 * Then we should be as close to 17 as possible.  Since
+			 * not all page sizes are valid (because of bitmap
+			 * limitations on the number of items in a slab), we
+			 * should at least make sure that the number of pages
+			 * goes up.
+			 */
+			assert_zu_ge(slab_size, biggest_slab_seen,
+			    "Slab sizes should go up");
+			biggest_slab_seen = slab_size;
+		} else if (
+		    (100 <= bin_size && bin_size < 128)
+		    || (128 < bin_size && bin_size <= 200)) {
+			assert_zu_eq(slab_size, page,
+			    "Forced-small slabs should be small");
+		} else if (bin_size == 128) {
+			assert_zu_eq(slab_size, 2 * page,
+			    "Forced-2-page slab should be 2 pages");
+		} else if (200 < bin_size && bin_size <= 4096) {
+			assert_zu_ge(slab_size, biggest_slab_seen,
+			    "Slab sizes should go up");
+			biggest_slab_seen = slab_size;
+		}
+	}
+	/*
+	 * For any reasonable configuration, 17 pages should be a valid slab
+	 * size for 4096-byte items.
+	 */
+	assert_zu_eq(biggest_slab_seen, 17 * page, "Didn't hit page target");
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    test_slab_sizes);
+}
diff --git a/test/integration/slab_sizes.sh b/test/integration/slab_sizes.sh
new file mode 100644
index 00000000..07e3db81
--- /dev/null
+++ b/test/integration/slab_sizes.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+# Some screwy-looking slab sizes.
+export MALLOC_CONF="slab_sizes:1-4096:17|100-200:1|128-128:2"
diff --git a/test/integration/smallocx.c b/test/integration/smallocx.c
new file mode 100644
index 00000000..2486752b
--- /dev/null
+++ b/test/integration/smallocx.c
@@ -0,0 +1,312 @@
+#include "test/jemalloc_test.h"
+#include "jemalloc/jemalloc_macros.h"
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#ifndef JEMALLOC_VERSION_GID_IDENT
+  #error "JEMALLOC_VERSION_GID_IDENT not defined"
+#endif
+
+#define JOIN(x, y) x ## y
+#define JOIN2(x, y) JOIN(x, y)
+#define smallocx JOIN2(smallocx_, JEMALLOC_VERSION_GID_IDENT)
+
+typedef struct {
+	void *ptr;
+	size_t size;
+} smallocx_return_t;
+
+extern smallocx_return_t
+smallocx(size_t size, int flags);
+
+/* Fetch the unsigned value published at the mallctl node cmd. */
+static unsigned
+get_nsizes_impl(const char *cmd) {
+	unsigned count;
+	size_t len = sizeof(count);
+
+	assert_d_eq(mallctl(cmd, (void *)&count, &len, NULL, 0), 0,
+	    "Unexpected mallctl(\"%s\", ...) failure", cmd);
+
+	return count;
+}
+
+static unsigned
+get_nlarge(void) {
+	return get_nsizes_impl("arenas.nlextents");
+}
+
+/* Read the size_t at mallctl node cmd after setting MIB element 2 to ind. */
+static size_t
+get_size_impl(const char *cmd, size_t ind) {
+	size_t mib[4];
+	size_t miblen = sizeof(mib) / sizeof(mib[0]);
+	size_t size;
+	size_t len = sizeof(size);
+
+	assert_d_eq(mallctlnametomib(cmd, mib, &miblen),
+	    0, "Unexpected mallctlnametomib(\"%s\", ...) failure", cmd);
+	/* Override the index component of the translated name. */
+	mib[2] = ind;
+	assert_d_eq(mallctlbymib(mib, miblen, (void *)&size, &len, NULL, 0),
+	    0, "Unexpected mallctlbymib([\"%s\", %zu], ...) failure", cmd, ind);
+
+	return size;
+}
+
+static size_t
+get_large_size(size_t ind) {
+	return get_size_impl("arenas.lextent.0.size", ind);
+}
+
+/*
+ * On systems which can't merge extents, tests that call this function generate
+ * a lot of dirty memory very quickly.  Purging between cycles mitigates
+ * potential OOM on e.g. 32-bit Windows.
+ */
+static void
+purge(void) {
+	assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0,
+	    "Unexpected mallctl error");
+}
+
+/*
+ * GCC "-Walloc-size-larger-than" warning detects when one of the memory
+ * allocation functions is called with a size larger than the maximum size that
+ * they support. Here we want to explicitly test that the allocation functions
+ * do indeed fail properly when this is the case, which triggers the warning.
+ * Therefore we disable the warning for these tests.
+ */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+
+TEST_BEGIN(test_overflow) {
+	size_t largemax;
+
+	largemax = get_large_size(get_nlarge()-1);
+
+	assert_ptr_null(smallocx(largemax+1, 0).ptr,
+	    "Expected OOM for smallocx(size=%#zx, 0)", largemax+1);
+
+	assert_ptr_null(smallocx(ZU(PTRDIFF_MAX)+1, 0).ptr,
+	    "Expected OOM for smallocx(size=%#zx, 0)", ZU(PTRDIFF_MAX)+1);
+
+	assert_ptr_null(smallocx(SIZE_T_MAX, 0).ptr,
+	    "Expected OOM for smallocx(size=%#zx, 0)", SIZE_T_MAX);
+
+	assert_ptr_null(smallocx(1, MALLOCX_ALIGN(ZU(PTRDIFF_MAX)+1)).ptr,
+	    "Expected OOM for smallocx(size=1, MALLOCX_ALIGN(%#zx))",
+	    ZU(PTRDIFF_MAX)+1);
+}
+TEST_END
+
+/* Thread body: allocate a large object in a new arena; return it via arg. */
+static void *
+remote_alloc(void *arg) {
+	void **out = (void **)arg;
+	unsigned arena;
+	size_t sz = sizeof(unsigned);
+	assert_d_eq(mallctl("arenas.create", (void *)&arena, &sz, NULL, 0), 0,
+	    "Unexpected mallctl() failure");
+	size_t large_sz;
+	sz = sizeof(size_t);
+	assert_d_eq(mallctl("arenas.lextent.0.size", (void *)&large_sz, &sz,
+	    NULL, 0), 0, "Unexpected mallctl failure");
+
+	/* Allocate straight from the new arena, bypassing the thread cache. */
+	int flags = MALLOCX_ARENA(arena) | MALLOCX_TCACHE_NONE;
+	smallocx_return_t r = smallocx(large_sz, flags);
+	assert_zu_eq(r.size, nallocx(large_sz, flags),
+	    "Expected smallocx(size,flags).size == nallocx(size,flags)");
+	*out = r.ptr;
+
+	return NULL;
+}
+
+TEST_BEGIN(test_remote_free) {
+	thd_t thd;
+	void *ret;
+	thd_create(&thd, remote_alloc, (void *)&ret);
+	thd_join(thd, NULL);
+	assert_ptr_not_null(ret, "Unexpected smallocx failure");
+
+	/* Avoid TCACHE_NONE to explicitly test tcache_flush(). */
+	dallocx(ret, 0);
+	mallctl("thread.tcache.flush", NULL, NULL, NULL, 0);
+}
+TEST_END
+
+TEST_BEGIN(test_oom) {
+	size_t largemax;
+	bool oom;
+	void *ptrs[3];
+	unsigned i;
+
+	/*
+	 * It should be impossible to allocate three objects that each consume
+	 * nearly half the virtual address space.
+	 */
+	largemax = get_large_size(get_nlarge()-1);
+	oom = false;
+	for (i = 0; i < sizeof(ptrs) / sizeof(void *); i++) {
+		ptrs[i] = smallocx(largemax, 0).ptr;
+		if (ptrs[i] == NULL) {
+			oom = true;
+		}
+	}
+	assert_true(oom,
+	    "Expected OOM during series of calls to smallocx(size=%zu, 0)",
+	    largemax);
+	for (i = 0; i < sizeof(ptrs) / sizeof(void *); i++) {
+		if (ptrs[i] != NULL) {
+			dallocx(ptrs[i], 0);
+		}
+	}
+	purge();
+
+#if LG_SIZEOF_PTR == 3
+	assert_ptr_null(smallocx(0x8000000000000000ULL,
+	    MALLOCX_ALIGN(0x8000000000000000ULL)).ptr,
+	    "Expected OOM for smallocx()");
+	assert_ptr_null(smallocx(0x8000000000000000ULL,
+	    MALLOCX_ALIGN(0x80000000)).ptr,
+	    "Expected OOM for smallocx()");
+#else
+	assert_ptr_null(smallocx(0x80000000UL, MALLOCX_ALIGN(0x80000000UL)).ptr,
+	    "Expected OOM for smallocx()");
+#endif
+}
+TEST_END
+
+/* Re-enable the "-Walloc-size-larger-than=" warning */
+JEMALLOC_DIAGNOSTIC_POP
+
+TEST_BEGIN(test_basic) {
+#define MAXSZ (((size_t)1) << 23)
+	size_t sz;
+	/* Probe one request per size class: step just past each usable size. */
+	for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) {
+		smallocx_return_t ret;
+		size_t nsz, rsz, smz;
+		void *p;
+		nsz = nallocx(sz, 0);
+		assert_zu_ne(nsz, 0, "Unexpected nallocx() error");
+		ret = smallocx(sz, 0);
+		p = ret.ptr;
+		smz = ret.size;
+		assert_ptr_not_null(p,
+		    "Unexpected smallocx(size=%zx, flags=0) error", sz);
+		rsz = sallocx(p, 0);
+		assert_zu_ge(rsz, sz, "Real size smaller than expected");
+		assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch");
+		assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch");
+		dallocx(p, 0);
+
+		ret = smallocx(sz, 0);
+		p = ret.ptr;
+		assert_ptr_not_null(p,
+		    "Unexpected smallocx(size=%zx, flags=0) error", sz);
+		assert_zu_ne(ret.size, 0, "Unexpected smallocx() error");
+		dallocx(p, 0);
+
+		nsz = nallocx(sz, MALLOCX_ZERO);
+		assert_zu_ne(nsz, 0, "Unexpected nallocx() error");
+		ret = smallocx(sz, MALLOCX_ZERO);
+		p = ret.ptr;
+		smz = ret.size; /* Size reported by the MALLOCX_ZERO call. */
+		assert_ptr_not_null(p,
+		    "Unexpected smallocx(size=%zx, flags=MALLOCX_ZERO) error",
+		    sz);
+		rsz = sallocx(p, 0);
+		assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch");
+		assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch");
+		dallocx(p, 0);
+		purge();
+	}
+#undef MAXSZ
+}
+TEST_END
+
+TEST_BEGIN(test_alignment_and_size) {
+	const char *percpu_arena;
+	size_t sz = sizeof(percpu_arena);
+
+	if (mallctl("opt.percpu_arena", (void *)&percpu_arena, &sz, NULL, 0) ||
+	    strcmp(percpu_arena, "disabled") != 0) {
+		test_skip("test_alignment_and_size skipped: "
+		    "not working with percpu arena.");
+	}
+#define MAXALIGN (((size_t)1) << 23)
+#define NITER 4
+	size_t nsz, rsz, smz, alignment, total;
+	unsigned i;
+	void *ps[NITER];
+
+	for (i = 0; i < NITER; i++) {
+		ps[i] = NULL;
+	}
+
+	/* Exercise each power-of-two alignment from 8 up to MAXALIGN. */
+	for (alignment = 8; alignment <= MAXALIGN; alignment <<= 1) {
+		total = 0;
+		for (sz = 1;
+		    sz < 3 * alignment && sz < (1U << 31);
+		    sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
+			for (i = 0; i < NITER; i++) {
+				nsz = nallocx(sz, MALLOCX_ALIGN(alignment) |
+				    MALLOCX_ZERO);
+				assert_zu_ne(nsz, 0,
+				    "nallocx() error for alignment=%zu, "
+				    "size=%zu (%#zx)", alignment, sz, sz);
+				smallocx_return_t ret = smallocx(sz,
+				    MALLOCX_ALIGN(alignment) |
+				    MALLOCX_ZERO);
+				ps[i] = ret.ptr;
+				assert_ptr_not_null(ps[i],
+				    "smallocx() error for alignment=%zu, "
+				    "size=%zu (%#zx)", alignment, sz, sz);
+				rsz = sallocx(ps[i], 0);
+				smz = ret.size;
+				assert_zu_ge(rsz, sz,
+				    "Real size smaller than expected for "
+				    "alignment=%zu, size=%zu", alignment, sz);
+				assert_zu_eq(nsz, rsz,
+				    "nallocx()/sallocx() size mismatch for "
+				    "alignment=%zu, size=%zu", alignment, sz);
+				assert_zu_eq(nsz, smz,
+				    "nallocx()/smallocx() size mismatch for "
+				    "alignment=%zu, size=%zu", alignment, sz);
+				assert_ptr_null(
+				    (void *)((uintptr_t)ps[i] & (alignment-1)),
+				    "%p inadequately aligned for"
+				    " alignment=%zu, size=%zu", ps[i],
+				    alignment, sz);
+				total += rsz;
+				if (total >= (MAXALIGN << 1)) {
+					break;
+				}
+			}
+			for (i = 0; i < NITER; i++) {
+				if (ps[i] != NULL) {
+					dallocx(ps[i], 0);
+					ps[i] = NULL;
+				}
+			}
+		}
+		purge();
+	}
+#undef MAXALIGN
+#undef NITER
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    test_overflow,
+	    test_oom,
+	    test_remote_free,
+	    test_basic,
+	    test_alignment_and_size);
+}
diff --git a/test/integration/smallocx.sh b/test/integration/smallocx.sh
new file mode 100644
index 00000000..d07f10f3
--- /dev/null
+++ b/test/integration/smallocx.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+if [ "x${enable_fill}" = "x1" ] ; then
+    export MALLOC_CONF="junk:false"
+fi
diff --git a/test/src/mtx.c b/test/src/mtx.c
index a393c01f..d9ce375c 100644
--- a/test/src/mtx.c
+++ b/test/src/mtx.c
@@ -13,8 +13,6 @@ mtx_init(mtx_t *mtx) {
 	}
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 	mtx->lock = OS_UNFAIR_LOCK_INIT;
-#elif (defined(JEMALLOC_OSSPIN))
-	mtx->lock = 0;
 #else
 	pthread_mutexattr_t attr;
 
@@ -35,7 +33,6 @@ void
 mtx_fini(mtx_t *mtx) {
 #ifdef _WIN32
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-#elif (defined(JEMALLOC_OSSPIN))
 #else
 	pthread_mutex_destroy(&mtx->lock);
 #endif
@@ -47,8 +44,6 @@ mtx_lock(mtx_t *mtx) {
 	EnterCriticalSection(&mtx->lock);
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 	os_unfair_lock_lock(&mtx->lock);
-#elif (defined(JEMALLOC_OSSPIN))
-	OSSpinLockLock(&mtx->lock);
 #else
 	pthread_mutex_lock(&mtx->lock);
 #endif
@@ -60,8 +55,6 @@ mtx_unlock(mtx_t *mtx) {
 	LeaveCriticalSection(&mtx->lock);
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
 	os_unfair_lock_unlock(&mtx->lock);
-#elif (defined(JEMALLOC_OSSPIN))
-	OSSpinLockUnlock(&mtx->lock);
 #else
 	pthread_mutex_unlock(&mtx->lock);
 #endif
diff --git a/test/src/test.c b/test/src/test.c
index 01a4d738..f97ce4d1 100644
--- a/test/src/test.c
+++ b/test/src/test.c
@@ -48,12 +48,12 @@ do_hook(bool *hook_ran, void (**hook)()) {
 
 static void
 libc_reentrancy_hook() {
-	do_hook(&libc_hook_ran, &hooks_libc_hook);
+	do_hook(&libc_hook_ran, &test_hooks_libc_hook);
 }
 
 static void
 arena_new_reentrancy_hook() {
-	do_hook(&arena_new_hook_ran, &hooks_arena_new_hook);
+	do_hook(&arena_new_hook_ran, &test_hooks_arena_new_hook);
 }
 
 /* Actual test infrastructure. */
@@ -110,6 +110,20 @@ p_test_fini(void) {
 	    test_status_string(test_status));
 }
 
+static void
+check_global_slow(test_status_t *status) {
+#ifdef JEMALLOC_UNIT_TEST
+	/*
+	 * This check needs to peek into tsd internals, which is why it's only
+	 * exposed in unit tests.
+	 */
+	if (tsd_global_slow()) {
+		malloc_printf("Testing increased global slow count\n");
+		*status = test_status_fail;
+	}
+#endif
+}
+
 static test_status_t
 p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) {
 	test_status_t ret;
@@ -131,28 +145,31 @@ p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) {
 	for (; t != NULL; t = va_arg(ap, test_t *)) {
 		/* Non-reentrant run. */
 		reentrancy = non_reentrant;
-		hooks_arena_new_hook = hooks_libc_hook = NULL;
+		test_hooks_arena_new_hook = test_hooks_libc_hook = NULL;
 		t();
 		if (test_status > ret) {
 			ret = test_status;
 		}
+		check_global_slow(&ret);
 		/* Reentrant run. */
 		if (do_reentrant) {
 			reentrancy = libc_reentrant;
-			hooks_arena_new_hook = NULL;
-			hooks_libc_hook = &libc_reentrancy_hook;
+			test_hooks_arena_new_hook = NULL;
+			test_hooks_libc_hook = &libc_reentrancy_hook;
 			t();
 			if (test_status > ret) {
 				ret = test_status;
 			}
+			check_global_slow(&ret);
 
 			reentrancy = arena_new_reentrant;
-			hooks_libc_hook = NULL;
-			hooks_arena_new_hook = &arena_new_reentrancy_hook;
+			test_hooks_libc_hook = NULL;
+			test_hooks_arena_new_hook = &arena_new_reentrancy_hook;
 			t();
 			if (test_status > ret) {
 				ret = test_status;
 			}
+			check_global_slow(&ret);
 		}
 	}
 
diff --git a/test/stress/hookbench.c b/test/stress/hookbench.c
new file mode 100644
index 00000000..97e90b0e
--- /dev/null
+++ b/test/stress/hookbench.c
@@ -0,0 +1,73 @@
+#include "test/jemalloc_test.h"
+
+static void
+noop_alloc_hook(void *extra, hook_alloc_t type, void *result,
+    uintptr_t result_raw, uintptr_t args_raw[3]) {
+}
+
+static void
+noop_dalloc_hook(void *extra, hook_dalloc_t type, void *address,
+    uintptr_t args_raw[3]) {
+}
+
+static void
+noop_expand_hook(void *extra, hook_expand_t type, void *address,
+    size_t old_usize, size_t new_usize, uintptr_t result_raw,
+    uintptr_t args_raw[4]) {
+}
+
+static void
+malloc_free_loop(int iters) {
+	/* One minimal allocate/free pair per iteration. */
+	for (int remaining = iters; remaining > 0; remaining--) {
+		free(mallocx(1, 0));
+	}
+}
+
+static void
+test_hooked(int iters) {
+	hooks_t hooks = {&noop_alloc_hook, &noop_dalloc_hook, &noop_expand_hook,
+		NULL};
+	/* Time the alloc/free loop with 1..HOOK_MAX no-op hooks installed. */
+	int err;
+	void *handles[HOOK_MAX];
+	size_t sz = sizeof(handles[0]);
+
+	for (int i = 0; i < HOOK_MAX; i++) {
+		err = mallctl("experimental.hooks.install", &handles[i],
+		    &sz, &hooks, sizeof(hooks));
+		assert(err == 0);
+		/* Earlier hooks stay installed, so i + 1 are active here. */
+		timedelta_t timer;
+		timer_start(&timer);
+		malloc_free_loop(iters);
+		timer_stop(&timer);
+		malloc_printf("With %d hook%s: %"FMTu64"us\n", i + 1,
+		    i + 1 == 1 ? "" : "s", timer_usec(&timer));
+	}
+	for (int i = 0; i < HOOK_MAX; i++) { /* Remove every installed hook. */
+		err = mallctl("experimental.hooks.remove", NULL, NULL,
+		    &handles[i], sizeof(handles[i]));
+		assert(err == 0);
+	}
+}
+
+static void
+test_unhooked(int iters) {
+	timedelta_t timer;
+	timer_start(&timer);
+	malloc_free_loop(iters);
+	timer_stop(&timer);
+
+	malloc_printf("Without hooks: %"FMTu64"us\n", timer_usec(&timer));
+}
+
+int
+main(void) {
+	/* Initialize */
+	free(mallocx(1, 0));
+	int iters = 10 * 1000 * 1000;
+	malloc_printf("Benchmarking hooks with %d iterations:\n", iters);
+	test_hooked(iters);
+	test_unhooked(iters);
+}
diff --git a/test/unit/arena_reset.c b/test/unit/arena_reset.c
index f5fb24d1..96b042dd 100644
--- a/test/unit/arena_reset.c
+++ b/test/unit/arena_reset.c
@@ -77,7 +77,7 @@ vsalloc(tsdn_t *tsdn, const void *ptr) {
 		return 0;
 	}
 
-	if (szind == NSIZES) {
+	if (szind == SC_NSIZES) {
 		return 0;
 	}
 
@@ -142,7 +142,7 @@ do_arena_reset_post(void **ptrs, unsigned nptrs, unsigned arena_ind) {
 
 	if (have_background_thread) {
 		malloc_mutex_lock(tsdn,
-		    &background_thread_info[arena_ind % ncpus].mtx);
+		    &background_thread_info_get(arena_ind)->mtx);
 	}
 	/* Verify allocations no longer exist. */
 	for (i = 0; i < nptrs; i++) {
@@ -151,7 +151,7 @@ do_arena_reset_post(void **ptrs, unsigned nptrs, unsigned arena_ind) {
 	}
 	if (have_background_thread) {
 		malloc_mutex_unlock(tsdn,
-		    &background_thread_info[arena_ind % ncpus].mtx);
+		    &background_thread_info_get(arena_ind)->mtx);
 	}
 
 	free(ptrs);
diff --git a/test/unit/background_thread_enable.c b/test/unit/background_thread_enable.c
index ff95e672..d894e937 100644
--- a/test/unit/background_thread_enable.c
+++ b/test/unit/background_thread_enable.c
@@ -33,20 +33,19 @@ TEST_END
 TEST_BEGIN(test_max_background_threads) {
 	test_skip_if(!have_background_thread);
 
-	size_t maxt;
-	size_t opt_maxt;
-	size_t sz_m = sizeof(maxt);
+	size_t max_n_thds;
+	size_t opt_max_n_thds;
+	size_t sz_m = sizeof(max_n_thds);
 	assert_d_eq(mallctl("opt.max_background_threads",
-			    &opt_maxt, &sz_m, NULL, 0), 0,
-			    "Failed to get opt.max_background_threads");
-	assert_d_eq(mallctl("max_background_threads", &maxt, &sz_m, NULL, 0), 0,
-		    "Failed to get max background threads");
-	assert_zu_eq(20, maxt, "should be ncpus");
-	assert_zu_eq(opt_maxt, maxt,
-		     "max_background_threads and "
-		     "opt.max_background_threads should match");
-	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &maxt, sz_m),
-		    0, "Failed to set max background threads");
+	    &opt_max_n_thds, &sz_m, NULL, 0), 0,
+	    "Failed to get opt.max_background_threads");
+	assert_d_eq(mallctl("max_background_threads", &max_n_thds, &sz_m, NULL,
+	    0), 0, "Failed to get max background threads");
+	assert_zu_eq(opt_max_n_thds, max_n_thds,
+	    "max_background_threads and "
+	    "opt.max_background_threads should match");
+	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &max_n_thds,
+	    sz_m), 0, "Failed to set max background threads");
 
 	unsigned id;
 	size_t sz_u = sizeof(unsigned);
@@ -60,18 +59,21 @@ TEST_BEGIN(test_max_background_threads) {
 	size_t sz_b = sizeof(bool);
 	assert_d_eq(mallctl("background_thread", NULL, NULL, &enable, sz_b), 0,
 	    "Failed to enable background threads");
-	assert_zu_eq(n_background_threads, maxt,
-		     "Number of background threads should be 3.\n");
-	maxt = 10;
-	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &maxt, sz_m),
-		    0, "Failed to set max background threads");
-	assert_zu_eq(n_background_threads, maxt,
-		     "Number of background threads should be 10.\n");
-	maxt = 3;
-	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &maxt, sz_m),
-		    0, "Failed to set max background threads");
-	assert_zu_eq(n_background_threads, maxt,
-		     "Number of background threads should be 3.\n");
+	assert_zu_eq(n_background_threads, max_n_thds,
+	    "Number of background threads should not change.\n");
+	size_t new_max_thds = max_n_thds - 1;
+	if (new_max_thds > 0) {
+		assert_d_eq(mallctl("max_background_threads", NULL, NULL,
+		    &new_max_thds, sz_m), 0,
+		    "Failed to set max background threads");
+		assert_zu_eq(n_background_threads, new_max_thds,
+		    "Number of background threads should decrease by 1.\n");
+	}
+	new_max_thds = 1;
+	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &new_max_thds,
+	    sz_m), 0, "Failed to set max background threads");
+	assert_zu_eq(n_background_threads, new_max_thds,
+	    "Number of background threads should be 1.\n");
 }
 TEST_END
 
diff --git a/test/unit/binshard.c b/test/unit/binshard.c
new file mode 100644
index 00000000..d7a8df8f
--- /dev/null
+++ b/test/unit/binshard.c
@@ -0,0 +1,154 @@
+#include "test/jemalloc_test.h"
+
+/* Config -- "narenas:1,bin_shards:1-160:16|129-512:4|256-256:8" */
+
+#define NTHREADS 16
+#define REMOTE_NALLOC 256
+
+static void *
+thd_producer(void *varg) {
+	void **mem = varg;
+	unsigned arena, i;
+	size_t sz;
+
+	sz = sizeof(arena);
+	/* Remote arena. */
+	assert_d_eq(mallctl("arenas.create", (void *)&arena, &sz, NULL, 0), 0,
+	    "Unexpected mallctl() failure");
+	for (i = 0; i < REMOTE_NALLOC / 2; i++) {
+		mem[i] = mallocx(1, MALLOCX_TCACHE_NONE | MALLOCX_ARENA(arena));
+	}
+
+	/* Remote bin. */
+	for (; i < REMOTE_NALLOC; i++) {
+		mem[i] = mallocx(1, MALLOCX_TCACHE_NONE | MALLOCX_ARENA(0));
+	}
+
+	return NULL;
+}
+
+TEST_BEGIN(test_producer_consumer) {
+	thd_t thds[NTHREADS];
+	void *mem[NTHREADS][REMOTE_NALLOC];
+	unsigned i;
+
+	/* Create producer threads to allocate. */
+	for (i = 0; i < NTHREADS; i++) {
+		thd_create(&thds[i], thd_producer, mem[i]);
+	}
+	for (i = 0; i < NTHREADS; i++) {
+		thd_join(thds[i], NULL);
+	}
+	/* Remote deallocation by the current thread. */
+	for (i = 0; i < NTHREADS; i++) {
+		for (unsigned j = 0; j < REMOTE_NALLOC; j++) {
+			assert_ptr_not_null(mem[i][j],
+			    "Unexpected remote allocation failure");
+			dallocx(mem[i][j], 0);
+		}
+	}
+}
+TEST_END
+
+static void *
+thd_start(void *varg) {
+	/* Returns non-NULL iff an allocation landed in a nonzero bin shard. */
+	void *ptr, *ptr2;
+	extent_t *extent;
+	unsigned shard1, shard2;
+	tsdn_t *tsdn = tsdn_fetch();
+	/* Try triggering allocations from sharded bins. */
+	for (unsigned i = 0; i < 1024; i++) {
+		ptr = mallocx(1, MALLOCX_TCACHE_NONE);
+		ptr2 = mallocx(129, MALLOCX_TCACHE_NONE);
+
+		extent = iealloc(tsdn, ptr);
+		shard1 = extent_binshard_get(extent);
+		dallocx(ptr, 0);
+		assert_u_lt(shard1, 16, "Unexpected bin shard used");
+
+		extent = iealloc(tsdn, ptr2);
+		shard2 = extent_binshard_get(extent);
+		dallocx(ptr2, 0);
+		assert_u_lt(shard2, 4, "Unexpected bin shard used");
+
+		if (shard1 > 0 || shard2 > 0) {
+			/* Sharded; shard1 may be 0, so don't return it. */
+			return (void *)(uintptr_t)1;
+		}
+	}
+
+	return NULL;
+}
+
+TEST_BEGIN(test_bin_shard_mt) {
+	test_skip_if(have_percpu_arena &&
+	    PERCPU_ARENA_ENABLED(opt_percpu_arena));
+
+	thd_t thds[NTHREADS];
+	unsigned i;
+	for (i = 0; i < NTHREADS; i++) {
+		thd_create(&thds[i], thd_start, NULL);
+	}
+	bool sharded = false;
+	for (i = 0; i < NTHREADS; i++) {
+		void *ret;
+		thd_join(thds[i], &ret);
+		if (ret != NULL) {
+			sharded = true;
+		}
+	}
+	assert_b_eq(sharded, true, "Did not find sharded bins");
+}
+TEST_END
+
+TEST_BEGIN(test_bin_shard) {
+	unsigned nbins, i;
+	size_t mib[4], mib2[4];
+	size_t miblen, miblen2, len;
+
+	len = sizeof(nbins);
+	assert_d_eq(mallctl("arenas.nbins", (void *)&nbins, &len, NULL, 0), 0,
+	    "Unexpected mallctl() failure");
+
+	miblen = 4;
+	assert_d_eq(mallctlnametomib("arenas.bin.0.nshards", mib, &miblen), 0,
+	    "Unexpected mallctlnametomib() failure");
+	miblen2 = 4;
+	assert_d_eq(mallctlnametomib("arenas.bin.0.size", mib2, &miblen2), 0,
+	    "Unexpected mallctlnametomib() failure");
+
+	for (i = 0; i < nbins; i++) {
+		uint32_t nshards;
+		size_t size, sz1, sz2;
+
+		mib[2] = i;
+		sz1 = sizeof(nshards);
+		assert_d_eq(mallctlbymib(mib, miblen, (void *)&nshards, &sz1,
+		    NULL, 0), 0, "Unexpected mallctlbymib() failure");
+
+		mib2[2] = i;
+		sz2 = sizeof(size);
+		assert_d_eq(mallctlbymib(mib2, miblen2, (void *)&size, &sz2,
+		    NULL, 0), 0, "Unexpected mallctlbymib() failure");
+
+		if (size >= 1 && size <= 128) {
+			assert_u_eq(nshards, 16, "Unexpected nshards");
+		} else if (size == 256) {
+			assert_u_eq(nshards, 8, "Unexpected nshards");
+		} else if (size > 128 && size <= 512) {
+			assert_u_eq(nshards, 4, "Unexpected nshards");
+		} else {
+			assert_u_eq(nshards, 1, "Unexpected nshards");
+		}
+	}
+}
+TEST_END
+
+int
+main(void) {
+	return test_no_reentrancy(
+	    test_bin_shard,
+	    test_bin_shard_mt,
+	    test_producer_consumer);
+}
diff --git a/test/unit/binshard.sh b/test/unit/binshard.sh
new file mode 100644
index 00000000..c1d58c88
--- /dev/null
+++ b/test/unit/binshard.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+export MALLOC_CONF="narenas:1,bin_shards:1-160:16|129-512:4|256-256:8"
diff --git a/test/unit/bit_util.c b/test/unit/bit_util.c
index 42a97013..b747deb4 100644
--- a/test/unit/bit_util.c
+++ b/test/unit/bit_util.c
@@ -48,10 +48,64 @@ TEST_BEGIN(test_pow2_ceil_zu) {
 }
 TEST_END
 
+void
+assert_lg_ceil_range(size_t input, unsigned answer) {
+	if (input == 1) {
+		assert_u_eq(0, answer, "Got %u as lg_ceil of 1", answer);
+		return;
+	}
+	assert_zu_le(input, (ZU(1) << answer),
+	    "Got %u as lg_ceil of %zu", answer, input);
+	assert_zu_gt(input, (ZU(1) << (answer - 1)),
+	    "Got %u as lg_ceil of %zu", answer, input);
+}
+
+void
+assert_lg_floor_range(size_t input, unsigned answer) {
+	if (input == 1) {
+		assert_u_eq(0, answer, "Got %u as lg_floor of 1", answer);
+		return;
+	}
+	assert_zu_ge(input, (ZU(1) << answer),
+	    "Got %u as lg_floor of %zu", answer, input);
+	assert_zu_lt(input, (ZU(1) << (answer + 1)),
+	    "Got %u as lg_floor of %zu", answer, input);
+}
+
+TEST_BEGIN(test_lg_ceil_floor) {
+	for (size_t i = 1; i < 10 * 1000 * 1000; i++) {
+		assert_lg_ceil_range(i, lg_ceil(i));
+		assert_lg_ceil_range(i, LG_CEIL(i));
+		assert_lg_floor_range(i, lg_floor(i));
+		assert_lg_floor_range(i, LG_FLOOR(i));
+	}
+	for (int i = 10; i < 8 * (1 << LG_SIZEOF_PTR) - 5; i++) {
+		for (size_t j = 0; j < (1 << 4); j++) {
+			size_t num1 = ((size_t)1 << i)
+			    - j * ((size_t)1 << (i - 4));
+			size_t num2 = ((size_t)1 << i)
+			    + j * ((size_t)1 << (i - 4));
+			assert_zu_ne(num1, 0, "Invalid lg argument");
+			assert_zu_ne(num2, 0, "Invalid lg argument");
+			assert_lg_ceil_range(num1, lg_ceil(num1));
+			assert_lg_ceil_range(num1, LG_CEIL(num1));
+			assert_lg_ceil_range(num2, lg_ceil(num2));
+			assert_lg_ceil_range(num2, LG_CEIL(num2));
+
+			assert_lg_floor_range(num1, lg_floor(num1));
+			assert_lg_floor_range(num1, LG_FLOOR(num1));
+			assert_lg_floor_range(num2, lg_floor(num2));
+			assert_lg_floor_range(num2, LG_FLOOR(num2));
+		}
+	}
+}
+TEST_END
+
 int
 main(void) {
 	return test(
 	    test_pow2_ceil_u64,
 	    test_pow2_ceil_u32,
-	    test_pow2_ceil_zu);
+	    test_pow2_ceil_zu,
+	    test_lg_ceil_floor);
 }
diff --git a/test/unit/decay.c b/test/unit/decay.c
index f727bf93..cf3c0796 100644
--- a/test/unit/decay.c
+++ b/test/unit/decay.c
@@ -121,6 +121,12 @@ get_arena_dirty_npurge(unsigned arena_ind) {
 	return get_arena_npurge_impl("stats.arenas.0.dirty_npurge", arena_ind);
 }
 
+static uint64_t
+get_arena_dirty_purged(unsigned arena_ind) {
+	do_epoch();
+	return get_arena_npurge_impl("stats.arenas.0.dirty_purged", arena_ind);
+}
+
 static uint64_t
 get_arena_muzzy_npurge(unsigned arena_ind) {
 	do_epoch();
@@ -559,7 +565,7 @@ TEST_BEGIN(test_decay_now) {
 TEST_END
 
 TEST_BEGIN(test_decay_never) {
-	test_skip_if(check_background_thread_enabled());
+	test_skip_if(check_background_thread_enabled() || !config_stats);
 
 	unsigned arena_ind = do_arena_create(-1, -1);
 	int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE;
@@ -579,8 +585,8 @@ TEST_BEGIN(test_decay_never) {
 		dallocx(ptrs[i], flags);
 		size_t pdirty = get_arena_pdirty(arena_ind);
 		size_t pmuzzy = get_arena_pmuzzy(arena_ind);
-		assert_zu_gt(pdirty, pdirty_prev,
-		    "Expected dirty pages to increase.");
+		assert_zu_gt(pdirty + (size_t)get_arena_dirty_purged(arena_ind),
+		    pdirty_prev, "Expected dirty pages to increase.");
 		assert_zu_eq(pmuzzy, 0, "Unexpected muzzy pages");
 		pdirty_prev = pdirty;
 	}
diff --git a/test/unit/emitter.c b/test/unit/emitter.c
index 535c7cf1..b4a693f4 100644
--- a/test/unit/emitter.c
+++ b/test/unit/emitter.c
@@ -169,7 +169,7 @@ static void emit_nested_dict(emitter_t *emitter) {
 	emitter_end(emitter);
 }
 
-static const char *nested_dict_json =
+static const char *nested_object_json =
 "{\n"
 "\t\"json1\": {\n"
 "\t\t\"json2\": {\n"
@@ -183,7 +183,7 @@ static const char *nested_dict_json =
 "\t}\n"
 "}\n";
 
-static const char *nested_dict_table =
+static const char *nested_object_table =
 "Dict 1\n"
 "  Dict 2\n"
 "    A primitive: 123\n"
@@ -192,8 +192,8 @@ static const char *nested_dict_table =
 "  Another primitive: 123\n";
 
 TEST_BEGIN(test_nested_dict) {
-	assert_emit_output(&emit_nested_dict, nested_dict_json,
-	    nested_dict_table);
+	assert_emit_output(&emit_nested_dict, nested_object_json,
+	    nested_object_table);
 }
 TEST_END
 
@@ -256,13 +256,14 @@ emit_modal(emitter_t *emitter) {
 	int val = 123;
 	emitter_begin(emitter);
 	emitter_dict_begin(emitter, "j0", "T0");
-	emitter_json_dict_begin(emitter, "j1");
+	emitter_json_key(emitter, "j1");
+	emitter_json_object_begin(emitter);
 	emitter_kv(emitter, "i1", "I1", emitter_type_int, &val);
 	emitter_json_kv(emitter, "i2", emitter_type_int, &val);
 	emitter_table_kv(emitter, "I3", emitter_type_int, &val);
 	emitter_table_dict_begin(emitter, "T1");
 	emitter_kv(emitter, "i4", "I4", emitter_type_int, &val);
-	emitter_json_dict_end(emitter); /* Close j1 */
+	emitter_json_object_end(emitter); /* Close j1 */
 	emitter_kv(emitter, "i5", "I5", emitter_type_int, &val);
 	emitter_table_dict_end(emitter); /* Close T1 */
 	emitter_kv(emitter, "i6", "I6", emitter_type_int, &val);
@@ -302,24 +303,26 @@ emit_json_arr(emitter_t *emitter) {
 	int ival = 123;
 
 	emitter_begin(emitter);
-	emitter_json_dict_begin(emitter, "dict");
-	emitter_json_arr_begin(emitter, "arr");
-	emitter_json_arr_obj_begin(emitter);
+	emitter_json_key(emitter, "dict");
+	emitter_json_object_begin(emitter);
+	emitter_json_key(emitter, "arr");
+	emitter_json_array_begin(emitter);
+	emitter_json_object_begin(emitter);
 	emitter_json_kv(emitter, "foo", emitter_type_int, &ival);
-	emitter_json_arr_obj_end(emitter); /* Close arr[0] */
+	emitter_json_object_end(emitter); /* Close arr[0] */
 	/* arr[1] and arr[2] are primitives. */
-	emitter_json_arr_value(emitter, emitter_type_int, &ival);
-	emitter_json_arr_value(emitter, emitter_type_int, &ival);
-	emitter_json_arr_obj_begin(emitter);
+	emitter_json_value(emitter, emitter_type_int, &ival);
+	emitter_json_value(emitter, emitter_type_int, &ival);
+	emitter_json_object_begin(emitter);
 	emitter_json_kv(emitter, "bar", emitter_type_int, &ival);
 	emitter_json_kv(emitter, "baz", emitter_type_int, &ival);
-	emitter_json_arr_obj_end(emitter); /* Close arr[3]. */
-	emitter_json_arr_end(emitter); /* Close arr. */
-	emitter_json_dict_end(emitter); /* Close dict. */
+	emitter_json_object_end(emitter); /* Close arr[3]. */
+	emitter_json_array_end(emitter); /* Close arr. */
+	emitter_json_object_end(emitter); /* Close dict. */
 	emitter_end(emitter);
 }
 
-static const char *json_arr_json =
+static const char *json_array_json =
 "{\n"
 "\t\"dict\": {\n"
 "\t\t\"arr\": [\n"
@@ -336,10 +339,62 @@ static const char *json_arr_json =
 "\t}\n"
 "}\n";
 
-static const char *json_arr_table = "";
+static const char *json_array_table = "";
 
 TEST_BEGIN(test_json_arr) {
-	assert_emit_output(&emit_json_arr, json_arr_json, json_arr_table);
+	assert_emit_output(&emit_json_arr, json_array_json, json_array_table);
+}
+TEST_END
+
+static void
+emit_json_nested_array(emitter_t *emitter) {
+	int ival = 123;
+	const char *sval = "foo";
+	emitter_begin(emitter);
+	emitter_json_array_begin(emitter); /* Outer array. */
+		emitter_json_array_begin(emitter); /* [123, "foo", 123, "foo"] */
+		emitter_json_value(emitter, emitter_type_int, &ival);
+		emitter_json_value(emitter, emitter_type_string, &sval);
+		emitter_json_value(emitter, emitter_type_int, &ival);
+		emitter_json_value(emitter, emitter_type_string, &sval);
+		emitter_json_array_end(emitter);
+		emitter_json_array_begin(emitter); /* [123] */
+		emitter_json_value(emitter, emitter_type_int, &ival);
+		emitter_json_array_end(emitter);
+		emitter_json_array_begin(emitter); /* ["foo", 123] */
+		emitter_json_value(emitter, emitter_type_string, &sval);
+		emitter_json_value(emitter, emitter_type_int, &ival);
+		emitter_json_array_end(emitter);
+		emitter_json_array_begin(emitter); /* [] */
+		emitter_json_array_end(emitter);
+	emitter_json_array_end(emitter);
+	emitter_end(emitter);
+}
+
+static const char *json_nested_array_json =
+"{\n"
+"\t[\n"
+"\t\t[\n"
+"\t\t\t123,\n"
+"\t\t\t\"foo\",\n"
+"\t\t\t123,\n"
+"\t\t\t\"foo\"\n"
+"\t\t],\n"
+"\t\t[\n"
+"\t\t\t123\n"
+"\t\t],\n"
+"\t\t[\n"
+"\t\t\t\"foo\",\n"
+"\t\t\t123\n"
+"\t\t],\n"
+"\t\t[\n"
+"\t\t]\n"
+"\t]\n"
+"}\n";
+
+TEST_BEGIN(test_json_nested_arr) {
+	assert_emit_output(&emit_json_nested_array, json_nested_array_json,
+	    json_array_table);
 }
 TEST_END
 
@@ -347,11 +402,11 @@ static void
 emit_table_row(emitter_t *emitter) {
 	emitter_begin(emitter);
 	emitter_row_t row;
-	emitter_col_t abc = {emitter_justify_left, 10, emitter_type_title};
+	emitter_col_t abc = {emitter_justify_left, 10, emitter_type_title, {0}, {0, 0}};
 	abc.str_val = "ABC title";
-	emitter_col_t def = {emitter_justify_right, 15, emitter_type_title};
+	emitter_col_t def = {emitter_justify_right, 15, emitter_type_title, {0}, {0, 0}};
 	def.str_val = "DEF title";
-	emitter_col_t ghi = {emitter_justify_right, 5, emitter_type_title};
+	emitter_col_t ghi = {emitter_justify_right, 5, emitter_type_title, {0}, {0, 0}};
 	ghi.str_val = "GHI";
 
 	emitter_row_init(&row);
@@ -409,5 +464,6 @@ main(void) {
 	    test_types,
 	    test_modal,
 	    test_json_arr,
+	    test_json_nested_arr,
 	    test_table_row);
 }
diff --git a/test/unit/hook.c b/test/unit/hook.c
new file mode 100644
index 00000000..72fcc433
--- /dev/null
+++ b/test/unit/hook.c
@@ -0,0 +1,580 @@
+#include "test/jemalloc_test.h"
+
+#include "jemalloc/internal/hook.h"
+
+static void *arg_extra;
+static int arg_type;
+static void *arg_result;
+static void *arg_address;
+static size_t arg_old_usize;
+static size_t arg_new_usize;
+static uintptr_t arg_result_raw;
+static uintptr_t arg_args_raw[4];
+
+static int call_count = 0;
+
+static void
+reset_args() {	/* Return all captured hook state to its baseline. */
+	memset(arg_args_raw, 77, sizeof(arg_args_raw));	/* 77 in every byte */
+	arg_result_raw = 0;
+	arg_new_usize = 0;
+	arg_old_usize = 0;
+	arg_address = NULL;
+	arg_result = NULL;
+	arg_type = 12345;	/* NOTE(review): presumably not a real hook type */
+	arg_extra = NULL;
+}
+
+static void
+alloc_free_size(size_t sz) {	/* Exercise sz with and without the tcache. */
+	void *ptr = mallocx(sz, 0);
+	free(ptr);
+	ptr = mallocx(sz, 0);
+	free(ptr);
+	ptr = mallocx(sz, MALLOCX_TCACHE_NONE);	/* bypass the tcache */
+	dallocx(ptr, MALLOCX_TCACHE_NONE);
+}
+
+/*
+ * We want to support a degree of user reentrancy.  Each test hook calls this
+ * to run a variety of allocation scenarios from inside the hook itself.
+ */
+static void
+be_reentrant() {
+	/* Let's make sure the tcache is non-empty if enabled. */
+	alloc_free_size(1);
+	alloc_free_size(1024);
+	alloc_free_size(64 * 1024);
+	alloc_free_size(256 * 1024);
+	alloc_free_size(1024 * 1024);
+
+	/* Small reallocation: 129 -> 130 bytes. */
+	void *ptr = mallocx(129, 0);
+	ptr = rallocx(ptr, 130, 0);
+	free(ptr);
+
+	ptr = mallocx(2 * 1024 * 1024, 0);	/* 2 MiB, freed right away */
+	free(ptr);
+	ptr = mallocx(1 * 1024 * 1024, 0);
+	ptr = rallocx(ptr, 2 * 1024 * 1024, 0);	/* grow 1 MiB -> 2 MiB */
+	free(ptr);
+
+	ptr = mallocx(1, 0);
+	ptr = rallocx(ptr, 1000, 0);	/* grow 1 -> 1000 bytes */
+	free(ptr);
+}
+
+static void
+set_args_raw(uintptr_t *args_raw, int nargs) {	/* Record a hook's raw args. */
+	memcpy(&arg_args_raw[0], args_raw, nargs * sizeof(*args_raw));
+}
+
+static void
+assert_args_raw(uintptr_t *args_raw_expected, int nargs) {
+	size_t nbytes = nargs * sizeof(*args_raw_expected);	/* first nargs only */
+	int cmp = memcmp(args_raw_expected, &arg_args_raw[0], nbytes);
+	assert_d_eq(cmp, 0, "Raw args mismatch");
+}
+
+static void
+reset() {	/* Clear both the captured arguments and the call counter. */
+	reset_args();
+	call_count = 0;
+}
+
+static void
+test_alloc_hook(void *extra, hook_alloc_t type, void *result,
+    uintptr_t result_raw, uintptr_t args_raw[3]) {
+	set_args_raw(args_raw, 3);	/* alloc hooks carry three raw args */
+	arg_result_raw = result_raw;
+	arg_result = result;
+	arg_type = (int)type;
+	arg_extra = extra;
+	call_count += 1;
+	be_reentrant();	/* allocate from inside the hook */
+}
+
+static void
+test_dalloc_hook(void *extra, hook_dalloc_t type, void *address,
+    uintptr_t args_raw[3]) {
+	set_args_raw(args_raw, 3);	/* dalloc hooks carry three raw args */
+	arg_address = address;
+	arg_type = (int)type;
+	arg_extra = extra;
+	call_count += 1;
+	be_reentrant();	/* allocate from inside the hook */
+}
+
+static void
+test_expand_hook(void *extra, hook_expand_t type, void *address,
+    size_t old_usize, size_t new_usize, uintptr_t result_raw,
+    uintptr_t args_raw[4]) {
+	set_args_raw(args_raw, 4);	/* expand hooks carry four raw args */
+	arg_result_raw = result_raw;
+	arg_new_usize = new_usize;
+	arg_old_usize = old_usize;
+	arg_address = address;
+	arg_type = (int)type;
+	arg_extra = extra;
+	call_count += 1;
+	be_reentrant();	/* allocate from inside the hook */
+}
+
+TEST_BEGIN(test_hooks_basic) {
+	/* Just verify that the hooks record their arguments correctly. */
+	hooks_t hooks = {
+		&test_alloc_hook, &test_dalloc_hook, &test_expand_hook,
+		(void *)111};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	uintptr_t args_raw[4] = {10, 20, 30, 40};
+
+	/* Alloc: the hook sees result, raw result and three raw args. */
+	reset_args();
+	hook_invoke_alloc(hook_alloc_posix_memalign, (void *)222, 333,
+	    args_raw);
+	assert_ptr_eq(arg_extra, (void *)111, "Passed wrong user pointer");
+	assert_d_eq((int)hook_alloc_posix_memalign, arg_type,
+	    "Passed wrong alloc type");
+	assert_ptr_eq((void *)222, arg_result, "Passed wrong result address");
+	assert_u64_eq(333, arg_result_raw, "Passed wrong result");
+	assert_args_raw(args_raw, 3);
+
+	/* Dalloc: the hook sees the address and three raw args. */
+	reset_args();
+	hook_invoke_dalloc(hook_dalloc_sdallocx, (void *)222, args_raw);
+	assert_d_eq((int)hook_dalloc_sdallocx, arg_type,
+	    "Passed wrong dalloc type");
+	assert_ptr_eq((void *)111, arg_extra, "Passed wrong user pointer");
+	assert_ptr_eq((void *)222, arg_address, "Passed wrong address");
+	assert_args_raw(args_raw, 3);
+
+	/* Expand: address, old/new usize, raw result and four raw args. */
+	reset_args();
+	hook_invoke_expand(hook_expand_xallocx, (void *)222, 333, 444, 555,
+	    args_raw);
+	assert_d_eq((int)hook_expand_xallocx, arg_type,
+	    "Passed wrong expand type");
+	assert_ptr_eq((void *)111, arg_extra, "Passed wrong user pointer");
+	assert_ptr_eq((void *)222, arg_address, "Passed wrong address");
+	assert_zu_eq(333, arg_old_usize, "Passed wrong old usize");
+	assert_zu_eq(444, arg_new_usize, "Passed wrong new usize");
+	assert_zu_eq(555, arg_result_raw, "Passed wrong result");
+	assert_args_raw(args_raw, 4);
+
+	hook_remove(TSDN_NULL, handle);
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_null) {
+	/* Null hooks should be ignored, not crash. */
+	hooks_t hooks1 = {NULL, NULL, NULL, NULL};
+	hooks_t hooks2 = {&test_alloc_hook, NULL, NULL, NULL};
+	hooks_t hooks3 = {NULL, &test_dalloc_hook, NULL, NULL};
+	hooks_t hooks4 = {NULL, NULL, &test_expand_hook, NULL};
+
+	void *handle1 = hook_install(TSDN_NULL, &hooks1);
+	void *handle2 = hook_install(TSDN_NULL, &hooks2);
+	void *handle3 = hook_install(TSDN_NULL, &hooks3);
+	void *handle4 = hook_install(TSDN_NULL, &hooks4);
+
+	assert_ptr_ne(handle1, NULL, "Hook installation failed");
+	assert_ptr_ne(handle2, NULL, "Hook installation failed");
+	assert_ptr_ne(handle3, NULL, "Hook installation failed");
+	assert_ptr_ne(handle4, NULL, "Hook installation failed");
+
+	uintptr_t args_raw[4] = {10, 20, 30, 40};
+
+	call_count = 0;
+	hook_invoke_alloc(hook_alloc_malloc, NULL, 0, args_raw);
+	assert_d_eq(call_count, 1, "Called wrong number of times");
+
+	call_count = 0;
+	hook_invoke_dalloc(hook_dalloc_free, NULL, args_raw);
+	assert_d_eq(call_count, 1, "Called wrong number of times");
+
+	call_count = 0;
+	hook_invoke_expand(hook_expand_realloc, NULL, 0, 0, 0, args_raw);
+	assert_d_eq(call_count, 1, "Called wrong number of times");
+
+	hook_remove(TSDN_NULL, handle1);
+	hook_remove(TSDN_NULL, handle2);
+	hook_remove(TSDN_NULL, handle3);
+	hook_remove(TSDN_NULL, handle4);
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_remove) {
+	/* A hook fires while installed and never again once removed. */
+	hooks_t hooks = {&test_alloc_hook, NULL, NULL, NULL};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	assert_ptr_ne(handle, NULL, "Hook installation failed");
+	call_count = 0;
+	uintptr_t args_raw[4] = {10, 20, 30, 40};
+	hook_invoke_alloc(hook_alloc_malloc, NULL, 0, args_raw);
+	assert_d_eq(call_count, 1, "Hook not invoked");
+	call_count = 0;
+	hook_remove(TSDN_NULL, handle);
+	/* Pass valid args so a stale hook fails the assert, not memcpy(NULL). */
+	hook_invoke_alloc(hook_alloc_malloc, NULL, 0, args_raw);
+	assert_d_eq(call_count, 0, "Hook invoked after removal");
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_alloc_simple) {
+	/* "Simple" in the sense that we're not in a realloc variant. */
+	hooks_t hooks = {&test_alloc_hook, NULL, NULL, (void *)123};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	assert_ptr_ne(handle, NULL, "Hook installation failed");
+
+	/* Stop malloc from being optimized away. */
+	volatile int err;
+	void *volatile ptr;
+
+	/* malloc */
+	reset();
+	ptr = malloc(1);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_malloc, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[0], "Wrong argument");
+	free(ptr);
+
+	/* posix_memalign */
+	reset();
+	err = posix_memalign((void **)&ptr, 1024, 1);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_posix_memalign,
+	    "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)err, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)&ptr, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)1024, arg_args_raw[1], "Wrong argument");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[2], "Wrong argument");
+	free(ptr);
+
+	/* aligned_alloc */
+	reset();
+	ptr = aligned_alloc(1024, 1);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_aligned_alloc,
+	    "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)1024, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong argument");
+	free(ptr);
+
+	/* calloc */
+	reset();
+	ptr = calloc(11, 13);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_calloc, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)11, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)13, arg_args_raw[1], "Wrong argument");
+	free(ptr);
+
+	/* memalign */
+#ifdef JEMALLOC_OVERRIDE_MEMALIGN
+	reset();
+	ptr = memalign(1024, 1);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_memalign, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)1024, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong argument");
+	free(ptr);
+#endif /* JEMALLOC_OVERRIDE_MEMALIGN */
+
+	/* valloc */
+#ifdef JEMALLOC_OVERRIDE_VALLOC
+	reset();
+	ptr = valloc(1);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_valloc, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[0], "Wrong argument");
+	free(ptr);
+#endif /* JEMALLOC_OVERRIDE_VALLOC */
+
+	/* mallocx */
+	reset();
+	ptr = mallocx(1, MALLOCX_LG_ALIGN(10));
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_mallocx, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)MALLOCX_LG_ALIGN(10), arg_args_raw[1],
+	    "Wrong flags");
+	free(ptr);
+
+	hook_remove(TSDN_NULL, handle);
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_dalloc_simple) {
+	/* "Simple" in the sense that we're not in a realloc variant. */
+	hooks_t hooks = {NULL, &test_dalloc_hook, NULL, (void *)123};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	assert_ptr_ne(handle, NULL, "Hook installation failed");
+
+	void *volatile ptr;
+
+	/* free() */
+	reset();
+	ptr = malloc(1);
+	free(ptr);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_dalloc_free, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong pointer freed");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg");
+
+	/* dallocx() */
+	reset();
+	ptr = malloc(1);
+	dallocx(ptr, MALLOCX_TCACHE_NONE);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_dalloc_dallocx, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong pointer freed");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg");
+	assert_u64_eq((uintptr_t)MALLOCX_TCACHE_NONE, arg_args_raw[1],
+	    "Wrong raw arg");
+
+	/* sdallocx() */
+	reset();
+	ptr = malloc(1);
+	sdallocx(ptr, 1, MALLOCX_TCACHE_NONE);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_dalloc_sdallocx, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong pointer freed");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong raw arg");
+	assert_u64_eq((uintptr_t)MALLOCX_TCACHE_NONE, arg_args_raw[2],
+	    "Wrong raw arg");
+
+	hook_remove(TSDN_NULL, handle);
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_expand_simple) {
+	/* "Simple" in the sense that we're not in a realloc variant. */
+	hooks_t hooks = {NULL, NULL, &test_expand_hook, (void *)123};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	assert_ptr_ne(handle, NULL, "Hook installation failed");
+
+	void *volatile ptr;
+
+	/* xallocx() */
+	reset();
+	ptr = malloc(1);
+	size_t new_usize = xallocx(ptr, 100, 200, MALLOCX_TCACHE_NONE);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_expand_xallocx, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong pointer expanded");
+	assert_u64_eq(arg_old_usize, nallocx(1, 0), "Wrong old usize");
+	assert_u64_eq(arg_new_usize, sallocx(ptr, 0), "Wrong new usize");
+	assert_u64_eq(new_usize, arg_result_raw, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong arg");
+	assert_u64_eq(100, arg_args_raw[1], "Wrong arg");
+	assert_u64_eq(200, arg_args_raw[2], "Wrong arg");
+	assert_u64_eq(MALLOCX_TCACHE_NONE, arg_args_raw[3], "Wrong arg");
+
+	hook_remove(TSDN_NULL, handle);
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_realloc_as_malloc_or_free) {
+	hooks_t hooks = {&test_alloc_hook, &test_dalloc_hook,
+		&test_expand_hook, (void *)123};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	assert_ptr_ne(handle, NULL, "Hook installation failed");
+
+	void *volatile ptr;
+
+	/* realloc(NULL, size) as malloc */
+	reset();
+	ptr = realloc(NULL, 1);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_realloc, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)NULL, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong argument");
+	free(ptr);
+
+	/* realloc(ptr, 0) as free */
+	ptr = malloc(1);
+	reset();
+	realloc(ptr, 0);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_dalloc_realloc, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong pointer freed");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg");
+	assert_u64_eq((uintptr_t)0, arg_args_raw[1], "Wrong raw arg");
+
+	/* realloc(NULL, 0) as malloc(0) */
+	reset();
+	ptr = realloc(NULL, 0);
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, (int)hook_alloc_realloc, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_result, "Wrong result");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)NULL, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)0, arg_args_raw[1], "Wrong argument");
+	free(ptr);
+
+	hook_remove(TSDN_NULL, handle);
+}
+TEST_END
+
+static void
+do_realloc_test(void *(*ralloc)(void *, size_t, int), int flags,
+    int expand_type, int dalloc_type) {
+	hooks_t hooks = {&test_alloc_hook, &test_dalloc_hook,
+		&test_expand_hook, (void *)123};
+	void *handle = hook_install(TSDN_NULL, &hooks);
+	assert_ptr_ne(handle, NULL, "Hook installation failed");
+
+	void *volatile ptr;
+	void *volatile ptr2;
+
+	/* Realloc in-place, small. */
+	ptr = malloc(129);
+	reset();
+	ptr2 = ralloc(ptr, 130, flags);
+	assert_ptr_eq(ptr, ptr2, "Small realloc moved");
+
+	assert_d_eq(call_count, 1, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, expand_type, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong address");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)130, arg_args_raw[1], "Wrong argument");
+	free(ptr);
+
+	/*
+	 * Realloc in-place, large.  Since we can't guarantee the large case
+	 * across all platforms, we stay resilient to moving results.
+	 */
+	ptr = malloc(2 * 1024 * 1024);
+	free(ptr);
+	ptr2 = malloc(1 * 1024 * 1024);
+	reset();
+	ptr = ralloc(ptr2, 2 * 1024 * 1024, flags);
+	/* ptr is the new address, ptr2 is the old address. */
+	if (ptr == ptr2) {
+		assert_d_eq(call_count, 1, "Hook not called");
+		assert_d_eq(arg_type, expand_type, "Wrong hook type");
+	} else {
+		assert_d_eq(call_count, 2, "Wrong hooks called");
+		assert_ptr_eq(ptr, arg_result, "Wrong address");
+		assert_d_eq(arg_type, dalloc_type, "Wrong hook type");
+	}
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_ptr_eq(ptr2, arg_address, "Wrong address");
+	assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)ptr2, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)2 * 1024 * 1024, arg_args_raw[1],
+	    "Wrong argument");
+	free(ptr);
+
+	/* Realloc with move, small. */
+	ptr = malloc(8);
+	reset();
+	ptr2 = ralloc(ptr, 128, flags);
+	assert_ptr_ne(ptr, ptr2, "Small realloc didn't move");
+
+	assert_d_eq(call_count, 2, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, dalloc_type, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong address");
+	assert_ptr_eq(ptr2, arg_result, "Wrong address");
+	assert_u64_eq((uintptr_t)ptr2, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)128, arg_args_raw[1], "Wrong argument");
+	free(ptr2);
+
+	/* Realloc with move, large. */
+	ptr = malloc(1);
+	reset();
+	ptr2 = ralloc(ptr, 2 * 1024 * 1024, flags);
+	assert_ptr_ne(ptr, ptr2, "Large realloc didn't move");
+
+	assert_d_eq(call_count, 2, "Hook not called");
+	assert_ptr_eq(arg_extra, (void *)123, "Wrong extra");
+	assert_d_eq(arg_type, dalloc_type, "Wrong hook type");
+	assert_ptr_eq(ptr, arg_address, "Wrong address");
+	assert_ptr_eq(ptr2, arg_result, "Wrong address");
+	assert_u64_eq((uintptr_t)ptr2, (uintptr_t)arg_result_raw,
+	    "Wrong raw result");
+	assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong argument");
+	assert_u64_eq((uintptr_t)2 * 1024 * 1024, arg_args_raw[1],
+	    "Wrong argument");
+	free(ptr2);
+
+	hook_remove(TSDN_NULL, handle);
+}
+
+static void *
+realloc_wrapper(void *ptr, size_t size, UNUSED int flags) {
+	return realloc(ptr, size);
+}
+
+TEST_BEGIN(test_hooks_realloc) {
+	do_realloc_test(&realloc_wrapper, 0, hook_expand_realloc,
+	    hook_dalloc_realloc);
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_rallocx) {
+	do_realloc_test(&rallocx, MALLOCX_TCACHE_NONE, hook_expand_rallocx,
+	    hook_dalloc_rallocx);
+}
+TEST_END
+
+int
+main(void) {
+	/* We assert on call counts. */
+	return test_no_reentrancy(
+	    test_hooks_basic,
+	    test_hooks_null,
+	    test_hooks_remove,
+	    test_hooks_alloc_simple,
+	    test_hooks_dalloc_simple,
+	    test_hooks_expand_simple,
+	    test_hooks_realloc_as_malloc_or_free,
+	    test_hooks_realloc,
+	    test_hooks_rallocx);
+}
diff --git a/test/unit/huge.c b/test/unit/huge.c
new file mode 100644
index 00000000..ab72cf00
--- /dev/null
+++ b/test/unit/huge.c
@@ -0,0 +1,108 @@
+#include "test/jemalloc_test.h"
+
+/* Threshold: 2 << 20 = 2097152. */
+const char *malloc_conf = "oversize_threshold:2097152";
+
+#define HUGE_SZ (2 << 20)
+#define SMALL_SZ (8)
+
+TEST_BEGIN(huge_bind_thread) {
+	unsigned arena1, arena2;
+	size_t sz = sizeof(unsigned);
+
+	/* Bind to a manual arena. */
+	assert_d_eq(mallctl("arenas.create", &arena1, &sz, NULL, 0), 0,
+	    "Failed to create arena");
+	assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena1,
+	    sizeof(arena1)), 0, "Fail to bind thread");
+
+	void *ptr = mallocx(HUGE_SZ, 0);
+	assert_ptr_not_null(ptr, "Fail to allocate huge size");
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr,
+	    sizeof(ptr)), 0, "Unexpected mallctl() failure");
+	assert_u_eq(arena1, arena2, "Wrong arena used after binding");
+	dallocx(ptr, 0);
+
+	/* Switch back to arena 0. */
+	test_skip_if(have_percpu_arena &&
+	    PERCPU_ARENA_ENABLED(opt_percpu_arena));
+	arena2 = 0;
+	assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena2,
+	    sizeof(arena2)), 0, "Fail to bind thread");
+	ptr = mallocx(SMALL_SZ, MALLOCX_TCACHE_NONE);
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr,
+	    sizeof(ptr)), 0, "Unexpected mallctl() failure");
+	assert_u_eq(arena2, 0, "Wrong arena used after binding");
+	dallocx(ptr, MALLOCX_TCACHE_NONE);
+
+	/* Then huge allocation should use the huge arena. */
+	ptr = mallocx(HUGE_SZ, 0);
+	assert_ptr_not_null(ptr, "Fail to allocate huge size");
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr,
+	    sizeof(ptr)), 0, "Unexpected mallctl() failure");
+	assert_u_ne(arena2, 0, "Wrong arena used after binding");
+	assert_u_ne(arena1, arena2, "Wrong arena used after binding");
+	dallocx(ptr, 0);
+}
+TEST_END
+
+TEST_BEGIN(huge_mallocx) {
+	unsigned arena1, arena2;
+	size_t sz = sizeof(unsigned);
+	/* An explicit MALLOCX_ARENA must be honored, even for a huge size. */
+	assert_d_eq(mallctl("arenas.create", &arena1, &sz, NULL, 0), 0,
+	    "Failed to create arena");
+	void *huge = mallocx(HUGE_SZ, MALLOCX_ARENA(arena1));
+	assert_ptr_not_null(huge, "Fail to allocate huge size");
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &huge,
+	    sizeof(huge)), 0, "Unexpected mallctl() failure");
+	assert_u_eq(arena1, arena2, "Wrong arena used for mallocx");
+	dallocx(huge, MALLOCX_ARENA(arena1));
+	/* With no arena given, expect neither arena1 nor arena 0. */
+	void *huge2 = mallocx(HUGE_SZ, 0);
+	assert_ptr_not_null(huge2, "Fail to allocate huge size");
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &huge2,
+	    sizeof(huge2)), 0, "Unexpected mallctl() failure");
+	assert_u_ne(arena1, arena2,
+	    "Huge allocation should not come from the manual arena.");
+	assert_u_ne(arena2, 0,
+	    "Huge allocation should not come from the arena 0.");
+	dallocx(huge2, 0);
+}
+TEST_END
+
+TEST_BEGIN(huge_allocation) {
+	unsigned arena1, arena2;
+
+	void *ptr = mallocx(HUGE_SZ, 0);
+	assert_ptr_not_null(ptr, "Fail to allocate huge size");
+	size_t sz = sizeof(unsigned);
+	assert_d_eq(mallctl("arenas.lookup", &arena1, &sz, &ptr, sizeof(ptr)),
+	    0, "Unexpected mallctl() failure");
+	assert_u_gt(arena1, 0, "Huge allocation should not come from arena 0");
+	dallocx(ptr, 0);
+
+	ptr = mallocx(HUGE_SZ >> 1, 0);
+	assert_ptr_not_null(ptr, "Fail to allocate half huge size");
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr,
+	    sizeof(ptr)), 0, "Unexpected mallctl() failure");
+	assert_u_ne(arena1, arena2, "Wrong arena used for half huge");
+	dallocx(ptr, 0);
+
+	ptr = mallocx(SMALL_SZ, MALLOCX_TCACHE_NONE);
+	assert_ptr_not_null(ptr, "Fail to allocate small size");
+	assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr,
+	    sizeof(ptr)), 0, "Unexpected mallctl() failure");
+	assert_u_ne(arena1, arena2,
+	    "Huge and small should be from different arenas");
+	dallocx(ptr, 0);
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    huge_allocation,
+	    huge_mallocx,
+	    huge_bind_thread);
+}
diff --git a/test/unit/junk.c b/test/unit/junk.c
index 243ced41..57e3ad43 100644
--- a/test/unit/junk.c
+++ b/test/unit/junk.c
@@ -123,13 +123,13 @@ test_junk(size_t sz_min, size_t sz_max) {
 
 TEST_BEGIN(test_junk_small) {
 	test_skip_if(!config_fill);
-	test_junk(1, SMALL_MAXCLASS-1);
+	test_junk(1, SC_SMALL_MAXCLASS - 1);
 }
 TEST_END
 
 TEST_BEGIN(test_junk_large) {
 	test_skip_if(!config_fill);
-	test_junk(SMALL_MAXCLASS+1, (1U << (LG_LARGE_MINCLASS+1)));
+	test_junk(SC_SMALL_MAXCLASS + 1, (1U << (SC_LG_LARGE_MINCLASS + 1)));
 }
 TEST_END
 
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index 1ecbab08..498f9e06 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -1,5 +1,6 @@
 #include "test/jemalloc_test.h"
 
+#include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/util.h"
 
 TEST_BEGIN(test_mallctl_errors) {
@@ -163,6 +164,7 @@ TEST_BEGIN(test_mallctl_opt) {
 	TEST_MALLCTL_OPT(const char *, dss, always);
 	TEST_MALLCTL_OPT(unsigned, narenas, always);
 	TEST_MALLCTL_OPT(const char *, percpu_arena, always);
+	TEST_MALLCTL_OPT(size_t, oversize_threshold, always);
 	TEST_MALLCTL_OPT(bool, background_thread, always);
 	TEST_MALLCTL_OPT(ssize_t, dirty_decay_ms, always);
 	TEST_MALLCTL_OPT(ssize_t, muzzy_decay_ms, always);
@@ -340,6 +342,9 @@ TEST_BEGIN(test_thread_arena) {
 	sz = sizeof(unsigned);
 	assert_d_eq(mallctl("arenas.narenas", (void *)&narenas, &sz, NULL, 0),
 	    0, "Unexpected mallctl() failure");
+	if (opt_oversize_threshold != 0) {
+		narenas--;
+	}
 	assert_u_eq(narenas, opt_narenas, "Number of arenas incorrect");
 
 	if (strcmp(opa, "disabled") == 0) {
@@ -576,7 +581,7 @@ TEST_BEGIN(test_arena_i_retain_grow_limit) {
 
 	assert_d_eq(mallctlbymib(mib, miblen, &default_limit, &sz, NULL, 0), 0,
 	    "Unexpected mallctl() failure");
-	assert_zu_eq(default_limit, sz_pind2sz(EXTENT_GROW_MAX_PIND),
+	assert_zu_eq(default_limit, SC_LARGE_MAXCLASS,
 	    "Unexpected default for retain_grow_limit");
 
 	new_limit = PAGE - 1;
@@ -681,8 +686,8 @@ TEST_BEGIN(test_arenas_constants) {
 
 	TEST_ARENAS_CONSTANT(size_t, quantum, QUANTUM);
 	TEST_ARENAS_CONSTANT(size_t, page, PAGE);
-	TEST_ARENAS_CONSTANT(unsigned, nbins, NBINS);
-	TEST_ARENAS_CONSTANT(unsigned, nlextents, NSIZES - NBINS);
+	TEST_ARENAS_CONSTANT(unsigned, nbins, SC_NBINS);
+	TEST_ARENAS_CONSTANT(unsigned, nlextents, SC_NSIZES - SC_NBINS);
 
 #undef TEST_ARENAS_CONSTANT
 }
@@ -701,6 +706,7 @@ TEST_BEGIN(test_arenas_bin_constants) {
 	TEST_ARENAS_BIN_CONSTANT(uint32_t, nregs, bin_infos[0].nregs);
 	TEST_ARENAS_BIN_CONSTANT(size_t, slab_size,
 	    bin_infos[0].slab_size);
+	TEST_ARENAS_BIN_CONSTANT(uint32_t, nshards, bin_infos[0].n_shards);
 
 #undef TEST_ARENAS_BIN_CONSTANT
 }
@@ -715,7 +721,8 @@ TEST_BEGIN(test_arenas_lextent_constants) {
 	assert_zu_eq(name, expected, "Incorrect "#name" size");		\
 } while (0)
 
-	TEST_ARENAS_LEXTENT_CONSTANT(size_t, size, LARGE_MINCLASS);
+	TEST_ARENAS_LEXTENT_CONSTANT(size_t, size,
+	    SC_LARGE_MINCLASS);
 
 #undef TEST_ARENAS_LEXTENT_CONSTANT
 }
@@ -773,6 +780,79 @@ TEST_BEGIN(test_stats_arenas) {
 }
 TEST_END
 
+static void
+alloc_hook(void *extra, UNUSED hook_alloc_t type, UNUSED void *result,
+    UNUSED uintptr_t result_raw, UNUSED uintptr_t args_raw[3]) {
+	*(bool *)extra = true;
+}
+
+static void
+dalloc_hook(void *extra, UNUSED hook_dalloc_t type,
+    UNUSED void *address, UNUSED uintptr_t args_raw[3]) {
+	*(bool *)extra = true;
+}
+
+TEST_BEGIN(test_hooks) {
+	bool hook_called = false;
+	hooks_t hooks = {&alloc_hook, &dalloc_hook, NULL, &hook_called};
+	void *handle = NULL;
+	size_t sz = sizeof(handle);
+	int err = mallctl("experimental.hooks.install", &handle, &sz, &hooks,
+	    sizeof(hooks));
+	assert_d_eq(err, 0, "Hook installation failed");
+	assert_ptr_ne(handle, NULL, "Hook installation gave null handle");
+	void *ptr = mallocx(1, 0);
+	assert_true(hook_called, "Alloc hook not called");
+	hook_called = false;
+	free(ptr);
+	assert_true(hook_called, "Free hook not called");
+
+	err = mallctl("experimental.hooks.remove", NULL, NULL, &handle,
+	    sizeof(handle));
+	assert_d_eq(err, 0, "Hook removal failed");
+	hook_called = false;
+	ptr = mallocx(1, 0);
+	free(ptr);
+	assert_false(hook_called, "Hook called after removal");
+}
+TEST_END
+
+TEST_BEGIN(test_hooks_exhaustion) {
+	bool hook_called = false;
+	hooks_t hooks = {&alloc_hook, &dalloc_hook, NULL, &hook_called};
+
+	void *handle;
+	void *handles[HOOK_MAX];
+	size_t sz = sizeof(handle);
+	int err;
+	for (int i = 0; i < HOOK_MAX; i++) {
+		handle = NULL;
+		err = mallctl("experimental.hooks.install", &handle, &sz,
+		    &hooks, sizeof(hooks));
+		assert_d_eq(err, 0, "Error installation hooks");
+		assert_ptr_ne(handle, NULL, "Got NULL handle");
+		handles[i] = handle;
+	}
+	err = mallctl("experimental.hooks.install", &handle, &sz, &hooks,
+	    sizeof(hooks));
+	assert_d_eq(err, EAGAIN, "Should have failed hook installation");
+	for (int i = 0; i < HOOK_MAX; i++) {
+		err = mallctl("experimental.hooks.remove", NULL, NULL,
+		    &handles[i], sizeof(handles[i]));
+		assert_d_eq(err, 0, "Hook removal failed");
+	}
+	/* Insertion failed, but then we removed some; it should work now. */
+	handle = NULL;
+	err = mallctl("experimental.hooks.install", &handle, &sz, &hooks,
+	    sizeof(hooks));
+	assert_d_eq(err, 0, "Hook insertion failed");
+	assert_ptr_ne(handle, NULL, "Got NULL handle");
+	err = mallctl("experimental.hooks.remove", NULL, NULL, &handle,
+	    sizeof(handle));
+	assert_d_eq(err, 0, "Hook removal failed");
+}
+TEST_END
+
 int
 main(void) {
 	return test(
@@ -801,5 +881,7 @@ main(void) {
 	    test_arenas_lextent_constants,
 	    test_arenas_create,
 	    test_arenas_lookup,
-	    test_stats_arenas);
+	    test_stats_arenas,
+	    test_hooks,
+	    test_hooks_exhaustion);
 }
diff --git a/test/unit/prof_gdump.c b/test/unit/prof_gdump.c
index fcb434cb..f7e0aac7 100644
--- a/test/unit/prof_gdump.c
+++ b/test/unit/prof_gdump.c
@@ -29,12 +29,12 @@ TEST_BEGIN(test_gdump) {
 	prof_dump_open = prof_dump_open_intercept;
 
 	did_prof_dump_open = false;
-	p = mallocx((1U << LG_LARGE_MINCLASS), 0);
+	p = mallocx((1U << SC_LG_LARGE_MINCLASS), 0);
 	assert_ptr_not_null(p, "Unexpected mallocx() failure");
 	assert_true(did_prof_dump_open, "Expected a profile dump");
 
 	did_prof_dump_open = false;
-	q = mallocx((1U << LG_LARGE_MINCLASS), 0);
+	q = mallocx((1U << SC_LG_LARGE_MINCLASS), 0);
 	assert_ptr_not_null(q, "Unexpected mallocx() failure");
 	assert_true(did_prof_dump_open, "Expected a profile dump");
 
@@ -45,7 +45,7 @@ TEST_BEGIN(test_gdump) {
 	    "Unexpected mallctl failure while disabling prof.gdump");
 	assert(gdump_old);
 	did_prof_dump_open = false;
-	r = mallocx((1U << LG_LARGE_MINCLASS), 0);
+	r = mallocx((1U << SC_LG_LARGE_MINCLASS), 0);
 	assert_ptr_not_null(q, "Unexpected mallocx() failure");
 	assert_false(did_prof_dump_open, "Unexpected profile dump");
 
@@ -56,7 +56,7 @@ TEST_BEGIN(test_gdump) {
 	    "Unexpected mallctl failure while enabling prof.gdump");
 	assert(!gdump_old);
 	did_prof_dump_open = false;
-	s = mallocx((1U << LG_LARGE_MINCLASS), 0);
+	s = mallocx((1U << SC_LG_LARGE_MINCLASS), 0);
 	assert_ptr_not_null(q, "Unexpected mallocx() failure");
 	assert_true(did_prof_dump_open, "Expected a profile dump");
 
diff --git a/test/unit/prof_log.c b/test/unit/prof_log.c
new file mode 100644
index 00000000..6a3464b4
--- /dev/null
+++ b/test/unit/prof_log.c
@@ -0,0 +1,146 @@
+#include "test/jemalloc_test.h"
+
+#define N_PARAM 100
+#define N_THREADS 10
+
+static void assert_rep(void) {
+	assert_b_eq(prof_log_rep_check(), false, "Rep check failed");
+}
+
+static void assert_log_empty(void) {
+	assert_zu_eq(prof_log_bt_count(), 0,
+	    "The log has backtraces; it isn't empty");
+	assert_zu_eq(prof_log_thr_count(), 0,
+	    "The log has threads; it isn't empty");
+	assert_zu_eq(prof_log_alloc_count(), 0,
+	    "The log has allocations; it isn't empty");
+}
+
+void *buf[N_PARAM];
+
+static void f(void) {
+	int i;
+	for (i = 0; i < N_PARAM; i++) {
+		buf[i] = malloc(100);
+	}
+	for (i = 0; i < N_PARAM; i++) {
+		free(buf[i]);
+	}
+}
+
+TEST_BEGIN(test_prof_log_many_logs) {
+	int i;
+
+	test_skip_if(!config_prof);
+
+	for (i = 0; i < N_PARAM; i++) {
+		assert_b_eq(prof_log_is_logging(), false,
+		    "Logging shouldn't have started yet");
+		assert_d_eq(mallctl("prof.log_start", NULL, NULL, NULL, 0), 0,
+		    "Unexpected mallctl failure when starting logging");
+		assert_b_eq(prof_log_is_logging(), true,
+		    "Logging should be started by now");
+		assert_log_empty();
+		assert_rep();
+		f();
+		assert_zu_eq(prof_log_thr_count(), 1, "Wrong thread count");
+		assert_rep();
+		assert_b_eq(prof_log_is_logging(), true,
+		    "Logging should still be on");
+		assert_d_eq(mallctl("prof.log_stop", NULL, NULL, NULL, 0), 0,
+		    "Unexpected mallctl failure when stopping logging");
+		assert_b_eq(prof_log_is_logging(), false,
+		    "Logging should have turned off");
+	}
+}
+TEST_END
+
+thd_t thr_buf[N_THREADS];
+
+static void *f_thread(void *unused) {
+	int i;
+	for (i = 0; i < N_PARAM; i++) {
+		void *p = malloc(100);
+		assert_ptr_not_null(p, "Unexpected malloc() failure");
+		memset(p, 100, sizeof(char));
+		free(p);
+	}
+	return NULL;
+}
+
+TEST_BEGIN(test_prof_log_many_threads) {
+
+	test_skip_if(!config_prof);
+
+	int i;
+	assert_d_eq(mallctl("prof.log_start", NULL, NULL, NULL, 0), 0,
+	    "Unexpected mallctl failure when starting logging");
+	for (i = 0; i < N_THREADS; i++) {
+		thd_create(&thr_buf[i], &f_thread, NULL);
+	}
+
+	for (i = 0; i < N_THREADS; i++) {
+		thd_join(thr_buf[i], NULL);
+	}
+	assert_zu_eq(prof_log_thr_count(), N_THREADS,
+	    "Wrong number of thread entries");
+	assert_rep();
+	assert_d_eq(mallctl("prof.log_stop", NULL, NULL, NULL, 0), 0,
+	    "Unexpected mallctl failure when stopping logging");
+}
+TEST_END
+
+static void f3(void) {
+	void *p = malloc(100);
+	free(p);
+}
+
+static void f1(void) {
+	void *p = malloc(100);
+	f3();
+	free(p);
+}
+
+static void f2(void) {
+	void *p = malloc(100);
+	free(p);
+}
+
+TEST_BEGIN(test_prof_log_many_traces) {
+
+	test_skip_if(!config_prof);
+
+	assert_d_eq(mallctl("prof.log_start", NULL, NULL, NULL, 0), 0,
+	    "Unexpected mallctl failure when starting logging");
+	int i;
+	assert_rep();
+	assert_log_empty();
+	for (i = 0; i < N_PARAM; i++) {
+		assert_rep();
+		f1();
+		assert_rep();
+		f2();
+		assert_rep();
+		f3();
+		assert_rep();
+	}
+	/*
+	 * There should be 8 total backtraces: two for malloc/free in f1(),
+	 * two for malloc/free in f2(), two for malloc/free in f3(), and then
+	 * two for malloc/free in f1()'s call to f3().
+	 */
+	assert_zu_eq(prof_log_bt_count(), 8,
+	    "Wrong number of backtraces given sample workload");
+	assert_d_eq(mallctl("prof.log_stop", NULL, NULL, NULL, 0), 0,
+	    "Unexpected mallctl failure when stopping logging");
+}
+TEST_END
+
+int
+main(void) {
+	prof_log_dummy_set(true);
+	return test_no_reentrancy(
+	    test_prof_log_many_logs,
+	    test_prof_log_many_traces,
+	    test_prof_log_many_threads);
+}
diff --git a/test/unit/prof_log.sh b/test/unit/prof_log.sh
new file mode 100644
index 00000000..8fcc7d8a
--- /dev/null
+++ b/test/unit/prof_log.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+if [ "x${enable_prof}" = "x1" ] ; then
+  export MALLOC_CONF="prof:true,lg_prof_sample:0"
+fi
diff --git a/test/unit/rtree.c b/test/unit/rtree.c
index 908100fa..b017bc03 100644
--- a/test/unit/rtree.c
+++ b/test/unit/rtree.c
@@ -85,10 +85,10 @@ TEST_END
 
 TEST_BEGIN(test_rtree_extrema) {
 	extent_t extent_a, extent_b;
-	extent_init(&extent_a, NULL, NULL, LARGE_MINCLASS, false,
-	    sz_size2index(LARGE_MINCLASS), 0, extent_state_active, false,
-	    false, true);
-	extent_init(&extent_b, NULL, NULL, 0, false, NSIZES, 0,
+	extent_init(&extent_a, NULL, NULL, SC_LARGE_MINCLASS, false,
+	    sz_size2index(SC_LARGE_MINCLASS), 0,
+	    extent_state_active, false, false, true);
+	extent_init(&extent_b, NULL, NULL, 0, false, SC_NSIZES, 0,
 	    extent_state_active, false, false, true);
 
 	tsdn_t *tsdn = tsdn_fetch();
@@ -125,7 +125,7 @@ TEST_BEGIN(test_rtree_bits) {
 	    PAGE + (((uintptr_t)1) << LG_PAGE) - 1};
 
 	extent_t extent;
-	extent_init(&extent, NULL, NULL, 0, false, NSIZES, 0,
+	extent_init(&extent, NULL, NULL, 0, false, SC_NSIZES, 0,
 	    extent_state_active, false, false, true);
 
 	rtree_t *rtree = &test_rtree;
@@ -135,7 +135,7 @@ TEST_BEGIN(test_rtree_bits) {
 
 	for (unsigned i = 0; i < sizeof(keys)/sizeof(uintptr_t); i++) {
 		assert_false(rtree_write(tsdn, rtree, &rtree_ctx, keys[i],
-		    &extent, NSIZES, false),
+		    &extent, SC_NSIZES, false),
 		    "Unexpected rtree_write() failure");
 		for (unsigned j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) {
 			assert_ptr_eq(rtree_extent_read(tsdn, rtree, &rtree_ctx,
@@ -166,7 +166,7 @@ TEST_BEGIN(test_rtree_random) {
 	rtree_ctx_data_init(&rtree_ctx);
 
 	extent_t extent;
-	extent_init(&extent, NULL, NULL, 0, false, NSIZES, 0,
+	extent_init(&extent, NULL, NULL, 0, false, SC_NSIZES, 0,
 	    extent_state_active, false, false, true);
 
 	assert_false(rtree_new(rtree, false), "Unexpected rtree_new() failure");
@@ -177,7 +177,8 @@ TEST_BEGIN(test_rtree_random) {
 		    &rtree_ctx, keys[i], false, true);
 		assert_ptr_not_null(elm,
 		    "Unexpected rtree_leaf_elm_lookup() failure");
-		rtree_leaf_elm_write(tsdn, rtree, elm, &extent, NSIZES, false);
+		rtree_leaf_elm_write(tsdn, rtree, elm, &extent, SC_NSIZES,
+		    false);
 		assert_ptr_eq(rtree_extent_read(tsdn, rtree, &rtree_ctx,
 		    keys[i], true), &extent,
 		    "rtree_extent_read() should return previously set value");
diff --git a/test/unit/sc.c b/test/unit/sc.c
new file mode 100644
index 00000000..bf51d8e5
--- /dev/null
+++ b/test/unit/sc.c
@@ -0,0 +1,33 @@
+#include "test/jemalloc_test.h"
+
+TEST_BEGIN(test_update_slab_size) {
+	sc_data_t data;
+	memset(&data, 0, sizeof(data));
+	sc_data_init(&data);
+	sc_t *tiny = &data.sc[0];
+	size_t tiny_size = (ZU(1) << tiny->lg_base)
+	    + (ZU(tiny->ndelta) << tiny->lg_delta);
+	size_t pgs_too_big = (tiny_size * BITMAP_MAXBITS + PAGE - 1) / PAGE + 1;
+	sc_data_update_slab_size(&data, tiny_size, tiny_size, (int)pgs_too_big);
+	assert_zu_lt((size_t)tiny->pgs, pgs_too_big, "Allowed excessive pages");
+
+	sc_data_update_slab_size(&data, 1, 10 * PAGE, 1);
+	for (int i = 0; i < data.nbins; i++) {
+		sc_t *sc = &data.sc[i];
+		size_t reg_size = (ZU(1) << sc->lg_base)
+		    + (ZU(sc->ndelta) << sc->lg_delta);
+		if (reg_size <= PAGE) {
+			assert_d_eq(sc->pgs, 1, "Ignored valid page size hint");
+		} else {
+			assert_d_gt(sc->pgs, 1,
+			    "Allowed invalid page size hint");
+		}
+	}
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    test_update_slab_size);
+}
diff --git a/test/unit/seq.c b/test/unit/seq.c
new file mode 100644
index 00000000..19613b0b
--- /dev/null
+++ b/test/unit/seq.c
@@ -0,0 +1,95 @@
+#include "test/jemalloc_test.h"
+
+#include "jemalloc/internal/seq.h"
+
+typedef struct data_s data_t;
+struct data_s {
+	int arr[10];
+};
+
+static void
+set_data(data_t *data, int num) {
+	for (int i = 0; i < 10; i++) {
+		data->arr[i] = num;
+	}
+}
+
+static void
+assert_data(data_t *data) {
+	int num = data->arr[0];
+	for (int i = 0; i < 10; i++) {
+		assert_d_eq(num, data->arr[i], "Data consistency error");
+	}
+}
+
+seq_define(data_t, data)
+
+typedef struct thd_data_s thd_data_t;
+struct thd_data_s {
+	seq_data_t data;
+};
+
+static void *
+seq_reader_thd(void *arg) {
+	thd_data_t *thd_data = (thd_data_t *)arg;
+	int iter = 0;
+	data_t local_data;
+	while (iter < 1000 * 1000 - 1) {
+		bool success = seq_try_load_data(&local_data, &thd_data->data);
+		if (success) {
+			assert_data(&local_data);
+			assert_d_le(iter, local_data.arr[0],
+			    "Seq read went back in time.");
+			iter = local_data.arr[0];
+		}
+	}
+	return NULL;
+}
+
+static void *
+seq_writer_thd(void *arg) {
+	thd_data_t *thd_data = (thd_data_t *)arg;
+	data_t local_data;
+	memset(&local_data, 0, sizeof(local_data));
+	for (int i = 0; i < 1000 * 1000; i++) {
+		set_data(&local_data, i);
+		seq_store_data(&thd_data->data, &local_data);
+	}
+	return NULL;
+}
+
+TEST_BEGIN(test_seq_threaded) {
+	thd_data_t thd_data;
+	memset(&thd_data, 0, sizeof(thd_data));
+
+	thd_t reader;
+	thd_t writer;
+
+	thd_create(&reader, seq_reader_thd, &thd_data);
+	thd_create(&writer, seq_writer_thd, &thd_data);
+
+	thd_join(reader, NULL);
+	thd_join(writer, NULL);
+}
+TEST_END
+
+TEST_BEGIN(test_seq_simple) {
+	data_t data;
+	seq_data_t seq;
+	memset(&seq, 0, sizeof(seq));
+	for (int i = 0; i < 1000 * 1000; i++) {
+		set_data(&data, i);
+		seq_store_data(&seq, &data);
+		set_data(&data, 0);
+		bool success = seq_try_load_data(&data, &seq);
+		assert_b_eq(success, true, "Failed non-racing read");
+		assert_data(&data);
+	}
+}
+TEST_END
+
+int main(void) {
+	return test_no_reentrancy(
+	    test_seq_simple,
+	    test_seq_threaded);
+}
diff --git a/test/unit/size_classes.c b/test/unit/size_classes.c
index bcff5609..69473363 100644
--- a/test/unit/size_classes.c
+++ b/test/unit/size_classes.c
@@ -108,8 +108,13 @@ TEST_BEGIN(test_psize_classes) {
 		    size_class, sz_psz2ind(size_class),
 		    sz_pind2sz(sz_psz2ind(size_class)));
 
-		assert_u_eq(pind+1, sz_psz2ind(size_class+1),
-		    "Next size_class does not round up properly");
+		if (size_class == SC_LARGE_MAXCLASS) {
+			assert_u_eq(SC_NPSIZES, sz_psz2ind(size_class + 1),
+			    "Next size_class does not round up properly");
+		} else {
+			assert_u_eq(pind + 1, sz_psz2ind(size_class + 1),
+			    "Next size_class does not round up properly");
+		}
 
 		assert_zu_eq(size_class, (pind > 0) ?
 		    sz_psz2u(sz_pind2sz(pind-1)+1) : sz_psz2u(1),
@@ -142,11 +147,11 @@ TEST_BEGIN(test_overflow) {
 	max_size_class = get_max_size_class();
 	max_psz = max_size_class + PAGE;
 
-	assert_u_eq(sz_size2index(max_size_class+1), NSIZES,
+	assert_u_eq(sz_size2index(max_size_class+1), SC_NSIZES,
 	    "sz_size2index() should return NSIZES on overflow");
-	assert_u_eq(sz_size2index(ZU(PTRDIFF_MAX)+1), NSIZES,
+	assert_u_eq(sz_size2index(ZU(PTRDIFF_MAX)+1), SC_NSIZES,
 	    "sz_size2index() should return NSIZES on overflow");
-	assert_u_eq(sz_size2index(SIZE_T_MAX), NSIZES,
+	assert_u_eq(sz_size2index(SIZE_T_MAX), SC_NSIZES,
 	    "sz_size2index() should return NSIZES on overflow");
 
 	assert_zu_eq(sz_s2u(max_size_class+1), 0,
@@ -156,11 +161,11 @@ TEST_BEGIN(test_overflow) {
 	assert_zu_eq(sz_s2u(SIZE_T_MAX), 0,
 	    "sz_s2u() should return 0 on overflow");
 
-	assert_u_eq(sz_psz2ind(max_size_class+1), NPSIZES,
+	assert_u_eq(sz_psz2ind(max_size_class+1), SC_NPSIZES,
 	    "sz_psz2ind() should return NPSIZES on overflow");
-	assert_u_eq(sz_psz2ind(ZU(PTRDIFF_MAX)+1), NPSIZES,
+	assert_u_eq(sz_psz2ind(ZU(PTRDIFF_MAX)+1), SC_NPSIZES,
 	    "sz_psz2ind() should return NPSIZES on overflow");
-	assert_u_eq(sz_psz2ind(SIZE_T_MAX), NPSIZES,
+	assert_u_eq(sz_psz2ind(SIZE_T_MAX), SC_NPSIZES,
 	    "sz_psz2ind() should return NPSIZES on overflow");
 
 	assert_zu_eq(sz_psz2u(max_size_class+1), max_psz,
diff --git a/test/unit/slab.c b/test/unit/slab.c
index 7e662aed..ef718821 100644
--- a/test/unit/slab.c
+++ b/test/unit/slab.c
@@ -3,7 +3,7 @@
 TEST_BEGIN(test_arena_slab_regind) {
 	szind_t binind;
 
-	for (binind = 0; binind < NBINS; binind++) {
+	for (binind = 0; binind < SC_NBINS; binind++) {
 		size_t regind;
 		extent_t slab;
 		const bin_info_t *bin_info = &bin_infos[binind];
diff --git a/test/unit/stats.c b/test/unit/stats.c
index 231010e4..4323bfa3 100644
--- a/test/unit/stats.c
+++ b/test/unit/stats.c
@@ -33,7 +33,7 @@ TEST_BEGIN(test_stats_large) {
 	size_t sz;
 	int expected = config_stats ? 0 : ENOENT;
 
-	p = mallocx(SMALL_MAXCLASS+1, MALLOCX_ARENA(0));
+	p = mallocx(SC_SMALL_MAXCLASS + 1, MALLOCX_ARENA(0));
 	assert_ptr_not_null(p, "Unexpected mallocx() failure");
 
 	assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)),
@@ -74,9 +74,10 @@ TEST_BEGIN(test_stats_arenas_summary) {
 	uint64_t dirty_npurge, dirty_nmadvise, dirty_purged;
 	uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged;
 
-	little = mallocx(SMALL_MAXCLASS, MALLOCX_ARENA(0));
+	little = mallocx(SC_SMALL_MAXCLASS, MALLOCX_ARENA(0));
 	assert_ptr_not_null(little, "Unexpected mallocx() failure");
-	large = mallocx((1U << LG_LARGE_MINCLASS), MALLOCX_ARENA(0));
+	large = mallocx((1U << SC_LG_LARGE_MINCLASS),
+	    MALLOCX_ARENA(0));
 	assert_ptr_not_null(large, "Unexpected mallocx() failure");
 
 	dallocx(little, 0);
@@ -148,7 +149,7 @@ TEST_BEGIN(test_stats_arenas_small) {
 
 	no_lazy_lock(); /* Lazy locking would dodge tcache testing. */
 
-	p = mallocx(SMALL_MAXCLASS, MALLOCX_ARENA(0));
+	p = mallocx(SC_SMALL_MAXCLASS, MALLOCX_ARENA(0));
 	assert_ptr_not_null(p, "Unexpected mallocx() failure");
 
 	assert_d_eq(mallctl("thread.tcache.flush", NULL, NULL, NULL, 0),
@@ -191,7 +192,7 @@ TEST_BEGIN(test_stats_arenas_large) {
 	uint64_t epoch, nmalloc, ndalloc;
 	int expected = config_stats ? 0 : ENOENT;
 
-	p = mallocx((1U << LG_LARGE_MINCLASS), MALLOCX_ARENA(0));
+	p = mallocx((1U << SC_LG_LARGE_MINCLASS), MALLOCX_ARENA(0));
 	assert_ptr_not_null(p, "Unexpected mallocx() failure");
 
 	assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)),
diff --git a/test/unit/hooks.c b/test/unit/test_hooks.c
similarity index 82%
rename from test/unit/hooks.c
rename to test/unit/test_hooks.c
index b70172e1..ded8698b 100644
--- a/test/unit/hooks.c
+++ b/test/unit/test_hooks.c
@@ -12,10 +12,10 @@ func_to_hook(int arg1, int arg2) {
 	return arg1 + arg2;
 }
 
-#define func_to_hook JEMALLOC_HOOK(func_to_hook, hooks_libc_hook)
+#define func_to_hook JEMALLOC_HOOK(func_to_hook, test_hooks_libc_hook)
 
 TEST_BEGIN(unhooked_call) {
-	hooks_libc_hook = NULL;
+	test_hooks_libc_hook = NULL;
 	hook_called = false;
 	assert_d_eq(3, func_to_hook(1, 2), "Hooking changed return value.");
 	assert_false(hook_called, "Nulling out hook didn't take.");
@@ -23,7 +23,7 @@ TEST_BEGIN(unhooked_call) {
 TEST_END
 
 TEST_BEGIN(hooked_call) {
-	hooks_libc_hook = &hook;
+	test_hooks_libc_hook = &hook;
 	hook_called = false;
 	assert_d_eq(3, func_to_hook(1, 2), "Hooking changed return value.");
 	assert_true(hook_called, "Hook should have executed.");
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 6c479139..917884dc 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -1,5 +1,10 @@
 #include "test/jemalloc_test.h"
 
+/*
+ * If we're e.g. in debug mode, we *never* enter the fast path, and so shouldn't
+ * be asserting that we're on one.
+ */
+static bool originally_fast;
 static int data_cleanup_count;
 
 void
@@ -98,11 +103,11 @@ thd_start_reincarnated(void *arg) {
 	tsd_cleanup((void *)tsd);
 	assert_ptr_null(*tsd_arenap_get_unsafe(tsd),
 	    "TSD arena should have been cleared.");
-	assert_u_eq(tsd->state, tsd_state_purgatory,
+	assert_u_eq(tsd_state_get(tsd), tsd_state_purgatory,
 	    "TSD state should be purgatory\n");
 
 	free(p);
-	assert_u_eq(tsd->state, tsd_state_reincarnated,
+	assert_u_eq(tsd_state_get(tsd), tsd_state_reincarnated,
 	    "TSD state should be reincarnated\n");
 	p = mallocx(1, MALLOCX_TCACHE_NONE);
 	assert_ptr_not_null(p, "Unexpected malloc() failure");
@@ -124,6 +129,128 @@ TEST_BEGIN(test_tsd_reincarnation) {
 }
 TEST_END
 
+typedef struct {
+	atomic_u32_t phase;
+	atomic_b_t error;
+} global_slow_data_t;
+
+static void *
+thd_start_global_slow(void *arg) {
+	/* PHASE 0 */
+	global_slow_data_t *data = (global_slow_data_t *)arg;
+	free(mallocx(1, 0));
+
+	tsd_t *tsd = tsd_fetch();
+	/*
+	 * No global slowness has happened yet; there was an error if we were
+	 * originally fast but aren't now.
+	 */
+	atomic_store_b(&data->error, originally_fast && !tsd_fast(tsd),
+	    ATOMIC_SEQ_CST);
+	atomic_store_u32(&data->phase, 1, ATOMIC_SEQ_CST);
+
+	/* PHASE 2 */
+	while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 2) {
+	}
+	free(mallocx(1, 0));
+	atomic_store_b(&data->error, tsd_fast(tsd), ATOMIC_SEQ_CST);
+	atomic_store_u32(&data->phase, 3, ATOMIC_SEQ_CST);
+
+	/* PHASE 4 */
+	while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 4) {
+	}
+	free(mallocx(1, 0));
+	atomic_store_b(&data->error, tsd_fast(tsd), ATOMIC_SEQ_CST);
+	atomic_store_u32(&data->phase, 5, ATOMIC_SEQ_CST);
+
+	/* PHASE 6 */
+	while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 6) {
+	}
+	free(mallocx(1, 0));
+	/* Only one decrement so far. */
+	atomic_store_b(&data->error, tsd_fast(tsd), ATOMIC_SEQ_CST);
+	atomic_store_u32(&data->phase, 7, ATOMIC_SEQ_CST);
+
+	/* PHASE 8 */
+	while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 8) {
+	}
+	free(mallocx(1, 0));
+	/*
+	 * Both decrements happened; we should be fast again (if we ever
+	 * were)
+	 */
+	atomic_store_b(&data->error, originally_fast && !tsd_fast(tsd),
+	    ATOMIC_SEQ_CST);
+	atomic_store_u32(&data->phase, 9, ATOMIC_SEQ_CST);
+
+	return NULL;
+}
+
+TEST_BEGIN(test_tsd_global_slow) {
+	global_slow_data_t data = {ATOMIC_INIT(0), ATOMIC_INIT(false)};
+	/*
+	 * Note that the "mallocx" here (vs. malloc) is important, since the
+	 * compiler is allowed to optimize away free(malloc(1)) but not
+	 * free(mallocx(1)).
+	 */
+	free(mallocx(1, 0));
+	tsd_t *tsd = tsd_fetch();
+	originally_fast = tsd_fast(tsd);
+
+	thd_t thd;
+	thd_create(&thd, thd_start_global_slow, (void *)&data);
+	/* PHASE 1 */
+	while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 1) {
+		/*
+		 * We don't have a portable condvar/semaphore mechanism.
+		 * Spin-wait.
+		 */
+	}
+	assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), "");
+	tsd_global_slow_inc(tsd_tsdn(tsd));
+	free(mallocx(1, 0));
+	assert_false(tsd_fast(tsd), "");
+	atomic_store_u32(&data.phase, 2, ATOMIC_SEQ_CST);
+
+	/* PHASE 3 */
+	while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 3) {
+	}
+	assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), "");
+	/* Increase again, so that we can test multiple fast/slow changes. */
+	tsd_global_slow_inc(tsd_tsdn(tsd));
+	atomic_store_u32(&data.phase, 4, ATOMIC_SEQ_CST);
+	free(mallocx(1, 0));
+	assert_false(tsd_fast(tsd), "");
+
+	/* PHASE 5 */
+	while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 5) {
+	}
+	assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), "");
+	tsd_global_slow_dec(tsd_tsdn(tsd));
+	atomic_store_u32(&data.phase, 6, ATOMIC_SEQ_CST);
+	/* We only decreased once; things should still be slow. */
+	free(mallocx(1, 0));
+	assert_false(tsd_fast(tsd), "");
+
+	/* PHASE 7 */
+	while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 7) {
+	}
+	assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), "");
+	tsd_global_slow_dec(tsd_tsdn(tsd));
+	atomic_store_u32(&data.phase, 8, ATOMIC_SEQ_CST);
+	/* We incremented and then decremented twice; we should be fast now. */
+	free(mallocx(1, 0));
+	assert_true(!originally_fast || tsd_fast(tsd), "");
+
+	/* PHASE 9 */
+	while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 9) {
+	}
+	assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), "");
+
+	thd_join(thd, NULL);
+}
+TEST_END
+
 int
 main(void) {
 	/* Ensure tsd bootstrapped. */
@@ -135,5 +262,6 @@ main(void) {
 	return test_no_reentrancy(
 	    test_tsd_main_thread,
 	    test_tsd_sub_thread,
-	    test_tsd_reincarnation);
+	    test_tsd_reincarnation,
+	    test_tsd_global_slow);
 }
diff --git a/test/unit/zero.c b/test/unit/zero.c
index 553692ba..271fd5cb 100644
--- a/test/unit/zero.c
+++ b/test/unit/zero.c
@@ -41,13 +41,13 @@ test_zero(size_t sz_min, size_t sz_max) {
 
 TEST_BEGIN(test_zero_small) {
 	test_skip_if(!config_fill);
-	test_zero(1, SMALL_MAXCLASS-1);
+	test_zero(1, SC_SMALL_MAXCLASS - 1);
 }
 TEST_END
 
 TEST_BEGIN(test_zero_large) {
 	test_skip_if(!config_fill);
-	test_zero(SMALL_MAXCLASS+1, (1U << (LG_LARGE_MINCLASS+1)));
+	test_zero(SC_SMALL_MAXCLASS + 1, 1U << (SC_LG_LARGE_MINCLASS + 1));
 }
 TEST_END