Merge branch 'dev'

2015-08-17 13:23:29 -07:00 · 2015-08-17 13:23:29 -07:00 · 6e98caf8f0
commit 6e98caf8f0
parent 46c0af68bd 9b68f67223
131 changed files with 16929 additions and 9276 deletions
--- a/.autom4te.cfg
+++ b/.autom4te.cfg
@ -0,0 +1,3 @@
+begin-language: "Autoconf-without-aclocal-m4"
+args: --no-cache
+end-language: "Autoconf-without-aclocal-m4"
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1 @@
+* text=auto eol=lf
--- a/.gitignore
+++ b/.gitignore
@ -1,8 +1,8 @@
 /*.gcov.*

-/autom4te.cache/
-
+/bin/jemalloc-config
 /bin/jemalloc.sh
+/bin/jeprof

 /config.stamp
 /config.log
@ -15,6 +15,8 @@
 /doc/jemalloc.html
 /doc/jemalloc.3

+/jemalloc.pc
+
 /lib/

 /Makefile
@ -35,6 +37,7 @@
 /include/jemalloc/jemalloc_protos.h
 /include/jemalloc/jemalloc_protos_jet.h
 /include/jemalloc/jemalloc_rename.h
+/include/jemalloc/jemalloc_typedefs.h

 /src/*.[od]
 /src/*.gcda
--- a/4
+++ b/4
@ -1,10 +1,10 @@
 Unless otherwise specified, files in the jemalloc source distribution are
 subject to the following license:
 --------------------------------------------------------------------------------
-Copyright (C) 2002-2014 Jason Evans <jasone@canonware.com>.
+Copyright (C) 2002-2015 Jason Evans <jasone@canonware.com>.
 All rights reserved.
 Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
-Copyright (C) 2009-2014 Facebook, Inc.  All rights reserved.
+Copyright (C) 2009-2015 Facebook, Inc.  All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
--- a/166
+++ b/166
@ -1,10 +1,166 @@
 Following are change highlights associated with official releases.  Important
-bug fixes are all mentioned, but internal enhancements are omitted here for
-brevity (even though they are more fun to write about).  Much more detail can be
-found in the git revision history:
+bug fixes are all mentioned, but some internal enhancements are omitted here for
+brevity.  Much more detail can be found in the git revision history:

    https://github.com/jemalloc/jemalloc

+* 4.0.0 (August 17, 2015)
+
+  This version contains many speed and space optimizations, both minor and
+  major.  The major themes are generalization, unification, and simplification.
+  Although many of these optimizations cause no visible behavior change, their
+  cumulative effect is substantial.
+
+  New features:
+  - Normalize size class spacing to be consistent across the complete size
+    range.  By default there are four size classes per size doubling, but this
+    is now configurable via the --with-lg-size-class-group option.  Also add the
+    --with-lg-page, --with-lg-page-sizes, --with-lg-quantum, and
+    --with-lg-tiny-min options, which can be used to tweak page and size class
+    settings.  Impacts:
+    + Worst case performance for incrementally growing/shrinking reallocation
+      is improved because there are far fewer size classes, and therefore
+      copying happens less often.
+    + Internal fragmentation is limited to 20% for all but the smallest size
+      classes (those less than four times the quantum).  (1B + 4 KiB)
+      and (1B + 4 MiB) previously suffered nearly 50% internal fragmentation.
+    + Chunk fragmentation tends to be lower because there are fewer distinct run
+      sizes to pack.
+  - Add support for explicit tcaches.  The "tcache.create", "tcache.flush", and
+    "tcache.destroy" mallctls control tcache lifetime and flushing, and the
+    MALLOCX_TCACHE(tc) and MALLOCX_TCACHE_NONE flags to the *allocx() API
+    control which tcache is used for each operation.
+  - Implement per thread heap profiling, as well as the ability to
+    enable/disable heap profiling on a per thread basis.  Add the "prof.reset",
+    "prof.lg_sample", "thread.prof.name", "thread.prof.active",
+    "opt.prof_thread_active_init", and "prof.thread_active_init"
+    mallctls.
+  - Add support for per arena application-specified chunk allocators, configured
+    via the "arena.<i>.chunk_hooks" mallctl.
+  - Refactor huge allocation to be managed by arenas, so that arenas now
+    function as general purpose independent allocators.  This is important in
+    the context of user-specified chunk allocators, aside from the scalability
+    benefits.  Related new statistics:
+    + The "stats.arenas.<i>.huge.allocated", "stats.arenas.<i>.huge.nmalloc",
+      "stats.arenas.<i>.huge.ndalloc", and "stats.arenas.<i>.huge.nrequests"
+      mallctls provide high level per arena huge allocation statistics.
+    + The "arenas.nhchunks", "arenas.hchunk.<i>.size",
+      "stats.arenas.<i>.hchunks.<j>.nmalloc",
+      "stats.arenas.<i>.hchunks.<j>.ndalloc",
+      "stats.arenas.<i>.hchunks.<j>.nrequests", and
+      "stats.arenas.<i>.hchunks.<j>.curhchunks" mallctls provide per size class
+      statistics.
+  - Add the 'util' column to malloc_stats_print() output, which reports the
+    proportion of available regions that are currently in use for each small
+    size class.
+  - Add "alloc" and "free" modes for junk filling (see the "opt.junk"
+    mallctl), so that it is possible to separately enable junk filling for
+    allocation versus deallocation.
+  - Add the jemalloc-config script, which provides information about how
+    jemalloc was configured, and how to integrate it into application builds.
+  - Add metadata statistics, which are accessible via the "stats.metadata",
+    "stats.arenas.<i>.metadata.mapped", and
+    "stats.arenas.<i>.metadata.allocated" mallctls.
+  - Add the "stats.resident" mallctl, which reports the upper limit of
+    physically resident memory mapped by the allocator.
+  - Add per arena control over unused dirty page purging, via the
+    "arenas.lg_dirty_mult", "arena.<i>.lg_dirty_mult", and
+    "stats.arenas.<i>.lg_dirty_mult" mallctls.
+  - Add the "prof.gdump" mallctl, which makes it possible to toggle the gdump
+    feature on/off during program execution.
+  - Add sdallocx(), which implements sized deallocation.  The primary
+    optimization over dallocx() is the removal of a metadata read, which often
+    suffers an L1 cache miss.
+  - Add missing header includes in jemalloc/jemalloc.h, so that applications
+    only have to #include <jemalloc/jemalloc.h>.
+  - Add support for additional platforms:
+    + Bitrig
+    + Cygwin
+    + DragonFlyBSD
+    + iOS
+    + OpenBSD
+    + OpenRISC/or1k
+
+  Optimizations:
+  - Maintain dirty runs in per arena LRUs rather than in per arena trees of
+    dirty-run-containing chunks.  In practice this change significantly reduces
+    dirty page purging volume.
+  - Integrate whole chunks into the unused dirty page purging machinery.  This
+    reduces the cost of repeated huge allocation/deallocation, because it
+    effectively introduces a cache of chunks.
+  - Split the arena chunk map into two separate arrays, in order to increase
+    cache locality for the frequently accessed bits.
+  - Move small run metadata out of runs, into arena chunk headers.  This reduces
+    run fragmentation, smaller runs reduce external fragmentation for small size
+    classes, and packed (less uniformly aligned) metadata layout improves CPU
+    cache set distribution.
+  - Randomly distribute large allocation base pointer alignment relative to page
+    boundaries in order to more uniformly utilize CPU cache sets.  This can be
+    disabled via the --disable-cache-oblivious configure option, and queried via
+    the "config.cache_oblivious" mallctl.
+  - Micro-optimize the fast paths for the public API functions.
+  - Refactor thread-specific data to reside in a single structure.  This assures
+    that only a single TLS read is necessary per call into the public API.
+  - Implement in-place huge allocation growing and shrinking.
+  - Refactor rtree (radix tree for chunk lookups) to be lock-free, and make
+    additional optimizations that reduce maximum lookup depth to one or two
+    levels.  This resolves what was a concurrency bottleneck for per arena huge
+    allocation, because a global data structure is critical for determining
+    which arenas own which huge allocations.
+
+  Incompatible changes:
+  - Replace --enable-cc-silence with --disable-cc-silence to suppress spurious
+    warnings by default.
+  - Assure that the constness of malloc_usable_size()'s return type matches that
+    of the system implementation.
+  - Change the heap profile dump format to support per thread heap profiling,
+    rename pprof to jeprof, and enhance it with the --thread=<n> option.  As a
+    result, the bundled jeprof must now be used rather than the upstream
+    (gperftools) pprof.
+  - Disable "opt.prof_final" by default, in order to avoid atexit(3), which can
+    internally deadlock on some platforms.
+  - Change the "arenas.nlruns" mallctl type from size_t to unsigned.
+  - Replace the "stats.arenas.<i>.bins.<j>.allocated" mallctl with
+    "stats.arenas.<i>.bins.<j>.curregs".
+  - Ignore MALLOC_CONF in set{uid,gid,cap} binaries.
+  - Ignore MALLOCX_ARENA(a) in dallocx(), in favor of using the
+    MALLOCX_TCACHE(tc) and MALLOCX_TCACHE_NONE flags to control tcache usage.
+
+  Removed features:
+  - Remove the *allocm() API, which is superseded by the *allocx() API.
+  - Remove the --enable-dss options, and make dss non-optional on all platforms
+    which support sbrk(2).
+  - Remove the "arenas.purge" mallctl, which was obsoleted by the
+    "arena.<i>.purge" mallctl in 3.1.0.
+  - Remove the unnecessary "opt.valgrind" mallctl; jemalloc automatically
+    detects whether it is running inside Valgrind.
+  - Remove the "stats.huge.allocated", "stats.huge.nmalloc", and
+    "stats.huge.ndalloc" mallctls.
+  - Remove the --enable-mremap option.
+  - Remove the "stats.chunks.current", "stats.chunks.total", and
+    "stats.chunks.high" mallctls.
+
+  Bug fixes:
+  - Fix the cactive statistic to decrease (rather than increase) when active
+    memory decreases.  This regression was first released in 3.5.0.
+  - Fix OOM handling in memalign() and valloc().  A variant of this bug existed
+    in all releases since 2.0.0, which introduced these functions.
+  - Fix an OOM-related regression in arena_tcache_fill_small(), which could
+    cause cache corruption on OOM.  This regression was present in all releases
+    from 2.2.0 through 3.6.0.
+  - Fix size class overflow handling for malloc(), posix_memalign(), memalign(),
+    calloc(), and realloc() when profiling is enabled.
+  - Fix the "arena.<i>.dss" mallctl to return an error if "primary" or
+    "secondary" precedence is specified, but sbrk(2) is not supported.
+  - Fix fallback lg_floor() implementations to handle extremely large inputs.
+  - Ensure the default purgeable zone is after the default zone on OS X.
+  - Fix latent bugs in atomic_*().
+  - Fix the "arena.<i>.dss" mallctl to handle read-only calls.
+  - Fix tls_model configuration to enable the initial-exec model when possible.
+  - Mark malloc_conf as a weak symbol so that the application can override it.
+  - Correctly detect glibc's adaptive pthread mutexes.
+  - Fix the --without-export configure option.
+
 * 3.6.0 (March 31, 2014)

  This version contains a critical bug fix for a regression present in 3.5.0 and
@ -21,7 +177,7 @@ found in the git revision history:
    backtracing to be reliable.
  - Use dss allocation precedence for huge allocations as well as small/large
    allocations.
-  - Fix test assertion failure message formatting.  This bug did not manifect on
+  - Fix test assertion failure message formatting.  This bug did not manifest on
    x86_64 systems because of implementation subtleties in va_list.
  - Fix inconsequential test failures for hash and SFMT code.

@ -516,7 +672,7 @@ found in the git revision history:
  - Make it possible for the application to manually flush a thread's cache, via
    the "tcache.flush" mallctl.
  - Base maximum dirty page count on proportion of active memory.
-  - Compute various addtional run-time statistics, including per size class
+  - Compute various additional run-time statistics, including per size class
    statistics for large objects.
  - Expose malloc_stats_print(), which can be called repeatedly by the
    application.
--- a/144
+++ b/144
@ -1,10 +1,23 @@
-Building and installing jemalloc can be as simple as typing the following while
-in the root directory of the source tree:
+Building and installing a packaged release of jemalloc can be as simple as
+typing the following while in the root directory of the source tree:

    ./configure
    make
    make install

+If building from unpackaged developer sources, the simplest command sequence
+that might work is:
+
+    ./autogen.sh
+    make dist
+    make
+    make install
+
+Note that documentation is not built by the default target because doing so
+would create a dependency on xsltproc in packaged releases, hence the
+requirement to either run 'make dist' or avoid installing docs via the various
+install_* targets documented below.
+
 === Advanced configuration =====================================================

 The 'configure' script supports numerous options that allow control of which
@ -71,10 +84,10 @@ any of the following arguments (not a definitive list) to 'configure':
    versions of jemalloc can coexist in the same installation directory.  For
    example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0.

--enable-cc-silence
-    Enable code that silences non-useful compiler warnings.  This is helpful
-    when trying to tell serious warnings from those due to compiler
-    limitations, but it potentially incurs a performance penalty.
+--disable-cc-silence
+    Disable code that silences non-useful compiler warnings.  This is mainly
+    useful during development when auditing the set of warnings that are being
+    silenced.

 --enable-debug
    Enable assertions and validation code.  This incurs a substantial
@ -94,15 +107,15 @@ any of the following arguments (not a definitive list) to 'configure':
    there are interactions between the various coverage targets, so it is
    usually advisable to run 'make clean' between repeated code coverage runs.

--enable-ivsalloc
-    Enable validation code, which verifies that pointers reside within
-    jemalloc-owned chunks before dereferencing them. This incurs a substantial
-    performance hit.
-
 --disable-stats
    Disable statistics gathering functionality.  See the "opt.stats_print"
    option documentation for usage details.

+--enable-ivsalloc
+    Enable validation code, which verifies that pointers reside within
+    jemalloc-owned chunks before dereferencing them.  This incurs a minor
+    performance hit.
+
 --enable-prof
    Enable heap profiling and leak detection functionality.  See the "opt.prof"
    option documentation for usage details.  When enabled, there are several
@ -132,12 +145,6 @@ any of the following arguments (not a definitive list) to 'configure':
    released in bulk, thus reducing the total number of mutex operations.  See
    the "opt.tcache" option for usage details.

--enable-mremap
-    Enable huge realloc() via mremap(2).  mremap() is disabled by default
-    because the flavor used is specific to Linux, which has a quirk in its
-    virtual memory allocation algorithm that causes semi-permanent VM map holes
-    under normal jemalloc operation.
-
 --disable-munmap
    Disable virtual memory deallocation via munmap(2); instead keep track of
    the virtual memory for later use.  munmap() is disabled by default (i.e.
@ -145,10 +152,6 @@ any of the following arguments (not a definitive list) to 'configure':
    memory allocation algorithm that causes semi-permanent VM map holes under
    normal jemalloc operation.

--enable-dss
-    Enable support for page allocation/deallocation via sbrk(2), in addition to
-    mmap(2).
-
 --disable-fill
    Disable support for junk/zero filling of memory, quarantine, and redzones.
    See the "opt.junk", "opt.zero", "opt.quarantine", and "opt.redzone" option
@ -157,9 +160,6 @@ any of the following arguments (not a definitive list) to 'configure':
 --disable-valgrind
    Disable support for Valgrind.

--disable-experimental
-    Disable support for the experimental API (*allocm()).
-
 --disable-zone-allocator
    Disable zone allocator for Darwin.  This means jemalloc won't be hooked as
    the default allocator on OSX/iOS.
@ -185,10 +185,106 @@ any of the following arguments (not a definitive list) to 'configure':
    thread-local variables via the __thread keyword.  If TLS is available,
    jemalloc uses it for several purposes.

+--disable-cache-oblivious
+    Disable cache-oblivious large allocation alignment for large allocation
+    requests with no alignment constraints.  If this feature is disabled, all
+    large allocations are page-aligned as an implementation artifact, which can
+    severely harm CPU cache utilization.  However, the cache-oblivious layout
+    comes at the cost of one extra page per large allocation, which in the
+    most extreme case increases physical memory usage for the 16 KiB size class
+    to 20 KiB.
+
 --with-xslroot=<path>
    Specify where to find DocBook XSL stylesheets when building the
    documentation.

+--with-lg-page=<lg-page>
+    Specify the base 2 log of the system page size.  This option is only useful
+    when cross compiling, since the configure script automatically determines
+    the host's page size by default.
+
+--with-lg-page-sizes=<lg-page-sizes>
+    Specify the comma-separated base 2 logs of the page sizes to support.  This
+    option may be useful when cross-compiling in combination with
+    --with-lg-page, but its primary use case is for integration with FreeBSD's
+    libc, wherein jemalloc is embedded.
+
+--with-lg-size-class-group=<lg-size-class-group>
+    Specify the base 2 log of how many size classes to use for each doubling in
+    size.  By default jemalloc uses <lg-size-class-group>=2, which results in
+    e.g. the following size classes:
+
+      [...], 64,
+      80, 96, 112, 128,
+      160, [...]
+
+    <lg-size-class-group>=3 results in e.g. the following size classes:
+
+      [...], 64,
+      72, 80, 88, 96, 104, 112, 120, 128,
+      144, [...]
+
+    The minimal <lg-size-class-group>=0 causes jemalloc to only provide size
+    classes that are powers of 2:
+
+      [...],
+      64,
+      128,
+      256,
+      [...]
+
+    An implementation detail currently limits the total number of small size
+    classes to 255, and a compilation error will result if the
+    <lg-size-class-group> you specify cannot be supported.  The limit is
+    roughly <lg-size-class-group>=4, depending on page size.
+
+--with-lg-quantum=<lg-quantum>
+    Specify the base 2 log of the minimum allocation alignment.  jemalloc needs
+    to know the minimum alignment that meets the following C standard
+    requirement (quoted from the April 12, 2011 draft of the C11 standard):
+
+      The pointer returned if the allocation succeeds is suitably aligned so
+      that it may be assigned to a pointer to any type of object with a
+      fundamental alignment requirement and then used to access such an object
+      or an array of such objects in the space allocated [...]
+
+    This setting is architecture-specific, and although jemalloc includes known
+    safe values for the most commonly used modern architectures, there is a
+    wrinkle related to GNU libc (glibc) that may impact your choice of
+    <lg-quantum>.  On most modern architectures, the standard mandates 16-byte
+    alignment (<lg-quantum>=4), but the glibc developers chose not to meet this
+    requirement for performance reasons.  An old discussion can be found at
+    https://sourceware.org/bugzilla/show_bug.cgi?id=206 .  Unlike glibc,
+    jemalloc does follow the C standard by default (caveat: jemalloc
+    technically cheats if --with-lg-tiny-min is smaller than
+    --with-lg-quantum), but the fact that Linux systems already work around
+    this allocator noncompliance means that it is generally safe in practice to
+    let jemalloc's minimum alignment follow glibc's lead.  If you specify
+    --with-lg-quantum=3 during configuration, jemalloc will provide additional
+    size classes that are not 16-byte-aligned (24, 40, and 56, assuming
+    --with-lg-size-class-group=2).
+
+--with-lg-tiny-min=<lg-tiny-min>
+    Specify the base 2 log of the minimum tiny size class to support.  Tiny
+    size classes are powers of 2 less than the quantum, and are only
+    incorporated if <lg-tiny-min> is less than <lg-quantum> (see
+    --with-lg-quantum).  Tiny size classes technically violate the C standard
+    requirement for minimum alignment, and crashes could conceivably result if
+    the compiler were to generate instructions that made alignment assumptions,
+    both because illegal instruction traps could result, and because accesses
+    could straddle page boundaries and cause segmentation faults due to
+    accessing unmapped addresses.
+
+    The default of <lg-tiny-min>=3 works well in practice even on architectures
+    that technically require 16-byte alignment, probably for the same reason
+    --with-lg-quantum=3 works.  Smaller tiny size classes can, and will, cause
+    crashes (see https://bugzilla.mozilla.org/show_bug.cgi?id=691003 for an
+    example).
+
+    This option is rarely useful, and is mainly provided as documentation of a
+    subtle implementation detail.  If you do use this option, specify a
+    value in [3, ..., <lg-quantum>].
+
 The following environment variables (not a definitive list) impact configure's
 behavior:

--- a/Makefile.in
+++ b/Makefile.in
@ -42,14 +42,16 @@ XSLTPROC := @XSLTPROC@
 AUTOCONF := @AUTOCONF@
 _RPATH = @RPATH@
 RPATH = $(if $(1),$(call _RPATH,$(1)))
-cfghdrs_in := @cfghdrs_in@
+cfghdrs_in := $(addprefix $(srcroot),@cfghdrs_in@)
 cfghdrs_out := @cfghdrs_out@
-cfgoutputs_in := @cfgoutputs_in@
+cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@)
 cfgoutputs_out := @cfgoutputs_out@
 enable_autogen := @enable_autogen@
 enable_code_coverage := @enable_code_coverage@
-enable_experimental := @enable_experimental@
+enable_prof := @enable_prof@
+enable_valgrind := @enable_valgrind@
 enable_zone_allocator := @enable_zone_allocator@
+MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF
 DSO_LDFLAGS = @DSO_LDFLAGS@
 SOREV = @SOREV@
 PIC_CFLAGS = @PIC_CFLAGS@
@ -73,16 +75,20 @@ endif
 LIBJEMALLOC := $(LIBPREFIX)jemalloc$(install_suffix)

 # Lists of files.
-BINS := $(srcroot)bin/pprof $(objroot)bin/jemalloc.sh
+BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/jeprof
 C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h
 C_SRCS := $(srcroot)src/jemalloc.c $(srcroot)src/arena.c \
 	$(srcroot)src/atomic.c $(srcroot)src/base.c $(srcroot)src/bitmap.c \
 	$(srcroot)src/chunk.c $(srcroot)src/chunk_dss.c \
 	$(srcroot)src/chunk_mmap.c $(srcroot)src/ckh.c $(srcroot)src/ctl.c \
 	$(srcroot)src/extent.c $(srcroot)src/hash.c $(srcroot)src/huge.c \
-	$(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/prof.c \
-	$(srcroot)src/quarantine.c $(srcroot)src/rtree.c $(srcroot)src/stats.c \
-	$(srcroot)src/tcache.c $(srcroot)src/util.c $(srcroot)src/tsd.c
+	$(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/pages.c \
+	$(srcroot)src/prof.c $(srcroot)src/quarantine.c $(srcroot)src/rtree.c \
+	$(srcroot)src/stats.c $(srcroot)src/tcache.c $(srcroot)src/util.c \
+	$(srcroot)src/tsd.c
+ifeq ($(enable_valgrind), 1)
+C_SRCS += $(srcroot)src/valgrind.c
+endif
 ifeq ($(enable_zone_allocator), 1)
 C_SRCS += $(srcroot)src/zone.c
 endif
@ -98,53 +104,60 @@ DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV)
 ifneq ($(SOREV),$(SO))
 DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO)
 endif
+PC := $(objroot)jemalloc.pc
 MAN3 := $(objroot)doc/jemalloc$(install_suffix).3
 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml
-DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html)
-DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.3)
+DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.html)
+DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.3)
 DOCS := $(DOCS_HTML) $(DOCS_MAN3)
-C_TESTLIB_SRCS := $(srcroot)test/src/math.c $(srcroot)test/src/mtx.c \
+C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \
+	$(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \
+	$(srcroot)test/src/mtx.c $(srcroot)test/src/mq.c \
 	$(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \
-	$(srcroot)test/src/thd.c
+	$(srcroot)test/src/thd.c $(srcroot)test/src/timer.c
 C_UTIL_INTEGRATION_SRCS := $(srcroot)src/util.c
-TESTS_UNIT := $(srcroot)test/unit/bitmap.c \
+TESTS_UNIT := $(srcroot)test/unit/atomic.c \
+	$(srcroot)test/unit/bitmap.c \
 	$(srcroot)test/unit/ckh.c \
 	$(srcroot)test/unit/hash.c \
 	$(srcroot)test/unit/junk.c \
+	$(srcroot)test/unit/junk_alloc.c \
+	$(srcroot)test/unit/junk_free.c \
+	$(srcroot)test/unit/lg_chunk.c \
 	$(srcroot)test/unit/mallctl.c \
 	$(srcroot)test/unit/math.c \
 	$(srcroot)test/unit/mq.c \
 	$(srcroot)test/unit/mtx.c \
 	$(srcroot)test/unit/prof_accum.c \
+	$(srcroot)test/unit/prof_active.c \
 	$(srcroot)test/unit/prof_gdump.c \
 	$(srcroot)test/unit/prof_idump.c \
+	$(srcroot)test/unit/prof_reset.c \
+	$(srcroot)test/unit/prof_thread_name.c \
 	$(srcroot)test/unit/ql.c \
 	$(srcroot)test/unit/qr.c \
 	$(srcroot)test/unit/quarantine.c \
 	$(srcroot)test/unit/rb.c \
 	$(srcroot)test/unit/rtree.c \
 	$(srcroot)test/unit/SFMT.c \
+	$(srcroot)test/unit/size_classes.c \
 	$(srcroot)test/unit/stats.c \
 	$(srcroot)test/unit/tsd.c \
 	$(srcroot)test/unit/util.c \
 	$(srcroot)test/unit/zero.c
-TESTS_UNIT_AUX := $(srcroot)test/unit/prof_accum_a.c \
-	$(srcroot)test/unit/prof_accum_b.c
 TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
 	$(srcroot)test/integration/allocated.c \
+	$(srcroot)test/integration/sdallocx.c \
 	$(srcroot)test/integration/mallocx.c \
-	$(srcroot)test/integration/mremap.c \
+	$(srcroot)test/integration/MALLOCX_ARENA.c \
+	$(srcroot)test/integration/overflow.c \
 	$(srcroot)test/integration/posix_memalign.c \
 	$(srcroot)test/integration/rallocx.c \
 	$(srcroot)test/integration/thread_arena.c \
 	$(srcroot)test/integration/thread_tcache_enabled.c \
-	$(srcroot)test/integration/xallocx.c
-ifeq ($(enable_experimental), 1)
-TESTS_INTEGRATION += $(srcroot)test/integration/allocm.c \
-	$(srcroot)test/integration/MALLOCX_ARENA.c \
-	$(srcroot)test/integration/rallocm.c
-endif
-TESTS_STRESS :=
+	$(srcroot)test/integration/xallocx.c \
+	$(srcroot)test/integration/chunk.c
+TESTS_STRESS := $(srcroot)test/stress/microbench.c
 TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_STRESS)

 C_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.$(O))
@ -157,10 +170,9 @@ C_TESTLIB_STRESS_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.stress.$(O))
 C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_STRESS_OBJS)

 TESTS_UNIT_OBJS := $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%.$(O))
-TESTS_UNIT_AUX_OBJS := $(TESTS_UNIT_AUX:$(srcroot)%.c=$(objroot)%.$(O))
 TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O))
 TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O))
-TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_UNIT_AUX_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS)
+TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS)

 .PHONY: all dist build_doc_html build_doc_man build_doc
 .PHONY: install_bin install_include install_lib
@ -174,10 +186,10 @@ all: build_lib

 dist: build_doc

-$(srcroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl
+$(objroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl
 	$(XSLTPROC) -o $@ $(objroot)doc/html.xsl $<

-$(srcroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl
+$(objroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl
 	$(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $<

 build_doc_html: $(DOCS_HTML)
@ -209,18 +221,12 @@ $(C_TESTLIB_STRESS_OBJS): $(objroot)test/src/%.stress.$(O): $(srcroot)test/src/%
 $(C_TESTLIB_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST -DJEMALLOC_STRESS_TESTLIB
 $(C_TESTLIB_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
 $(TESTS_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
-$(TESTS_UNIT_AUX_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
-define make-unit-link-dep
-$(1): TESTS_UNIT_LINK_OBJS += $(2)
-$(1): $(2)
-endef
-$(foreach test, $(TESTS_UNIT:$(srcroot)test/unit/%.c=$(objroot)test/unit/%$(EXE)), $(eval $(call make-unit-link-dep,$(test),$(filter $(test:%=%_a.$(O)) $(test:%=%_b.$(O)),$(TESTS_UNIT_AUX_OBJS)))))
 $(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
 $(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST
 $(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c
 $(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
 ifneq ($(IMPORTLIB),$(SO))
-$(C_OBJS): CPPFLAGS += -DDLLEXPORT
+$(C_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
 endif

 ifndef CC_MM
@ -229,7 +235,7 @@ HEADER_DIRS = $(srcroot)include/jemalloc/internal \
 	$(objroot)include/jemalloc $(objroot)include/jemalloc/internal
 HEADERS = $(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h))
 $(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): $(HEADERS)
-$(TESTS_OBJS): $(objroot)test/unit/jemalloc_test.h
+$(TESTS_OBJS): $(objroot)test/include/test/jemalloc_test.h
 endif

 $(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
@ -301,7 +307,14 @@ install_lib_static: $(STATIC_LIBS)
 	install -m 755 $$l $(LIBDIR); \
 done

-install_lib: install_lib_shared install_lib_static
+install_lib_pc: $(PC)
+	install -d $(LIBDIR)/pkgconfig
+	@for l in $(PC); do \
+	echo "install -m 644 $$l $(LIBDIR)/pkgconfig"; \
+	install -m 644 $$l $(LIBDIR)/pkgconfig; \
+done
+
+install_lib: install_lib_shared install_lib_static install_lib_pc

 install_doc_html:
 	install -d $(DATADIR)/doc/jemalloc$(install_suffix)
@ -336,11 +349,15 @@ check_dir: check_unit_dir check_integration_dir check_stress_dir

 check_unit: tests_unit check_unit_dir
 	$(SHELL) $(objroot)test/test.sh $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%)
+check_integration_prof: tests_integration check_integration_dir
+ifeq ($(enable_prof), 1)
+	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
+endif
 check_integration: tests_integration check_integration_dir
 	$(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
 check_stress: tests_stress check_stress_dir
 	$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%)
-check: tests check_dir
+check: tests check_dir check_integration_prof
 	$(SHELL) $(objroot)test/test.sh $(TESTS:$(srcroot)%.c=$(objroot)%)

 ifeq ($(enable_code_coverage), 1)
@ -400,8 +417,9 @@ clean:
 	rm -f $(objroot)*.gcov.*

 distclean: clean
-	rm -rf $(objroot)autom4te.cache
+	rm -f $(objroot)bin/jemalloc-config
 	rm -f $(objroot)bin/jemalloc.sh
+	rm -f $(objroot)bin/jeprof
 	rm -f $(objroot)config.log
 	rm -f $(objroot)config.status
 	rm -f $(objroot)config.stamp
@ -410,7 +428,7 @@ distclean: clean

 relclean: distclean
 	rm -f $(objroot)configure
-	rm -f $(srcroot)VERSION
+	rm -f $(objroot)VERSION
 	rm -f $(DOCS_HTML)
 	rm -f $(DOCS_MAN3)

--- a/bin/jemalloc-config.in
+++ b/bin/jemalloc-config.in
@ -0,0 +1,79 @@
+#!/bin/sh
+#
+# jemalloc-config: print one build or installation setting of jemalloc,
+# selected by the single option given as $1.
+# The @...@ placeholders are expected to be substituted when this template is
+# processed (presumably by configure -- confirm against the build scripts).
+# Exit status: 0 on success, 1 when the option is missing or unrecognized.
+
+# Print the option summary to stdout.
+usage() {
+	cat <<EOF
+Usage:
+  @BINDIR@/jemalloc-config <option>
+Options:
+  --help | -h  : Print usage.
+  --version    : Print jemalloc version.
+  --revision   : Print shared library revision number.
+  --config     : Print configure options used to build jemalloc.
+  --prefix     : Print installation directory prefix.
+  --bindir     : Print binary installation directory.
+  --datadir    : Print data installation directory.
+  --includedir : Print include installation directory.
+  --libdir     : Print library installation directory.
+  --mandir     : Print manual page installation directory.
+  --cc         : Print compiler used to build jemalloc.
+  --cflags     : Print compiler flags used to build jemalloc.
+  --cppflags   : Print preprocessor flags used to build jemalloc.
+  --ldflags    : Print library flags used to build jemalloc.
+  --libs       : Print libraries jemalloc was linked against.
+EOF
+}
+
+# NOTE(review): prefix/exec_prefix are never referenced by name below;
+# presumably they exist so that substituted paths which mention ${prefix} or
+# ${exec_prefix} still expand inside the double-quoted echos -- confirm.
+prefix="@prefix@"
+exec_prefix="@exec_prefix@"
+
+case "$1" in
+--help | -h)
+	usage
+	exit 0
+	;;
+--version)
+	echo "@jemalloc_version@"
+	;;
+--revision)
+	echo "@rev@"
+	;;
+--config)
+	echo "@CONFIG@"
+	;;
+--prefix)
+	echo "@PREFIX@"
+	;;
+--bindir)
+	echo "@BINDIR@"
+	;;
+--datadir)
+	echo "@DATADIR@"
+	;;
+--includedir)
+	echo "@INCLUDEDIR@"
+	;;
+--libdir)
+	echo "@LIBDIR@"
+	;;
+--mandir)
+	echo "@MANDIR@"
+	;;
+--cc)
+	echo "@CC@"
+	;;
+--cflags)
+	echo "@CFLAGS@"
+	;;
+--cppflags)
+	echo "@CPPFLAGS@"
+	;;
+--ldflags)
+	echo "@LDFLAGS@ @EXTRA_LDFLAGS@"
+	;;
+--libs)
+	echo "@LIBS@"
+	;;
+*)
+	# Missing or unknown option: this is a diagnostic, so keep it off stdout,
+	# which callers capture to obtain the requested value.
+	usage >&2
+	exit 1
+	;;
+esac
--- a/bin/jeprof.in
+++ b/bin/jeprof.in
@ -40,28 +40,28 @@
 #
 # Examples:
 #
-# % tools/pprof "program" "profile"
+# % tools/jeprof "program" "profile"
 #   Enters "interactive" mode
 #
-# % tools/pprof --text "program" "profile"
+# % tools/jeprof --text "program" "profile"
 #   Generates one line per procedure
 #
-# % tools/pprof --gv "program" "profile"
+# % tools/jeprof --gv "program" "profile"
 #   Generates annotated call-graph and displays via "gv"
 #
-# % tools/pprof --gv --focus=Mutex "program" "profile"
+# % tools/jeprof --gv --focus=Mutex "program" "profile"
 #   Restrict to code paths that involve an entry that matches "Mutex"
 #
-# % tools/pprof --gv --focus=Mutex --ignore=string "program" "profile"
+# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile"
 #   Restrict to code paths that involve an entry that matches "Mutex"
 #   and does not match "string"
 #
-# % tools/pprof --list=IBF_CheckDocid "program" "profile"
+# % tools/jeprof --list=IBF_CheckDocid "program" "profile"
 #   Generates disassembly listing of all routines with at least one
 #   sample that match the --list=<regexp> pattern.  The listing is
 #   annotated with the flat and cumulative sample counts at each line.
 #
-# % tools/pprof --disasm=IBF_CheckDocid "program" "profile"
+# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile"
 #   Generates disassembly listing of all routines with at least one
 #   sample that match the --disasm=<regexp> pattern.  The listing is
 #   annotated with the flat and cumulative sample counts at each PC value.
@ -72,10 +72,11 @@ use strict;
 use warnings;
 use Getopt::Long;

+my $JEPROF_VERSION = "@jemalloc_version@";
 my $PPROF_VERSION = "2.0";

 # These are the object tools we use which can come from a
-# user-specified location using --tools, from the PPROF_TOOLS
+# user-specified location using --tools, from the JEPROF_TOOLS
 # environment variable, or from the environment.
 my %obj_tool_map = (
  "objdump" => "objdump",
@ -144,13 +145,13 @@ my $sep_address = undef;
 sub usage_string {
  return <<EOF;
 Usage:
-pprof [options] <program> <profiles>
+jeprof [options] <program> <profiles>
   <profiles> is a space separated list of profile names.
-pprof [options] <symbolized-profiles>
+jeprof [options] <symbolized-profiles>
   <symbolized-profiles> is a list of profile files where each file contains
   the necessary symbol mappings  as well as profile data (likely generated
   with --raw).
-pprof [options] <profile>
+jeprof [options] <profile>
   <profile> is a remote form.  Symbols are obtained from host:port$SYMBOL_PAGE

   Each name can be:
@ -161,9 +162,9 @@ pprof [options] <profile>
                         $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall,
                         $CENSUSPROFILE_PAGE, or /pprof/filteredprofile.
   For instance:
-     pprof http://myserver.com:80$HEAP_PAGE
+     jeprof http://myserver.com:80$HEAP_PAGE
   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
-pprof --symbols <program>
+jeprof --symbols <program>
   Maps addresses to symbol names.  In this mode, stdin should be a
   list of library mappings, in the same format as is found in the heap-
   and cpu-profile files (this loosely matches that of /proc/self/maps
@ -202,7 +203,7 @@ Output type:
   --pdf               Generate PDF to stdout
   --svg               Generate SVG to stdout
   --gif               Generate GIF to stdout
-   --raw               Generate symbolized pprof data (useful with remote fetch)
+   --raw               Generate symbolized jeprof data (useful with remote fetch)

 Heap-Profile Options:
   --inuse_space       Display in-use (mega)bytes [default]
@ -223,6 +224,7 @@ Call-graph Options:
   --edgefraction=<f>  Hide edges below <f>*total [default=.001]
   --maxdegree=<n>     Max incoming/outgoing edges per node [default=8]
   --focus=<regexp>    Focus on nodes matching <regexp>
+   --thread=<n>        Show profile for thread <n>
   --ignore=<regexp>   Ignore nodes matching <regexp>
   --scale=<n>         Set GV scaling [default=0]
   --heapcheck         Make nodes with non-0 object counts
@ -235,34 +237,34 @@ Miscellaneous:
   --version           Version information

 Environment Variables:
-   PPROF_TMPDIR        Profiles directory. Defaults to \$HOME/pprof
-   PPROF_TOOLS         Prefix for object tools pathnames
+   JEPROF_TMPDIR        Profiles directory. Defaults to \$HOME/jeprof
+   JEPROF_TOOLS         Prefix for object tools pathnames

 Examples:

-pprof /bin/ls ls.prof
+jeprof /bin/ls ls.prof
                       Enters "interactive" mode
-pprof --text /bin/ls ls.prof
+jeprof --text /bin/ls ls.prof
                       Outputs one line per procedure
-pprof --web /bin/ls ls.prof
+jeprof --web /bin/ls ls.prof
                       Displays annotated call-graph in web browser
-pprof --gv /bin/ls ls.prof
+jeprof --gv /bin/ls ls.prof
                       Displays annotated call-graph via 'gv'
-pprof --gv --focus=Mutex /bin/ls ls.prof
+jeprof --gv --focus=Mutex /bin/ls ls.prof
                       Restricts to code paths including a .*Mutex.* entry
-pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
+jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
                       Code paths including Mutex but not string
-pprof --list=getdir /bin/ls ls.prof
+jeprof --list=getdir /bin/ls ls.prof
                       (Per-line) annotated source listing for getdir()
-pprof --disasm=getdir /bin/ls ls.prof
+jeprof --disasm=getdir /bin/ls ls.prof
                       (Per-PC) annotated disassembly for getdir()

-pprof http://localhost:1234/
+jeprof http://localhost:1234/
                       Enters "interactive" mode
-pprof --text localhost:1234
+jeprof --text localhost:1234
                       Outputs one line per procedure for localhost:1234
-pprof --raw localhost:1234 > ./local.raw
-pprof --text ./local.raw
+jeprof --raw localhost:1234 > ./local.raw
+jeprof --text ./local.raw
                       Fetches a remote profile for later analysis and then
                       analyzes it in text mode.
 EOF
@ -270,7 +272,8 @@ EOF

 sub version_string {
  return <<EOF
-pprof (part of gperftools $PPROF_VERSION)
+jeprof (part of jemalloc $JEPROF_VERSION)
+based on pprof (part of gperftools $PPROF_VERSION)

 Copyright 1998-2007 Google Inc.

@ -293,8 +296,8 @@ sub Init() {
  # Setup tmp-file name and handler to clean it up.
  # We do this in the very beginning so that we can use
  # error() and cleanup() function anytime here after.
-  $main::tmpfile_sym = "/tmp/pprof$$.sym";
-  $main::tmpfile_ps = "/tmp/pprof$$";
+  $main::tmpfile_sym = "/tmp/jeprof$$.sym";
+  $main::tmpfile_ps = "/tmp/jeprof$$";
  $main::next_tmpfile = 0;
  $SIG{'INT'} = \&sighandler;

@ -332,6 +335,7 @@ sub Init() {
  $main::opt_edgefraction = 0.001;
  $main::opt_maxdegree = 8;
  $main::opt_focus = '';
+  $main::opt_thread = undef;
  $main::opt_ignore = '';
  $main::opt_scale = 0;
  $main::opt_heapcheck = 0;
@ -402,6 +406,7 @@ sub Init() {
             "edgefraction=f" => \$main::opt_edgefraction,
             "maxdegree=i"    => \$main::opt_maxdegree,
             "focus=s"        => \$main::opt_focus,
+             "thread=s"       => \$main::opt_thread,
             "ignore=s"       => \$main::opt_ignore,
             "scale=i"        => \$main::opt_scale,
             "heapcheck"      => \$main::opt_heapcheck,
@ -562,6 +567,86 @@ sub Init() {
  }
 }

+# Filters one profile and emits the report selected by the output options.
+#   $profile  profile object to report on
+#   $symbols  symbol data handed to the filter, reduce and print helpers
+#   $libs     library list handed to the disassembly/listing printers and
+#             to interactive mode
+#   $thread   optional thread id; when defined, the "Total" line of --text
+#             output is tagged with " (t<id>)"
+# $total is computed before any frames are removed or filtered, so it
+# describes the profile as passed in.  Exits with status 1 (after cleanup())
+# if PrintDot() reports failure.
+sub FilterAndPrint {
+  my ($profile, $symbols, $libs, $thread) = @_;
+
+  # Get total data in profile
+  my $total = TotalProfile($profile);
+
+  # Remove uninteresting stack items
+  $profile = RemoveUninterestingFrames($symbols, $profile);
+
+  # Focus?
+  if ($main::opt_focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $main::opt_focus);
+  }
+
+  # Ignore?
+  if ($main::opt_ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
+  }
+
+  my $calls = ExtractCalls($symbols, $profile);
+
+  # Reduce profiles to required output granularity, and also clean
+  # each stack trace so a given entry exists at most once.
+  my $reduced = ReduceProfile($symbols, $profile);
+
+  # Get derived profiles
+  my $flat = FlatProfile($reduced);
+  my $cumulative = CumulativeProfile($reduced);
+
+  # Print
+  if (!$main::opt_interactive) {
+    if ($main::opt_disasm) {
+      PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
+    } elsif ($main::opt_list) {
+      PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
+    } elsif ($main::opt_text) {
+      # Make sure the output is empty when have nothing to report
+      # (only matters when --heapcheck is given but we must be
+      # compatible with old branches that did not pass --heapcheck always):
+      if ($total != 0) {
+        printf("Total%s: %s %s\n",
+               (defined($thread) ? " (t$thread)" : ""),
+               Unparse($total), Units());
+      }
+      PrintText($symbols, $flat, $cumulative, -1);
+    } elsif ($main::opt_raw) {
+      PrintSymbolizedProfile($symbols, $profile, $main::prog);
+    } elsif ($main::opt_callgrind) {
+      PrintCallgrind($calls);
+    } else {
+      # Default: render the call graph, then optionally open it in a viewer.
+      if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+        if ($main::opt_gv) {
+          RunGV(TempName($main::next_tmpfile, "ps"), "");
+        } elsif ($main::opt_evince) {
+          RunEvince(TempName($main::next_tmpfile, "pdf"), "");
+        } elsif ($main::opt_web) {
+          my $tmp = TempName($main::next_tmpfile, "svg");
+          RunWeb($tmp);
+          # The command we run might hand the file name off
+          # to an already running browser instance and then exit.
+          # Normally, we'd remove $tmp on exit (right now),
+          # but fork a child to remove $tmp a little later, so that the
+          # browser has time to load it first.
+          delete $main::tempnames{$tmp};
+          if (fork() == 0) {
+            sleep 5;
+            unlink($tmp);
+            exit(0);
+          }
+        }
+      } else {
+        cleanup();
+        exit(1);
+      }
+    }
+  } else {
+    InteractiveMode($profile, $symbols, $libs, $total);
+  }
+}
+
 sub Main() {
  Init();
  $main::collected_profile = undef;
@ -605,9 +690,6 @@ sub Main() {
    $symbol_map = MergeSymbols($symbol_map, $base->{symbols});
  }

-  # Get total data in profile
-  my $total = TotalProfile($profile);
-
  # Collect symbols
  my $symbols;
  if ($main::use_symbolized_profile) {
@ -622,75 +704,17 @@ sub Main() {
    $symbols = ExtractSymbols($libs, $pcs);
  }

-  # Remove uniniteresting stack items
-  $profile = RemoveUninterestingFrames($symbols, $profile);
-
-  # Focus?
-  if ($main::opt_focus ne '') {
-    $profile = FocusProfile($symbols, $profile, $main::opt_focus);
+  if (!defined($main::opt_thread)) {
+    FilterAndPrint($profile, $symbols, $libs);
  }
-
-  # Ignore?
-  if ($main::opt_ignore ne '') {
-    $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
-  }
-
-  my $calls = ExtractCalls($symbols, $profile);
-
-  # Reduce profiles to required output granularity, and also clean
-  # each stack trace so a given entry exists at most once.
-  my $reduced = ReduceProfile($symbols, $profile);
-
-  # Get derived profiles
-  my $flat = FlatProfile($reduced);
-  my $cumulative = CumulativeProfile($reduced);
-
-  # Print
-  if (!$main::opt_interactive) {
-    if ($main::opt_disasm) {
-      PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
-    } elsif ($main::opt_list) {
-      PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
-    } elsif ($main::opt_text) {
-      # Make sure the output is empty when have nothing to report
-      # (only matters when --heapcheck is given but we must be
-      # compatible with old branches that did not pass --heapcheck always):
-      if ($total != 0) {
-        printf("Total: %s %s\n", Unparse($total), Units());
-      }
-      PrintText($symbols, $flat, $cumulative, -1);
-    } elsif ($main::opt_raw) {
-      PrintSymbolizedProfile($symbols, $profile, $main::prog);
-    } elsif ($main::opt_callgrind) {
-      PrintCallgrind($calls);
-    } else {
-      if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
-        if ($main::opt_gv) {
-          RunGV(TempName($main::next_tmpfile, "ps"), "");
-        } elsif ($main::opt_evince) {
-          RunEvince(TempName($main::next_tmpfile, "pdf"), "");
-        } elsif ($main::opt_web) {
-          my $tmp = TempName($main::next_tmpfile, "svg");
-          RunWeb($tmp);
-          # The command we run might hand the file name off
-          # to an already running browser instance and then exit.
-          # Normally, we'd remove $tmp on exit (right now),
-          # but fork a child to remove $tmp a little later, so that the
-          # browser has time to load it first.
-          delete $main::tempnames{$tmp};
-          if (fork() == 0) {
-            sleep 5;
-            unlink($tmp);
-            exit(0);
+  if (defined($data->{threads})) {
+    foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) {
+      if (defined($main::opt_thread) &&
+          ($main::opt_thread eq '*' || $main::opt_thread == $thread)) {
+        my $thread_profile = $data->{threads}{$thread};
+        FilterAndPrint($thread_profile, $symbols, $libs, $thread);
      }
    }
-      } else {
-        cleanup();
-        exit(1);
-      }
-    }
-  } else {
-    InteractiveMode($profile, $symbols, $libs, $total);
  }

  cleanup();
@ -780,14 +804,14 @@ sub InteractiveMode {
  $| = 1;  # Make output unbuffered for interactive mode
  my ($orig_profile, $symbols, $libs, $total) = @_;

-  print STDERR "Welcome to pprof!  For help, type 'help'.\n";
+  print STDERR "Welcome to jeprof!  For help, type 'help'.\n";

  # Use ReadLine if it's installed and input comes from a console.
  if ( -t STDIN &&
       !ReadlineMightFail() &&
       defined(eval {require Term::ReadLine}) ) {
-    my $term = new Term::ReadLine 'pprof';
-    while ( defined ($_ = $term->readline('(pprof) '))) {
+    my $term = new Term::ReadLine 'jeprof';
+    while ( defined ($_ = $term->readline('(jeprof) '))) {
      $term->addhistory($_) if /\S/;
      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
        last;    # exit when we get an interactive command to quit
@ -795,7 +819,7 @@ sub InteractiveMode {
    }
  } else {       # don't have readline
    while (1) {
-      print STDERR "(pprof) ";
+      print STDERR "(jeprof) ";
      $_ = <STDIN>;
      last if ! defined $_ ;
      s/\r//g;         # turn windows-looking lines into unix-looking lines
@ -988,7 +1012,7 @@ sub ProcessProfile {

 sub InteractiveHelpMessage {
  print STDERR <<ENDOFHELP;
-Interactive pprof mode
+Interactive jeprof mode

 Commands:
  gv
@ -1031,7 +1055,7 @@ Commands:
      Generates callgrind file. If no filename is given, kcachegrind is called.

  help - This listing
-  quit or ^D - End pprof
+  quit or ^D - End jeprof

 For commands that accept optional -ignore tags, samples where any routine in
 the stack trace matches the regular expression in any of the -ignore
@ -1476,7 +1500,7 @@ h1 {
 }
 </style>
 <script type="text/javascript">
-function pprof_toggle_asm(e) {
+function jeprof_toggle_asm(e) {
  var target;
  if (!e) e = window.event;
  if (e.target) target = e.target;
@ -1745,7 +1769,7 @@ sub PrintSource {

  if ($html) {
    printf $output (
-      "<h1>%s</h1>%s\n<pre onClick=\"pprof_toggle_asm()\">\n" .
+      "<h1>%s</h1>%s\n<pre onClick=\"jeprof_toggle_asm()\">\n" .
      "Total:%6s %6s (flat / cumulative %s)\n",
      HtmlEscape(ShortFunctionName($routine)),
      HtmlEscape(CleanFileName($filename)),
@ -2811,9 +2835,15 @@ sub RemoveUninterestingFrames {
                      'free',
                      'memalign',
                      'posix_memalign',
+                      'aligned_alloc',
                      'pvalloc',
                      'valloc',
                      'realloc',
+                      'mallocx', # jemalloc
+                      'rallocx', # jemalloc
+                      'xallocx', # jemalloc
+                      'dallocx', # jemalloc
+                      'sdallocx', # jemalloc
                      'tc_calloc',
                      'tc_cfree',
                      'tc_malloc',
@ -2923,6 +2953,10 @@ sub RemoveUninterestingFrames {
      if (exists($symbols->{$a})) {
        my $func = $symbols->{$a}->[0];
        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
+          # Throw away the portion of the backtrace seen so far, under the
+          # assumption that previous frames were for functions internal to the
+          # allocator.
+          @path = ();
          next;
        }
      }
@ -3401,7 +3435,7 @@ sub FetchDynamicProfile {
      $profile_file .= $suffix;
    }

-    my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof");
+    my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof");
    if (! -d $profile_dir) {
      mkdir($profile_dir)
          || die("Unable to create profile directory $profile_dir: $!\n");
@ -3617,7 +3651,7 @@ BEGIN {
 # Reads the top, 'header' section of a profile, and returns the last
 # line of the header, commonly called a 'header line'.  The header
 # section of a profile consists of zero or more 'command' lines that
-# are instructions to pprof, which pprof executes when reading the
+# are instructions to jeprof, which jeprof executes when reading the
 # header.  All 'command' lines start with a %.  After the command
 # lines is the 'header line', which is a profile-specific line that
 # indicates what type of profile it is, and perhaps other global
@ -3680,6 +3714,7 @@ sub IsSymbolizedProfileFile {
 #      $result->{version}     Version number of profile file
 #      $result->{period}      Sampling period (in microseconds)
 #      $result->{profile}     Profile object
+#      $result->{threads}     Map of thread IDs to profile objects
 #      $result->{map}         Memory map info from profile
 #      $result->{pcs}         Hash of all PC values seen, key is hex address
 sub ReadProfile {
@ -3728,6 +3763,9 @@ sub ReadProfile {
  } elsif ($header =~ m/^heap profile:/) {
    $main::profile_type = 'heap';
    $result =  ReadHeapProfile($prog, *PROFILE, $header);
+  } elsif ($header =~ m/^heap/) {
+    $main::profile_type = 'heap';
+    $result = ReadThreadedHeapProfile($prog, $fname, $header);
  } elsif ($header =~ m/^--- *$contention_marker/o) {
    $main::profile_type = 'contention';
    $result = ReadSynchProfile($prog, *PROFILE);
@ -3870,11 +3908,7 @@ sub ReadCPUProfile {
  return $r;
 }

-sub ReadHeapProfile {
-  my $prog = shift;
-  local *PROFILE = shift;
-  my $header = shift;
-
+sub HeapProfileIndex {
  my $index = 1;
  if ($main::opt_inuse_space) {
    $index = 1;
@ -3885,6 +3919,84 @@ sub ReadHeapProfile {
  } elsif ($main::opt_alloc_objects) {
    $index = 2;
  }
+  return $index;
+}
+
+# Reads every remaining line from filehandle $fh, strips carriage returns,
+# and returns the lines concatenated into one string.
+# Note: the read loop assigns to the global $_.
+sub ReadMappedLibraries {
+  my $fh = shift;
+  my $map = "";
+  # Read the /proc/self/maps data
+  while (<$fh>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $map .= $_;
+  }
+  return $map;
+}
+
+# Reads the rest of filehandle $fh as a memory map (/proc/self/maps data as
+# formatted by DumpAddressMap()) and returns it as one string.
+# Carriage returns are stripped.  A "build=<dir>" line sets the text that
+# "$build" is replaced with on that line and on every following line; until
+# such a line is seen, "$build" expands to the empty string.
+# Note: the read loop assigns to the global $_.
+sub ReadMemoryMap {
+  my $fh = shift;
+  my $map = "";
+  my $buildvar = "";
+  # Read from the handle we were given, not the global PROFILE, so the
+  # argument is honored the same way ReadMappedLibraries() honors it.
+  while (<$fh>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Parse "build=<dir>" specification if supplied
+    if (m/^\s*build=(.*)\n/) {
+      $buildvar = $1;
+    }
+
+    # Expand "$build" variable if available
+    $_ =~ s/\$build\b/$buildvar/g;
+
+    $map .= $_;
+  }
+  return $map;
+}
+
+# Scales the counts of one heap-profile entry to compensate for sampling.
+#   $sample_adjustment   divisor applied to the average object size; a false
+#                        value (0/undef) disables scaling altogether
+#   $sampling_algorithm  2 selects the "remote-heap version 2" (Poisson)
+#                        correction; any other value selects version 1
+#   $n1, $s1             first object-count / byte-count pair
+#   $n2, $s2             second object-count / byte-count pair
+# Returns the list ($n1, $s1, $n2, $s2) after adjustment.
+# A pair whose object count or byte count is 0 is returned unchanged: its
+# size ratio is either undefined or 0, and both cases would otherwise end in
+# an "Illegal division by zero" error.
+sub AdjustSamples {
+  my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_;
+  if ($sample_adjustment) {
+    if ($sampling_algorithm == 2) {
+      # Remote-heap version 2
+      # The sampling frequency is the rate of a Poisson process.
+      # This means that the probability of sampling an allocation of
+      # size X with sampling rate Y is 1 - exp(-X/Y)
+      if ($n1 != 0 && $s1 != 0) {
+        my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n1 *= $scale_factor;
+        $s1 *= $scale_factor;
+      }
+      if ($n2 != 0 && $s2 != 0) {
+        my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n2 *= $scale_factor;
+        $s2 *= $scale_factor;
+      }
+    } else {
+      # Remote-heap version 1
+      # Only scale up when the average object is smaller than the
+      # adjustment value (ratio < 1).
+      if ($n1 != 0 && $s1 != 0) {
+        my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        if ($ratio < 1) {
+          $n1 /= $ratio;
+          $s1 /= $ratio;
+        }
+      }
+      if ($n2 != 0 && $s2 != 0) {
+        my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        if ($ratio < 1) {
+          $n2 /= $ratio;
+          $s2 /= $ratio;
+        }
+      }
+    }
+  }
+  return ($n1, $s1, $n2, $s2);
+}
+
+sub ReadHeapProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $index = HeapProfileIndex();

  # Find the type of this profile.  The header line looks like:
  #    heap profile:   1246:  8800744 [  1246:  8800744] @ <heap-url>/266053
@ -3974,29 +4086,12 @@ sub ReadHeapProfile {
  while (<PROFILE>) {
    s/\r//g;         # turn windows-looking lines into unix-looking lines
    if (/^MAPPED_LIBRARIES:/) {
-      # Read the /proc/self/maps data
-      while (<PROFILE>) {
-        s/\r//g;         # turn windows-looking lines into unix-looking lines
-        $map .= $_;
-      }
+      $map .= ReadMappedLibraries(*PROFILE);
      last;
    }

    if (/^--- Memory map:/) {
-      # Read /proc/self/maps data as formatted by DumpAddressMap()
-      my $buildvar = "";
-      while (<PROFILE>) {
-        s/\r//g;         # turn windows-looking lines into unix-looking lines
-        # Parse "build=<dir>" specification if supplied
-        if (m/^\s*build=(.*)\n/) {
-          $buildvar = $1;
-        }
-
-        # Expand "$build" variable if available
-        $_ =~ s/\$build\b/$buildvar/g;
-
-        $map .= $_;
-      }
+      $map .= ReadMemoryMap(*PROFILE);
      last;
    }

@ -4007,42 +4102,8 @@ sub ReadHeapProfile {
    if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
      my $stack = $5;
      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
-
-      if ($sample_adjustment) {
-        if ($sampling_algorithm == 2) {
-          # Remote-heap version 2
-          # The sampling frequency is the rate of a Poisson process.
-          # This means that the probability of sampling an allocation of
-          # size X with sampling rate Y is 1 - exp(-X/Y)
-          if ($n1 != 0) {
-            my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-            my $scale_factor = 1/(1 - exp(-$ratio));
-            $n1 *= $scale_factor;
-            $s1 *= $scale_factor;
-          }
-          if ($n2 != 0) {
-            my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-            my $scale_factor = 1/(1 - exp(-$ratio));
-            $n2 *= $scale_factor;
-            $s2 *= $scale_factor;
-          }
-        } else {
-          # Remote-heap version 1
-          my $ratio;
-          $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-          if ($ratio < 1) {
-            $n1 /= $ratio;
-            $s1 /= $ratio;
-          }
-          $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-          if ($ratio < 1) {
-            $n2 /= $ratio;
-            $s2 /= $ratio;
-          }
-        }
-      }
-
-      my @counts = ($n1, $s1, $n2, $s2);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
      AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
    }
  }
@ -4056,6 +4117,83 @@ sub ReadHeapProfile {
  return $r;
 }

+# Parses a heap profile whose header line is "heap_v2/<sample rate>" and
+# whose entries carry per-thread counts.
+#   $prog    program name, passed on to ParseLibraries()
+#   $fname   profile file name (NOTE(review): not used in this sub)
+#   $header  header line already read from the profile
+# The entries are read from the global PROFILE filehandle, not from an
+# argument.  Dies unless the header matches the heap_v2 form.
+# Returns a hash ref with the keys version, period, profile (built from the
+# "t*" lines), threads (thread id => profile, built from the "t<n>" lines),
+# libs and pcs.
+sub ReadThreadedHeapProfile {
+  my ($prog, $fname, $header) = @_;
+
+  my $index = HeapProfileIndex();
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  # Assuming a very specific type of header for now.
+  if ($header =~ m"^heap_v2/(\d+)") {
+    $type = "_v2";
+    $sampling_algorithm = 2;
+    $sample_adjustment = int($1);
+  }
+  # NOTE(review): $sample_adjustment starts out as 0, so the !defined() half
+  # of this test can never be true; only the $type check is effective.
+  if ($type ne "_v2" || !defined($sample_adjustment)) {
+    die "Threaded heap profiles require v2 sampling with a sample rate\n";
+  }
+
+  my $profile = {};
+  my $thread_profiles = {};
+  my $pcs = {};
+  my $map = "";
+  my $stack = "";
+
+  while (<PROFILE>) {
+    s/\r//g;
+    # Either map marker hands the rest of the file to a helper and ends
+    # the entry loop.
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    # @ a1 a2 ... an
+    #   t*: <count1>: <bytes1> [<count2>: <bytes2>]
+    #   t1: <count1>: <bytes1> [<count2>: <bytes2>]
+    #     ...
+    #   tn: <count1>: <bytes1> [<count2>: <bytes2>]
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^@\s+(.*)$/) {
+      # Remember the stack; the following "t..." lines belong to it.
+      $stack = $1;
+    } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) {
+      if ($stack eq "") {
+        # Still in the header, so this is just a per-thread summary.
+        next;
+      }
+      my $thread = $2;
+      my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      if ($thread eq "*") {
+        # "t*" lines go into the overall profile.
+        AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+      } else {
+        # "t<n>" lines go into that thread's own profile, created on demand.
+        if (!exists($thread_profiles->{$thread})) {
+          $thread_profiles->{$thread} = {};
+        }
+        AddEntries($thread_profiles->{$thread}, $pcs,
+                   FixCallerAddresses($stack), $counts[$index]);
+      }
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{threads} = $thread_profiles;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
 sub ReadSynchProfile {
  my $prog = shift;
  local *PROFILE = shift;
@ -4120,10 +4258,10 @@ sub ReadSynchProfile {
      } elsif ($variable eq "sampling period") {
        $sampling_period = $value;
      } elsif ($variable eq "ms since reset") {
-        # Currently nothing is done with this value in pprof
+        # Currently nothing is done with this value in jeprof
        # So we just silently ignore it for now
      } elsif ($variable eq "discarded samples") {
-        # Currently nothing is done with this value in pprof
+        # Currently nothing is done with this value in jeprof
        # So we just silently ignore it for now
      } else {
        printf STDERR ("Ignoring unnknown variable in /contention output: " .
@ -4429,7 +4567,7 @@ sub ParseLibraries {
 }

 # Add two hex addresses of length $address_length.
-# Run pprof --test for unit test if this is changed.
+# Run jeprof --test for unit test if this is changed.
 sub AddressAdd {
  my $addr1 = shift;
  my $addr2 = shift;
@ -4483,7 +4621,7 @@ sub AddressAdd {


 # Subtract two hex addresses of length $address_length.
-# Run pprof --test for unit test if this is changed.
+# Run jeprof --test for unit test if this is changed.
 sub AddressSub {
  my $addr1 = shift;
  my $addr2 = shift;
@ -4535,7 +4673,7 @@ sub AddressSub {
 }

 # Increment a hex addresses of length $address_length.
-# Run pprof --test for unit test if this is changed.
+# Run jeprof --test for unit test if this is changed.
 sub AddressInc {
  my $addr = shift;
  my $sum;
@ -4853,7 +4991,7 @@ sub UnparseAddress {
 # 32-bit or ELF 64-bit executable file.  The location of the tools
 # is determined by considering the following options in this order:
 #   1) --tools option, if set
-#   2) PPROF_TOOLS environment variable, if set
+#   2) JEPROF_TOOLS environment variable, if set
 #   3) the environment
 sub ConfigureObjTools {
  my $prog_file = shift;
@ -4886,7 +5024,7 @@ sub ConfigureObjTools {
    # For windows, we provide a version of nm and addr2line as part of
    # the opensource release, which is capable of parsing
    # Windows-style PDB executables.  It should live in the path, or
-    # in the same directory as pprof.
+    # in the same directory as jeprof.
    $obj_tool_map{"nm_pdb"} = "nm-pdb";
    $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb";
  }
@ -4905,20 +5043,20 @@ sub ConfigureObjTools {
 }

 # Returns the path of a caller-specified object tool.  If --tools or
-# PPROF_TOOLS are specified, then returns the full path to the tool
+# JEPROF_TOOLS are specified, then returns the full path to the tool
 # with that prefix.  Otherwise, returns the path unmodified (which
 # means we will look for it on PATH).
 sub ConfigureTool {
  my $tool = shift;
  my $path;

-  # --tools (or $PPROF_TOOLS) is a comma separated list, where each
+  # --tools (or $JEPROF_TOOLS) is a comma separated list, where each
  # item is either a) a pathname prefix, or b) a map of the form
  # <tool>:<path>.  First we look for an entry of type (b) for our
  # tool.  If one is found, we use it.  Otherwise, we consider all the
  # pathname prefixes in turn, until one yields an existing file.  If
  # none does, we use a default path.
-  my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || "";
+  my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || "";
  if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) {
    $path = $2;
    # TODO(csilvers): sanity-check that $path exists?  Hard if it's relative.
@ -4932,11 +5070,11 @@ sub ConfigureTool {
    }
    if (!$path) {
      error("No '$tool' found with prefix specified by " .
-            "--tools (or \$PPROF_TOOLS) '$tools'\n");
+            "--tools (or \$JEPROF_TOOLS) '$tools'\n");
    }
  } else {
    # ... otherwise use the version that exists in the same directory as
-    # pprof.  If there's nothing there, use $PATH.
+    # jeprof.  If there's nothing there, use $PATH.
    $0 =~ m,[^/]*$,;     # this is everything after the last slash
    my $dirname = $`;    # this is everything up to and including the last slash
    if (-x "$dirname$tool") {
@ -4966,7 +5104,7 @@ sub cleanup {
  unlink($main::tmpfile_sym);
  unlink(keys %main::tempnames);

-  # We leave any collected profiles in $HOME/pprof in case the user wants
+  # We leave any collected profiles in $HOME/jeprof in case the user wants
  # to look at them later.  We print a message informing them of this.
  if ((scalar(@main::profile_files) > 0) &&
      defined($main::collected_profile)) {
@ -4975,7 +5113,7 @@ sub cleanup {
    }
    print STDERR "If you want to investigate this profile further, you can do:\n";
    print STDERR "\n";
-    print STDERR "  pprof \\\n";
+    print STDERR "  jeprof \\\n";
    print STDERR "    $main::prog \\\n";
    print STDERR "    $main::collected_profile\n";
    print STDERR "\n";
@ -5160,7 +5298,7 @@ sub GetProcedureBoundaries {
 # The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings.
 # To make them more readable, we add underscores at interesting places.
 # This routine removes the underscores, producing the canonical representation
-# used by pprof to represent addresses, particularly in the tested routines.
+# used by jeprof to represent addresses, particularly in the tested routines.
 sub CanonicalHex {
  my $arg = shift;
  return join '', (split '_',$arg);
--- a/config.guess
+++ b/config.guess
@ -1,8 +1,8 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright 1992-2013 Free Software Foundation, Inc.
+#   Copyright 1992-2014 Free Software Foundation, Inc.

-timestamp='2013-06-10'
+timestamp='2014-03-23'

 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@ -50,7 +50,7 @@ version="\
 GNU config.guess ($timestamp)

 Originally written by Per Bothner.
-Copyright 1992-2013 Free Software Foundation, Inc.
+Copyright 1992-2014 Free Software Foundation, Inc.

 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@ -149,7 +149,7 @@ Linux|GNU|GNU/*)
 	LIBC=gnu
 	#endif
 	EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
 	;;
 esac

@ -826,7 +826,7 @@ EOF
    *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
-    i*:MSYS*:*)
+    *:MSYS*:*)
 	echo ${UNAME_MACHINE}-pc-msys
 	exit ;;
    i*:windows32*:*)
@ -969,10 +969,10 @@ EOF
 	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
 	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
 	;;
-    or1k:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+    openrisc*:Linux:*:*)
+	echo or1k-unknown-linux-${LIBC}
 	exit ;;
-    or32:Linux:*:*)
+    or32:Linux:*:* | or1k*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
    padre:Linux:*:*)
@ -1260,6 +1260,7 @@ EOF
 	if test "$UNAME_PROCESSOR" = unknown ; then
 	    UNAME_PROCESSOR=powerpc
 	fi
+	if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
 	    if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
 		if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
 		    (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
@ -1271,6 +1272,15 @@ EOF
 		    esac
 		fi
 	    fi
+	elif test "$UNAME_PROCESSOR" = i386 ; then
+	    # Avoid executing cc on OS X 10.9, as it ships with a stub
+	    # that puts up a graphical alert prompting to install
+	    # developer tools.  Any system running Mac OS X 10.7 or
+	    # later (Darwin 11 and later) is required to have a 64-bit
+	    # processor. This is not true of the ARM version of Darwin
+	    # that Apple uses in portable devices.
+	    UNAME_PROCESSOR=x86_64
+	fi
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
 	exit ;;
    *:procnto*:*:* | *:QNX:[0123456789]*:*)
@ -1361,154 +1371,6 @@ EOF
 	exit ;;
 esac

-eval $set_cc_for_build
-cat >$dummy.c <<EOF
-#ifdef _SEQUENT_
-# include <sys/types.h>
-# include <sys/utsname.h>
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-	"4"
-#else
-	""
-#endif
-	); exit (0);
-#endif
-#endif
-
-#if defined (__arm) && defined (__acorn) && defined (__unix)
-  printf ("arm-acorn-riscix\n"); exit (0);
-#endif
-
-#if defined (hp300) && !defined (hpux)
-  printf ("m68k-hp-bsd\n"); exit (0);
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
-  if (version < 4)
-    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  else
-    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-    struct utsname un;
-
-    uname(&un);
-
-    if (strncmp(un.version, "V2", 2) == 0) {
-	printf ("i386-sequent-ptx2\n"); exit (0);
-    }
-    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-	printf ("i386-sequent-ptx1\n"); exit (0);
-    }
-    printf ("i386-sequent-ptx\n"); exit (0);
-
-#endif
-
-#if defined (vax)
-# if !defined (ultrix)
-#  include <sys/param.h>
-#  if defined (BSD)
-#   if BSD == 43
-      printf ("vax-dec-bsd4.3\n"); exit (0);
-#   else
-#    if BSD == 199006
-      printf ("vax-dec-bsd4.3reno\n"); exit (0);
-#    else
-      printf ("vax-dec-bsd\n"); exit (0);
-#    endif
-#   endif
-#  else
-    printf ("vax-dec-bsd\n"); exit (0);
-#  endif
-# else
-    printf ("vax-dec-ultrix\n"); exit (0);
-# endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
-	{ echo "$SYSTEM_NAME"; exit; }
-
-# Apollos put the system type in the environment.
-
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
-
-# Convex versions that predate uname can use getsysinfo(1)
-
-if [ -x /usr/convex/getsysinfo ]
-then
-    case `getsysinfo -f cpu_type` in
-    c1*)
-	echo c1-convex-bsd
-	exit ;;
-    c2*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    c34*)
-	echo c34-convex-bsd
-	exit ;;
-    c38*)
-	echo c38-convex-bsd
-	exit ;;
-    c4*)
-	echo c4-convex-bsd
-	exit ;;
-    esac
-fi
-
 cat >&2 <<EOF
 $0: unable to guess system type

--- a/config.sub
+++ b/config.sub
@ -1,8 +1,8 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright 1992-2013 Free Software Foundation, Inc.
+#   Copyright 1992-2014 Free Software Foundation, Inc.

-timestamp='2013-10-01'
+timestamp='2014-05-01'

 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@ -68,7 +68,7 @@ Report bugs and patches to <config-patches@gnu.org>."
 version="\
 GNU config.sub ($timestamp)

-Copyright 1992-2013 Free Software Foundation, Inc.
+Copyright 1992-2014 Free Software Foundation, Inc.

 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@ -283,8 +283,10 @@ case $basic_machine in
 	| mips64vr5900 | mips64vr5900el \
 	| mipsisa32 | mipsisa32el \
 	| mipsisa32r2 | mipsisa32r2el \
+	| mipsisa32r6 | mipsisa32r6el \
 	| mipsisa64 | mipsisa64el \
 	| mipsisa64r2 | mipsisa64r2el \
+	| mipsisa64r6 | mipsisa64r6el \
 	| mipsisa64sb1 | mipsisa64sb1el \
 	| mipsisa64sr71k | mipsisa64sr71kel \
 	| mipsr5900 | mipsr5900el \
@ -296,8 +298,7 @@ case $basic_machine in
 	| nds32 | nds32le | nds32be \
 	| nios | nios2 | nios2eb | nios2el \
 	| ns16k | ns32k \
-	| open8 \
-	| or1k | or32 \
+	| open8 | or1k | or1knd | or32 \
 	| pdp10 | pdp11 | pj | pjl \
 	| powerpc | powerpc64 | powerpc64le | powerpcle \
 	| pyramid \
@ -402,8 +403,10 @@ case $basic_machine in
 	| mips64vr5900-* | mips64vr5900el-* \
 	| mipsisa32-* | mipsisa32el-* \
 	| mipsisa32r2-* | mipsisa32r2el-* \
+	| mipsisa32r6-* | mipsisa32r6el-* \
 	| mipsisa64-* | mipsisa64el-* \
 	| mipsisa64r2-* | mipsisa64r2el-* \
+	| mipsisa64r6-* | mipsisa64r6el-* \
 	| mipsisa64sb1-* | mipsisa64sb1el-* \
 	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
 	| mipsr5900-* | mipsr5900el-* \
@ -415,6 +418,7 @@ case $basic_machine in
 	| nios-* | nios2-* | nios2eb-* | nios2el-* \
 	| none-* | np1-* | ns16k-* | ns32k-* \
 	| open8-* \
+	| or1k*-* \
 	| orion-* \
 	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
 	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
@ -1376,7 +1380,7 @@ case $os in
 	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
 	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
 	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
+	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*)
 	# Remember, each alternative MUST END IN *, to match a version number.
 		;;
 	-qnx*)
@ -1400,6 +1404,9 @@ case $os in
 	-mac*)
 		os=`echo $os | sed -e 's|mac|macos|'`
 		;;
+	# Apple iOS
+	-ios*)
+		;;
 	-linux-dietlibc)
 		os=-linux-dietlibc
 		;;
@ -1594,9 +1601,6 @@ case $basic_machine in
 	mips*-*)
 		os=-elf
 		;;
-	or1k-*)
-		os=-elf
-		;;
 	or32-*)
 		os=-coff
 		;;
--- a/configure.ac
+++ b/configure.ac
@ -43,8 +43,11 @@ AC_CACHE_CHECK([whether $1 is compilable],

 dnl ============================================================================

+CONFIG=`echo ${ac_configure_args} | sed -e 's#'"'"'\([^ ]*\)'"'"'#\1#g'`
+AC_SUBST([CONFIG])
+
 dnl Library revision.
-rev=1
+rev=2
 AC_SUBST([rev])

 srcroot=$srcdir
@ -134,6 +137,7 @@ if test "x$CFLAGS" = "x" ; then
      AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT])
    fi
    JE_CFLAGS_APPEND([-Wall])
+    JE_CFLAGS_APPEND([-Werror=declaration-after-statement])
    JE_CFLAGS_APPEND([-pipe])
    JE_CFLAGS_APPEND([-g3])
  elif test "x$je_cv_msvc" = "xyes" ; then
@ -141,7 +145,8 @@ if test "x$CFLAGS" = "x" ; then
    JE_CFLAGS_APPEND([-Zi])
    JE_CFLAGS_APPEND([-MT])
    JE_CFLAGS_APPEND([-W3])
-    CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat"
+    JE_CFLAGS_APPEND([-FS])
+    CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat"
  fi
 fi
 dnl Append EXTRA_CFLAGS to CFLAGS, if defined.
@ -155,6 +160,10 @@ if test "x${ac_cv_big_endian}" = "x1" ; then
  AC_DEFINE_UNQUOTED([JEMALLOC_BIG_ENDIAN], [ ])
 fi

+if test "x${je_cv_msvc}" = "xyes" -a "x${ac_cv_header_inttypes_h}" = "xno"; then
+  CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat/C99"
+fi
+
 AC_CHECK_SIZEOF([void *])
 if test "x${ac_cv_sizeof_void_p}" = "x8" ; then
  LG_SIZEOF_PTR=3
@ -201,23 +210,14 @@ AC_CANONICAL_HOST
 dnl CPU-specific settings.
 CPU_SPINWAIT=""
 case "${host_cpu}" in
-  i[[345]]86)
-	;;
  i686|x86_64)
-	JE_COMPILABLE([pause instruction], [],
+	AC_CACHE_VAL([je_cv_pause],
+	  [JE_COMPILABLE([pause instruction], [],
 	                [[__asm__ volatile("pause"); return 0;]],
-	              [je_cv_pause])
+	                [je_cv_pause])])
 	if test "x${je_cv_pause}" = "xyes" ; then
 	    CPU_SPINWAIT='__asm__ volatile("pause")'
 	fi
-	dnl emmintrin.h fails to compile unless MMX, SSE, and SSE2 are
-	dnl supported.
-	JE_COMPILABLE([SSE2 intrinsics], [
-#include <emmintrin.h>
-], [], [je_cv_sse2])
-	if test "x${je_cv_sse2}" = "xyes" ; then
-	  AC_DEFINE_UNQUOTED([HAVE_SSE2], [ ])
-	fi
 	;;
  powerpc)
 	AC_DEFINE_UNQUOTED([HAVE_ALTIVEC], [ ])
@ -258,9 +258,9 @@ dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the
 dnl definitions need to be seen before any headers are included, which is a pain
 dnl to make happen otherwise.
 default_munmap="1"
-JEMALLOC_USABLE_SIZE_CONST="const"
+maps_coalesce="1"
 case "${host}" in
-  *-*-darwin*)
+  *-*-darwin* | *-*-ios*)
 	CFLAGS="$CFLAGS"
 	abi="macho"
 	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
@ -269,7 +269,7 @@ case "${host}" in
 	so="dylib"
 	importlib="${so}"
 	force_tls="0"
-	DSO_LDFLAGS='-shared -Wl,-dylib_install_name,$(@F)'
+	DSO_LDFLAGS='-shared -Wl,-install_name,$(LIBDIR)/$(@F)'
 	SOREV="${rev}.${so}"
 	sbrk_deprecated="1"
 	;;
@ -279,6 +279,22 @@ case "${host}" in
 	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
 	force_lazy_lock="1"
 	;;
+  *-*-dragonfly*)
+	CFLAGS="$CFLAGS"
+	abi="elf"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
+	;;
+  *-*-openbsd*)
+	CFLAGS="$CFLAGS"
+	abi="elf"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
+	force_tls="0"
+	;;
+  *-*-bitrig*)
+	CFLAGS="$CFLAGS"
+	abi="elf"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
+	;;
  *-*-linux*)
 	CFLAGS="$CFLAGS"
 	CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE"
@ -286,7 +302,7 @@ case "${host}" in
 	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H])
 	AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED], [ ])
 	AC_DEFINE([JEMALLOC_THREADED_INIT], [ ])
-	JEMALLOC_USABLE_SIZE_CONST=""
+	AC_DEFINE([JEMALLOC_USE_CXX_THROW], [ ])
 	default_munmap="0"
 	;;
  *-*-netbsd*)
@ -322,9 +338,11 @@ case "${host}" in
 	fi
 	abi="xcoff"
 	;;
-  *-*-mingw*)
+  *-*-mingw* | *-*-cygwin*)
 	abi="pecoff"
 	force_tls="0"
+	force_lazy_lock="1"
+	maps_coalesce="0"
 	RPATH=""
 	so="dll"
 	if test "x$je_cv_msvc" = "xyes" ; then
@ -351,6 +369,22 @@ case "${host}" in
 	abi="elf"
 	;;
 esac
+
+JEMALLOC_USABLE_SIZE_CONST=const
+AC_CHECK_HEADERS([malloc.h], [
+  AC_MSG_CHECKING([whether malloc_usable_size definition can use const argument])
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+    [#include <malloc.h>
+     #include <stddef.h>
+    size_t malloc_usable_size(const void *ptr);
+    ],
+    [])],[
+                AC_MSG_RESULT([yes])
+         ],[
+                JEMALLOC_USABLE_SIZE_CONST=
+                AC_MSG_RESULT([no])
+         ])
+])
 AC_DEFINE_UNQUOTED([JEMALLOC_USABLE_SIZE_CONST], [$JEMALLOC_USABLE_SIZE_CONST])
 AC_SUBST([abi])
 AC_SUBST([RPATH])
@ -387,7 +421,7 @@ SAVED_CFLAGS="${CFLAGS}"
 JE_CFLAGS_APPEND([-Werror])
 JE_COMPILABLE([tls_model attribute], [],
              [static __thread int
-               __attribute__((tls_model("initial-exec"))) foo;
+               __attribute__((tls_model("initial-exec"), unused)) foo;
               foo = 0;],
              [je_cv_tls_model])
 CFLAGS="${SAVED_CFLAGS}"
@ -397,6 +431,36 @@ if test "x${je_cv_tls_model}" = "xyes" ; then
 else
  AC_DEFINE([JEMALLOC_TLS_MODEL], [ ])
 fi
+dnl Check for alloc_size attribute support.
+SAVED_CFLAGS="${CFLAGS}"
+JE_CFLAGS_APPEND([-Werror])
+JE_COMPILABLE([alloc_size attribute], [#include <stdlib.h>],
+              [void *foo(size_t size) __attribute__((alloc_size(1)));],
+              [je_cv_alloc_size])
+CFLAGS="${SAVED_CFLAGS}"
+if test "x${je_cv_alloc_size}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_ALLOC_SIZE], [ ])
+fi
+dnl Check for format(gnu_printf, ...) attribute support.
+SAVED_CFLAGS="${CFLAGS}"
+JE_CFLAGS_APPEND([-Werror])
+JE_COMPILABLE([format(gnu_printf, ...) attribute], [#include <stdlib.h>],
+              [void *foo(const char *format, ...) __attribute__((format(gnu_printf, 1, 2)));],
+              [je_cv_format_gnu_printf])
+CFLAGS="${SAVED_CFLAGS}"
+if test "x${je_cv_format_gnu_printf}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF], [ ])
+fi
+dnl Check for format(printf, ...) attribute support.
+SAVED_CFLAGS="${CFLAGS}"
+JE_CFLAGS_APPEND([-Werror])
+JE_COMPILABLE([format(printf, ...) attribute], [#include <stdlib.h>],
+              [void *foo(const char *format, ...) __attribute__((format(printf, 1, 2)));],
+              [je_cv_format_printf])
+CFLAGS="${SAVED_CFLAGS}"
+if test "x${je_cv_format_printf}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_PRINTF], [ ])
+fi

 dnl Support optional additions to rpath.
 AC_ARG_WITH([rpath],
@ -428,7 +492,7 @@ AC_PROG_RANLIB
 AC_PATH_PROG([LD], [ld], [false], [$PATH])
 AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH])

-public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size"
+public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx sdallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size"

 dnl Check for allocator-related functions that should be wrapped.
 AC_CHECK_FUNC([memalign],
@ -438,24 +502,6 @@ AC_CHECK_FUNC([valloc],
 	      [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ])
 	       public_syms="${public_syms} valloc"])

-dnl Support the experimental API by default.
-AC_ARG_ENABLE([experimental],
-  [AS_HELP_STRING([--disable-experimental],
-   [Disable support for the experimental API])],
-[if test "x$enable_experimental" = "xno" ; then
-  enable_experimental="0"
-else
-  enable_experimental="1"
-fi
-],
-[enable_experimental="1"]
-)
-if test "x$enable_experimental" = "x1" ; then
-  AC_DEFINE([JEMALLOC_EXPERIMENTAL], [ ])
-  public_syms="${public_syms} allocm dallocm nallocm rallocm sallocm"
-fi
-AC_SUBST([enable_experimental])
-
 dnl Do not compute test code coverage by default.
 GCOV_FLAGS=
 AC_ARG_ENABLE([code-coverage],
@ -501,6 +547,7 @@ if test "x$JEMALLOC_PREFIX" != "x" ; then
  AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"])
  AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"])
 fi
+AC_SUBST([JEMALLOC_CPREFIX])

 AC_ARG_WITH([export],
  [AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])],
@ -533,48 +580,54 @@ dnl jemalloc_protos_jet.h easy.
 je_="je_"
 AC_SUBST([je_])

-cfgoutputs_in="${srcroot}Makefile.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/html.xsl.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/manpages.xsl.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/jemalloc.xml.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_macros.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_protos.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/test.sh.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/include/test/jemalloc_test.h.in"
+cfgoutputs_in="Makefile.in"
+cfgoutputs_in="${cfgoutputs_in} jemalloc.pc.in"
+cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_typedefs.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_internal.h.in"
+cfgoutputs_in="${cfgoutputs_in} test/test.sh.in"
+cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in"

 cfgoutputs_out="Makefile"
+cfgoutputs_out="${cfgoutputs_out} jemalloc.pc"
 cfgoutputs_out="${cfgoutputs_out} doc/html.xsl"
 cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl"
 cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml"
 cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h"
 cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h"
+cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_typedefs.h"
 cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h"
 cfgoutputs_out="${cfgoutputs_out} test/test.sh"
 cfgoutputs_out="${cfgoutputs_out} test/include/test/jemalloc_test.h"

 cfgoutputs_tup="Makefile"
+cfgoutputs_tup="${cfgoutputs_tup} jemalloc.pc:jemalloc.pc.in"
 cfgoutputs_tup="${cfgoutputs_tup} doc/html.xsl:doc/html.xsl.in"
 cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in"
 cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in"
 cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in"
 cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in"
+cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_typedefs.h:include/jemalloc/jemalloc_typedefs.h.in"
 cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h"
 cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in"
 cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in"

-cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_namespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_symbols.txt"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_namespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/size_classes.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_rename.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_mangle.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}test/include/test/jemalloc_test_defs.h.in"
+cfghdrs_in="include/jemalloc/jemalloc_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.txt"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/size_classes.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh"
+cfghdrs_in="${cfghdrs_in} test/include/test/jemalloc_test_defs.h.in"

 cfghdrs_out="include/jemalloc/jemalloc_defs.h"
 cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc${install_suffix}.h"
@ -592,21 +645,20 @@ cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/jemalloc_internal_defs.h"
 cfghdrs_out="${cfghdrs_out} test/include/test/jemalloc_test_defs.h"

 cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.in"
-cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in"
-cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:${srcroot}test/include/test/jemalloc_test_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in"

-dnl Do not silence irrelevant compiler warnings by default, since enabling this
-dnl option incurs a performance penalty.
+dnl Silence irrelevant compiler warnings by default.
 AC_ARG_ENABLE([cc-silence],
-  [AS_HELP_STRING([--enable-cc-silence],
-                  [Silence irrelevant compiler warnings])],
+  [AS_HELP_STRING([--disable-cc-silence],
+                  [Do not silence irrelevant compiler warnings])],
 [if test "x$enable_cc_silence" = "xno" ; then
  enable_cc_silence="0"
 else
  enable_cc_silence="1"
 fi
 ],
-[enable_cc_silence="0"]
+[enable_cc_silence="1"]
 )
 if test "x$enable_cc_silence" = "x1" ; then
  AC_DEFINE([JEMALLOC_CC_SILENCE], [ ])
@ -614,7 +666,8 @@ fi

 dnl Do not compile with debugging by default.
 AC_ARG_ENABLE([debug],
-  [AS_HELP_STRING([--enable-debug], [Build debugging code (implies --enable-ivsalloc)])],
+  [AS_HELP_STRING([--enable-debug],
+                  [Build debugging code (implies --enable-ivsalloc)])],
 [if test "x$enable_debug" = "xno" ; then
  enable_debug="0"
 else
@ -623,6 +676,9 @@ fi
 ],
 [enable_debug="0"]
 )
+if test "x$enable_debug" = "x1" ; then
+  AC_DEFINE([JEMALLOC_DEBUG], [ ])
+fi
 if test "x$enable_debug" = "x1" ; then
  AC_DEFINE([JEMALLOC_DEBUG], [ ])
  enable_ivsalloc="1"
@ -631,7 +687,8 @@ AC_SUBST([enable_debug])

 dnl Do not validate pointers by default.
 AC_ARG_ENABLE([ivsalloc],
-  [AS_HELP_STRING([--enable-ivsalloc], [Validate pointers passed through the public API])],
+  [AS_HELP_STRING([--enable-ivsalloc],
+                  [Validate pointers passed through the public API])],
 [if test "x$enable_ivsalloc" = "xno" ; then
  enable_ivsalloc="0"
 else
@ -721,7 +778,7 @@ fi,
 if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then
  AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
  if test "x$LUNWIND" = "x-lunwind" ; then
-    AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"],
+    AC_CHECK_LIB([unwind], [unw_backtrace], [LIBS="$LIBS $LUNWIND"],
                 [enable_prof_libunwind="0"])
  else
    LIBS="$LIBS $LUNWIND"
@ -782,11 +839,6 @@ fi
 AC_MSG_CHECKING([configured backtracing method])
 AC_MSG_RESULT([$backtrace_method])
 if test "x$enable_prof" = "x1" ; then
-  if test "x${force_tls}" = "x0" ; then
-    AC_MSG_ERROR([Heap profiling requires TLS]);
-  fi
-  force_tls="1"
-
  if test "x$abi" != "xpecoff"; then
    dnl Heap profiling uses the log(3) function.
    LIBS="$LIBS -lm"
@ -812,32 +864,11 @@ if test "x$enable_tcache" = "x1" ; then
 fi
 AC_SUBST([enable_tcache])

-dnl Disable mremap() for huge realloc() by default.
-AC_ARG_ENABLE([mremap],
-  [AS_HELP_STRING([--enable-mremap], [Enable mremap(2) for huge realloc()])],
-[if test "x$enable_mremap" = "xno" ; then
-  enable_mremap="0"
-else
-  enable_mremap="1"
+dnl Indicate whether adjacent virtual memory mappings automatically coalesce
+dnl (and fragment on demand).
+if test "x${maps_coalesce}" = "x1" ; then
+  AC_DEFINE([JEMALLOC_MAPS_COALESCE], [ ])
 fi
-],
-[enable_mremap="0"]
-)
-if test "x$enable_mremap" = "x1" ; then
-  JE_COMPILABLE([mremap(...MREMAP_FIXED...)], [
-#define	_GNU_SOURCE
-#include <sys/mman.h>
-], [
-void *p = mremap((void *)0, 0, 0, MREMAP_MAYMOVE|MREMAP_FIXED, (void *)0);
-], [je_cv_mremap_fixed])
-  if test "x${je_cv_mremap_fixed}" = "xno" ; then
-    enable_mremap="0"
-  fi
-fi
-if test "x$enable_mremap" = "x1" ; then
-  AC_DEFINE([JEMALLOC_MREMAP], [ ])
-fi
-AC_SUBST([enable_mremap])

 dnl Enable VM deallocation via munmap() by default.
 AC_ARG_ENABLE([munmap],
@ -855,34 +886,22 @@ if test "x$enable_munmap" = "x1" ; then
 fi
 AC_SUBST([enable_munmap])

-dnl Do not enable allocation from DSS by default.
-AC_ARG_ENABLE([dss],
-  [AS_HELP_STRING([--enable-dss], [Enable allocation from DSS])],
-[if test "x$enable_dss" = "xno" ; then
-  enable_dss="0"
-else
-  enable_dss="1"
-fi
-],
-[enable_dss="0"]
-)
+dnl Enable allocation from DSS if supported by the OS.
+have_dss="1"
 dnl Check whether the BSD/SUSv1 sbrk() exists.  If not, disable DSS support.
 AC_CHECK_FUNC([sbrk], [have_sbrk="1"], [have_sbrk="0"])
 if test "x$have_sbrk" = "x1" ; then
-  if test "x$sbrk_deprecated" == "x1" ; then
+  if test "x$sbrk_deprecated" = "x1" ; then
    AC_MSG_RESULT([Disabling dss allocation because sbrk is deprecated])
-    enable_dss="0"
-  else
-    AC_DEFINE([JEMALLOC_HAVE_SBRK], [ ])
+    have_dss="0"
  fi
 else
-  enable_dss="0"
+  have_dss="0"
 fi

-if test "x$enable_dss" = "x1" ; then
+if test "x$have_dss" = "x1" ; then
  AC_DEFINE([JEMALLOC_DSS], [ ])
 fi
-AC_SUBST([enable_dss])

 dnl Support the junk/zero filling option by default.
 AC_ARG_ENABLE([fill],
@ -974,8 +993,83 @@ if test "x$enable_xmalloc" = "x1" ; then
 fi
 AC_SUBST([enable_xmalloc])

-AC_CACHE_CHECK([STATIC_PAGE_SHIFT],
-               [je_cv_static_page_shift],
+dnl Support cache-oblivious allocation alignment by default.
+AC_ARG_ENABLE([cache-oblivious],
+  [AS_HELP_STRING([--disable-cache-oblivious],
+                  [Disable support for cache-oblivious allocation alignment])],
+[if test "x$enable_cache_oblivious" = "xno" ; then
+  enable_cache_oblivious="0"
+else
+  enable_cache_oblivious="1"
+fi
+],
+[enable_cache_oblivious="1"]
+)
+if test "x$enable_cache_oblivious" = "x1" ; then
+  AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ])
+fi
+AC_SUBST([enable_cache_oblivious])
+
+dnl ============================================================================
+dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither is found.
+dnl One of those two functions should (theoretically) exist on all platforms
+dnl that jemalloc currently has a chance of functioning on without modification.
+dnl We additionally assume that ffs() or __builtin_ffs() is defined whenever
+dnl ffsl() or __builtin_ffsl() is defined, respectively.
+JE_COMPILABLE([a program using __builtin_ffsl], [
+#include <stdio.h>
+#include <strings.h>
+#include <string.h>
+], [
+	{
+		int rv = __builtin_ffsl(0x08);
+		printf("%d\n", rv);
+	}
+], [je_cv_gcc_builtin_ffsl])
+if test "x${je_cv_gcc_builtin_ffsl}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
+  AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
+else
+  JE_COMPILABLE([a program using ffsl], [
+  #include <stdio.h>
+  #include <strings.h>
+  #include <string.h>
+  ], [
+	{
+		int rv = ffsl(0x08);
+		printf("%d\n", rv);
+	}
+  ], [je_cv_function_ffsl])
+  if test "x${je_cv_function_ffsl}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
+    AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
+  else
+    AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
+  fi
+fi
+
+AC_ARG_WITH([lg_tiny_min],
+  [AS_HELP_STRING([--with-lg-tiny-min=<lg-tiny-min>],
+   [Base 2 log of minimum tiny size class to support])],
+  [LG_TINY_MIN="$with_lg_tiny_min"],
+  [LG_TINY_MIN="3"])
+AC_DEFINE_UNQUOTED([LG_TINY_MIN], [$LG_TINY_MIN])
+
+AC_ARG_WITH([lg_quantum],
+  [AS_HELP_STRING([--with-lg-quantum=<lg-quantum>],
+   [Base 2 log of minimum allocation alignment])],
+  [LG_QUANTA="$with_lg_quantum"],
+  [LG_QUANTA="3 4"])
+if test "x$with_lg_quantum" != "x" ; then
+  AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum])
+fi
+
+AC_ARG_WITH([lg_page],
+  [AS_HELP_STRING([--with-lg-page=<lg-page>], [Base 2 log of system page size])],
+  [LG_PAGE="$with_lg_page"], [LG_PAGE="detect"])
+if test "x$LG_PAGE" = "xdetect"; then
+  AC_CACHE_CHECK([LG_PAGE],
+               [je_cv_lg_page],
               AC_RUN_IFELSE([AC_LANG_PROGRAM(
 [[
 #include <strings.h>
@ -1000,7 +1094,7 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT],
    if (result == -1) {
 	return 1;
    }
-    result = ffsl(result) - 1;
+    result = JEMALLOC_INTERNAL_FFSL(result) - 1;

    f = fopen("conftest.out", "w");
    if (f == NULL) {
@ -1011,24 +1105,65 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT],

    return 0;
 ]])],
-                             [je_cv_static_page_shift=`cat conftest.out`],
-                             [je_cv_static_page_shift=undefined]))
-
-if test "x$je_cv_static_page_shift" != "xundefined"; then
-   AC_DEFINE_UNQUOTED([STATIC_PAGE_SHIFT], [$je_cv_static_page_shift])
-else
-   AC_MSG_ERROR([cannot determine value for STATIC_PAGE_SHIFT])
+                             [je_cv_lg_page=`cat conftest.out`],
+                             [je_cv_lg_page=undefined],
+                             [je_cv_lg_page=12]))
 fi
+dnl A non-empty je_cv_lg_page (detected or cached) replaces LG_PAGE.  The value
+dnl "undefined" marks a failed probe and aborts configuration.
+if test "x${je_cv_lg_page}" != "x" ; then
+  LG_PAGE="${je_cv_lg_page}"
+fi
+if test "x${LG_PAGE}" != "xundefined" ; then
+   AC_DEFINE_UNQUOTED([LG_PAGE], [$LG_PAGE])
+else
+   AC_MSG_ERROR([cannot determine value for LG_PAGE])
+fi
+
+dnl --with-lg-page-sizes: defaults to the single LG_PAGE value chosen above.
+AC_ARG_WITH([lg_page_sizes],
+  [AS_HELP_STRING([--with-lg-page-sizes=<lg-page-sizes>],
+   [Base 2 logs of system page sizes to support])],
+  [LG_PAGE_SIZES="$with_lg_page_sizes"], [LG_PAGE_SIZES="$LG_PAGE"])
+
+dnl --with-lg-size-class-group: defaults to 2 when the option is not given.
+AC_ARG_WITH([lg_size_class_group],
+  [AS_HELP_STRING([--with-lg-size-class-group=<lg-size-class-group>],
+   [Base 2 log of size classes per doubling])],
+  [LG_SIZE_CLASS_GROUP="$with_lg_size_class_group"],
+  [LG_SIZE_CLASS_GROUP="2"])

 dnl ============================================================================
 dnl jemalloc configuration.
 dnl 

-dnl Set VERSION if source directory has an embedded git repository.
-if test -d "${srcroot}.git" ; then
-  git describe --long --abbrev=40 > ${srcroot}VERSION
+dnl Set VERSION if source directory is inside a git repository.  Only change
+dnl into srcroot when it is non-empty so that git inspects the source tree
+dnl rather than whatever directory configure happens to run in.
+if test "x`test -n \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then
+  dnl Pattern globs aren't powerful enough to match both single- and
+  dnl double-digit version numbers, so iterate over patterns to support up to
+  dnl version 99.99.99 without any accidental matches.
+  rm -f "${objroot}VERSION"
+  for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \
+                 '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \
+                 '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \
+                 '[0-9][0-9].[0-9][0-9].[0-9]' \
+                 '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do
+    if test ! -e "${objroot}VERSION" ; then
+      dnl Write to a temporary file first so that a failed git describe never
+      dnl leaves a partial VERSION behind.
+      (test -n "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null
+      if test $? -eq 0 ; then
+        mv "${objroot}VERSION.tmp" "${objroot}VERSION"
+        break
+      fi
+    fi
+  done
 fi
-jemalloc_version=`cat ${srcroot}VERSION`
+dnl Clean up after the git probe, then make sure objroot has a VERSION file:
+dnl copy the one shipped in srcroot if present, otherwise write a placeholder.
+rm -f "${objroot}VERSION.tmp"
+if test ! -e "${objroot}VERSION" ; then
+  if test ! -e "${srcroot}VERSION" ; then
+    AC_MSG_RESULT(
+      [Missing VERSION file, and unable to generate it; creating bogus VERSION])
+    echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${objroot}VERSION"
+  else
+    dnl Quote both paths so directories containing whitespace keep working.
+    cp "${srcroot}VERSION" "${objroot}VERSION"
+  fi
+fi
+jemalloc_version=`cat "${objroot}VERSION"`
 jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
 jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'`
 jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'`
@ -1055,6 +1190,24 @@ fi

 CPPFLAGS="$CPPFLAGS -D_REENTRANT"

+dnl Check if the GNU-specific secure_getenv function exists.  The outcome is
+dnl recorded in have_secure_getenv (1 or 0) and JEMALLOC_HAVE_SECURE_GETENV is
+dnl defined to an empty value when the function is found.
+AC_CHECK_FUNC([secure_getenv],
+              [have_secure_getenv="1"],
+              [have_secure_getenv="0"]
+             )
+if test "x$have_secure_getenv" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_SECURE_GETENV], [ ])
+fi
+
+dnl Check if the Solaris/BSD issetugid function exists.  The outcome is
+dnl recorded in have_issetugid (1 or 0) and JEMALLOC_HAVE_ISSETUGID is defined
+dnl to an empty value when the function is found.
+AC_CHECK_FUNC([issetugid],
+              [have_issetugid="1"],
+              [have_issetugid="0"]
+             )
+if test "x$have_issetugid" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ISSETUGID], [ ])
+fi
+
 dnl Check whether the BSD-specific _malloc_thread_cleanup() exists.  If so, use
 dnl it rather than pthreads TSD cleanup functions to support cleanup during
 dnl thread exit, in order to avoid pthreads library recursion during
@ -1089,9 +1242,9 @@ else
  enable_lazy_lock="1"
 fi
 ],
-[enable_lazy_lock="0"]
+[enable_lazy_lock=""]
 )
-if test "x$enable_lazy_lock" = "x0" -a "x${force_lazy_lock}" = "x1" ; then
+if test "x$enable_lazy_lock" = "x" -a "x${force_lazy_lock}" = "x1" ; then
  AC_MSG_RESULT([Forcing lazy-lock to avoid allocator/threading bootstrap issues])
  enable_lazy_lock="1"
 fi
@ -1104,6 +1257,8 @@ if test "x$enable_lazy_lock" = "x1" ; then
      ])
  fi
  AC_DEFINE([JEMALLOC_LAZY_LOCK], [ ])
+else
+  enable_lazy_lock="0"
 fi
 AC_SUBST([enable_lazy_lock])

@ -1115,13 +1270,13 @@ else
  enable_tls="1"
 fi
 ,
-enable_tls="1"
+enable_tls=""
 )
-if test "x${enable_tls}" = "x0" -a "x${force_tls}" = "x1" ; then
+if test "x${enable_tls}" = "x" -a "x${force_tls}" = "x1" ; then
  AC_MSG_RESULT([Forcing TLS to avoid allocator/threading bootstrap issues])
  enable_tls="1"
 fi
-if test "x${enable_tls}" = "x1" -a "x${force_tls}" = "x0" ; then
+if test "x${enable_tls}" = "x" -a "x${force_tls}" = "x0" ; then
  AC_MSG_RESULT([Forcing no TLS to avoid allocator/threading bootstrap issues])
  enable_tls="0"
 fi
@ -1138,6 +1293,8 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
              AC_MSG_RESULT([yes]),
              AC_MSG_RESULT([no])
              enable_tls="0")
+else
+  enable_tls="0"
 fi
 AC_SUBST([enable_tls])
 if test "x${enable_tls}" = "x1" ; then
@ -1147,21 +1304,24 @@ elif test "x${force_tls}" = "x1" ; then
 fi

 dnl ============================================================================
-dnl Check for ffsl(3), and fail if not found.  This function exists on all
-dnl platforms that jemalloc currently has a chance of functioning on without
-dnl modification.
-JE_COMPILABLE([a program using ffsl], [
-#include <stdio.h>
-#include <strings.h>
-#include <string.h>
+dnl Check for C11 atomics.
+
+JE_COMPILABLE([C11 atomics], [
+#include <stdint.h>
+#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
+#include <stdatomic.h>
+#else
+#error Atomics not available
+#endif
 ], [
-	{
-		int rv = ffsl(0x08);
-		printf("%d\n", rv);
-	}
-], [je_cv_function_ffsl])
-if test "x${je_cv_function_ffsl}" != "xyes" ; then
-   AC_MSG_ERROR([Cannot build without ffsl(3)])
+    uint64_t *p = (uint64_t *)0;
+    uint64_t x = 1;
+    volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
+    uint64_t r = atomic_fetch_add(a, x) + x;
+    return (r == 0);
+], [je_cv_c11atomics])
+if test "x${je_cv_c11atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_C11ATOMICS])
 fi

 dnl ============================================================================
@ -1209,6 +1369,20 @@ if test "x${je_cv_osatomic}" = "xyes" ; then
  AC_DEFINE([JEMALLOC_OSATOMIC], [ ])
 fi

+dnl ============================================================================
+dnl Check for madvise(2).
+dnl The probe calls madvise with dummy arguments; JEMALLOC_HAVE_MADVISE is
+dnl defined (empty) when je_cv_madvise comes back as yes.
+
+JE_COMPILABLE([madvise(2)], [
+#include <sys/mman.h>
+], [
+	{
+		madvise((void *)0, 0, 0);
+	}
+], [je_cv_madvise])
+if test "x${je_cv_madvise}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_MADVISE], [ ])
+fi
+
 dnl ============================================================================
 dnl Check whether __sync_{add,sub}_and_fetch() are available despite
 dnl __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n macros being undefined.
@ -1243,6 +1417,29 @@ if test "x${je_cv_atomic9}" != "xyes" -a "x${je_cv_osatomic}" != "xyes" ; then
  JE_SYNC_COMPARE_AND_SWAP_CHECK(64, 8)
 fi

+dnl ============================================================================
+dnl Check for __builtin_clz() and __builtin_clzl().
+dnl Both builtins are exercised by one link test and share a single cache
+dnl variable and a single define.
+
+AC_CACHE_CHECK([for __builtin_clz],
+               [je_cv_builtin_clz],
+               [AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+                                                [
+                                                {
+                                                        unsigned x = 0;
+                                                        int y = __builtin_clz(x);
+                                                }
+                                                {
+                                                        unsigned long x = 0;
+                                                        int y = __builtin_clzl(x);
+                                                }
+                                                ])],
+                               [je_cv_builtin_clz=yes],
+                               [je_cv_builtin_clz=no])])
+
+if test "x${je_cv_builtin_clz}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_BUILTIN_CLZ], [ ])
+fi
+
 dnl ============================================================================
 dnl Check for spinlock(3) operations as provided on Darwin.

@ -1281,7 +1478,6 @@ if test "x${enable_zone_allocator}" = "x1" ; then
  if test "x${abi}" != "xmacho"; then
    AC_MSG_ERROR([--enable-zone-allocator is only supported on Darwin])
  fi
-  AC_DEFINE([JEMALLOC_IVSALLOC], [ ])
  AC_DEFINE([JEMALLOC_ZONE], [ ])

  dnl The szone version jumped from 3 to 6 between the OS X 10.5.x and 10.6
@ -1291,7 +1487,7 @@ if test "x${enable_zone_allocator}" = "x1" ; then
  AC_DEFUN([JE_ZONE_PROGRAM],
    [AC_LANG_PROGRAM(
      [#include <malloc/malloc.h>],
-      [static foo[[sizeof($1) $2 sizeof(void *) * $3 ? 1 : -1]]]
+      [static int foo[[sizeof($1) $2 sizeof(void *) * $3 ? 1 : -1]]]
    )])

  AC_COMPILE_IFELSE([JE_ZONE_PROGRAM(malloc_zone_t,==,14)],[JEMALLOC_ZONE_VERSION=3],[
@ -1316,6 +1512,49 @@ if test "x${enable_zone_allocator}" = "x1" ; then
  AC_DEFINE_UNQUOTED(JEMALLOC_ZONE_VERSION, [$JEMALLOC_ZONE_VERSION])
 fi

+dnl ============================================================================
+dnl Check for glibc malloc hooks
+dnl The hook variables are declared locally via extern rather than taken from a
+dnl system header; the probe body references the malloc, realloc and free hooks.
+
+JE_COMPILABLE([glibc malloc hook], [
+#include <stddef.h>
+
+extern void (* __free_hook)(void *ptr);
+extern void *(* __malloc_hook)(size_t size);
+extern void *(* __realloc_hook)(void *ptr, size_t size);
+], [
+  void *ptr = 0L;
+  if (__malloc_hook) ptr = __malloc_hook(1);
+  if (__realloc_hook) ptr = __realloc_hook(ptr, 2);
+  if (__free_hook && ptr) __free_hook(ptr);
+], [je_cv_glibc_malloc_hook])
+if test "x${je_cv_glibc_malloc_hook}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_GLIBC_MALLOC_HOOK], [ ])
+fi
+
+dnl The memalign hook is probed separately from the malloc hooks and gets its
+dnl own define.
+JE_COMPILABLE([glibc memalign hook], [
+#include <stddef.h>
+
+extern void *(* __memalign_hook)(size_t alignment, size_t size);
+], [
+  void *ptr = 0L;
+  if (__memalign_hook) ptr = __memalign_hook(16, 7);
+], [je_cv_glibc_memalign_hook])
+if test "x${je_cv_glibc_memalign_hook}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_GLIBC_MEMALIGN_HOOK], [ ])
+fi
+
+dnl Check whether PTHREAD_MUTEX_ADAPTIVE_NP can be passed to
+dnl pthread_mutexattr_settype; if so JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP is
+dnl defined (empty).
+JE_COMPILABLE([pthreads adaptive mutexes], [
+#include <pthread.h>
+], [
+  pthread_mutexattr_t attr;
+  pthread_mutexattr_init(&attr);
+  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+  pthread_mutexattr_destroy(&attr);
+], [je_cv_pthread_mutex_adaptive_np])
+if test "x${je_cv_pthread_mutex_adaptive_np}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP], [ ])
+fi
+
 dnl ============================================================================
 dnl Check for typedefs, structures, and compiler characteristics.
 AC_HEADER_STDBOOL
@ -1376,10 +1615,14 @@ AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [
 ])
 AC_CONFIG_COMMANDS([include/jemalloc/internal/size_classes.h], [
  mkdir -p "${objroot}include/jemalloc/internal"
-  "${srcdir}/include/jemalloc/internal/size_classes.sh" > "${objroot}include/jemalloc/internal/size_classes.h"
+  "${srcdir}/include/jemalloc/internal/size_classes.sh" "${LG_QUANTA}" ${LG_TINY_MIN} "${LG_PAGE_SIZES}" ${LG_SIZE_CLASS_GROUP} > "${objroot}include/jemalloc/internal/size_classes.h"
 ], [
  srcdir="${srcdir}"
  objroot="${objroot}"
+  LG_QUANTA="${LG_QUANTA}"
+  LG_TINY_MIN=${LG_TINY_MIN}
+  LG_PAGE_SIZES="${LG_PAGE_SIZES}"
+  LG_SIZE_CLASS_GROUP=${LG_SIZE_CLASS_GROUP}
 ])
 AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [
  mkdir -p "${objroot}include/jemalloc"
@ -1426,7 +1669,7 @@ AC_CONFIG_HEADERS([$cfghdrs_tup])
 dnl ============================================================================
 dnl Generate outputs.

-AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc.sh])
+AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc-config bin/jemalloc.sh bin/jeprof])
 AC_SUBST([cfgoutputs_in])
 AC_SUBST([cfgoutputs_out])
 AC_OUTPUT
@ -1437,9 +1680,10 @@ AC_MSG_RESULT([=================================================================
 AC_MSG_RESULT([jemalloc version   : ${jemalloc_version}])
 AC_MSG_RESULT([library revision   : ${rev}])
 AC_MSG_RESULT([])
+AC_MSG_RESULT([CONFIG             : ${CONFIG}])
 AC_MSG_RESULT([CC                 : ${CC}])
-AC_MSG_RESULT([CPPFLAGS           : ${CPPFLAGS}])
 AC_MSG_RESULT([CFLAGS             : ${CFLAGS}])
+AC_MSG_RESULT([CPPFLAGS           : ${CPPFLAGS}])
 AC_MSG_RESULT([LDFLAGS            : ${LDFLAGS}])
 AC_MSG_RESULT([EXTRA_LDFLAGS      : ${EXTRA_LDFLAGS}])
 AC_MSG_RESULT([LIBS               : ${LIBS}])
@ -1450,9 +1694,9 @@ AC_MSG_RESULT([XSLROOT            : ${XSLROOT}])
 AC_MSG_RESULT([])
 AC_MSG_RESULT([PREFIX             : ${PREFIX}])
 AC_MSG_RESULT([BINDIR             : ${BINDIR}])
+AC_MSG_RESULT([DATADIR            : ${DATADIR}])
 AC_MSG_RESULT([INCLUDEDIR         : ${INCLUDEDIR}])
 AC_MSG_RESULT([LIBDIR             : ${LIBDIR}])
-AC_MSG_RESULT([DATADIR            : ${DATADIR}])
 AC_MSG_RESULT([MANDIR             : ${MANDIR}])
 AC_MSG_RESULT([])
 AC_MSG_RESULT([srcroot            : ${srcroot}])
@ -1465,7 +1709,6 @@ AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE])
 AC_MSG_RESULT([                   : ${JEMALLOC_PRIVATE_NAMESPACE}])
 AC_MSG_RESULT([install_suffix     : ${install_suffix}])
 AC_MSG_RESULT([autogen            : ${enable_autogen}])
-AC_MSG_RESULT([experimental       : ${enable_experimental}])
 AC_MSG_RESULT([cc-silence         : ${enable_cc_silence}])
 AC_MSG_RESULT([debug              : ${enable_debug}])
 AC_MSG_RESULT([code-coverage      : ${enable_code_coverage}])
@ -1479,9 +1722,8 @@ AC_MSG_RESULT([fill               : ${enable_fill}])
 AC_MSG_RESULT([utrace             : ${enable_utrace}])
 AC_MSG_RESULT([valgrind           : ${enable_valgrind}])
 AC_MSG_RESULT([xmalloc            : ${enable_xmalloc}])
-AC_MSG_RESULT([mremap             : ${enable_mremap}])
 AC_MSG_RESULT([munmap             : ${enable_munmap}])
-AC_MSG_RESULT([dss                : ${enable_dss}])
 AC_MSG_RESULT([lazy_lock          : ${enable_lazy_lock}])
 AC_MSG_RESULT([tls                : ${enable_tls}])
+AC_MSG_RESULT([cache-oblivious    : ${enable_cache_oblivious}])
 AC_MSG_RESULT([===============================================================================])
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
--- a/include/jemalloc/internal/atomic.h
+++ b/include/jemalloc/internal/atomic.h
@ -11,6 +11,7 @@

 #define	atomic_read_uint64(p)	atomic_add_uint64(p, 0)
 #define	atomic_read_uint32(p)	atomic_add_uint32(p, 0)
+#define	atomic_read_p(p)	atomic_add_p(p, NULL)
 #define	atomic_read_z(p)	atomic_add_z(p, 0)
 #define	atomic_read_u(p)	atomic_add_u(p, 0)

@ -18,89 +19,139 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

+/*
+ * All arithmetic functions return the arithmetic result of the atomic
+ * operation.  Some atomic operation APIs return the value prior to mutation, in
+ * which case the following functions must redundantly compute the result so
+ * that it can be returned.  These functions are normally inlined, so the extra
+ * operations can be optimized away if the return values aren't used by the
+ * callers.
+ *
+ *   <t> atomic_read_<t>(<t> *p) { return (*p); }
+ *   <t> atomic_add_<t>(<t> *p, <t> x) { return (*p + x); }
+ *   <t> atomic_sub_<t>(<t> *p, <t> x) { return (*p - x); }
+ *   bool atomic_cas_<t>(<t> *p, <t> c, <t> s)
+ *   {
+ *     if (*p != c)
+ *       return (true);
+ *     *p = s;
+ *     return (false);
+ *   }
+ *   void atomic_write_<t>(<t> *p, <t> x) { *p = x; }
+ */
+
 #ifndef JEMALLOC_ENABLE_INLINE
 uint64_t	atomic_add_uint64(uint64_t *p, uint64_t x);
 uint64_t	atomic_sub_uint64(uint64_t *p, uint64_t x);
+bool	atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s);
+void	atomic_write_uint64(uint64_t *p, uint64_t x);
 uint32_t	atomic_add_uint32(uint32_t *p, uint32_t x);
 uint32_t	atomic_sub_uint32(uint32_t *p, uint32_t x);
+bool	atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s);
+void	atomic_write_uint32(uint32_t *p, uint32_t x);
+void	*atomic_add_p(void **p, void *x);
+void	*atomic_sub_p(void **p, void *x);
+bool	atomic_cas_p(void **p, void *c, void *s);
+void	atomic_write_p(void **p, const void *x);
 size_t	atomic_add_z(size_t *p, size_t x);
 size_t	atomic_sub_z(size_t *p, size_t x);
+bool	atomic_cas_z(size_t *p, size_t c, size_t s);
+void	atomic_write_z(size_t *p, size_t x);
 unsigned	atomic_add_u(unsigned *p, unsigned x);
 unsigned	atomic_sub_u(unsigned *p, unsigned x);
+bool	atomic_cas_u(unsigned *p, unsigned c, unsigned s);
+void	atomic_write_u(unsigned *p, unsigned x);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
 /******************************************************************************/
 /* 64-bit operations. */
 #if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
-#  ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (__sync_add_and_fetch(p, x));
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (__sync_sub_and_fetch(p, x));
-}
-#elif (defined(_MSC_VER))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (InterlockedExchangeAdd64(p, x));
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (InterlockedExchangeAdd64(p, -((int64_t)x)));
-}
-#elif (defined(JEMALLOC_OSATOMIC))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (OSAtomicAdd64((int64_t)x, (int64_t *)p));
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
-}
-#  elif (defined(__amd64__) || defined(__x86_64__))
+#  if (defined(__amd64__) || defined(__x86_64__))
 JEMALLOC_INLINE uint64_t
 atomic_add_uint64(uint64_t *p, uint64_t x)
 {
+	uint64_t t = x;

 	asm volatile (
 	    "lock; xaddq %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
+	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );

-	return (x);
+	return (t + x);
 }

 JEMALLOC_INLINE uint64_t
 atomic_sub_uint64(uint64_t *p, uint64_t x)
 {
+	uint64_t t;

 	x = (uint64_t)(-(int64_t)x);
+	t = x;
 	asm volatile (
 	    "lock; xaddq %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
+	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );

-	return (x);
+	return (t + x);
+}
+
+/*
+ * Compare-and-swap: if *p equals c, store s into *p.  Note the inverted
+ * return value: false means the swap happened, true means it did not.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
+{
+	uint8_t success;
+
+	/* sete captures the comparison outcome of cmpxchgq in success. */
+	asm volatile (
+	    "lock; cmpxchgq %4, %0;"
+	    "sete %1;"
+	    : "=m" (*p), "=a" (success) /* Outputs. */
+	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
+	    : "memory" /* Clobbers. */
+	    );
+
+	return (!(bool)success);
+}
+
+/*
+ * Store x into *p by exchanging it with a register; the previous contents of
+ * *p end up in the local copy of x and are discarded.
+ */
+JEMALLOC_INLINE void
+atomic_write_uint64(uint64_t *p, uint64_t x)
+{
+
+	asm volatile (
+	    "xchgq %1, %0;" /* Lock is implied by xchgq. */
+	    : "=m" (*p), "+r" (x) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    : "memory" /* Clobbers. */
+	    );
+}
+#  elif (defined(JEMALLOC_C11ATOMICS))
+/*
+ * C11 <stdatomic.h> implementations.  The plain uint64_t pointer is
+ * reinterpreted as a pointer to atomic_uint_least64_t in every function.
+ */
+
+/* atomic_fetch_add() yields the prior value, so x is added to return the sum. */
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
+	return (atomic_fetch_add(a, x) + x);
+}
+
+/* atomic_fetch_sub() yields the prior value, so x is subtracted again. */
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
+	return (atomic_fetch_sub(a, x) - x);
+}
+
+/* Returns false when the swap happened, true otherwise (result is negated). */
+JEMALLOC_INLINE bool
+atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
+{
+	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
+	return (!atomic_compare_exchange_strong(a, &c, s));
+}
+
+/* Atomically store x into *p. */
+JEMALLOC_INLINE void
+atomic_write_uint64(uint64_t *p, uint64_t x)
+{
+	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
+	atomic_store(a, x);
+}
 #  elif (defined(JEMALLOC_ATOMIC9))
 JEMALLOC_INLINE uint64_t
@ -124,7 +175,88 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)

 	return (atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x);
 }
-#  elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
+
+/*
+ * Returns false when the swap happened, true otherwise: the result of
+ * atomic_cmpset_long() is negated.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
+{
+
+	/* The *_long primitive is used for a 64-bit value; check the width. */
+	assert(sizeof(uint64_t) == sizeof(unsigned long));
+
+	return (!atomic_cmpset_long(p, (unsigned long)c, (unsigned long)s));
+}
+
+/* Store x into *p via atomic_store_rel_long(). */
+JEMALLOC_INLINE void
+atomic_write_uint64(uint64_t *p, uint64_t x)
+{
+
+	/* The *_long primitive is used for a 64-bit value; check the width. */
+	assert(sizeof(uint64_t) == sizeof(unsigned long));
+
+	atomic_store_rel_long(p, x);
+}
+#  elif (defined(JEMALLOC_OSATOMIC))
+/* Add x to *p through OSAtomicAdd64() and return its result. */
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (OSAtomicAdd64((int64_t)x, (int64_t *)p));
+}
+
+/* Subtract x from *p by adding the negated operand. */
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
+}
+
+/* Returns false when the swap happened, true otherwise (result is negated). */
+JEMALLOC_INLINE bool
+atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
+{
+
+	return (!OSAtomicCompareAndSwap64(c, s, (int64_t *)p));
+}
+
+/* Store x into *p by retrying a compare-and-swap until it succeeds. */
+JEMALLOC_INLINE void
+atomic_write_uint64(uint64_t *p, uint64_t x)
+{
+	uint64_t o;
+
+	/* The documented OSAtomic*() API does not expose an atomic exchange. */
+	do {
+		o = atomic_read_uint64(p);
+	} while (atomic_cas_uint64(p, o, x));
+}
+#  elif (defined(_MSC_VER))
+/* x is added to the intrinsic's result so that the new value is returned. */
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (InterlockedExchangeAdd64(p, x) + x);
+}
+
+/* Adds the negated operand, then subtracts x to return the new value. */
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x);
+}
+
+/*
+ * The value returned by InterlockedCompareExchange64() is compared against c:
+ * false is returned when they match (swap happened), true otherwise.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
+{
+	uint64_t o;
+
+	o = InterlockedCompareExchange64(p, s, c);
+	return (o != c);
+}
+
+/* Store x into *p; the result of InterlockedExchange64() is ignored. */
+JEMALLOC_INLINE void
+atomic_write_uint64(uint64_t *p, uint64_t x)
+{
+
+	InterlockedExchange64(p, x);
+}
+#  elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \
+    defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
 JEMALLOC_INLINE uint64_t
 atomic_add_uint64(uint64_t *p, uint64_t x)
 {
@ -138,6 +270,20 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)

 	return (__sync_sub_and_fetch(p, x));
 }
+
+/* Returns false when the swap happened, true otherwise (result is negated). */
+JEMALLOC_INLINE bool
+atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
+{
+
+	return (!__sync_bool_compare_and_swap(p, c, s));
+}
+
+/* Store x into *p; the result of __sync_lock_test_and_set() is ignored. */
+JEMALLOC_INLINE void
+atomic_write_uint64(uint64_t *p, uint64_t x)
+{
+
+	__sync_lock_test_and_set(p, x);
+}
 #  else
 #    error "Missing implementation for 64-bit atomic operations"
 #  endif
@ -145,74 +291,91 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)

 /******************************************************************************/
 /* 32-bit operations. */
-#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (__sync_add_and_fetch(p, x));
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (__sync_sub_and_fetch(p, x));
-}
-#elif (defined(_MSC_VER))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (InterlockedExchangeAdd(p, x));
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (InterlockedExchangeAdd(p, -((int32_t)x)));
-}
-#elif (defined(JEMALLOC_OSATOMIC))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (OSAtomicAdd32((int32_t)x, (int32_t *)p));
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
-}
-#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
+#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
 JEMALLOC_INLINE uint32_t
 atomic_add_uint32(uint32_t *p, uint32_t x)
 {
+	uint32_t t = x;

 	asm volatile (
 	    "lock; xaddl %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
+	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );

-	return (x);
+	return (t + x);
 }

 JEMALLOC_INLINE uint32_t
 atomic_sub_uint32(uint32_t *p, uint32_t x)
 {
+	uint32_t t;

 	x = (uint32_t)(-(int32_t)x);
+	t = x;
 	asm volatile (
 	    "lock; xaddl %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
+	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );

-	return (x);
+	return (t + x);
+}
+
+/*
+ * Compare-and-swap: if *p equals c, store s into *p.  Note the inverted
+ * return value: false means the swap happened, true means it did not.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
+{
+	uint8_t success;
+
+	/* sete captures the comparison outcome of cmpxchgl in success. */
+	asm volatile (
+	    "lock; cmpxchgl %4, %0;"
+	    "sete %1;"
+	    : "=m" (*p), "=a" (success) /* Outputs. */
+	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
+	    : "memory" /* Clobbers. */
+	    );
+
+	return (!(bool)success);
+}
+
+/*
+ * Store x into *p by exchanging it with a register; the previous contents of
+ * *p end up in the local copy of x and are discarded.
+ */
+JEMALLOC_INLINE void
+atomic_write_uint32(uint32_t *p, uint32_t x)
+{
+
+	asm volatile (
+	    "xchgl %1, %0;" /* Lock is implied by xchgl. */
+	    : "=m" (*p), "+r" (x) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    : "memory" /* Clobbers. */
+	    );
+}
+#  elif (defined(JEMALLOC_C11ATOMICS))
+/*
+ * C11 <stdatomic.h> implementations.  The plain uint32_t pointer is
+ * reinterpreted as a pointer to atomic_uint_least32_t in every function.
+ */
+
+/* atomic_fetch_add() yields the prior value, so x is added to return the sum. */
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
+	return (atomic_fetch_add(a, x) + x);
+}
+
+/* atomic_fetch_sub() yields the prior value, so x is subtracted again. */
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
+	return (atomic_fetch_sub(a, x) - x);
+}
+
+/* Returns false when the swap happened, true otherwise (result is negated). */
+JEMALLOC_INLINE bool
+atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
+{
+	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
+	return (!atomic_compare_exchange_strong(a, &c, s));
+}
+
+/* Atomically store x into *p. */
+JEMALLOC_INLINE void
+atomic_write_uint32(uint32_t *p, uint32_t x)
+{
+	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
+	atomic_store(a, x);
+}
 #elif (defined(JEMALLOC_ATOMIC9))
 JEMALLOC_INLINE uint32_t
@ -228,7 +391,84 @@ atomic_sub_uint32(uint32_t *p, uint32_t x)

 	return (atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x);
 }
-#elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
+
+/*
+ * Returns false when the swap happened, true otherwise: the result of
+ * atomic_cmpset_32() is negated.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
+{
+
+	return (!atomic_cmpset_32(p, c, s));
+}
+
+/* Store x into *p via atomic_store_rel_32(). */
+JEMALLOC_INLINE void
+atomic_write_uint32(uint32_t *p, uint32_t x)
+{
+
+	atomic_store_rel_32(p, x);
+}
+#elif (defined(JEMALLOC_OSATOMIC))
+/* Add x to *p through OSAtomicAdd32() and return its result. */
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (OSAtomicAdd32((int32_t)x, (int32_t *)p));
+}
+
+/* Subtract x from *p by adding the negated operand. */
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
+}
+
+/* Returns false when the swap happened, true otherwise (result is negated). */
+JEMALLOC_INLINE bool
+atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
+{
+
+	return (!OSAtomicCompareAndSwap32(c, s, (int32_t *)p));
+}
+
+/* Store x into *p by retrying a compare-and-swap until it succeeds. */
+JEMALLOC_INLINE void
+atomic_write_uint32(uint32_t *p, uint32_t x)
+{
+	uint32_t o;
+
+	/* The documented OSAtomic*() API does not expose an atomic exchange. */
+	do {
+		o = atomic_read_uint32(p);
+	} while (atomic_cas_uint32(p, o, x));
+}
+#elif (defined(_MSC_VER))
+/* x is added to the intrinsic's result so that the new value is returned. */
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (InterlockedExchangeAdd(p, x) + x);
+}
+
+/* Adds the negated operand, then subtracts x to return the new value. */
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (InterlockedExchangeAdd(p, -((int32_t)x)) - x);
+}
+
+/*
+ * The value returned by InterlockedCompareExchange() is compared against c:
+ * false is returned when they match (swap happened), true otherwise.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
+{
+	uint32_t o;
+
+	o = InterlockedCompareExchange(p, s, c);
+	return (o != c);
+}
+
+/* Store x into *p; the result of InterlockedExchange() is ignored. */
+JEMALLOC_INLINE void
+atomic_write_uint32(uint32_t *p, uint32_t x)
+{
+
+	InterlockedExchange(p, x);
+}
+#elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \
+ defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
 JEMALLOC_INLINE uint32_t
 atomic_add_uint32(uint32_t *p, uint32_t x)
 {
@ -242,10 +482,72 @@ atomic_sub_uint32(uint32_t *p, uint32_t x)

 	return (__sync_sub_and_fetch(p, x));
 }
+
+/* Returns false when the swap happened, true otherwise (result is negated). */
+JEMALLOC_INLINE bool
+atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
+{
+
+	return (!__sync_bool_compare_and_swap(p, c, s));
+}
+
+/* Store x into *p; the result of __sync_lock_test_and_set() is ignored. */
+JEMALLOC_INLINE void
+atomic_write_uint32(uint32_t *p, uint32_t x)
+{
+
+	__sync_lock_test_and_set(p, x);
+}
 #else
 #  error "Missing implementation for 32-bit atomic operations"
 #endif

+/******************************************************************************/
+/* Pointer operations. */
+/*
+ * Pointer wrappers: each one forwards to the 64-bit or 32-bit integer
+ * primitive selected by LG_SIZEOF_PTR.  No branch exists for other pointer
+ * sizes.
+ */
+
+/* Add x to *p (treated as an integer) and return the result as a pointer. */
+JEMALLOC_INLINE void *
+atomic_add_p(void **p, void *x)
+{
+
+#if (LG_SIZEOF_PTR == 3)
+	return ((void *)atomic_add_uint64((uint64_t *)p, (uint64_t)x));
+#elif (LG_SIZEOF_PTR == 2)
+	return ((void *)atomic_add_uint32((uint32_t *)p, (uint32_t)x));
+#endif
+}
+
+/* Subtract x from *p, implemented as an atomic add of the negated operand. */
+JEMALLOC_INLINE void *
+atomic_sub_p(void **p, void *x)
+{
+
+#if (LG_SIZEOF_PTR == 3)
+	return ((void *)atomic_add_uint64((uint64_t *)p,
+	    (uint64_t)-((int64_t)x)));
+#elif (LG_SIZEOF_PTR == 2)
+	return ((void *)atomic_add_uint32((uint32_t *)p,
+	    (uint32_t)-((int32_t)x)));
+#endif
+}
+
+/* Compare-and-swap on a pointer; returns whatever the integer cas returns. */
+JEMALLOC_INLINE bool
+atomic_cas_p(void **p, void *c, void *s)
+{
+
+#if (LG_SIZEOF_PTR == 3)
+	return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s));
+#elif (LG_SIZEOF_PTR == 2)
+	return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s));
+#endif
+}
+
+/* Atomically store the pointer x into *p. */
+JEMALLOC_INLINE void
+atomic_write_p(void **p, const void *x)
+{
+
+#if (LG_SIZEOF_PTR == 3)
+	atomic_write_uint64((uint64_t *)p, (uint64_t)x);
+#elif (LG_SIZEOF_PTR == 2)
+	atomic_write_uint32((uint32_t *)p, (uint32_t)x);
+#endif
+}
+
 /******************************************************************************/
 /* size_t operations. */
 JEMALLOC_INLINE size_t
@ -272,6 +574,28 @@ atomic_sub_z(size_t *p, size_t x)
 #endif
 }

+/*
+ * size_t compare-and-swap; forwards to the integer primitive selected by
+ * LG_SIZEOF_PTR and returns its result unchanged.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_z(size_t *p, size_t c, size_t s)
+{
+
+#if (LG_SIZEOF_PTR == 3)
+	return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s));
+#elif (LG_SIZEOF_PTR == 2)
+	return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s));
+#endif
+}
+
+/* size_t store; forwards to the integer primitive selected by LG_SIZEOF_PTR. */
+JEMALLOC_INLINE void
+atomic_write_z(size_t *p, size_t x)
+{
+
+#if (LG_SIZEOF_PTR == 3)
+	atomic_write_uint64((uint64_t *)p, (uint64_t)x);
+#elif (LG_SIZEOF_PTR == 2)
+	atomic_write_uint32((uint32_t *)p, (uint32_t)x);
+#endif
+}
+
 /******************************************************************************/
 /* unsigned operations. */
 JEMALLOC_INLINE unsigned
@ -297,6 +621,29 @@ atomic_sub_u(unsigned *p, unsigned x)
 	    (uint32_t)-((int32_t)x)));
 #endif
 }
+
+/*
+ * unsigned compare-and-swap; forwards to the integer primitive selected by
+ * LG_SIZEOF_INT and returns its result unchanged.
+ */
+JEMALLOC_INLINE bool
+atomic_cas_u(unsigned *p, unsigned c, unsigned s)
+{
+
+#if (LG_SIZEOF_INT == 3)
+	return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s));
+#elif (LG_SIZEOF_INT == 2)
+	return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s));
+#endif
+}
+
+/* unsigned store; forwards to the integer primitive selected by LG_SIZEOF_INT. */
+JEMALLOC_INLINE void
+atomic_write_u(unsigned *p, unsigned x)
+{
+
+#if (LG_SIZEOF_INT == 3)
+	atomic_write_uint64((uint64_t *)p, (uint64_t)x);
+#elif (LG_SIZEOF_INT == 2)
+	atomic_write_uint32((uint32_t *)p, (uint32_t)x);
+#endif
+}
+
 /******************************************************************************/
 #endif

--- a/include/jemalloc/internal/base.h
+++ b/include/jemalloc/internal/base.h
@ -10,9 +10,7 @@
 #ifdef JEMALLOC_H_EXTERNS

 void	*base_alloc(size_t size);
-void	*base_calloc(size_t number, size_t size);
-extent_node_t *base_node_alloc(void);
-void	base_node_dealloc(extent_node_t *node);
+void	base_stats_get(size_t *allocated, size_t *resident, size_t *mapped);
 bool	base_boot(void);
 void	base_prefork(void);
 void	base_postfork_parent(void);
--- a/include/jemalloc/internal/bitmap.h
+++ b/include/jemalloc/internal/bitmap.h
@ -3,6 +3,7 @@

 /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
 #define	LG_BITMAP_MAXBITS	LG_RUN_MAXREGS
+#define	BITMAP_MAXBITS		(ZU(1) << LG_BITMAP_MAXBITS)

 typedef struct bitmap_level_s bitmap_level_t;
 typedef struct bitmap_info_s bitmap_info_t;
@ -14,6 +15,51 @@ typedef unsigned long bitmap_t;
 #define	BITMAP_GROUP_NBITS		(ZU(1) << LG_BITMAP_GROUP_NBITS)
 #define	BITMAP_GROUP_NBITS_MASK		(BITMAP_GROUP_NBITS-1)

+/* Number of groups required to store a given number of bits (rounded up). */
+#define	BITMAP_BITS2GROUPS(nbits)					\
+    (((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
+
+/*
+ * Number of groups required at a particular level for a given number of bits.
+ */
+#define	BITMAP_GROUPS_L0(nbits)						\
+    BITMAP_BITS2GROUPS(nbits)
+#define	BITMAP_GROUPS_L1(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
+#define	BITMAP_GROUPS_L2(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
+#define	BITMAP_GROUPS_L3(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(		\
+	BITMAP_BITS2GROUPS((nbits)))))
+
+/*
+ * Total number of groups required for a given number of bits, assuming a
+ * bitmap with the stated number of levels.
+ */
+#define	BITMAP_GROUPS_1_LEVEL(nbits)					\
+    BITMAP_GROUPS_L0(nbits)
+#define	BITMAP_GROUPS_2_LEVEL(nbits)					\
+    (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
+#define	BITMAP_GROUPS_3_LEVEL(nbits)					\
+    (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
+#define	BITMAP_GROUPS_4_LEVEL(nbits)					\
+    (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
+
+/*
+ * Maximum number of groups required to support LG_BITMAP_MAXBITS.
+ */
+#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
+#else
+#  error "Unsupported bitmap size"
+#endif
+
 /* Maximum number of levels possible. */
 #define	BITMAP_MAX_LEVELS						\
    (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP)				\
@ -93,7 +139,7 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 	bitmap_t g;

 	assert(bit < binfo->nbits);
-	assert(bitmap_get(bitmap, binfo, bit) == false);
+	assert(!bitmap_get(bitmap, binfo, bit));
 	goff = bit >> LG_BITMAP_GROUP_NBITS;
 	gp = &bitmap[goff];
 	g = *gp;
@ -126,15 +172,15 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo)
 	bitmap_t g;
 	unsigned i;

-	assert(bitmap_full(bitmap, binfo) == false);
+	assert(!bitmap_full(bitmap, binfo));

 	i = binfo->nlevels - 1;
 	g = bitmap[binfo->levels[i].group_offset];
-	bit = ffsl(g) - 1;
+	bit = jemalloc_ffsl(g) - 1;
 	while (i > 0) {
 		i--;
 		g = bitmap[binfo->levels[i].group_offset + bit];
-		bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
+		bit = (bit << LG_BITMAP_GROUP_NBITS) + (jemalloc_ffsl(g) - 1);
 	}

 	bitmap_set(bitmap, binfo, bit);
@ -158,7 +204,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 	assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0);
 	g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
 	*gp = g;
-	assert(bitmap_get(bitmap, binfo, bit) == false);
+	assert(!bitmap_get(bitmap, binfo, bit));
 	/* Propagate group state transitions up the tree. */
 	if (propagate) {
 		unsigned i;
@ -172,7 +218,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 			    == 0);
 			g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
 			*gp = g;
-			if (propagate == false)
+			if (!propagate)
 				break;
 		}
 	}
--- a/include/jemalloc/internal/chunk.h
+++ b/include/jemalloc/internal/chunk.h
@ -5,7 +5,7 @@
 * Size and alignment of memory chunks that are allocated by the OS's virtual
 * memory system.
 */
-#define	LG_CHUNK_DEFAULT	22
+#define	LG_CHUNK_DEFAULT	21

 /* Return the chunk address for allocation address a. */
 #define	CHUNK_ADDR2BASE(a)						\
@ -19,6 +19,16 @@
 #define	CHUNK_CEILING(s)						\
 	(((s) + chunksize_mask) & ~chunksize_mask)

+#define	CHUNK_HOOKS_INITIALIZER {					\
+    NULL,								\
+    NULL,								\
+    NULL,								\
+    NULL,								\
+    NULL,								\
+    NULL,								\
+    NULL								\
+}
+
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
@ -30,23 +40,36 @@
 extern size_t		opt_lg_chunk;
 extern const char	*opt_dss;

-/* Protects stats_chunks; currently not used for any other purpose. */
-extern malloc_mutex_t	chunks_mtx;
-/* Chunk statistics. */
-extern chunk_stats_t	stats_chunks;
-
-extern rtree_t		*chunks_rtree;
+extern rtree_t		chunks_rtree;

 extern size_t		chunksize;
 extern size_t		chunksize_mask; /* (chunksize - 1). */
 extern size_t		chunk_npages;
-extern size_t		map_bias; /* Number of arena chunk header pages. */
-extern size_t		arena_maxclass; /* Max size class for arenas. */

-void	*chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
-    dss_prec_t dss_prec);
-void	chunk_unmap(void *chunk, size_t size);
-void	chunk_dealloc(void *chunk, size_t size, bool unmap);
+extern const chunk_hooks_t	chunk_hooks_default;
+
+chunk_hooks_t	chunk_hooks_get(arena_t *arena);
+chunk_hooks_t	chunk_hooks_set(arena_t *arena,
+    const chunk_hooks_t *chunk_hooks);
+
+bool	chunk_register(const void *chunk, const extent_node_t *node);
+void	chunk_deregister(const void *chunk, const extent_node_t *node);
+void	*chunk_alloc_base(size_t size);
+void	*chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    void *new_addr, size_t size, size_t alignment, bool *zero,
+    bool dalloc_node);
+void	*chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit);
+void	chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    void *chunk, size_t size, bool committed);
+void	chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    void *chunk, size_t size, bool zeroed, bool committed);
+void	chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    void *chunk, size_t size, bool committed);
+bool	chunk_purge_arena(arena_t *arena, void *chunk, size_t offset,
+    size_t length);
+bool	chunk_purge_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    void *chunk, size_t size, size_t offset, size_t length);
 bool	chunk_boot(void);
 void	chunk_prefork(void);
 void	chunk_postfork_parent(void);
@ -56,6 +79,19 @@ void	chunk_postfork_child(void);
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

+#ifndef JEMALLOC_ENABLE_INLINE
+extent_node_t	*chunk_lookup(const void *chunk, bool dependent);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_CHUNK_C_))
+JEMALLOC_INLINE extent_node_t *
+chunk_lookup(const void *ptr, bool dependent)
+{
+
+	return (rtree_get(&chunks_rtree, (uintptr_t)ptr, dependent));
+}
+#endif
+
 #endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/

--- a/include/jemalloc/internal/chunk_dss.h
+++ b/include/jemalloc/internal/chunk_dss.h
@ -23,7 +23,8 @@ extern const char *dss_prec_names[];

 dss_prec_t	chunk_dss_prec_get(void);
 bool	chunk_dss_prec_set(dss_prec_t dss_prec);
-void	*chunk_alloc_dss(size_t size, size_t alignment, bool *zero);
+void	*chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit);
 bool	chunk_in_dss(void *chunk);
 bool	chunk_dss_boot(void);
 void	chunk_dss_prefork(void);
--- a/include/jemalloc/internal/chunk_mmap.h
+++ b/include/jemalloc/internal/chunk_mmap.h
@ -9,10 +9,9 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

-bool	pages_purge(void *addr, size_t length);
-
-void	*chunk_alloc_mmap(size_t size, size_t alignment, bool *zero);
-bool	chunk_dealloc_mmap(void *chunk, size_t size);
+void	*chunk_alloc_mmap(size_t size, size_t alignment, bool *zero,
+    bool *commit);
+bool	chunk_dalloc_mmap(void *chunk, size_t size);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
--- a/include/jemalloc/internal/ckh.h
+++ b/include/jemalloc/internal/ckh.h
@ -66,13 +66,13 @@ struct ckh_s {
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

-bool	ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
+bool	ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
    ckh_keycomp_t *keycomp);
-void	ckh_delete(ckh_t *ckh);
+void	ckh_delete(tsd_t *tsd, ckh_t *ckh);
 size_t	ckh_count(ckh_t *ckh);
 bool	ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data);
-bool	ckh_insert(ckh_t *ckh, const void *key, const void *data);
-bool	ckh_remove(ckh_t *ckh, const void *searchkey, void **key,
+bool	ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data);
+bool	ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
    void **data);
 bool	ckh_search(ckh_t *ckh, const void *seachkey, void **key, void **data);
 void	ckh_string_hash(const void *key, size_t r_hash[2]);
--- a/include/jemalloc/internal/ctl.h
+++ b/include/jemalloc/internal/ctl.h
@ -34,6 +34,7 @@ struct ctl_arena_stats_s {
 	bool			initialized;
 	unsigned		nthreads;
 	const char		*dss;
+	ssize_t			lg_dirty_mult;
 	size_t			pactive;
 	size_t			pdirty;
 	arena_stats_t		astats;
@ -46,22 +47,15 @@ struct ctl_arena_stats_s {

 	malloc_bin_stats_t	bstats[NBINS];
 	malloc_large_stats_t	*lstats;	/* nlclasses elements. */
+	malloc_huge_stats_t	*hstats;	/* nhclasses elements. */
 };

 struct ctl_stats_s {
 	size_t			allocated;
 	size_t			active;
+	size_t			metadata;
+	size_t			resident;
 	size_t			mapped;
-	struct {
-		size_t		current;	/* stats_chunks.curchunks */
-		uint64_t	total;		/* stats_chunks.nchunks */
-		size_t		high;		/* stats_chunks.highchunks */
-	} chunks;
-	struct {
-		size_t		allocated;	/* huge_allocated */
-		uint64_t	nmalloc;	/* huge_nmalloc */
-		uint64_t	ndalloc;	/* huge_ndalloc */
-	} huge;
 	unsigned		narenas;
 	ctl_arena_stats_t	*arenas;	/* (narenas + 1) elements. */
 };
--- a/include/jemalloc/internal/extent.h
+++ b/include/jemalloc/internal/extent.h
@ -7,25 +7,53 @@ typedef struct extent_node_s extent_node_t;
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS

-/* Tree of extents. */
+/* Tree of extents.  Use accessor functions for en_* fields. */
 struct extent_node_s {
-	/* Linkage for the size/address-ordered tree. */
-	rb_node(extent_node_t)	link_szad;
-
-	/* Linkage for the address-ordered tree. */
-	rb_node(extent_node_t)	link_ad;
-
-	/* Profile counters, used for huge objects. */
-	prof_ctx_t		*prof_ctx;
+	/* Arena from which this extent came, if any. */
+	arena_t			*en_arena;

 	/* Pointer to the extent that this tree node is responsible for. */
-	void			*addr;
+	void			*en_addr;

 	/* Total region size. */
-	size_t			size;
+	size_t			en_size;

-	/* True if zero-filled; used by chunk recycling code. */
-	bool			zeroed;
+	/*
+	 * The zeroed flag is used by chunk recycling code to track whether
+	 * memory is zero-filled.
+	 */
+	bool			en_zeroed;
+
+	/*
+	 * True if physical memory is committed to the extent, whether
+	 * explicitly or implicitly as on a system that overcommits and
+	 * satisfies physical memory needs on demand via soft page faults.
+	 */
+	bool			en_committed;
+
+	/*
+	 * The achunk flag is used to validate that huge allocation lookups
+	 * don't return arena chunks.
+	 */
+	bool			en_achunk;
+
+	/* Profile counters, used for huge objects. */
+	prof_tctx_t		*en_prof_tctx;
+
+	/* Linkage for arena's runs_dirty and chunks_cache rings. */
+	arena_runs_dirty_link_t	rd;
+	qr(extent_node_t)	cc_link;
+
+	union {
+		/* Linkage for the size/address-ordered tree. */
+		rb_node(extent_node_t)	szad_link;
+
+		/* Linkage for arena's huge and node_cache lists. */
+		ql_elm(extent_node_t)	ql_link;
+	};
+
+	/* Linkage for the address-ordered tree. */
+	rb_node(extent_node_t)	ad_link;
 };
 typedef rb_tree(extent_node_t) extent_tree_t;

@ -41,6 +69,171 @@ rb_proto(, extent_tree_ad_, extent_tree_t, extent_node_t)
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

+#ifndef JEMALLOC_ENABLE_INLINE
+arena_t	*extent_node_arena_get(const extent_node_t *node);
+void	*extent_node_addr_get(const extent_node_t *node);
+size_t	extent_node_size_get(const extent_node_t *node);
+bool	extent_node_zeroed_get(const extent_node_t *node);
+bool	extent_node_committed_get(const extent_node_t *node);
+bool	extent_node_achunk_get(const extent_node_t *node);
+prof_tctx_t	*extent_node_prof_tctx_get(const extent_node_t *node);
+void	extent_node_arena_set(extent_node_t *node, arena_t *arena);
+void	extent_node_addr_set(extent_node_t *node, void *addr);
+void	extent_node_size_set(extent_node_t *node, size_t size);
+void	extent_node_zeroed_set(extent_node_t *node, bool zeroed);
+void	extent_node_committed_set(extent_node_t *node, bool committed);
+void	extent_node_achunk_set(extent_node_t *node, bool achunk);
+void	extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx);
+void	extent_node_init(extent_node_t *node, arena_t *arena, void *addr,
+    size_t size, bool zeroed, bool committed);
+void	extent_node_dirty_linkage_init(extent_node_t *node);
+void	extent_node_dirty_insert(extent_node_t *node,
+    arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty);
+void	extent_node_dirty_remove(extent_node_t *node);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_))
+JEMALLOC_INLINE arena_t *
+extent_node_arena_get(const extent_node_t *node)
+{
+
+	return (node->en_arena);
+}
+
+JEMALLOC_INLINE void *
+extent_node_addr_get(const extent_node_t *node)
+{
+
+	return (node->en_addr);
+}
+
+JEMALLOC_INLINE size_t
+extent_node_size_get(const extent_node_t *node)
+{
+
+	return (node->en_size);
+}
+
+JEMALLOC_INLINE bool
+extent_node_zeroed_get(const extent_node_t *node)
+{
+
+	return (node->en_zeroed);
+}
+
+JEMALLOC_INLINE bool
+extent_node_committed_get(const extent_node_t *node)
+{
+
+	assert(!node->en_achunk);
+	return (node->en_committed);
+}
+
+JEMALLOC_INLINE bool
+extent_node_achunk_get(const extent_node_t *node)
+{
+
+	return (node->en_achunk);
+}
+
+JEMALLOC_INLINE prof_tctx_t *
+extent_node_prof_tctx_get(const extent_node_t *node)
+{
+
+	return (node->en_prof_tctx);
+}
+
+JEMALLOC_INLINE void
+extent_node_arena_set(extent_node_t *node, arena_t *arena)
+{
+
+	node->en_arena = arena;
+}
+
+JEMALLOC_INLINE void
+extent_node_addr_set(extent_node_t *node, void *addr)
+{
+
+	node->en_addr = addr;
+}
+
+JEMALLOC_INLINE void
+extent_node_size_set(extent_node_t *node, size_t size)
+{
+
+	node->en_size = size;
+}
+
+JEMALLOC_INLINE void
+extent_node_zeroed_set(extent_node_t *node, bool zeroed)
+{
+
+	node->en_zeroed = zeroed;
+}
+
+JEMALLOC_INLINE void
+extent_node_committed_set(extent_node_t *node, bool committed)
+{
+
+	node->en_committed = committed;
+}
+
+JEMALLOC_INLINE void
+extent_node_achunk_set(extent_node_t *node, bool achunk)
+{
+
+	node->en_achunk = achunk;
+}
+
+JEMALLOC_INLINE void
+extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx)
+{
+
+	node->en_prof_tctx = tctx;
+}
+
+JEMALLOC_INLINE void
+extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size,
+    bool zeroed, bool committed)
+{
+
+	extent_node_arena_set(node, arena);
+	extent_node_addr_set(node, addr);
+	extent_node_size_set(node, size);
+	extent_node_zeroed_set(node, zeroed);
+	extent_node_committed_set(node, committed);
+	extent_node_achunk_set(node, false);
+	if (config_prof)
+		extent_node_prof_tctx_set(node, NULL);
+}
+
+JEMALLOC_INLINE void
+extent_node_dirty_linkage_init(extent_node_t *node)
+{
+
+	qr_new(&node->rd, rd_link);
+	qr_new(node, cc_link);
+}
+
+JEMALLOC_INLINE void
+extent_node_dirty_insert(extent_node_t *node,
+    arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty)
+{
+
+	qr_meld(runs_dirty, &node->rd, rd_link);
+	qr_meld(chunks_dirty, node, cc_link);
+}
+
+JEMALLOC_INLINE void
+extent_node_dirty_remove(extent_node_t *node)
+{
+
+	qr_remove(&node->rd, rd_link);
+	qr_remove(node, cc_link);
+}
+
+#endif
+
 #endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/

--- a/include/jemalloc/internal/hash.h
+++ b/include/jemalloc/internal/hash.h
@ -35,13 +35,14 @@ JEMALLOC_INLINE uint32_t
 hash_rotl_32(uint32_t x, int8_t r)
 {

-	return (x << r) | (x >> (32 - r));
+	return ((x << r) | (x >> (32 - r)));
 }

 JEMALLOC_INLINE uint64_t
 hash_rotl_64(uint64_t x, int8_t r)
 {
-	return (x << r) | (x >> (64 - r));
+
+	return ((x << r) | (x >> (64 - r)));
 }

 JEMALLOC_INLINE uint32_t
@ -76,9 +77,9 @@ hash_fmix_64(uint64_t k)
 {

 	k ^= k >> 33;
-	k *= QU(0xff51afd7ed558ccdLLU);
+	k *= KQU(0xff51afd7ed558ccd);
 	k ^= k >> 33;
-	k *= QU(0xc4ceb9fe1a85ec53LLU);
+	k *= KQU(0xc4ceb9fe1a85ec53);
 	k ^= k >> 33;

 	return (k);
@ -247,8 +248,8 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
 	uint64_t h1 = seed;
 	uint64_t h2 = seed;

-	const uint64_t c1 = QU(0x87c37b91114253d5LLU);
-	const uint64_t c2 = QU(0x4cf5ad432745937fLLU);
+	const uint64_t c1 = KQU(0x87c37b91114253d5);
+	const uint64_t c2 = KQU(0x4cf5ad432745937f);

 	/* body */
 	{
--- a/include/jemalloc/internal/huge.h
+++ b/include/jemalloc/internal/huge.h
@ -9,34 +9,24 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

-/* Huge allocation statistics. */
-extern uint64_t		huge_nmalloc;
-extern uint64_t		huge_ndalloc;
-extern size_t		huge_allocated;
-
-/* Protects chunk-related data structures. */
-extern malloc_mutex_t	huge_mtx;
-
-void	*huge_malloc(size_t size, bool zero, dss_prec_t dss_prec);
-void	*huge_palloc(size_t size, size_t alignment, bool zero,
-    dss_prec_t dss_prec);
+void	*huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero,
+    tcache_t *tcache);
+void	*huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
+    bool zero, tcache_t *tcache);
 bool	huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
-    size_t extra);
-void	*huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec);
+    size_t extra, bool zero);
+void	*huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize,
+    size_t size, size_t extra, size_t alignment, bool zero,
+    tcache_t *tcache);
 #ifdef JEMALLOC_JET
 typedef void (huge_dalloc_junk_t)(void *, size_t);
 extern huge_dalloc_junk_t *huge_dalloc_junk;
 #endif
-void	huge_dalloc(void *ptr, bool unmap);
+void	huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache);
+arena_t	*huge_aalloc(const void *ptr);
 size_t	huge_salloc(const void *ptr);
-dss_prec_t	huge_dss_prec_get(arena_t *arena);
-prof_ctx_t	*huge_prof_ctx_get(const void *ptr);
-void	huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
-bool	huge_boot(void);
-void	huge_prefork(void);
-void	huge_postfork_parent(void);
-void	huge_postfork_child(void);
+prof_tctx_t	*huge_prof_tctx_get(const void *ptr);
+void	huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
--- a/include/jemalloc/internal/jemalloc_internal_decls.h
+++ b/include/jemalloc/internal/jemalloc_internal_decls.h
@ -0,0 +1,64 @@
+#ifndef JEMALLOC_INTERNAL_DECLS_H
+#define	JEMALLOC_INTERNAL_DECLS_H
+
+#include <math.h>
+#ifdef _WIN32
+#  include <windows.h>
+#  include "msvc_compat/windows_extra.h"
+
+#else
+#  include <sys/param.h>
+#  include <sys/mman.h>
+#  if !defined(__pnacl__) && !defined(__native_client__)
+#    include <sys/syscall.h>
+#    if !defined(SYS_write) && defined(__NR_write)
+#      define SYS_write __NR_write
+#    endif
+#    include <sys/uio.h>
+#  endif
+#  include <pthread.h>
+#  include <errno.h>
+#endif
+#include <sys/types.h>
+
+#include <limits.h>
+#ifndef SIZE_T_MAX
+#  define SIZE_T_MAX	SIZE_MAX
+#endif
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#ifndef offsetof
+#  define offsetof(type, member)	((size_t)&(((type *)NULL)->member))
+#endif
+#include <string.h>
+#include <strings.h>
+#include <ctype.h>
+#ifdef _MSC_VER
+#  include <io.h>
+typedef intptr_t ssize_t;
+#  define PATH_MAX 1024
+#  define STDERR_FILENO 2
+#  define __func__ __FUNCTION__
+#  ifdef JEMALLOC_HAS_RESTRICT
+#    define restrict __restrict
+#  endif
+/* Disable warnings about deprecated system functions. */
+#  pragma warning(disable: 4996)
+#  if _MSC_VER < 1800	/* Older MSVC: supply isblank() locally. */
+static int
+isblank(int c)
+{
+
+	return (c == '\t' || c == ' ');
+}
+#  endif /* _MSC_VER < 1800 */
+#else
+#  include <unistd.h>
+#endif
+#include <fcntl.h>
+
+#endif /* JEMALLOC_INTERNAL_DECLS_H */
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@ -22,6 +22,9 @@
 */
 #undef CPU_SPINWAIT

+/* Defined if C11 atomics are available. */
+#undef JEMALLOC_C11ATOMICS
+
 /* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */
 #undef JEMALLOC_ATOMIC9

@ -35,7 +38,7 @@
 * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
 * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite
 * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the
- * functions are defined in libgcc instead of being inlines)
+ * functions are defined in libgcc instead of being inlines).
 */
 #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4

@ -43,16 +46,36 @@
 * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and
 * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite
 * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the
- * functions are defined in libgcc instead of being inlines)
+ * functions are defined in libgcc instead of being inlines).
 */
 #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8

+/*
+ * Defined if __builtin_clz() and __builtin_clzl() are available.
+ */
+#undef JEMALLOC_HAVE_BUILTIN_CLZ
+
+/*
+ * Defined if madvise(2) is available.
+ */
+#undef JEMALLOC_HAVE_MADVISE
+
 /*
 * Defined if OSSpin*() functions are available, as provided by Darwin, and
 * documented in the spinlock(3) manual page.
 */
 #undef JEMALLOC_OSSPIN

+/*
+ * Defined if secure_getenv(3) is available.
+ */
+#undef JEMALLOC_HAVE_SECURE_GETENV
+
+/*
+ * Defined if issetugid(2) is available.
+ */
+#undef JEMALLOC_HAVE_ISSETUGID
+
 /*
 * Defined if _malloc_thread_cleanup() exists.  At least in the case of
 * FreeBSD, pthread_key_create() allocates, which if used during malloc
@ -76,9 +99,6 @@
 */
 #undef JEMALLOC_MUTEX_INIT_CB

-/* Defined if sbrk() is supported. */
-#undef JEMALLOC_HAVE_SBRK
-
 /* Non-empty if the tls_model attribute is supported. */
 #undef JEMALLOC_TLS_MODEL

@ -137,8 +157,26 @@
 /* Support lazy locking (avoid locking unless a second thread is launched). */
 #undef JEMALLOC_LAZY_LOCK

-/* One page is 2^STATIC_PAGE_SHIFT bytes. */
-#undef STATIC_PAGE_SHIFT
+/* Minimum size class to support is 2^LG_TINY_MIN bytes. */
+#undef LG_TINY_MIN
+
+/*
+ * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
+ * classes).
+ */
+#undef LG_QUANTUM
+
+/* One page is 2^LG_PAGE bytes. */
+#undef LG_PAGE
+
+/*
+ * If defined, adjacent virtual memory mappings with identical attributes
+ * automatically coalesce, and they fragment when changes are made to subranges.
+ * This is the normal order of things for mmap()/munmap(), but on Windows
+ * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e.
+ * mappings do *not* coalesce/fragment.
+ */
+#undef JEMALLOC_MAPS_COALESCE

 /*
 * If defined, use munmap() to unmap freed chunks, rather than storing them for
@ -147,22 +185,28 @@
 */
 #undef JEMALLOC_MUNMAP

-/*
- * If defined, use mremap(...MREMAP_FIXED...) for huge realloc().  This is
- * disabled by default because it is Linux-specific and it will cause virtual
- * memory map holes, much like munmap(2) does.
- */
-#undef JEMALLOC_MREMAP
-
 /* TLS is used to map arenas and magazine caches to threads. */
 #undef JEMALLOC_TLS

+/*
+ * ffs()/ffsl() functions to use for bitmapping.  Don't use these directly;
+ * instead, use jemalloc_ffs() or jemalloc_ffsl() from util.h.
+ */
+#undef JEMALLOC_INTERNAL_FFSL
+#undef JEMALLOC_INTERNAL_FFS
+
 /*
 * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
 * within jemalloc-owned chunks before dereferencing them.
 */
 #undef JEMALLOC_IVSALLOC

+/*
+ * If defined, explicitly attempt to more uniformly distribute large allocation
+ * pointer alignments across all cache indices.
+ */
+#undef JEMALLOC_CACHE_OBLIVIOUS
+
 /*
 * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
 */
@ -182,9 +226,7 @@
 #undef JEMALLOC_PURGE_MADVISE_DONTNEED
 #undef JEMALLOC_PURGE_MADVISE_FREE

-/*
- * Define if operating system has alloca.h header.
- */
+/* Define if operating system has alloca.h header. */
 #undef JEMALLOC_HAS_ALLOCA_H

 /* C99 restrict keyword supported. */
@ -202,4 +244,19 @@
 /* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
 #undef LG_SIZEOF_INTMAX_T

+/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */
+#undef JEMALLOC_GLIBC_MALLOC_HOOK
+
+/* glibc memalign hook. */
+#undef JEMALLOC_GLIBC_MEMALIGN_HOOK
+
+/* Adaptive mutex support in pthreads. */
+#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
+
+/*
+ * If defined, jemalloc symbols are not exported (doesn't work when
+ * JEMALLOC_PREFIX is not defined).
+ */
+#undef JEMALLOC_EXPORT
+
 #endif /* JEMALLOC_INTERNAL_DEFS_H_ */
--- a/include/jemalloc/internal/jemalloc_internal_macros.h
+++ b/include/jemalloc/internal/jemalloc_internal_macros.h
@ -39,9 +39,15 @@
 #endif

 #define	ZU(z)	((size_t)z)
+#define	ZI(z)	((ssize_t)z)
 #define	QU(q)	((uint64_t)q)
 #define	QI(q)	((int64_t)q)

+#define	KZU(z)	ZU(z##ULL)
+#define	KZI(z)	ZI(z##LL)
+#define	KQU(q)	QU(q##ULL)
+#define	KQI(q)	QI(q##LL)
+
 #ifndef __DECONST
 #  define	__DECONST(type, var)	((type)(uintptr_t)(const void *)(var))
 #endif
--- a/include/jemalloc/internal/mutex.h
+++ b/include/jemalloc/internal/mutex.h
@ -10,7 +10,7 @@ typedef struct malloc_mutex_s malloc_mutex_t;
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 #  define MALLOC_MUTEX_INITIALIZER {PTHREAD_MUTEX_INITIALIZER, NULL}
 #else
-#  if (defined(PTHREAD_MUTEX_ADAPTIVE_NP) &&				\
+#  if (defined(JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) &&		\
       defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP))
 #    define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP
 #    define MALLOC_MUTEX_INITIALIZER {PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP}
@ -26,7 +26,11 @@ typedef struct malloc_mutex_s malloc_mutex_t;

 struct malloc_mutex_s {
 #ifdef _WIN32
+#  if _WIN32_WINNT >= 0x0600
+	SRWLOCK         	lock;
+#  else
 	CRITICAL_SECTION	lock;
+#  endif
 #elif (defined(JEMALLOC_OSSPIN))
 	OSSpinLock		lock;
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
@ -70,7 +74,11 @@ malloc_mutex_lock(malloc_mutex_t *mutex)

 	if (isthreaded) {
 #ifdef _WIN32
+#  if _WIN32_WINNT >= 0x0600
+		AcquireSRWLockExclusive(&mutex->lock);
+#  else
 		EnterCriticalSection(&mutex->lock);
+#  endif
 #elif (defined(JEMALLOC_OSSPIN))
 		OSSpinLockLock(&mutex->lock);
 #else
@ -85,7 +93,11 @@ malloc_mutex_unlock(malloc_mutex_t *mutex)

 	if (isthreaded) {
 #ifdef _WIN32
+#  if _WIN32_WINNT >= 0x0600
+		ReleaseSRWLockExclusive(&mutex->lock);
+#  else
 		LeaveCriticalSection(&mutex->lock);
+#  endif
 #elif (defined(JEMALLOC_OSSPIN))
 		OSSpinLockUnlock(&mutex->lock);
 #else
--- a/include/jemalloc/internal/pages.h
+++ b/include/jemalloc/internal/pages.h
@ -0,0 +1,26 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+void	*pages_map(void *addr, size_t size);
+void	pages_unmap(void *addr, size_t size);
+void	*pages_trim(void *addr, size_t alloc_size, size_t leadsize,
+    size_t size);
+bool	pages_commit(void *addr, size_t size);
+bool	pages_decommit(void *addr, size_t size);
+bool	pages_purge(void *addr, size_t size);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
+
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@ -1,44 +1,75 @@
-a0calloc
-a0free
+a0dalloc
+a0get
 a0malloc
+arena_aalloc
 arena_alloc_junk_small
 arena_bin_index
 arena_bin_info
+arena_bitselm_get
 arena_boot
+arena_choose
+arena_choose_hard
+arena_chunk_alloc_huge
+arena_chunk_cache_maybe_insert
+arena_chunk_cache_maybe_remove
+arena_chunk_dalloc_huge
+arena_chunk_ralloc_huge_expand
+arena_chunk_ralloc_huge_shrink
+arena_chunk_ralloc_huge_similar
+arena_cleanup
 arena_dalloc
 arena_dalloc_bin
-arena_dalloc_bin_locked
+arena_dalloc_bin_junked_locked
 arena_dalloc_junk_large
 arena_dalloc_junk_small
 arena_dalloc_large
-arena_dalloc_large_locked
+arena_dalloc_large_junked_locked
 arena_dalloc_small
 arena_dss_prec_get
 arena_dss_prec_set
+arena_get
+arena_get_hard
+arena_init
+arena_lg_dirty_mult_default_get
+arena_lg_dirty_mult_default_set
+arena_lg_dirty_mult_get
+arena_lg_dirty_mult_set
 arena_malloc
 arena_malloc_large
 arena_malloc_small
 arena_mapbits_allocated_get
 arena_mapbits_binind_get
+arena_mapbits_decommitted_get
 arena_mapbits_dirty_get
 arena_mapbits_get
+arena_mapbits_internal_set
 arena_mapbits_large_binind_set
 arena_mapbits_large_get
 arena_mapbits_large_set
 arena_mapbits_large_size_get
+arena_mapbitsp_get
+arena_mapbitsp_read
+arena_mapbitsp_write
 arena_mapbits_small_runind_get
 arena_mapbits_small_set
 arena_mapbits_unallocated_set
 arena_mapbits_unallocated_size_get
 arena_mapbits_unallocated_size_set
 arena_mapbits_unzeroed_get
-arena_mapbits_unzeroed_set
-arena_mapbitsp_get
-arena_mapbitsp_read
-arena_mapbitsp_write
-arena_mapp_get
 arena_maxclass
+arena_maxrun
+arena_maybe_purge
+arena_metadata_allocated_add
+arena_metadata_allocated_get
+arena_metadata_allocated_sub
+arena_migrate
+arena_miscelm_get
+arena_miscelm_to_pageind
+arena_miscelm_to_rpages
+arena_nbound
 arena_new
+arena_node_alloc
+arena_node_dalloc
 arena_palloc
 arena_postfork_child
 arena_postfork_parent
@ -46,50 +77,46 @@ arena_prefork
 arena_prof_accum
 arena_prof_accum_impl
 arena_prof_accum_locked
-arena_prof_ctx_get
-arena_prof_ctx_set
 arena_prof_promoted
+arena_prof_tctx_get
+arena_prof_tctx_set
 arena_ptr_small_binind_get
 arena_purge_all
 arena_quarantine_junk_small
 arena_ralloc
 arena_ralloc_junk_large
 arena_ralloc_no_move
+arena_rd_to_miscelm
 arena_redzone_corruption
 arena_run_regind
+arena_run_to_miscelm
 arena_salloc
+arenas_cache_bypass_cleanup
+arenas_cache_cleanup
+arena_sdalloc
 arena_stats_merge
 arena_tcache_fill_small
-arenas
-arenas_booted
-arenas_cleanup
-arenas_extend
-arenas_initialized
-arenas_lock
-arenas_tls
-arenas_tsd
-arenas_tsd_boot
-arenas_tsd_cleanup_wrapper
-arenas_tsd_get
-arenas_tsd_get_wrapper
-arenas_tsd_init_head
-arenas_tsd_set
+atomic_add_p
 atomic_add_u
 atomic_add_uint32
 atomic_add_uint64
 atomic_add_z
+atomic_cas_p
+atomic_cas_u
+atomic_cas_uint32
+atomic_cas_uint64
+atomic_cas_z
+atomic_sub_p
 atomic_sub_u
 atomic_sub_uint32
 atomic_sub_uint64
 atomic_sub_z
 base_alloc
 base_boot
-base_calloc
-base_node_alloc
-base_node_dealloc
 base_postfork_child
 base_postfork_parent
 base_prefork
+base_stats_get
 bitmap_full
 bitmap_get
 bitmap_info_init
@ -99,49 +126,54 @@ bitmap_set
 bitmap_sfu
 bitmap_size
 bitmap_unset
+bootstrap_calloc
+bootstrap_free
+bootstrap_malloc
 bt_init
 buferror
-choose_arena
-choose_arena_hard
-chunk_alloc
+chunk_alloc_base
+chunk_alloc_cache
 chunk_alloc_dss
 chunk_alloc_mmap
+chunk_alloc_wrapper
 chunk_boot
-chunk_dealloc
-chunk_dealloc_mmap
+chunk_dalloc_arena
+chunk_dalloc_cache
+chunk_dalloc_mmap
+chunk_dalloc_wrapper
+chunk_deregister
 chunk_dss_boot
 chunk_dss_postfork_child
 chunk_dss_postfork_parent
 chunk_dss_prec_get
 chunk_dss_prec_set
 chunk_dss_prefork
+chunk_hooks_default
+chunk_hooks_get
+chunk_hooks_set
 chunk_in_dss
+chunk_lookup
 chunk_npages
 chunk_postfork_child
 chunk_postfork_parent
 chunk_prefork
-chunk_unmap
-chunks_mtx
-chunks_rtree
+chunk_purge_arena
+chunk_purge_wrapper
+chunk_register
 chunksize
 chunksize_mask
-ckh_bucket_search
+chunks_rtree
 ckh_count
 ckh_delete
-ckh_evict_reloc_insert
 ckh_insert
-ckh_isearch
 ckh_iter
 ckh_new
 ckh_pointer_hash
 ckh_pointer_keycomp
-ckh_rebuild
 ckh_remove
 ckh_search
 ckh_string_hash
 ckh_string_keycomp
-ckh_try_bucket_insert
-ckh_try_insert
 ctl_boot
 ctl_bymib
 ctl_byname
@ -150,6 +182,23 @@ ctl_postfork_child
 ctl_postfork_parent
 ctl_prefork
 dss_prec_names
+extent_node_achunk_get
+extent_node_achunk_set
+extent_node_addr_get
+extent_node_addr_set
+extent_node_arena_get
+extent_node_arena_set
+extent_node_dirty_insert
+extent_node_dirty_linkage_init
+extent_node_dirty_remove
+extent_node_init
+extent_node_prof_tctx_get
+extent_node_prof_tctx_set
+extent_node_size_get
+extent_node_size_set
+extent_node_zeroed_get
+extent_node_zeroed_set
+extent_tree_ad_empty
 extent_tree_ad_first
 extent_tree_ad_insert
 extent_tree_ad_iter
@ -166,6 +215,7 @@ extent_tree_ad_reverse_iter
 extent_tree_ad_reverse_iter_recurse
 extent_tree_ad_reverse_iter_start
 extent_tree_ad_search
+extent_tree_szad_empty
 extent_tree_szad_first
 extent_tree_szad_insert
 extent_tree_szad_iter
@ -193,45 +243,47 @@ hash_rotl_64
 hash_x64_128
 hash_x86_128
 hash_x86_32
-huge_allocated
-huge_boot
+huge_aalloc
 huge_dalloc
 huge_dalloc_junk
-huge_dss_prec_get
 huge_malloc
-huge_mtx
-huge_ndalloc
-huge_nmalloc
 huge_palloc
-huge_postfork_child
-huge_postfork_parent
-huge_prefork
-huge_prof_ctx_get
-huge_prof_ctx_set
+huge_prof_tctx_get
+huge_prof_tctx_set
 huge_ralloc
 huge_ralloc_no_move
 huge_salloc
-iallocm
+iaalloc
+iallocztm
 icalloc
 icalloct
 idalloc
 idalloct
+idalloctm
 imalloc
 imalloct
+index2size
+index2size_compute
+index2size_lookup
+index2size_tab
+in_valgrind
 ipalloc
 ipalloct
+ipallocztm
 iqalloc
-iqalloct
 iralloc
 iralloct
 iralloct_realign
 isalloc
+isdalloct
+isqalloc
 isthreaded
 ivsalloc
 ixalloc
 jemalloc_postfork_child
 jemalloc_postfork_parent
 jemalloc_prefork
+lg_floor
 malloc_cprintf
 malloc_mutex_init
 malloc_mutex_lock
@ -242,7 +294,8 @@ malloc_mutex_unlock
 malloc_printf
 malloc_snprintf
 malloc_strtoumax
-malloc_tsd_boot
+malloc_tsd_boot0
+malloc_tsd_boot1
 malloc_tsd_cleanup_register
 malloc_tsd_dalloc
 malloc_tsd_malloc
@ -251,16 +304,18 @@ malloc_vcprintf
 malloc_vsnprintf
 malloc_write
 map_bias
+map_misc_offset
 mb_write
 mutex_boot
-narenas_auto
-narenas_total
+narenas_cache_cleanup
 narenas_total_get
 ncpus
 nhbins
 opt_abort
 opt_dss
 opt_junk
+opt_junk_alloc
+opt_junk_free
 opt_lg_chunk
 opt_lg_dirty_mult
 opt_lg_prof_interval
@ -274,84 +329,98 @@ opt_prof_final
 opt_prof_gdump
 opt_prof_leak
 opt_prof_prefix
+opt_prof_thread_active_init
 opt_quarantine
 opt_redzone
 opt_stats_print
 opt_tcache
 opt_utrace
-opt_valgrind
 opt_xmalloc
 opt_zero
 p2rz
+pages_commit
+pages_decommit
+pages_map
 pages_purge
+pages_trim
+pages_unmap
 pow2_ceil
+prof_active_get
+prof_active_get_unlocked
+prof_active_set
+prof_alloc_prep
+prof_alloc_rollback
 prof_backtrace
 prof_boot0
 prof_boot1
 prof_boot2
-prof_bt_count
-prof_ctx_get
-prof_ctx_set
+prof_dump_header
 prof_dump_open
 prof_free
+prof_free_sampled_object
 prof_gdump
+prof_gdump_get
+prof_gdump_get_unlocked
+prof_gdump_set
+prof_gdump_val
 prof_idump
 prof_interval
 prof_lookup
 prof_malloc
+prof_malloc_sample_object
 prof_mdump
 prof_postfork_child
 prof_postfork_parent
 prof_prefork
-prof_promote
 prof_realloc
+prof_reset
 prof_sample_accum_update
 prof_sample_threshold_update
-prof_tdata_booted
+prof_tctx_get
+prof_tctx_set
 prof_tdata_cleanup
 prof_tdata_get
 prof_tdata_init
-prof_tdata_initialized
-prof_tdata_tls
-prof_tdata_tsd
-prof_tdata_tsd_boot
-prof_tdata_tsd_cleanup_wrapper
-prof_tdata_tsd_get
-prof_tdata_tsd_get_wrapper
-prof_tdata_tsd_init_head
-prof_tdata_tsd_set
+prof_tdata_reinit
+prof_thread_active_get
+prof_thread_active_init_get
+prof_thread_active_init_set
+prof_thread_active_set
+prof_thread_name_get
+prof_thread_name_set
 quarantine
 quarantine_alloc_hook
-quarantine_boot
-quarantine_booted
+quarantine_alloc_hook_work
 quarantine_cleanup
-quarantine_init
-quarantine_tls
-quarantine_tsd
-quarantine_tsd_boot
-quarantine_tsd_cleanup_wrapper
-quarantine_tsd_get
-quarantine_tsd_get_wrapper
-quarantine_tsd_init_head
-quarantine_tsd_set
 register_zone
+rtree_child_read
+rtree_child_read_hard
+rtree_child_tryread
 rtree_delete
 rtree_get
-rtree_get_locked
 rtree_new
-rtree_postfork_child
-rtree_postfork_parent
-rtree_prefork
+rtree_node_valid
 rtree_set
+rtree_start_level
+rtree_subkey
+rtree_subtree_read
+rtree_subtree_read_hard
+rtree_subtree_tryread
+rtree_val_read
+rtree_val_write
 s2u
+s2u_compute
+s2u_lookup
 sa2u
 set_errno
-small_size2bin
+size2index
+size2index_compute
+size2index_lookup
+size2index_tab
 stats_cactive
 stats_cactive_add
 stats_cactive_get
 stats_cactive_sub
-stats_chunks
 stats_print
 tcache_alloc_easy
 tcache_alloc_large
@ -359,55 +428,67 @@ tcache_alloc_small
 tcache_alloc_small_hard
 tcache_arena_associate
 tcache_arena_dissociate
+tcache_arena_reassociate
 tcache_bin_flush_large
 tcache_bin_flush_small
 tcache_bin_info
-tcache_boot0
-tcache_boot1
-tcache_booted
+tcache_boot
+tcache_cleanup
 tcache_create
 tcache_dalloc_large
 tcache_dalloc_small
-tcache_destroy
-tcache_enabled_booted
+tcache_enabled_cleanup
 tcache_enabled_get
-tcache_enabled_initialized
 tcache_enabled_set
-tcache_enabled_tls
-tcache_enabled_tsd
-tcache_enabled_tsd_boot
-tcache_enabled_tsd_cleanup_wrapper
-tcache_enabled_tsd_get
-tcache_enabled_tsd_get_wrapper
-tcache_enabled_tsd_init_head
-tcache_enabled_tsd_set
 tcache_event
 tcache_event_hard
 tcache_flush
 tcache_get
-tcache_initialized
+tcache_get_hard
 tcache_maxclass
+tcaches
 tcache_salloc
+tcaches_create
+tcaches_destroy
+tcaches_flush
+tcaches_get
 tcache_stats_merge
-tcache_thread_cleanup
-tcache_tls
-tcache_tsd
-tcache_tsd_boot
-tcache_tsd_cleanup_wrapper
-tcache_tsd_get
-tcache_tsd_get_wrapper
-tcache_tsd_init_head
-tcache_tsd_set
-thread_allocated_booted
-thread_allocated_initialized
-thread_allocated_tls
-thread_allocated_tsd
-thread_allocated_tsd_boot
-thread_allocated_tsd_cleanup_wrapper
-thread_allocated_tsd_get
-thread_allocated_tsd_get_wrapper
-thread_allocated_tsd_init_head
-thread_allocated_tsd_set
+thread_allocated_cleanup
+thread_deallocated_cleanup
+tsd_arena_get
+tsd_arena_set
+tsd_boot
+tsd_boot0
+tsd_boot1
+tsd_booted
+tsd_cleanup
+tsd_cleanup_wrapper
+tsd_fetch
+tsd_get
+tsd_wrapper_get
+tsd_wrapper_set
+tsd_initialized
 tsd_init_check_recursion
 tsd_init_finish
+tsd_init_head
+tsd_nominal
+tsd_quarantine_get
+tsd_quarantine_set
+tsd_set
+tsd_tcache_enabled_get
+tsd_tcache_enabled_set
+tsd_tcache_get
+tsd_tcache_set
+tsd_tls
+tsd_tsd
+tsd_prof_tdata_get
+tsd_prof_tdata_set
+tsd_thread_allocated_get
+tsd_thread_allocated_set
+tsd_thread_deallocated_get
+tsd_thread_deallocated_set
 u2rz
+valgrind_freelike_block
+valgrind_make_mem_defined
+valgrind_make_mem_noaccess
+valgrind_make_mem_undefined
--- a/include/jemalloc/internal/prng.h
+++ b/include/jemalloc/internal/prng.h
@ -15,7 +15,7 @@
 * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
 *
 * This choice of m has the disadvantage that the quality of the bits is
- * proportional to bit position.  For example. the lowest bit has a cycle of 2,
+ * proportional to bit position.  For example, the lowest bit has a cycle of 2,
 * the next has a cycle of 4, etc.  For this reason, we prefer to use the upper
 * bits.
 *
@ -26,22 +26,22 @@
 *   const uint32_t a, c : See above discussion.
 */
 #define	prng32(r, lg_range, state, a, c) do {				\
-	assert(lg_range > 0);						\
-	assert(lg_range <= 32);						\
+	assert((lg_range) > 0);						\
+	assert((lg_range) <= 32);					\
 									\
 	r = (state * (a)) + (c);					\
 	state = r;							\
-	r >>= (32 - lg_range);						\
+	r >>= (32 - (lg_range));					\
 } while (false)

 /* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. */
 #define	prng64(r, lg_range, state, a, c) do {				\
-	assert(lg_range > 0);						\
-	assert(lg_range <= 64);						\
+	assert((lg_range) > 0);						\
+	assert((lg_range) <= 64);					\
 									\
 	r = (state * (a)) + (c);					\
 	state = r;							\
-	r >>= (64 - lg_range);						\
+	r >>= (64 - (lg_range));					\
 } while (false)

 #endif /* JEMALLOC_H_TYPES */
--- a/include/jemalloc/internal/prof.h
+++ b/include/jemalloc/internal/prof.h
@ -3,8 +3,8 @@

 typedef struct prof_bt_s prof_bt_t;
 typedef struct prof_cnt_s prof_cnt_t;
-typedef struct prof_thr_cnt_s prof_thr_cnt_t;
-typedef struct prof_ctx_s prof_ctx_t;
+typedef struct prof_tctx_s prof_tctx_t;
+typedef struct prof_gctx_s prof_gctx_t;
 typedef struct prof_tdata_s prof_tdata_t;

 /* Option defaults. */
@ -23,9 +23,6 @@ typedef struct prof_tdata_s prof_tdata_t;
 */
 #define	PROF_BT_MAX			128

-/* Maximum number of backtraces to store in each per thread LRU cache. */
-#define	PROF_TCMAX			1024
-
 /* Initial hash table size. */
 #define	PROF_CKH_MINITEMS		64

@ -36,11 +33,17 @@ typedef struct prof_tdata_s prof_tdata_t;
 #define	PROF_PRINTF_BUFSIZE		128

 /*
- * Number of mutexes shared among all ctx's.  No space is allocated for these
+ * Number of mutexes shared among all gctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
 #define	PROF_NCTX_LOCKS			1024

+/*
+ * Number of mutexes shared among all tdata's.  No space is allocated for these
+ * unless profiling is enabled, so it's okay to over-provision.
+ */
+#define	PROF_NTDATA_LOCKS		256
+
 /*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
@ -63,141 +66,185 @@ struct prof_bt_s {
 /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
 typedef struct {
 	prof_bt_t	*bt;
-	unsigned	nignore;
 	unsigned	max;
 } prof_unwind_data_t;
 #endif

 struct prof_cnt_s {
-	/*
-	 * Profiling counters.  An allocation/deallocation pair can operate on
-	 * different prof_thr_cnt_t objects that are linked into the same
-	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
-	 * negative.  In principle it is possible for the *bytes counters to
-	 * overflow/underflow, but a general solution would require something
-	 * like 128-bit counters; this implementation doesn't bother to solve
-	 * that problem.
-	 */
-	int64_t		curobjs;
-	int64_t		curbytes;
+	/* Profiling counters. */
+	uint64_t	curobjs;
+	uint64_t	curbytes;
 	uint64_t	accumobjs;
 	uint64_t	accumbytes;
 };

-struct prof_thr_cnt_s {
-	/* Linkage into prof_ctx_t's cnts_ql. */
-	ql_elm(prof_thr_cnt_t)	cnts_link;
+typedef enum {
+	prof_tctx_state_initializing,
+	prof_tctx_state_nominal,
+	prof_tctx_state_dumping,
+	prof_tctx_state_purgatory /* Dumper must finish destroying. */
+} prof_tctx_state_t;

-	/* Linkage into thread's LRU. */
-	ql_elm(prof_thr_cnt_t)	lru_link;
+struct prof_tctx_s {
+	/* Thread data for thread that performed the allocation. */
+	prof_tdata_t		*tdata;

 	/*
-	 * Associated context.  If a thread frees an object that it did not
-	 * allocate, it is possible that the context is not cached in the
-	 * thread's hash table, in which case it must be able to look up the
-	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
-	 * and link it into the prof_ctx_t's cnts_ql.
+	 * Copy of tdata->thr_uid, necessary because tdata may be defunct during
+	 * teardown.
 	 */
-	prof_ctx_t		*ctx;
+	uint64_t		thr_uid;

-	/*
-	 * Threads use memory barriers to update the counters.  Since there is
-	 * only ever one writer, the only challenge is for the reader to get a
-	 * consistent read of the counters.
-	 *
-	 * The writer uses this series of operations:
-	 *
-	 * 1) Increment epoch to an odd number.
-	 * 2) Update counters.
-	 * 3) Increment epoch to an even number.
-	 *
-	 * The reader must assure 1) that the epoch is even while it reads the
-	 * counters, and 2) that the epoch doesn't change between the time it
-	 * starts and finishes reading the counters.
-	 */
-	unsigned		epoch;
-
-	/* Profiling counters. */
+	/* Profiling counters, protected by tdata->lock. */
 	prof_cnt_t		cnts;
+
+	/* Associated global context. */
+	prof_gctx_t		*gctx;
+
+	/*
+	 * UID that distinguishes multiple tctx's created by the same thread,
+	 * but coexisting in gctx->tctxs.  There are two ways that such
+	 * coexistence can occur:
+	 * - A dumper thread can cause a tctx to be retained in the purgatory
+	 *   state.
+	 * - Although a single "producer" thread must create all tctx's which
+	 *   share the same thr_uid, multiple "consumers" can each concurrently
+	 *   execute portions of prof_tctx_destroy().  prof_tctx_destroy() only
+	 *   gets called once each time cnts.cur{objs,bytes} drop to 0, but this
+	 *   threshold can be hit again before the first consumer finishes
+	 *   executing prof_tctx_destroy().
+	 */
+	uint64_t		tctx_uid;
+
+	/* Linkage into gctx's tctxs. */
+	rb_node(prof_tctx_t)	tctx_link;
+
+	/*
+	 * True during prof_alloc_prep()..prof_malloc_sample_object(), prevents
+	 * sample vs destroy race.
+	 */
+	bool			prepared;
+
+	/* Current dump-related state, protected by gctx->lock. */
+	prof_tctx_state_t	state;
+
+	/*
+	 * Copy of cnts snapshotted during early dump phase, protected by
+	 * dump_mtx.
+	 */
+	prof_cnt_t		dump_cnts;
 };
+typedef rb_tree(prof_tctx_t) prof_tctx_tree_t;

-struct prof_ctx_s {
-	/* Associated backtrace. */
-	prof_bt_t		*bt;
-
-	/* Protects nlimbo, cnt_merged, and cnts_ql. */
+struct prof_gctx_s {
+	/* Protects nlimbo, cnt_summed, and tctxs. */
 	malloc_mutex_t		*lock;

 	/*
-	 * Number of threads that currently cause this ctx to be in a state of
+	 * Number of threads that currently cause this gctx to be in a state of
 	 * limbo due to one of:
-	 *   - Initializing per thread counters associated with this ctx.
-	 *   - Preparing to destroy this ctx.
-	 *   - Dumping a heap profile that includes this ctx.
+	 *   - Initializing this gctx.
+	 *   - Initializing per thread counters associated with this gctx.
+	 *   - Preparing to destroy this gctx.
+	 *   - Dumping a heap profile that includes this gctx.
 	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
-	 * ctx.
+	 * gctx.
 	 */
 	unsigned		nlimbo;

+	/*
+	 * Tree of profile counters, one for each thread that has allocated in
+	 * this context.
+	 */
+	prof_tctx_tree_t	tctxs;
+
+	/* Linkage for tree of contexts to be dumped. */
+	rb_node(prof_gctx_t)	dump_link;
+
 	/* Temporary storage for summation during dump. */
 	prof_cnt_t		cnt_summed;

-	/* When threads exit, they merge their stats into cnt_merged. */
-	prof_cnt_t		cnt_merged;
+	/* Associated backtrace. */
+	prof_bt_t		bt;

-	/*
-	 * List of profile counters, one for each thread that has allocated in
-	 * this context.
-	 */
-	ql_head(prof_thr_cnt_t)	cnts_ql;
-
-	/* Linkage for list of contexts to be dumped. */
-	ql_elm(prof_ctx_t)	dump_link;
+	/* Backtrace vector, variable size, referred to by bt. */
+	void			*vec[1];
 };
-typedef ql_head(prof_ctx_t) prof_ctx_list_t;
+typedef rb_tree(prof_gctx_t) prof_gctx_tree_t;

 struct prof_tdata_s {
+	malloc_mutex_t		*lock;
+
+	/* Monotonically increasing unique thread identifier. */
+	uint64_t		thr_uid;
+
 	/*
-	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
-	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
-	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
-	 * others will ever write them.
-	 *
-	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
-	 * counter data into the associated prof_ctx_t objects, and unlink/free
-	 * the prof_thr_cnt_t objects.
+	 * Monotonically increasing discriminator among tdata structures
+	 * associated with the same thr_uid.
 	 */
-	ckh_t			bt2cnt;
+	uint64_t		thr_discrim;

-	/* LRU for contents of bt2cnt. */
-	ql_head(prof_thr_cnt_t)	lru_ql;
+	/* Included in heap profile dumps if non-NULL. */
+	char			*thread_name;

-	/* Backtrace vector, used for calls to prof_backtrace(). */
-	void			**vec;
+	bool			attached;
+	bool			expired;
+
+	rb_node(prof_tdata_t)	tdata_link;
+
+	/*
+	 * Counter used to initialize prof_tctx_t's tctx_uid.  No locking is
+	 * necessary when incrementing this field, because only one thread ever
+	 * does so.
+	 */
+	uint64_t		tctx_uid_next;
+
+	/*
+	 * Hash of (prof_bt_t *)-->(prof_tctx_t *).  Each thread tracks
+	 * backtraces for which it has non-zero allocation/deallocation counters
+	 * associated with thread-specific prof_tctx_t objects.  Other threads
+	 * may write to prof_tctx_t contents when freeing associated objects.
+	 */
+	ckh_t			bt2tctx;

 	/* Sampling state. */
 	uint64_t		prng_state;
-	uint64_t		threshold;
-	uint64_t		accum;
+	uint64_t		bytes_until_sample;

 	/* State used to avoid dumping while operating on prof internals. */
 	bool			enq;
 	bool			enq_idump;
 	bool			enq_gdump;
+
+	/*
+	 * Set to true during an early dump phase for tdata's which are
+	 * currently being dumped.  New threads' tdata's have this initialized
+	 * to false so that they aren't accidentally included in later dump
+	 * phases.
+	 */
+	bool			dumping;
+
+	/*
+	 * True if profiling is active for this tdata's thread
+	 * (thread.prof.active mallctl).
+	 */
+	bool			active;
+
+	/* Temporary storage for summation during dump. */
+	prof_cnt_t		cnt_summed;
+
+	/* Backtrace vector, used for calls to prof_backtrace(). */
+	void			*vec[PROF_BT_MAX];
 };
+typedef rb_tree(prof_tdata_t) prof_tdata_tree_t;

 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

 extern bool	opt_prof;
-/*
- * Even if opt_prof is true, sampling can be temporarily disabled by setting
- * opt_prof_active to false.  No locking is used when updating opt_prof_active,
- * so there are no guarantees regarding how long it will take for all threads
- * to notice state changes.
- */
 extern bool	opt_prof_active;
+extern bool	opt_prof_thread_active_init;
 extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
 extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
 extern bool	opt_prof_gdump;       /* High-water memory dumping. */
@ -211,6 +258,12 @@ extern char	opt_prof_prefix[
 #endif
    1];

+/* Accessed via prof_active_[gs]et{_unlocked,}(). */
+extern bool	prof_active;
+
+/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
+extern bool	prof_gdump_val;
+
 /*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
@ -221,391 +274,248 @@ extern char	opt_prof_prefix[
 extern uint64_t	prof_interval;

 /*
- * If true, promote small sampled objects to large objects, since small run
- * headers do not have embedded profile context pointers.
+ * Initialized as opt_lg_prof_sample, and potentially modified during profiling
+ * resets.
 */
-extern bool	prof_promote;
+extern size_t	lg_prof_sample;

+void	prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
+void	prof_malloc_sample_object(const void *ptr, size_t usize,
+    prof_tctx_t *tctx);
+void	prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx);
 void	bt_init(prof_bt_t *bt, void **vec);
-void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
-prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
+void	prof_backtrace(prof_bt_t *bt);
+prof_tctx_t	*prof_lookup(tsd_t *tsd, prof_bt_t *bt);
 #ifdef JEMALLOC_JET
+size_t	prof_tdata_count(void);
 size_t	prof_bt_count(void);
+const prof_cnt_t *prof_cnt_all(void);
 typedef int (prof_dump_open_t)(bool, const char *);
 extern prof_dump_open_t *prof_dump_open;
+typedef bool (prof_dump_header_t)(bool, const prof_cnt_t *);
+extern prof_dump_header_t *prof_dump_header;
 #endif
 void	prof_idump(void);
 bool	prof_mdump(const char *filename);
 void	prof_gdump(void);
-prof_tdata_t	*prof_tdata_init(void);
-void	prof_tdata_cleanup(void *arg);
+prof_tdata_t	*prof_tdata_init(tsd_t *tsd);
+prof_tdata_t	*prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata);
+void	prof_reset(tsd_t *tsd, size_t lg_sample);
+void	prof_tdata_cleanup(tsd_t *tsd);
+const char	*prof_thread_name_get(void);
+bool	prof_active_get(void);
+bool	prof_active_set(bool active);
+int	prof_thread_name_set(tsd_t *tsd, const char *thread_name);
+bool	prof_thread_active_get(void);
+bool	prof_thread_active_set(bool active);
+bool	prof_thread_active_init_get(void);
+bool	prof_thread_active_init_set(bool active_init);
+bool	prof_gdump_get(void);
+bool	prof_gdump_set(bool active);
 void	prof_boot0(void);
 void	prof_boot1(void);
 bool	prof_boot2(void);
 void	prof_prefork(void);
 void	prof_postfork_parent(void);
 void	prof_postfork_child(void);
+void	prof_sample_threshold_update(prof_tdata_t *tdata);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

-#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
-	prof_tdata_t *prof_tdata;					\
-	prof_bt_t bt;							\
-									\
-	assert(size == s2u(size));					\
-									\
-	prof_tdata = prof_tdata_get(true);				\
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
-		if (prof_tdata != NULL)					\
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
-		else							\
-			ret = NULL;					\
-		break;							\
-	}								\
-									\
-	if (opt_prof_active == false) {					\
-		/* Sampling is currently inactive, so avoid sampling. */\
-		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
-	} else if (opt_lg_prof_sample == 0) {				\
-		/* Don't bother with sampling logic, since sampling   */\
-		/* interval is 1.                                     */\
-		bt_init(&bt, prof_tdata->vec);				\
-		prof_backtrace(&bt, nignore);				\
-		ret = prof_lookup(&bt);					\
-	} else {							\
-		if (prof_tdata->threshold == 0) {			\
-			/* Initialize.  Seed the prng differently for */\
-			/* each thread.                               */\
-			prof_tdata->prng_state =			\
-			    (uint64_t)(uintptr_t)&size;			\
-			prof_sample_threshold_update(prof_tdata);	\
-		}							\
-									\
-		/* Determine whether to capture a backtrace based on  */\
-		/* whether size is enough for prof_accum to reach     */\
-		/* prof_tdata->threshold.  However, delay updating    */\
-		/* these variables until prof_{m,re}alloc(), because  */\
-		/* we don't know for sure that the allocation will    */\
-		/* succeed.                                           */\
-		/*                                                    */\
-		/* Use subtraction rather than addition to avoid      */\
-		/* potential integer overflow.                        */\
-		if (size >= prof_tdata->threshold -			\
-		    prof_tdata->accum) {				\
-			bt_init(&bt, prof_tdata->vec);			\
-			prof_backtrace(&bt, nignore);			\
-			ret = prof_lookup(&bt);				\
-		} else							\
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
-	}								\
-} while (0)
-
 #ifndef JEMALLOC_ENABLE_INLINE
-malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)
-
-prof_tdata_t	*prof_tdata_get(bool create);
-void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
-prof_ctx_t	*prof_ctx_get(const void *ptr);
-void	prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
-bool	prof_sample_accum_update(size_t size);
-void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
-void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
-    size_t old_usize, prof_ctx_t *old_ctx);
-void	prof_free(const void *ptr, size_t size);
+bool	prof_active_get_unlocked(void);
+bool	prof_gdump_get_unlocked(void);
+prof_tdata_t	*prof_tdata_get(tsd_t *tsd, bool create);
+bool	prof_sample_accum_update(tsd_t *tsd, size_t usize, bool commit,
+    prof_tdata_t **tdata_out);
+prof_tctx_t	*prof_alloc_prep(tsd_t *tsd, size_t usize, bool update);
+prof_tctx_t	*prof_tctx_get(const void *ptr);
+void	prof_tctx_set(const void *ptr, prof_tctx_t *tctx);
+void	prof_malloc_sample_object(const void *ptr, size_t usize,
+    prof_tctx_t *tctx);
+void	prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx);
+void	prof_realloc(tsd_t *tsd, const void *ptr, size_t usize,
+    prof_tctx_t *tctx, bool updated, size_t old_usize, prof_tctx_t *old_tctx);
+void	prof_free(tsd_t *tsd, const void *ptr, size_t usize);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
-/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
-malloc_tsd_externs(prof_tdata, prof_tdata_t *)
-malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
-    prof_tdata_cleanup)
-
-JEMALLOC_INLINE prof_tdata_t *
-prof_tdata_get(bool create)
+JEMALLOC_ALWAYS_INLINE bool
+prof_active_get_unlocked(void)
 {
-	prof_tdata_t *prof_tdata;
+
+	/*
+	 * Even if opt_prof is true, sampling can be temporarily disabled by
+	 * setting prof_active to false.  No locking is used when reading
+	 * prof_active in the fast path, so there are no guarantees regarding
+	 * how long it will take for all threads to notice state changes.
+	 */
+	return (prof_active);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_gdump_get_unlocked(void)
+{
+
+	/*
+	 * No locking is used when reading prof_gdump_val in the fast path, so
+	 * there are no guarantees regarding how long it will take for all
+	 * threads to notice state changes.
+	 */
+	return (prof_gdump_val);
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tdata_t *
+prof_tdata_get(tsd_t *tsd, bool create)
+{
+	prof_tdata_t *tdata;

 	cassert(config_prof);

-	prof_tdata = *prof_tdata_tsd_get();
-	if (create && prof_tdata == NULL)
-		prof_tdata = prof_tdata_init();
+	tdata = tsd_prof_tdata_get(tsd);
+	if (create) {
+		if (unlikely(tdata == NULL)) {
+			if (tsd_nominal(tsd)) {
+				tdata = prof_tdata_init(tsd);
+				tsd_prof_tdata_set(tsd, tdata);
+			}
+		} else if (unlikely(tdata->expired)) {
+			tdata = prof_tdata_reinit(tsd, tdata);
+			tsd_prof_tdata_set(tsd, tdata);
+		}
+		assert(tdata == NULL || tdata->attached);
+	}

-	return (prof_tdata);
+	return (tdata);
 }

-JEMALLOC_INLINE void
-prof_sample_threshold_update(prof_tdata_t *prof_tdata)
+JEMALLOC_ALWAYS_INLINE prof_tctx_t *
+prof_tctx_get(const void *ptr)
 {
-	/*
-	 * The body of this function is compiled out unless heap profiling is
-	 * enabled, so that it is possible to compile jemalloc with floating
-	 * point support completely disabled.  Avoiding floating point code is
-	 * important on memory-constrained systems, but it also enables a
-	 * workaround for versions of glibc that don't properly save/restore
-	 * floating point registers during dynamic lazy symbol loading (which
-	 * internally calls into whatever malloc implementation happens to be
-	 * integrated into the application).  Note that some compilers (e.g.
-	 * gcc 4.8) may use floating point registers for fast memory moves, so
-	 * jemalloc must be compiled with such optimizations disabled (e.g.
-	 * -mno-sse) in order for the workaround to be complete.
-	 */
-#ifdef JEMALLOC_PROF
-	uint64_t r;
-	double u;
-
-	cassert(config_prof);
-
-	/*
-	 * Compute sample threshold as a geometrically distributed random
-	 * variable with mean (2^opt_lg_prof_sample).
-	 *
-	 *                         __        __
-	 *                         |  log(u)  |                     1
-	 * prof_tdata->threshold = | -------- |, where p = -------------------
-	 *                         | log(1-p) |             opt_lg_prof_sample
-	 *                                                 2
-	 *
-	 * For more information on the math, see:
-	 *
-	 *   Non-Uniform Random Variate Generation
-	 *   Luc Devroye
-	 *   Springer-Verlag, New York, 1986
-	 *   pp 500
-	 *   (http://luc.devroye.org/rnbookindex.html)
-	 */
-	prng64(r, 53, prof_tdata->prng_state,
-	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
-	u = (double)r * (1.0/9007199254740992.0L);
-	prof_tdata->threshold = (uint64_t)(log(u) /
-	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
-	    + (uint64_t)1U;
-#endif
-}
-
-JEMALLOC_INLINE prof_ctx_t *
-prof_ctx_get(const void *ptr)
-{
-	prof_ctx_t *ret;
-	arena_chunk_t *chunk;

 	cassert(config_prof);
 	assert(ptr != NULL);

-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	if (chunk != ptr) {
-		/* Region. */
-		ret = arena_prof_ctx_get(ptr);
-	} else
-		ret = huge_prof_ctx_get(ptr);
+	return (arena_prof_tctx_get(ptr));
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_tctx_set(const void *ptr, prof_tctx_t *tctx)
+{
+
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	arena_prof_tctx_set(ptr, tctx);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
+    prof_tdata_t **tdata_out)
+{
+	prof_tdata_t *tdata;
+
+	cassert(config_prof);
+
+	tdata = prof_tdata_get(tsd, true);
+	if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+		tdata = NULL;
+
+	if (tdata_out != NULL)
+		*tdata_out = tdata;
+
+	if (tdata == NULL)
+		return (true);
+
+	if (tdata->bytes_until_sample >= usize) {
+		if (update)
+			tdata->bytes_until_sample -= usize;
+		return (true);
+	} else {
+		/* Compute new sample threshold. */
+		if (update)
+			prof_sample_threshold_update(tdata);
+		return (!tdata->active);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tctx_t *
+prof_alloc_prep(tsd_t *tsd, size_t usize, bool update)
+{
+	prof_tctx_t *ret;
+	prof_tdata_t *tdata;
+	prof_bt_t bt;
+
+	assert(usize == s2u(usize));
+
+	if (!prof_active_get_unlocked() || likely(prof_sample_accum_update(tsd,
+	    usize, update, &tdata)))
+		ret = (prof_tctx_t *)(uintptr_t)1U;
+	else {
+		bt_init(&bt, tdata->vec);
+		prof_backtrace(&bt);
+		ret = prof_lookup(tsd, &bt);
+	}

 	return (ret);
 }

-JEMALLOC_INLINE void
-prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
-{
-	arena_chunk_t *chunk;
-
-	cassert(config_prof);
-	assert(ptr != NULL);
-
-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-	if (chunk != ptr) {
-		/* Region. */
-		arena_prof_ctx_set(ptr, usize, ctx);
-	} else
-		huge_prof_ctx_set(ptr, ctx);
-}
-
-JEMALLOC_INLINE bool
-prof_sample_accum_update(size_t size)
-{
-	prof_tdata_t *prof_tdata;
-
-	cassert(config_prof);
-	/* Sampling logic is unnecessary if the interval is 1. */
-	assert(opt_lg_prof_sample != 0);
-
-	prof_tdata = prof_tdata_get(false);
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
-		return (true);
-
-	/* Take care to avoid integer overflow. */
-	if (size >= prof_tdata->threshold - prof_tdata->accum) {
-		prof_tdata->accum -= (prof_tdata->threshold - size);
-		/* Compute new sample threshold. */
-		prof_sample_threshold_update(prof_tdata);
-		while (prof_tdata->accum >= prof_tdata->threshold) {
-			prof_tdata->accum -= prof_tdata->threshold;
-			prof_sample_threshold_update(prof_tdata);
-		}
-		return (false);
-	} else {
-		prof_tdata->accum += size;
-		return (true);
-	}
-}
-
-JEMALLOC_INLINE void
-prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
+JEMALLOC_ALWAYS_INLINE void
+prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx)
 {

 	cassert(config_prof);
 	assert(ptr != NULL);
 	assert(usize == isalloc(ptr, true));

-	if (opt_lg_prof_sample != 0) {
-		if (prof_sample_accum_update(usize)) {
-			/*
-			 * Don't sample.  For malloc()-like allocation, it is
-			 * always possible to tell in advance how large an
-			 * object's usable size will be, so there should never
-			 * be a difference between the usize passed to
-			 * PROF_ALLOC_PREP() and prof_malloc().
-			 */
-			assert((uintptr_t)cnt == (uintptr_t)1U);
-		}
-	}
-
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		prof_ctx_set(ptr, usize, cnt->ctx);
-
-		cnt->epoch++;
-		/*********/
-		mb_write();
-		/*********/
-		cnt->cnts.curobjs++;
-		cnt->cnts.curbytes += usize;
-		if (opt_prof_accum) {
-			cnt->cnts.accumobjs++;
-			cnt->cnts.accumbytes += usize;
-		}
-		/*********/
-		mb_write();
-		/*********/
-		cnt->epoch++;
-		/*********/
-		mb_write();
-		/*********/
-	} else
-		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
+	if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
+		prof_malloc_sample_object(ptr, usize, tctx);
+	else
+		prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U);
 }

-JEMALLOC_INLINE void
-prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
-    size_t old_usize, prof_ctx_t *old_ctx)
+JEMALLOC_ALWAYS_INLINE void
+prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx,
+    bool updated, size_t old_usize, prof_tctx_t *old_tctx)
 {
-	prof_thr_cnt_t *told_cnt;

 	cassert(config_prof);
-	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
+	assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);

-	if (ptr != NULL) {
+	if (!updated && ptr != NULL) {
 		assert(usize == isalloc(ptr, true));
-		if (opt_lg_prof_sample != 0) {
-			if (prof_sample_accum_update(usize)) {
+		if (prof_sample_accum_update(tsd, usize, true, NULL)) {
 			/*
-				 * Don't sample.  The usize passed to
-				 * PROF_ALLOC_PREP() was larger than what
-				 * actually got allocated, so a backtrace was
-				 * captured for this allocation, even though
-				 * its actual usize was insufficient to cross
-				 * the sample threshold.
+			 * Don't sample.  The usize passed to PROF_ALLOC_PREP()
+			 * was larger than what actually got allocated, so a
+			 * backtrace was captured for this allocation, even
+			 * though its actual usize was insufficient to cross the
+			 * sample threshold.
 			 */
-				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
-			}
+			tctx = (prof_tctx_t *)(uintptr_t)1U;
 		}
 	}

-	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
-		told_cnt = prof_lookup(old_ctx->bt);
-		if (told_cnt == NULL) {
-			/*
-			 * It's too late to propagate OOM for this realloc(),
-			 * so operate directly on old_cnt->ctx->cnt_merged.
-			 */
-			malloc_mutex_lock(old_ctx->lock);
-			old_ctx->cnt_merged.curobjs--;
-			old_ctx->cnt_merged.curbytes -= old_usize;
-			malloc_mutex_unlock(old_ctx->lock);
-			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
-		}
-	} else
-		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
-
-	if ((uintptr_t)told_cnt > (uintptr_t)1U)
-		told_cnt->epoch++;
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		prof_ctx_set(ptr, usize, cnt->ctx);
-		cnt->epoch++;
-	} else if (ptr != NULL)
-		prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
-	/*********/
-	mb_write();
-	/*********/
-	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
-		told_cnt->cnts.curobjs--;
-		told_cnt->cnts.curbytes -= old_usize;
-	}
-	if ((uintptr_t)cnt > (uintptr_t)1U) {
-		cnt->cnts.curobjs++;
-		cnt->cnts.curbytes += usize;
-		if (opt_prof_accum) {
-			cnt->cnts.accumobjs++;
-			cnt->cnts.accumbytes += usize;
-		}
-	}
-	/*********/
-	mb_write();
-	/*********/
-	if ((uintptr_t)told_cnt > (uintptr_t)1U)
-		told_cnt->epoch++;
-	if ((uintptr_t)cnt > (uintptr_t)1U)
-		cnt->epoch++;
-	/*********/
-	mb_write(); /* Not strictly necessary. */
+	if (unlikely((uintptr_t)old_tctx > (uintptr_t)1U))
+		prof_free_sampled_object(tsd, old_usize, old_tctx);
+	if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
+		prof_malloc_sample_object(ptr, usize, tctx);
+	else
+		prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U);
 }

-JEMALLOC_INLINE void
-prof_free(const void *ptr, size_t size)
+JEMALLOC_ALWAYS_INLINE void
+prof_free(tsd_t *tsd, const void *ptr, size_t usize)
 {
-	prof_ctx_t *ctx = prof_ctx_get(ptr);
+	prof_tctx_t *tctx = prof_tctx_get(ptr);

 	cassert(config_prof);
+	assert(usize == isalloc(ptr, true));

-	if ((uintptr_t)ctx > (uintptr_t)1) {
-		prof_thr_cnt_t *tcnt;
-		assert(size == isalloc(ptr, true));
-		tcnt = prof_lookup(ctx->bt);
-
-		if (tcnt != NULL) {
-			tcnt->epoch++;
-			/*********/
-			mb_write();
-			/*********/
-			tcnt->cnts.curobjs--;
-			tcnt->cnts.curbytes -= size;
-			/*********/
-			mb_write();
-			/*********/
-			tcnt->epoch++;
-			/*********/
-			mb_write();
-			/*********/
-		} else {
-			/*
-			 * OOM during free() cannot be propagated, so operate
-			 * directly on cnt->ctx->cnt_merged.
-			 */
-			malloc_mutex_lock(ctx->lock);
-			ctx->cnt_merged.curobjs--;
-			ctx->cnt_merged.curbytes -= size;
-			malloc_mutex_unlock(ctx->lock);
-		}
-	}
+	if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
+		prof_free_sampled_object(tsd, usize, tctx);
 }
 #endif

--- a/include/jemalloc/internal/ql.h
+++ b/include/jemalloc/internal/ql.h
@ -1,6 +1,4 @@
-/*
- * List definitions.
- */
+/* List definitions. */
 #define	ql_head(a_type)							\
 struct {								\
 	a_type *qlh_first;						\
--- a/include/jemalloc/internal/qr.h
+++ b/include/jemalloc/internal/qr.h
@ -40,8 +40,10 @@ struct {								\
 	(a_qr_b)->a_field.qre_prev = t;					\
 } while (0)

-/* qr_meld() and qr_split() are functionally equivalent, so there's no need to
- * have two copies of the code. */
+/*
+ * qr_meld() and qr_split() are functionally equivalent, so there's no need to
+ * have two copies of the code.
+ */
 #define	qr_split(a_qr_a, a_qr_b, a_field)				\
 	qr_meld((a_qr_a), (a_qr_b), a_field)

--- a/include/jemalloc/internal/quarantine.h
+++ b/include/jemalloc/internal/quarantine.h
@ -29,36 +29,29 @@ struct quarantine_s {
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

-quarantine_t	*quarantine_init(size_t lg_maxobjs);
-void	quarantine(void *ptr);
-void	quarantine_cleanup(void *arg);
-bool	quarantine_boot(void);
+void	quarantine_alloc_hook_work(tsd_t *tsd);
+void	quarantine(tsd_t *tsd, void *ptr);
+void	quarantine_cleanup(tsd_t *tsd);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

 #ifndef JEMALLOC_ENABLE_INLINE
-malloc_tsd_protos(JEMALLOC_ATTR(unused), quarantine, quarantine_t *)
-
 void	quarantine_alloc_hook(void);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_QUARANTINE_C_))
-malloc_tsd_externs(quarantine, quarantine_t *)
-malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, quarantine, quarantine_t *, NULL,
-    quarantine_cleanup)
-
 JEMALLOC_ALWAYS_INLINE void
 quarantine_alloc_hook(void)
 {
-	quarantine_t *quarantine;
+	tsd_t *tsd;

 	assert(config_fill && opt_quarantine);

-	quarantine = *quarantine_tsd_get();
-	if (quarantine == NULL)
-		quarantine_init(LG_MAXOBJS_INIT);
+	tsd = tsd_fetch();
+	if (tsd_quarantine_get(tsd) == NULL)
+		quarantine_alloc_hook_work(tsd);
 }
 #endif

--- a/include/jemalloc/internal/rb.h
+++ b/include/jemalloc/internal/rb.h
@ -158,6 +158,8 @@ struct {								\
 #define	rb_proto(a_attr, a_prefix, a_rbt_type, a_type)			\
 a_attr void								\
 a_prefix##new(a_rbt_type *rbtree);					\
+a_attr bool								\
+a_prefix##empty(a_rbt_type *rbtree);					\
 a_attr a_type *								\
 a_prefix##first(a_rbt_type *rbtree);					\
 a_attr a_type *								\
@ -198,7 +200,7 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start,		\
 *                 int (a_cmp *)(a_type *a_node, a_type *a_other);
 *                                       ^^^^^^
 *                                    or a_key
- *               Interpretation of comparision function return values:
+ *               Interpretation of comparison function return values:
 *                 -1 : a_node <  a_other
 *                  0 : a_node == a_other
 *                  1 : a_node >  a_other
@ -224,6 +226,13 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start,		\
 *       Args:
 *         tree: Pointer to an uninitialized red-black tree object.
 *
+ *   static bool
+ *   ex_empty(ex_t *tree);
+ *       Description: Determine whether tree is empty.
+ *       Args:
+ *         tree: Pointer to an initialized red-black tree object.
+ *       Ret: True if tree is empty, false otherwise.
+ *
 *   static ex_node_t *
 *   ex_first(ex_t *tree);
 *   static ex_node_t *
@ -309,6 +318,10 @@ a_attr void								\
 a_prefix##new(a_rbt_type *rbtree) {					\
    rb_new(a_type, a_field, rbtree);					\
 }									\
+a_attr bool								\
+a_prefix##empty(a_rbt_type *rbtree) {					\
+    return (rbtree->rbt_root == &rbtree->rbt_nil);			\
+}									\
 a_attr a_type *								\
 a_prefix##first(a_rbt_type *rbtree) {					\
    a_type *ret;							\
@ -580,7 +593,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) {			\
 	if (left != &rbtree->rbt_nil) {					\
 	    /* node has no successor, but it has a left child.        */\
 	    /* Splice node out, without losing the left child.        */\
-	    assert(rbtn_red_get(a_type, a_field, node) == false);	\
+	    assert(!rbtn_red_get(a_type, a_field, node));		\
 	    assert(rbtn_red_get(a_type, a_field, left));		\
 	    rbtn_black_set(a_type, a_field, left);			\
 	    if (pathp == path) {					\
@ -616,8 +629,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) {			\
 	if (pathp->cmp < 0) {						\
 	    rbtn_left_set(a_type, a_field, pathp->node,			\
 	      pathp[1].node);						\
-	    assert(rbtn_red_get(a_type, a_field, pathp[1].node)		\
-	      == false);						\
+	    assert(!rbtn_red_get(a_type, a_field, pathp[1].node));	\
 	    if (rbtn_red_get(a_type, a_field, pathp->node)) {		\
 		a_type *right = rbtn_right_get(a_type, a_field,		\
 		  pathp->node);						\
@ -681,7 +693,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) {			\
 		    rbtn_rotate_left(a_type, a_field, pathp->node,	\
 		      tnode);						\
 		    /* Balance restored, but rotation modified        */\
-		    /* subree root, which may actually be the tree    */\
+		    /* subtree root, which may actually be the tree   */\
 		    /* root.                                          */\
 		    if (pathp == path) {				\
 			/* Set root. */					\
@ -849,7 +861,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) {			\
    }									\
    /* Set root. */							\
    rbtree->rbt_root = path->node;					\
-    assert(rbtn_red_get(a_type, a_field, rbtree->rbt_root) == false);	\
+    assert(!rbtn_red_get(a_type, a_field, rbtree->rbt_root));		\
 }									\
 a_attr a_type *								\
 a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node,		\
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@ -1,170 +1,292 @@
 /*
 * This radix tree implementation is tailored to the singular purpose of
- * tracking which chunks are currently owned by jemalloc.  This functionality
- * is mandatory for OS X, where jemalloc must be able to respond to object
- * ownership queries.
+ * associating metadata with chunks that are currently owned by jemalloc.
 *
 *******************************************************************************
 */
 #ifdef JEMALLOC_H_TYPES

+typedef struct rtree_node_elm_s rtree_node_elm_t;
+typedef struct rtree_level_s rtree_level_t;
 typedef struct rtree_s rtree_t;

 /*
- * Size of each radix tree node (must be a power of 2).  This impacts tree
- * depth.
+ * RTREE_BITS_PER_LEVEL must be a power of two that is no larger than the
+ * machine address width.
 */
-#define	RTREE_NODESIZE (1U << 16)
+#define	LG_RTREE_BITS_PER_LEVEL	4
+#define	RTREE_BITS_PER_LEVEL	(ZU(1) << LG_RTREE_BITS_PER_LEVEL)
+#define	RTREE_HEIGHT_MAX						\
+    ((ZU(1) << (LG_SIZEOF_PTR+3)) / RTREE_BITS_PER_LEVEL)

-typedef void *(rtree_alloc_t)(size_t);
-typedef void (rtree_dalloc_t)(void *);
+/* Used for two-stage lock-free node initialization. */
+#define	RTREE_NODE_INITIALIZING	((rtree_node_elm_t *)0x1)
+
+/*
+ * The node allocation callback function's argument is the number of contiguous
+ * rtree_node_elm_t structures to allocate, and the resulting memory must be
+ * zeroed.
+ */
+typedef rtree_node_elm_t *(rtree_node_alloc_t)(size_t);
+typedef void (rtree_node_dalloc_t)(rtree_node_elm_t *);

 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS

+struct rtree_node_elm_s {
+	union {
+		void			*pun;
+		rtree_node_elm_t	*child;
+		extent_node_t		*val;
+	};
+};
+
+struct rtree_level_s {
+	/*
+	 * A non-NULL subtree points to a subtree rooted along the hypothetical
+	 * path to the leaf node corresponding to key 0.  Depending on what keys
+	 * have been used to store to the tree, an arbitrary combination of
+	 * subtree pointers may remain NULL.
+	 *
+	 * Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4.
+	 * This results in a 3-level tree, and the leftmost leaf can be directly
+	 * accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding
+	 * 0x00000000) can be accessed via subtrees[1], and the remainder of the
+	 * tree can be accessed via subtrees[0].
+	 *
+	 *   levels[0] : [<unused> | 0x0001******** | 0x0002******** | ...]
+	 *
+	 *   levels[1] : [<unused> | 0x00000001**** | 0x00000002**** | ... ]
+	 *
+	 *   levels[2] : [val(0x000000000000) | val(0x000000000001) | ...]
+	 *
+	 * This has practical implications on x64, which currently uses only the
+	 * lower 47 bits of virtual address space in userland, thus leaving
+	 * subtrees[0] unused and avoiding a level of tree traversal.
+	 */
+	union {
+		void			*subtree_pun;
+		rtree_node_elm_t	*subtree;
+	};
+	/* Number of key bits distinguished by this level. */
+	unsigned		bits;
+	/*
+	 * Cumulative number of key bits distinguished by traversing to the
+	 * corresponding tree level.
+	 */
+	unsigned		cumbits;
+};
+
 struct rtree_s {
-	rtree_alloc_t	*alloc;
-	rtree_dalloc_t	*dalloc;
-	malloc_mutex_t	mutex;
-	void		**root;
+	rtree_node_alloc_t	*alloc;
+	rtree_node_dalloc_t	*dalloc;
 	unsigned		height;
-	unsigned	level2bits[1]; /* Dynamically sized. */
+	/*
+	 * Precomputed table used to convert from the number of leading 0 key
+	 * bits to which subtree level to start at.
+	 */
+	unsigned		start_level[RTREE_HEIGHT_MAX];
+	rtree_level_t		levels[RTREE_HEIGHT_MAX];
 };

 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

-rtree_t	*rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc);
+bool rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc,
+    rtree_node_dalloc_t *dalloc);
 void	rtree_delete(rtree_t *rtree);
-void	rtree_prefork(rtree_t *rtree);
-void	rtree_postfork_parent(rtree_t *rtree);
-void	rtree_postfork_child(rtree_t *rtree);
+rtree_node_elm_t	*rtree_subtree_read_hard(rtree_t *rtree,
+    unsigned level);
+rtree_node_elm_t	*rtree_child_read_hard(rtree_t *rtree,
+    rtree_node_elm_t *elm, unsigned level);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

 #ifndef JEMALLOC_ENABLE_INLINE
-#ifdef JEMALLOC_DEBUG
-uint8_t rtree_get_locked(rtree_t *rtree, uintptr_t key);
-#endif
-uint8_t	rtree_get(rtree_t *rtree, uintptr_t key);
-bool	rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val);
+unsigned	rtree_start_level(rtree_t *rtree, uintptr_t key);
+uintptr_t	rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level);
+
+bool	rtree_node_valid(rtree_node_elm_t *node);
+rtree_node_elm_t	*rtree_child_tryread(rtree_node_elm_t *elm);
+rtree_node_elm_t	*rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm,
+    unsigned level);
+extent_node_t	*rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm,
+    bool dependent);
+void	rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm,
+    const extent_node_t *val);
+rtree_node_elm_t	*rtree_subtree_tryread(rtree_t *rtree, unsigned level);
+rtree_node_elm_t	*rtree_subtree_read(rtree_t *rtree, unsigned level);
+
+extent_node_t	*rtree_get(rtree_t *rtree, uintptr_t key, bool dependent);
+bool	rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
-#define	RTREE_GET_GENERATE(f)						\
-/* The least significant bits of the key are ignored. */		\
-JEMALLOC_INLINE uint8_t							\
-f(rtree_t *rtree, uintptr_t key)					\
-{									\
-	uint8_t ret;							\
-	uintptr_t subkey;						\
-	unsigned i, lshift, height, bits;				\
-	void **node, **child;						\
-									\
-	RTREE_LOCK(&rtree->mutex);					\
-	for (i = lshift = 0, height = rtree->height, node = rtree->root;\
-	    i < height - 1;						\
-	    i++, lshift += bits, node = child) {			\
-		bits = rtree->level2bits[i];				\
-		subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR +	\
-		    3)) - bits);					\
-		child = (void**)node[subkey];				\
-		if (child == NULL) {					\
-			RTREE_UNLOCK(&rtree->mutex);			\
-			return (0);					\
-		}							\
-	}								\
-									\
-	/*								\
-	 * node is a leaf, so it contains values rather than node	\
-	 * pointers.							\
-	 */								\
-	bits = rtree->level2bits[i];					\
-	subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -	\
-	    bits);							\
-	{								\
-		uint8_t *leaf = (uint8_t *)node;			\
-		ret = leaf[subkey];					\
-	}								\
-	RTREE_UNLOCK(&rtree->mutex);					\
-									\
-	RTREE_GET_VALIDATE						\
-	return (ret);							\
+JEMALLOC_INLINE unsigned
+rtree_start_level(rtree_t *rtree, uintptr_t key)
+{
+	unsigned start_level;
+
+	if (unlikely(key == 0))
+		return (rtree->height - 1);
+
+	start_level = rtree->start_level[lg_floor(key) >>
+	    LG_RTREE_BITS_PER_LEVEL];
+	assert(start_level < rtree->height);
+	return (start_level);
 }

-#ifdef JEMALLOC_DEBUG
-#  define RTREE_LOCK(l)		malloc_mutex_lock(l)
-#  define RTREE_UNLOCK(l)	malloc_mutex_unlock(l)
-#  define RTREE_GET_VALIDATE
-RTREE_GET_GENERATE(rtree_get_locked)
-#  undef RTREE_LOCK
-#  undef RTREE_UNLOCK
-#  undef RTREE_GET_VALIDATE
-#endif
+JEMALLOC_INLINE uintptr_t
+rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level)
+{

-#define	RTREE_LOCK(l)
-#define	RTREE_UNLOCK(l)
-#ifdef JEMALLOC_DEBUG
-   /*
-    * Suppose that it were possible for a jemalloc-allocated chunk to be
-    * munmap()ped, followed by a different allocator in another thread re-using
-    * overlapping virtual memory, all without invalidating the cached rtree
-    * value.  The result would be a false positive (the rtree would claim that
-    * jemalloc owns memory that it had actually discarded).  This scenario
-    * seems impossible, but the following assertion is a prudent sanity check.
-    */
-#  define RTREE_GET_VALIDATE						\
-	assert(rtree_get_locked(rtree, key) == ret);
-#else
-#  define RTREE_GET_VALIDATE
-#endif
-RTREE_GET_GENERATE(rtree_get)
-#undef RTREE_LOCK
-#undef RTREE_UNLOCK
-#undef RTREE_GET_VALIDATE
+	return ((key >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
+	    rtree->levels[level].cumbits)) & ((ZU(1) <<
+	    rtree->levels[level].bits) - 1));
+}

 JEMALLOC_INLINE bool
-rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val)
+rtree_node_valid(rtree_node_elm_t *node)
+{
+
+	return ((uintptr_t)node > (uintptr_t)RTREE_NODE_INITIALIZING);
+}
+
+JEMALLOC_INLINE rtree_node_elm_t *
+rtree_child_tryread(rtree_node_elm_t *elm)
+{
+	rtree_node_elm_t *child;
+
+	/* Double-checked read (first read may be stale). */
+	child = elm->child;
+	if (!rtree_node_valid(child))
+		child = atomic_read_p(&elm->pun);
+	return (child);
+}
+
+JEMALLOC_INLINE rtree_node_elm_t *
+rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level)
+{
+	rtree_node_elm_t *child;
+
+	child = rtree_child_tryread(elm);
+	if (unlikely(!rtree_node_valid(child)))
+		child = rtree_child_read_hard(rtree, elm, level);
+	return (child);
+}
+
+JEMALLOC_INLINE extent_node_t *
+rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm, bool dependent)
+{
+
+	if (dependent) {
+		/*
+		 * Reading a val on behalf of a pointer to a valid allocation is
+		 * guaranteed to be a clean read even without synchronization,
+		 * because the rtree update became visible in memory before the
+		 * pointer came into existence.
+		 */
+		return (elm->val);
+	} else {
+		/*
+		 * An arbitrary read, e.g. on behalf of ivsalloc(), may not be
+		 * dependent on a previous rtree write, which means a stale read
+		 * could result if synchronization were omitted here.
+		 */
+		return (atomic_read_p(&elm->pun));
+	}
+}
+
+JEMALLOC_INLINE void
+rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, const extent_node_t *val)
+{
+
+	atomic_write_p(&elm->pun, val);
+}
+
+JEMALLOC_INLINE rtree_node_elm_t *
+rtree_subtree_tryread(rtree_t *rtree, unsigned level)
+{
+	rtree_node_elm_t *subtree;
+
+	/* Double-checked read (first read may be stale). */
+	subtree = rtree->levels[level].subtree;
+	if (!rtree_node_valid(subtree))
+		subtree = atomic_read_p(&rtree->levels[level].subtree_pun);
+	return (subtree);
+}
+
+JEMALLOC_INLINE rtree_node_elm_t *
+rtree_subtree_read(rtree_t *rtree, unsigned level)
+{
+	rtree_node_elm_t *subtree;
+
+	subtree = rtree_subtree_tryread(rtree, level);
+	if (unlikely(!rtree_node_valid(subtree)))
+		subtree = rtree_subtree_read_hard(rtree, level);
+	return (subtree);
+}
+
+JEMALLOC_INLINE extent_node_t *
+rtree_get(rtree_t *rtree, uintptr_t key, bool dependent)
 {
 	uintptr_t subkey;
-	unsigned i, lshift, height, bits;
-	void **node, **child;
+	unsigned i, start_level;
+	rtree_node_elm_t *node, *child;

-	malloc_mutex_lock(&rtree->mutex);
-	for (i = lshift = 0, height = rtree->height, node = rtree->root;
-	    i < height - 1;
-	    i++, lshift += bits, node = child) {
-		bits = rtree->level2bits[i];
-		subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
-		    bits);
-		child = (void**)node[subkey];
-		if (child == NULL) {
-			size_t size = ((i + 1 < height - 1) ? sizeof(void *)
-			    : (sizeof(uint8_t))) << rtree->level2bits[i+1];
-			child = (void**)rtree->alloc(size);
-			if (child == NULL) {
-				malloc_mutex_unlock(&rtree->mutex);
+	start_level = rtree_start_level(rtree, key);
+
+	for (i = start_level, node = rtree_subtree_tryread(rtree, start_level);
+	    /**/; i++, node = child) {
+		if (!dependent && unlikely(!rtree_node_valid(node)))
+			return (NULL);
+		subkey = rtree_subkey(rtree, key, i);
+		if (i == rtree->height - 1) {
+			/*
+			 * node is a leaf, so it contains values rather than
+			 * child pointers.
+			 */
+			return (rtree_val_read(rtree, &node[subkey],
+			    dependent));
+		}
+		assert(i < rtree->height - 1);
+		child = rtree_child_tryread(&node[subkey]);
+	}
+	not_reached();
+}
+
+JEMALLOC_INLINE bool
+rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val)
+{
+	uintptr_t subkey;
+	unsigned i, start_level;
+	rtree_node_elm_t *node, *child;
+
+	start_level = rtree_start_level(rtree, key);
+
+	node = rtree_subtree_read(rtree, start_level);
+	if (node == NULL)
+		return (true);
+	for (i = start_level; /**/; i++, node = child) {
+		subkey = rtree_subkey(rtree, key, i);
+		if (i == rtree->height - 1) {
+			/*
+			 * node is a leaf, so it contains values rather than
+			 * child pointers.
+			 */
+			rtree_val_write(rtree, &node[subkey], val);
+			return (false);
+		}
+		assert(i + 1 < rtree->height);
+		child = rtree_child_read(rtree, &node[subkey], i);
+		if (child == NULL)
 			return (true);
 	}
-			memset(child, 0, size);
-			node[subkey] = child;
-		}
-	}
-
-	/* node is a leaf, so it contains values rather than node pointers. */
-	bits = rtree->level2bits[i];
-	subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits);
-	{
-		uint8_t *leaf = (uint8_t *)node;
-		leaf[subkey] = val;
-	}
-	malloc_mutex_unlock(&rtree->mutex);
-
-	return (false);
+	not_reached();
 }
 #endif

--- a/include/jemalloc/internal/size_classes.sh
+++ b/include/jemalloc/internal/size_classes.sh
@ -1,17 +1,26 @@
 #!/bin/sh
+#
+# Usage: size_classes.sh <lg_qarr> <lg_tmin> <lg_parr> <lg_g>

 # The following limits are chosen such that they cover all supported platforms.

-# Range of quanta.
-lg_qmin=3
-lg_qmax=4
+# Pointer sizes.
+lg_zarr="2 3"
+
+# Quanta.
+lg_qarr=$1

 # The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)].
-lg_tmin=3
+lg_tmin=$2

-# Range of page sizes.
-lg_pmin=12
-lg_pmax=16
+# Maximum lookup size.
+lg_kmax=12
+
+# Page sizes.
+lg_parr=`echo $3 | tr ',' ' '`
+
+# Size class group size (number of size classes for each size doubling).
+lg_g=$4

 pow2() {
  e=$1
@ -22,68 +31,219 @@ pow2() {
  done
 }

+lg() {
+  x=$1
+  lg_result=0
+  while [ ${x} -gt 1 ] ; do
+    lg_result=$((${lg_result} + 1))
+    x=$((${x} / 2))
+  done
+}
+
+size_class() {
+  index=$1
+  lg_grp=$2
+  lg_delta=$3
+  ndelta=$4
+  lg_p=$5
+  lg_kmax=$6
+
+  lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta}
+  if [ ${pow2_result} -lt ${ndelta} ] ; then
+    rem="yes"
+  else
+    rem="no"
+  fi
+
+  lg_size=${lg_grp}
+  if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then
+    lg_size=$((${lg_grp} + 1))
+  else
+    lg_size=${lg_grp}
+    rem="yes"
+  fi
+
+  if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then
+    bin="yes"
+  else
+    bin="no"
+  fi
+  if [ ${lg_size} -lt ${lg_kmax} \
+      -o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then
+    lg_delta_lookup=${lg_delta}
+  else
+    lg_delta_lookup="no"
+  fi
+  printf '    SC(%3d, %6d, %8d, %6d, %3s, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${bin} ${lg_delta_lookup}
+  # Defined upon return:
+  # - lg_delta_lookup (${lg_delta} or "no")
+  # - bin ("yes" or "no")
+}
+
+sep_line() {
+  echo "                                               \\"
+}
+
+size_classes() {
+  lg_z=$1
+  lg_q=$2
+  lg_t=$3
+  lg_p=$4
+  lg_g=$5
+
+  pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result}
+  pow2 ${lg_g}; g=${pow2_result}
+
+  echo "#define	SIZE_CLASSES \\"
+  echo "  /* index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup */ \\"
+
+  ntbins=0
+  nlbins=0
+  lg_tiny_maxclass='"NA"'
+  nbins=0
+
+  # Tiny size classes.
+  ndelta=0
+  index=0
+  lg_grp=${lg_t}
+  lg_delta=${lg_grp}
+  while [ ${lg_grp} -lt ${lg_q} ] ; do
+    size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
+    if [ ${lg_delta_lookup} != "no" ] ; then
+      nlbins=$((${index} + 1))
+    fi
+    if [ ${bin} != "no" ] ; then
+      nbins=$((${index} + 1))
+    fi
+    ntbins=$((${ntbins} + 1))
+    lg_tiny_maxclass=${lg_grp} # Final written value is correct.
+    index=$((${index} + 1))
+    lg_delta=${lg_grp}
+    lg_grp=$((${lg_grp} + 1))
+  done
+
+  # First non-tiny group.
+  if [ ${ntbins} -gt 0 ] ; then
+    sep_line
+    # The first size class has an unusual encoding, because the size has to be
+    # split between grp and delta*ndelta.
+    lg_grp=$((${lg_grp} - 1))
+    ndelta=1
+    size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
+    index=$((${index} + 1))
+    lg_grp=$((${lg_grp} + 1))
+    lg_delta=$((${lg_delta} + 1))
+  fi
+  while [ ${ndelta} -lt ${g} ] ; do
+    size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
+    index=$((${index} + 1))
+    ndelta=$((${ndelta} + 1))
+  done
+
+  # All remaining groups.
+  lg_grp=$((${lg_grp} + ${lg_g}))
+  while [ ${lg_grp} -lt ${ptr_bits} ] ; do
+    sep_line
+    ndelta=1
+    if [ ${lg_grp} -eq $((${ptr_bits} - 1)) ] ; then
+      ndelta_limit=$((${g} - 1))
+    else
+      ndelta_limit=${g}
+    fi
+    while [ ${ndelta} -le ${ndelta_limit} ] ; do
+      size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
+      if [ ${lg_delta_lookup} != "no" ] ; then
+        nlbins=$((${index} + 1))
+        # Final written value is correct:
+        lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
+      fi
+      if [ ${bin} != "no" ] ; then
+        nbins=$((${index} + 1))
+        # Final written value is correct:
+        small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
+        if [ ${lg_g} -gt 0 ] ; then
+          lg_large_minclass=$((${lg_grp} + 1))
+        else
+          lg_large_minclass=$((${lg_grp} + 2))
+        fi
+      fi
+      index=$((${index} + 1))
+      ndelta=$((${ndelta} + 1))
+    done
+    lg_grp=$((${lg_grp} + 1))
+    lg_delta=$((${lg_delta} + 1))
+  done
+  echo
+  nsizes=${index}
+
+  # Defined upon completion:
+  # - ntbins
+  # - nlbins
+  # - nbins
+  # - nsizes
+  # - lg_tiny_maxclass
+  # - lookup_maxclass
+  # - small_maxclass
+  # - lg_large_minclass
+}
+
 cat <<EOF
 /* This file was automatically generated by size_classes.sh. */
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES

+/*
+ * This header requires LG_SIZEOF_PTR, LG_TINY_MIN, LG_QUANTUM, and LG_PAGE to
+ * be defined prior to inclusion, and it in turn defines:
+ *
+ *   LG_SIZE_CLASS_GROUP: Lg of size class count for each size doubling.
+ *   SIZE_CLASSES: Complete table of
+ *                 SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup)
+ *                 tuples.
+ *     index: Size class index.
+ *     lg_grp: Lg group base size (no deltas added).
+ *     lg_delta: Lg delta to previous size class.
+ *     ndelta: Delta multiplier.  size == 1<<lg_grp + ndelta<<lg_delta
+ *     bin: 'yes' if a small bin size class, 'no' otherwise.
+ *     lg_delta_lookup: Same as lg_delta if a lookup table size class, 'no'
+ *                      otherwise.
+ *   NTBINS: Number of tiny bins.
+ *   NLBINS: Number of bins supported by the lookup table.
+ *   NBINS: Number of small size class bins.
+ *   NSIZES: Number of size classes.
+ *   LG_TINY_MAXCLASS: Lg of maximum tiny size class.
+ *   LOOKUP_MAXCLASS: Maximum size class included in lookup table.
+ *   SMALL_MAXCLASS: Maximum small size class.
+ *   LG_LARGE_MINCLASS: Lg of minimum large size class.
+ */
+
+#define	LG_SIZE_CLASS_GROUP	${lg_g}
+
 EOF

-lg_q=${lg_qmin}
-while [ ${lg_q} -le ${lg_qmax} ] ; do
+for lg_z in ${lg_zarr} ; do
+  for lg_q in ${lg_qarr} ; do
    lg_t=${lg_tmin}
    while [ ${lg_t} -le ${lg_q} ] ; do
-    lg_p=${lg_pmin}
-    while [ ${lg_p} -le ${lg_pmax} ] ; do
-      echo "#if (LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})"
+      # Iterate through page sizes and compute how many bins there are.
+      for lg_p in ${lg_parr} ; do
+        echo "#if (LG_SIZEOF_PTR == ${lg_z} && LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})"
+        size_classes ${lg_z} ${lg_q} ${lg_t} ${lg_p} ${lg_g}
        echo "#define	SIZE_CLASSES_DEFINED"
-      pow2 ${lg_q}; q=${pow2_result}
-      pow2 ${lg_t}; t=${pow2_result}
-      pow2 ${lg_p}; p=${pow2_result}
-      bin=0
-      psz=0
-      sz=${t}
-      delta=$((${sz} - ${psz}))
-      echo "/*  SIZE_CLASS(bin,	delta,	sz) */"
-      echo "#define	SIZE_CLASSES							\\"
-
-      # Tiny size classes.
-      while [ ${sz} -lt ${q} ] ; do
-        echo "    SIZE_CLASS(${bin},	${delta},	${sz})					\\"
-        bin=$((${bin} + 1))
-        psz=${sz}
-        sz=$((${sz} + ${sz}))
-        delta=$((${sz} - ${psz}))
-      done
-      # Quantum-multiple size classes.  For each doubling of sz, as many as 4
-      # size classes exist.  Their spacing is the greater of:
-      # - q
-      # - sz/4, where sz is a power of 2
-      while [ ${sz} -lt ${p} ] ; do
-        if [ ${sz} -ge $((${q} * 4)) ] ; then
-          i=$((${sz} / 4))
-        else
-          i=${q}
-        fi
-        next_2pow=$((${sz} * 2))
-        while [ ${sz} -lt $next_2pow ] ; do
-          echo "    SIZE_CLASS(${bin},	${delta},	${sz})					\\"
-          bin=$((${bin} + 1))
-          psz=${sz}
-          sz=$((${sz} + ${i}))
-          delta=$((${sz} - ${psz}))
-        done
-      done
-      echo
-      echo "#define	NBINS		${bin}"
-      echo "#define	SMALL_MAXCLASS	${psz}"
+        echo "#define	NTBINS			${ntbins}"
+        echo "#define	NLBINS			${nlbins}"
+        echo "#define	NBINS			${nbins}"
+        echo "#define	NSIZES			${nsizes}"
+        echo "#define	LG_TINY_MAXCLASS	${lg_tiny_maxclass}"
+        echo "#define	LOOKUP_MAXCLASS		${lookup_maxclass}"
+        echo "#define	SMALL_MAXCLASS		${small_maxclass}"
+        echo "#define	LG_LARGE_MINCLASS	${lg_large_minclass}"
        echo "#endif"
        echo
-      lg_p=$((${lg_p} + 1))
      done
      lg_t=$((${lg_t} + 1))
    done
-  lg_q=$((${lg_q} + 1))
+  done
 done

 cat <<EOF
@ -92,11 +252,10 @@ cat <<EOF
 #endif
 #undef SIZE_CLASSES_DEFINED
 /*
- * The small_size2bin lookup table uses uint8_t to encode each bin index, so we
+ * The size2index_tab lookup table uses uint8_t to encode each bin index, so we
 * cannot support more than 256 small size classes.  Further constrain NBINS to
- * 255 to support prof_promote, since all small size classes, plus a "not
- * small" size class must be stored in 8 bits of arena_chunk_map_t's bits
- * field.
+ * 255 since all small size classes, plus a "not small" size class must be
+ * stored in 8 bits of arena_chunk_map_bits_t's bits field.
 */
 #if (NBINS > 255)
 #  error "Too many small size classes"
--- a/include/jemalloc/internal/stats.h
+++ b/include/jemalloc/internal/stats.h
@ -4,6 +4,7 @@
 typedef struct tcache_bin_stats_s tcache_bin_stats_t;
 typedef struct malloc_bin_stats_s malloc_bin_stats_t;
 typedef struct malloc_large_stats_s malloc_large_stats_t;
+typedef struct malloc_huge_stats_s malloc_huge_stats_t;
 typedef struct arena_stats_s arena_stats_t;
 typedef struct chunk_stats_s chunk_stats_t;

@ -20,12 +21,6 @@ struct tcache_bin_stats_s {
 };

 struct malloc_bin_stats_s {
-	/*
-	 * Current number of bytes allocated, including objects currently
-	 * cached by tcache.
-	 */
-	size_t		allocated;
-
 	/*
 	 * Total number of allocation/deallocation requests served directly by
 	 * the bin.  Note that tcache may allocate an object, then recycle it
@ -42,6 +37,12 @@ struct malloc_bin_stats_s {
 	 */
 	uint64_t	nrequests;

+	/*
+	 * Current number of regions of this size class, including regions
+	 * currently cached by tcache.
+	 */
+	size_t		curregs;
+
 	/* Number of tcache fills from this bin. */
 	uint64_t	nfills;

@ -78,10 +79,25 @@ struct malloc_large_stats_s {
 	 */
 	uint64_t	nrequests;

-	/* Current number of runs of this size class. */
+	/*
+	 * Current number of runs of this size class, including runs currently
+	 * cached by tcache.
+	 */
 	size_t		curruns;
 };

+struct malloc_huge_stats_s {
+	/*
+	 * Total number of allocation/deallocation requests served directly by
+	 * the arena.  nmalloc and ndalloc are kept as separate 64-bit
+	 * counters; unlike malloc_large_stats_s there is no nrequests field.
+	 */
+	uint64_t	nmalloc;
+	uint64_t	ndalloc;
+
+	/*
+	 * Current number of (multi-)chunk allocations of this size class.
+	 * arena_stats_s reaches these records through its hstats member,
+	 * which is documented as one element for each huge size class.
+	 */
+	size_t		curhchunks;
+};
+
 struct arena_stats_s {
 	/* Number of bytes currently mapped. */
 	size_t		mapped;
@ -95,34 +111,28 @@ struct arena_stats_s {
 	uint64_t	nmadvise;
 	uint64_t	purged;

+	/*
+	 * Number of bytes currently mapped purely for metadata purposes, and
+	 * number of bytes currently allocated for internal metadata.
+	 */
+	size_t		metadata_mapped;
+	size_t		metadata_allocated; /* Protected via atomic_*_z(). */
+
 	/* Per-size-category statistics. */
 	size_t		allocated_large;
 	uint64_t	nmalloc_large;
 	uint64_t	ndalloc_large;
 	uint64_t	nrequests_large;

-	/*
-	 * One element for each possible size class, including sizes that
-	 * overlap with bin size classes.  This is necessary because ipalloc()
-	 * sometimes has to use such large objects in order to assure proper
-	 * alignment.
-	 */
+	size_t		allocated_huge;
+	uint64_t	nmalloc_huge;
+	uint64_t	ndalloc_huge;
+
+	/* One element for each large size class. */
 	malloc_large_stats_t	*lstats;
-};

-struct chunk_stats_s {
-	/* Number of chunks that were allocated. */
-	uint64_t	nchunks;
-
-	/* High-water mark for number of chunks allocated. */
-	size_t		highchunks;
-
-	/*
-	 * Current number of chunks allocated.  This value isn't maintained for
-	 * any other purpose, so keep track of it in order to be able to set
-	 * highchunks.
-	 */
-	size_t		curchunks;
+	/* One element for each huge size class. */
+	malloc_huge_stats_t	*hstats;
 };

 #endif /* JEMALLOC_H_STRUCTS */
--- a/include/jemalloc/internal/tcache.h
+++ b/include/jemalloc/internal/tcache.h
@ -4,6 +4,7 @@
 typedef struct tcache_bin_info_s tcache_bin_info_t;
 typedef struct tcache_bin_s tcache_bin_t;
 typedef struct tcache_s tcache_t;
+typedef struct tcaches_s tcaches_t;

 /*
 * tcache pointers close to NULL are used to encode state information that is
@ -15,6 +16,11 @@ typedef struct tcache_s tcache_t;
 #define	TCACHE_STATE_PURGATORY		((tcache_t *)(uintptr_t)3)
 #define	TCACHE_STATE_MAX		TCACHE_STATE_PURGATORY

+/*
+ * Absolute minimum number of cache slots for each small bin.
+ */
+#define	TCACHE_NSLOTS_SMALL_MIN		20
+
 /*
 * Absolute maximum number of cache slots for each small bin in the thread
 * cache.  This is an additional constraint beyond that imposed as: twice the
@ -69,10 +75,9 @@ struct tcache_bin_s {

 struct tcache_s {
 	ql_elm(tcache_t) link;		/* Used for aggregating stats. */
-	uint64_t	prof_accumbytes;/* Cleared after arena_prof_accum() */
-	arena_t		*arena;		/* This thread's arena. */
+	uint64_t	prof_accumbytes;/* Cleared after arena_prof_accum(). */
 	unsigned	ev_cnt;		/* Event count since incremental GC. */
-	unsigned	next_gc_bin;	/* Next bin to GC. */
+	index_t		next_gc_bin;	/* Next bin to GC. */
 	tcache_bin_t	tbins[1];	/* Dynamically sized. */
 	/*
 	 * The pointer stacks associated with tbins follow as a contiguous
@ -82,6 +87,14 @@ struct tcache_s {
 	 */
 };

+/*
+ * Linkage for list of available (previously used) explicit tcache IDs.  The
+ * global tcaches pointer refers to elements of this type.  The two members
+ * share storage through an anonymous union: tcache is the member that
+ * tcaches_get() reads and fills in when it finds NULL; next is presumably the
+ * link used while a slot sits on the list of available IDs -- confirm against
+ * the implementation in tcache.c.
+ */
+struct tcaches_s {
+	union {
+		tcache_t	*tcache;
+		tcaches_t	*next;
+	};
+};
+
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
@ -100,79 +113,85 @@ extern size_t			nhbins;
 /* Maximum cached size class. */
 extern size_t	tcache_maxclass;

+/*
+ * Explicit tcaches, managed via the tcache.{create,flush,destroy} mallctls and
+ * usable via the MALLOCX_TCACHE() flag.  The automatic per thread tcaches are
+ * completely disjoint from this data structure.  tcaches starts off as a sparse
+ * array, so it has no physical memory footprint until individual pages are
+ * touched.  This allows the entire array to be allocated the first time an
+ * explicit tcache is created without a disproportionate impact on memory usage.
+ */
+extern tcaches_t	*tcaches;
+
 size_t	tcache_salloc(const void *ptr);
-void	tcache_event_hard(tcache_t *tcache);
-void	*tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin,
-    size_t binind);
-void	tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
-    tcache_t *tcache);
-void	tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
-    tcache_t *tcache);
+void	tcache_event_hard(tsd_t *tsd, tcache_t *tcache);
+void	*tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
+    tcache_bin_t *tbin, index_t binind);
+void	tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
+    index_t binind, unsigned rem);
+void	tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind,
+    unsigned rem, tcache_t *tcache);
 void	tcache_arena_associate(tcache_t *tcache, arena_t *arena);
-void	tcache_arena_dissociate(tcache_t *tcache);
-tcache_t *tcache_create(arena_t *arena);
-void	tcache_destroy(tcache_t *tcache);
-void	tcache_thread_cleanup(void *arg);
+void	tcache_arena_reassociate(tcache_t *tcache, arena_t *oldarena,
+    arena_t *newarena);
+void	tcache_arena_dissociate(tcache_t *tcache, arena_t *arena);
+tcache_t *tcache_get_hard(tsd_t *tsd);
+tcache_t *tcache_create(tsd_t *tsd, arena_t *arena);
+void	tcache_cleanup(tsd_t *tsd);
+void	tcache_enabled_cleanup(tsd_t *tsd);
 void	tcache_stats_merge(tcache_t *tcache, arena_t *arena);
-bool	tcache_boot0(void);
-bool	tcache_boot1(void);
+bool	tcaches_create(tsd_t *tsd, unsigned *r_ind);
+void	tcaches_flush(tsd_t *tsd, unsigned ind);
+void	tcaches_destroy(tsd_t *tsd, unsigned ind);
+bool	tcache_boot(void);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

 #ifndef JEMALLOC_ENABLE_INLINE
-malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache, tcache_t *)
-malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache_enabled, tcache_enabled_t)
-
-void	tcache_event(tcache_t *tcache);
+void	tcache_event(tsd_t *tsd, tcache_t *tcache);
 void	tcache_flush(void);
 bool	tcache_enabled_get(void);
-tcache_t *tcache_get(bool create);
+tcache_t *tcache_get(tsd_t *tsd, bool create);
 void	tcache_enabled_set(bool enabled);
 void	*tcache_alloc_easy(tcache_bin_t *tbin);
-void	*tcache_alloc_small(tcache_t *tcache, size_t size, bool zero);
-void	*tcache_alloc_large(tcache_t *tcache, size_t size, bool zero);
-void	tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind);
-void	tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size);
+void	*tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
+    size_t size, bool zero);
+void	*tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
+    size_t size, bool zero);
+void	tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr,
+    index_t binind);
+void	tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr,
+    size_t size);
+tcache_t	*tcaches_get(tsd_t *tsd, unsigned ind);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_))
-/* Map of thread-specific caches. */
-malloc_tsd_externs(tcache, tcache_t *)
-malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache, tcache_t *, NULL,
-    tcache_thread_cleanup)
-/* Per thread flag that allows thread caches to be disabled. */
-malloc_tsd_externs(tcache_enabled, tcache_enabled_t)
-malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache_enabled, tcache_enabled_t,
-    tcache_enabled_default, malloc_tsd_no_cleanup)
-
 JEMALLOC_INLINE void
 tcache_flush(void)
 {
-	tcache_t *tcache;
+	tsd_t *tsd;	/* Handle obtained from tsd_fetch() below. */

 	cassert(config_tcache);

-	tcache = *tcache_tsd_get();
-	if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX)
-		return;
-	tcache_destroy(tcache);
-	tcache = NULL;
-	tcache_tsd_set(&tcache);
+	tsd = tsd_fetch();
+	tcache_cleanup(tsd);	/* Replaces the open-coded destroy + reset. */
 }

 JEMALLOC_INLINE bool
 tcache_enabled_get(void)
 {
+	tsd_t *tsd;
 	tcache_enabled_t tcache_enabled;

 	cassert(config_tcache);

-	tcache_enabled = *tcache_enabled_tsd_get();
+	tsd = tsd_fetch();
+	tcache_enabled = tsd_tcache_enabled_get(tsd);
 	if (tcache_enabled == tcache_enabled_default) {
 		tcache_enabled = (tcache_enabled_t)opt_tcache;
-		tcache_enabled_tsd_set(&tcache_enabled);
+		tsd_tcache_enabled_set(tsd, tcache_enabled);
 	}

 	return ((bool)tcache_enabled);
@ -181,85 +200,41 @@ tcache_enabled_get(void)
 JEMALLOC_INLINE void
 tcache_enabled_set(bool enabled)
 {
+	tsd_t *tsd;	/* Handle obtained from tsd_fetch() below. */
 	tcache_enabled_t tcache_enabled;
-	tcache_t *tcache;

 	cassert(config_tcache);

+	tsd = tsd_fetch();
+
 	tcache_enabled = (tcache_enabled_t)enabled;
-	tcache_enabled_tsd_set(&tcache_enabled);
-	tcache = *tcache_tsd_get();
-	if (enabled) {
-		if (tcache == TCACHE_STATE_DISABLED) {
-			tcache = NULL;
-			tcache_tsd_set(&tcache);
-		}
-	} else /* disabled */ {
-		if (tcache > TCACHE_STATE_MAX) {
-			tcache_destroy(tcache);
-			tcache = NULL;
-		}
-		if (tcache == NULL) {
-			tcache = TCACHE_STATE_DISABLED;
-			tcache_tsd_set(&tcache);
-		}
-	}
+	tsd_tcache_enabled_set(tsd, tcache_enabled);
+
+	if (!enabled)	/* Turning the cache off also runs the cleanup. */
+		tcache_cleanup(tsd);
 }

 JEMALLOC_ALWAYS_INLINE tcache_t *
-tcache_get(bool create)
+tcache_get(tsd_t *tsd, bool create)
 {
 	tcache_t *tcache;

-	if (config_tcache == false)
-		return (NULL);
-	if (config_lazy_lock && isthreaded == false)
+	if (!config_tcache)
 		return (NULL);

-	tcache = *tcache_tsd_get();
-	if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) {
-		if (tcache == TCACHE_STATE_DISABLED)
-			return (NULL);
-		if (tcache == NULL) {
-			if (create == false) {
-				/*
-				 * Creating a tcache here would cause
-				 * allocation as a side effect of free().
-				 * Ordinarily that would be okay since
-				 * tcache_create() failure is a soft failure
-				 * that doesn't propagate.  However, if TLS
-				 * data are freed via free() as in glibc,
-				 * subtle corruption could result from setting
-				 * a TLS variable after its backing memory is
-				 * freed.
-				 */
-				return (NULL);
-			}
-			if (tcache_enabled_get() == false) {
-				tcache_enabled_set(false); /* Memoize. */
-				return (NULL);
-			}
-			return (tcache_create(choose_arena(NULL)));
-		}
-		if (tcache == TCACHE_STATE_PURGATORY) {
-			/*
-			 * Make a note that an allocator function was called
-			 * after tcache_thread_cleanup() was called.
-			 */
-			tcache = TCACHE_STATE_REINCARNATED;
-			tcache_tsd_set(&tcache);
-			return (NULL);
-		}
-		if (tcache == TCACHE_STATE_REINCARNATED)
-			return (NULL);
-		not_reached();
+	tcache = tsd_tcache_get(tsd);
+	if (!create)	/* Lookup only: hand back the stored value, maybe NULL. */
+		return (tcache);
+	if (unlikely(tcache == NULL) && tsd_nominal(tsd)) {
+		tcache = tcache_get_hard(tsd);	/* Out-of-line slow path. */
+		tsd_tcache_set(tsd, tcache);	/* Store the result in tsd. */
 	}

 	return (tcache);
 }

 JEMALLOC_ALWAYS_INLINE void
-tcache_event(tcache_t *tcache)
+tcache_event(tsd_t *tsd, tcache_t *tcache)
 {

 	if (TCACHE_GC_INCR == 0)
@ -267,8 +242,8 @@ tcache_event(tcache_t *tcache)

 	tcache->ev_cnt++;
 	assert(tcache->ev_cnt <= TCACHE_GC_INCR);
-	if (tcache->ev_cnt == TCACHE_GC_INCR)
-		tcache_event_hard(tcache);
+	if (unlikely(tcache->ev_cnt == TCACHE_GC_INCR))
+		tcache_event_hard(tsd, tcache);
 }

 JEMALLOC_ALWAYS_INLINE void *
@ -276,85 +251,87 @@ tcache_alloc_easy(tcache_bin_t *tbin)
 {
 	void *ret;

-	if (tbin->ncached == 0) {
+	if (unlikely(tbin->ncached == 0)) {
 		tbin->low_water = -1;
 		return (NULL);
 	}
 	tbin->ncached--;
-	if ((int)tbin->ncached < tbin->low_water)
+	if (unlikely((int)tbin->ncached < tbin->low_water))
 		tbin->low_water = tbin->ncached;
 	ret = tbin->avail[tbin->ncached];
 	return (ret);
 }

 JEMALLOC_ALWAYS_INLINE void *
-tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
+tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
+    bool zero)
 {
 	void *ret;
-	size_t binind;
+	index_t binind;
+	size_t usize;	/* Replaces the old in-place overwrite of size. */
 	tcache_bin_t *tbin;

-	binind = SMALL_SIZE2BIN(size);
+	binind = size2index(size);
 	assert(binind < NBINS);
 	tbin = &tcache->tbins[binind];
-	size = arena_bin_info[binind].reg_size;
+	usize = index2size(binind);
 	ret = tcache_alloc_easy(tbin);
-	if (ret == NULL) {
-		ret = tcache_alloc_small_hard(tcache, tbin, binind);
+	if (unlikely(ret == NULL)) {	/* Bin's cache was empty. */
+		ret = tcache_alloc_small_hard(tsd, arena, tcache, tbin, binind);
 		if (ret == NULL)
 			return (NULL);
 	}
-	assert(tcache_salloc(ret) == arena_bin_info[binind].reg_size);
+	assert(tcache_salloc(ret) == usize);

-	if (zero == false) {
+	if (likely(!zero)) {
 		if (config_fill) {
-			if (opt_junk) {
+			if (unlikely(opt_junk_alloc)) {
 				arena_alloc_junk_small(ret,
 				    &arena_bin_info[binind], false);
-			} else if (opt_zero)
-				memset(ret, 0, size);
+			} else if (unlikely(opt_zero))
+				memset(ret, 0, usize);
 		}
-		VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
 	} else {
-		if (config_fill && opt_junk) {
+		if (config_fill && unlikely(opt_junk_alloc)) {
 			arena_alloc_junk_small(ret, &arena_bin_info[binind],
 			    true);
 		}
-		VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
-		memset(ret, 0, size);
+		memset(ret, 0, usize);	/* zero was requested by caller. */
 	}

 	if (config_stats)
 		tbin->tstats.nrequests++;
 	if (config_prof)
-		tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
-	tcache_event(tcache);
+		tcache->prof_accumbytes += usize;
+	tcache_event(tsd, tcache);
 	return (ret);
 }

 JEMALLOC_ALWAYS_INLINE void *
-tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
+tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
+    bool zero)
 {
 	void *ret;
-	size_t binind;
+	index_t binind;
+	size_t usize;
 	tcache_bin_t *tbin;

-	size = PAGE_CEILING(size);
-	assert(size <= tcache_maxclass);
-	binind = NBINS + (size >> LG_PAGE) - 1;
+	binind = size2index(size);
+	usize = index2size(binind);
+	assert(usize <= tcache_maxclass);
 	assert(binind < nhbins);
 	tbin = &tcache->tbins[binind];
 	ret = tcache_alloc_easy(tbin);
-	if (ret == NULL) {
+	if (unlikely(ret == NULL)) {
 		/*
 		 * Only allocate one large object at a time, because it's quite
 		 * expensive to create one and not use it.
 		 */
-		ret = arena_malloc_large(tcache->arena, size, zero);
+		ret = arena_malloc_large(arena, usize, zero);
 		if (ret == NULL)
 			return (NULL);
 	} else {
-		if (config_prof && prof_promote && size == PAGE) {
+		if (config_prof && usize == LARGE_MINCLASS) {
 			arena_chunk_t *chunk =
 			    (arena_chunk_t *)CHUNK_ADDR2BASE(ret);
 			size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >>
@ -362,57 +339,54 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
 			arena_mapbits_large_binind_set(chunk, pageind,
 			    BININD_INVALID);
 		}
-		if (zero == false) {
+		if (likely(!zero)) {
 			if (config_fill) {
-				if (opt_junk)
-					memset(ret, 0xa5, size);
-				else if (opt_zero)
-					memset(ret, 0, size);
-			}
-			VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
-		} else {
-			VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
-			memset(ret, 0, size);
+				if (unlikely(opt_junk_alloc))
+					memset(ret, 0xa5, usize);
+				else if (unlikely(opt_zero))
+					memset(ret, 0, usize);
 			}
+		} else
+			memset(ret, 0, usize);

 		if (config_stats)
 			tbin->tstats.nrequests++;
 		if (config_prof)
-			tcache->prof_accumbytes += size;
+			tcache->prof_accumbytes += usize;
 	}

-	tcache_event(tcache);
+	tcache_event(tsd, tcache);
 	return (ret);
 }

 JEMALLOC_ALWAYS_INLINE void
-tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind)
+tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, index_t binind)
 {
 	tcache_bin_t *tbin;
 	tcache_bin_info_t *tbin_info;

 	assert(tcache_salloc(ptr) <= SMALL_MAXCLASS);

-	if (config_fill && opt_junk)
+	if (config_fill && unlikely(opt_junk_free))	/* Was opt_junk. */
 		arena_dalloc_junk_small(ptr, &arena_bin_info[binind]);

 	tbin = &tcache->tbins[binind];
 	tbin_info = &tcache_bin_info[binind];
-	if (tbin->ncached == tbin_info->ncached_max) {
-		tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >>
-		    1), tcache);
+	if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
+		tcache_bin_flush_small(tsd, tcache, tbin, binind,
+		    (tbin_info->ncached_max >> 1));	/* rem: half of max. */
 	}
 	assert(tbin->ncached < tbin_info->ncached_max);
 	tbin->avail[tbin->ncached] = ptr;
 	tbin->ncached++;

-	tcache_event(tcache);
+	tcache_event(tsd, tcache);
 }

 JEMALLOC_ALWAYS_INLINE void
-tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
+tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t size)
 {
-	size_t binind;
+	index_t binind;
 	tcache_bin_t *tbin;
 	tcache_bin_info_t *tbin_info;

@ -420,22 +394,31 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
 	assert(tcache_salloc(ptr) > SMALL_MAXCLASS);
 	assert(tcache_salloc(ptr) <= tcache_maxclass);

-	binind = NBINS + (size >> LG_PAGE) - 1;
+	binind = size2index(size);

-	if (config_fill && opt_junk)
-		memset(ptr, 0x5a, size);
+	if (config_fill && unlikely(opt_junk_free))
+		arena_dalloc_junk_large(ptr, size);

 	tbin = &tcache->tbins[binind];
 	tbin_info = &tcache_bin_info[binind];
-	if (tbin->ncached == tbin_info->ncached_max) {
-		tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >>
-		    1), tcache);
+	if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
+		tcache_bin_flush_large(tsd, tbin, binind,
+		    (tbin_info->ncached_max >> 1), tcache);
 	}
 	assert(tbin->ncached < tbin_info->ncached_max);
 	tbin->avail[tbin->ncached] = ptr;
 	tbin->ncached++;

-	tcache_event(tcache);
+	tcache_event(tsd, tcache);
+}
+
+JEMALLOC_ALWAYS_INLINE tcache_t *
+tcaches_get(tsd_t *tsd, unsigned ind)
+{
+	tcaches_t *elm = &tcaches[ind];	/* No bounds check on ind here. */
+	if (unlikely(elm->tcache == NULL))	/* Lazily created. */
+		elm->tcache = tcache_create(tsd, arena_choose(tsd, NULL));
+	return (elm->tcache);	/* May be NULL if creation failed -- confirm. */
 }
 #endif

--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@ -2,7 +2,7 @@
 #ifdef JEMALLOC_H_TYPES

 /* Maximum number of malloc_tsd users with cleanup functions. */
-#define	MALLOC_TSD_CLEANUPS_MAX	8
+#define	MALLOC_TSD_CLEANUPS_MAX	2

 typedef bool (*malloc_tsd_cleanup_t)(void);

@ -12,9 +12,18 @@ typedef struct tsd_init_block_s tsd_init_block_t;
 typedef struct tsd_init_head_s tsd_init_head_t;
 #endif

+typedef struct tsd_s tsd_t;
+
+typedef enum {
+	tsd_state_uninitialized,	/* 0: what zero-filled storage holds. */
+	tsd_state_nominal,	/* Presumably what tsd_nominal() tests. */
+	tsd_state_purgatory,	/* Name mirrors TCACHE_STATE_PURGATORY. */
+	tsd_state_reincarnated	/* Name mirrors TCACHE_STATE_REINCARNATED. */
+} tsd_state_t;
+
 /*
 * TLS/TSD-agnostic macro-based implementation of thread-specific data.  There
- * are four macros that support (at least) three use cases: file-private,
+ * are five macros that support (at least) three use cases: file-private,
 * library-private, and library-private inlined.  Following is an example
 * library-private tsd variable:
 *
@ -24,34 +33,36 @@ typedef struct tsd_init_head_s tsd_init_head_t;
 *           int y;
 *   } example_t;
 *   #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0})
- *   malloc_tsd_protos(, example, example_t *)
- *   malloc_tsd_externs(example, example_t *)
+ *   malloc_tsd_types(example_, example_t)
+ *   malloc_tsd_protos(, example_, example_t)
+ *   malloc_tsd_externs(example_, example_t)
 * In example.c:
- *   malloc_tsd_data(, example, example_t *, EX_INITIALIZER)
- *   malloc_tsd_funcs(, example, example_t *, EX_INITIALIZER,
+ *   malloc_tsd_data(, example_, example_t, EX_INITIALIZER)
+ *   malloc_tsd_funcs(, example_, example_t, EX_INITIALIZER,
 *       example_tsd_cleanup)
 *
 * The result is a set of generated functions, e.g.:
 *
 *   bool example_tsd_boot(void) {...}
- *   example_t **example_tsd_get() {...}
- *   void example_tsd_set(example_t **val) {...}
+ *   example_t *example_tsd_get() {...}
+ *   void example_tsd_set(example_t *val) {...}
 *
 * Note that all of the functions deal in terms of (a_type *) rather than
 * (a_type) so that it is possible to support non-pointer types (unlike
 * pthreads TSD).  example_tsd_cleanup() is passed an (a_type *) pointer that is
- * cast to (void *).  This means that the cleanup function needs to cast *and*
- * dereference the function argument, e.g.:
+ * cast to (void *).  This means that the cleanup function needs to cast the
+ * function argument to (a_type *), then dereference the resulting pointer to
+ * access fields, e.g.:
 *
 *   void
 *   example_tsd_cleanup(void *arg)
 *   {
- *           example_t *example = *(example_t **)arg;
+ *           example_t *example = (example_t *)arg;
 *
+ *           example->x = 42;
 *           [...]
- *           if ([want the cleanup function to be called again]) {
- *                   example_tsd_set(&example);
- *           }
+ *           if ([want the cleanup function to be called again])
+ *                   example_tsd_set(example);
 *   }
 *
 * If example_tsd_set() is called within example_tsd_cleanup(), it will be
@ -60,63 +71,96 @@ typedef struct tsd_init_head_s tsd_init_head_t;
 * non-NULL.
 */

+/*
+ * malloc_tsd_types().  Expands to nothing when JEMALLOC_MALLOC_THREAD_CLEANUP
+ * or JEMALLOC_TLS is defined.  Otherwise it defines a_name##tsd_wrapper_t,
+ * which pairs the a_type value with an initialized flag; the _WIN32 and
+ * fallback expansions are textually identical.
+ */
+#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
+#define	malloc_tsd_types(a_name, a_type)
+#elif (defined(JEMALLOC_TLS))
+#define	malloc_tsd_types(a_name, a_type)
+#elif (defined(_WIN32))
+#define	malloc_tsd_types(a_name, a_type)				\
+typedef struct {							\
+	bool	initialized;						\
+	a_type	val;							\
+} a_name##tsd_wrapper_t;
+#else
+#define	malloc_tsd_types(a_name, a_type)				\
+typedef struct {							\
+	bool	initialized;						\
+	a_type	val;							\
+} a_name##tsd_wrapper_t;
+#endif
+
 /* malloc_tsd_protos(). */
 #define	malloc_tsd_protos(a_attr, a_name, a_type)			\
 a_attr bool								\
-a_name##_tsd_boot(void);						\
-a_attr a_type *								\
-a_name##_tsd_get(void);							\
+a_name##tsd_boot0(void);						\
 a_attr void								\
-a_name##_tsd_set(a_type *val);
+a_name##tsd_boot1(void);						\
+a_attr bool								\
+a_name##tsd_boot(void);							\
+a_attr a_type *								\
+a_name##tsd_get(void);							\
+a_attr void								\
+a_name##tsd_set(a_type *val);	/* a_name now carries its own "_". */

 /* malloc_tsd_externs(). */
 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
 #define	malloc_tsd_externs(a_name, a_type)				\
-extern __thread a_type	a_name##_tls;					\
-extern __thread bool	a_name##_initialized;				\
-extern bool		a_name##_booted;
+extern __thread a_type	a_name##tsd_tls;				\
+extern __thread bool	a_name##tsd_initialized;			\
+extern bool		a_name##tsd_booted;	/* Asserted by get/set. */
 #elif (defined(JEMALLOC_TLS))
 #define	malloc_tsd_externs(a_name, a_type)				\
-extern __thread a_type	a_name##_tls;					\
-extern pthread_key_t	a_name##_tsd;					\
-extern bool		a_name##_booted;
+extern __thread a_type	a_name##tsd_tls;				\
+extern pthread_key_t	a_name##tsd_tsd;				\
+extern bool		a_name##tsd_booted;	/* Asserted by get/set. */
 #elif (defined(_WIN32))
 #define	malloc_tsd_externs(a_name, a_type)				\
-extern DWORD		a_name##_tsd;					\
-extern bool		a_name##_booted;
+extern DWORD		a_name##tsd_tsd;				\
+extern a_name##tsd_wrapper_t	a_name##tsd_boot_wrapper;		\
+extern bool		a_name##tsd_booted;
 #else
 #define	malloc_tsd_externs(a_name, a_type)				\
-extern pthread_key_t	a_name##_tsd;					\
-extern tsd_init_head_t	a_name##_tsd_init_head;				\
-extern bool		a_name##_booted;
+extern pthread_key_t	a_name##tsd_tsd;				\
+extern tsd_init_head_t	a_name##tsd_init_head;				\
+extern a_name##tsd_wrapper_t	a_name##tsd_boot_wrapper;		\
+extern bool		a_name##tsd_booted;
 #endif

 /* malloc_tsd_data(). */
 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
 #define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
 a_attr __thread a_type JEMALLOC_TLS_MODEL				\
-    a_name##_tls = a_initializer;					\
+    a_name##tsd_tls = a_initializer;					\
 a_attr __thread bool JEMALLOC_TLS_MODEL					\
-    a_name##_initialized = false;					\
-a_attr bool		a_name##_booted = false;
+    a_name##tsd_initialized = false;					\
+a_attr bool		a_name##tsd_booted = false;
 #elif (defined(JEMALLOC_TLS))
 #define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
 a_attr __thread a_type JEMALLOC_TLS_MODEL				\
-    a_name##_tls = a_initializer;					\
-a_attr pthread_key_t	a_name##_tsd;					\
-a_attr bool		a_name##_booted = false;
+    a_name##tsd_tls = a_initializer;					\
+a_attr pthread_key_t	a_name##tsd_tsd;				\
+a_attr bool		a_name##tsd_booted = false;
 #elif (defined(_WIN32))
 #define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
-a_attr DWORD		a_name##_tsd;					\
-a_attr bool		a_name##_booted = false;
+a_attr DWORD		a_name##tsd_tsd;				\
+a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = {		\
+	false,	/* initialized */					\
+	a_initializer	/* val */					\
+};									\
+a_attr bool		a_name##tsd_booted = false;
 #else
 #define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
-a_attr pthread_key_t	a_name##_tsd;					\
-a_attr tsd_init_head_t	a_name##_tsd_init_head = {			\
+a_attr pthread_key_t	a_name##tsd_tsd;				\
+a_attr tsd_init_head_t	a_name##tsd_init_head = {			\
 	ql_head_initializer(blocks),					\
 	MALLOC_MUTEX_INITIALIZER					\
 };									\
-a_attr bool		a_name##_booted = false;
+a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = {		\
+	false,	/* initialized */					\
+	a_initializer	/* val */					\
+};									\
+a_attr bool		a_name##tsd_booted = false;
 #endif

 /* malloc_tsd_funcs(). */
@ -125,75 +169,100 @@ a_attr bool		a_name##_booted = false;
    a_cleanup)								\
 /* Initialization/cleanup. */						\
 a_attr bool								\
-a_name##_tsd_cleanup_wrapper(void)					\
+a_name##tsd_cleanup_wrapper(void)					\
 {									\
 									\
-	if (a_name##_initialized) {					\
-		a_name##_initialized = false;				\
-		a_cleanup(&a_name##_tls);				\
+	if (a_name##tsd_initialized) {					\
+		a_name##tsd_initialized = false;			\
+		a_cleanup(&a_name##tsd_tls);				\
 	}								\
-	return (a_name##_initialized);					\
+	return (a_name##tsd_initialized);				\
 }									\
 a_attr bool								\
-a_name##_tsd_boot(void)							\
+a_name##tsd_boot0(void)							\
 {									\
 									\
 	if (a_cleanup != malloc_tsd_no_cleanup) {			\
 		malloc_tsd_cleanup_register(				\
-		    &a_name##_tsd_cleanup_wrapper);			\
+		    &a_name##tsd_cleanup_wrapper);			\
 	}								\
-	a_name##_booted = true;						\
+	a_name##tsd_booted = true;					\
 	return (false);							\
 }									\
+a_attr void								\
+a_name##tsd_boot1(void)							\
+{									\
+									\
+	/*								\
+	 * Do nothing.  The explicit (void) gives the definition a	\
+	 * prototype that matches malloc_tsd_protos().			\
+	 */								\
+}									\
+a_attr bool								\
+a_name##tsd_boot(void)							\
+{									\
+									\
+	return (a_name##tsd_boot0());					\
+}									\
 /* Get/set. */								\
 a_attr a_type *								\
-a_name##_tsd_get(void)							\
+a_name##tsd_get(void)							\
 {									\
 									\
-	assert(a_name##_booted);					\
-	return (&a_name##_tls);						\
+	assert(a_name##tsd_booted);					\
+	return (&a_name##tsd_tls);					\
 }									\
 a_attr void								\
-a_name##_tsd_set(a_type *val)						\
+a_name##tsd_set(a_type *val)						\
 {									\
 									\
-	assert(a_name##_booted);					\
-	a_name##_tls = (*val);						\
+	assert(a_name##tsd_booted);					\
+	a_name##tsd_tls = (*val);					\
 	if (a_cleanup != malloc_tsd_no_cleanup)				\
-		a_name##_initialized = true;				\
+		a_name##tsd_initialized = true;				\
 }
 #elif (defined(JEMALLOC_TLS))
 #define	malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer,		\
    a_cleanup)								\
 /* Initialization/cleanup. */						\
 a_attr bool								\
-a_name##_tsd_boot(void)							\
+a_name##tsd_boot0(void)							\
 {									\
 									\
 	if (a_cleanup != malloc_tsd_no_cleanup) {			\
-		if (pthread_key_create(&a_name##_tsd, a_cleanup) != 0)	\
+		if (pthread_key_create(&a_name##tsd_tsd, a_cleanup) !=	\
+		    0)							\
 			return (true);					\
 	}								\
-	a_name##_booted = true;						\
+	a_name##tsd_booted = true;					\
 	return (false);							\
 }									\
+a_attr void								\
+a_name##tsd_boot1(void)							\
+{									\
+									\
+	/*								\
+	 * Do nothing.  The explicit (void) gives the definition a	\
+	 * prototype that matches malloc_tsd_protos().			\
+	 */								\
+}									\
+a_attr bool								\
+a_name##tsd_boot(void)							\
+{									\
+									\
+	return (a_name##tsd_boot0());					\
+}									\
 /* Get/set. */								\
 a_attr a_type *								\
-a_name##_tsd_get(void)							\
+a_name##tsd_get(void)							\
 {									\
 									\
-	assert(a_name##_booted);					\
-	return (&a_name##_tls);						\
+	assert(a_name##tsd_booted);					\
+	return (&a_name##tsd_tls);					\
 }									\
 a_attr void								\
-a_name##_tsd_set(a_type *val)						\
+a_name##tsd_set(a_type *val)						\
 {									\
 									\
-	assert(a_name##_booted);					\
-	a_name##_tls = (*val);						\
+	assert(a_name##tsd_booted);					\
+	a_name##tsd_tls = (*val);					\
 	if (a_cleanup != malloc_tsd_no_cleanup) {			\
-		if (pthread_setspecific(a_name##_tsd,			\
-		    (void *)(&a_name##_tls))) {				\
+		if (pthread_setspecific(a_name##tsd_tsd,		\
+		    (void *)(&a_name##tsd_tls))) {			\
 			malloc_write("<jemalloc>: Error"		\
 			    " setting TSD for "#a_name"\n");		\
 			if (opt_abort)					\
@ -204,27 +273,21 @@ a_name##_tsd_set(a_type *val)						\
 #elif (defined(_WIN32))
 #define	malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer,		\
    a_cleanup)								\
-/* Data structure. */							\
-typedef struct {							\
-	bool	initialized;						\
-	a_type	val;							\
-} a_name##_tsd_wrapper_t;						\
 /* Initialization/cleanup. */						\
 a_attr bool								\
-a_name##_tsd_cleanup_wrapper(void)					\
+a_name##tsd_cleanup_wrapper(void)					\
 {									\
-	a_name##_tsd_wrapper_t *wrapper;				\
+	DWORD error = GetLastError();					\
+	a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)	\
+	    TlsGetValue(a_name##tsd_tsd);				\
+	SetLastError(error);						\
 									\
-	wrapper = (a_name##_tsd_wrapper_t *) TlsGetValue(a_name##_tsd);	\
 	if (wrapper == NULL)						\
 		return (false);						\
 	if (a_cleanup != malloc_tsd_no_cleanup &&			\
 	    wrapper->initialized) {					\
-		a_type val = wrapper->val;				\
-		a_type tsd_static_data = a_initializer;			\
 		wrapper->initialized = false;				\
-		wrapper->val = tsd_static_data;				\
-		a_cleanup(&val);					\
+		a_cleanup(&wrapper->val);				\
 		if (wrapper->initialized) {				\
 			/* Trigger another cleanup round. */		\
 			return (true);					\
@ -233,63 +296,95 @@ a_name##_tsd_cleanup_wrapper(void)					\
 	malloc_tsd_dalloc(wrapper);					\
 	return (false);							\
 }									\
-a_attr bool								\
-a_name##_tsd_boot(void)							\
+a_attr void								\
+a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper)			\
 {									\
 									\
-	a_name##_tsd = TlsAlloc();					\
-	if (a_name##_tsd == TLS_OUT_OF_INDEXES)				\
-		return (true);						\
-	if (a_cleanup != malloc_tsd_no_cleanup) {			\
-		malloc_tsd_cleanup_register(				\
-		    &a_name##_tsd_cleanup_wrapper);			\
+	if (!TlsSetValue(a_name##tsd_tsd, (void *)wrapper)) {		\
+		malloc_write("<jemalloc>: Error setting"		\
+		    " TSD for "#a_name"\n");				\
+		abort();						\
 	}								\
-	a_name##_booted = true;						\
-	return (false);							\
 }									\
-/* Get/set. */								\
-a_attr a_name##_tsd_wrapper_t *						\
-a_name##_tsd_get_wrapper(void)						\
+a_attr a_name##tsd_wrapper_t *						\
+a_name##tsd_wrapper_get(void)						\
 {									\
-	a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)	\
-	    TlsGetValue(a_name##_tsd);					\
+	DWORD error = GetLastError();					\
+	a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)	\
+	    TlsGetValue(a_name##tsd_tsd);				\
+	SetLastError(error);						\
 									\
-	if (wrapper == NULL) {						\
-		wrapper = (a_name##_tsd_wrapper_t *)			\
-		    malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t));	\
+	if (unlikely(wrapper == NULL)) {				\
+		wrapper = (a_name##tsd_wrapper_t *)			\
+		    malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t));	\
 		if (wrapper == NULL) {					\
 			malloc_write("<jemalloc>: Error allocating"	\
 			    " TSD for "#a_name"\n");			\
 			abort();					\
 		} else {						\
-			static a_type tsd_static_data = a_initializer;	\
 			wrapper->initialized = false;			\
-			wrapper->val = tsd_static_data;			\
-		}							\
-		if (!TlsSetValue(a_name##_tsd, (void *)wrapper)) {	\
-			malloc_write("<jemalloc>: Error setting"	\
-			    " TSD for "#a_name"\n");			\
-			abort();					\
+			wrapper->val = a_initializer;			\
 		}							\
+		a_name##tsd_wrapper_set(wrapper);			\
 	}								\
 	return (wrapper);						\
 }									\
-a_attr a_type *								\
-a_name##_tsd_get(void)							\
+a_attr bool								\
+a_name##tsd_boot0(void)							\
 {									\
-	a_name##_tsd_wrapper_t *wrapper;				\
 									\
-	assert(a_name##_booted);					\
-	wrapper = a_name##_tsd_get_wrapper();				\
+	a_name##tsd_tsd = TlsAlloc();					\
+	if (a_name##tsd_tsd == TLS_OUT_OF_INDEXES)			\
+		return (true);						\
+	if (a_cleanup != malloc_tsd_no_cleanup) {			\
+		malloc_tsd_cleanup_register(				\
+		    &a_name##tsd_cleanup_wrapper);			\
+	}								\
+	a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper);		\
+	a_name##tsd_booted = true;					\
+	return (false);							\
+}									\
+/* Replace the boot wrapper with a copy from malloc_tsd_malloc(). */	\
+a_attr void								\
+a_name##tsd_boot1(void)							\
+{									\
+	a_name##tsd_wrapper_t *wrapper;					\
+	wrapper = (a_name##tsd_wrapper_t *)				\
+	    malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t));		\
+	if (wrapper == NULL) {						\
+		malloc_write("<jemalloc>: Error allocating"		\
+		    " TSD for "#a_name"\n");				\
+		abort();						\
+	}								\
+	memcpy(wrapper, &a_name##tsd_boot_wrapper,			\
+	    sizeof(a_name##tsd_wrapper_t));				\
+	a_name##tsd_wrapper_set(wrapper);				\
+}									\
+/* Run both boot phases; return true if boot0 fails. */			\
+a_attr bool								\
+a_name##tsd_boot(void)							\
+{									\
+									\
+	if (a_name##tsd_boot0())					\
+		return (true);						\
+	a_name##tsd_boot1();						\
+	return (false);							\
+}									\
+/* Get/set. */								\
+a_attr a_type *								\
+a_name##tsd_get(void)							\
+{									\
+	a_name##tsd_wrapper_t *wrapper;					\
+									\
+	assert(a_name##tsd_booted);					\
+	wrapper = a_name##tsd_wrapper_get();				\
 	return (&wrapper->val);						\
 }									\
 a_attr void								\
-a_name##_tsd_set(a_type *val)						\
+a_name##tsd_set(a_type *val)						\
 {									\
-	a_name##_tsd_wrapper_t *wrapper;				\
+	a_name##tsd_wrapper_t *wrapper;					\
 									\
-	assert(a_name##_booted);					\
-	wrapper = a_name##_tsd_get_wrapper();				\
+	assert(a_name##tsd_booted);					\
+	wrapper = a_name##tsd_wrapper_get();				\
 	wrapper->val = *(val);						\
 	if (a_cleanup != malloc_tsd_no_cleanup)				\
 		wrapper->initialized = true;				\
@ -297,16 +392,11 @@ a_name##_tsd_set(a_type *val)						\
 #else
 #define	malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer,		\
    a_cleanup)								\
-/* Data structure. */							\
-typedef struct {							\
-	bool	initialized;						\
-	a_type	val;							\
-} a_name##_tsd_wrapper_t;						\
 /* Initialization/cleanup. */						\
 a_attr void								\
-a_name##_tsd_cleanup_wrapper(void *arg)					\
+a_name##tsd_cleanup_wrapper(void *arg)					\
 {									\
-	a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)arg;\
+	a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)arg;	\
 									\
 	if (a_cleanup != malloc_tsd_no_cleanup &&			\
 	    wrapper->initialized) {					\
@ -314,7 +404,7 @@ a_name##_tsd_cleanup_wrapper(void *arg)					\
 		a_cleanup(&wrapper->val);				\
 		if (wrapper->initialized) {				\
 			/* Trigger another cleanup round. */		\
-			if (pthread_setspecific(a_name##_tsd,		\
+			if (pthread_setspecific(a_name##tsd_tsd,	\
 			    (void *)wrapper)) {				\
 				malloc_write("<jemalloc>: Error"	\
 				    " setting TSD for "#a_name"\n");	\
@ -326,67 +416,97 @@ a_name##_tsd_cleanup_wrapper(void *arg)					\
 	}								\
 	malloc_tsd_dalloc(wrapper);					\
 }									\
-a_attr bool								\
-a_name##_tsd_boot(void)							\
+a_attr void								\
+a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper)			\
 {									\
 									\
-	if (pthread_key_create(&a_name##_tsd,				\
-	    a_name##_tsd_cleanup_wrapper) != 0)				\
-		return (true);						\
-	a_name##_booted = true;						\
-	return (false);							\
+	if (pthread_setspecific(a_name##tsd_tsd,			\
+	    (void *)wrapper)) {						\
+		malloc_write("<jemalloc>: Error setting"		\
+		    " TSD for "#a_name"\n");				\
+		abort();						\
+	}								\
 }									\
-/* Get/set. */								\
-a_attr a_name##_tsd_wrapper_t *						\
-a_name##_tsd_get_wrapper(void)						\
+a_attr a_name##tsd_wrapper_t *						\
+a_name##tsd_wrapper_get(void)						\
 {									\
-	a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)	\
-	    pthread_getspecific(a_name##_tsd);				\
+	a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)	\
+	    pthread_getspecific(a_name##tsd_tsd);			\
 									\
-	if (wrapper == NULL) {						\
+	if (unlikely(wrapper == NULL)) {				\
 		tsd_init_block_t block;					\
 		wrapper = tsd_init_check_recursion(			\
-		    &a_name##_tsd_init_head, &block);			\
+		    &a_name##tsd_init_head, &block);			\
 		if (wrapper)						\
 		    return (wrapper);					\
-		wrapper = (a_name##_tsd_wrapper_t *)			\
-		    malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t));	\
+		wrapper = (a_name##tsd_wrapper_t *)			\
+		    malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t));	\
 		block.data = wrapper;					\
 		if (wrapper == NULL) {					\
 			malloc_write("<jemalloc>: Error allocating"	\
 			    " TSD for "#a_name"\n");			\
 			abort();					\
 		} else {						\
-			static a_type tsd_static_data = a_initializer;	\
 			wrapper->initialized = false;			\
-			wrapper->val = tsd_static_data;			\
+			wrapper->val = a_initializer;			\
 		}							\
-		if (pthread_setspecific(a_name##_tsd,			\
-		    (void *)wrapper)) {					\
-			malloc_write("<jemalloc>: Error setting"	\
-			    " TSD for "#a_name"\n");			\
-			abort();					\
-		}							\
-		tsd_init_finish(&a_name##_tsd_init_head, &block);	\
+		a_name##tsd_wrapper_set(wrapper);			\
+		tsd_init_finish(&a_name##tsd_init_head, &block);	\
 	}								\
 	return (wrapper);						\
 }									\
-a_attr a_type *								\
-a_name##_tsd_get(void)							\
+a_attr bool								\
+a_name##tsd_boot0(void)							\
 {									\
-	a_name##_tsd_wrapper_t *wrapper;				\
 									\
-	assert(a_name##_booted);					\
-	wrapper = a_name##_tsd_get_wrapper();				\
+	if (pthread_key_create(&a_name##tsd_tsd,			\
+	    a_name##tsd_cleanup_wrapper) != 0)				\
+		return (true);						\
+	a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper);		\
+	a_name##tsd_booted = true;					\
+	return (false);							\
+}									\
+/* Replace the boot wrapper with a copy from malloc_tsd_malloc(). */	\
+a_attr void								\
+a_name##tsd_boot1(void)							\
+{									\
+	a_name##tsd_wrapper_t *wrapper;					\
+	wrapper = (a_name##tsd_wrapper_t *)				\
+	    malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t));		\
+	if (wrapper == NULL) {						\
+		malloc_write("<jemalloc>: Error allocating"		\
+		    " TSD for "#a_name"\n");				\
+		abort();						\
+	}								\
+	memcpy(wrapper, &a_name##tsd_boot_wrapper,			\
+	    sizeof(a_name##tsd_wrapper_t));				\
+	a_name##tsd_wrapper_set(wrapper);				\
+}									\
+/* Run both boot phases; return true if boot0 fails. */			\
+a_attr bool								\
+a_name##tsd_boot(void)							\
+{									\
+									\
+	if (a_name##tsd_boot0())					\
+		return (true);						\
+	a_name##tsd_boot1();						\
+	return (false);							\
+}									\
+/* Get/set. */								\
+a_attr a_type *								\
+a_name##tsd_get(void)							\
+{									\
+	a_name##tsd_wrapper_t *wrapper;					\
+									\
+	assert(a_name##tsd_booted);					\
+	wrapper = a_name##tsd_wrapper_get();				\
 	return (&wrapper->val);						\
 }									\
 a_attr void								\
-a_name##_tsd_set(a_type *val)						\
+a_name##tsd_set(a_type *val)						\
 {									\
-	a_name##_tsd_wrapper_t *wrapper;				\
+	a_name##tsd_wrapper_t *wrapper;					\
 									\
-	assert(a_name##_booted);					\
-	wrapper = a_name##_tsd_get_wrapper();				\
+	assert(a_name##tsd_booted);					\
+	wrapper = a_name##tsd_wrapper_get();				\
 	wrapper->val = *(val);						\
 	if (a_cleanup != malloc_tsd_no_cleanup)				\
 		wrapper->initialized = true;				\
@ -410,25 +530,136 @@ struct tsd_init_head_s {
 };
 #endif

+/*
+ * X-macro list of the per-thread fields stored in tsd_t.  Each O(name, type)
+ * entry is expanded with different definitions of O() to declare the struct
+ * tsd_s members and to generate the tsd_<name>p_get(), tsd_<name>_get() and
+ * tsd_<name>_set() accessors.
+ */
+#define	MALLOC_TSD							\
+/*  O(name,			type) */				\
+    O(tcache,			tcache_t *)				\
+    O(thread_allocated,		uint64_t)				\
+    O(thread_deallocated,	uint64_t)				\
+    O(prof_tdata,		prof_tdata_t *)				\
+    O(arena,			arena_t *)				\
+    O(arenas_cache,		arena_t **)				\
+    O(narenas_cache,		unsigned)				\
+    O(arenas_cache_bypass,	bool)					\
+    O(tcache_enabled,		tcache_enabled_t)			\
+    O(quarantine,		quarantine_t *)				\
+
+/*
+ * Positional initializer for tsd_t.  The first entry initializes the state
+ * member; the remaining entries initialize the MALLOC_TSD fields and must be
+ * kept in the same order as the O() entries in that list.
+ */
+#define	TSD_INITIALIZER {						\
+    tsd_state_uninitialized,						\
+    NULL,								\
+    0,									\
+    0,									\
+    NULL,								\
+    NULL,								\
+    NULL,								\
+    0,									\
+    false,								\
+    tcache_enabled_default,						\
+    NULL								\
+}
+
+struct tsd_s {
+	tsd_state_t	state;
+#define	O(n, t)								\
+	t		n;
+MALLOC_TSD
+#undef O
+};
+
+static const tsd_t tsd_initializer = TSD_INITIALIZER;
+
+malloc_tsd_types(, tsd_t)
+
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

 void	*malloc_tsd_malloc(size_t size);
 void	malloc_tsd_dalloc(void *wrapper);
-void	malloc_tsd_no_cleanup(void *);
+void	malloc_tsd_no_cleanup(void *arg);
 void	malloc_tsd_cleanup_register(bool (*f)(void));
-void	malloc_tsd_boot(void);
+bool	malloc_tsd_boot0(void);
+void	malloc_tsd_boot1(void);
 #if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \
    !defined(_WIN32))
 void	*tsd_init_check_recursion(tsd_init_head_t *head,
    tsd_init_block_t *block);
 void	tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block);
 #endif
+void	tsd_cleanup(void *arg);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

+#ifndef JEMALLOC_ENABLE_INLINE
+malloc_tsd_protos(JEMALLOC_ATTR(unused), , tsd_t)
+
+tsd_t	*tsd_fetch(void);
+bool	tsd_nominal(tsd_t *tsd);
+#define	O(n, t)								\
+t	*tsd_##n##p_get(tsd_t *tsd);					\
+t	tsd_##n##_get(tsd_t *tsd);					\
+void	tsd_##n##_set(tsd_t *tsd, t n);
+MALLOC_TSD
+#undef O
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_))
+malloc_tsd_externs(, tsd_t)
+malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup)
+
+/*
+ * Return the calling thread's tsd, advancing its state when it is not yet
+ * nominal: uninitialized becomes nominal and purgatory becomes reincarnated,
+ * each followed by tsd_set().  Any other non-nominal state is asserted to be
+ * reincarnated.
+ */
+JEMALLOC_ALWAYS_INLINE tsd_t *
+tsd_fetch(void)
+{
+	tsd_t *tsd = tsd_get();
+
+	/* Fast path: the state is already nominal. */
+	if (likely(tsd->state == tsd_state_nominal))
+		return (tsd);
+
+	switch (tsd->state) {
+	case tsd_state_uninitialized:
+		tsd->state = tsd_state_nominal;
+		/* Trigger cleanup handler registration. */
+		tsd_set(tsd);
+		break;
+	case tsd_state_purgatory:
+		tsd->state = tsd_state_reincarnated;
+		tsd_set(tsd);
+		break;
+	default:
+		assert(tsd->state == tsd_state_reincarnated);
+		break;
+	}
+
+	return (tsd);
+}
+
+/* Report whether tsd is currently in the tsd_state_nominal state. */
+JEMALLOC_INLINE bool
+tsd_nominal(tsd_t *tsd)
+{
+
+	return (tsd->state == tsd_state_nominal);
+}
+
+/*
+ * Expand MALLOC_TSD into three accessors for each field n of type t:
+ * tsd_<n>p_get() returns a pointer to the field, tsd_<n>_get() returns its
+ * value, and tsd_<n>_set() stores a new value after asserting that tsd is in
+ * the nominal state.
+ */
+#define	O(n, t)								\
+JEMALLOC_ALWAYS_INLINE t *						\
+tsd_##n##p_get(tsd_t *tsd)						\
+{									\
+									\
+	return (&tsd->n);						\
+}									\
+									\
+JEMALLOC_ALWAYS_INLINE t						\
+tsd_##n##_get(tsd_t *tsd)						\
+{									\
+									\
+	return (*tsd_##n##p_get(tsd));					\
+}									\
+									\
+JEMALLOC_ALWAYS_INLINE void						\
+tsd_##n##_set(tsd_t *tsd, t n)						\
+{									\
+									\
+	assert(tsd->state == tsd_state_nominal);			\
+	tsd->n = n;							\
+}
+MALLOC_TSD
+#undef O
+#endif
+
 #endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/
--- a/include/jemalloc/internal/util.h
+++ b/include/jemalloc/internal/util.h
@ -1,6 +1,36 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES

+/*
+ * printf()-style conversion specifiers for 32-bit, 64-bit and pointer-sized
+ * integers.  On Windows they are spelled out by hand (with the pointer-size
+ * prefix chosen by _WIN64); elsewhere they map to the <inttypes.h> PRI*
+ * macros.
+ */
+#ifdef _WIN32
+#  ifdef _WIN64
+#    define FMT64_PREFIX "ll"
+#    define FMTPTR_PREFIX "ll"
+#  else
+#    define FMT64_PREFIX "ll"
+#    define FMTPTR_PREFIX ""
+#  endif
+#  define FMTd32 "d"
+#  define FMTu32 "u"
+#  define FMTx32 "x"
+#  define FMTd64 FMT64_PREFIX "d"
+#  define FMTu64 FMT64_PREFIX "u"
+#  define FMTx64 FMT64_PREFIX "x"
+#  define FMTdPTR FMTPTR_PREFIX "d"
+#  define FMTuPTR FMTPTR_PREFIX "u"
+#  define FMTxPTR FMTPTR_PREFIX "x"
+#else
+#  include <inttypes.h>
+#  define FMTd32 PRId32
+#  define FMTu32 PRIu32
+#  define FMTx32 PRIx32
+#  define FMTd64 PRId64
+#  define FMTu64 PRIu64
+#  define FMTx64 PRIx64
+#  define FMTdPTR PRIdPTR
+#  define FMTuPTR PRIuPTR
+#  define FMTxPTR PRIxPTR
+#endif
+
 /* Size of stack-allocated buffer passed to buferror(). */
 #define	BUFERROR_BUF		64

@ -27,13 +57,37 @@
 #	define JEMALLOC_CC_SILENCE_INIT(v)
 #endif

+/*
+ * Compiler feature probes intended for use in #if expressions.  Each macro
+ * expands to a plain integer expression; the "defined" operator is kept out
+ * of the expansions because producing it through macro replacement inside
+ * #if is undefined behavior (clang reports it via -Wexpansion-to-defined).
+ *
+ * JEMALLOC_GNUC_PREREQ(major, minor) is nonzero for non-clang compilers whose
+ * __GNUC__/__GNUC_MINOR__ version is at least major.minor.
+ * JEMALLOC_CLANG_HAS_BUILTIN(builtin) is nonzero only under clang, and only
+ * when __has_builtin(builtin) is nonzero.
+ */
+#ifdef __clang__
+#  define JEMALLOC_GNUC_PREREQ(major, minor) (0)
+#else
+#  define JEMALLOC_GNUC_PREREQ(major, minor)				\
+    (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#endif
+#ifndef __has_builtin
+#  define __has_builtin(builtin) (0)
+#endif
+#ifdef __clang__
+#  define JEMALLOC_CLANG_HAS_BUILTIN(builtin) (__has_builtin(builtin))
+#else
+#  define JEMALLOC_CLANG_HAS_BUILTIN(builtin) (0)
+#endif
+
+/*
+ * Branch-prediction and unreachable-code hints.  When __GNUC__ is defined,
+ * likely()/unlikely() map to __builtin_expect(), and unreachable() maps to
+ * __builtin_unreachable() if JEMALLOC_GNUC_PREREQ(4, 6) holds or clang
+ * reports the builtin; otherwise unreachable() expands to nothing.  Without
+ * __GNUC__, likely()/unlikely() only normalize their argument to 0 or 1.
+ */
+#ifdef __GNUC__
+#	define likely(x)   __builtin_expect(!!(x), 1)
+#	define unlikely(x) __builtin_expect(!!(x), 0)
+#  if JEMALLOC_GNUC_PREREQ(4, 6) ||					\
+      JEMALLOC_CLANG_HAS_BUILTIN(__builtin_unreachable)
+#	define unreachable() __builtin_unreachable()
+#  else
+#	define unreachable()
+#  endif
+#else
+#	define likely(x)   !!(x)
+#	define unlikely(x) !!(x)
+#	define unreachable()
+#endif
+
 /*
 * Define a custom assert() in order to reduce the chances of deadlock during
 * assertion failure.
 */
 #ifndef assert
 #define	assert(e) do {							\
-	if (config_debug && !(e)) {					\
+	if (unlikely(config_debug && !(e))) {				\
 		malloc_printf(						\
 		    "<jemalloc>: %s:%d: Failed assertion: \"%s\"\n",	\
 		    __FILE__, __LINE__, #e);				\
@ -50,6 +104,7 @@
 		    __FILE__, __LINE__);				\
 		abort();						\
 	}								\
+	unreachable();							\
 } while (0)
 #endif

@ -65,14 +120,14 @@

 #ifndef assert_not_implemented
 #define	assert_not_implemented(e) do {					\
-	if (config_debug && !(e))					\
+	if (unlikely(config_debug && !(e)))				\
 		not_implemented();					\
 } while (0)
 #endif

 /* Use to assert a particular configuration, e.g., cassert(config_debug). */
 #define	cassert(c) do {							\
-	if ((c) == false)						\
+	if (unlikely(!(c)))						\
 		not_reached();						\
 } while (0)

@ -96,25 +151,47 @@ void	malloc_write(const char *s);
 int	malloc_vsnprintf(char *str, size_t size, const char *format,
    va_list ap);
 int	malloc_snprintf(char *str, size_t size, const char *format, ...)
-    JEMALLOC_ATTR(format(printf, 3, 4));
+    JEMALLOC_FORMAT_PRINTF(3, 4);
 void	malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
    const char *format, va_list ap);
 void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque,
-    const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4));
-void	malloc_printf(const char *format, ...)
-    JEMALLOC_ATTR(format(printf, 1, 2));
+    const char *format, ...) JEMALLOC_FORMAT_PRINTF(3, 4);
+void	malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

 #ifndef JEMALLOC_ENABLE_INLINE
+int	jemalloc_ffsl(long bitmap);
+int	jemalloc_ffs(int bitmap);
 size_t	pow2_ceil(size_t x);
+size_t	lg_floor(size_t x);
 void	set_errno(int errnum);
 int	get_errno(void);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_))
+
+/* Sanity check. */
+#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS)
+#  error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure
+#endif
+
+/*
+ * Forward to JEMALLOC_INTERNAL_FFSL(), the find-first-set primitive for long
+ * that the preceding sanity check expects configure to have defined.
+ */
+JEMALLOC_ALWAYS_INLINE int
+jemalloc_ffsl(long bitmap)
+{
+
+	return (JEMALLOC_INTERNAL_FFSL(bitmap));
+}
+
+/*
+ * Forward to JEMALLOC_INTERNAL_FFS(), the find-first-set primitive for int
+ * that the preceding sanity check expects configure to have defined.
+ */
+JEMALLOC_ALWAYS_INLINE int
+jemalloc_ffs(int bitmap)
+{
+
+	return (JEMALLOC_INTERNAL_FFS(bitmap));
+}
+
 /* Compute the smallest power of 2 that is >= x. */
 JEMALLOC_INLINE size_t
 pow2_ceil(size_t x)
@ -133,7 +210,82 @@ pow2_ceil(size_t x)
 	return (x);
 }

-/* Sets error code */
+#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
+/*
+ * Compute floor(lg(x)) with the x86 bsr (bit scan reverse) instruction, which
+ * yields the index of the most significant set bit.  x must be nonzero; this
+ * is checked by assert().
+ */
+JEMALLOC_INLINE size_t
+lg_floor(size_t x)
+{
+	size_t ret;
+
+	assert(x != 0);
+
+	asm ("bsr %1, %0"
+	    : "=r"(ret) // Outputs.
+	    : "r"(x)    // Inputs.
+	    );
+	return (ret);
+}
+#elif (defined(_MSC_VER))
+/*
+ * Compute floor(lg(x)) with the MSVC _BitScanReverse intrinsics, choosing the
+ * 64-bit or 32-bit form according to LG_SIZEOF_PTR.  x must be nonzero; this
+ * is checked by assert().
+ */
+JEMALLOC_INLINE size_t
+lg_floor(size_t x)
+{
+	unsigned long ret;
+
+	assert(x != 0);
+
+#if (LG_SIZEOF_PTR == 3)
+	_BitScanReverse64(&ret, x);
+#elif (LG_SIZEOF_PTR == 2)
+	_BitScanReverse(&ret, x);
+#else
+#  error "Unsupported type sizes for lg_floor()"
+#endif
+	return (ret);
+}
+#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+/*
+ * Compute floor(lg(x)) as (pointer width in bits - 1) minus the number of
+ * leading zero bits reported by __builtin_clz()/__builtin_clzl(), picking the
+ * builtin whose operand size matches LG_SIZEOF_PTR.  x must be nonzero; this
+ * is checked by assert().
+ */
+JEMALLOC_INLINE size_t
+lg_floor(size_t x)
+{
+
+	assert(x != 0);
+
+#if (LG_SIZEOF_PTR == LG_SIZEOF_INT)
+	return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x));
+#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG)
+	return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x));
+#else
+#  error "Unsupported type sizes for lg_floor()"
+#endif
+}
+#else
+/*
+ * Portable fallback for floor(lg(x)): smear the highest set bit into every
+ * lower position, add one to obtain the next power of two, and locate that
+ * single bit with jemalloc_ffsl()/jemalloc_ffs().  x must be nonzero; this is
+ * checked by assert().
+ */
+JEMALLOC_INLINE size_t
+lg_floor(size_t x)
+{
+
+	assert(x != 0);
+
+	/* Propagate the highest set bit into all lower bit positions. */
+	x |= (x >> 1);
+	x |= (x >> 2);
+	x |= (x >> 4);
+	x |= (x >> 8);
+	x |= (x >> 16);
+#if (LG_SIZEOF_PTR == 3 && LG_SIZEOF_PTR == LG_SIZEOF_LONG)
+	x |= (x >> 32);
+	/* All bits set: x++ would wrap to 0, so answer directly. */
+	if (x == KZU(0xffffffffffffffff))
+		return (63);
+	x++;
+	/*
+	 * x is now 2^(result+1).  Assumes jemalloc_ffsl() uses ffsl(3)-style
+	 * 1-based bit indexing, hence the -2.
+	 */
+	return (jemalloc_ffsl(x) - 2);
+#elif (LG_SIZEOF_PTR == 2)
+	/* All bits set: x++ would wrap to 0, so answer directly. */
+	if (x == KZU(0xffffffff))
+		return (31);
+	x++;
+	/*
+	 * x is now 2^(result+1).  Assumes jemalloc_ffs() uses ffs(3)-style
+	 * 1-based bit indexing, hence the -2.
+	 */
+	return (jemalloc_ffs(x) - 2);
+#else
+#  error "Unsupported type sizes for lg_floor()"
+#endif
+}
+#endif
+
+/* Set error code. */
 JEMALLOC_INLINE void
 set_errno(int errnum)
 {
@ -145,7 +297,7 @@ set_errno(int errnum)
 #endif
 }

-/* Get last error code */
+/* Get last error code. */
 JEMALLOC_INLINE int
 get_errno(void)
 {
--- a/include/jemalloc/internal/valgrind.h
+++ b/include/jemalloc/internal/valgrind.h
@ -0,0 +1,112 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+#ifdef JEMALLOC_VALGRIND
+#include <valgrind/valgrind.h>
+
+/*
+ * The size that is reported to Valgrind must be consistent through a chain of
+ * malloc..realloc..realloc calls.  Request size isn't recorded anywhere in
+ * jemalloc, so it is critical that all callers of these macros provide usize
+ * rather than request size.  As a result, buffer overflow detection is
+ * technically weakened for the standard API, though it is generally accepted
+ * practice to consider any extra bytes reported by malloc_usable_size() as
+ * usable space.
+ */
+/*
+ * Mark [ptr, ptr+usize) as no-access, undefined, or defined, respectively.
+ * Each wrapper calls the matching valgrind_make_mem_*() function only when
+ * in_valgrind is set.
+ */
+#define	JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do {		\
+	if (unlikely(in_valgrind))					\
+		valgrind_make_mem_noaccess(ptr, usize);			\
+} while (0)
+#define	JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do {		\
+	if (unlikely(in_valgrind))					\
+		valgrind_make_mem_undefined(ptr, usize);		\
+} while (0)
+#define	JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do {		\
+	if (unlikely(in_valgrind))					\
+		valgrind_make_mem_defined(ptr, usize);			\
+} while (0)
+/*
+ * The VALGRIND_MALLOCLIKE_BLOCK() and VALGRIND_RESIZEINPLACE_BLOCK() macro
+ * calls must be embedded in macros rather than in functions so that when
+ * Valgrind reports errors, there are no extra stack frames in the backtraces.
+ *
+ * JEMALLOC_VALGRIND_MALLOC() reports ptr as a malloc-like block of usize
+ * bytes (redzone size p2rz(ptr)) when in_valgrind is set and cond holds.
+ * cond is parenthesized so that a compound expression passed by the caller
+ * cannot be regrouped by the surrounding &&.
+ */
+#define	JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {		\
+	if (unlikely(in_valgrind && (cond)))				\
+		VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero);	\
+} while (0)
+/*
+ * Report a reallocation to Valgrind when in_valgrind is set.  If the block
+ * cannot have moved (maybe_moved is false) or ptr equals old_ptr, the
+ * existing block is resized in place.  Otherwise the old block is reported
+ * as freed and ptr as a new malloc-like block whose first
+ * min(old_usize, usize) bytes are marked defined.  When zero is set, bytes
+ * past the old contents are marked defined as well.  The *_maybe_null flags
+ * say whether the corresponding pointer needs a NULL check.
+ *
+ * Macro arguments are parenthesized wherever they appear under an operator
+ * or cast so that compound argument expressions keep their grouping.
+ */
+#define	JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize,		\
+    ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null,	\
+    zero) do {								\
+	if (unlikely(in_valgrind)) {					\
+		size_t rzsize = p2rz(ptr);				\
+									\
+		if (!(maybe_moved) || (ptr) == (old_ptr)) {		\
+			VALGRIND_RESIZEINPLACE_BLOCK(ptr, old_usize,	\
+			    usize, rzsize);				\
+			if ((zero) && (old_usize) < (usize)) {		\
+				valgrind_make_mem_defined(		\
+				    (void *)((uintptr_t)(ptr) +		\
+				    (old_usize)), (usize) -		\
+				    (old_usize));			\
+			}						\
+		} else {						\
+			if (!(old_ptr_maybe_null) ||			\
+			    (old_ptr) != NULL) {			\
+				valgrind_freelike_block(old_ptr,	\
+				    old_rzsize);			\
+			}						\
+			if (!(ptr_maybe_null) || (ptr) != NULL) {	\
+				size_t copy_size = ((old_usize) <	\
+				    (usize)) ? (old_usize) : (usize);	\
+				size_t tail_size = (usize) - copy_size;	\
+				VALGRIND_MALLOCLIKE_BLOCK(ptr, usize,	\
+				    rzsize, false);			\
+				if (copy_size > 0) {			\
+					valgrind_make_mem_defined(ptr,	\
+					copy_size);			\
+				}					\
+				if ((zero) && tail_size > 0) {		\
+					valgrind_make_mem_defined(	\
+					    (void *)((uintptr_t)(ptr) +	\
+					    copy_size), tail_size);	\
+				}					\
+			}						\
+		}							\
+	}								\
+} while (0)
+/*
+ * Report ptr (with redzone size rzsize) as freed via
+ * valgrind_freelike_block() when in_valgrind is set.
+ */
+#define	JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {			\
+	if (unlikely(in_valgrind))					\
+		valgrind_freelike_block(ptr, rzsize);			\
+} while (0)
+#else
+/*
+ * JEMALLOC_VALGRIND is not defined: RUNNING_ON_VALGRIND is a constant zero
+ * and every JEMALLOC_VALGRIND_*() hook expands to an empty statement.
+ */
+#define	RUNNING_ON_VALGRIND	((unsigned)0)
+#define	JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do {} while (0)
+#define	JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do {} while (0)
+#define	JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do {} while (0)
+#define	JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {} while (0)
+#define	JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize,		\
+    ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null,	\
+    zero) do {} while (0)
+#define	JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {} while (0)
+#endif
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+#ifdef JEMALLOC_VALGRIND
+void	valgrind_make_mem_noaccess(void *ptr, size_t usize);
+void	valgrind_make_mem_undefined(void *ptr, size_t usize);
+void	valgrind_make_mem_defined(void *ptr, size_t usize);
+void	valgrind_freelike_block(void *ptr, size_t usize);
+#endif
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
+
--- a/include/jemalloc/jemalloc.sh
+++ b/include/jemalloc/jemalloc.sh
@ -12,7 +12,7 @@ extern "C" {
 EOF

 for hdr in jemalloc_defs.h jemalloc_rename.h jemalloc_macros.h \
-           jemalloc_protos.h jemalloc_mangle.h ; do
+           jemalloc_protos.h jemalloc_typedefs.h jemalloc_mangle.h ; do
  cat "${objroot}include/jemalloc/${hdr}" \
      | grep -v 'Generated from .* by configure\.' \
      | sed -e 's/^#define /#define	/g' \
@ -22,7 +22,7 @@ done

 cat <<EOF
 #ifdef __cplusplus
-};
+}
 #endif
 #endif /* JEMALLOC_H_ */
 EOF
--- a/include/jemalloc/jemalloc_defs.h.in
+++ b/include/jemalloc/jemalloc_defs.h.in
@ -1,8 +1,14 @@
 /* Defined if __attribute__((...)) syntax is supported. */
 #undef JEMALLOC_HAVE_ATTR

-/* Support the experimental API. */
-#undef JEMALLOC_EXPERIMENTAL
+/* Defined if alloc_size attribute is supported. */
+#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE
+
+/* Defined if format(gnu_printf, ...) attribute is supported. */
+#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF
+
+/* Defined if format(printf, ...) attribute is supported. */
+#undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF

 /*
 * Define overrides for non-standard allocator-related functions if they are
@ -20,5 +26,12 @@
 */
 #undef JEMALLOC_USABLE_SIZE_CONST

+/*
+ * If defined, specify throw() for the public function prototypes when compiling
+ * with C++.  The only justification for this is to match the prototypes that
+ * glibc defines.
+ */
+#undef JEMALLOC_USE_CXX_THROW
+
 /* sizeof(void *) == 2^LG_SIZEOF_PTR. */
 #undef LG_SIZEOF_PTR
--- a/include/jemalloc/jemalloc_macros.h.in
+++ b/include/jemalloc/jemalloc_macros.h.in
@ -1,3 +1,6 @@
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
 #include <limits.h>
 #include <strings.h>

@ -16,46 +19,84 @@
 	 ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
 #  endif
 #  define MALLOCX_ZERO	((int)0x40)
-/* Bias arena index bits so that 0 encodes "MALLOCX_ARENA() unspecified". */
-#  define MALLOCX_ARENA(a)	((int)(((a)+1) << 8))
+/*
+ * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1
+ * encodes MALLOCX_TCACHE_NONE.
+ */
+#  define MALLOCX_TCACHE(tc)	((int)(((tc)+2) << 8))
+#  define MALLOCX_TCACHE_NONE	MALLOCX_TCACHE(-1)
+/*
+ * Bias arena index bits so that 0 encodes "use an automatically chosen arena".
+ */
+#  define MALLOCX_ARENA(a)	((int)(((a)+1) << 20))

-#ifdef JEMALLOC_EXPERIMENTAL
-#  define ALLOCM_LG_ALIGN(la)	(la)
-#  if LG_SIZEOF_PTR == 2
-#    define ALLOCM_ALIGN(a)	(ffs(a)-1)
-#  else
-#    define ALLOCM_ALIGN(a)						\
-	 ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
-#  endif
-#  define ALLOCM_ZERO	((int)0x40)
-#  define ALLOCM_NO_MOVE	((int)0x80)
-/* Bias arena index bits so that 0 encodes "ALLOCM_ARENA() unspecified". */
-#  define ALLOCM_ARENA(a)	((int)(((a)+1) << 8))
-#  define ALLOCM_SUCCESS	0
-#  define ALLOCM_ERR_OOM	1
-#  define ALLOCM_ERR_NOT_MOVED	2
+#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW)
+#  define JEMALLOC_CXX_THROW throw()
+#else
+#  define JEMALLOC_CXX_THROW
 #endif

 #ifdef JEMALLOC_HAVE_ATTR
 #  define JEMALLOC_ATTR(s) __attribute__((s))
-#  define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default"))
 #  define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s))
-#  define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s))
+#  ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE
+#    define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s))
+#    define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2))
+#  else
+#    define JEMALLOC_ALLOC_SIZE(s)
+#    define JEMALLOC_ALLOC_SIZE2(s1, s2)
+#  endif
+#  ifndef JEMALLOC_EXPORT
+#    define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default"))
+#  endif
+#  ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF
+#    define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i))
+#  elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF)
+#    define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i))
+#  else
+#    define JEMALLOC_FORMAT_PRINTF(s, i)
+#  endif
 #  define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline)
+#  define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow)
+#  define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s))
+#  define JEMALLOC_RESTRICT_RETURN
+#  define JEMALLOC_ALLOCATOR
 #elif _MSC_VER
 #  define JEMALLOC_ATTR(s)
+#  define JEMALLOC_ALIGNED(s) __declspec(align(s))
+#  define JEMALLOC_ALLOC_SIZE(s)
+#  define JEMALLOC_ALLOC_SIZE2(s1, s2)
+#  ifndef JEMALLOC_EXPORT
 #    ifdef DLLEXPORT
 #      define JEMALLOC_EXPORT __declspec(dllexport)
 #    else
 #      define JEMALLOC_EXPORT __declspec(dllimport)
 #    endif
-#  define JEMALLOC_ALIGNED(s) __declspec(align(s))
-#  define JEMALLOC_SECTION(s) __declspec(allocate(s))
+#  endif
+#  define JEMALLOC_FORMAT_PRINTF(s, i)
 #  define JEMALLOC_NOINLINE __declspec(noinline)
+#  ifdef __cplusplus
+#    define JEMALLOC_NOTHROW __declspec(nothrow)
+#  else
+#    define JEMALLOC_NOTHROW
+#  endif
+#  define JEMALLOC_SECTION(s) __declspec(allocate(s))
+#  define JEMALLOC_RESTRICT_RETURN __declspec(restrict)
+#  if _MSC_VER >= 1900 && !defined(__EDG__)
+#    define JEMALLOC_ALLOCATOR __declspec(allocator)
+#  else
+#    define JEMALLOC_ALLOCATOR
+#  endif
 #else
 #  define JEMALLOC_ATTR(s)
-#  define JEMALLOC_EXPORT
 #  define JEMALLOC_ALIGNED(s)
-#  define JEMALLOC_SECTION(s)
+#  define JEMALLOC_ALLOC_SIZE(s)
+#  define JEMALLOC_ALLOC_SIZE2(s1, s2)
+#  define JEMALLOC_EXPORT
+#  define JEMALLOC_FORMAT_PRINTF(s, i)
 #  define JEMALLOC_NOINLINE
+#  define JEMALLOC_NOTHROW
+#  define JEMALLOC_SECTION(s)
+#  define JEMALLOC_RESTRICT_RETURN
+#  define JEMALLOC_ALLOCATOR
 #endif
--- a/include/jemalloc/jemalloc_protos.h.in
+++ b/include/jemalloc/jemalloc_protos.h.in
@ -7,52 +7,60 @@ extern JEMALLOC_EXPORT const char	*@je_@malloc_conf;
 extern JEMALLOC_EXPORT void		(*@je_@malloc_message)(void *cbopaque,
    const char *s);

-JEMALLOC_EXPORT void	*@je_@malloc(size_t size) JEMALLOC_ATTR(malloc);
-JEMALLOC_EXPORT void	*@je_@calloc(size_t num, size_t size)
-    JEMALLOC_ATTR(malloc);
-JEMALLOC_EXPORT int	@je_@posix_memalign(void **memptr, size_t alignment,
-    size_t size) JEMALLOC_ATTR(nonnull(1));
-JEMALLOC_EXPORT void	*@je_@aligned_alloc(size_t alignment, size_t size)
-    JEMALLOC_ATTR(malloc);
-JEMALLOC_EXPORT void	*@je_@realloc(void *ptr, size_t size);
-JEMALLOC_EXPORT void	@je_@free(void *ptr);
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@malloc(size_t size)
+    JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1);
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@calloc(size_t num, size_t size)
+    JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2);
+JEMALLOC_EXPORT int JEMALLOC_NOTHROW	@je_@posix_memalign(void **memptr,
+    size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1));
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@aligned_alloc(size_t alignment,
+    size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc)
+    JEMALLOC_ALLOC_SIZE(2);
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@realloc(void *ptr, size_t size)
+    JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2);
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW	@je_@free(void *ptr)
+    JEMALLOC_CXX_THROW;

-JEMALLOC_EXPORT void	*@je_@mallocx(size_t size, int flags);
-JEMALLOC_EXPORT void	*@je_@rallocx(void *ptr, size_t size, int flags);
-JEMALLOC_EXPORT size_t	@je_@xallocx(void *ptr, size_t size, size_t extra,
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@mallocx(size_t size, int flags)
+    JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1);
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@rallocx(void *ptr, size_t size,
+    int flags) JEMALLOC_ALLOC_SIZE(2);
+JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW	@je_@xallocx(void *ptr, size_t size,
+    size_t extra, int flags);
+JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW	@je_@sallocx(const void *ptr,
+    int flags) JEMALLOC_ATTR(pure);
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW	@je_@dallocx(void *ptr, int flags);
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW	@je_@sdallocx(void *ptr, size_t size,
    int flags);
-JEMALLOC_EXPORT size_t	@je_@sallocx(const void *ptr, int flags);
-JEMALLOC_EXPORT void	@je_@dallocx(void *ptr, int flags);
-JEMALLOC_EXPORT size_t	@je_@nallocx(size_t size, int flags);
+JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW	@je_@nallocx(size_t size, int flags)
+    JEMALLOC_ATTR(pure);

-JEMALLOC_EXPORT int	@je_@mallctl(const char *name, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen);
-JEMALLOC_EXPORT int	@je_@mallctlnametomib(const char *name, size_t *mibp,
-    size_t *miblenp);
-JEMALLOC_EXPORT int	@je_@mallctlbymib(const size_t *mib, size_t miblen,
+JEMALLOC_EXPORT int JEMALLOC_NOTHROW	@je_@mallctl(const char *name,
    void *oldp, size_t *oldlenp, void *newp, size_t newlen);
-JEMALLOC_EXPORT void	@je_@malloc_stats_print(void (*write_cb)(void *,
-    const char *), void *@je_@cbopaque, const char *opts);
-JEMALLOC_EXPORT size_t	@je_@malloc_usable_size(
-    JEMALLOC_USABLE_SIZE_CONST void *ptr);
+JEMALLOC_EXPORT int JEMALLOC_NOTHROW	@je_@mallctlnametomib(const char *name,
+    size_t *mibp, size_t *miblenp);
+JEMALLOC_EXPORT int JEMALLOC_NOTHROW	@je_@mallctlbymib(const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW	@je_@malloc_stats_print(
+    void (*write_cb)(void *, const char *), void *@je_@cbopaque,
+    const char *opts);
+JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW	@je_@malloc_usable_size(
+    JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW;

 #ifdef JEMALLOC_OVERRIDE_MEMALIGN
-JEMALLOC_EXPORT void *	@je_@memalign(size_t alignment, size_t size)
-    JEMALLOC_ATTR(malloc);
+/*
+ * JEMALLOC_CXX_THROW keeps this prototype's C++ exception specification
+ * consistent with the other overridden libc entry points declared in this
+ * header (e.g. valloc), so that C++ code including both this header and the
+ * system <malloc.h> does not see conflicting declarations.
+ */
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@memalign(size_t alignment, size_t size)
+    JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc);
 #endif

 #ifdef JEMALLOC_OVERRIDE_VALLOC
-JEMALLOC_EXPORT void *	@je_@valloc(size_t size) JEMALLOC_ATTR(malloc);
-#endif
-
-#ifdef JEMALLOC_EXPERIMENTAL
-JEMALLOC_EXPORT int	@je_@allocm(void **ptr, size_t *rsize, size_t size,
-    int flags) JEMALLOC_ATTR(nonnull(1));
-JEMALLOC_EXPORT int	@je_@rallocm(void **ptr, size_t *rsize, size_t size,
-    size_t extra, int flags) JEMALLOC_ATTR(nonnull(1));
-JEMALLOC_EXPORT int	@je_@sallocm(const void *ptr, size_t *rsize, int flags)
-    JEMALLOC_ATTR(nonnull(1));
-JEMALLOC_EXPORT int	@je_@dallocm(void *ptr, int flags)
-    JEMALLOC_ATTR(nonnull(1));
-JEMALLOC_EXPORT int	@je_@nallocm(size_t *rsize, size_t size, int flags);
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+    void JEMALLOC_NOTHROW	*@je_@valloc(size_t size) JEMALLOC_CXX_THROW
+    JEMALLOC_ATTR(malloc);
 #endif
--- a/include/jemalloc/jemalloc_typedefs.h.in
+++ b/include/jemalloc/jemalloc_typedefs.h.in
@ -0,0 +1,57 @@
+/*
+ * void *
+ * chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero,
+ *     bool *commit, unsigned arena_ind);
+ */
+typedef void *(chunk_alloc_t)(void *, size_t, size_t, bool *, bool *, unsigned);
+
+/*
+ * bool
+ * chunk_dalloc(void *chunk, size_t size, bool committed, unsigned arena_ind);
+ */
+typedef bool (chunk_dalloc_t)(void *, size_t, bool, unsigned);
+
+/*
+ * bool
+ * chunk_commit(void *chunk, size_t size, size_t offset, size_t length,
+ *     unsigned arena_ind);
+ */
+typedef bool (chunk_commit_t)(void *, size_t, size_t, size_t, unsigned);
+
+/*
+ * bool
+ * chunk_decommit(void *chunk, size_t size, size_t offset, size_t length,
+ *     unsigned arena_ind);
+ */
+typedef bool (chunk_decommit_t)(void *, size_t, size_t, size_t, unsigned);
+
+/*
+ * bool
+ * chunk_purge(void *chunk, size_t size, size_t offset, size_t length,
+ *     unsigned arena_ind);
+ */
+typedef bool (chunk_purge_t)(void *, size_t, size_t, size_t, unsigned);
+
+/*
+ * bool
+ * chunk_split(void *chunk, size_t size, size_t size_a, size_t size_b,
+ *     bool committed, unsigned arena_ind);
+ */
+typedef bool (chunk_split_t)(void *, size_t, size_t, size_t, bool, unsigned);
+
+/*
+ * bool
+ * chunk_merge(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b,
+ *     bool committed, unsigned arena_ind);
+ */
+typedef bool (chunk_merge_t)(void *, size_t, void *, size_t, bool, unsigned);
+
+/*
+ * Table of chunk hook function pointers: one member per chunk_*_t function
+ * type (alloc, dalloc, commit, decommit, purge, split, merge).
+ */
+typedef struct {
+	chunk_alloc_t		*alloc;
+	chunk_dalloc_t		*dalloc;
+	chunk_commit_t		*commit;
+	chunk_decommit_t	*decommit;
+	chunk_purge_t		*purge;
+	chunk_split_t		*split;
+	chunk_merge_t		*merge;
+} chunk_hooks_t;
--- a/include/msvc_compat/C99/stdbool.h
+++ b/include/msvc_compat/C99/stdbool.h
@ -5,7 +5,11 @@

 /* MSVC doesn't define _Bool or bool in C, but does have BOOL */
 /* Note this doesn't pass autoconf's test because (bool) 0.5 != true */
+/* Clang-cl uses MSVC headers, so needs msvc_compat, but has _Bool as
+ * a built-in type. */
+#ifndef __clang__
 typedef BOOL _Bool;
+#endif

 #define bool _Bool
 #define true 1
--- a/include/msvc_compat/C99/stdint.h
+++ b/include/msvc_compat/C99/stdint.h
--- a/include/msvc_compat/inttypes.h
+++ b/include/msvc_compat/inttypes.h
@ -1,313 +0,0 @@
-// ISO C9x  compliant inttypes.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
-// 
-//  Copyright (c) 2006 Alexander Chemeris
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// 
-//   1. Redistributions of source code must retain the above copyright notice,
-//      this list of conditions and the following disclaimer.
-// 
-//   2. Redistributions in binary form must reproduce the above copyright
-//      notice, this list of conditions and the following disclaimer in the
-//      documentation and/or other materials provided with the distribution.
-// 
-//   3. The name of the author may be used to endorse or promote products
-//      derived from this software without specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// 
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef _MSC_VER // [
-#error "Use this header only with Microsoft Visual C++ compilers!"
-#endif // _MSC_VER ]
-
-#ifndef _MSC_INTTYPES_H_ // [
-#define _MSC_INTTYPES_H_
-
-#if _MSC_VER > 1000
-#pragma once
-#endif
-
-#include "stdint.h"
-
-// 7.8 Format conversion of integer types
-
-typedef struct {
-   intmax_t quot;
-   intmax_t rem;
-} imaxdiv_t;
-
-// 7.8.1 Macros for format specifiers
-
-#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [   See footnote 185 at page 198
-
-#ifdef _WIN64
-#  define __PRI64_PREFIX        "l"
-#  define __PRIPTR_PREFIX       "l"
-#else
-#  define __PRI64_PREFIX        "ll"
-#  define __PRIPTR_PREFIX
-#endif
-
-// The fprintf macros for signed integers are:
-#define PRId8       "d"
-#define PRIi8       "i"
-#define PRIdLEAST8  "d"
-#define PRIiLEAST8  "i"
-#define PRIdFAST8   "d"
-#define PRIiFAST8   "i"
-
-#define PRId16       "hd"
-#define PRIi16       "hi"
-#define PRIdLEAST16  "hd"
-#define PRIiLEAST16  "hi"
-#define PRIdFAST16   "hd"
-#define PRIiFAST16   "hi"
-
-#define PRId32       "d"
-#define PRIi32       "i"
-#define PRIdLEAST32  "d"
-#define PRIiLEAST32  "i"
-#define PRIdFAST32   "d"
-#define PRIiFAST32   "i"
-
-#define PRId64       __PRI64_PREFIX "d"
-#define PRIi64       __PRI64_PREFIX "i"
-#define PRIdLEAST64  __PRI64_PREFIX "d"
-#define PRIiLEAST64  __PRI64_PREFIX "i"
-#define PRIdFAST64   __PRI64_PREFIX "d"
-#define PRIiFAST64   __PRI64_PREFIX "i"
-
-#define PRIdMAX     __PRI64_PREFIX "d"
-#define PRIiMAX     __PRI64_PREFIX "i"
-
-#define PRIdPTR     __PRIPTR_PREFIX "d"
-#define PRIiPTR     __PRIPTR_PREFIX "i"
-
-// The fprintf macros for unsigned integers are:
-#define PRIo8       "o"
-#define PRIu8       "u"
-#define PRIx8       "x"
-#define PRIX8       "X"
-#define PRIoLEAST8  "o"
-#define PRIuLEAST8  "u"
-#define PRIxLEAST8  "x"
-#define PRIXLEAST8  "X"
-#define PRIoFAST8   "o"
-#define PRIuFAST8   "u"
-#define PRIxFAST8   "x"
-#define PRIXFAST8   "X"
-
-#define PRIo16       "ho"
-#define PRIu16       "hu"
-#define PRIx16       "hx"
-#define PRIX16       "hX"
-#define PRIoLEAST16  "ho"
-#define PRIuLEAST16  "hu"
-#define PRIxLEAST16  "hx"
-#define PRIXLEAST16  "hX"
-#define PRIoFAST16   "ho"
-#define PRIuFAST16   "hu"
-#define PRIxFAST16   "hx"
-#define PRIXFAST16   "hX"
-
-#define PRIo32       "o"
-#define PRIu32       "u"
-#define PRIx32       "x"
-#define PRIX32       "X"
-#define PRIoLEAST32  "o"
-#define PRIuLEAST32  "u"
-#define PRIxLEAST32  "x"
-#define PRIXLEAST32  "X"
-#define PRIoFAST32   "o"
-#define PRIuFAST32   "u"
-#define PRIxFAST32   "x"
-#define PRIXFAST32   "X"
-
-#define PRIo64       __PRI64_PREFIX "o"
-#define PRIu64       __PRI64_PREFIX "u"
-#define PRIx64       __PRI64_PREFIX "x"
-#define PRIX64       __PRI64_PREFIX "X"
-#define PRIoLEAST64  __PRI64_PREFIX "o"
-#define PRIuLEAST64  __PRI64_PREFIX "u"
-#define PRIxLEAST64  __PRI64_PREFIX "x"
-#define PRIXLEAST64  __PRI64_PREFIX "X"
-#define PRIoFAST64   __PRI64_PREFIX "o"
-#define PRIuFAST64   __PRI64_PREFIX "u"
-#define PRIxFAST64   __PRI64_PREFIX "x"
-#define PRIXFAST64   __PRI64_PREFIX "X"
-
-#define PRIoMAX     __PRI64_PREFIX "o"
-#define PRIuMAX     __PRI64_PREFIX "u"
-#define PRIxMAX     __PRI64_PREFIX "x"
-#define PRIXMAX     __PRI64_PREFIX "X"
-
-#define PRIoPTR     __PRIPTR_PREFIX "o"
-#define PRIuPTR     __PRIPTR_PREFIX "u"
-#define PRIxPTR     __PRIPTR_PREFIX "x"
-#define PRIXPTR     __PRIPTR_PREFIX "X"
-
-// The fscanf macros for signed integers are:
-#define SCNd8       "d"
-#define SCNi8       "i"
-#define SCNdLEAST8  "d"
-#define SCNiLEAST8  "i"
-#define SCNdFAST8   "d"
-#define SCNiFAST8   "i"
-
-#define SCNd16       "hd"
-#define SCNi16       "hi"
-#define SCNdLEAST16  "hd"
-#define SCNiLEAST16  "hi"
-#define SCNdFAST16   "hd"
-#define SCNiFAST16   "hi"
-
-#define SCNd32       "ld"
-#define SCNi32       "li"
-#define SCNdLEAST32  "ld"
-#define SCNiLEAST32  "li"
-#define SCNdFAST32   "ld"
-#define SCNiFAST32   "li"
-
-#define SCNd64       "I64d"
-#define SCNi64       "I64i"
-#define SCNdLEAST64  "I64d"
-#define SCNiLEAST64  "I64i"
-#define SCNdFAST64   "I64d"
-#define SCNiFAST64   "I64i"
-
-#define SCNdMAX     "I64d"
-#define SCNiMAX     "I64i"
-
-#ifdef _WIN64 // [
-#  define SCNdPTR     "I64d"
-#  define SCNiPTR     "I64i"
-#else  // _WIN64 ][
-#  define SCNdPTR     "ld"
-#  define SCNiPTR     "li"
-#endif  // _WIN64 ]
-
-// The fscanf macros for unsigned integers are:
-#define SCNo8       "o"
-#define SCNu8       "u"
-#define SCNx8       "x"
-#define SCNX8       "X"
-#define SCNoLEAST8  "o"
-#define SCNuLEAST8  "u"
-#define SCNxLEAST8  "x"
-#define SCNXLEAST8  "X"
-#define SCNoFAST8   "o"
-#define SCNuFAST8   "u"
-#define SCNxFAST8   "x"
-#define SCNXFAST8   "X"
-
-#define SCNo16       "ho"
-#define SCNu16       "hu"
-#define SCNx16       "hx"
-#define SCNX16       "hX"
-#define SCNoLEAST16  "ho"
-#define SCNuLEAST16  "hu"
-#define SCNxLEAST16  "hx"
-#define SCNXLEAST16  "hX"
-#define SCNoFAST16   "ho"
-#define SCNuFAST16   "hu"
-#define SCNxFAST16   "hx"
-#define SCNXFAST16   "hX"
-
-#define SCNo32       "lo"
-#define SCNu32       "lu"
-#define SCNx32       "lx"
-#define SCNX32       "lX"
-#define SCNoLEAST32  "lo"
-#define SCNuLEAST32  "lu"
-#define SCNxLEAST32  "lx"
-#define SCNXLEAST32  "lX"
-#define SCNoFAST32   "lo"
-#define SCNuFAST32   "lu"
-#define SCNxFAST32   "lx"
-#define SCNXFAST32   "lX"
-
-#define SCNo64       "I64o"
-#define SCNu64       "I64u"
-#define SCNx64       "I64x"
-#define SCNX64       "I64X"
-#define SCNoLEAST64  "I64o"
-#define SCNuLEAST64  "I64u"
-#define SCNxLEAST64  "I64x"
-#define SCNXLEAST64  "I64X"
-#define SCNoFAST64   "I64o"
-#define SCNuFAST64   "I64u"
-#define SCNxFAST64   "I64x"
-#define SCNXFAST64   "I64X"
-
-#define SCNoMAX     "I64o"
-#define SCNuMAX     "I64u"
-#define SCNxMAX     "I64x"
-#define SCNXMAX     "I64X"
-
-#ifdef _WIN64 // [
-#  define SCNoPTR     "I64o"
-#  define SCNuPTR     "I64u"
-#  define SCNxPTR     "I64x"
-#  define SCNXPTR     "I64X"
-#else  // _WIN64 ][
-#  define SCNoPTR     "lo"
-#  define SCNuPTR     "lu"
-#  define SCNxPTR     "lx"
-#  define SCNXPTR     "lX"
-#endif  // _WIN64 ]
-
-#endif // __STDC_FORMAT_MACROS ]
-
-// 7.8.2 Functions for greatest-width integer types
-
-// 7.8.2.1 The imaxabs function
-#define imaxabs _abs64
-
-// 7.8.2.2 The imaxdiv function
-
-// This is modified version of div() function from Microsoft's div.c found
-// in %MSVC.NET%\crt\src\div.c
-#ifdef STATIC_IMAXDIV // [
-static
-#else // STATIC_IMAXDIV ][
-_inline
-#endif // STATIC_IMAXDIV ]
-imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
-{
-   imaxdiv_t result;
-
-   result.quot = numer / denom;
-   result.rem = numer % denom;
-
-   if (numer < 0 && result.rem > 0) {
-      // did division wrong; must fix up
-      ++result.quot;
-      result.rem -= denom;
-   }
-
-   return result;
-}
-
-// 7.8.2.3 The strtoimax and strtoumax functions
-#define strtoimax _strtoi64
-#define strtoumax _strtoui64
-
-// 7.8.2.4 The wcstoimax and wcstoumax functions
-#define wcstoimax _wcstoi64
-#define wcstoumax _wcstoui64
-
-
-#endif // _MSC_INTTYPES_H_ ]
--- a/include/msvc_compat/strings.h
+++ b/include/msvc_compat/strings.h
@ -3,8 +3,9 @@

 /* MSVC doesn't define ffs/ffsl. This dummy strings.h header is provided
 * for both */
-#include <intrin.h>
-#pragma intrinsic(_BitScanForward)
+#ifdef _MSC_VER
+#  include <intrin.h>
+#  pragma intrinsic(_BitScanForward)
 static __forceinline int ffsl(long x)
 {
 	unsigned long i;
@ -20,4 +21,9 @@ static __forceinline int ffs(int x)
 	return (ffsl(x));
 }

+#else
+#  define ffsl(x) __builtin_ffsl(x)
+#  define ffs(x) __builtin_ffs(x)
 #endif
+
+#endif /* strings_h */
--- a/include/msvc_compat/windows_extra.h
+++ b/include/msvc_compat/windows_extra.h
@ -0,0 +1,26 @@
+#ifndef MSVC_COMPAT_WINDOWS_EXTRA_H
+#define	MSVC_COMPAT_WINDOWS_EXTRA_H
+
+/*
+ * Fallback definitions for errno-style names.  Each name is defined only if
+ * it is not already defined, and expands to an ERROR_* constant.
+ * NOTE(review): the ERROR_* names are presumably the Win32 system error codes;
+ * this header does not include anything itself, so confirm that every
+ * includer has pulled in the Windows headers first.
+ */
+#ifndef ENOENT
+#  define ENOENT ERROR_PATH_NOT_FOUND
+#endif
+#ifndef EINVAL
+#  define EINVAL ERROR_BAD_ARGUMENTS
+#endif
+#ifndef EAGAIN
+#  define EAGAIN ERROR_OUTOFMEMORY
+#endif
+#ifndef EPERM
+#  define EPERM  ERROR_WRITE_FAULT
+#endif
+#ifndef EFAULT
+#  define EFAULT ERROR_INVALID_ADDRESS
+#endif
+#ifndef ENOMEM
+#  define ENOMEM ERROR_NOT_ENOUGH_MEMORY
+#endif
+#ifndef ERANGE
+#  define ERANGE ERROR_INVALID_DATA
+#endif
+
+#endif /* MSVC_COMPAT_WINDOWS_EXTRA_H */
--- a/jemalloc.pc.in
+++ b/jemalloc.pc.in
@ -0,0 +1,12 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+install_suffix=@install_suffix@
+
+Name: jemalloc
+Description: A general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support.
+URL: http://www.canonware.com/jemalloc
+Version: @jemalloc_version@
+Cflags: -I${includedir}
+Libs: -L${libdir} -ljemalloc${install_suffix}
--- a/src/arena.c
+++ b/src/arena.c
--- a/src/base.c
+++ b/src/base.c
@ -5,107 +5,138 @@
 /* Data. */

 static malloc_mutex_t	base_mtx;
-
-/*
- * Current pages that are being used for internal memory allocations.  These
- * pages are carved up in cacheline-size quanta, so that there is no chance of
- * false cache line sharing.
- */
-static void		*base_pages;
-static void		*base_next_addr;
-static void		*base_past_addr; /* Addr immediately past base_pages. */
+static extent_tree_t	base_avail_szad;
 static extent_node_t	*base_nodes;
-
-/******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static bool	base_pages_alloc(size_t minsize);
+static size_t		base_allocated;
+static size_t		base_resident;
+static size_t		base_mapped;

 /******************************************************************************/

-static bool
-base_pages_alloc(size_t minsize)
+/* base_mtx must be held. */
+static extent_node_t *
+base_node_try_alloc(void)
 {
-	size_t csize;
-	bool zero;
+	extent_node_t *node;

-	assert(minsize != 0);
-	csize = CHUNK_CEILING(minsize);
-	zero = false;
-	base_pages = chunk_alloc(csize, chunksize, true, &zero,
-	    chunk_dss_prec_get());
-	if (base_pages == NULL)
-		return (true);
-	base_next_addr = base_pages;
-	base_past_addr = (void *)((uintptr_t)base_pages + csize);
-
-	return (false);
+	if (base_nodes == NULL)
+		return (NULL);
+	node = base_nodes;
+	base_nodes = *(extent_node_t **)node;
+	JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t));
+	return (node);
 }

+/*
+ * Push node onto the front of the base_nodes list.  The link to the previous
+ * list head is stored in the first pointer-sized bytes of the node itself, so
+ * the node's prior contents are overwritten.
+ *
+ * base_mtx must be held.
+ */
+static void
+base_node_dalloc(extent_node_t *node)
+{
+
+	JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t));
+	/* Save the old head inside the node, then make the node the head. */
+	*(extent_node_t **)node = base_nodes;
+	base_nodes = node;
+}
+
+/*
+ * Obtain at least minsize bytes of new space from chunk_alloc_base() and
+ * return an extent node describing it, or NULL if the allocation fails.
+ *
+ * The node is taken from the base_nodes list when one is available.
+ * Otherwise the request is enlarged by
+ * CACHELINE_CEILING(sizeof(extent_node_t)) and the node is carved out of the
+ * start of the new space; the returned node then describes only the
+ * remainder.
+ *
+ * base_mtx must be held.
+ */
+static extent_node_t *
+base_chunk_alloc(size_t minsize)
+{
+	extent_node_t *node;
+	size_t csize, nsize;
+	void *addr;
+
+	assert(minsize != 0);
+	node = base_node_try_alloc();
+	/* Allocate enough space to also carve a node out if necessary. */
+	nsize = (node == NULL) ? CACHELINE_CEILING(sizeof(extent_node_t)) : 0;
+	csize = CHUNK_CEILING(minsize + nsize);
+	addr = chunk_alloc_base(csize);
+	if (addr == NULL) {
+		/* Return the unused node to the list before failing. */
+		if (node != NULL)
+			base_node_dalloc(node);
+		return (NULL);
+	}
+	base_mapped += csize;
+	if (node == NULL) {
+		/*
+		 * No recycled node was available: place the node at the start
+		 * of the new space and account for the bytes it occupies.
+		 */
+		node = (extent_node_t *)addr;
+		addr = (void *)((uintptr_t)addr + nsize);
+		csize -= nsize;
+		if (config_stats) {
+			base_allocated += nsize;
+			base_resident += PAGE_CEILING(nsize);
+		}
+	}
+	extent_node_init(node, NULL, addr, csize, true, true);
+	return (node);
+}
+
+/*
+ * base_alloc() guarantees demand-zeroed memory, in order to make multi-page
+ * sparse data structures such as radix tree nodes efficient with respect to
+ * physical memory usage.
+ */
 void *
 base_alloc(size_t size)
 {
 	void *ret;
-	size_t csize;
+	size_t csize, usize;
+	extent_node_t *node;
+	extent_node_t key;

-	/* Round size up to nearest multiple of the cacheline size. */
+	/*
+	 * Round size up to nearest multiple of the cacheline size, so that
+	 * there is no chance of false cache line sharing.
+	 */
 	csize = CACHELINE_CEILING(size);

+	usize = s2u(csize);
+	extent_node_init(&key, NULL, NULL, usize, false, false);
 	malloc_mutex_lock(&base_mtx);
-	/* Make sure there's enough space for the allocation. */
-	if ((uintptr_t)base_next_addr + csize > (uintptr_t)base_past_addr) {
-		if (base_pages_alloc(csize)) {
-			malloc_mutex_unlock(&base_mtx);
-			return (NULL);
-		}
-	}
-	/* Allocate. */
-	ret = base_next_addr;
-	base_next_addr = (void *)((uintptr_t)base_next_addr + csize);
-	malloc_mutex_unlock(&base_mtx);
-	VALGRIND_MAKE_MEM_UNDEFINED(ret, csize);
-
-	return (ret);
-}
-
-void *
-base_calloc(size_t number, size_t size)
-{
-	void *ret = base_alloc(number * size);
-
-	if (ret != NULL)
-		memset(ret, 0, number * size);
-
-	return (ret);
-}
-
-extent_node_t *
-base_node_alloc(void)
-{
-	extent_node_t *ret;
-
-	malloc_mutex_lock(&base_mtx);
-	if (base_nodes != NULL) {
-		ret = base_nodes;
-		base_nodes = *(extent_node_t **)ret;
-		malloc_mutex_unlock(&base_mtx);
-		VALGRIND_MAKE_MEM_UNDEFINED(ret, sizeof(extent_node_t));
+	node = extent_tree_szad_nsearch(&base_avail_szad, &key);
+	if (node != NULL) {
+		/* Use existing space. */
+		extent_tree_szad_remove(&base_avail_szad, node);
 	} else {
-		malloc_mutex_unlock(&base_mtx);
-		ret = (extent_node_t *)base_alloc(sizeof(extent_node_t));
+		/* Try to allocate more space. */
+		node = base_chunk_alloc(csize);
+	}
+	if (node == NULL) {
+		ret = NULL;
+		goto label_return;
 	}

+	ret = extent_node_addr_get(node);
+	if (extent_node_size_get(node) > csize) {
+		extent_node_addr_set(node, (void *)((uintptr_t)ret + csize));
+		extent_node_size_set(node, extent_node_size_get(node) - csize);
+		extent_tree_szad_insert(&base_avail_szad, node);
+	} else
+		base_node_dalloc(node);
+	if (config_stats) {
+		base_allocated += csize;
+		/*
+		 * Add one PAGE to base_resident for every page boundary that is
+		 * crossed by the new allocation.
+		 */
+		base_resident += PAGE_CEILING((uintptr_t)ret + csize) -
+		    PAGE_CEILING((uintptr_t)ret);
+	}
+	JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, csize);
+label_return:
+	malloc_mutex_unlock(&base_mtx);
 	return (ret);
 }

 void
-base_node_dealloc(extent_node_t *node)
+base_stats_get(size_t *allocated, size_t *resident, size_t *mapped)
 {

-	VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t));
 	malloc_mutex_lock(&base_mtx);
-	*(extent_node_t **)node = base_nodes;
-	base_nodes = node;
+	assert(base_allocated <= base_resident);
+	assert(base_resident <= base_mapped);
+	*allocated = base_allocated;
+	*resident = base_resident;
+	*mapped = base_mapped;
 	malloc_mutex_unlock(&base_mtx);
 }

@ -113,9 +144,10 @@ bool
 base_boot(void)
 {

-	base_nodes = NULL;
 	if (malloc_mutex_init(&base_mtx))
 		return (true);
+	extent_tree_szad_new(&base_avail_szad);
+	base_nodes = NULL;

 	return (false);
 }
--- a/src/bitmap.c
+++ b/src/bitmap.c
@ -2,19 +2,6 @@
 #include "jemalloc/internal/jemalloc_internal.h"

 /******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static size_t	bits2groups(size_t nbits);
-
-/******************************************************************************/
-
-static size_t
-bits2groups(size_t nbits)
-{
-
-	return ((nbits >> LG_BITMAP_GROUP_NBITS) +
-	    !!(nbits & BITMAP_GROUP_NBITS_MASK));
-}

 void
 bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
@ -31,15 +18,16 @@ bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
 	 * that requires only one group.
 	 */
 	binfo->levels[0].group_offset = 0;
-	group_count = bits2groups(nbits);
+	group_count = BITMAP_BITS2GROUPS(nbits);
 	for (i = 1; group_count > 1; i++) {
 		assert(i < BITMAP_MAX_LEVELS);
 		binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
 		    + group_count;
-		group_count = bits2groups(group_count);
+		group_count = BITMAP_BITS2GROUPS(group_count);
 	}
 	binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
 	    + group_count;
+	assert(binfo->levels[i].group_offset <= BITMAP_GROUPS_MAX);
 	binfo->nlevels = i;
 	binfo->nbits = nbits;
 }
--- a/src/chunk.c
+++ b/src/chunk.c
@ -5,129 +5,315 @@
 /* Data. */

 const char	*opt_dss = DSS_DEFAULT;
-size_t		opt_lg_chunk = LG_CHUNK_DEFAULT;
+size_t		opt_lg_chunk = 0;

-malloc_mutex_t	chunks_mtx;
-chunk_stats_t	stats_chunks;
+/* Used exclusively for gdump triggering. */
+static size_t	curchunks;
+static size_t	highchunks;

-/*
- * Trees of chunks that were previously allocated (trees differ only in node
- * ordering).  These are used when allocating chunks, in an attempt to re-use
- * address space.  Depending on function, different tree orderings are needed,
- * which is why there are two trees with the same contents.
- */
-static extent_tree_t	chunks_szad_mmap;
-static extent_tree_t	chunks_ad_mmap;
-static extent_tree_t	chunks_szad_dss;
-static extent_tree_t	chunks_ad_dss;
-
-rtree_t		*chunks_rtree;
+rtree_t		chunks_rtree;

 /* Various chunk-related settings. */
 size_t		chunksize;
 size_t		chunksize_mask; /* (chunksize - 1). */
 size_t		chunk_npages;
-size_t		map_bias;
-size_t		arena_maxclass; /* Max size class for arenas. */
+
+static void	*chunk_alloc_default(void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit, unsigned arena_ind);
+static bool	chunk_dalloc_default(void *chunk, size_t size, bool committed,
+    unsigned arena_ind);
+static bool	chunk_commit_default(void *chunk, size_t size, size_t offset,
+    size_t length, unsigned arena_ind);
+static bool	chunk_decommit_default(void *chunk, size_t size, size_t offset,
+    size_t length, unsigned arena_ind);
+static bool	chunk_purge_default(void *chunk, size_t size, size_t offset,
+    size_t length, unsigned arena_ind);
+static bool	chunk_split_default(void *chunk, size_t size, size_t size_a,
+    size_t size_b, bool committed, unsigned arena_ind);
+static bool	chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b,
+    size_t size_b, bool committed, unsigned arena_ind);
+
+const chunk_hooks_t	chunk_hooks_default = {
+	chunk_alloc_default,
+	chunk_dalloc_default,
+	chunk_commit_default,
+	chunk_decommit_default,
+	chunk_purge_default,
+	chunk_split_default,
+	chunk_merge_default
+};

 /******************************************************************************/
-/* Function prototypes for non-inline static functions. */
+/*
+ * Function prototypes for static functions that are referenced prior to
+ * definition.
+ */

-static void	*chunk_recycle(extent_tree_t *chunks_szad,
-    extent_tree_t *chunks_ad, size_t size, size_t alignment, bool base,
-    bool *zero);
-static void	chunk_record(extent_tree_t *chunks_szad,
-    extent_tree_t *chunks_ad, void *chunk, size_t size);
+static void	chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache,
+    void *chunk, size_t size, bool zeroed, bool committed);

 /******************************************************************************/

+/*
+ * Return a copy of arena->chunk_hooks without acquiring any lock.
+ * chunk_hooks_get() is the variant that takes arena->chunks_mtx itself; this
+ * one is for callers that already hold it.
+ */
+static chunk_hooks_t
+chunk_hooks_get_locked(arena_t *arena)
+{
+
+	return (arena->chunk_hooks);
+}
+
+/*
+ * Return a copy of arena's chunk hooks, read while holding
+ * arena->chunks_mtx.  The mutex is acquired and released inside this
+ * function.
+ */
+chunk_hooks_t
+chunk_hooks_get(arena_t *arena)
+{
+	chunk_hooks_t chunk_hooks;
+
+	malloc_mutex_lock(&arena->chunks_mtx);
+	chunk_hooks = chunk_hooks_get_locked(arena);
+	malloc_mutex_unlock(&arena->chunks_mtx);
+
+	return (chunk_hooks);
+}
+
+/*
+ * Install *chunk_hooks as arena's chunk hooks and return a copy of the hooks
+ * that were in place before the call.  The whole update runs under
+ * arena->chunks_mtx, and each function pointer is stored individually with
+ * atomic_write_p().
+ */
+chunk_hooks_t
+chunk_hooks_set(arena_t *arena, const chunk_hooks_t *chunk_hooks)
+{
+	chunk_hooks_t old_chunk_hooks;
+
+	malloc_mutex_lock(&arena->chunks_mtx);
+	old_chunk_hooks = arena->chunk_hooks;
+	/*
+	 * Copy each field atomically so that it is impossible for readers to
+	 * see partially updated pointers.  There are places where readers only
+	 * need one hook function pointer (therefore no need to copy the
+	 * entirety of arena->chunk_hooks), and stale reads do not affect
+	 * correctness, so they perform unlocked reads.
+	 */
+	/*
+	 * The union lets the address of a typed hook member be handed to
+	 * atomic_write_p() as a void ** without a pointer cast.
+	 */
+#define	ATOMIC_COPY_HOOK(n) do {					\
+	union {								\
+		chunk_##n##_t	**n;					\
+		void		**v;					\
+	} u;								\
+	u.n = &arena->chunk_hooks.n;					\
+	atomic_write_p(u.v, chunk_hooks->n);				\
+} while (0)
+	ATOMIC_COPY_HOOK(alloc);
+	ATOMIC_COPY_HOOK(dalloc);
+	ATOMIC_COPY_HOOK(commit);
+	ATOMIC_COPY_HOOK(decommit);
+	ATOMIC_COPY_HOOK(purge);
+	ATOMIC_COPY_HOOK(split);
+	ATOMIC_COPY_HOOK(merge);
+#undef ATOMIC_COPY_HOOK
+	malloc_mutex_unlock(&arena->chunks_mtx);
+
+	return (old_chunk_hooks);
+}
+
+/*
+ * If *chunk_hooks is bytewise equal to CHUNK_HOOKS_INITIALIZER, overwrite it
+ * with arena's current hooks; otherwise leave it unchanged.
+ *
+ * locked selects how the arena's hooks are read: true uses
+ * chunk_hooks_get_locked() (no locking here), false uses chunk_hooks_get()
+ * (which acquires arena->chunks_mtx).
+ */
+static void
+chunk_hooks_assure_initialized_impl(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    bool locked)
+{
+	static const chunk_hooks_t uninitialized_hooks =
+	    CHUNK_HOOKS_INITIALIZER;
+
+	if (memcmp(chunk_hooks, &uninitialized_hooks, sizeof(chunk_hooks_t)) ==
+	    0) {
+		*chunk_hooks = locked ? chunk_hooks_get_locked(arena) :
+		    chunk_hooks_get(arena);
+	}
+}
+
+/*
+ * Fill in *chunk_hooks from arena if it still holds the initializer value,
+ * reading the arena's hooks without acquiring arena->chunks_mtx (the
+ * locked == true path of chunk_hooks_assure_initialized_impl()).
+ */
+static void
+chunk_hooks_assure_initialized_locked(arena_t *arena,
+    chunk_hooks_t *chunk_hooks)
+{
+
+	chunk_hooks_assure_initialized_impl(arena, chunk_hooks, true);
+}
+
+/*
+ * Fill in *chunk_hooks from arena if it still holds the initializer value,
+ * acquiring arena->chunks_mtx for the read (the locked == false path of
+ * chunk_hooks_assure_initialized_impl()).
+ */
+static void
+chunk_hooks_assure_initialized(arena_t *arena, chunk_hooks_t *chunk_hooks)
+{
+
+	chunk_hooks_assure_initialized_impl(arena, chunk_hooks, false);
+}
+
+/*
+ * Record node as the chunks_rtree entry for chunk.  node's address must equal
+ * chunk (asserted).  Returns true if rtree_set() reports failure, false
+ * otherwise.
+ *
+ * When config_prof && opt_prof, the node's size in chunks (a size of 0 counts
+ * as one chunk) is added to curchunks, highchunks is raised to the new
+ * curchunks value if that is larger, and prof_gdump() is called if a new high
+ * was reached and prof_gdump_get_unlocked() is true.
+ */
+bool
+chunk_register(const void *chunk, const extent_node_t *node)
+{
+
+	assert(extent_node_addr_get(node) == chunk);
+
+	if (rtree_set(&chunks_rtree, (uintptr_t)chunk, node))
+		return (true);
+	if (config_prof && opt_prof) {
+		size_t size = extent_node_size_get(node);
+		size_t nadd = (size == 0) ? 1 : size / chunksize;
+		size_t cur = atomic_add_z(&curchunks, nadd);
+		size_t high = atomic_read_z(&highchunks);
+		/*
+		 * NOTE(review): this loop relies on atomic_cas_z() returning
+		 * true when the swap did NOT happen -- confirm against the
+		 * atomic_cas_z() definition.
+		 */
+		while (cur > high && atomic_cas_z(&highchunks, high, cur)) {
+			/*
+			 * Don't refresh cur, because it may have decreased
+			 * since this thread lost the highchunks update race.
+			 */
+			high = atomic_read_z(&highchunks);
+		}
+		if (cur > high && prof_gdump_get_unlocked())
+			prof_gdump();
+	}
+
+	return (false);
+}
+
+/*
+ * Clear the chunks_rtree entry for chunk by setting it to NULL; a failure
+ * from rtree_set() is checked only by assert.
+ *
+ * When config_prof && opt_prof, the node's size in chunks (a size of 0 counts
+ * as one chunk) is subtracted from curchunks.
+ */
+void
+chunk_deregister(const void *chunk, const extent_node_t *node)
+{
+	bool err;
+
+	err = rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL);
+	assert(!err);
+	if (config_prof && opt_prof) {
+		size_t size = extent_node_size_get(node);
+		size_t nsub = (size == 0) ? 1 : size / chunksize;
+		assert(atomic_read_z(&curchunks) >= nsub);
+		atomic_sub_z(&curchunks, nsub);
+	}
+}
+
+/*
+ * Do first-best-fit chunk selection, i.e. select the lowest chunk that best
+ * fits.
+ *
+ * Implemented as a single extent_tree_szad_nsearch() on chunks_szad, using a
+ * key with the requested size and a NULL address.  size must already be a
+ * multiple of the chunk size (asserted).  The result of the search is
+ * returned as-is.  chunks_ad is accepted but not used by this function.
+ */
+static extent_node_t *
+chunk_first_best_fit(arena_t *arena, extent_tree_t *chunks_szad,
+    extent_tree_t *chunks_ad, size_t size)
+{
+	extent_node_t key;
+
+	assert(size == CHUNK_CEILING(size));
+
+	extent_node_init(&key, arena, NULL, size, false, false);
+	return (extent_tree_szad_nsearch(chunks_szad, &key));
+}
+
 static void *
-chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
-    size_t alignment, bool base, bool *zero)
+chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache,
+    void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit,
+    bool dalloc_node)
 {
 	void *ret;
 	extent_node_t *node;
-	extent_node_t key;
 	size_t alloc_size, leadsize, trailsize;
-	bool zeroed;
+	bool zeroed, committed;

-	if (base) {
+	assert(new_addr == NULL || alignment == chunksize);
 	/*
-		 * This function may need to call base_node_{,de}alloc(), but
-		 * the current chunk allocation request is on behalf of the
-		 * base allocator.  Avoid deadlock (and if that weren't an
-		 * issue, potential for infinite recursion) by returning NULL.
+	 * Cached chunks use the node linkage embedded in their headers, in
+	 * which case dalloc_node is false, and new_addr is non-NULL because
+	 * we're operating on a specific chunk.
 	 */
-		return (NULL);
-	}
+	assert(dalloc_node || new_addr != NULL);

-	alloc_size = size + alignment - chunksize;
+	alloc_size = CHUNK_CEILING(s2u(size + alignment - chunksize));
 	/* Beware size_t wrap-around. */
 	if (alloc_size < size)
 		return (NULL);
-	key.addr = NULL;
-	key.size = alloc_size;
-	malloc_mutex_lock(&chunks_mtx);
-	node = extent_tree_szad_nsearch(chunks_szad, &key);
-	if (node == NULL) {
-		malloc_mutex_unlock(&chunks_mtx);
+	malloc_mutex_lock(&arena->chunks_mtx);
+	chunk_hooks_assure_initialized_locked(arena, chunk_hooks);
+	if (new_addr != NULL) {
+		extent_node_t key;
+		extent_node_init(&key, arena, new_addr, alloc_size, false,
+		    false);
+		node = extent_tree_ad_search(chunks_ad, &key);
+	} else {
+		node = chunk_first_best_fit(arena, chunks_szad, chunks_ad,
+		    alloc_size);
+	}
+	if (node == NULL || (new_addr != NULL && extent_node_size_get(node) <
+	    size)) {
+		malloc_mutex_unlock(&arena->chunks_mtx);
 		return (NULL);
 	}
-	leadsize = ALIGNMENT_CEILING((uintptr_t)node->addr, alignment) -
-	    (uintptr_t)node->addr;
-	assert(node->size >= leadsize + size);
-	trailsize = node->size - leadsize - size;
-	ret = (void *)((uintptr_t)node->addr + leadsize);
-	zeroed = node->zeroed;
+	leadsize = ALIGNMENT_CEILING((uintptr_t)extent_node_addr_get(node),
+	    alignment) - (uintptr_t)extent_node_addr_get(node);
+	assert(new_addr == NULL || leadsize == 0);
+	assert(extent_node_size_get(node) >= leadsize + size);
+	trailsize = extent_node_size_get(node) - leadsize - size;
+	ret = (void *)((uintptr_t)extent_node_addr_get(node) + leadsize);
+	zeroed = extent_node_zeroed_get(node);
 	if (zeroed)
 		*zero = true;
+	committed = extent_node_committed_get(node);
+	if (committed)
+		*commit = true;
+	/* Split the lead. */
+	if (leadsize != 0 &&
+	    chunk_hooks->split(extent_node_addr_get(node),
+	    extent_node_size_get(node), leadsize, size, false, arena->ind)) {
+		malloc_mutex_unlock(&arena->chunks_mtx);
+		return (NULL);
+	}
 	/* Remove node from the tree. */
 	extent_tree_szad_remove(chunks_szad, node);
 	extent_tree_ad_remove(chunks_ad, node);
+	arena_chunk_cache_maybe_remove(arena, node, cache);
 	if (leadsize != 0) {
 		/* Insert the leading space as a smaller chunk. */
-		node->size = leadsize;
+		extent_node_size_set(node, leadsize);
 		extent_tree_szad_insert(chunks_szad, node);
 		extent_tree_ad_insert(chunks_ad, node);
+		arena_chunk_cache_maybe_insert(arena, node, cache);
 		node = NULL;
 	}
 	if (trailsize != 0) {
-		/* Insert the trailing space as a smaller chunk. */
-		if (node == NULL) {
-			/*
-			 * An additional node is required, but
-			 * base_node_alloc() can cause a new base chunk to be
-			 * allocated.  Drop chunks_mtx in order to avoid
-			 * deadlock, and if node allocation fails, deallocate
-			 * the result before returning an error.
-			 */
-			malloc_mutex_unlock(&chunks_mtx);
-			node = base_node_alloc();
-			if (node == NULL) {
-				chunk_dealloc(ret, size, true);
+		/* Split the trail. */
+		if (chunk_hooks->split(ret, size + trailsize, size,
+		    trailsize, false, arena->ind)) {
+			if (dalloc_node && node != NULL)
+				arena_node_dalloc(arena, node);
+			malloc_mutex_unlock(&arena->chunks_mtx);
+			chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad,
+			    cache, ret, size + trailsize, zeroed, committed);
 			return (NULL);
 		}
-			malloc_mutex_lock(&chunks_mtx);
+		/* Insert the trailing space as a smaller chunk. */
+		if (node == NULL) {
+			node = arena_node_alloc(arena);
+			if (node == NULL) {
+				malloc_mutex_unlock(&arena->chunks_mtx);
+				chunk_record(arena, chunk_hooks, chunks_szad,
+				    chunks_ad, cache, ret, size + trailsize,
+				    zeroed, committed);
+				return (NULL);
 			}
-		node->addr = (void *)((uintptr_t)(ret) + size);
-		node->size = trailsize;
-		node->zeroed = zeroed;
+		}
+		extent_node_init(node, arena, (void *)((uintptr_t)(ret) + size),
+		    trailsize, zeroed, committed);
 		extent_tree_szad_insert(chunks_szad, node);
 		extent_tree_ad_insert(chunks_ad, node);
+		arena_chunk_cache_maybe_insert(arena, node, cache);
 		node = NULL;
 	}
-	malloc_mutex_unlock(&chunks_mtx);
+	if (!committed && chunk_hooks->commit(ret, size, 0, size, arena->ind)) {
+		malloc_mutex_unlock(&arena->chunks_mtx);
+		chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad, cache,
+		    ret, size, zeroed, committed);
+		return (NULL);
+	}
+	malloc_mutex_unlock(&arena->chunks_mtx);

-	if (node != NULL)
-		base_node_dealloc(node);
+	assert(dalloc_node || node != NULL);
+	if (dalloc_node && node != NULL)
+		arena_node_dalloc(arena, node);
 	if (*zero) {
-		if (zeroed == false)
+		if (!zeroed)
 			memset(ret, 0, size);
 		else if (config_debug) {
 			size_t i;
 			size_t *p = (size_t *)(uintptr_t)ret;

-			VALGRIND_MAKE_MEM_DEFINED(ret, size);
+			JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, size);
 			for (i = 0; i < size / sizeof(size_t); i++)
 				assert(p[i] == 0);
 		}
@ -136,138 +322,214 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
 }

 /*
- * If the caller specifies (*zero == false), it is still possible to receive
- * zeroed memory, in which case *zero is toggled to true.  arena_chunk_alloc()
- * takes advantage of this to avoid demanding zeroed chunks, but taking
- * advantage of them if they are returned.
+ * If the caller specifies (!*zero), it is still possible to receive zeroed
+ * memory, in which case *zero is toggled to true.  arena_chunk_alloc() takes
+ * advantage of this to avoid demanding zeroed chunks, while still benefiting
+ * from them if they are returned.
 */
-void *
-chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
-    dss_prec_t dss_prec)
+static void *
+chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment,
+    bool *zero, bool *commit, dss_prec_t dss_prec)
 {
 	void *ret;
+	chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER;

 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);
 	assert(alignment != 0);
 	assert((alignment & chunksize_mask) == 0);

+	/* Retained. */
+	if ((ret = chunk_recycle(arena, &chunk_hooks,
+	    &arena->chunks_szad_retained, &arena->chunks_ad_retained, false,
+	    new_addr, size, alignment, zero, commit, true)) != NULL)
+		return (ret);
+
 	/* "primary" dss. */
-	if (config_dss && dss_prec == dss_prec_primary) {
-		if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size,
-		    alignment, base, zero)) != NULL)
-			goto label_return;
-		if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
-			goto label_return;
-	}
-	/* mmap. */
-	if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, size,
-	    alignment, base, zero)) != NULL)
-		goto label_return;
-	if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL)
-		goto label_return;
+	if (have_dss && dss_prec == dss_prec_primary && (ret =
+	    chunk_alloc_dss(arena, new_addr, size, alignment, zero, commit)) !=
+	    NULL)
+		return (ret);
+	/*
+	 * mmap.  Requesting an address is not implemented for
+	 * chunk_alloc_mmap(), so only call it if (new_addr == NULL).
+	 */
+	if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero,
+	    commit)) != NULL)
+		return (ret);
 	/* "secondary" dss. */
-	if (config_dss && dss_prec == dss_prec_secondary) {
-		if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size,
-		    alignment, base, zero)) != NULL)
-			goto label_return;
-		if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
-			goto label_return;
-	}
+	if (have_dss && dss_prec == dss_prec_secondary && (ret =
+	    chunk_alloc_dss(arena, new_addr, size, alignment, zero, commit)) !=
+	    NULL)
+		return (ret);

 	/* All strategies for allocation failed. */
-	ret = NULL;
-label_return:
-	if (ret != NULL) {
-		if (config_ivsalloc && base == false) {
-			if (rtree_set(chunks_rtree, (uintptr_t)ret, 1)) {
-				chunk_dealloc(ret, size, true);
 	return (NULL);
-			}
-		}
-		if (config_stats || config_prof) {
-			bool gdump;
-			malloc_mutex_lock(&chunks_mtx);
-			if (config_stats)
-				stats_chunks.nchunks += (size / chunksize);
-			stats_chunks.curchunks += (size / chunksize);
-			if (stats_chunks.curchunks > stats_chunks.highchunks) {
-				stats_chunks.highchunks =
-				    stats_chunks.curchunks;
-				if (config_prof)
-					gdump = true;
-			} else if (config_prof)
-				gdump = false;
-			malloc_mutex_unlock(&chunks_mtx);
-			if (config_prof && opt_prof && opt_prof_gdump && gdump)
-				prof_gdump();
-		}
+}
+
+void *
+chunk_alloc_base(size_t size)
+{
+	void *ret;
+	bool zero, commit;
+
+	/*
+	 * Directly call chunk_alloc_mmap() rather than chunk_alloc_core()
+	 * because it's critical that chunk_alloc_base() return untouched
+	 * demand-zeroed virtual memory.
+	 */
+	zero = true;
+	commit = true;
+	ret = chunk_alloc_mmap(size, chunksize, &zero, &commit);
+	if (ret == NULL)
+		return (NULL);
 	if (config_valgrind)
-			VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
-	}
-	assert(CHUNK_ADDR2BASE(ret) == ret);
+		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
+
+	return (ret);
+}
+
+void *
+chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr,
+    size_t size, size_t alignment, bool *zero, bool dalloc_node)
+{
+	void *ret;
+	bool commit;
+
+	assert(size != 0);
+	assert((size & chunksize_mask) == 0);
+	assert(alignment != 0);
+	assert((alignment & chunksize_mask) == 0);
+
+	commit = true;
+	ret = chunk_recycle(arena, chunk_hooks, &arena->chunks_szad_cached,
+	    &arena->chunks_ad_cached, true, new_addr, size, alignment, zero,
+	    &commit, dalloc_node);
+	if (ret == NULL)
+		return (NULL);
+	assert(commit);
+	if (config_valgrind)
+		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
+	return (ret);
+}
+
+static arena_t *
+chunk_arena_get(unsigned arena_ind)
+{
+	arena_t *arena;
+
+	/* Dodge tsd for a0 in order to avoid bootstrapping issues. */
+	arena = (arena_ind == 0) ? a0get() : arena_get(tsd_fetch(), arena_ind,
+	     false, true);
+	/*
+	 * The arena we're allocating on behalf of must have been initialized
+	 * already.
+	 */
+	assert(arena != NULL);
+	return (arena);
+}
+
+static void *
+chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero,
+    bool *commit, unsigned arena_ind)
+{
+	void *ret;
+	arena_t *arena;
+
+	arena = chunk_arena_get(arena_ind);
+	ret = chunk_alloc_core(arena, new_addr, size, alignment, zero,
+	    commit, arena->dss_prec);
+	if (ret == NULL)
+		return (NULL);
+	if (config_valgrind)
+		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
+
+	return (ret);
+}
+
+void *
+chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr,
+    size_t size, size_t alignment, bool *zero, bool *commit)
+{
+	void *ret;
+
+	chunk_hooks_assure_initialized(arena, chunk_hooks);
+	ret = chunk_hooks->alloc(new_addr, size, alignment, zero, commit,
+	    arena->ind);
+	if (ret == NULL)
+		return (NULL);
+	if (config_valgrind && chunk_hooks->alloc != chunk_alloc_default)
+		JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, chunksize);
 	return (ret);
 }

 static void
-chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
-    size_t size)
+chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks,
+    extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache,
+    void *chunk, size_t size, bool zeroed, bool committed)
 {
 	bool unzeroed;
-	extent_node_t *xnode, *node, *prev, *xprev, key;
+	extent_node_t *node, *prev;
+	extent_node_t key;

-	unzeroed = pages_purge(chunk, size);
-	VALGRIND_MAKE_MEM_NOACCESS(chunk, size);
+	assert(!cache || !zeroed);
+	unzeroed = cache || !zeroed;
+	JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size);

-	/*
-	 * Allocate a node before acquiring chunks_mtx even though it might not
-	 * be needed, because base_node_alloc() may cause a new base chunk to
-	 * be allocated, which could cause deadlock if chunks_mtx were already
-	 * held.
-	 */
-	xnode = base_node_alloc();
-	/* Use xprev to implement conditional deferred deallocation of prev. */
-	xprev = NULL;
-
-	malloc_mutex_lock(&chunks_mtx);
-	key.addr = (void *)((uintptr_t)chunk + size);
+	malloc_mutex_lock(&arena->chunks_mtx);
+	chunk_hooks_assure_initialized_locked(arena, chunk_hooks);
+	extent_node_init(&key, arena, (void *)((uintptr_t)chunk + size), 0,
+	    false, false);
 	node = extent_tree_ad_nsearch(chunks_ad, &key);
 	/* Try to coalesce forward. */
-	if (node != NULL && node->addr == key.addr) {
+	if (node != NULL && extent_node_addr_get(node) ==
+	    extent_node_addr_get(&key) && extent_node_committed_get(node) ==
+	    committed && !chunk_hooks->merge(chunk, size,
+	    extent_node_addr_get(node), extent_node_size_get(node), false,
+	    arena->ind)) {
 		/*
 		 * Coalesce chunk with the following address range.  This does
 		 * not change the position within chunks_ad, so only
 		 * remove/insert from/into chunks_szad.
 		 */
 		extent_tree_szad_remove(chunks_szad, node);
-		node->addr = chunk;
-		node->size += size;
-		node->zeroed = (node->zeroed && (unzeroed == false));
+		arena_chunk_cache_maybe_remove(arena, node, cache);
+		extent_node_addr_set(node, chunk);
+		extent_node_size_set(node, size + extent_node_size_get(node));
+		extent_node_zeroed_set(node, extent_node_zeroed_get(node) &&
+		    !unzeroed);
 		extent_tree_szad_insert(chunks_szad, node);
+		arena_chunk_cache_maybe_insert(arena, node, cache);
 	} else {
 		/* Coalescing forward failed, so insert a new node. */
-		if (xnode == NULL) {
+		node = arena_node_alloc(arena);
+		if (node == NULL) {
 			/*
-			 * base_node_alloc() failed, which is an exceedingly
-			 * unlikely failure.  Leak chunk; its pages have
-			 * already been purged, so this is only a virtual
-			 * memory leak.
+			 * Node allocation failed, which is an exceedingly
+			 * unlikely failure.  Leak chunk after making sure its
+			 * pages have already been purged, so that this is only
+			 * a virtual memory leak.
 			 */
+			if (cache) {
+				chunk_purge_wrapper(arena, chunk_hooks, chunk,
+				    size, 0, size);
+			}
 			goto label_return;
 		}
-		node = xnode;
-		xnode = NULL; /* Prevent deallocation below. */
-		node->addr = chunk;
-		node->size = size;
-		node->zeroed = (unzeroed == false);
+		extent_node_init(node, arena, chunk, size, !unzeroed,
+		    committed);
 		extent_tree_ad_insert(chunks_ad, node);
 		extent_tree_szad_insert(chunks_szad, node);
+		arena_chunk_cache_maybe_insert(arena, node, cache);
 	}

 	/* Try to coalesce backward. */
 	prev = extent_tree_ad_prev(chunks_ad, node);
-	if (prev != NULL && (void *)((uintptr_t)prev->addr + prev->size) ==
-	    chunk) {
+	if (prev != NULL && (void *)((uintptr_t)extent_node_addr_get(prev) +
+	    extent_node_size_get(prev)) == chunk &&
+	    extent_node_committed_get(prev) == committed &&
+	    !chunk_hooks->merge(extent_node_addr_get(prev),
+	    extent_node_size_get(prev), chunk, size, false, arena->ind)) {
 		/*
 		 * Coalesce chunk with the previous address range.  This does
 		 * not change the position within chunks_ad, so only
@ -275,44 +537,27 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
 		 */
 		extent_tree_szad_remove(chunks_szad, prev);
 		extent_tree_ad_remove(chunks_ad, prev);
-
+		arena_chunk_cache_maybe_remove(arena, prev, cache);
 		extent_tree_szad_remove(chunks_szad, node);
-		node->addr = prev->addr;
-		node->size += prev->size;
-		node->zeroed = (node->zeroed && prev->zeroed);
+		arena_chunk_cache_maybe_remove(arena, node, cache);
+		extent_node_addr_set(node, extent_node_addr_get(prev));
+		extent_node_size_set(node, extent_node_size_get(prev) +
+		    extent_node_size_get(node));
+		extent_node_zeroed_set(node, extent_node_zeroed_get(prev) &&
+		    extent_node_zeroed_get(node));
 		extent_tree_szad_insert(chunks_szad, node);
+		arena_chunk_cache_maybe_insert(arena, node, cache);

-		xprev = prev;
+		arena_node_dalloc(arena, prev);
 	}

 label_return:
-	malloc_mutex_unlock(&chunks_mtx);
-	/*
-	 * Deallocate xnode and/or xprev after unlocking chunks_mtx in order to
-	 * avoid potential deadlock.
-	 */
-	if (xnode != NULL)
-		base_node_dealloc(xnode);
-	if (xprev != NULL)
-		base_node_dealloc(xprev);
+	malloc_mutex_unlock(&arena->chunks_mtx);
 }

 void
-chunk_unmap(void *chunk, size_t size)
-{
-	assert(chunk != NULL);
-	assert(CHUNK_ADDR2BASE(chunk) == chunk);
-	assert(size != 0);
-	assert((size & chunksize_mask) == 0);
-
-	if (config_dss && chunk_in_dss(chunk))
-		chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size);
-	else if (chunk_dealloc_mmap(chunk, size))
-		chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size);
-}
-
-void
-chunk_dealloc(void *chunk, size_t size, bool unmap)
+chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk,
+    size_t size, bool committed)
 {

 	assert(chunk != NULL);
@ -320,22 +565,164 @@ chunk_dealloc(void *chunk, size_t size, bool unmap)
 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);

-	if (config_ivsalloc)
-		rtree_set(chunks_rtree, (uintptr_t)chunk, 0);
-	if (config_stats || config_prof) {
-		malloc_mutex_lock(&chunks_mtx);
-		assert(stats_chunks.curchunks >= (size / chunksize));
-		stats_chunks.curchunks -= (size / chunksize);
-		malloc_mutex_unlock(&chunks_mtx);
+	chunk_record(arena, chunk_hooks, &arena->chunks_szad_cached,
+	    &arena->chunks_ad_cached, true, chunk, size, false, committed);
+	arena_maybe_purge(arena);
+}
+
+void
+chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk,
+    size_t size, bool zeroed, bool committed)
+{
+
+	assert(chunk != NULL);
+	assert(CHUNK_ADDR2BASE(chunk) == chunk);
+	assert(size != 0);
+	assert((size & chunksize_mask) == 0);
+
+	chunk_hooks_assure_initialized(arena, chunk_hooks);
+	/* Try to deallocate. */
+	if (!chunk_hooks->dalloc(chunk, size, committed, arena->ind))
+		return;
+	/* Try to decommit; purge if that fails. */
+	if (committed) {
+		committed = chunk_hooks->decommit(chunk, size, 0, size,
+		    arena->ind);
 	}
+	zeroed = !committed || !chunk_hooks->purge(chunk, size, 0, size,
+	    arena->ind);
+	chunk_record(arena, chunk_hooks, &arena->chunks_szad_retained,
+	    &arena->chunks_ad_retained, false, chunk, size, zeroed, committed);
+}

-	if (unmap)
-		chunk_unmap(chunk, size);
+static bool
+chunk_dalloc_default(void *chunk, size_t size, bool committed,
+    unsigned arena_ind)
+{
+
+	if (!have_dss || !chunk_in_dss(chunk))
+		return (chunk_dalloc_mmap(chunk, size));
+	return (true);
+}
+
+void
+chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk,
+    size_t size, bool committed)
+{
+
+	chunk_hooks_assure_initialized(arena, chunk_hooks);
+	chunk_hooks->dalloc(chunk, size, committed, arena->ind);
+	if (config_valgrind && chunk_hooks->dalloc != chunk_dalloc_default)
+		JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size);
+}
+
+static bool
+chunk_commit_default(void *chunk, size_t size, size_t offset, size_t length,
+    unsigned arena_ind)
+{
+
+	return (pages_commit((void *)((uintptr_t)chunk + (uintptr_t)offset),
+	    length));
+}
+
+static bool
+chunk_decommit_default(void *chunk, size_t size, size_t offset, size_t length,
+    unsigned arena_ind)
+{
+
+	return (pages_decommit((void *)((uintptr_t)chunk + (uintptr_t)offset),
+	    length));
+}
+
+bool
+chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, size_t length)
+{
+
+	assert(chunk != NULL);
+	assert(CHUNK_ADDR2BASE(chunk) == chunk);
+	assert((offset & PAGE_MASK) == 0);
+	assert(length != 0);
+	assert((length & PAGE_MASK) == 0);
+
+	return (pages_purge((void *)((uintptr_t)chunk + (uintptr_t)offset),
+	    length));
+}
+
+static bool
+chunk_purge_default(void *chunk, size_t size, size_t offset, size_t length,
+    unsigned arena_ind)
+{
+
+	return (chunk_purge_arena(chunk_arena_get(arena_ind), chunk, offset,
+	    length));
+}
+
+bool
+chunk_purge_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk,
+    size_t size, size_t offset, size_t length)
+{
+
+	chunk_hooks_assure_initialized(arena, chunk_hooks);
+	return (chunk_hooks->purge(chunk, size, offset, length, arena->ind));
+}
+
+static bool
+chunk_split_default(void *chunk, size_t size, size_t size_a, size_t size_b,
+    bool committed, unsigned arena_ind)
+{
+
+	if (!maps_coalesce)
+		return (true);
+	return (false);
+}
+
+static bool
+chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b,
+    bool committed, unsigned arena_ind)
+{
+
+	if (!maps_coalesce)
+		return (true);
+	if (have_dss && chunk_in_dss(chunk_a) != chunk_in_dss(chunk_b))
+		return (true);
+
+	return (false);
+}
+
+static rtree_node_elm_t *
+chunks_rtree_node_alloc(size_t nelms)
+{
+
+	return ((rtree_node_elm_t *)base_alloc(nelms *
+	    sizeof(rtree_node_elm_t)));
 }

 bool
 chunk_boot(void)
 {
+#ifdef _WIN32
+	SYSTEM_INFO info;
+	GetSystemInfo(&info);
+
+	/*
+	 * Verify actual page size is equal to or an integral multiple of
+	 * configured page size.
+	 */
+	if (info.dwPageSize & ((1U << LG_PAGE) - 1))
+		return (true);
+
+	/*
+	 * Configure chunksize (if not set) to match granularity (usually 64K),
+	 * so pages_map will always take the fast path.
+	 */
+	if (!opt_lg_chunk) {
+		opt_lg_chunk = jemalloc_ffs((int)info.dwAllocationGranularity)
+		    - 1;
+	}
+#else
+	if (!opt_lg_chunk)
+		opt_lg_chunk = LG_CHUNK_DEFAULT;
+#endif

 	/* Set variables according to the value of opt_lg_chunk. */
 	chunksize = (ZU(1) << opt_lg_chunk);
@ -343,23 +730,11 @@ chunk_boot(void)
 	chunksize_mask = chunksize - 1;
 	chunk_npages = (chunksize >> LG_PAGE);

-	if (config_stats || config_prof) {
-		if (malloc_mutex_init(&chunks_mtx))
+	if (have_dss && chunk_dss_boot())
 		return (true);
-		memset(&stats_chunks, 0, sizeof(chunk_stats_t));
-	}
-	if (config_dss && chunk_dss_boot())
+	if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) -
+	    opt_lg_chunk, chunks_rtree_node_alloc, NULL))
 		return (true);
-	extent_tree_szad_new(&chunks_szad_mmap);
-	extent_tree_ad_new(&chunks_ad_mmap);
-	extent_tree_szad_new(&chunks_szad_dss);
-	extent_tree_ad_new(&chunks_ad_dss);
-	if (config_ivsalloc) {
-		chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) -
-		    opt_lg_chunk, base_alloc, NULL);
-		if (chunks_rtree == NULL)
-			return (true);
-	}

 	return (false);
 }
@ -368,9 +743,6 @@ void
 chunk_prefork(void)
 {

-	malloc_mutex_prefork(&chunks_mtx);
-	if (config_ivsalloc)
-		rtree_prefork(chunks_rtree);
 	chunk_dss_prefork();
 }

@ -379,9 +751,6 @@ chunk_postfork_parent(void)
 {

 	chunk_dss_postfork_parent();
-	if (config_ivsalloc)
-		rtree_postfork_parent(chunks_rtree);
-	malloc_mutex_postfork_parent(&chunks_mtx);
 }

 void
@ -389,7 +758,4 @@ chunk_postfork_child(void)
 {

 	chunk_dss_postfork_child();
-	if (config_ivsalloc)
-		rtree_postfork_child(chunks_rtree);
-	malloc_mutex_postfork_child(&chunks_mtx);
 }
--- a/src/chunk_dss.c
+++ b/src/chunk_dss.c
@ -32,7 +32,7 @@ static void *
 chunk_dss_sbrk(intptr_t increment)
 {

-#ifdef JEMALLOC_HAVE_SBRK
+#ifdef JEMALLOC_DSS
 	return (sbrk(increment));
 #else
 	not_implemented();
@ -45,7 +45,7 @@ chunk_dss_prec_get(void)
 {
 	dss_prec_t ret;

-	if (config_dss == false)
+	if (!have_dss)
 		return (dss_prec_disabled);
 	malloc_mutex_lock(&dss_mtx);
 	ret = dss_prec_default;
@ -57,8 +57,8 @@ bool
 chunk_dss_prec_set(dss_prec_t dss_prec)
 {

-	if (config_dss == false)
-		return (true);
+	if (!have_dss)
+		return (dss_prec != dss_prec_disabled);
 	malloc_mutex_lock(&dss_mtx);
 	dss_prec_default = dss_prec;
 	malloc_mutex_unlock(&dss_mtx);
@ -66,11 +66,12 @@ chunk_dss_prec_set(dss_prec_t dss_prec)
 }

 void *
-chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
+chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment,
+    bool *zero, bool *commit)
 {
 	void *ret;

-	cassert(config_dss);
+	cassert(have_dss);
 	assert(size > 0 && (size & chunksize_mask) == 0);
 	assert(alignment > 0 && (alignment & chunksize_mask) == 0);

@ -93,8 +94,17 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
 		 * malloc.
 		 */
 		do {
+			/* Avoid an unnecessary system call. */
+			if (new_addr != NULL && dss_max != new_addr)
+				break;
+
 			/* Get the current end of the DSS. */
 			dss_max = chunk_dss_sbrk(0);
+
+			/* Make sure the earlier condition still holds. */
+			if (new_addr != NULL && dss_max != new_addr)
+				break;
+
 			/*
 			 * Calculate how much padding is necessary to
 			 * chunk-align the end of the DSS.
@ -123,12 +133,20 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
 				/* Success. */
 				dss_max = dss_next;
 				malloc_mutex_unlock(&dss_mtx);
-				if (cpad_size != 0)
-					chunk_unmap(cpad, cpad_size);
+				if (cpad_size != 0) {
+					chunk_hooks_t chunk_hooks =
+					    CHUNK_HOOKS_INITIALIZER;
+					chunk_dalloc_wrapper(arena,
+					    &chunk_hooks, cpad, cpad_size,
+					    true);
+				}
 				if (*zero) {
-					VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
+					JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(
+					    ret, size);
 					memset(ret, 0, size);
 				}
+				if (!*commit)
+					*commit = pages_decommit(ret, size);
 				return (ret);
 			}
 		} while (dss_prev != (void *)-1);
@ -143,7 +161,7 @@ chunk_in_dss(void *chunk)
 {
 	bool ret;

-	cassert(config_dss);
+	cassert(have_dss);

 	malloc_mutex_lock(&dss_mtx);
 	if ((uintptr_t)chunk >= (uintptr_t)dss_base
@ -160,7 +178,7 @@ bool
 chunk_dss_boot(void)
 {

-	cassert(config_dss);
+	cassert(have_dss);

 	if (malloc_mutex_init(&dss_mtx))
 		return (true);
@ -175,7 +193,7 @@ void
 chunk_dss_prefork(void)
 {

-	if (config_dss)
+	if (have_dss)
 		malloc_mutex_prefork(&dss_mtx);
 }

@ -183,7 +201,7 @@ void
 chunk_dss_postfork_parent(void)
 {

-	if (config_dss)
+	if (have_dss)
 		malloc_mutex_postfork_parent(&dss_mtx);
 }

@ -191,7 +209,7 @@ void
 chunk_dss_postfork_child(void)
 {

-	if (config_dss)
+	if (have_dss)
 		malloc_mutex_postfork_child(&dss_mtx);
 }

--- a/src/chunk_mmap.c
+++ b/src/chunk_mmap.c
@ -1,146 +1,10 @@
 #define	JEMALLOC_CHUNK_MMAP_C_
 #include "jemalloc/internal/jemalloc_internal.h"

-/******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static void	*pages_map(void *addr, size_t size);
-static void	pages_unmap(void *addr, size_t size);
-static void	*chunk_alloc_mmap_slow(size_t size, size_t alignment,
-    bool *zero);
-
 /******************************************************************************/

 static void *
-pages_map(void *addr, size_t size)
-{
-	void *ret;
-
-	assert(size != 0);
-
-#ifdef _WIN32
-	/*
-	 * If VirtualAlloc can't allocate at the given address when one is
-	 * given, it fails and returns NULL.
-	 */
-	ret = VirtualAlloc(addr, size, MEM_COMMIT | MEM_RESERVE,
-	    PAGE_READWRITE);
-#else
-	/*
-	 * We don't use MAP_FIXED here, because it can cause the *replacement*
-	 * of existing mappings, and we only want to create new mappings.
-	 */
-	ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
-	    -1, 0);
-	assert(ret != NULL);
-
-	if (ret == MAP_FAILED)
-		ret = NULL;
-	else if (addr != NULL && ret != addr) {
-		/*
-		 * We succeeded in mapping memory, but not in the right place.
-		 */
-		if (munmap(ret, size) == -1) {
-			char buf[BUFERROR_BUF];
-
-			buferror(get_errno(), buf, sizeof(buf));
-			malloc_printf("<jemalloc: Error in munmap(): %s\n",
-			    buf);
-			if (opt_abort)
-				abort();
-		}
-		ret = NULL;
-	}
-#endif
-	assert(ret == NULL || (addr == NULL && ret != addr)
-	    || (addr != NULL && ret == addr));
-	return (ret);
-}
-
-static void
-pages_unmap(void *addr, size_t size)
-{
-
-#ifdef _WIN32
-	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
-#else
-	if (munmap(addr, size) == -1)
-#endif
-	{
-		char buf[BUFERROR_BUF];
-
-		buferror(get_errno(), buf, sizeof(buf));
-		malloc_printf("<jemalloc>: Error in "
-#ifdef _WIN32
-		              "VirtualFree"
-#else
-		              "munmap"
-#endif
-		              "(): %s\n", buf);
-		if (opt_abort)
-			abort();
-	}
-}
-
-static void *
-pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size)
-{
-	void *ret = (void *)((uintptr_t)addr + leadsize);
-
-	assert(alloc_size >= leadsize + size);
-#ifdef _WIN32
-	{
-		void *new_addr;
-
-		pages_unmap(addr, alloc_size);
-		new_addr = pages_map(ret, size);
-		if (new_addr == ret)
-			return (ret);
-		if (new_addr)
-			pages_unmap(new_addr, size);
-		return (NULL);
-	}
-#else
-	{
-		size_t trailsize = alloc_size - leadsize - size;
-
-		if (leadsize != 0)
-			pages_unmap(addr, leadsize);
-		if (trailsize != 0)
-			pages_unmap((void *)((uintptr_t)ret + size), trailsize);
-		return (ret);
-	}
-#endif
-}
-
-bool
-pages_purge(void *addr, size_t length)
-{
-	bool unzeroed;
-
-#ifdef _WIN32
-	VirtualAlloc(addr, length, MEM_RESET, PAGE_READWRITE);
-	unzeroed = true;
-#else
-#  ifdef JEMALLOC_PURGE_MADVISE_DONTNEED
-#    define JEMALLOC_MADV_PURGE MADV_DONTNEED
-#    define JEMALLOC_MADV_ZEROS true
-#  elif defined(JEMALLOC_PURGE_MADVISE_FREE)
-#    define JEMALLOC_MADV_PURGE MADV_FREE
-#    define JEMALLOC_MADV_ZEROS false
-#  else
-#    error "No method defined for purging unused dirty pages."
-#  endif
-	int err = madvise(addr, length, JEMALLOC_MADV_PURGE);
-	unzeroed = (JEMALLOC_MADV_ZEROS == false || err != 0);
-#  undef JEMALLOC_MADV_PURGE
-#  undef JEMALLOC_MADV_ZEROS
-#endif
-	return (unzeroed);
-}
-
-static void *
-chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero)
+chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero, bool *commit)
 {
 	void *ret, *pages;
 	size_t alloc_size, leadsize;
@ -160,11 +24,13 @@ chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero)

 	assert(ret != NULL);
 	*zero = true;
+	if (!*commit)
+		*commit = pages_decommit(ret, size);
 	return (ret);
 }

 void *
-chunk_alloc_mmap(size_t size, size_t alignment, bool *zero)
+chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, bool *commit)
 {
 	void *ret;
 	size_t offset;
@ -191,20 +57,22 @@ chunk_alloc_mmap(size_t size, size_t alignment, bool *zero)
 	offset = ALIGNMENT_ADDR2OFFSET(ret, alignment);
 	if (offset != 0) {
 		pages_unmap(ret, size);
-		return (chunk_alloc_mmap_slow(size, alignment, zero));
+		return (chunk_alloc_mmap_slow(size, alignment, zero, commit));
 	}

 	assert(ret != NULL);
 	*zero = true;
+	if (!*commit)
+		*commit = pages_decommit(ret, size);
 	return (ret);
 }

 bool
-chunk_dealloc_mmap(void *chunk, size_t size)
+chunk_dalloc_mmap(void *chunk, size_t size)
 {

 	if (config_munmap)
 		pages_unmap(chunk, size);

-	return (config_munmap == false);
+	return (!config_munmap);
 }
--- a/src/ckh.c
+++ b/src/ckh.c
@ -40,8 +40,8 @@
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */

-static bool	ckh_grow(ckh_t *ckh);
-static void	ckh_shrink(ckh_t *ckh);
+static bool	ckh_grow(tsd_t *tsd, ckh_t *ckh);
+static void	ckh_shrink(tsd_t *tsd, ckh_t *ckh);

 /******************************************************************************/

@ -185,7 +185,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
 		}

 		bucket = tbucket;
-		if (ckh_try_bucket_insert(ckh, bucket, key, data) == false)
+		if (!ckh_try_bucket_insert(ckh, bucket, key, data))
 			return (false);
 	}
 }
@ -201,12 +201,12 @@ ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata)

 	/* Try to insert in primary bucket. */
 	bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1);
-	if (ckh_try_bucket_insert(ckh, bucket, key, data) == false)
+	if (!ckh_try_bucket_insert(ckh, bucket, key, data))
 		return (false);

 	/* Try to insert in secondary bucket. */
 	bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1);
-	if (ckh_try_bucket_insert(ckh, bucket, key, data) == false)
+	if (!ckh_try_bucket_insert(ckh, bucket, key, data))
 		return (false);

 	/*
@ -243,7 +243,7 @@ ckh_rebuild(ckh_t *ckh, ckhc_t *aTab)
 }

 static bool
-ckh_grow(ckh_t *ckh)
+ckh_grow(tsd_t *tsd, ckh_t *ckh)
 {
 	bool ret;
 	ckhc_t *tab, *ttab;
@ -270,7 +270,8 @@ ckh_grow(ckh_t *ckh)
 			ret = true;
 			goto label_return;
 		}
-		tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
+		tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL,
+		    true, NULL);
 		if (tab == NULL) {
 			ret = true;
 			goto label_return;
@ -281,13 +282,13 @@ ckh_grow(ckh_t *ckh)
 		tab = ttab;
 		ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;

-		if (ckh_rebuild(ckh, tab) == false) {
-			idalloc(tab);
+		if (!ckh_rebuild(ckh, tab)) {
+			idalloctm(tsd, tab, tcache_get(tsd, false), true);
 			break;
 		}

 		/* Rebuilding failed, so back out partially rebuilt table. */
-		idalloc(ckh->tab);
+		idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true);
 		ckh->tab = tab;
 		ckh->lg_curbuckets = lg_prevbuckets;
 	}
@ -298,7 +299,7 @@ label_return:
 }

 static void
-ckh_shrink(ckh_t *ckh)
+ckh_shrink(tsd_t *tsd, ckh_t *ckh)
 {
 	ckhc_t *tab, *ttab;
 	size_t lg_curcells, usize;
@ -313,7 +314,8 @@ ckh_shrink(ckh_t *ckh)
 	usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
 	if (usize == 0)
 		return;
-	tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
+	tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true,
+	    NULL);
 	if (tab == NULL) {
 		/*
 		 * An OOM error isn't worth propagating, since it doesn't
@ -327,8 +329,8 @@ ckh_shrink(ckh_t *ckh)
 	tab = ttab;
 	ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;

-	if (ckh_rebuild(ckh, tab) == false) {
-		idalloc(tab);
+	if (!ckh_rebuild(ckh, tab)) {
+		idalloctm(tsd, tab, tcache_get(tsd, false), true);
 #ifdef CKH_COUNT
 		ckh->nshrinks++;
 #endif
@ -336,7 +338,7 @@ ckh_shrink(ckh_t *ckh)
 	}

 	/* Rebuilding failed, so back out partially rebuilt table. */
-	idalloc(ckh->tab);
+	idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true);
 	ckh->tab = tab;
 	ckh->lg_curbuckets = lg_prevbuckets;
 #ifdef CKH_COUNT
@ -345,7 +347,8 @@ ckh_shrink(ckh_t *ckh)
 }

 bool
-ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
+ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
+    ckh_keycomp_t *keycomp)
 {
 	bool ret;
 	size_t mincells, usize;
@ -366,10 +369,10 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
 	ckh->count = 0;

 	/*
-	 * Find the minimum power of 2 that is large enough to fit aBaseCount
+	 * Find the minimum power of 2 that is large enough to fit minitems
 	 * entries.  We are using (2+,2) cuckoo hashing, which has an expected
 	 * maximum load factor of at least ~0.86, so 0.75 is a conservative load
-	 * factor that will typically allow 2^aLgMinItems to fit without ever
+	 * factor that will typically allow mincells items to fit without ever
 	 * growing the table.
 	 */
 	assert(LG_CKH_BUCKET_CELLS > 0);
@ -388,7 +391,8 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
 		ret = true;
 		goto label_return;
 	}
-	ckh->tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
+	ckh->tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true,
+	    NULL);
 	if (ckh->tab == NULL) {
 		ret = true;
 		goto label_return;
@ -400,16 +404,16 @@ label_return:
 }

 void
-ckh_delete(ckh_t *ckh)
+ckh_delete(tsd_t *tsd, ckh_t *ckh)
 {

 	assert(ckh != NULL);

 #ifdef CKH_VERBOSE
 	malloc_printf(
-	    "%s(%p): ngrows: %"PRIu64", nshrinks: %"PRIu64","
-	    " nshrinkfails: %"PRIu64", ninserts: %"PRIu64","
-	    " nrelocs: %"PRIu64"\n", __func__, ckh,
+	    "%s(%p): ngrows: %"FMTu64", nshrinks: %"FMTu64","
+	    " nshrinkfails: %"FMTu64", ninserts: %"FMTu64","
+	    " nrelocs: %"FMTu64"\n", __func__, ckh,
 	    (unsigned long long)ckh->ngrows,
 	    (unsigned long long)ckh->nshrinks,
 	    (unsigned long long)ckh->nshrinkfails,
@ -417,7 +421,7 @@ ckh_delete(ckh_t *ckh)
 	    (unsigned long long)ckh->nrelocs);
 #endif

-	idalloc(ckh->tab);
+	idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true);
 	if (config_debug)
 		memset(ckh, 0x5a, sizeof(ckh_t));
 }
@ -452,7 +456,7 @@ ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data)
 }

 bool
-ckh_insert(ckh_t *ckh, const void *key, const void *data)
+ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data)
 {
 	bool ret;

@ -464,7 +468,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data)
 #endif

 	while (ckh_try_insert(ckh, &key, &data)) {
-		if (ckh_grow(ckh)) {
+		if (ckh_grow(tsd, ckh)) {
 			ret = true;
 			goto label_return;
 		}
@ -476,7 +480,8 @@ label_return:
 }

 bool
-ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data)
+ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
+    void **data)
 {
 	size_t cell;

@ -497,7 +502,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data)
 		    + LG_CKH_BUCKET_CELLS - 2)) && ckh->lg_curbuckets
 		    > ckh->lg_minbuckets) {
 			/* Ignore error due to OOM. */
-			ckh_shrink(ckh);
+			ckh_shrink(tsd, ckh);
 		}

 		return (false);
--- a/src/ctl.c
+++ b/src/ctl.c
--- a/src/extent.c
+++ b/src/extent.c
@ -3,17 +3,32 @@

 /******************************************************************************/

-static inline int
+JEMALLOC_INLINE_C size_t
+extent_quantize(size_t size)
+{
+
+	/*
+	 * Round down to the nearest chunk size that can actually be requested
+	 * during normal huge allocation.
+	 */
+	return (index2size(size2index(size + 1) - 1));
+}
+
+JEMALLOC_INLINE_C int
 extent_szad_comp(extent_node_t *a, extent_node_t *b)
 {
 	int ret;
-	size_t a_size = a->size;
-	size_t b_size = b->size;
+	size_t a_qsize = extent_quantize(extent_node_size_get(a));
+	size_t b_qsize = extent_quantize(extent_node_size_get(b));

-	ret = (a_size > b_size) - (a_size < b_size);
+	/*
+	 * Compare based on quantized size rather than size, in order to sort
+	 * equally useful extents only by address.
+	 */
+	ret = (a_qsize > b_qsize) - (a_qsize < b_qsize);
 	if (ret == 0) {
-		uintptr_t a_addr = (uintptr_t)a->addr;
-		uintptr_t b_addr = (uintptr_t)b->addr;
+		uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a);
+		uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b);

 		ret = (a_addr > b_addr) - (a_addr < b_addr);
 	}
@ -22,18 +37,17 @@ extent_szad_comp(extent_node_t *a, extent_node_t *b)
 }

 /* Generate red-black tree functions. */
-rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, link_szad,
+rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, szad_link,
    extent_szad_comp)

-static inline int
+JEMALLOC_INLINE_C int
 extent_ad_comp(extent_node_t *a, extent_node_t *b)
 {
-	uintptr_t a_addr = (uintptr_t)a->addr;
-	uintptr_t b_addr = (uintptr_t)b->addr;
+	uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a);
+	uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b);

 	return ((a_addr > b_addr) - (a_addr < b_addr));
 }

 /* Generate red-black tree functions. */
-rb_gen(, extent_tree_ad_, extent_tree_t, extent_node_t, link_ad,
-    extent_ad_comp)
+rb_gen(, extent_tree_ad_, extent_tree_t, extent_node_t, ad_link, extent_ad_comp)
--- a/src/huge.c
+++ b/src/huge.c
@ -2,44 +2,68 @@
 #include "jemalloc/internal/jemalloc_internal.h"

 /******************************************************************************/
-/* Data. */

-uint64_t	huge_nmalloc;
-uint64_t	huge_ndalloc;
-size_t		huge_allocated;
+static extent_node_t *
+huge_node_get(const void *ptr)
+{
+	extent_node_t *node;

-malloc_mutex_t	huge_mtx;
+	node = chunk_lookup(ptr, true);
+	assert(!extent_node_achunk_get(node));

-/******************************************************************************/
+	return (node);
+}

-/* Tree of chunks that are stand-alone huge allocations. */
-static extent_tree_t	huge;
-
-void *
-huge_malloc(size_t size, bool zero, dss_prec_t dss_prec)
+static bool
+huge_node_set(const void *ptr, extent_node_t *node)
 {

-	return (huge_palloc(size, chunksize, zero, dss_prec));
+	assert(extent_node_addr_get(node) == ptr);
+	assert(!extent_node_achunk_get(node));
+	return (chunk_register(ptr, node));
+}
+
+static void
+huge_node_unset(const void *ptr, const extent_node_t *node)
+{
+
+	chunk_deregister(ptr, node);
 }

 void *
-huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec)
+huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero,
+    tcache_t *tcache)
+{
+	size_t usize;
+
+	usize = s2u(size);
+	if (usize == 0) {
+		/* size_t overflow. */
+		return (NULL);
+	}
+
+	return (huge_palloc(tsd, arena, usize, chunksize, zero, tcache));
+}
+
+void *
+huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
+    bool zero, tcache_t *tcache)
 {
 	void *ret;
-	size_t csize;
+	size_t usize;
 	extent_node_t *node;
 	bool is_zeroed;

 	/* Allocate one or more contiguous chunks for this request. */

-	csize = CHUNK_CEILING(size);
-	if (csize == 0) {
-		/* size is large enough to cause size_t wrap-around. */
+	usize = sa2u(size, alignment);
+	if (unlikely(usize == 0))
 		return (NULL);
-	}
+	assert(usize >= chunksize);

 	/* Allocate an extent node with which to track the chunk. */
-	node = base_node_alloc();
+	node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)),
+	    CACHELINE, false, tcache, true, arena);
 	if (node == NULL)
 		return (NULL);

@ -48,145 +72,33 @@ huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec)
 	 * it is possible to make correct junk/zero fill decisions below.
 	 */
 	is_zeroed = zero;
-	ret = chunk_alloc(csize, alignment, false, &is_zeroed, dss_prec);
-	if (ret == NULL) {
-		base_node_dealloc(node);
+	arena = arena_choose(tsd, arena);
+	if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena,
+	    size, alignment, &is_zeroed)) == NULL) {
+		idalloctm(tsd, node, tcache, true);
+		return (NULL);
+	}
+
+	extent_node_init(node, arena, ret, size, is_zeroed, true);
+
+	if (huge_node_set(ret, node)) {
+		arena_chunk_dalloc_huge(arena, ret, size);
+		idalloctm(tsd, node, tcache, true);
 		return (NULL);
 	}

 	/* Insert node into huge. */
-	node->addr = ret;
-	node->size = csize;
+	malloc_mutex_lock(&arena->huge_mtx);
+	ql_elm_new(node, ql_link);
+	ql_tail_insert(&arena->huge, node, ql_link);
+	malloc_mutex_unlock(&arena->huge_mtx);

-	malloc_mutex_lock(&huge_mtx);
-	extent_tree_ad_insert(&huge, node);
-	if (config_stats) {
-		stats_cactive_add(csize);
-		huge_nmalloc++;
-		huge_allocated += csize;
-	}
-	malloc_mutex_unlock(&huge_mtx);
+	if (zero || (config_fill && unlikely(opt_zero))) {
+		if (!is_zeroed)
+			memset(ret, 0, size);
+	} else if (config_fill && unlikely(opt_junk_alloc))
+		memset(ret, 0xa5, size);

-	if (config_fill && zero == false) {
-		if (opt_junk)
-			memset(ret, 0xa5, csize);
-		else if (opt_zero && is_zeroed == false)
-			memset(ret, 0, csize);
-	}
-
-	return (ret);
-}
-
-bool
-huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra)
-{
-
-	/*
-	 * Avoid moving the allocation if the size class can be left the same.
-	 */
-	if (oldsize > arena_maxclass
-	    && CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size)
-	    && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) {
-		assert(CHUNK_CEILING(oldsize) == oldsize);
-		return (false);
-	}
-
-	/* Reallocation would require a move. */
-	return (true);
-}
-
-void *
-huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec)
-{
-	void *ret;
-	size_t copysize;
-
-	/* Try to avoid moving the allocation. */
-	if (huge_ralloc_no_move(ptr, oldsize, size, extra) == false)
-		return (ptr);
-
-	/*
-	 * size and oldsize are different enough that we need to use a
-	 * different size class.  In that case, fall back to allocating new
-	 * space and copying.
-	 */
-	if (alignment > chunksize)
-		ret = huge_palloc(size + extra, alignment, zero, dss_prec);
-	else
-		ret = huge_malloc(size + extra, zero, dss_prec);
-
-	if (ret == NULL) {
-		if (extra == 0)
-			return (NULL);
-		/* Try again, this time without extra. */
-		if (alignment > chunksize)
-			ret = huge_palloc(size, alignment, zero, dss_prec);
-		else
-			ret = huge_malloc(size, zero, dss_prec);
-
-		if (ret == NULL)
-			return (NULL);
-	}
-
-	/*
-	 * Copy at most size bytes (not size+extra), since the caller has no
-	 * expectation that the extra bytes will be reliably preserved.
-	 */
-	copysize = (size < oldsize) ? size : oldsize;
-
-#ifdef JEMALLOC_MREMAP
-	/*
-	 * Use mremap(2) if this is a huge-->huge reallocation, and neither the
-	 * source nor the destination are in dss.
-	 */
-	if (oldsize >= chunksize && (config_dss == false || (chunk_in_dss(ptr)
-	    == false && chunk_in_dss(ret) == false))) {
-		size_t newsize = huge_salloc(ret);
-
-		/*
-		 * Remove ptr from the tree of huge allocations before
-		 * performing the remap operation, in order to avoid the
-		 * possibility of another thread acquiring that mapping before
-		 * this one removes it from the tree.
-		 */
-		huge_dalloc(ptr, false);
-		if (mremap(ptr, oldsize, newsize, MREMAP_MAYMOVE|MREMAP_FIXED,
-		    ret) == MAP_FAILED) {
-			/*
-			 * Assuming no chunk management bugs in the allocator,
-			 * the only documented way an error can occur here is
-			 * if the application changed the map type for a
-			 * portion of the old allocation.  This is firmly in
-			 * undefined behavior territory, so write a diagnostic
-			 * message, and optionally abort.
-			 */
-			char buf[BUFERROR_BUF];
-
-			buferror(get_errno(), buf, sizeof(buf));
-			malloc_printf("<jemalloc>: Error in mremap(): %s\n",
-			    buf);
-			if (opt_abort)
-				abort();
-			memcpy(ret, ptr, copysize);
-			chunk_dealloc_mmap(ptr, oldsize);
-		} else if (config_fill && zero == false && opt_junk && oldsize
-		    < newsize) {
-			/*
-			 * mremap(2) clobbers the original mapping, so
-			 * junk/zero filling is not preserved.  There is no
-			 * need to zero fill here, since any trailing
-			 * uninititialized memory is demand-zeroed by the
-			 * kernel, but junk filling must be redone.
-			 */
-			memset(ret + oldsize, 0xa5, newsize - oldsize);
-		}
-	} else
-#endif
-	{
-		memcpy(ret, ptr, copysize);
-		iqalloct(ptr, try_tcache_dalloc);
-	}
 	return (ret);
 }

@ -198,12 +110,12 @@ static void
 huge_dalloc_junk(void *ptr, size_t usize)
 {

-	if (config_fill && config_dss && opt_junk) {
+	if (config_fill && have_dss && unlikely(opt_junk_free)) {
 		/*
 		 * Only bother junk filling if the chunk isn't about to be
 		 * unmapped.
 		 */
-		if (config_munmap == false || (config_dss && chunk_in_dss(ptr)))
+		if (!config_munmap || (have_dss && chunk_in_dss(ptr)))
 			memset(ptr, 0x5a, usize);
 	}
 }
@ -213,135 +125,317 @@ huge_dalloc_junk(void *ptr, size_t usize)
 huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl);
 #endif

-void
-huge_dalloc(void *ptr, bool unmap)
+static void
+huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize,
+    size_t size, size_t extra, bool zero)
 {
-	extent_node_t *node, key;
+	size_t usize_next;
+	extent_node_t *node;
+	arena_t *arena;
+	chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER;
+	bool zeroed;

-	malloc_mutex_lock(&huge_mtx);
+	/* Increase usize to incorporate extra. */
+	while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < oldsize)
+		usize = usize_next;

-	/* Extract from tree of huge allocations. */
-	key.addr = ptr;
-	node = extent_tree_ad_search(&huge, &key);
-	assert(node != NULL);
-	assert(node->addr == ptr);
-	extent_tree_ad_remove(&huge, node);
+	if (oldsize == usize)
+		return;

-	if (config_stats) {
-		stats_cactive_sub(node->size);
-		huge_ndalloc++;
-		huge_allocated -= node->size;
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+
+	/* Fill if necessary (shrinking). */
+	if (oldsize > usize) {
+		size_t sdiff = oldsize - usize;
+		zeroed = !chunk_purge_wrapper(arena, &chunk_hooks, ptr,
+		    CHUNK_CEILING(usize), usize, sdiff);
+		if (config_fill && unlikely(opt_junk_free)) {
+			memset((void *)((uintptr_t)ptr + usize), 0x5a, sdiff);
+			zeroed = false;
+		}
+	} else
+		zeroed = true;
+
+	malloc_mutex_lock(&arena->huge_mtx);
+	/* Update the size of the huge allocation. */
+	assert(extent_node_size_get(node) != usize);
+	extent_node_size_set(node, usize);
+	/* Clear node's zeroed field if zeroing failed above. */
+	extent_node_zeroed_set(node, extent_node_zeroed_get(node) && zeroed);
+	malloc_mutex_unlock(&arena->huge_mtx);
+
+	arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize);
+
+	/* Fill if necessary (growing). */
+	if (oldsize < usize) {
+		if (zero || (config_fill && unlikely(opt_zero))) {
+			if (!zeroed) {
+				memset((void *)((uintptr_t)ptr + oldsize), 0,
+				    usize - oldsize);
+			}
+		} else if (config_fill && unlikely(opt_junk_alloc)) {
+			memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize -
+			    oldsize);
+		}
+	}
+}
+
+static bool
+huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize)
+{
+	extent_node_t *node;
+	arena_t *arena;
+	chunk_hooks_t chunk_hooks;
+	size_t cdiff;
+	bool zeroed;
+
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+	chunk_hooks = chunk_hooks_get(arena);
+
+	/* Split excess chunks. */
+	cdiff = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize);
+	if (cdiff != 0 && chunk_hooks.split(ptr, CHUNK_CEILING(oldsize),
+	    CHUNK_CEILING(usize), cdiff, true, arena->ind))
+		return (true);
+
+	if (oldsize > usize) {
+		size_t sdiff = oldsize - usize;
+		zeroed = !chunk_purge_wrapper(arena, &chunk_hooks,
+		    CHUNK_ADDR2BASE((uintptr_t)ptr + usize),
+		    CHUNK_CEILING(usize), CHUNK_ADDR2OFFSET((uintptr_t)ptr +
+		    usize), sdiff);
+		if (config_fill && unlikely(opt_junk_free)) {
+			huge_dalloc_junk((void *)((uintptr_t)ptr + usize),
+			    sdiff);
+			zeroed = false;
+		}
+	} else
+		zeroed = true;
+
+	malloc_mutex_lock(&arena->huge_mtx);
+	/* Update the size of the huge allocation. */
+	extent_node_size_set(node, usize);
+	/* Clear node's zeroed field if zeroing failed above. */
+	extent_node_zeroed_set(node, extent_node_zeroed_get(node) && zeroed);
+	malloc_mutex_unlock(&arena->huge_mtx);
+
+	/* Zap the excess chunks. */
+	arena_chunk_ralloc_huge_shrink(arena, ptr, oldsize, usize);
+
+	return (false);
+}
+
+static bool
+huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) {
+	size_t usize;
+	extent_node_t *node;
+	arena_t *arena;
+	bool is_zeroed_subchunk, is_zeroed_chunk;
+
+	usize = s2u(size);
+	if (usize == 0) {
+		/* size_t overflow. */
+		return (true);
 	}

-	malloc_mutex_unlock(&huge_mtx);
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+	malloc_mutex_lock(&arena->huge_mtx);
+	is_zeroed_subchunk = extent_node_zeroed_get(node);
+	malloc_mutex_unlock(&arena->huge_mtx);

-	if (unmap)
-		huge_dalloc_junk(node->addr, node->size);
+	/*
+	 * Copy zero into is_zeroed_chunk and pass the copy to chunk_alloc(), so
+	 * that it is possible to make correct junk/zero fill decisions below.
+	 */
+	is_zeroed_chunk = zero;

-	chunk_dealloc(node->addr, node->size, unmap);
-
-	base_node_dealloc(node);
-}
-
-size_t
-huge_salloc(const void *ptr)
-{
-	size_t ret;
-	extent_node_t *node, key;
-
-	malloc_mutex_lock(&huge_mtx);
-
-	/* Extract from tree of huge allocations. */
-	key.addr = __DECONST(void *, ptr);
-	node = extent_tree_ad_search(&huge, &key);
-	assert(node != NULL);
-
-	ret = node->size;
-
-	malloc_mutex_unlock(&huge_mtx);
-
-	return (ret);
-}
-
-dss_prec_t
-huge_dss_prec_get(arena_t *arena)
-{
-
-	return (arena_dss_prec_get(choose_arena(arena)));
-}
-
-prof_ctx_t *
-huge_prof_ctx_get(const void *ptr)
-{
-	prof_ctx_t *ret;
-	extent_node_t *node, key;
-
-	malloc_mutex_lock(&huge_mtx);
-
-	/* Extract from tree of huge allocations. */
-	key.addr = __DECONST(void *, ptr);
-	node = extent_tree_ad_search(&huge, &key);
-	assert(node != NULL);
-
-	ret = node->prof_ctx;
-
-	malloc_mutex_unlock(&huge_mtx);
-
-	return (ret);
-}
-
-void
-huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
-{
-	extent_node_t *node, key;
-
-	malloc_mutex_lock(&huge_mtx);
-
-	/* Extract from tree of huge allocations. */
-	key.addr = __DECONST(void *, ptr);
-	node = extent_tree_ad_search(&huge, &key);
-	assert(node != NULL);
-
-	node->prof_ctx = ctx;
-
-	malloc_mutex_unlock(&huge_mtx);
-}
-
-bool
-huge_boot(void)
-{
-
-	/* Initialize chunks data. */
-	if (malloc_mutex_init(&huge_mtx))
+	if (arena_chunk_ralloc_huge_expand(arena, ptr, oldsize, usize,
+	     &is_zeroed_chunk))
 		return (true);
-	extent_tree_ad_new(&huge);

-	if (config_stats) {
-		huge_nmalloc = 0;
-		huge_ndalloc = 0;
-		huge_allocated = 0;
+	malloc_mutex_lock(&arena->huge_mtx);
+	/* Update the size of the huge allocation. */
+	extent_node_size_set(node, usize);
+	malloc_mutex_unlock(&arena->huge_mtx);
+
+	if (zero || (config_fill && unlikely(opt_zero))) {
+		if (!is_zeroed_subchunk) {
+			memset((void *)((uintptr_t)ptr + oldsize), 0,
+			    CHUNK_CEILING(oldsize) - oldsize);
+		}
+		if (!is_zeroed_chunk) {
+			memset((void *)((uintptr_t)ptr +
+			    CHUNK_CEILING(oldsize)), 0, usize -
+			    CHUNK_CEILING(oldsize));
+		}
+	} else if (config_fill && unlikely(opt_junk_alloc)) {
+		memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize -
+		    oldsize);
 	}

 	return (false);
 }

-void
-huge_prefork(void)
+bool
+huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
+    bool zero)
 {
+	size_t usize;

-	malloc_mutex_prefork(&huge_mtx);
+	/* Both allocations must be huge to avoid a move. */
+	if (oldsize < chunksize)
+		return (true);
+
+	assert(s2u(oldsize) == oldsize);
+	usize = s2u(size);
+	if (usize == 0) {
+		/* size_t overflow. */
+		return (true);
+	}
+
+	/*
+	 * Avoid moving the allocation if the existing chunk size accommodates
+	 * the new size.
+	 */
+	if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)
+	    && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(s2u(size+extra))) {
+		huge_ralloc_no_move_similar(ptr, oldsize, usize, size, extra,
+		    zero);
+		return (false);
+	}
+
+	/* Attempt to shrink the allocation in-place. */
+	if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize))
+		return (huge_ralloc_no_move_shrink(ptr, oldsize, usize));
+
+	/* Attempt to expand the allocation in-place. */
+	if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, zero)) {
+		if (extra == 0)
+			return (true);
+
+		/* Try again, this time without extra. */
+		return (huge_ralloc_no_move_expand(ptr, oldsize, size, zero));
+	}
+	return (false);
+}
+
+void *
+huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size,
+    size_t extra, size_t alignment, bool zero, tcache_t *tcache)
+{
+	void *ret;
+	size_t copysize;
+
+	/* Try to avoid moving the allocation. */
+	if (!huge_ralloc_no_move(ptr, oldsize, size, extra, zero))
+		return (ptr);
+
+	/*
+	 * size and oldsize are different enough that we need to use a
+	 * different size class.  In that case, fall back to allocating new
+	 * space and copying.
+	 */
+	if (alignment > chunksize) {
+		ret = huge_palloc(tsd, arena, size + extra, alignment, zero,
+		    tcache);
+	} else
+		ret = huge_malloc(tsd, arena, size + extra, zero, tcache);
+
+	if (ret == NULL) {
+		if (extra == 0)
+			return (NULL);
+		/* Try again, this time without extra. */
+		if (alignment > chunksize) {
+			ret = huge_palloc(tsd, arena, size, alignment, zero,
+			    tcache);
+		} else
+			ret = huge_malloc(tsd, arena, size, zero, tcache);
+
+		if (ret == NULL)
+			return (NULL);
+	}
+
+	/*
+	 * Copy at most size bytes (not size+extra), since the caller has no
+	 * expectation that the extra bytes will be reliably preserved.
+	 */
+	copysize = (size < oldsize) ? size : oldsize;
+	memcpy(ret, ptr, copysize);
+	isqalloc(tsd, ptr, oldsize, tcache);
+	return (ret);
 }

 void
-huge_postfork_parent(void)
+huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache)
+{
+	extent_node_t *node;
+	arena_t *arena;
+
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+	huge_node_unset(ptr, node);
+	malloc_mutex_lock(&arena->huge_mtx);
+	ql_remove(&arena->huge, node, ql_link);
+	malloc_mutex_unlock(&arena->huge_mtx);
+
+	huge_dalloc_junk(extent_node_addr_get(node),
+	    extent_node_size_get(node));
+	arena_chunk_dalloc_huge(extent_node_arena_get(node),
+	    extent_node_addr_get(node), extent_node_size_get(node));
+	idalloctm(tsd, node, tcache, true);
+}
+
+arena_t *
+huge_aalloc(const void *ptr)
 {

-	malloc_mutex_postfork_parent(&huge_mtx);
+	return (extent_node_arena_get(huge_node_get(ptr)));
+}
+
+size_t
+huge_salloc(const void *ptr)
+{
+	size_t size;
+	extent_node_t *node;
+	arena_t *arena;
+
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+	malloc_mutex_lock(&arena->huge_mtx);
+	size = extent_node_size_get(node);
+	malloc_mutex_unlock(&arena->huge_mtx);
+
+	return (size);
+}
+
+prof_tctx_t *
+huge_prof_tctx_get(const void *ptr)
+{
+	prof_tctx_t *tctx;
+	extent_node_t *node;
+	arena_t *arena;
+
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+	malloc_mutex_lock(&arena->huge_mtx);
+	tctx = extent_node_prof_tctx_get(node);
+	malloc_mutex_unlock(&arena->huge_mtx);
+
+	return (tctx);
 }

 void
-huge_postfork_child(void)
+huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx)
 {
+	extent_node_t *node;
+	arena_t *arena;

-	malloc_mutex_postfork_child(&huge_mtx);
+	node = huge_node_get(ptr);
+	arena = extent_node_arena_get(node);
+	malloc_mutex_lock(&arena->huge_mtx);
+	extent_node_prof_tctx_set(node, tctx);
+	malloc_mutex_unlock(&arena->huge_mtx);
 }
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
--- a/src/mutex.c
+++ b/src/mutex.c
@ -73,9 +73,13 @@ malloc_mutex_init(malloc_mutex_t *mutex)
 {

 #ifdef _WIN32
+#  if _WIN32_WINNT >= 0x0600
+	InitializeSRWLock(&mutex->lock);
+#  else
 	if (!InitializeCriticalSectionAndSpinCount(&mutex->lock,
 	    _CRT_SPINCOUNT))
 		return (true);
+#  endif
 #elif (defined(JEMALLOC_OSSPIN))
 	mutex->lock = 0;
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
@ -83,8 +87,8 @@ malloc_mutex_init(malloc_mutex_t *mutex)
 		mutex->postponed_next = postponed_mutexes;
 		postponed_mutexes = mutex;
 	} else {
-		if (_pthread_mutex_init_calloc_cb(&mutex->lock, base_calloc) !=
-		    0)
+		if (_pthread_mutex_init_calloc_cb(&mutex->lock,
+		    bootstrap_calloc) != 0)
 			return (true);
 	}
 #else
@ -140,7 +144,7 @@ mutex_boot(void)
 	postpone_init = false;
 	while (postponed_mutexes != NULL) {
 		if (_pthread_mutex_init_calloc_cb(&postponed_mutexes->lock,
-		    base_calloc) != 0)
+		    bootstrap_calloc) != 0)
 			return (true);
 		postponed_mutexes = postponed_mutexes->postponed_next;
 	}
--- a/src/pages.c
+++ b/src/pages.c
@ -0,0 +1,173 @@
+#define	JEMALLOC_PAGES_C_
+#include "jemalloc/internal/jemalloc_internal.h"
+
+/******************************************************************************/
+
+void *
+pages_map(void *addr, size_t size)
+{
+	void *ret;
+
+	assert(size != 0);
+
+#ifdef _WIN32
+	/*
+	 * If VirtualAlloc can't allocate at the given address when one is
+	 * given, it fails and returns NULL.
+	 */
+	ret = VirtualAlloc(addr, size, MEM_COMMIT | MEM_RESERVE,
+	    PAGE_READWRITE);
+#else
+	/*
+	 * We don't use MAP_FIXED here, because it can cause the *replacement*
+	 * of existing mappings, and we only want to create new mappings.
+	 */
+	ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+	    -1, 0);
+	assert(ret != NULL);
+
+	if (ret == MAP_FAILED)
+		ret = NULL;
+	else if (addr != NULL && ret != addr) {
+		/*
+		 * We succeeded in mapping memory, but not in the right place.
+		 */
+		pages_unmap(ret, size);
+		ret = NULL;
+	}
+#endif
+	assert(ret == NULL || (addr == NULL && ret != addr)
+	    || (addr != NULL && ret == addr));
+	return (ret);
+}
+
+void
+pages_unmap(void *addr, size_t size)
+{
+
+#ifdef _WIN32
+	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
+#else
+	if (munmap(addr, size) == -1)
+#endif
+	{
+		char buf[BUFERROR_BUF];
+
+		buferror(get_errno(), buf, sizeof(buf));
+		malloc_printf("<jemalloc>: Error in "
+#ifdef _WIN32
+		              "VirtualFree"
+#else
+		              "munmap"
+#endif
+		              "(): %s\n", buf);
+		if (opt_abort)
+			abort();
+	}
+}
+
+void *
+pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size)
+{
+	void *ret = (void *)((uintptr_t)addr + leadsize);
+
+	assert(alloc_size >= leadsize + size);
+#ifdef _WIN32
+	{
+		void *new_addr;
+
+		pages_unmap(addr, alloc_size);
+		new_addr = pages_map(ret, size);
+		if (new_addr == ret)
+			return (ret);
+		if (new_addr)
+			pages_unmap(new_addr, size);
+		return (NULL);
+	}
+#else
+	{
+		size_t trailsize = alloc_size - leadsize - size;
+
+		if (leadsize != 0)
+			pages_unmap(addr, leadsize);
+		if (trailsize != 0)
+			pages_unmap((void *)((uintptr_t)ret + size), trailsize);
+		return (ret);
+	}
+#endif
+}
+
+static bool
+pages_commit_impl(void *addr, size_t size, bool commit)
+{
+
+#ifndef _WIN32
+	/*
+	 * The following decommit/commit implementation is functional, but
+	 * always disabled because it doesn't add value beyond improved
+	 * debugging (at the cost of extra system calls) on systems that
+	 * overcommit.
+	 */
+	if (false) {
+		int prot = commit ? (PROT_READ | PROT_WRITE) : PROT_NONE;
+		void *result = mmap(addr, size, prot, MAP_PRIVATE | MAP_ANON |
+		    MAP_FIXED, -1, 0);
+		if (result == MAP_FAILED)
+			return (true);
+		if (result != addr) {
+			/*
+			 * We succeeded in mapping memory, but not in the right
+			 * place.
+			 */
+			pages_unmap(result, size);
+			return (true);
+		}
+		return (false);
+	}
+#endif
+	return (true);
+}
+
+bool
+pages_commit(void *addr, size_t size)
+{
+
+	return (pages_commit_impl(addr, size, true));
+}
+
+bool
+pages_decommit(void *addr, size_t size)
+{
+
+	return (pages_commit_impl(addr, size, false));
+}
+
+bool
+pages_purge(void *addr, size_t size)
+{
+	bool unzeroed;
+
+#ifdef _WIN32
+	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
+	unzeroed = true;
+#elif defined(JEMALLOC_HAVE_MADVISE)
+#  ifdef JEMALLOC_PURGE_MADVISE_DONTNEED
+#    define JEMALLOC_MADV_PURGE MADV_DONTNEED
+#    define JEMALLOC_MADV_ZEROS true
+#  elif defined(JEMALLOC_PURGE_MADVISE_FREE)
+#    define JEMALLOC_MADV_PURGE MADV_FREE
+#    define JEMALLOC_MADV_ZEROS false
+#  else
+#    error "No madvise(2) flag defined for purging unused dirty pages."
+#  endif
+	int err = madvise(addr, size, JEMALLOC_MADV_PURGE);
+	unzeroed = (!JEMALLOC_MADV_ZEROS || err != 0);
+#  undef JEMALLOC_MADV_PURGE
+#  undef JEMALLOC_MADV_ZEROS
+#else
+	/* Last resort no-op. */
+	unzeroed = true;
+#endif
+	return (unzeroed);
+}
+
--- a/src/prof.c
+++ b/src/prof.c
--- a/src/quarantine.c
+++ b/src/quarantine.c
@ -2,34 +2,33 @@
 #include "jemalloc/internal/jemalloc_internal.h"

 /*
- * quarantine pointers close to NULL are used to encode state information that
+ * Quarantine pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
 #define	QUARANTINE_STATE_REINCARNATED	((quarantine_t *)(uintptr_t)1)
 #define	QUARANTINE_STATE_PURGATORY	((quarantine_t *)(uintptr_t)2)
 #define	QUARANTINE_STATE_MAX		QUARANTINE_STATE_PURGATORY

-/******************************************************************************/
-/* Data. */
-
-malloc_tsd_data(, quarantine, quarantine_t *, NULL)
-
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */

-static quarantine_t	*quarantine_grow(quarantine_t *quarantine);
-static void	quarantine_drain_one(quarantine_t *quarantine);
-static void	quarantine_drain(quarantine_t *quarantine, size_t upper_bound);
+static quarantine_t	*quarantine_grow(tsd_t *tsd, quarantine_t *quarantine);
+static void	quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine);
+static void	quarantine_drain(tsd_t *tsd, quarantine_t *quarantine,
+    size_t upper_bound);

 /******************************************************************************/

-quarantine_t *
-quarantine_init(size_t lg_maxobjs)
+static quarantine_t *
+quarantine_init(tsd_t *tsd, size_t lg_maxobjs)
 {
 	quarantine_t *quarantine;

-	quarantine = (quarantine_t *)imalloc(offsetof(quarantine_t, objs) +
-	    ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)));
+	assert(tsd_nominal(tsd));
+
+	quarantine = (quarantine_t *)iallocztm(tsd, offsetof(quarantine_t, objs)
+	    + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)), false,
+	    tcache_get(tsd, true), true, NULL);
 	if (quarantine == NULL)
 		return (NULL);
 	quarantine->curbytes = 0;
@ -37,19 +36,36 @@ quarantine_init(size_t lg_maxobjs)
 	quarantine->first = 0;
 	quarantine->lg_maxobjs = lg_maxobjs;

-	quarantine_tsd_set(&quarantine);
-
 	return (quarantine);
 }

+void
+quarantine_alloc_hook_work(tsd_t *tsd)
+{
+	quarantine_t *quarantine;
+
+	if (!tsd_nominal(tsd))
+		return;
+
+	quarantine = quarantine_init(tsd, LG_MAXOBJS_INIT);
+	/*
+	 * Check again whether quarantine has been initialized, because
+	 * quarantine_init() may have triggered recursive initialization.
+	 */
+	if (tsd_quarantine_get(tsd) == NULL)
+		tsd_quarantine_set(tsd, quarantine);
+	else
+		idalloctm(tsd, quarantine, tcache_get(tsd, false), true);
+}
+
 static quarantine_t *
-quarantine_grow(quarantine_t *quarantine)
+quarantine_grow(tsd_t *tsd, quarantine_t *quarantine)
 {
 	quarantine_t *ret;

-	ret = quarantine_init(quarantine->lg_maxobjs + 1);
+	ret = quarantine_init(tsd, quarantine->lg_maxobjs + 1);
 	if (ret == NULL) {
-		quarantine_drain_one(quarantine);
+		quarantine_drain_one(tsd, quarantine);
 		return (quarantine);
 	}

@ -71,17 +87,18 @@ quarantine_grow(quarantine_t *quarantine)
 		memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b *
 		    sizeof(quarantine_obj_t));
 	}
-	idalloc(quarantine);
+	idalloctm(tsd, quarantine, tcache_get(tsd, false), true);

+	tsd_quarantine_set(tsd, ret);
 	return (ret);
 }

 static void
-quarantine_drain_one(quarantine_t *quarantine)
+quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine)
 {
 	quarantine_obj_t *obj = &quarantine->objs[quarantine->first];
 	assert(obj->usize == isalloc(obj->ptr, config_prof));
-	idalloc(obj->ptr);
+	idalloctm(tsd, obj->ptr, NULL, false);
 	quarantine->curbytes -= obj->usize;
 	quarantine->curobjs--;
 	quarantine->first = (quarantine->first + 1) & ((ZU(1) <<
@ -89,15 +106,15 @@ quarantine_drain_one(quarantine_t *quarantine)
 }

 static void
-quarantine_drain(quarantine_t *quarantine, size_t upper_bound)
+quarantine_drain(tsd_t *tsd, quarantine_t *quarantine, size_t upper_bound)
 {

 	while (quarantine->curbytes > upper_bound && quarantine->curobjs > 0)
-		quarantine_drain_one(quarantine);
+		quarantine_drain_one(tsd, quarantine);
 }

 void
-quarantine(void *ptr)
+quarantine(tsd_t *tsd, void *ptr)
 {
 	quarantine_t *quarantine;
 	size_t usize = isalloc(ptr, config_prof);
@ -105,17 +122,8 @@ quarantine(void *ptr)
 	cassert(config_fill);
 	assert(opt_quarantine);

-	quarantine = *quarantine_tsd_get();
-	if ((uintptr_t)quarantine <= (uintptr_t)QUARANTINE_STATE_MAX) {
-		if (quarantine == QUARANTINE_STATE_PURGATORY) {
-			/*
-			 * Make a note that quarantine() was called after
-			 * quarantine_cleanup() was called.
-			 */
-			quarantine = QUARANTINE_STATE_REINCARNATED;
-			quarantine_tsd_set(&quarantine);
-		}
-		idalloc(ptr);
+	if ((quarantine = tsd_quarantine_get(tsd)) == NULL) {
+		idalloctm(tsd, ptr, NULL, false);
 		return;
 	}
 	/*
@ -125,11 +133,11 @@ quarantine(void *ptr)
 	if (quarantine->curbytes + usize > opt_quarantine) {
 		size_t upper_bound = (opt_quarantine >= usize) ? opt_quarantine
 		    - usize : 0;
-		quarantine_drain(quarantine, upper_bound);
+		quarantine_drain(tsd, quarantine, upper_bound);
 	}
 	/* Grow the quarantine ring buffer if it's full. */
 	if (quarantine->curobjs == (ZU(1) << quarantine->lg_maxobjs))
-		quarantine = quarantine_grow(quarantine);
+		quarantine = quarantine_grow(tsd, quarantine);
 	/* quarantine_grow() must free a slot if it fails to grow. */
 	assert(quarantine->curobjs < (ZU(1) << quarantine->lg_maxobjs));
 	/* Append ptr if its size doesn't exceed the quarantine size. */
@ -141,12 +149,12 @@ quarantine(void *ptr)
 		obj->usize = usize;
 		quarantine->curbytes += usize;
 		quarantine->curobjs++;
-		if (config_fill && opt_junk) {
+		if (config_fill && unlikely(opt_junk_free)) {
 			/*
 			 * Only do redzone validation if Valgrind isn't in
 			 * operation.
 			 */
-			if ((config_valgrind == false || opt_valgrind == false)
+			if ((!config_valgrind || likely(!in_valgrind))
 			    && usize <= SMALL_MAXCLASS)
 				arena_quarantine_junk_small(ptr, usize);
 			else
@ -154,46 +162,22 @@ quarantine(void *ptr)
 		}
 	} else {
 		assert(quarantine->curbytes == 0);
-		idalloc(ptr);
+		idalloctm(tsd, ptr, NULL, false);
 	}
 }

 void
-quarantine_cleanup(void *arg)
+quarantine_cleanup(tsd_t *tsd)
 {
-	quarantine_t *quarantine = *(quarantine_t **)arg;
+	quarantine_t *quarantine;

-	if (quarantine == QUARANTINE_STATE_REINCARNATED) {
-		/*
-		 * Another destructor deallocated memory after this destructor
-		 * was called.  Reset quarantine to QUARANTINE_STATE_PURGATORY
-		 * in order to receive another callback.
-		 */
-		quarantine = QUARANTINE_STATE_PURGATORY;
-		quarantine_tsd_set(&quarantine);
-	} else if (quarantine == QUARANTINE_STATE_PURGATORY) {
-		/*
-		 * The previous time this destructor was called, we set the key
-		 * to QUARANTINE_STATE_PURGATORY so that other destructors
-		 * wouldn't cause re-creation of the quarantine.  This time, do
-		 * nothing, so that the destructor will not be called again.
-		 */
-	} else if (quarantine != NULL) {
-		quarantine_drain(quarantine, 0);
-		idalloc(quarantine);
-		quarantine = QUARANTINE_STATE_PURGATORY;
-		quarantine_tsd_set(&quarantine);
+	if (!config_fill)
+		return;
+
+	quarantine = tsd_quarantine_get(tsd);
+	if (quarantine != NULL) {
+		quarantine_drain(tsd, quarantine, 0);
+		idalloctm(tsd, quarantine, tcache_get(tsd, false), true);
+		tsd_quarantine_set(tsd, NULL);
 	}
 }
-
-bool
-quarantine_boot(void)
-{
-
-	cassert(config_fill);
-
-	if (quarantine_tsd_boot())
-		return (true);
-
-	return (false);
-}
--- a/src/rtree.c
+++ b/src/rtree.c
@ -1,73 +1,74 @@
 #define	JEMALLOC_RTREE_C_
 #include "jemalloc/internal/jemalloc_internal.h"

-rtree_t *
-rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
+static unsigned
+hmin(unsigned ha, unsigned hb)
 {
-	rtree_t *ret;
-	unsigned bits_per_level, bits_in_leaf, height, i;
+
+	return (ha < hb ? ha : hb);
+}
+
+/* Only the most significant bits of keys passed to rtree_[gs]et() are used. */
+bool
+rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc,
+    rtree_node_dalloc_t *dalloc)
+{
+	unsigned bits_in_leaf, height, i;

 	assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));

-	bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
-	bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
+	bits_in_leaf = (bits % RTREE_BITS_PER_LEVEL) == 0 ? RTREE_BITS_PER_LEVEL
+	    : (bits % RTREE_BITS_PER_LEVEL);
 	if (bits > bits_in_leaf) {
-		height = 1 + (bits - bits_in_leaf) / bits_per_level;
-		if ((height-1) * bits_per_level + bits_in_leaf != bits)
+		height = 1 + (bits - bits_in_leaf) / RTREE_BITS_PER_LEVEL;
+		if ((height-1) * RTREE_BITS_PER_LEVEL + bits_in_leaf != bits)
 			height++;
-	} else {
+	} else
 		height = 1;
-	}
-	assert((height-1) * bits_per_level + bits_in_leaf >= bits);
+	assert((height-1) * RTREE_BITS_PER_LEVEL + bits_in_leaf == bits);

-	ret = (rtree_t*)alloc(offsetof(rtree_t, level2bits) +
-	    (sizeof(unsigned) * height));
-	if (ret == NULL)
-		return (NULL);
-	memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) *
-	    height));
+	rtree->alloc = alloc;
+	rtree->dalloc = dalloc;
+	rtree->height = height;

-	ret->alloc = alloc;
-	ret->dalloc = dalloc;
-	if (malloc_mutex_init(&ret->mutex)) {
-		if (dalloc != NULL)
-			dalloc(ret);
-		return (NULL);
+	/* Root level. */
+	rtree->levels[0].subtree = NULL;
+	rtree->levels[0].bits = (height > 1) ? RTREE_BITS_PER_LEVEL :
+	    bits_in_leaf;
+	rtree->levels[0].cumbits = rtree->levels[0].bits;
+	/* Interior levels. */
+	for (i = 1; i < height-1; i++) {
+		rtree->levels[i].subtree = NULL;
+		rtree->levels[i].bits = RTREE_BITS_PER_LEVEL;
+		rtree->levels[i].cumbits = rtree->levels[i-1].cumbits +
+		    RTREE_BITS_PER_LEVEL;
 	}
-	ret->height = height;
+	/* Leaf level. */
 	if (height > 1) {
-		if ((height-1) * bits_per_level + bits_in_leaf > bits) {
-			ret->level2bits[0] = (bits - bits_in_leaf) %
-			    bits_per_level;
-		} else
-			ret->level2bits[0] = bits_per_level;
-		for (i = 1; i < height-1; i++)
-			ret->level2bits[i] = bits_per_level;
-		ret->level2bits[height-1] = bits_in_leaf;
-	} else
-		ret->level2bits[0] = bits;
-
-	ret->root = (void**)alloc(sizeof(void *) << ret->level2bits[0]);
-	if (ret->root == NULL) {
-		if (dalloc != NULL)
-			dalloc(ret);
-		return (NULL);
+		rtree->levels[height-1].subtree = NULL;
+		rtree->levels[height-1].bits = bits_in_leaf;
+		rtree->levels[height-1].cumbits = bits;
 	}
-	memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]);

-	return (ret);
+	/* Compute lookup table to be used by rtree_start_level(). */
+	for (i = 0; i < RTREE_HEIGHT_MAX; i++) {
+		rtree->start_level[i] = hmin(RTREE_HEIGHT_MAX - 1 - i, height -
+		    1);
+	}
+
+	return (false);
 }

 static void
-rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level)
+rtree_delete_subtree(rtree_t *rtree, rtree_node_elm_t *node, unsigned level)
 {

-	if (level < rtree->height - 1) {
+	if (level + 1 < rtree->height) {
 		size_t nchildren, i;

-		nchildren = ZU(1) << rtree->level2bits[level];
+		nchildren = ZU(1) << rtree->levels[level].bits;
 		for (i = 0; i < nchildren; i++) {
-			void **child = (void **)node[i];
+			rtree_node_elm_t *child = node[i].child;
 			if (child != NULL)
 				rtree_delete_subtree(rtree, child, level + 1);
 		}
@ -78,28 +79,49 @@ rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level)
 void
 rtree_delete(rtree_t *rtree)
 {
+	unsigned i;

-	rtree_delete_subtree(rtree, rtree->root, 0);
-	rtree->dalloc(rtree);
+	for (i = 0; i < rtree->height; i++) {
+		rtree_node_elm_t *subtree = rtree->levels[i].subtree;
+		if (subtree != NULL)
+			rtree_delete_subtree(rtree, subtree, i);
+	}
 }

-void
-rtree_prefork(rtree_t *rtree)
+static rtree_node_elm_t *
+rtree_node_init(rtree_t *rtree, unsigned level, rtree_node_elm_t **elmp)
+{
+	rtree_node_elm_t *node;
+
+	if (atomic_cas_p((void **)elmp, NULL, RTREE_NODE_INITIALIZING)) {
+		/*
+		 * Another thread is already in the process of initializing.
+		 * Spin-wait until initialization is complete.
+		 */
+		do {
+			CPU_SPINWAIT;
+			node = atomic_read_p((void **)elmp);
+		} while (node == RTREE_NODE_INITIALIZING);
+	} else {
+		node = rtree->alloc(ZU(1) << rtree->levels[level].bits);
+		if (node == NULL)
+			return (NULL);
+		atomic_write_p((void **)elmp, node);
+	}
+
+	return (node);
+}
+
+rtree_node_elm_t *
+rtree_subtree_read_hard(rtree_t *rtree, unsigned level)
 {

-	malloc_mutex_prefork(&rtree->mutex);
+	return (rtree_node_init(rtree, level, &rtree->levels[level].subtree));
 }

-void
-rtree_postfork_parent(rtree_t *rtree)
+rtree_node_elm_t *
+rtree_child_read_hard(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level)
 {

-	malloc_mutex_postfork_parent(&rtree->mutex);
-}
-
-void
-rtree_postfork_child(rtree_t *rtree)
-{
-
-	malloc_mutex_postfork_child(&rtree->mutex);
+	return (rtree_node_init(rtree, level, &elm->child));
 }
--- a/src/stats.c
+++ b/src/stats.c
@ -6,31 +6,22 @@
 	xmallctl(n, v, &sz, NULL, 0);					\
 } while (0)

-#define	CTL_I_GET(n, v, t) do {						\
+#define	CTL_M2_GET(n, i, v, t) do {					\
 	size_t mib[6];							\
 	size_t miblen = sizeof(mib) / sizeof(size_t);			\
 	size_t sz = sizeof(t);						\
 	xmallctlnametomib(n, mib, &miblen);				\
-	mib[2] = i;							\
+	mib[2] = (i);							\
 	xmallctlbymib(mib, miblen, v, &sz, NULL, 0);			\
 } while (0)

-#define	CTL_J_GET(n, v, t) do {						\
+#define	CTL_M2_M4_GET(n, i, j, v, t) do {				\
 	size_t mib[6];							\
 	size_t miblen = sizeof(mib) / sizeof(size_t);			\
 	size_t sz = sizeof(t);						\
 	xmallctlnametomib(n, mib, &miblen);				\
-	mib[2] = j;							\
-	xmallctlbymib(mib, miblen, v, &sz, NULL, 0);			\
-} while (0)
-
-#define	CTL_IJ_GET(n, v, t) do {					\
-	size_t mib[6];							\
-	size_t miblen = sizeof(mib) / sizeof(size_t);			\
-	size_t sz = sizeof(t);						\
-	xmallctlnametomib(n, mib, &miblen);				\
-	mib[2] = i;							\
-	mib[4] = j;							\
+	mib[2] = (i);							\
+	mib[4] = (j);							\
 	xmallctlbymib(mib, miblen, v, &sz, NULL, 0);			\
 } while (0)

@ -48,8 +39,10 @@ static void	stats_arena_bins_print(void (*write_cb)(void *, const char *),
    void *cbopaque, unsigned i);
 static void	stats_arena_lruns_print(void (*write_cb)(void *, const char *),
    void *cbopaque, unsigned i);
+static void	stats_arena_hchunks_print(
+    void (*write_cb)(void *, const char *), void *cbopaque, unsigned i);
 static void	stats_arena_print(void (*write_cb)(void *, const char *),
-    void *cbopaque, unsigned i, bool bins, bool large);
+    void *cbopaque, unsigned i, bool bins, bool large, bool huge);

 /******************************************************************************/

@ -58,100 +51,109 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
    unsigned i)
 {
 	size_t page;
-	bool config_tcache;
-	unsigned nbins, j, gap_start;
+	bool config_tcache, in_gap;
+	unsigned nbins, j;

 	CTL_GET("arenas.page", &page, size_t);

 	CTL_GET("config.tcache", &config_tcache, bool);
 	if (config_tcache) {
 		malloc_cprintf(write_cb, cbopaque,
-		    "bins:     bin  size regs pgs    allocated      nmalloc"
-		    "      ndalloc    nrequests       nfills     nflushes"
-		    "      newruns       reruns      curruns\n");
+		    "bins:           size ind    allocated      nmalloc"
+		    "      ndalloc    nrequests      curregs      curruns regs"
+		    " pgs  util       nfills     nflushes      newruns"
+		    "       reruns\n");
 	} else {
 		malloc_cprintf(write_cb, cbopaque,
-		    "bins:     bin  size regs pgs    allocated      nmalloc"
-		    "      ndalloc      newruns       reruns      curruns\n");
+		    "bins:           size ind    allocated      nmalloc"
+		    "      ndalloc    nrequests      curregs      curruns regs"
+		    " pgs  util      newruns       reruns\n");
 	}
 	CTL_GET("arenas.nbins", &nbins, unsigned);
-	for (j = 0, gap_start = UINT_MAX; j < nbins; j++) {
+	for (j = 0, in_gap = false; j < nbins; j++) {
 		uint64_t nruns;

-		CTL_IJ_GET("stats.arenas.0.bins.0.nruns", &nruns, uint64_t);
-		if (nruns == 0) {
-			if (gap_start == UINT_MAX)
-				gap_start = j;
-		} else {
-			size_t reg_size, run_size, allocated;
+		CTL_M2_M4_GET("stats.arenas.0.bins.0.nruns", i, j, &nruns,
+		    uint64_t);
+		if (nruns == 0)
+			in_gap = true;
+		else {
+			size_t reg_size, run_size, curregs, availregs, milli;
+			size_t curruns;
 			uint32_t nregs;
 			uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
 			uint64_t reruns;
-			size_t curruns;
+			char util[6]; /* "x.yyy". */

-			if (gap_start != UINT_MAX) {
-				if (j > gap_start + 1) {
-					/* Gap of more than one size class. */
+			if (in_gap) {
 				malloc_cprintf(write_cb, cbopaque,
-					    "[%u..%u]\n", gap_start,
-					    j - 1);
-				} else {
-					/* Gap of one size class. */
-					malloc_cprintf(write_cb, cbopaque,
-					    "[%u]\n", gap_start);
+				    "                     ---\n");
+				in_gap = false;
 			}
-				gap_start = UINT_MAX;
-			}
-			CTL_J_GET("arenas.bin.0.size", &reg_size, size_t);
-			CTL_J_GET("arenas.bin.0.nregs", &nregs, uint32_t);
-			CTL_J_GET("arenas.bin.0.run_size", &run_size, size_t);
-			CTL_IJ_GET("stats.arenas.0.bins.0.allocated",
-			    &allocated, size_t);
-			CTL_IJ_GET("stats.arenas.0.bins.0.nmalloc",
-			    &nmalloc, uint64_t);
-			CTL_IJ_GET("stats.arenas.0.bins.0.ndalloc",
-			    &ndalloc, uint64_t);
-			if (config_tcache) {
-				CTL_IJ_GET("stats.arenas.0.bins.0.nrequests",
-				    &nrequests, uint64_t);
-				CTL_IJ_GET("stats.arenas.0.bins.0.nfills",
-				    &nfills, uint64_t);
-				CTL_IJ_GET("stats.arenas.0.bins.0.nflushes",
-				    &nflushes, uint64_t);
-			}
-			CTL_IJ_GET("stats.arenas.0.bins.0.nreruns", &reruns,
-			    uint64_t);
-			CTL_IJ_GET("stats.arenas.0.bins.0.curruns", &curruns,
+			CTL_M2_GET("arenas.bin.0.size", j, &reg_size, size_t);
+			CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t);
+			CTL_M2_GET("arenas.bin.0.run_size", j, &run_size,
 			    size_t);
+			CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j,
+			    &nmalloc, uint64_t);
+			CTL_M2_M4_GET("stats.arenas.0.bins.0.ndalloc", i, j,
+			    &ndalloc, uint64_t);
+			CTL_M2_M4_GET("stats.arenas.0.bins.0.curregs", i, j,
+			    &curregs, size_t);
+			CTL_M2_M4_GET("stats.arenas.0.bins.0.nrequests", i, j,
+			    &nrequests, uint64_t);
+			if (config_tcache) {
+				CTL_M2_M4_GET("stats.arenas.0.bins.0.nfills", i,
+				    j, &nfills, uint64_t);
+				CTL_M2_M4_GET("stats.arenas.0.bins.0.nflushes",
+				    i, j, &nflushes, uint64_t);
+			}
+			CTL_M2_M4_GET("stats.arenas.0.bins.0.nreruns", i, j,
+			    &reruns, uint64_t);
+			CTL_M2_M4_GET("stats.arenas.0.bins.0.curruns", i, j,
+			    &curruns, size_t);
+
+			availregs = nregs * curruns;
+			milli = (availregs != 0) ? (1000 * curregs) / availregs
+			    : 1000;
+			assert(milli <= 1000);
+			if (milli < 10) {
+				malloc_snprintf(util, sizeof(util),
+				    "0.00%zu", milli);
+			} else if (milli < 100) {
+				malloc_snprintf(util, sizeof(util), "0.0%zu",
+				    milli);
+			} else if (milli < 1000) {
+				malloc_snprintf(util, sizeof(util), "0.%zu",
+				    milli);
+			} else
+				malloc_snprintf(util, sizeof(util), "1");
+
 			if (config_tcache) {
 				malloc_cprintf(write_cb, cbopaque,
-				    "%13u %5zu %4u %3zu %12zu %12"PRIu64
-				    " %12"PRIu64" %12"PRIu64" %12"PRIu64
-				    " %12"PRIu64" %12"PRIu64" %12"PRIu64
-				    " %12zu\n",
-				    j, reg_size, nregs, run_size / page,
-				    allocated, nmalloc, ndalloc, nrequests,
-				    nfills, nflushes, nruns, reruns, curruns);
+				    "%20zu %3u %12zu %12"FMTu64
+				    " %12"FMTu64" %12"FMTu64" %12zu"
+				    " %12zu %4u %3zu %-5s %12"FMTu64
+				    " %12"FMTu64" %12"FMTu64" %12"FMTu64"\n",
+				    reg_size, j, curregs * reg_size, nmalloc,
+				    ndalloc, nrequests, curregs, curruns, nregs,
+				    run_size / page, util, nfills, nflushes,
+				    nruns, reruns);
 			} else {
 				malloc_cprintf(write_cb, cbopaque,
-				    "%13u %5zu %4u %3zu %12zu %12"PRIu64
-				    " %12"PRIu64" %12"PRIu64" %12"PRIu64
-				    " %12zu\n",
-				    j, reg_size, nregs, run_size / page,
-				    allocated, nmalloc, ndalloc, nruns, reruns,
-				    curruns);
+				    "%20zu %3u %12zu %12"FMTu64
+				    " %12"FMTu64" %12"FMTu64" %12zu"
+				    " %12zu %4u %3zu %-5s %12"FMTu64
+				    " %12"FMTu64"\n",
+				    reg_size, j, curregs * reg_size, nmalloc,
+				    ndalloc, nrequests, curregs, curruns, nregs,
+				    run_size / page, util, nruns, reruns);
 			}
 		}
 	}
-	if (gap_start != UINT_MAX) {
-		if (j > gap_start + 1) {
-			/* Gap of more than one size class. */
-			malloc_cprintf(write_cb, cbopaque, "[%u..%u]\n",
-			    gap_start, j - 1);
-		} else {
-			/* Gap of one size class. */
-			malloc_cprintf(write_cb, cbopaque, "[%u]\n", gap_start);
-		}
+	if (in_gap) {
+		malloc_cprintf(write_cb, cbopaque,
+		    "                     ---\n");
 	}
 }

@ -159,110 +161,199 @@ static void
 stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque,
    unsigned i)
 {
-	size_t page, nlruns, j;
-	ssize_t gap_start;
-
-	CTL_GET("arenas.page", &page, size_t);
+	unsigned nbins, nlruns, j;
+	bool in_gap;

 	malloc_cprintf(write_cb, cbopaque,
-	    "large:   size pages      nmalloc      ndalloc    nrequests"
-	    "      curruns\n");
-	CTL_GET("arenas.nlruns", &nlruns, size_t);
-	for (j = 0, gap_start = -1; j < nlruns; j++) {
+	    "large:          size ind    allocated      nmalloc      ndalloc"
+	    "    nrequests      curruns\n");
+	CTL_GET("arenas.nbins", &nbins, unsigned);
+	CTL_GET("arenas.nlruns", &nlruns, unsigned);
+	for (j = 0, in_gap = false; j < nlruns; j++) {
 		uint64_t nmalloc, ndalloc, nrequests;
 		size_t run_size, curruns;

-		CTL_IJ_GET("stats.arenas.0.lruns.0.nmalloc", &nmalloc,
+		CTL_M2_M4_GET("stats.arenas.0.lruns.0.nmalloc", i, j, &nmalloc,
 		    uint64_t);
-		CTL_IJ_GET("stats.arenas.0.lruns.0.ndalloc", &ndalloc,
+		CTL_M2_M4_GET("stats.arenas.0.lruns.0.ndalloc", i, j, &ndalloc,
 		    uint64_t);
-		CTL_IJ_GET("stats.arenas.0.lruns.0.nrequests", &nrequests,
-		    uint64_t);
-		if (nrequests == 0) {
-			if (gap_start == -1)
-				gap_start = j;
-		} else {
-			CTL_J_GET("arenas.lrun.0.size", &run_size, size_t);
-			CTL_IJ_GET("stats.arenas.0.lruns.0.curruns", &curruns,
-			    size_t);
-			if (gap_start != -1) {
-				malloc_cprintf(write_cb, cbopaque, "[%zu]\n",
-				    j - gap_start);
-				gap_start = -1;
+		CTL_M2_M4_GET("stats.arenas.0.lruns.0.nrequests", i, j,
+		    &nrequests, uint64_t);
+		if (nrequests == 0)
+			in_gap = true;
+		else {
+			CTL_M2_GET("arenas.lrun.0.size", j, &run_size, size_t);
+			CTL_M2_M4_GET("stats.arenas.0.lruns.0.curruns", i, j,
+			    &curruns, size_t);
+			if (in_gap) {
+				malloc_cprintf(write_cb, cbopaque,
+				    "                     ---\n");
+				in_gap = false;
 			}
 			malloc_cprintf(write_cb, cbopaque,
-			    "%13zu %5zu %12"PRIu64" %12"PRIu64" %12"PRIu64
-			    " %12zu\n",
-			    run_size, run_size / page, nmalloc, ndalloc,
-			    nrequests, curruns);
+			    "%20zu %3u %12zu %12"FMTu64" %12"FMTu64
+			    " %12"FMTu64" %12zu\n",
+			    run_size, nbins + j, curruns * run_size, nmalloc,
+			    ndalloc, nrequests, curruns);
 		}
 	}
-	if (gap_start != -1)
-		malloc_cprintf(write_cb, cbopaque, "[%zu]\n", j - gap_start);
+	if (in_gap) {
+		malloc_cprintf(write_cb, cbopaque,
+		    "                     ---\n");
+	}
+}
+
+static void
+stats_arena_hchunks_print(void (*write_cb)(void *, const char *),
+    void *cbopaque, unsigned i)
+{
+	unsigned nbins, nlruns, nhchunks, j;
+	bool in_gap;
+
+	malloc_cprintf(write_cb, cbopaque,
+	    "huge:           size ind    allocated      nmalloc      ndalloc"
+	    "    nrequests   curhchunks\n");
+	CTL_GET("arenas.nbins", &nbins, unsigned);
+	CTL_GET("arenas.nlruns", &nlruns, unsigned);
+	CTL_GET("arenas.nhchunks", &nhchunks, unsigned);
+	for (j = 0, in_gap = false; j < nhchunks; j++) {
+		uint64_t nmalloc, ndalloc, nrequests;
+		size_t hchunk_size, curhchunks;
+
+		CTL_M2_M4_GET("stats.arenas.0.hchunks.0.nmalloc", i, j,
+		    &nmalloc, uint64_t);
+		CTL_M2_M4_GET("stats.arenas.0.hchunks.0.ndalloc", i, j,
+		    &ndalloc, uint64_t);
+		CTL_M2_M4_GET("stats.arenas.0.hchunks.0.nrequests", i, j,
+		    &nrequests, uint64_t);
+		if (nrequests == 0)
+			in_gap = true;
+		else {
+			CTL_M2_GET("arenas.hchunk.0.size", j, &hchunk_size,
+			    size_t);
+			CTL_M2_M4_GET("stats.arenas.0.hchunks.0.curhchunks", i,
+			    j, &curhchunks, size_t);
+			if (in_gap) {
+				malloc_cprintf(write_cb, cbopaque,
+				    "                     ---\n");
+				in_gap = false;
+			}
+			malloc_cprintf(write_cb, cbopaque,
+			    "%20zu %3u %12zu %12"FMTu64" %12"FMTu64
+			    " %12"FMTu64" %12zu\n",
+			    hchunk_size, nbins + nlruns + j,
+			    curhchunks * hchunk_size, nmalloc, ndalloc,
+			    nrequests, curhchunks);
+		}
+	}
+	if (in_gap) {
+		malloc_cprintf(write_cb, cbopaque,
+		    "                     ---\n");
+	}
 }

 static void
 stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
-    unsigned i, bool bins, bool large)
+    unsigned i, bool bins, bool large, bool huge)
 {
 	unsigned nthreads;
 	const char *dss;
+	ssize_t lg_dirty_mult;
 	size_t page, pactive, pdirty, mapped;
+	size_t metadata_mapped, metadata_allocated;
 	uint64_t npurge, nmadvise, purged;
 	size_t small_allocated;
 	uint64_t small_nmalloc, small_ndalloc, small_nrequests;
 	size_t large_allocated;
 	uint64_t large_nmalloc, large_ndalloc, large_nrequests;
+	size_t huge_allocated;
+	uint64_t huge_nmalloc, huge_ndalloc, huge_nrequests;

 	CTL_GET("arenas.page", &page, size_t);

-	CTL_I_GET("stats.arenas.0.nthreads", &nthreads, unsigned);
+	CTL_M2_GET("stats.arenas.0.nthreads", i, &nthreads, unsigned);
 	malloc_cprintf(write_cb, cbopaque,
 	    "assigned threads: %u\n", nthreads);
-	CTL_I_GET("stats.arenas.0.dss", &dss, const char *);
+	CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *);
 	malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n",
 	    dss);
-	CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t);
-	CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t);
-	CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t);
-	CTL_I_GET("stats.arenas.0.nmadvise", &nmadvise, uint64_t);
-	CTL_I_GET("stats.arenas.0.purged", &purged, uint64_t);
+	CTL_M2_GET("stats.arenas.0.lg_dirty_mult", i, &lg_dirty_mult, ssize_t);
+	if (lg_dirty_mult >= 0) {
 		malloc_cprintf(write_cb, cbopaque,
-	    "dirty pages: %zu:%zu active:dirty, %"PRIu64" sweep%s,"
-	    " %"PRIu64" madvise%s, %"PRIu64" purged\n",
-	    pactive, pdirty, npurge, npurge == 1 ? "" : "s",
-	    nmadvise, nmadvise == 1 ? "" : "s", purged);
+		    "min active:dirty page ratio: %u:1\n",
+		    (1U << lg_dirty_mult));
+	} else {
+		malloc_cprintf(write_cb, cbopaque,
+		    "min active:dirty page ratio: N/A\n");
+	}
+	CTL_M2_GET("stats.arenas.0.pactive", i, &pactive, size_t);
+	CTL_M2_GET("stats.arenas.0.pdirty", i, &pdirty, size_t);
+	CTL_M2_GET("stats.arenas.0.npurge", i, &npurge, uint64_t);
+	CTL_M2_GET("stats.arenas.0.nmadvise", i, &nmadvise, uint64_t);
+	CTL_M2_GET("stats.arenas.0.purged", i, &purged, uint64_t);
+	malloc_cprintf(write_cb, cbopaque,
+	    "dirty pages: %zu:%zu active:dirty, %"FMTu64" sweep%s, %"FMTu64
+	    " madvise%s, %"FMTu64" purged\n", pactive, pdirty, npurge, npurge ==
+	    1 ? "" : "s", nmadvise, nmadvise == 1 ? "" : "s", purged);

 	malloc_cprintf(write_cb, cbopaque,
-	    "            allocated      nmalloc      ndalloc    nrequests\n");
-	CTL_I_GET("stats.arenas.0.small.allocated", &small_allocated, size_t);
-	CTL_I_GET("stats.arenas.0.small.nmalloc", &small_nmalloc, uint64_t);
-	CTL_I_GET("stats.arenas.0.small.ndalloc", &small_ndalloc, uint64_t);
-	CTL_I_GET("stats.arenas.0.small.nrequests", &small_nrequests, uint64_t);
+	    "                            allocated      nmalloc      ndalloc"
+	    "    nrequests\n");
+	CTL_M2_GET("stats.arenas.0.small.allocated", i, &small_allocated,
+	    size_t);
+	CTL_M2_GET("stats.arenas.0.small.nmalloc", i, &small_nmalloc, uint64_t);
+	CTL_M2_GET("stats.arenas.0.small.ndalloc", i, &small_ndalloc, uint64_t);
+	CTL_M2_GET("stats.arenas.0.small.nrequests", i, &small_nrequests,
+	    uint64_t);
 	malloc_cprintf(write_cb, cbopaque,
-	    "small:   %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n",
+	    "small:                   %12zu %12"FMTu64" %12"FMTu64
+	    " %12"FMTu64"\n",
 	    small_allocated, small_nmalloc, small_ndalloc, small_nrequests);
-	CTL_I_GET("stats.arenas.0.large.allocated", &large_allocated, size_t);
-	CTL_I_GET("stats.arenas.0.large.nmalloc", &large_nmalloc, uint64_t);
-	CTL_I_GET("stats.arenas.0.large.ndalloc", &large_ndalloc, uint64_t);
-	CTL_I_GET("stats.arenas.0.large.nrequests", &large_nrequests, uint64_t);
+	CTL_M2_GET("stats.arenas.0.large.allocated", i, &large_allocated,
+	    size_t);
+	CTL_M2_GET("stats.arenas.0.large.nmalloc", i, &large_nmalloc, uint64_t);
+	CTL_M2_GET("stats.arenas.0.large.ndalloc", i, &large_ndalloc, uint64_t);
+	CTL_M2_GET("stats.arenas.0.large.nrequests", i, &large_nrequests,
+	    uint64_t);
 	malloc_cprintf(write_cb, cbopaque,
-	    "large:   %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n",
+	    "large:                   %12zu %12"FMTu64" %12"FMTu64
+	    " %12"FMTu64"\n",
 	    large_allocated, large_nmalloc, large_ndalloc, large_nrequests);
+	CTL_M2_GET("stats.arenas.0.huge.allocated", i, &huge_allocated, size_t);
+	CTL_M2_GET("stats.arenas.0.huge.nmalloc", i, &huge_nmalloc, uint64_t);
+	CTL_M2_GET("stats.arenas.0.huge.ndalloc", i, &huge_ndalloc, uint64_t);
+	CTL_M2_GET("stats.arenas.0.huge.nrequests", i, &huge_nrequests,
+	    uint64_t);
 	malloc_cprintf(write_cb, cbopaque,
-	    "total:   %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n",
-	    small_allocated + large_allocated,
-	    small_nmalloc + large_nmalloc,
-	    small_ndalloc + large_ndalloc,
-	    small_nrequests + large_nrequests);
-	malloc_cprintf(write_cb, cbopaque, "active:  %12zu\n", pactive * page);
-	CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t);
-	malloc_cprintf(write_cb, cbopaque, "mapped:  %12zu\n", mapped);
+	    "huge:                    %12zu %12"FMTu64" %12"FMTu64
+	    " %12"FMTu64"\n",
+	    huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests);
+	malloc_cprintf(write_cb, cbopaque,
+	    "total:                   %12zu %12"FMTu64" %12"FMTu64
+	    " %12"FMTu64"\n",
+	    small_allocated + large_allocated + huge_allocated,
+	    small_nmalloc + large_nmalloc + huge_nmalloc,
+	    small_ndalloc + large_ndalloc + huge_ndalloc,
+	    small_nrequests + large_nrequests + huge_nrequests);
+	malloc_cprintf(write_cb, cbopaque,
+	    "active:                  %12zu\n", pactive * page);
+	CTL_M2_GET("stats.arenas.0.mapped", i, &mapped, size_t);
+	malloc_cprintf(write_cb, cbopaque,
+	    "mapped:                  %12zu\n", mapped);
+	CTL_M2_GET("stats.arenas.0.metadata.mapped", i, &metadata_mapped,
+	    size_t);
+	CTL_M2_GET("stats.arenas.0.metadata.allocated", i, &metadata_allocated,
+	    size_t);
+	malloc_cprintf(write_cb, cbopaque,
+	    "metadata: mapped: %zu, allocated: %zu\n",
+	    metadata_mapped, metadata_allocated);

 	if (bins)
 		stats_arena_bins_print(write_cb, cbopaque, i);
 	if (large)
 		stats_arena_lruns_print(write_cb, cbopaque, i);
+	if (huge)
+		stats_arena_hchunks_print(write_cb, cbopaque, i);
 }

 void
@ -277,6 +368,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	bool unmerged = true;
 	bool bins = true;
 	bool large = true;
+	bool huge = true;

 	/*
 	 * Refresh stats, in case mallctl() was called by the application.
@ -319,6 +411,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 			case 'l':
 				large = false;
 				break;
+			case 'h':
+				huge = false;
+				break;
 			default:;
 			}
 		}
@ -327,7 +422,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	malloc_cprintf(write_cb, cbopaque,
 	    "___ Begin jemalloc statistics ___\n");
 	if (general) {
-		int err;
 		const char *cpv;
 		bool bv;
 		unsigned uv;
@ -346,26 +440,40 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		    bv ? "enabled" : "disabled");

 #define	OPT_WRITE_BOOL(n)						\
-		if ((err = je_mallctl("opt."#n, &bv, &bsz, NULL, 0))	\
-		    == 0) {						\
+		if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0) {	\
 			malloc_cprintf(write_cb, cbopaque,		\
 			    "  opt."#n": %s\n", bv ? "true" : "false");	\
 		}
+#define	OPT_WRITE_BOOL_MUTABLE(n, m) {					\
+		bool bv2;						\
+		if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0 &&	\
+		    je_mallctl(#m, &bv2, &bsz, NULL, 0) == 0) {		\
+			malloc_cprintf(write_cb, cbopaque,		\
+			    "  opt."#n": %s ("#m": %s)\n", bv ? "true"	\
+			    : "false", bv2 ? "true" : "false");		\
+		}							\
+}
 #define	OPT_WRITE_SIZE_T(n)						\
-		if ((err = je_mallctl("opt."#n, &sv, &ssz, NULL, 0))	\
-		    == 0) {						\
+		if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) {	\
 			malloc_cprintf(write_cb, cbopaque,		\
 			"  opt."#n": %zu\n", sv);			\
 		}
 #define	OPT_WRITE_SSIZE_T(n)						\
-		if ((err = je_mallctl("opt."#n, &ssv, &sssz, NULL, 0))	\
-		    == 0) {						\
+		if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0) {	\
 			malloc_cprintf(write_cb, cbopaque,		\
 			    "  opt."#n": %zd\n", ssv);			\
 		}
+#define	OPT_WRITE_SSIZE_T_MUTABLE(n, m) {				\
+		ssize_t ssv2;						\
+		if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0 &&	\
+		    je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) {	\
+			malloc_cprintf(write_cb, cbopaque,		\
+			    "  opt."#n": %zd ("#m": %zd)\n",		\
+			    ssv, ssv2);					\
+		}							\
+}
 #define	OPT_WRITE_CHAR_P(n)						\
-		if ((err = je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0))	\
-		    == 0) {						\
+		if (je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0) == 0) {	\
 			malloc_cprintf(write_cb, cbopaque,		\
 			    "  opt."#n": \"%s\"\n", cpv);		\
 		}
@ -376,9 +484,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		OPT_WRITE_SIZE_T(lg_chunk)
 		OPT_WRITE_CHAR_P(dss)
 		OPT_WRITE_SIZE_T(narenas)
-		OPT_WRITE_SSIZE_T(lg_dirty_mult)
+		OPT_WRITE_SSIZE_T_MUTABLE(lg_dirty_mult, arenas.lg_dirty_mult)
 		OPT_WRITE_BOOL(stats_print)
-		OPT_WRITE_BOOL(junk)
+		OPT_WRITE_CHAR_P(junk)
 		OPT_WRITE_SIZE_T(quarantine)
 		OPT_WRITE_BOOL(redzone)
 		OPT_WRITE_BOOL(zero)
@ -389,7 +497,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		OPT_WRITE_SSIZE_T(lg_tcache_max)
 		OPT_WRITE_BOOL(prof)
 		OPT_WRITE_CHAR_P(prof_prefix)
-		OPT_WRITE_BOOL(prof_active)
+		OPT_WRITE_BOOL_MUTABLE(prof_active, prof.active)
+		OPT_WRITE_BOOL_MUTABLE(prof_thread_active_init,
+		    prof.thread_active_init)
 		OPT_WRITE_SSIZE_T(lg_prof_sample)
 		OPT_WRITE_BOOL(prof_accum)
 		OPT_WRITE_SSIZE_T(lg_prof_interval)
@ -398,6 +508,8 @ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		OPT_WRITE_BOOL(prof_leak)

 #undef OPT_WRITE_BOOL
+#undef OPT_WRITE_BOOL_MUTABLE
 #undef OPT_WRITE_SIZE_T
 #undef OPT_WRITE_SSIZE_T
+#undef OPT_WRITE_SSIZE_T_MUTABLE
 #undef OPT_WRITE_CHAR_P
@ -411,12 +522,13 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		    sizeof(void *));

 		CTL_GET("arenas.quantum", &sv, size_t);
-		malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", sv);
+		malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n",
+		    sv);

 		CTL_GET("arenas.page", &sv, size_t);
 		malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv);

-		CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t);
+		CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t);
 		if (ssv >= 0) {
 			malloc_cprintf(write_cb, cbopaque,
 			    "Min active:dirty page ratio per arena: %u:1\n",
@ -425,22 +537,20 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 			malloc_cprintf(write_cb, cbopaque,
 			    "Min active:dirty page ratio per arena: N/A\n");
 		}
-		if ((err = je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0))
-		    == 0) {
+		if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) {
 			malloc_cprintf(write_cb, cbopaque,
 			    "Maximum thread-cached size class: %zu\n", sv);
 		}
-		if ((err = je_mallctl("opt.prof", &bv, &bsz, NULL, 0)) == 0 &&
-		    bv) {
-			CTL_GET("opt.lg_prof_sample", &sv, size_t);
+		if (je_mallctl("opt.prof", &bv, &bsz, NULL, 0) == 0 && bv) {
+			CTL_GET("prof.lg_sample", &sv, size_t);
 			malloc_cprintf(write_cb, cbopaque,
-			    "Average profile sample interval: %"PRIu64
+			    "Average profile sample interval: %"FMTu64
 			    " (2^%zu)\n", (((uint64_t)1U) << sv), sv);

 			CTL_GET("opt.lg_prof_interval", &ssv, ssize_t);
 			if (ssv >= 0) {
 				malloc_cprintf(write_cb, cbopaque,
-				    "Average profile dump interval: %"PRIu64
+				    "Average profile dump interval: %"FMTu64
 				    " (2^%zd)\n",
 				    (((uint64_t)1U) << ssv), ssv);
 			} else {
@ -449,47 +559,27 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 			}
 		}
 		CTL_GET("opt.lg_chunk", &sv, size_t);
-		malloc_cprintf(write_cb, cbopaque, "Chunk size: %zu (2^%zu)\n",
-		    (ZU(1) << sv), sv);
+		malloc_cprintf(write_cb, cbopaque,
+		    "Chunk size: %zu (2^%zu)\n", (ZU(1) << sv), sv);
 	}

 	if (config_stats) {
 		size_t *cactive;
-		size_t allocated, active, mapped;
-		size_t chunks_current, chunks_high;
-		uint64_t chunks_total;
-		size_t huge_allocated;
-		uint64_t huge_nmalloc, huge_ndalloc;
+		size_t allocated, active, metadata, resident, mapped;

 		CTL_GET("stats.cactive", &cactive, size_t *);
 		CTL_GET("stats.allocated", &allocated, size_t);
 		CTL_GET("stats.active", &active, size_t);
+		CTL_GET("stats.metadata", &metadata, size_t);
+		CTL_GET("stats.resident", &resident, size_t);
 		CTL_GET("stats.mapped", &mapped, size_t);
 		malloc_cprintf(write_cb, cbopaque,
-		    "Allocated: %zu, active: %zu, mapped: %zu\n",
-		    allocated, active, mapped);
+		    "Allocated: %zu, active: %zu, metadata: %zu,"
+		    " resident: %zu, mapped: %zu\n",
+		    allocated, active, metadata, resident, mapped);
 		malloc_cprintf(write_cb, cbopaque,
-		    "Current active ceiling: %zu\n", atomic_read_z(cactive));
-
-		/* Print chunk stats. */
-		CTL_GET("stats.chunks.total", &chunks_total, uint64_t);
-		CTL_GET("stats.chunks.high", &chunks_high, size_t);
-		CTL_GET("stats.chunks.current", &chunks_current, size_t);
-		malloc_cprintf(write_cb, cbopaque, "chunks: nchunks   "
-		    "highchunks    curchunks\n");
-		malloc_cprintf(write_cb, cbopaque,
-		    "  %13"PRIu64" %12zu %12zu\n",
-		    chunks_total, chunks_high, chunks_current);
-
-		/* Print huge stats. */
-		CTL_GET("stats.huge.nmalloc", &huge_nmalloc, uint64_t);
-		CTL_GET("stats.huge.ndalloc", &huge_ndalloc, uint64_t);
-		CTL_GET("stats.huge.allocated", &huge_allocated, size_t);
-		malloc_cprintf(write_cb, cbopaque,
-		    "huge: nmalloc      ndalloc    allocated\n");
-		malloc_cprintf(write_cb, cbopaque,
-		    " %12"PRIu64" %12"PRIu64" %12zu\n",
-		    huge_nmalloc, huge_ndalloc, huge_allocated);
+		    "Current active ceiling: %zu\n",
+		    atomic_read_z(cactive));

 		if (merged) {
 			unsigned narenas;
@ -508,12 +598,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 						ninitialized++;
 				}

-				if (ninitialized > 1 || unmerged == false) {
+				if (ninitialized > 1 || !unmerged) {
 					/* Print merged arena stats. */
 					malloc_cprintf(write_cb, cbopaque,
 					    "\nMerged arenas stats:\n");
 					stats_arena_print(write_cb, cbopaque,
-					    narenas, bins, large);
+					    narenas, bins, large, huge);
 				}
 			}
 		}
@ -539,7 +629,8 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 						    cbopaque,
 						    "\narenas[%u]:\n", i);
 						stats_arena_print(write_cb,
-						    cbopaque, i, bins, large);
+						    cbopaque, i, bins, large,
+						    huge);
 					}
 				}
 			}
--- a/src/tcache.c
+++ b/src/tcache.c
@ -4,9 +4,6 @@
 /******************************************************************************/
 /* Data. */

-malloc_tsd_data(, tcache, tcache_t *, NULL)
-malloc_tsd_data(, tcache_enabled, tcache_enabled_t, tcache_enabled_default)
-
 bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;

@ -16,6 +13,14 @@ static unsigned		stack_nelms; /* Total stack elms per tcache. */
 size_t			nhbins;
 size_t			tcache_maxclass;

+tcaches_t		*tcaches;
+
+/* Index of first element within tcaches that has never been used. */
+static unsigned		tcaches_past;
+
+/* Head of singly linked list tracking available tcaches elements. */
+static tcaches_t	*tcaches_avail;
+
 /******************************************************************************/

 size_t	tcache_salloc(const void *ptr)
@ -25,9 +30,9 @@ size_t	tcache_salloc(const void *ptr)
 }

 void
-tcache_event_hard(tcache_t *tcache)
+tcache_event_hard(tsd_t *tsd, tcache_t *tcache)
 {
-	size_t binind = tcache->next_gc_bin;
+	index_t binind = tcache->next_gc_bin;
 	tcache_bin_t *tbin = &tcache->tbins[binind];
 	tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];

@ -36,11 +41,12 @@ tcache_event_hard(tcache_t *tcache)
 		 * Flush (ceiling) 3/4 of the objects below the low water mark.
 		 */
 		if (binind < NBINS) {
-			tcache_bin_flush_small(tbin, binind, tbin->ncached -
-			    tbin->low_water + (tbin->low_water >> 2), tcache);
+			tcache_bin_flush_small(tsd, tcache, tbin, binind,
+			    tbin->ncached - tbin->low_water + (tbin->low_water
+			    >> 2));
 		} else {
-			tcache_bin_flush_large(tbin, binind, tbin->ncached -
-			    tbin->low_water + (tbin->low_water >> 2), tcache);
+			tcache_bin_flush_large(tsd, tbin, binind, tbin->ncached
+			    - tbin->low_water + (tbin->low_water >> 2), tcache);
 		}
 		/*
 		 * Reduce fill count by 2X.  Limit lg_fill_div such that the
@ -65,12 +71,13 @@ tcache_event_hard(tcache_t *tcache)
 }

 void *
-tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
+tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
+    tcache_bin_t *tbin, index_t binind)
 {
 	void *ret;

-	arena_tcache_fill_small(tcache->arena, tbin, binind,
-	    config_prof ? tcache->prof_accumbytes : 0);
+	arena_tcache_fill_small(arena, tbin, binind, config_prof ?
+	    tcache->prof_accumbytes : 0);
 	if (config_prof)
 		tcache->prof_accumbytes = 0;
 	ret = tcache_alloc_easy(tbin);
@ -79,9 +86,10 @@ tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
 }

 void
-tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
-    tcache_t *tcache)
+tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
+    index_t binind, unsigned rem)
 {
+	arena_t *arena;
 	void *ptr;
 	unsigned i, nflush, ndeferred;
 	bool merged_stats = false;
@ -89,22 +97,24 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
 	assert(binind < NBINS);
 	assert(rem <= tbin->ncached);

+	arena = arena_choose(tsd, NULL);
+	assert(arena != NULL);
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena bin associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
 		    tbin->avail[0]);
-		arena_t *arena = chunk->arena;
-		arena_bin_t *bin = &arena->bins[binind];
+		arena_t *bin_arena = extent_node_arena_get(&chunk->node);
+		arena_bin_t *bin = &bin_arena->bins[binind];

-		if (config_prof && arena == tcache->arena) {
+		if (config_prof && bin_arena == arena) {
 			if (arena_prof_accum(arena, tcache->prof_accumbytes))
 				prof_idump();
 			tcache->prof_accumbytes = 0;
 		}

 		malloc_mutex_lock(&bin->lock);
-		if (config_stats && arena == tcache->arena) {
-			assert(merged_stats == false);
+		if (config_stats && bin_arena == arena) {
+			assert(!merged_stats);
 			merged_stats = true;
 			bin->stats.nflushes++;
 			bin->stats.nrequests += tbin->tstats.nrequests;
@ -115,17 +125,13 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
 			ptr = tbin->avail[i];
 			assert(ptr != NULL);
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-			if (chunk->arena == arena) {
+			if (extent_node_arena_get(&chunk->node) == bin_arena) {
 				size_t pageind = ((uintptr_t)ptr -
 				    (uintptr_t)chunk) >> LG_PAGE;
-				arena_chunk_map_t *mapelm =
-				    arena_mapp_get(chunk, pageind);
-				if (config_fill && opt_junk) {
-					arena_alloc_junk_small(ptr,
-					    &arena_bin_info[binind], true);
-				}
-				arena_dalloc_bin_locked(arena, chunk, ptr,
-				    mapelm);
+				arena_chunk_map_bits_t *bitselm =
+				    arena_bitselm_get(chunk, pageind);
+				arena_dalloc_bin_junked_locked(bin_arena, chunk,
+				    ptr, bitselm);
 			} else {
 				/*
 				 * This object was allocated via a different
@ -139,12 +145,12 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
 		}
 		malloc_mutex_unlock(&bin->lock);
 	}
-	if (config_stats && merged_stats == false) {
+	if (config_stats && !merged_stats) {
 		/*
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		arena_bin_t *bin = &tcache->arena->bins[binind];
+		arena_bin_t *bin = &arena->bins[binind];
 		malloc_mutex_lock(&bin->lock);
 		bin->stats.nflushes++;
 		bin->stats.nrequests += tbin->tstats.nrequests;
@ -160,9 +166,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
 }

 void
-tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
-    tcache_t *tcache)
+tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind,
+    unsigned rem, tcache_t *tcache)
 {
+	arena_t *arena;
 	void *ptr;
 	unsigned i, nflush, ndeferred;
 	bool merged_stats = false;
@ -170,17 +177,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
 	assert(binind < nhbins);
 	assert(rem <= tbin->ncached);

+	arena = arena_choose(tsd, NULL);
+	assert(arena != NULL);
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
 		    tbin->avail[0]);
-		arena_t *arena = chunk->arena;
+		arena_t *locked_arena = extent_node_arena_get(&chunk->node);
 		UNUSED bool idump;

 		if (config_prof)
 			idump = false;
-		malloc_mutex_lock(&arena->lock);
-		if ((config_prof || config_stats) && arena == tcache->arena) {
+		malloc_mutex_lock(&locked_arena->lock);
+		if ((config_prof || config_stats) && locked_arena == arena) {
 			if (config_prof) {
 				idump = arena_prof_accum_locked(arena,
 				    tcache->prof_accumbytes);
@ -200,9 +209,11 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
 			ptr = tbin->avail[i];
 			assert(ptr != NULL);
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-			if (chunk->arena == arena)
-				arena_dalloc_large_locked(arena, chunk, ptr);
-			else {
+			if (extent_node_arena_get(&chunk->node) ==
+			    locked_arena) {
+				arena_dalloc_large_junked_locked(locked_arena,
+				    chunk, ptr);
+			} else {
 				/*
 				 * This object was allocated via a different
 				 * arena than the one that is currently locked.
@ -213,16 +224,15 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
 				ndeferred++;
 			}
 		}
-		malloc_mutex_unlock(&arena->lock);
+		malloc_mutex_unlock(&locked_arena->lock);
 		if (config_prof && idump)
 			prof_idump();
 	}
-	if (config_stats && merged_stats == false) {
+	if (config_stats && !merged_stats) {
 		/*
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		arena_t *arena = tcache->arena;
 		malloc_mutex_lock(&arena->lock);
 		arena->stats.nrequests_large += tbin->tstats.nrequests;
 		arena->stats.lstats[binind - NBINS].nrequests +=
@ -249,24 +259,58 @@ tcache_arena_associate(tcache_t *tcache, arena_t *arena)
 		ql_tail_insert(&arena->tcache_ql, tcache, link);
 		malloc_mutex_unlock(&arena->lock);
 	}
-	tcache->arena = arena;
 }

 void
-tcache_arena_dissociate(tcache_t *tcache)
+tcache_arena_reassociate(tcache_t *tcache, arena_t *oldarena, arena_t *newarena)
+{
+
+	tcache_arena_dissociate(tcache, oldarena);
+	tcache_arena_associate(tcache, newarena);
+}
+
+void
+tcache_arena_dissociate(tcache_t *tcache, arena_t *arena)
 {

 	if (config_stats) {
 		/* Unlink from list of extant tcaches. */
-		malloc_mutex_lock(&tcache->arena->lock);
-		ql_remove(&tcache->arena->tcache_ql, tcache, link);
-		tcache_stats_merge(tcache, tcache->arena);
-		malloc_mutex_unlock(&tcache->arena->lock);
+		malloc_mutex_lock(&arena->lock);
+		if (config_debug) {
+			bool in_ql = false;
+			tcache_t *iter;
+			ql_foreach(iter, &arena->tcache_ql, link) {
+				if (iter == tcache) {
+					in_ql = true;
+					break;
+				}
+			}
+			assert(in_ql);
+		}
+		ql_remove(&arena->tcache_ql, tcache, link);
+		tcache_stats_merge(tcache, arena);
+		malloc_mutex_unlock(&arena->lock);
 	}
 }

 tcache_t *
-tcache_create(arena_t *arena)
+tcache_get_hard(tsd_t *tsd)
+{
+	arena_t *arena;
+
+	if (!tcache_enabled_get()) {
+		if (tsd_nominal(tsd))
+			tcache_enabled_set(false); /* Memoize. */
+		return (NULL);
+	}
+	arena = arena_choose(tsd, NULL);
+	if (unlikely(arena == NULL))
+		return (NULL);
+	return (tcache_create(tsd, arena));
+}
+
+tcache_t *
+tcache_create(tsd_t *tsd, arena_t *arena)
 {
 	tcache_t *tcache;
 	size_t size, stack_offset;
@ -277,23 +321,10 @@ tcache_create(arena_t *arena)
 	size = PTR_CEILING(size);
 	stack_offset = size;
 	size += stack_nelms * sizeof(void *);
-	/*
-	 * Round up to the nearest multiple of the cacheline size, in order to
-	 * avoid the possibility of false cacheline sharing.
-	 *
-	 * That this works relies on the same logic as in ipalloc(), but we
-	 * cannot directly call ipalloc() here due to tcache bootstrapping
-	 * issues.
-	 */
-	size = (size + CACHELINE_MASK) & (-CACHELINE);
-
-	if (size <= SMALL_MAXCLASS)
-		tcache = (tcache_t *)arena_malloc_small(arena, size, true);
-	else if (size <= tcache_maxclass)
-		tcache = (tcache_t *)arena_malloc_large(arena, size, true);
-	else
-		tcache = (tcache_t *)icalloct(size, false, arena);
+	/* Avoid false cacheline sharing. */
+	size = sa2u(size, CACHELINE);

+	tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, a0get());
 	if (tcache == NULL)
 		return (NULL);

@ -307,25 +338,23 @@ tcache_create(arena_t *arena)
 		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 	}

-	tcache_tsd_set(&tcache);
-
 	return (tcache);
 }

-void
-tcache_destroy(tcache_t *tcache)
+static void
+tcache_destroy(tsd_t *tsd, tcache_t *tcache)
 {
+	arena_t *arena;
 	unsigned i;
-	size_t tcache_size;

-	tcache_arena_dissociate(tcache);
+	arena = arena_choose(tsd, NULL);
+	tcache_arena_dissociate(tcache, arena);

 	for (i = 0; i < NBINS; i++) {
 		tcache_bin_t *tbin = &tcache->tbins[i];
-		tcache_bin_flush_small(tbin, i, 0, tcache);
+		tcache_bin_flush_small(tsd, tcache, tbin, i, 0);

 		if (config_stats && tbin->tstats.nrequests != 0) {
-			arena_t *arena = tcache->arena;
 			arena_bin_t *bin = &arena->bins[i];
 			malloc_mutex_lock(&bin->lock);
 			bin->stats.nrequests += tbin->tstats.nrequests;
@ -335,10 +364,9 @@ tcache_destroy(tcache_t *tcache)

 	for (; i < nhbins; i++) {
 		tcache_bin_t *tbin = &tcache->tbins[i];
-		tcache_bin_flush_large(tbin, i, 0, tcache);
+		tcache_bin_flush_large(tsd, tbin, i, 0, tcache);

 		if (config_stats && tbin->tstats.nrequests != 0) {
-			arena_t *arena = tcache->arena;
 			malloc_mutex_lock(&arena->lock);
 			arena->stats.nrequests_large += tbin->tstats.nrequests;
 			arena->stats.lstats[i - NBINS].nrequests +=
@ -348,57 +376,33 @@ tcache_destroy(tcache_t *tcache)
 	}

 	if (config_prof && tcache->prof_accumbytes > 0 &&
-	    arena_prof_accum(tcache->arena, tcache->prof_accumbytes))
+	    arena_prof_accum(arena, tcache->prof_accumbytes))
 		prof_idump();

-	tcache_size = arena_salloc(tcache, false);
-	if (tcache_size <= SMALL_MAXCLASS) {
-		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
-		arena_t *arena = chunk->arena;
-		size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
-		    LG_PAGE;
-		arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind);
-
-		arena_dalloc_bin(arena, chunk, tcache, pageind, mapelm);
-	} else if (tcache_size <= tcache_maxclass) {
-		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
-		arena_t *arena = chunk->arena;
-
-		arena_dalloc_large(arena, chunk, tcache);
-	} else
-		idalloct(tcache, false);
+	idalloctm(tsd, tcache, false, true);
 }

 void
-tcache_thread_cleanup(void *arg)
+tcache_cleanup(tsd_t *tsd)
 {
-	tcache_t *tcache = *(tcache_t **)arg;
+	tcache_t *tcache;

-	if (tcache == TCACHE_STATE_DISABLED) {
-		/* Do nothing. */
-	} else if (tcache == TCACHE_STATE_REINCARNATED) {
-		/*
-		 * Another destructor called an allocator function after this
-		 * destructor was called.  Reset tcache to
-		 * TCACHE_STATE_PURGATORY in order to receive another callback.
-		 */
-		tcache = TCACHE_STATE_PURGATORY;
-		tcache_tsd_set(&tcache);
-	} else if (tcache == TCACHE_STATE_PURGATORY) {
-		/*
-		 * The previous time this destructor was called, we set the key
-		 * to TCACHE_STATE_PURGATORY so that other destructors wouldn't
-		 * cause re-creation of the tcache.  This time, do nothing, so
-		 * that the destructor will not be called again.
-		 */
-	} else if (tcache != NULL) {
-		assert(tcache != TCACHE_STATE_PURGATORY);
-		tcache_destroy(tcache);
-		tcache = TCACHE_STATE_PURGATORY;
-		tcache_tsd_set(&tcache);
+	if (!config_tcache)
+		return;
+
+	if ((tcache = tsd_tcache_get(tsd)) != NULL) {
+		tcache_destroy(tsd, tcache);
+		tsd_tcache_set(tsd, NULL);
 	}
 }

+void
+tcache_enabled_cleanup(tsd_t *tsd)
+{
+
+	/* Do nothing. */
+}
+
 /* Caller must own arena->lock. */
 void
 tcache_stats_merge(tcache_t *tcache, arena_t *arena)
@ -427,7 +431,86 @ tcache_stats_merge(tcache_t *tcache, arena_t *arena)
 }

+/*
+ * Create a tcache and record it in the global tcaches table, which is
+ * allocated lazily on the first call.  A slot from the tcaches_avail list is
+ * reused if one exists; otherwise the never-used slot at index tcaches_past
+ * is consumed.  On success, store the slot index in *r_ind and return false.
+ * Return true if the table cannot be allocated, no slot is free, or tcache
+ * creation fails.
+ */
 bool
-tcache_boot0(void)
+tcaches_create(tsd_t *tsd, unsigned *r_ind)
+{
+	tcache_t *tcache;
+	tcaches_t *elm;
+
+	if (tcaches == NULL) {
+		/* Size the table by its element type (tcaches_t). */
+		tcaches = base_alloc(sizeof(tcaches_t) *
+		    (MALLOCX_TCACHE_MAX+1));
+		if (tcaches == NULL)
+			return (true);
+	}
+
+	if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX)
+		return (true);
+	tcache = tcache_create(tsd, a0get());
+	if (tcache == NULL)
+		return (true);
+
+	if (tcaches_avail != NULL) {
+		elm = tcaches_avail;
+		tcaches_avail = tcaches_avail->next;
+		elm->tcache = tcache;
+		*r_ind = (unsigned)(elm - tcaches);
+	} else {
+		elm = &tcaches[tcaches_past];
+		elm->tcache = tcache;
+		*r_ind = tcaches_past;
+		tcaches_past++;
+	}
+
+	return (false);
+}
+
+/* Destroy the tcache held in elm, if any, and leave the slot empty. */
+static void
+tcaches_elm_flush(tsd_t *tsd, tcaches_t *elm)
+{
+
+	if (elm->tcache == NULL)
+		return;
+	tcache_destroy(tsd, elm->tcache);
+	elm->tcache = NULL;
+}
+
+/*
+ * Destroy the tcache stored in slot ind.  The slot itself is left in place;
+ * it is not pushed onto tcaches_avail.
+ */
+void
+tcaches_flush(tsd_t *tsd, unsigned ind)
+{
+
+	tcaches_elm_flush(tsd, &tcaches[ind]);
+}
+
+/*
+ * Destroy the tcache stored in slot ind and push the slot onto tcaches_avail
+ * so that tcaches_create() can reuse it.
+ */
+void
+tcaches_destroy(tsd_t *tsd, unsigned ind)
+{
+	tcaches_t *elm = &tcaches[ind];
+
+	tcaches_elm_flush(tsd, elm);
+	elm->next = tcaches_avail;
+	tcaches_avail = elm;
+}
+
+bool
+tcache_boot(void)
 {
 	unsigned i;

@ -442,7 +506,7 @@ tcache_boot0(void)
 	else
 		tcache_maxclass = (1U << opt_lg_tcache_max);

-	nhbins = NBINS + (tcache_maxclass >> LG_PAGE);
+	nhbins = size2index(tcache_maxclass) + 1;

 	/* Initialize tcache_bin_info. */
 	tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins *
@ -451,7 +515,11 @@ tcache_boot0(void)
 		return (true);
 	stack_nelms = 0;
 	for (i = 0; i < NBINS; i++) {
-		if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) {
+		if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
+			tcache_bin_info[i].ncached_max =
+			    TCACHE_NSLOTS_SMALL_MIN;
+		} else if ((arena_bin_info[i].nregs << 1) <=
+		    TCACHE_NSLOTS_SMALL_MAX) {
 			tcache_bin_info[i].ncached_max =
 			    (arena_bin_info[i].nregs << 1);
 		} else {
@ -467,13 +535,3 @@ tcache_boot0(void)

 	return (false);
 }
-
-bool
-tcache_boot1(void)
-{
-
-	if (tcache_tsd_boot() || tcache_enabled_tsd_boot())
-		return (true);
-
-	return (false);
-}
--- a/src/tsd.c
+++ b/src/tsd.c
@ -7,21 +7,22 @@
 static unsigned ncleanups;
 static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX];

+malloc_tsd_data(, , tsd_t, TSD_INITIALIZER)
+
 /******************************************************************************/

 void *
 malloc_tsd_malloc(size_t size)
 {

-	/* Avoid choose_arena() in order to dodge bootstrapping issues. */
-	return (arena_malloc(arenas[0], size, false, false));
+	return (a0malloc(CACHELINE_CEILING(size)));
 }

 void
 malloc_tsd_dalloc(void *wrapper)
 {

-	idalloct(wrapper, false);
+	a0dalloc(wrapper);
 }

 void
@ -67,10 +68,58 @@ malloc_tsd_cleanup_register(bool (*f)(void))
 }

 void
-malloc_tsd_boot(void)
+tsd_cleanup(void *arg)
+{
+	tsd_t *tsd = (tsd_t *)arg;
+
+	switch (tsd->state) {
+	case tsd_state_nominal:
+#define O(n, t)								\
+		n##_cleanup(tsd);
+MALLOC_TSD
+#undef O
+		tsd->state = tsd_state_purgatory;
+		tsd_set(tsd);
+		break;
+	case tsd_state_purgatory:
+		/*
+		 * The previous time this destructor was called, we set the
+		 * state to tsd_state_purgatory so that other destructors
+		 * wouldn't cause re-creation of the tsd.  This time, do
+		 * nothing, and do not request another callback.
+		 */
+		break;
+	case tsd_state_reincarnated:
+		/*
+		 * Another destructor deallocated memory after this destructor
+		 * was called.  Reset state to tsd_state_purgatory and request
+		 * another callback.
+		 */
+		tsd->state = tsd_state_purgatory;
+		tsd_set(tsd);
+		break;
+	default:
+		not_reached();
+	}
+}
+
+bool
+malloc_tsd_boot0(void)
 {

 	ncleanups = 0;
+	if (tsd_boot0())
+		return (true);
+	*tsd_arenas_cache_bypassp_get(tsd_fetch()) = true;
+	return (false);
+}
+
+void
+malloc_tsd_boot1(void)
+{
+
+	tsd_boot1();
+	*tsd_arenas_cache_bypassp_get(tsd_fetch()) = false;
 }

 #ifdef _WIN32
@ -102,7 +151,7 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved)
 #  pragma section(".CRT$XLY",long,read)
 #endif
 JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used)
-static const BOOL	(WINAPI *tls_callback)(HINSTANCE hinstDLL,
+static BOOL	(WINAPI *const tls_callback)(HINSTANCE hinstDLL,
    DWORD fdwReason, LPVOID lpvReserved) = _tls_callback;
 #endif

--- a/src/util.c
+++ b/src/util.c
@ -81,10 +81,10 @@ buferror(int err, char *buf, size_t buflen)
 {

 #ifdef _WIN32
-	FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0,
+	FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0,
 	    (LPSTR)buf, buflen, NULL);
 	return (0);
-#elif defined(_GNU_SOURCE)
+#elif defined(__GLIBC__) && defined(_GNU_SOURCE)
 	char *b = strerror_r(err, buf, buflen);
 	if (b != buf) {
 		strncpy(buf, b, buflen);
@ -100,7 +100,7 @@ uintmax_t
 malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base)
 {
 	uintmax_t ret, digit;
-	int b;
+	unsigned b;
 	bool neg;
 	const char *p, *ns;

@ -266,7 +266,7 @@ d2s(intmax_t x, char sign, char *s, size_t *slen_p)
 		sign = '-';
 	switch (sign) {
 	case '-':
-		if (neg == false)
+		if (!neg)
 			break;
 		/* Fall through. */
 	case ' ':
@ -329,7 +329,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
 	/* Left padding. */						\
 	size_t pad_len = (width == -1) ? 0 : ((slen < (size_t)width) ?	\
 	    (size_t)width - slen : 0);					\
-	if (left_justify == false && pad_len != 0) {			\
+	if (!left_justify && pad_len != 0) {				\
 		size_t j;						\
 		for (j = 0; j < pad_len; j++)				\
 			APPEND_C(' ');					\
@ -381,7 +381,9 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
 	case 'p': /* Synthetic; used for %p. */				\
 		val = va_arg(ap, uintptr_t);				\
 		break;							\
-	default: not_reached();						\
+	default:							\
+		not_reached();						\
+		val = 0;						\
 	}								\
 } while (0)

@ -404,19 +406,19 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
 			while (true) {
 				switch (*f) {
 				case '#':
-					assert(alt_form == false);
+					assert(!alt_form);
 					alt_form = true;
 					break;
 				case '-':
-					assert(left_justify == false);
+					assert(!left_justify);
 					left_justify = true;
 					break;
 				case ' ':
-					assert(plus_space == false);
+					assert(!plus_space);
 					plus_space = true;
 					break;
 				case '+':
-					assert(plus_plus == false);
+					assert(!plus_plus);
 					plus_plus = true;
 					break;
 				default: goto label_width;
@ -548,7 +550,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
 				assert(len == '?' || len == 'l');
 				assert_not_implemented(len != 'l');
 				s = va_arg(ap, char *);
-				slen = (prec < 0) ? strlen(s) : prec;
+				slen = (prec < 0) ? strlen(s) : (size_t)prec;
 				APPEND_PADDED_S(s, slen, width, left_justify);
 				f++;
 				break;
@ -584,7 +586,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
 	return (ret);
 }

-JEMALLOC_ATTR(format(printf, 3, 4))
+JEMALLOC_FORMAT_PRINTF(3, 4)
 int
 malloc_snprintf(char *str, size_t size, const char *format, ...)
 {
@ -623,7 +625,7 @@ malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
 * Print to a callback function in such a way as to (hopefully) avoid memory
 * allocation.
 */
-JEMALLOC_ATTR(format(printf, 3, 4))
+JEMALLOC_FORMAT_PRINTF(3, 4)
 void
 malloc_cprintf(void (*write_cb)(void *, const char *), void *cbopaque,
    const char *format, ...)
@ -636,7 +638,7 @@ malloc_cprintf(void (*write_cb)(void *, const char *), void *cbopaque,
 }

 /* Print to stderr in such a way as to avoid memory allocation. */
-JEMALLOC_ATTR(format(printf, 1, 2))
+JEMALLOC_FORMAT_PRINTF(1, 2)
 void
 malloc_printf(const char *format, ...)
 {
--- a/src/valgrind.c
+++ b/src/valgrind.c
@ -0,0 +1,34 @@
+#include "jemalloc/internal/jemalloc_internal.h"
+#ifndef JEMALLOC_VALGRIND
+#  error "This source file is for Valgrind integration."
+#endif
+
+#include <valgrind/memcheck.h>
+
+void
+valgrind_make_mem_noaccess(void *ptr, size_t usize)
+{
+
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, usize);
+}
+
+void
+valgrind_make_mem_undefined(void *ptr, size_t usize)
+{
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize);
+}
+
+void
+valgrind_make_mem_defined(void *ptr, size_t usize)
+{
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, usize);
+}
+
+void
+valgrind_freelike_block(void *ptr, size_t usize)
+{
+
+	VALGRIND_FREELIKE_BLOCK(ptr, usize);
+}
--- a/src/zone.c
+++ b/src/zone.c
@ -176,6 +176,7 @@ register_zone(void)
 	 * register jemalloc's.
 	 */
 	malloc_zone_t *default_zone = malloc_default_zone();
+	malloc_zone_t *purgeable_zone = NULL;
 	if (!default_zone->zone_name ||
 	    strcmp(default_zone->zone_name, "DefaultMallocZone") != 0) {
 		return;
@ -237,22 +238,37 @@ register_zone(void)
 	 * run time.
 	 */
 	if (malloc_default_purgeable_zone != NULL)
-		malloc_default_purgeable_zone();
+		purgeable_zone = malloc_default_purgeable_zone();

 	/* Register the custom zone.  At this point it won't be the default. */
 	malloc_zone_register(&zone);

-	/*
-	 * Unregister and reregister the default zone.  On OSX >= 10.6,
-	 * unregistering takes the last registered zone and places it at the
-	 * location of the specified zone.  Unregistering the default zone thus
-	 * makes the last registered one the default.  On OSX < 10.6,
-	 * unregistering shifts all registered zones.  The first registered zone
-	 * then becomes the default.
-	 */
 	do {
 		default_zone = malloc_default_zone();
+		/*
+		 * Unregister and reregister the default zone.  On OSX >= 10.6,
+		 * unregistering takes the last registered zone and places it
+		 * at the location of the specified zone.  Unregistering the
+		 * default zone thus makes the last registered one the default.
+		 * On OSX < 10.6, unregistering shifts all registered zones.
+		 * The first registered zone then becomes the default.
+		 */
 		malloc_zone_unregister(default_zone);
 		malloc_zone_register(default_zone);
+		/*
+		 * On OSX 10.6, having the default purgeable zone appear before
+		 * the default zone causes crashes, because the purgeable zone
+		 * then thinks it owns pointers allocated by the default zone.
+		 * We thus unregister/re-register it to ensure it always comes
+		 * after the default zone.  On OSX < 10.6, there is no purgeable
+		 * zone, so this does nothing.  On OSX >= 10.6, unregistering
+		 * replaces the purgeable zone with the last registered zone
+		 * above, i.e. the default zone.  Registering it again then puts
+		 * it at the end, obviously after the default zone.
+		 */
+		if (purgeable_zone) {
+			malloc_zone_unregister(purgeable_zone);
+			malloc_zone_register(purgeable_zone);
+		}
 	} while (malloc_default_zone() != &zone);
 }
--- a/test/include/test/btalloc.h
+++ b/test/include/test/btalloc.h
@ -0,0 +1,31 @@
+/* btalloc() provides a mechanism for allocating via permuted backtraces. */
+void	*btalloc(size_t size, unsigned bits);
+
+#define	btalloc_n_proto(n)						\
+void	*btalloc_##n(size_t size, unsigned bits);
+btalloc_n_proto(0)
+btalloc_n_proto(1)
+
+#define	btalloc_n_gen(n)						\
+void *									\
+btalloc_##n(size_t size, unsigned bits)					\
+{									\
+	void *p;							\
+									\
+	if (bits == 0)							\
+		p = mallocx(size, 0);					\
+	else {								\
+		switch (bits & 0x1U) {					\
+		case 0:							\
+			p = (btalloc_0(size, bits >> 1));		\
+			break;						\
+		case 1:							\
+			p = (btalloc_1(size, bits >> 1));		\
+			break;						\
+		default: not_reached();					\
+		}							\
+	}								\
+	/* Intentionally sabotage tail call optimization. */		\
+	assert_ptr_not_null(p, "Unexpected mallocx() failure");		\
+	return (p);							\
+}
--- a/test/include/test/jemalloc_test.h.in
+++ b/test/include/test/jemalloc_test.h.in
@ -1,13 +1,21 @@
+#include <limits.h>
+#ifndef SIZE_T_MAX
+#  define SIZE_T_MAX	SIZE_MAX
+#endif
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <errno.h>
-#include <inttypes.h>
 #include <math.h>
 #include <string.h>
+#ifdef _WIN32
+#  include "msvc_compat/strings.h"
+#endif
+#include <sys/time.h>

 #ifdef _WIN32
 #  include <windows.h>
+#  include "msvc_compat/windows_extra.h"
 #else
 #  include <pthread.h>
 #endif
@ -132,10 +140,12 @@
 /*
 * Common test utilities.
 */
+#include "test/btalloc.h"
 #include "test/math.h"
 #include "test/mtx.h"
 #include "test/mq.h"
 #include "test/test.h"
+#include "test/timer.h"
 #include "test/thd.h"
 #define	MEXP 19937
 #include "test/SFMT.h"
--- a/test/include/test/jemalloc_test_defs.h.in
+++ b/test/include/test/jemalloc_test_defs.h.in
@ -1,5 +1,9 @@
 #include "jemalloc/internal/jemalloc_internal_defs.h"
+#include "jemalloc/internal/jemalloc_internal_decls.h"

-/* For use by SFMT. */
+/*
+ * For use by SFMT.  configure.ac doesn't actually define HAVE_SSE2 because its
+ * dependencies are notoriously unportable in practice.
+ */
 #undef HAVE_SSE2
 #undef HAVE_ALTIVEC
--- a/test/include/test/math.h
+++ b/test/include/test/math.h
@ -299,7 +299,7 @@ pt_chi2(double p, double df, double ln_gamma_df_2)

 /*
 * Given a value p in [0..1] and Gamma distribution shape and scale parameters,
- * compute the upper limit on the definite integeral from [0..z] that satisfies
+ * compute the upper limit on the definite integral from [0..z] that satisfies
 * p.
 */
 JEMALLOC_INLINE double
--- a/test/include/test/mq.h
+++ b/test/include/test/mq.h
@ -1,3 +1,5 @@
+void	mq_nanosleep(unsigned ns);
+
 /*
 * Simple templated message queue implementation that relies on only mutexes for
 * synchronization (which reduces portability issues).  Given the following
@ -75,26 +77,23 @@ a_attr a_mq_msg_type *							\
 a_prefix##get(a_mq_type *mq)						\
 {									\
 	a_mq_msg_type *msg;						\
-	struct timespec timeout;					\
+	unsigned ns;							\
 									\
 	msg = a_prefix##tryget(mq);					\
 	if (msg != NULL)						\
 		return (msg);						\
 									\
-	timeout.tv_sec = 0;						\
-	timeout.tv_nsec = 1;						\
+	ns = 1;								\
 	while (true) {							\
-		nanosleep(&timeout, NULL);				\
+		mq_nanosleep(ns);					\
 		msg = a_prefix##tryget(mq);				\
 		if (msg != NULL)					\
 			return (msg);					\
-		if (timeout.tv_sec == 0) {				\
+		if (ns < 1000*1000*1000) {				\
 			/* Double sleep time, up to max 1 second. */	\
-			timeout.tv_nsec <<= 1;				\
-			if (timeout.tv_nsec >= 1000*1000*1000) {	\
-				timeout.tv_sec = 1;			\
-				timeout.tv_nsec = 0;			\
-			}						\
+			ns <<= 1;					\
+			if (ns > 1000*1000*1000)			\
+				ns = 1000*1000*1000;			\
 		}							\
 	}								\
 }									\
--- a/test/include/test/test.h
+++ b/test/include/test/test.h
@ -1,6 +1,6 @@
 #define	ASSERT_BUFSIZE	256

-#define	assert_cmp(t, a, b, cmp, neg_cmp, pri, fmt...) do {		\
+#define	assert_cmp(t, a, b, cmp, neg_cmp, pri, ...) do {		\
 	t a_ = (a);							\
 	t b_ = (b);							\
 	if (!(a_ cmp b_)) {						\
@ -12,205 +12,205 @@
 		    "%"pri" "#neg_cmp" %"pri": ",			\
 		    __func__, __FILE__, __LINE__,			\
 		    #a, #b, a_, b_);					\
-		malloc_snprintf(message, sizeof(message), fmt);		\
+		malloc_snprintf(message, sizeof(message), __VA_ARGS__);	\
 		p_test_fail(prefix, message);				\
 	}								\
 } while (0)

-#define	assert_ptr_eq(a, b, fmt...)	assert_cmp(void *, a, b, ==,	\
-    !=, "p", fmt)
-#define	assert_ptr_ne(a, b, fmt...)	assert_cmp(void *, a, b, !=,	\
-    ==, "p", fmt)
-#define	assert_ptr_null(a, fmt...)	assert_cmp(void *, a, NULL, ==,	\
-    !=, "p", fmt)
-#define	assert_ptr_not_null(a, fmt...)	assert_cmp(void *, a, NULL, !=,	\
-    ==, "p", fmt)
+#define	assert_ptr_eq(a, b, ...)	assert_cmp(void *, a, b, ==,	\
+    !=, "p", __VA_ARGS__)
+#define	assert_ptr_ne(a, b, ...)	assert_cmp(void *, a, b, !=,	\
+    ==, "p", __VA_ARGS__)
+#define	assert_ptr_null(a, ...)		assert_cmp(void *, a, NULL, ==,	\
+    !=, "p", __VA_ARGS__)
+#define	assert_ptr_not_null(a, ...)	assert_cmp(void *, a, NULL, !=,	\
+    ==, "p", __VA_ARGS__)

-#define	assert_c_eq(a, b, fmt...)	assert_cmp(char, a, b, ==, !=, "c", fmt)
-#define	assert_c_ne(a, b, fmt...)	assert_cmp(char, a, b, !=, ==, "c", fmt)
-#define	assert_c_lt(a, b, fmt...)	assert_cmp(char, a, b, <, >=, "c", fmt)
-#define	assert_c_le(a, b, fmt...)	assert_cmp(char, a, b, <=, >, "c", fmt)
-#define	assert_c_ge(a, b, fmt...)	assert_cmp(char, a, b, >=, <, "c", fmt)
-#define	assert_c_gt(a, b, fmt...)	assert_cmp(char, a, b, >, <=, "c", fmt)
+#define	assert_c_eq(a, b, ...)	assert_cmp(char, a, b, ==, !=, "c", __VA_ARGS__)
+#define	assert_c_ne(a, b, ...)	assert_cmp(char, a, b, !=, ==, "c", __VA_ARGS__)
+#define	assert_c_lt(a, b, ...)	assert_cmp(char, a, b, <, >=, "c", __VA_ARGS__)
+#define	assert_c_le(a, b, ...)	assert_cmp(char, a, b, <=, >, "c", __VA_ARGS__)
+#define	assert_c_ge(a, b, ...)	assert_cmp(char, a, b, >=, <, "c", __VA_ARGS__)
+#define	assert_c_gt(a, b, ...)	assert_cmp(char, a, b, >, <=, "c", __VA_ARGS__)

-#define	assert_x_eq(a, b, fmt...)	assert_cmp(int, a, b, ==, !=, "#x", fmt)
-#define	assert_x_ne(a, b, fmt...)	assert_cmp(int, a, b, !=, ==, "#x", fmt)
-#define	assert_x_lt(a, b, fmt...)	assert_cmp(int, a, b, <, >=, "#x", fmt)
-#define	assert_x_le(a, b, fmt...)	assert_cmp(int, a, b, <=, >, "#x", fmt)
-#define	assert_x_ge(a, b, fmt...)	assert_cmp(int, a, b, >=, <, "#x", fmt)
-#define	assert_x_gt(a, b, fmt...)	assert_cmp(int, a, b, >, <=, "#x", fmt)
+#define	assert_x_eq(a, b, ...)	assert_cmp(int, a, b, ==, !=, "#x", __VA_ARGS__)
+#define	assert_x_ne(a, b, ...)	assert_cmp(int, a, b, !=, ==, "#x", __VA_ARGS__)
+#define	assert_x_lt(a, b, ...)	assert_cmp(int, a, b, <, >=, "#x", __VA_ARGS__)
+#define	assert_x_le(a, b, ...)	assert_cmp(int, a, b, <=, >, "#x", __VA_ARGS__)
+#define	assert_x_ge(a, b, ...)	assert_cmp(int, a, b, >=, <, "#x", __VA_ARGS__)
+#define	assert_x_gt(a, b, ...)	assert_cmp(int, a, b, >, <=, "#x", __VA_ARGS__)

-#define	assert_d_eq(a, b, fmt...)	assert_cmp(int, a, b, ==, !=, "d", fmt)
-#define	assert_d_ne(a, b, fmt...)	assert_cmp(int, a, b, !=, ==, "d", fmt)
-#define	assert_d_lt(a, b, fmt...)	assert_cmp(int, a, b, <, >=, "d", fmt)
-#define	assert_d_le(a, b, fmt...)	assert_cmp(int, a, b, <=, >, "d", fmt)
-#define	assert_d_ge(a, b, fmt...)	assert_cmp(int, a, b, >=, <, "d", fmt)
-#define	assert_d_gt(a, b, fmt...)	assert_cmp(int, a, b, >, <=, "d", fmt)
+#define	assert_d_eq(a, b, ...)	assert_cmp(int, a, b, ==, !=, "d", __VA_ARGS__)
+#define	assert_d_ne(a, b, ...)	assert_cmp(int, a, b, !=, ==, "d", __VA_ARGS__)
+#define	assert_d_lt(a, b, ...)	assert_cmp(int, a, b, <, >=, "d", __VA_ARGS__)
+#define	assert_d_le(a, b, ...)	assert_cmp(int, a, b, <=, >, "d", __VA_ARGS__)
+#define	assert_d_ge(a, b, ...)	assert_cmp(int, a, b, >=, <, "d", __VA_ARGS__)
+#define	assert_d_gt(a, b, ...)	assert_cmp(int, a, b, >, <=, "d", __VA_ARGS__)

-#define	assert_u_eq(a, b, fmt...)	assert_cmp(int, a, b, ==, !=, "u", fmt)
-#define	assert_u_ne(a, b, fmt...)	assert_cmp(int, a, b, !=, ==, "u", fmt)
-#define	assert_u_lt(a, b, fmt...)	assert_cmp(int, a, b, <, >=, "u", fmt)
-#define	assert_u_le(a, b, fmt...)	assert_cmp(int, a, b, <=, >, "u", fmt)
-#define	assert_u_ge(a, b, fmt...)	assert_cmp(int, a, b, >=, <, "u", fmt)
-#define	assert_u_gt(a, b, fmt...)	assert_cmp(int, a, b, >, <=, "u", fmt)
+#define	assert_u_eq(a, b, ...)	assert_cmp(unsigned, a, b, ==, !=, "u", __VA_ARGS__)
+#define	assert_u_ne(a, b, ...)	assert_cmp(unsigned, a, b, !=, ==, "u", __VA_ARGS__)
+#define	assert_u_lt(a, b, ...)	assert_cmp(unsigned, a, b, <, >=, "u", __VA_ARGS__)
+#define	assert_u_le(a, b, ...)	assert_cmp(unsigned, a, b, <=, >, "u", __VA_ARGS__)
+#define	assert_u_ge(a, b, ...)	assert_cmp(unsigned, a, b, >=, <, "u", __VA_ARGS__)
+#define	assert_u_gt(a, b, ...)	assert_cmp(unsigned, a, b, >, <=, "u", __VA_ARGS__)

-#define	assert_ld_eq(a, b, fmt...)	assert_cmp(long, a, b, ==,	\
-    !=, "ld", fmt)
-#define	assert_ld_ne(a, b, fmt...)	assert_cmp(long, a, b, !=,	\
-    ==, "ld", fmt)
-#define	assert_ld_lt(a, b, fmt...)	assert_cmp(long, a, b, <,	\
-    >=, "ld", fmt)
-#define	assert_ld_le(a, b, fmt...)	assert_cmp(long, a, b, <=,	\
-    >, "ld", fmt)
-#define	assert_ld_ge(a, b, fmt...)	assert_cmp(long, a, b, >=,	\
-    <, "ld", fmt)
-#define	assert_ld_gt(a, b, fmt...)	assert_cmp(long, a, b, >,	\
-    <=, "ld", fmt)
+#define	assert_ld_eq(a, b, ...)	assert_cmp(long, a, b, ==,	\
+    !=, "ld", __VA_ARGS__)
+#define	assert_ld_ne(a, b, ...)	assert_cmp(long, a, b, !=,	\
+    ==, "ld", __VA_ARGS__)
+#define	assert_ld_lt(a, b, ...)	assert_cmp(long, a, b, <,	\
+    >=, "ld", __VA_ARGS__)
+#define	assert_ld_le(a, b, ...)	assert_cmp(long, a, b, <=,	\
+    >, "ld", __VA_ARGS__)
+#define	assert_ld_ge(a, b, ...)	assert_cmp(long, a, b, >=,	\
+    <, "ld", __VA_ARGS__)
+#define	assert_ld_gt(a, b, ...)	assert_cmp(long, a, b, >,	\
+    <=, "ld", __VA_ARGS__)

-#define	assert_lu_eq(a, b, fmt...)	assert_cmp(unsigned long,	\
-    a, b, ==, !=, "lu", fmt)
-#define	assert_lu_ne(a, b, fmt...)	assert_cmp(unsigned long,	\
-    a, b, !=, ==, "lu", fmt)
-#define	assert_lu_lt(a, b, fmt...)	assert_cmp(unsigned long,	\
-    a, b, <, >=, "lu", fmt)
-#define	assert_lu_le(a, b, fmt...)	assert_cmp(unsigned long,	\
-    a, b, <=, >, "lu", fmt)
-#define	assert_lu_ge(a, b, fmt...)	assert_cmp(unsigned long,	\
-    a, b, >=, <, "lu", fmt)
-#define	assert_lu_gt(a, b, fmt...)	assert_cmp(unsigned long,	\
-    a, b, >, <=, "lu", fmt)
+#define	assert_lu_eq(a, b, ...)	assert_cmp(unsigned long,	\
+    a, b, ==, !=, "lu", __VA_ARGS__)
+#define	assert_lu_ne(a, b, ...)	assert_cmp(unsigned long,	\
+    a, b, !=, ==, "lu", __VA_ARGS__)
+#define	assert_lu_lt(a, b, ...)	assert_cmp(unsigned long,	\
+    a, b, <, >=, "lu", __VA_ARGS__)
+#define	assert_lu_le(a, b, ...)	assert_cmp(unsigned long,	\
+    a, b, <=, >, "lu", __VA_ARGS__)
+#define	assert_lu_ge(a, b, ...)	assert_cmp(unsigned long,	\
+    a, b, >=, <, "lu", __VA_ARGS__)
+#define	assert_lu_gt(a, b, ...)	assert_cmp(unsigned long,	\
+    a, b, >, <=, "lu", __VA_ARGS__)

-#define	assert_qd_eq(a, b, fmt...)	assert_cmp(long long, a, b, ==,	\
-    !=, "qd", fmt)
-#define	assert_qd_ne(a, b, fmt...)	assert_cmp(long long, a, b, !=,	\
-    ==, "qd", fmt)
-#define	assert_qd_lt(a, b, fmt...)	assert_cmp(long long, a, b, <,	\
-    >=, "qd", fmt)
-#define	assert_qd_le(a, b, fmt...)	assert_cmp(long long, a, b, <=,	\
-    >, "qd", fmt)
-#define	assert_qd_ge(a, b, fmt...)	assert_cmp(long long, a, b, >=,	\
-    <, "qd", fmt)
-#define	assert_qd_gt(a, b, fmt...)	assert_cmp(long long, a, b, >,	\
-    <=, "qd", fmt)
+#define	assert_qd_eq(a, b, ...)	assert_cmp(long long, a, b, ==,	\
+    !=, "qd", __VA_ARGS__)
+#define	assert_qd_ne(a, b, ...)	assert_cmp(long long, a, b, !=,	\
+    ==, "qd", __VA_ARGS__)
+#define	assert_qd_lt(a, b, ...)	assert_cmp(long long, a, b, <,	\
+    >=, "qd", __VA_ARGS__)
+#define	assert_qd_le(a, b, ...)	assert_cmp(long long, a, b, <=,	\
+    >, "qd", __VA_ARGS__)
+#define	assert_qd_ge(a, b, ...)	assert_cmp(long long, a, b, >=,	\
+    <, "qd", __VA_ARGS__)
+#define	assert_qd_gt(a, b, ...)	assert_cmp(long long, a, b, >,	\
+    <=, "qd", __VA_ARGS__)

-#define	assert_qu_eq(a, b, fmt...)	assert_cmp(unsigned long long,	\
-    a, b, ==, !=, "qu", fmt)
-#define	assert_qu_ne(a, b, fmt...)	assert_cmp(unsigned long long,	\
-    a, b, !=, ==, "qu", fmt)
-#define	assert_qu_lt(a, b, fmt...)	assert_cmp(unsigned long long,	\
-    a, b, <, >=, "qu", fmt)
-#define	assert_qu_le(a, b, fmt...)	assert_cmp(unsigned long long,	\
-    a, b, <=, >, "qu", fmt)
-#define	assert_qu_ge(a, b, fmt...)	assert_cmp(unsigned long long,	\
-    a, b, >=, <, "qu", fmt)
-#define	assert_qu_gt(a, b, fmt...)	assert_cmp(unsigned long long,	\
-    a, b, >, <=, "qu", fmt)
+#define	assert_qu_eq(a, b, ...)	assert_cmp(unsigned long long,	\
+    a, b, ==, !=, "qu", __VA_ARGS__)
+#define	assert_qu_ne(a, b, ...)	assert_cmp(unsigned long long,	\
+    a, b, !=, ==, "qu", __VA_ARGS__)
+#define	assert_qu_lt(a, b, ...)	assert_cmp(unsigned long long,	\
+    a, b, <, >=, "qu", __VA_ARGS__)
+#define	assert_qu_le(a, b, ...)	assert_cmp(unsigned long long,	\
+    a, b, <=, >, "qu", __VA_ARGS__)
+#define	assert_qu_ge(a, b, ...)	assert_cmp(unsigned long long,	\
+    a, b, >=, <, "qu", __VA_ARGS__)
+#define	assert_qu_gt(a, b, ...)	assert_cmp(unsigned long long,	\
+    a, b, >, <=, "qu", __VA_ARGS__)

-#define	assert_jd_eq(a, b, fmt...)	assert_cmp(intmax_t, a, b, ==,	\
-    !=, "jd", fmt)
-#define	assert_jd_ne(a, b, fmt...)	assert_cmp(intmax_t, a, b, !=,	\
-    ==, "jd", fmt)
-#define	assert_jd_lt(a, b, fmt...)	assert_cmp(intmax_t, a, b, <,	\
-    >=, "jd", fmt)
-#define	assert_jd_le(a, b, fmt...)	assert_cmp(intmax_t, a, b, <=,	\
-    >, "jd", fmt)
-#define	assert_jd_ge(a, b, fmt...)	assert_cmp(intmax_t, a, b, >=,	\
-    <, "jd", fmt)
-#define	assert_jd_gt(a, b, fmt...)	assert_cmp(intmax_t, a, b, >,	\
-    <=, "jd", fmt)
+#define	assert_jd_eq(a, b, ...)	assert_cmp(intmax_t, a, b, ==,	\
+    !=, "jd", __VA_ARGS__)
+#define	assert_jd_ne(a, b, ...)	assert_cmp(intmax_t, a, b, !=,	\
+    ==, "jd", __VA_ARGS__)
+#define	assert_jd_lt(a, b, ...)	assert_cmp(intmax_t, a, b, <,	\
+    >=, "jd", __VA_ARGS__)
+#define	assert_jd_le(a, b, ...)	assert_cmp(intmax_t, a, b, <=,	\
+    >, "jd", __VA_ARGS__)
+#define	assert_jd_ge(a, b, ...)	assert_cmp(intmax_t, a, b, >=,	\
+    <, "jd", __VA_ARGS__)
+#define	assert_jd_gt(a, b, ...)	assert_cmp(intmax_t, a, b, >,	\
+    <=, "jd", __VA_ARGS__)

-#define	assert_ju_eq(a, b, fmt...)	assert_cmp(uintmax_t, a, b, ==,	\
-    !=, "ju", fmt)
-#define	assert_ju_ne(a, b, fmt...)	assert_cmp(uintmax_t, a, b, !=,	\
-    ==, "ju", fmt)
-#define	assert_ju_lt(a, b, fmt...)	assert_cmp(uintmax_t, a, b, <,	\
-    >=, "ju", fmt)
-#define	assert_ju_le(a, b, fmt...)	assert_cmp(uintmax_t, a, b, <=,	\
-    >, "ju", fmt)
-#define	assert_ju_ge(a, b, fmt...)	assert_cmp(uintmax_t, a, b, >=,	\
-    <, "ju", fmt)
-#define	assert_ju_gt(a, b, fmt...)	assert_cmp(uintmax_t, a, b, >,	\
-    <=, "ju", fmt)
+#define	assert_ju_eq(a, b, ...)	assert_cmp(uintmax_t, a, b, ==,	\
+    !=, "ju", __VA_ARGS__)
+#define	assert_ju_ne(a, b, ...)	assert_cmp(uintmax_t, a, b, !=,	\
+    ==, "ju", __VA_ARGS__)
+#define	assert_ju_lt(a, b, ...)	assert_cmp(uintmax_t, a, b, <,	\
+    >=, "ju", __VA_ARGS__)
+#define	assert_ju_le(a, b, ...)	assert_cmp(uintmax_t, a, b, <=,	\
+    >, "ju", __VA_ARGS__)
+#define	assert_ju_ge(a, b, ...)	assert_cmp(uintmax_t, a, b, >=,	\
+    <, "ju", __VA_ARGS__)
+#define	assert_ju_gt(a, b, ...)	assert_cmp(uintmax_t, a, b, >,	\
+    <=, "ju", __VA_ARGS__)

-#define	assert_zd_eq(a, b, fmt...)	assert_cmp(ssize_t, a, b, ==,	\
-    !=, "zd", fmt)
-#define	assert_zd_ne(a, b, fmt...)	assert_cmp(ssize_t, a, b, !=,	\
-    ==, "zd", fmt)
-#define	assert_zd_lt(a, b, fmt...)	assert_cmp(ssize_t, a, b, <,	\
-    >=, "zd", fmt)
-#define	assert_zd_le(a, b, fmt...)	assert_cmp(ssize_t, a, b, <=,	\
-    >, "zd", fmt)
-#define	assert_zd_ge(a, b, fmt...)	assert_cmp(ssize_t, a, b, >=,	\
-    <, "zd", fmt)
-#define	assert_zd_gt(a, b, fmt...)	assert_cmp(ssize_t, a, b, >,	\
-    <=, "zd", fmt)
+#define	assert_zd_eq(a, b, ...)	assert_cmp(ssize_t, a, b, ==,	\
+    !=, "zd", __VA_ARGS__)
+#define	assert_zd_ne(a, b, ...)	assert_cmp(ssize_t, a, b, !=,	\
+    ==, "zd", __VA_ARGS__)
+#define	assert_zd_lt(a, b, ...)	assert_cmp(ssize_t, a, b, <,	\
+    >=, "zd", __VA_ARGS__)
+#define	assert_zd_le(a, b, ...)	assert_cmp(ssize_t, a, b, <=,	\
+    >, "zd", __VA_ARGS__)
+#define	assert_zd_ge(a, b, ...)	assert_cmp(ssize_t, a, b, >=,	\
+    <, "zd", __VA_ARGS__)
+#define	assert_zd_gt(a, b, ...)	assert_cmp(ssize_t, a, b, >,	\
+    <=, "zd", __VA_ARGS__)

-#define	assert_zu_eq(a, b, fmt...)	assert_cmp(size_t, a, b, ==,	\
-    !=, "zu", fmt)
-#define	assert_zu_ne(a, b, fmt...)	assert_cmp(size_t, a, b, !=,	\
-    ==, "zu", fmt)
-#define	assert_zu_lt(a, b, fmt...)	assert_cmp(size_t, a, b, <,	\
-    >=, "zu", fmt)
-#define	assert_zu_le(a, b, fmt...)	assert_cmp(size_t, a, b, <=,	\
-    >, "zu", fmt)
-#define	assert_zu_ge(a, b, fmt...)	assert_cmp(size_t, a, b, >=,	\
-    <, "zu", fmt)
-#define	assert_zu_gt(a, b, fmt...)	assert_cmp(size_t, a, b, >,	\
-    <=, "zu", fmt)
+#define	assert_zu_eq(a, b, ...)	assert_cmp(size_t, a, b, ==,	\
+    !=, "zu", __VA_ARGS__)
+#define	assert_zu_ne(a, b, ...)	assert_cmp(size_t, a, b, !=,	\
+    ==, "zu", __VA_ARGS__)
+#define	assert_zu_lt(a, b, ...)	assert_cmp(size_t, a, b, <,	\
+    >=, "zu", __VA_ARGS__)
+#define	assert_zu_le(a, b, ...)	assert_cmp(size_t, a, b, <=,	\
+    >, "zu", __VA_ARGS__)
+#define	assert_zu_ge(a, b, ...)	assert_cmp(size_t, a, b, >=,	\
+    <, "zu", __VA_ARGS__)
+#define	assert_zu_gt(a, b, ...)	assert_cmp(size_t, a, b, >,	\
+    <=, "zu", __VA_ARGS__)

-#define	assert_d32_eq(a, b, fmt...)	assert_cmp(int32_t, a, b, ==,	\
-    !=, PRId32, fmt)
-#define	assert_d32_ne(a, b, fmt...)	assert_cmp(int32_t, a, b, !=,	\
-    ==, PRId32, fmt)
-#define	assert_d32_lt(a, b, fmt...)	assert_cmp(int32_t, a, b, <,	\
-    >=, PRId32, fmt)
-#define	assert_d32_le(a, b, fmt...)	assert_cmp(int32_t, a, b, <=,	\
-    >, PRId32, fmt)
-#define	assert_d32_ge(a, b, fmt...)	assert_cmp(int32_t, a, b, >=,	\
-    <, PRId32, fmt)
-#define	assert_d32_gt(a, b, fmt...)	assert_cmp(int32_t, a, b, >,	\
-    <=, PRId32, fmt)
+#define	assert_d32_eq(a, b, ...)	assert_cmp(int32_t, a, b, ==,	\
+    !=, FMTd32, __VA_ARGS__)
+#define	assert_d32_ne(a, b, ...)	assert_cmp(int32_t, a, b, !=,	\
+    ==, FMTd32, __VA_ARGS__)
+#define	assert_d32_lt(a, b, ...)	assert_cmp(int32_t, a, b, <,	\
+    >=, FMTd32, __VA_ARGS__)
+#define	assert_d32_le(a, b, ...)	assert_cmp(int32_t, a, b, <=,	\
+    >, FMTd32, __VA_ARGS__)
+#define	assert_d32_ge(a, b, ...)	assert_cmp(int32_t, a, b, >=,	\
+    <, FMTd32, __VA_ARGS__)
+#define	assert_d32_gt(a, b, ...)	assert_cmp(int32_t, a, b, >,	\
+    <=, FMTd32, __VA_ARGS__)

-#define	assert_u32_eq(a, b, fmt...)	assert_cmp(uint32_t, a, b, ==,	\
-    !=, PRIu32, fmt)
-#define	assert_u32_ne(a, b, fmt...)	assert_cmp(uint32_t, a, b, !=,	\
-    ==, PRIu32, fmt)
-#define	assert_u32_lt(a, b, fmt...)	assert_cmp(uint32_t, a, b, <,	\
-    >=, PRIu32, fmt)
-#define	assert_u32_le(a, b, fmt...)	assert_cmp(uint32_t, a, b, <=,	\
-    >, PRIu32, fmt)
-#define	assert_u32_ge(a, b, fmt...)	assert_cmp(uint32_t, a, b, >=,	\
-    <, PRIu32, fmt)
-#define	assert_u32_gt(a, b, fmt...)	assert_cmp(uint32_t, a, b, >,	\
-    <=, PRIu32, fmt)
+#define	assert_u32_eq(a, b, ...)	assert_cmp(uint32_t, a, b, ==,	\
+    !=, FMTu32, __VA_ARGS__)
+#define	assert_u32_ne(a, b, ...)	assert_cmp(uint32_t, a, b, !=,	\
+    ==, FMTu32, __VA_ARGS__)
+#define	assert_u32_lt(a, b, ...)	assert_cmp(uint32_t, a, b, <,	\
+    >=, FMTu32, __VA_ARGS__)
+#define	assert_u32_le(a, b, ...)	assert_cmp(uint32_t, a, b, <=,	\
+    >, FMTu32, __VA_ARGS__)
+#define	assert_u32_ge(a, b, ...)	assert_cmp(uint32_t, a, b, >=,	\
+    <, FMTu32, __VA_ARGS__)
+#define	assert_u32_gt(a, b, ...)	assert_cmp(uint32_t, a, b, >,	\
+    <=, FMTu32, __VA_ARGS__)

-#define	assert_d64_eq(a, b, fmt...)	assert_cmp(int64_t, a, b, ==,	\
-    !=, PRId64, fmt)
-#define	assert_d64_ne(a, b, fmt...)	assert_cmp(int64_t, a, b, !=,	\
-    ==, PRId64, fmt)
-#define	assert_d64_lt(a, b, fmt...)	assert_cmp(int64_t, a, b, <,	\
-    >=, PRId64, fmt)
-#define	assert_d64_le(a, b, fmt...)	assert_cmp(int64_t, a, b, <=,	\
-    >, PRId64, fmt)
-#define	assert_d64_ge(a, b, fmt...)	assert_cmp(int64_t, a, b, >=,	\
-    <, PRId64, fmt)
-#define	assert_d64_gt(a, b, fmt...)	assert_cmp(int64_t, a, b, >,	\
-    <=, PRId64, fmt)
+#define	assert_d64_eq(a, b, ...)	assert_cmp(int64_t, a, b, ==,	\
+    !=, FMTd64, __VA_ARGS__)
+#define	assert_d64_ne(a, b, ...)	assert_cmp(int64_t, a, b, !=,	\
+    ==, FMTd64, __VA_ARGS__)
+#define	assert_d64_lt(a, b, ...)	assert_cmp(int64_t, a, b, <,	\
+    >=, FMTd64, __VA_ARGS__)
+#define	assert_d64_le(a, b, ...)	assert_cmp(int64_t, a, b, <=,	\
+    >, FMTd64, __VA_ARGS__)
+#define	assert_d64_ge(a, b, ...)	assert_cmp(int64_t, a, b, >=,	\
+    <, FMTd64, __VA_ARGS__)
+#define	assert_d64_gt(a, b, ...)	assert_cmp(int64_t, a, b, >,	\
+    <=, FMTd64, __VA_ARGS__)

-#define	assert_u64_eq(a, b, fmt...)	assert_cmp(uint64_t, a, b, ==,	\
-    !=, PRIu64, fmt)
-#define	assert_u64_ne(a, b, fmt...)	assert_cmp(uint64_t, a, b, !=,	\
-    ==, PRIu64, fmt)
-#define	assert_u64_lt(a, b, fmt...)	assert_cmp(uint64_t, a, b, <,	\
-    >=, PRIu64, fmt)
-#define	assert_u64_le(a, b, fmt...)	assert_cmp(uint64_t, a, b, <=,	\
-    >, PRIu64, fmt)
-#define	assert_u64_ge(a, b, fmt...)	assert_cmp(uint64_t, a, b, >=,	\
-    <, PRIu64, fmt)
-#define	assert_u64_gt(a, b, fmt...)	assert_cmp(uint64_t, a, b, >,	\
-    <=, PRIu64, fmt)
+#define	assert_u64_eq(a, b, ...)	assert_cmp(uint64_t, a, b, ==,	\
+    !=, FMTu64, __VA_ARGS__)
+#define	assert_u64_ne(a, b, ...)	assert_cmp(uint64_t, a, b, !=,	\
+    ==, FMTu64, __VA_ARGS__)
+#define	assert_u64_lt(a, b, ...)	assert_cmp(uint64_t, a, b, <,	\
+    >=, FMTu64, __VA_ARGS__)
+#define	assert_u64_le(a, b, ...)	assert_cmp(uint64_t, a, b, <=,	\
+    >, FMTu64, __VA_ARGS__)
+#define	assert_u64_ge(a, b, ...)	assert_cmp(uint64_t, a, b, >=,	\
+    <, FMTu64, __VA_ARGS__)
+#define	assert_u64_gt(a, b, ...)	assert_cmp(uint64_t, a, b, >,	\
+    <=, FMTu64, __VA_ARGS__)

-#define	assert_b_eq(a, b, fmt...) do {					\
+#define	assert_b_eq(a, b, ...) do {					\
 	bool a_ = (a);							\
 	bool b_ = (b);							\
 	if (!(a_ == b_)) {						\
@ -222,11 +222,11 @@
 		    __func__, __FILE__, __LINE__,			\
 		    #a, #b, a_ ? "true" : "false",			\
 		    b_ ? "true" : "false");				\
-		malloc_snprintf(message, sizeof(message), fmt);		\
+		malloc_snprintf(message, sizeof(message), __VA_ARGS__);	\
 		p_test_fail(prefix, message);				\
 	}								\
 } while (0)
-#define	assert_b_ne(a, b, fmt...) do {					\
+#define	assert_b_ne(a, b, ...) do {					\
 	bool a_ = (a);							\
 	bool b_ = (b);							\
 	if (!(a_ != b_)) {						\
@ -238,14 +238,14 @@
 		    __func__, __FILE__, __LINE__,			\
 		    #a, #b, a_ ? "true" : "false",			\
 		    b_ ? "true" : "false");				\
-		malloc_snprintf(message, sizeof(message), fmt);		\
+		malloc_snprintf(message, sizeof(message), __VA_ARGS__);	\
 		p_test_fail(prefix, message);				\
 	}								\
 } while (0)
-#define	assert_true(a, fmt...)	assert_b_eq(a, true, fmt)
-#define	assert_false(a, fmt...)	assert_b_eq(a, false, fmt)
+#define	assert_true(a, ...)	assert_b_eq(a, true, __VA_ARGS__)
+#define	assert_false(a, ...)	assert_b_eq(a, false, __VA_ARGS__)

-#define	assert_str_eq(a, b, fmt...) do {				\
+#define	assert_str_eq(a, b, ...) do {				\
 	if (strcmp((a), (b))) {						\
 		char prefix[ASSERT_BUFSIZE];				\
 		char message[ASSERT_BUFSIZE];				\
@ -254,11 +254,11 @@
 		    "(%s) same as (%s) --> "				\
 		    "\"%s\" differs from \"%s\": ",			\
 		    __func__, __FILE__, __LINE__, #a, #b, a, b);	\
-		malloc_snprintf(message, sizeof(message), fmt);		\
+		malloc_snprintf(message, sizeof(message), __VA_ARGS__);	\
 		p_test_fail(prefix, message);				\
 	}								\
 } while (0)
-#define	assert_str_ne(a, b, fmt...) do {				\
+#define	assert_str_ne(a, b, ...) do {				\
 	if (!strcmp((a), (b))) {					\
 		char prefix[ASSERT_BUFSIZE];				\
 		char message[ASSERT_BUFSIZE];				\
@ -267,18 +267,18 @@
 		    "(%s) differs from (%s) --> "			\
 		    "\"%s\" same as \"%s\": ",				\
 		    __func__, __FILE__, __LINE__, #a, #b, a, b);	\
-		malloc_snprintf(message, sizeof(message), fmt);		\
+		malloc_snprintf(message, sizeof(message), __VA_ARGS__);	\
 		p_test_fail(prefix, message);				\
 	}								\
 } while (0)

-#define	assert_not_reached(fmt...) do {					\
+#define	assert_not_reached(...) do {					\
 	char prefix[ASSERT_BUFSIZE];					\
 	char message[ASSERT_BUFSIZE];					\
 	malloc_snprintf(prefix, sizeof(prefix),				\
 	    "%s:%s:%d: Unreachable code reached: ",			\
 	    __func__, __FILE__, __LINE__);				\
-	malloc_snprintf(message, sizeof(message), fmt);			\
+	malloc_snprintf(message, sizeof(message), __VA_ARGS__);		\
 	p_test_fail(prefix, message);					\
 } while (0)

@ -308,8 +308,8 @@ label_test_end:								\
 	p_test_fini();							\
 }

-#define	test(tests...)							\
-	p_test(tests, NULL)
+#define	test(...)							\
+	p_test(__VA_ARGS__, NULL)

 #define	test_skip_if(e) do {						\
 	if (e) {							\
@ -319,11 +319,11 @@ label_test_end:								\
 	}								\
 } while (0)

-void	test_skip(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2));
-void	test_fail(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2));
+void	test_skip(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);
+void	test_fail(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);

 /* For private use by macros. */
-test_status_t	p_test(test_t* t, ...);
+test_status_t	p_test(test_t *t, ...);
 void	p_test_init(const char *name);
 void	p_test_fini(void);
 void	p_test_fail(const char *prefix, const char *message);
--- a/test/include/test/thd.h
+++ b/test/include/test/thd.h
@ -1,4 +1,4 @@
-/* Abstraction layer for threading in tests */
+/* Abstraction layer for threading in tests. */
 #ifdef _WIN32
 typedef HANDLE thd_t;
 #else
--- a/test/include/test/timer.h
+++ b/test/include/test/timer.h
@ -0,0 +1,35 @
+/* Simple timer, for use in benchmark reporting. */
+
+#include <unistd.h>
+#include <sys/time.h>
+
+/*
+ * Evaluate the feature test here rather than expanding to an expression
+ * containing "defined": a "defined" operator produced by macro expansion inside
+ * #if/#elif has undefined behavior.
+ */
+#if defined(_POSIX_MONOTONIC_CLOCK) && _POSIX_MONOTONIC_CLOCK >= 0
+#  define JEMALLOC_CLOCK_GETTIME 1
+#else
+#  define JEMALLOC_CLOCK_GETTIME 0
+#endif
+
+/* Start/stop timestamps; representation depends on the platform clock. */
+typedef struct {
+#ifdef _WIN32
+	FILETIME ft0;
+	FILETIME ft1;
+#elif JEMALLOC_CLOCK_GETTIME
+	struct timespec ts0;
+	struct timespec ts1;
+	int clock_id;
+#else
+	struct timeval tv0;
+	struct timeval tv1;
+#endif
+} timedelta_t;
+
+void	timer_start(timedelta_t *timer);
+void	timer_stop(timedelta_t *timer);
+uint64_t	timer_usec(const timedelta_t *timer);
+void	timer_ratio(timedelta_t *a, timedelta_t *b, char *buf, size_t buflen);
--- a/test/integration/MALLOCX_ARENA.c
+++ b/test/integration/MALLOCX_ARENA.c
@ -2,6 +2,14 @@

 #define	NTHREADS 10

+static bool have_dss =
+#ifdef JEMALLOC_DSS
+    true
+#else
+    false
+#endif
+    ;
+
 void *
 thd_start(void *arg)
 {
@ -18,13 +26,16 @@ thd_start(void *arg)
 		size_t mib[3];
 		size_t miblen = sizeof(mib) / sizeof(size_t);
 		const char *dss_precs[] = {"disabled", "primary", "secondary"};
-		const char *dss = dss_precs[thread_ind %
-		    (sizeof(dss_precs)/sizeof(char*))];
+		unsigned prec_ind = thread_ind %
+		    (sizeof(dss_precs)/sizeof(char*));
+		const char *dss = dss_precs[prec_ind];
+		int expected_err = (have_dss || prec_ind == 0) ? 0 : EFAULT;
 		assert_d_eq(mallctlnametomib("arena.0.dss", mib, &miblen), 0,
 		    "Error in mallctlnametomib()");
 		mib[1] = arena_ind;
 		assert_d_eq(mallctlbymib(mib, miblen, NULL, NULL, (void *)&dss,
-		    sizeof(const char *)), 0, "Error in mallctlbymib()");
+		    sizeof(const char *)), expected_err,
+		    "Error in mallctlbymib()");
 	}

 	p = mallocx(1, MALLOCX_ARENA(arena_ind));
@ -34,7 +45,7 @@ thd_start(void *arg)
 	return (NULL);
 }

-TEST_BEGIN(test_ALLOCM_ARENA)
+TEST_BEGIN(test_MALLOCX_ARENA)
 {
 	thd_t thds[NTHREADS];
 	unsigned i;
@ -54,5 +65,5 @@ main(void)
 {

 	return (test(
-	    test_ALLOCM_ARENA));
+	    test_MALLOCX_ARENA));
 }
--- a/test/integration/allocm.c
+++ b/test/integration/allocm.c
@ -1,107 +0,0 @@
-#include "test/jemalloc_test.h"
-
-#define	CHUNK 0x400000
-#define	MAXALIGN (((size_t)1) << 25)
-#define	NITER 4
-
-TEST_BEGIN(test_basic)
-{
-	size_t nsz, rsz, sz;
-	void *p;
-
-	sz = 42;
-	nsz = 0;
-	assert_d_eq(nallocm(&nsz, sz, 0), ALLOCM_SUCCESS,
-	    "Unexpected nallocm() error");
-	rsz = 0;
-	assert_d_eq(allocm(&p, &rsz, sz, 0), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-	assert_zu_ge(rsz, sz, "Real size smaller than expected");
-	assert_zu_eq(nsz, rsz, "nallocm()/allocm() rsize mismatch");
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-
-	assert_d_eq(allocm(&p, NULL, sz, 0), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-
-	nsz = 0;
-	assert_d_eq(nallocm(&nsz, sz, ALLOCM_ZERO), ALLOCM_SUCCESS,
-	    "Unexpected nallocm() error");
-	rsz = 0;
-	assert_d_eq(allocm(&p, &rsz, sz, ALLOCM_ZERO), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-	assert_zu_eq(nsz, rsz, "nallocm()/allocm() rsize mismatch");
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-}
-TEST_END
-
-TEST_BEGIN(test_alignment_and_size)
-{
-	int r;
-	size_t nsz, rsz, sz, alignment, total;
-	unsigned i;
-	void *ps[NITER];
-
-	for (i = 0; i < NITER; i++)
-		ps[i] = NULL;
-
-	for (alignment = 8;
-	    alignment <= MAXALIGN;
-	    alignment <<= 1) {
-		total = 0;
-		for (sz = 1;
-		    sz < 3 * alignment && sz < (1U << 31);
-		    sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
-			for (i = 0; i < NITER; i++) {
-				nsz = 0;
-				r = nallocm(&nsz, sz, ALLOCM_ALIGN(alignment) |
-				    ALLOCM_ZERO);
-				assert_d_eq(r, ALLOCM_SUCCESS,
-				    "nallocm() error for alignment=%zu, "
-				    "size=%zu (%#zx): %d",
-				    alignment, sz, sz, r);
-				rsz = 0;
-				r = allocm(&ps[i], &rsz, sz,
-				    ALLOCM_ALIGN(alignment) | ALLOCM_ZERO);
-				assert_d_eq(r, ALLOCM_SUCCESS,
-				    "allocm() error for alignment=%zu, "
-				    "size=%zu (%#zx): %d",
-				    alignment, sz, sz, r);
-				assert_zu_ge(rsz, sz,
-				    "Real size smaller than expected for "
-				    "alignment=%zu, size=%zu", alignment, sz);
-				assert_zu_eq(nsz, rsz,
-				    "nallocm()/allocm() rsize mismatch for "
-				    "alignment=%zu, size=%zu", alignment, sz);
-				assert_ptr_null(
-				    (void *)((uintptr_t)ps[i] & (alignment-1)),
-				    "%p inadequately aligned for"
-				    " alignment=%zu, size=%zu", ps[i],
-				    alignment, sz);
-				sallocm(ps[i], &rsz, 0);
-				total += rsz;
-				if (total >= (MAXALIGN << 1))
-					break;
-			}
-			for (i = 0; i < NITER; i++) {
-				if (ps[i] != NULL) {
-					dallocm(ps[i], 0);
-					ps[i] = NULL;
-				}
-			}
-		}
-	}
-}
-TEST_END
-
-int
-main(void)
-{
-
-	return (test(
-	    test_basic,
-	    test_alignment_and_size));
-}
--- a/test/integration/chunk.c
+++ b/test/integration/chunk.c
@ -0,0 +1,272 @@
+#include "test/jemalloc_test.h"
+
+static chunk_hooks_t orig_hooks;	/* Copy of old_hooks taken at install. */
+static chunk_hooks_t old_hooks;		/* Previous hooks; wrappers call these. */
+
+static bool do_dalloc = true;	/* False: chunk_dalloc() skips forwarding. */
+static bool do_decommit;	/* False: chunk_decommit() skips forwarding. */
+
+static bool did_alloc;		/* Set by chunk_alloc(). */
+static bool did_dalloc;		/* Set by chunk_dalloc(). */
+static bool did_commit;		/* Set to !err by chunk_commit(). */
+static bool did_decommit;	/* Set to !err by chunk_decommit(). */
+static bool did_purge;		/* Set by chunk_purge(). */
+static bool did_split;		/* Set by chunk_split(). */
+static bool did_merge;		/* Set by chunk_merge(). */
+
+#if 0	/* Change to 1 to print each hook call via malloc_printf(). */
+#  define TRACE_HOOK(fmt, ...) malloc_printf(fmt, __VA_ARGS__)
+#else	/* Tracing off: TRACE_HOOK() expands to nothing. */
+#  define TRACE_HOOK(fmt, ...)
+#endif
+
+/*
+ * Chunk allocation hook.  Notes that it ran (did_alloc) and then passes every
+ * argument through unchanged to the alloc hook saved in old_hooks, returning
+ * whatever that hook returns.
+ */
+void *
+chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero,
+    bool *commit, unsigned arena_ind)
+{
+	void *ret;
+
+	TRACE_HOOK("%s(new_addr=%p, size=%zu, alignment=%zu, *zero=%s, "
+	    "*commit=%s, arena_ind=%u)\n", __func__, new_addr, size, alignment,
+	    *zero ? "true" : "false", *commit ? "true" : "false", arena_ind);
+	/* Record the call before forwarding. */
+	did_alloc = true;
+	ret = old_hooks.alloc(new_addr, size, alignment, zero, commit,
+	    arena_ind);
+	return (ret);
+}
+
+/*
+ * Chunk deallocation hook.  Always sets did_dalloc.  When do_dalloc is set the
+ * call is forwarded to the dalloc hook saved in old_hooks; otherwise true is
+ * returned without forwarding.
+ */
+bool
+chunk_dalloc(void *chunk, size_t size, bool committed, unsigned arena_ind)
+{
+
+	TRACE_HOOK("%s(chunk=%p, size=%zu, committed=%s, arena_ind=%u)\n",
+	    __func__, chunk, size, committed ? "true" : "false", arena_ind);
+	did_dalloc = true;
+	if (do_dalloc) {
+		return (old_hooks.dalloc(chunk, size, committed,
+		    arena_ind));
+	}
+	/* Forwarding disabled: the original hook is never called. */
+	return (true);
+}
+
+/*
+ * Chunk commit hook.  Forwards to the commit hook saved in old_hooks, sets
+ * did_commit to the negation of that hook's result, and returns the result
+ * unchanged.
+ */
+bool
+chunk_commit(void *chunk, size_t size, size_t offset, size_t length,
+    unsigned arena_ind)
+{
+	bool failed;
+
+	TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, "
+	    "arena_ind=%u)\n", __func__, chunk, size, offset, length,
+	    arena_ind);
+	failed = old_hooks.commit(chunk, size, offset, length, arena_ind);
+	/* did_commit is true only when the forwarded hook returned false. */
+	did_commit = !failed;
+	return (failed);
+}
+
+/*
+ * Chunk decommit hook.  When do_decommit is clear, returns true without
+ * forwarding and without touching did_decommit.  Otherwise forwards to the
+ * decommit hook saved in old_hooks, sets did_decommit to the negation of its
+ * result, and returns that result.
+ */
+bool
+chunk_decommit(void *chunk, size_t size, size_t offset, size_t length,
+    unsigned arena_ind)
+{
+	bool failed;
+
+	TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, "
+	    "arena_ind=%u)\n", __func__, chunk, size, offset, length,
+	    arena_ind);
+	if (do_decommit) {
+		failed = old_hooks.decommit(chunk, size, offset, length,
+		    arena_ind);
+		did_decommit = !failed;
+	} else {
+		/* Forwarding disabled: did_decommit is left as it was. */
+		failed = true;
+	}
+	return (failed);
+}
+
+/*
+ * Chunk purge hook.  Notes that it ran (did_purge) and forwards every argument
+ * unchanged to the purge hook saved in old_hooks, returning that hook's
+ * result.
+ */
+bool
+chunk_purge(void *chunk, size_t size, size_t offset, size_t length,
+    unsigned arena_ind)
+{
+
+	/* Fields are ", "-separated, matching the other hooks' trace output. */
+	TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, "
+	    "arena_ind=%u)\n", __func__, chunk, size, offset, length,
+	    arena_ind);
+	did_purge = true;
+	return (old_hooks.purge(chunk, size, offset, length, arena_ind));
+}
+
+/*
+ * Chunk split hook.  Notes that it ran (did_split) and forwards every argument
+ * unchanged to the split hook saved in old_hooks, returning that hook's
+ * result.
+ */
+bool
+chunk_split(void *chunk, size_t size, size_t size_a, size_t size_b,
+    bool committed, unsigned arena_ind)
+{
+	bool ret;
+
+	TRACE_HOOK("%s(chunk=%p, size=%zu, size_a=%zu, size_b=%zu, "
+	    "committed=%s, arena_ind=%u)\n", __func__, chunk, size, size_a,
+	    size_b, committed ? "true" : "false", arena_ind);
+	did_split = true;
+	ret = old_hooks.split(chunk, size, size_a, size_b, committed,
+	    arena_ind);
+	return (ret);
+}
+
+/*
+ * Chunk merge hook.  Notes that it ran (did_merge) and forwards every argument
+ * unchanged to the merge hook saved in old_hooks, returning that hook's
+ * result.
+ */
+bool
+chunk_merge(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b,
+    bool committed, unsigned arena_ind)
+{
+
+	/* Fields are ", "-separated, matching the other hooks' trace output. */
+	TRACE_HOOK("%s(chunk_a=%p, size_a=%zu, chunk_b=%p, size_b=%zu, "
+	    "committed=%s, arena_ind=%u)\n", __func__, chunk_a, size_a, chunk_b,
+	    size_b, committed ? "true" : "false", arena_ind);
+	did_merge = true;
+	return (old_hooks.merge(chunk_a, size_a, chunk_b, size_b,
+	    committed, arena_ind));
+}
+
+TEST_BEGIN(test_chunk)
+{
+	void *p;
+	size_t old_size, new_size, large0, large1, huge0, huge1, huge2, sz;
+	chunk_hooks_t new_hooks = {
+		chunk_alloc,
+		chunk_dalloc,
+		chunk_commit,
+		chunk_decommit,
+		chunk_purge,
+		chunk_split,
+		chunk_merge
+	};
+	bool xallocx_success_a, xallocx_success_b, xallocx_success_c;
+
+	/*
+	 * Overview: install the tracking hooks above on arena 0, drive
+	 * allocation/resize/purge operations while checking which did_* flags
+	 * the hooks set, then restore the previous hooks and verify them.
+	 *
+	 * Install custom chunk hooks.  The same mallctl() call returns the
+	 * previously installed hooks in old_hooks, which the wrappers forward
+	 * to; orig_hooks keeps a copy for the final comparison.  None of the
+	 * previous hooks may already be one of the wrappers.
+	 */
+	old_size = sizeof(chunk_hooks_t);
+	new_size = sizeof(chunk_hooks_t);
+	assert_d_eq(mallctl("arena.0.chunk_hooks", &old_hooks, &old_size,
+	    &new_hooks, new_size), 0, "Unexpected chunk_hooks error");
+	orig_hooks = old_hooks;
+	assert_ptr_ne(old_hooks.alloc, chunk_alloc, "Unexpected alloc error");
+	assert_ptr_ne(old_hooks.dalloc, chunk_dalloc,
+	    "Unexpected dalloc error");
+	assert_ptr_ne(old_hooks.commit, chunk_commit,
+	    "Unexpected commit error");
+	assert_ptr_ne(old_hooks.decommit, chunk_decommit,
+	    "Unexpected decommit error");
+	assert_ptr_ne(old_hooks.purge, chunk_purge, "Unexpected purge error");
+	assert_ptr_ne(old_hooks.split, chunk_split, "Unexpected split error");
+	assert_ptr_ne(old_hooks.merge, chunk_merge, "Unexpected merge error");
+
+	/* Get large size classes (indices 0 and 1) into large0/large1. */
+	sz = sizeof(size_t);
+	assert_d_eq(mallctl("arenas.lrun.0.size", &large0, &sz, NULL, 0), 0,
+	    "Unexpected arenas.lrun.0.size failure");
+	assert_d_eq(mallctl("arenas.lrun.1.size", &large1, &sz, NULL, 0), 0,
+	    "Unexpected arenas.lrun.1.size failure");
+
+	/* Get huge size classes (indices 0-2); sz is reused from above. */
+	assert_d_eq(mallctl("arenas.hchunk.0.size", &huge0, &sz, NULL, 0), 0,
+	    "Unexpected arenas.hchunk.0.size failure");
+	assert_d_eq(mallctl("arenas.hchunk.1.size", &huge1, &sz, NULL, 0), 0,
+	    "Unexpected arenas.hchunk.1.size failure");
+	assert_d_eq(mallctl("arenas.hchunk.2.size", &huge2, &sz, NULL, 0), 0,
+	    "Unexpected arenas.hchunk.2.size failure");
+
+	/*
+	 * Test dalloc/decommit/purge cascade.  With do_dalloc and do_decommit
+	 * both false, chunk_dalloc() and chunk_decommit() return true without
+	 * forwarding.  Shrink a 2*huge0 allocation to huge0 with xallocx() and
+	 * purge arena 0.  If the shrink succeeded, dalloc must have been
+	 * called, did_decommit must still be false, and purge must have run.
+	 * A split is asserted regardless of the xallocx() outcome.
+	 */
+	do_dalloc = false;
+	do_decommit = false;
+	p = mallocx(huge0 * 2, 0);
+	assert_ptr_not_null(p, "Unexpected mallocx() error");
+	did_dalloc = false;
+	did_decommit = false;
+	did_purge = false;
+	did_split = false;
+	xallocx_success_a = (xallocx(p, huge0, 0, 0) == huge0);
+	assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0,
+	    "Unexpected arena.0.purge error");
+	if (xallocx_success_a) {
+		assert_true(did_dalloc, "Expected dalloc");
+		assert_false(did_decommit, "Unexpected decommit");
+		assert_true(did_purge, "Expected purge");
+	}
+	assert_true(did_split, "Expected split");
+	dallocx(p, 0);
+	do_dalloc = true;
+
+	/*
+	 * Test decommit/commit and observe split/merge.  With do_decommit
+	 * true, chunk_decommit() forwards to the original hook.  Shrink the
+	 * allocation, purge, then grow it back; did_decommit and did_commit
+	 * must agree.  Split and merge are asserted only when the
+	 * corresponding xallocx() calls succeeded.
+	 */
+	do_dalloc = false;
+	do_decommit = true;
+	p = mallocx(huge0 * 2, 0);
+	assert_ptr_not_null(p, "Unexpected mallocx() error");
+	did_decommit = false;
+	did_commit = false;
+	did_split = false;
+	did_merge = false;
+	xallocx_success_b = (xallocx(p, huge0, 0, 0) == huge0);
+	assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0,
+	    "Unexpected arena.0.purge error");
+	if (xallocx_success_b)
+		assert_true(did_split, "Expected split");
+	xallocx_success_c = (xallocx(p, huge0 * 2, 0, 0) == huge0 * 2);
+	assert_b_eq(did_decommit, did_commit, "Expected decommit/commit match");
+	if (xallocx_success_b && xallocx_success_c)
+		assert_true(did_merge, "Expected merge");
+	dallocx(p, 0);
+	do_dalloc = true;
+	do_decommit = false;
+
+	/*
+	 * Test purge for partial-chunk huge allocations.  Only runs when
+	 * huge0 * 2 > huge2.
+	 */
+	if (huge0 * 2 > huge2) {
+		/*
+		 * There are at least four size classes per doubling, so a
+		 * successful xallocx() from size=huge2 to size=huge1 is
+		 * guaranteed to leave trailing purgeable memory.
+		 */
+		p = mallocx(huge2, 0);
+		assert_ptr_not_null(p, "Unexpected mallocx() error");
+		did_purge = false;
+		assert_zu_eq(xallocx(p, huge1, 0, 0), huge1,
+		    "Unexpected xallocx() failure");
+		assert_true(did_purge, "Expected purge");
+		dallocx(p, 0);
+	}
+
+	/*
+	 * Test decommit for large allocations.  Shrink large1 to large0 and
+	 * grow back to large1, purging arena 0 before and after the shrink.
+	 * did_decommit is cleared before the shrink and did_commit before the
+	 * regrow; afterwards the two flags must match.
+	 */
+	do_decommit = true;
+	p = mallocx(large1, 0);
+	assert_ptr_not_null(p, "Unexpected mallocx() error");
+	assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0,
+	    "Unexpected arena.0.purge error");
+	did_decommit = false;
+	assert_zu_eq(xallocx(p, large0, 0, 0), large0,
+	    "Unexpected xallocx() failure");
+	assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0,
+	    "Unexpected arena.0.purge error");
+	did_commit = false;
+	assert_zu_eq(xallocx(p, large1, 0, 0), large1,
+	    "Unexpected xallocx() failure");
+	assert_b_eq(did_decommit, did_commit, "Expected decommit/commit match");
+	dallocx(p, 0);
+	do_decommit = false;
+
+	/*
+	 * Make sure non-huge allocation succeeds while the custom hooks are
+	 * still installed.
+	 */
+	p = mallocx(42, 0);
+	assert_ptr_not_null(p, "Unexpected mallocx() error");
+	dallocx(p, 0);
+
+	/*
+	 * Restore chunk hooks by writing old_hooks back, then read the
+	 * installed hooks into old_hooks again and check that each member
+	 * equals the copy saved in orig_hooks.
+	 */
+	assert_d_eq(mallctl("arena.0.chunk_hooks", NULL, NULL, &old_hooks,
+	    new_size), 0, "Unexpected chunk_hooks error");
+	assert_d_eq(mallctl("arena.0.chunk_hooks", &old_hooks, &old_size,
+	    NULL, 0), 0, "Unexpected chunk_hooks error");
+	assert_ptr_eq(old_hooks.alloc, orig_hooks.alloc,
+	    "Unexpected alloc error");
+	assert_ptr_eq(old_hooks.dalloc, orig_hooks.dalloc,
+	    "Unexpected dalloc error");
+	assert_ptr_eq(old_hooks.commit, orig_hooks.commit,
+	    "Unexpected commit error");
+	assert_ptr_eq(old_hooks.decommit, orig_hooks.decommit,
+	    "Unexpected decommit error");
+	assert_ptr_eq(old_hooks.purge, orig_hooks.purge,
+	    "Unexpected purge error");
+	assert_ptr_eq(old_hooks.split, orig_hooks.split,
+	    "Unexpected split error");
+	assert_ptr_eq(old_hooks.merge, orig_hooks.merge,
+	    "Unexpected merge error");
+}
+TEST_END
+
+/* Run test_chunk through test() and return its result as the exit status. */
+int
+main(void)
+{
+	int ret;
+
+	ret = test(test_chunk);
+	return (ret);
+}
--- a/test/integration/mallocx.c
+++ b/test/integration/mallocx.c
@ -1,15 +1,13 @@
 #include "test/jemalloc_test.h"

-#define	CHUNK 0x400000
-#define	MAXALIGN (((size_t)1) << 25)
-#define	NITER 4
-
 TEST_BEGIN(test_basic)
 {
-	size_t nsz, rsz, sz;
-	void *p;
+#define	MAXSZ (((size_t)1) << 26)
+	size_t sz;

-	sz = 42;
+	for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) {
+		size_t nsz, rsz;
+		void *p;
 		nsz = nallocx(sz, 0);
 		assert_zu_ne(nsz, 0, "Unexpected nallocx() error");
 		p = mallocx(sz, 0);
@ -30,11 +28,15 @@ TEST_BEGIN(test_basic)
 		rsz = sallocx(p, 0);
 		assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch");
 		dallocx(p, 0);
+	}
+#undef MAXSZ
 }
 TEST_END

 TEST_BEGIN(test_alignment_and_size)
 {
+#define	MAXALIGN (((size_t)1) << 25)
+#define	NITER 4
 	size_t nsz, rsz, sz, alignment, total;
 	unsigned i;
 	void *ps[NITER];
@ -84,6 +86,8 @@ TEST_BEGIN(test_alignment_and_size)
 			}
 		}
 	}
+#undef MAXALIGN
+#undef NITER
 }
 TEST_END

--- a/test/integration/mremap.c
+++ b/test/integration/mremap.c
@ -1,45 +0,0 @@
-#include "test/jemalloc_test.h"
-
-TEST_BEGIN(test_mremap)
-{
-	int err;
-	size_t sz, lg_chunk, chunksize, i;
-	char *p, *q;
-
-	sz = sizeof(lg_chunk);
-	err = mallctl("opt.lg_chunk", &lg_chunk, &sz, NULL, 0);
-	assert_d_eq(err, 0, "Error in mallctl(): %s", strerror(err));
-	chunksize = ((size_t)1U) << lg_chunk;
-
-	p = (char *)malloc(chunksize);
-	assert_ptr_not_null(p, "malloc(%zu) --> %p", chunksize, p);
-	memset(p, 'a', chunksize);
-
-	q = (char *)realloc(p, chunksize * 2);
-	assert_ptr_not_null(q, "realloc(%p, %zu) --> %p", p, chunksize * 2,
-	    q);
-	for (i = 0; i < chunksize; i++) {
-		assert_c_eq(q[i], 'a',
-		    "realloc() should preserve existing bytes across copies");
-	}
-
-	p = q;
-
-	q = (char *)realloc(p, chunksize);
-	assert_ptr_not_null(q, "realloc(%p, %zu) --> %p", p, chunksize, q);
-	for (i = 0; i < chunksize; i++) {
-		assert_c_eq(q[i], 'a',
-		    "realloc() should preserve existing bytes across copies");
-	}
-
-	free(q);
-}
-TEST_END
-
-int
-main(void)
-{
-
-	return (test(
-	    test_mremap));
-}
--- a/test/integration/overflow.c
+++ b/test/integration/overflow.c
@ -0,0 +1,49 @@
+#include "test/jemalloc_test.h"
+
+/*
+ * Check that malloc(), calloc() and realloc() return NULL for requests of
+ * max_size_class + 1 bytes and of SIZE_T_MAX bytes, where max_size_class is
+ * read from the arenas.hchunk.<nhchunks-1>.size mallctl.
+ */
+TEST_BEGIN(test_overflow)
+{
+	void *p;
+	unsigned nhchunks;
+	size_t sz, miblen, max_size_class;
+	size_t mib[4];
+
+	/* Number of huge size classes. */
+	sz = sizeof(nhchunks);
+	assert_d_eq(mallctl("arenas.nhchunks", &nhchunks, &sz, NULL, 0), 0,
+	    "Unexpected mallctl() error");
+
+	/* Translate the name once, then point the MIB at the last class. */
+	miblen = sizeof(mib) / sizeof(mib[0]);
+	assert_d_eq(mallctlnametomib("arenas.hchunk.0.size", mib, &miblen), 0,
+	    "Unexpected mallctlnametomib() error");
+	mib[2] = nhchunks - 1;
+
+	sz = sizeof(max_size_class);
+	assert_d_eq(mallctlbymib(mib, miblen, &max_size_class, &sz, NULL, 0), 0,
+	    "Unexpected mallctlbymib() error");
+
+	/* malloc() */
+	assert_ptr_null(malloc(max_size_class + 1),
+	    "Expected OOM due to over-sized allocation request");
+	assert_ptr_null(malloc(SIZE_T_MAX),
+	    "Expected OOM due to over-sized allocation request");
+
+	/* calloc() */
+	assert_ptr_null(calloc(1, max_size_class + 1),
+	    "Expected OOM due to over-sized allocation request");
+	assert_ptr_null(calloc(1, SIZE_T_MAX),
+	    "Expected OOM due to over-sized allocation request");
+
+	/* realloc(): p is freed afterwards, once both requests have failed. */
+	p = malloc(1);
+	assert_ptr_not_null(p, "Unexpected malloc() OOM");
+	assert_ptr_null(realloc(p, max_size_class + 1),
+	    "Expected OOM due to over-sized allocation request");
+	assert_ptr_null(realloc(p, SIZE_T_MAX),
+	    "Expected OOM due to over-sized allocation request");
+	free(p);
+}
+TEST_END
+
+/* Run test_overflow through test() and return its result as the exit status. */
+int
+main(void)
+{
+	int ret;
+
+	ret = test(test_overflow);
+	return (ret);
+}
--- a/test/integration/rallocm.c
+++ b/test/integration/rallocm.c
@ -1,111 +0,0 @@
-#include "test/jemalloc_test.h"
-
-TEST_BEGIN(test_same_size)
-{
-	void *p, *q;
-	size_t sz, tsz;
-
-	assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-
-	q = p;
-	assert_d_eq(rallocm(&q, &tsz, sz, 0, ALLOCM_NO_MOVE), ALLOCM_SUCCESS,
-	    "Unexpected rallocm() error");
-	assert_ptr_eq(q, p, "Unexpected object move");
-	assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz);
-
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-}
-TEST_END
-
-TEST_BEGIN(test_extra_no_move)
-{
-	void *p, *q;
-	size_t sz, tsz;
-
-	assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-
-	q = p;
-	assert_d_eq(rallocm(&q, &tsz, sz, sz-42, ALLOCM_NO_MOVE),
-	    ALLOCM_SUCCESS, "Unexpected rallocm() error");
-	assert_ptr_eq(q, p, "Unexpected object move");
-	assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz);
-
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-}
-TEST_END
-
-TEST_BEGIN(test_no_move_fail)
-{
-	void *p, *q;
-	size_t sz, tsz;
-
-	assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-
-	q = p;
-	assert_d_eq(rallocm(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE),
-	    ALLOCM_ERR_NOT_MOVED, "Unexpected rallocm() result");
-	assert_ptr_eq(q, p, "Unexpected object move");
-	assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz);
-
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-}
-TEST_END
-
-TEST_BEGIN(test_grow_and_shrink)
-{
-	void *p, *q;
-	size_t tsz;
-#define	NCYCLES 3
-	unsigned i, j;
-#define	NSZS 2500
-	size_t szs[NSZS];
-#define	MAXSZ ZU(12 * 1024 * 1024)
-
-	assert_d_eq(allocm(&p, &szs[0], 1, 0), ALLOCM_SUCCESS,
-	    "Unexpected allocm() error");
-
-	for (i = 0; i < NCYCLES; i++) {
-		for (j = 1; j < NSZS && szs[j-1] < MAXSZ; j++) {
-			q = p;
-			assert_d_eq(rallocm(&q, &szs[j], szs[j-1]+1, 0, 0),
-			    ALLOCM_SUCCESS,
-			    "Unexpected rallocm() error for size=%zu-->%zu",
-			    szs[j-1], szs[j-1]+1);
-			assert_zu_ne(szs[j], szs[j-1]+1,
-			    "Expected size to at least: %zu", szs[j-1]+1);
-			p = q;
-		}
-
-		for (j--; j > 0; j--) {
-			q = p;
-			assert_d_eq(rallocm(&q, &tsz, szs[j-1], 0, 0),
-			    ALLOCM_SUCCESS,
-			    "Unexpected rallocm() error for size=%zu-->%zu",
-			    szs[j], szs[j-1]);
-			assert_zu_eq(tsz, szs[j-1],
-			    "Expected size=%zu, got size=%zu", szs[j-1], tsz);
-			p = q;
-		}
-	}
-
-	assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
-	    "Unexpected dallocm() error");
-}
-TEST_END
-
-int
-main(void)
-{
-
-	return (test(
-	    test_same_size,
-	    test_extra_no_move,
-	    test_no_move_fail,
-	    test_grow_and_shrink));
-}
--- a/test/integration/rallocx.c
+++ b/test/integration/rallocx.c
@ -55,8 +55,9 @@ validate_fill(const void *p, uint8_t c, size_t offset, size_t len)
 	for (i = 0; i < len; i++) {
 		uint8_t b = buf[offset+i];
 		if (b != c) {
-			test_fail("Allocation at %p contains %#x rather than "
-			    "%#x at offset %zu", p, b, c, offset+i);
+			test_fail("Allocation at %p (len=%zu) contains %#x "
+			    "rather than %#x at offset %zu", p, len, b, c,
+			    offset+i);
 			ret = true;
 		}
 	}
@ -95,7 +96,8 @@ TEST_BEGIN(test_zero)
 				    "Expected zeroed memory");
 			}
 			if (psz != qsz) {
-				memset(q+psz, FILL_BYTE, qsz-psz);
+				memset((void *)((uintptr_t)q+psz), FILL_BYTE,
+				    qsz-psz);
 				psz = qsz;
 			}
 			p = q;
@ -159,8 +161,9 @@ TEST_BEGIN(test_lg_align_and_zero)
 		} else {
 			assert_false(validate_fill(q, 0, 0, MAX_VALIDATE),
 			    "Expected zeroed memory");
-			assert_false(validate_fill(q+sz-MAX_VALIDATE, 0, 0,
-			    MAX_VALIDATE), "Expected zeroed memory");
+			assert_false(validate_fill(
+			    (void *)((uintptr_t)q+sz-MAX_VALIDATE),
+			    0, 0, MAX_VALIDATE), "Expected zeroed memory");
 		}
 		p = q;
 	}
--- a/test/integration/sdallocx.c
+++ b/test/integration/sdallocx.c
@ -0,0 +1,57 @@
+#include "test/jemalloc_test.h"
+
+#define	MAXALIGN (((size_t)1) << 25)	/* Largest alignment exercised. */
+#define	NITER 4	/* Size of ps[]: allocations attempted per size. */
+
+/*
+ * Smoke test: allocate 64 bytes with mallocx() and release them with
+ * sdallocx(), passing the same size that was requested.
+ */
+TEST_BEGIN(test_basic)
+{
+	void *ptr = mallocx(64, 0);
+
+	/* Fail the test on OOM rather than handing NULL to sdallocx(). */
+	assert_ptr_not_null(ptr, "Unexpected mallocx() error");
+	sdallocx(ptr, 64, 0);
+}
+TEST_END
+
+/*
+ * For every power-of-two alignment from 8 through MAXALIGN and a range of
+ * sizes below 3 * alignment, allocate up to NITER zeroed objects with
+ * mallocx() and free each with sdallocx(), passing the requested size and
+ * the same alignment flag.  Allocation within one size stops early once the
+ * running total of nallocx() sizes for the current alignment reaches
+ * MAXALIGN << 1.
+ */
+TEST_BEGIN(test_alignment_and_size)
+{
+	size_t nsz, sz, alignment, total;
+	unsigned i;
+	void *ps[NITER];
+
+	for (i = 0; i < NITER; i++)
+		ps[i] = NULL;
+
+	for (alignment = 8;
+	    alignment <= MAXALIGN;
+	    alignment <<= 1) {
+		total = 0;
+		for (sz = 1;
+		    sz < 3 * alignment && sz < (1U << 31);
+		    sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
+			for (i = 0; i < NITER; i++) {
+				nsz = nallocx(sz, MALLOCX_ALIGN(alignment) |
+				    MALLOCX_ZERO);
+				assert_zu_ne(nsz, 0,
+				    "nallocx() error for alignment=%zu, "
+				    "size=%zu (%#zx)", alignment, sz, sz);
+				ps[i] = mallocx(sz, MALLOCX_ALIGN(alignment) |
+				    MALLOCX_ZERO);
+				/*
+				 * Without this check a failed allocation would
+				 * be skipped silently by the NULL test in the
+				 * free loop below, and sdallocx() would never
+				 * be exercised for it.
+				 */
+				assert_ptr_not_null(ps[i],
+				    "mallocx() error for alignment=%zu, "
+				    "size=%zu (%#zx)", alignment, sz, sz);
+				total += nsz;
+				if (total >= (MAXALIGN << 1))
+					break;
+			}
+			/* Slots past an early break are still NULL; skip them. */
+			for (i = 0; i < NITER; i++) {
+				if (ps[i] != NULL) {
+					sdallocx(ps[i], sz,
+					    MALLOCX_ALIGN(alignment));
+					ps[i] = NULL;
+				}
+			}
+		}
+	}
+}
+TEST_END
+
+/*
+ * Run test_basic and test_alignment_and_size through test() and return its
+ * result as the exit status.
+ */
+int
+main(void)
+{
+	int ret;
+
+	ret = test(test_basic, test_alignment_and_size);
+	return (ret);
+}
--- a/test/src/SFMT.c
+++ b/test/src/SFMT.c
@ -511,7 +511,7 @@ uint64_t gen_rand64(sfmt_t *ctx) {
 uint64_t gen_rand64_range(sfmt_t *ctx, uint64_t limit) {
    uint64_t ret, above;

-    above = 0xffffffffffffffffLLU - (0xffffffffffffffffLLU  % limit);
+    above = KQU(0xffffffffffffffff) - (KQU(0xffffffffffffffff) % limit);
    while (1) {
 	ret = gen_rand64(ctx);
 	if (ret < above) {
--- a/test/src/btalloc.c
+++ b/test/src/btalloc.c
@ -0,0 +1,8 @@
+#include "test/jemalloc_test.h"
+
+/*
+ * Entry point: hands size and bits unchanged to btalloc_0() and returns its
+ * result.
+ */
+void *
+btalloc(size_t size, unsigned bits)
+{
+	void *ret;
+
+	ret = btalloc_0(size, bits);
+	return (ret);
+}
--- a/test/src/btalloc_0.c
+++ b/test/src/btalloc_0.c
@ -0,0 +1,3 @@
+#include "test/jemalloc_test.h"
+
+btalloc_n_gen(0)	/* NOTE(review): presumably defines btalloc_0() -- confirm. */
--- a/test/src/btalloc_1.c
+++ b/test/src/btalloc_1.c
@ -0,0 +1,3 @@
+#include "test/jemalloc_test.h"
+
+btalloc_n_gen(1)	/* NOTE(review): presumably defines btalloc_1() -- confirm. */
--- a/test/src/mq.c
+++ b/test/src/mq.c
@ -0,0 +1,29 @@
+#include "test/jemalloc_test.h"
+
+/*
+ * Sleep for approximately ns nanoseconds.  No lower *nor* upper bound on sleep
+ * time is guaranteed.  ns must not exceed one second (10^9 ns); this is
+ * asserted.
+ */
+void
+mq_nanosleep(unsigned ns)
+{
+
+	assert(ns <= 1000*1000*1000);
+
+#ifdef _WIN32
+	/*
+	 * Sleep() takes milliseconds, so convert ns --> ms (10^6 ns per ms).
+	 * Requests shorter than 1 ms become Sleep(0).
+	 */
+	Sleep(ns / (1000*1000));
+#else
+	{
+		struct timespec timeout;
+
+		/* Exactly one second goes in tv_sec; anything less in tv_nsec. */
+		if (ns < 1000*1000*1000) {
+			timeout.tv_sec = 0;
+			timeout.tv_nsec = ns;
+		} else {
+			timeout.tv_sec = 1;
+			timeout.tv_nsec = 0;
+		}
+		nanosleep(&timeout, NULL);
+	}
+#endif
+}
--- a/Show More
+++ b/Show More