diff --git a/.gitignore b/.gitignore index d9e49f8f..b4681866 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,11 @@ /jemalloc/doc/jemalloc.3 /jemalloc/lib/ /jemalloc/Makefile -/jemalloc/src/internal/jemalloc_internal\.h -/jemalloc/src/internal/mtrgraph_defs\.h -/jemalloc/src/internal/mtrplay_defs\.h -/jemalloc/src/jemalloc\.h -/jemalloc/src/jemalloc_defs\.h +/jemalloc/include/jemalloc/internal/jemalloc_internal\.h +/jemalloc/include/jemalloc/jemalloc\.h +/jemalloc/include/jemalloc/jemalloc_defs\.h +/jemalloc/test/jemalloc_test\.h /jemalloc/src/*.[od] +/jemalloc/test/*.[od] +/jemalloc/test/*.out /jemalloc/VERSION diff --git a/jemalloc/COPYING b/jemalloc/COPYING index 1baaf50d..10ade120 100644 --- a/jemalloc/COPYING +++ b/jemalloc/COPYING @@ -3,6 +3,7 @@ subject to the following licenses: -------------------------------------------------------------------------------- Copyright (C) 2002-2010 Jason Evans . All rights reserved. +Copyright (C) 2007-2010 Mozilla Foundation. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/jemalloc/ChangeLog b/jemalloc/ChangeLog new file mode 100644 index 00000000..290dea15 --- /dev/null +++ b/jemalloc/ChangeLog @@ -0,0 +1,130 @@ +Following are change highlights associated with official releases. Important +bug fixes are all mentioned, but internal enhancements are omitted here for +brevity (even though they are more fun to write about). Much more detail can be +found in the git revision history: + + http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git + git://canonware.com/jemalloc.git + +* 2.0.0 + + This version focuses on the experimental *allocm() API, and on improved + run-time configuration/introspection. Nonetheless, numerous performance + improvements are also included. + + New features: + - Implement the experimental {,r,s,d}allocm() API, which provides a superset + of the functionality available via malloc(), calloc(), posix_memalign(), + realloc(), malloc_usable_size(), and free(). These functions can be used + to allocate/reallocate aligned zeroed memory, ask for optional extra + memory during reallocation, prevent object movement during reallocation, + etc. + - Replace JEMALLOC_OPTIONS/JEMALLOC_PROF_PREFIX with MALLOC_CONF, which is + more human-readable, and more flexible. For example: + JEMALLOC_OPTIONS=AJP + is now: + MALLOC_CONF=abort:true,fill:true,stats_print:true + - Port to Apple OS X. Sponsored by Mozilla. + - Make it possible for the application to control thread-->arena mappings + via the "thread.arena" mallctl. + - Add compile-time support for all TLS-related functionality via pthreads + TSD. This is mainly of interest for OS X, which does not support TLS, but + has a TSD implementation with similar performance. + - Override memalign() and valloc() if they are provided by the system. + - Add the "arenas.purge" mallctl, which can be used to synchronously purge + all dirty unused pages. + - Make cumulative heap profiling data optional, so that it is possible to + limit the amount of memory consumed by heap profiling data structures. + - Add per thread allocation counters that can be accessed via the + "thread.allocated" and "thread.deallocated" mallctls. + + Incompatible changes: + - Remove JEMALLOC_OPTIONS and malloc_options (see MALLOC_CONF above). + - Increase default backtrace depth from 4 to 128 for heap profiling. + - Disable interval-based profile dumps by default. 
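The "thread.arena" and "thread.allocated"/"thread.deallocated" mallctls listed under "New features" above are easiest to see in a short sketch. The following is illustrative only: it assumes an unprefixed build (no --with-jemalloc-prefix), a library configured with --enable-stats for the counters, and that the values use the unsigned/uint64_t types; none of that is spelled out in this excerpt.

    #include <stdio.h>
    #include <inttypes.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        unsigned arena;
        uint64_t allocated;
        size_t sz;

        /* Read the arena this thread is currently assigned to. */
        sz = sizeof(arena);
        if (mallctl("thread.arena", &arena, &sz, NULL, 0) != 0)
            return (1);
        printf("thread.arena: %u\n", arena);

        /* Pin this thread to arena 0 by writing through newp/newlen. */
        arena = 0;
        if (mallctl("thread.arena", NULL, NULL, &arena, sizeof(arena)) != 0)
            return (1);

        /* Per-thread cumulative allocation counter (--enable-stats builds). */
        sz = sizeof(allocated);
        if (mallctl("thread.allocated", &allocated, &sz, NULL, 0) == 0)
            printf("thread.allocated: %" PRIu64 "\n", allocated);

        return (0);
    }

The test/thread_arena.c and test/allocated.c programs added by this change exercise the same mallctls; link against the installed library (e.g. -ljemalloc) when building a standalone version.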
+ + Bug fixes: + - Remove bad assertions in fork handler functions. These assertions could + cause aborts for some combinations of configure settings. + - Fix strerror_r() usage to deal with non-standard semantics in GNU libc. + - Fix leak context reporting. This bug tended to cause the number of contexts + to be underreported (though the reported number of objects and bytes were + correct). + - Fix a realloc() bug for large in-place growing reallocation. This bug could + cause memory corruption, but it was hard to trigger. + - Fix an allocation bug for small allocations that could be triggered if + multiple threads raced to create a new run of backing pages. + - Enhance the heap profiler to trigger samples based on usable size, rather + than request size. + - Fix a heap profiling bug due to sometimes losing track of requested object + size for sampled objects. + +* 1.0.3 + + Bug fixes: + - Fix the libunwind-based implementation of stack backtracing (used for heap + profiling). This bug could cause zero-length backtraces to be reported. + - Add a missing mutex unlock in library initialization code. If multiple + threads raced to initialize malloc, some of them could end up permanently + blocked. + +* 1.0.2 + + Bug fixes: + - Fix junk filling of large objects, which could cause memory corruption. + - Add MAP_NORESERVE support for chunk mapping, because otherwise virtual + memory limits could cause swap file configuration to fail. Contributed by + Jordan DeLong. + +* 1.0.1 + + Bug fixes: + - Fix compilation when --enable-fill is specified. + - Fix threads-related profiling bugs that affected accuracy and caused memory + to be leaked during thread exit. + - Fix dirty page purging race conditions that could cause crashes. + - Fix crash in tcache flushing code during thread destruction. + +* 1.0.0 + + This release focuses on speed and run-time introspection. Numerous + algorithmic improvements make this release substantially faster than its + predecessors. + + New features: + - Implement autoconf-based configuration system. + - Add mallctl*(), for the purposes of introspection and run-time + configuration. + - Make it possible for the application to manually flush a thread's cache, via + the "tcache.flush" mallctl. + - Base maximum dirty page count on proportion of active memory. + - Compute various addtional run-time statistics, including per size class + statistics for large objects. + - Expose malloc_stats_print(), which can be called repeatedly by the + application. + - Simplify the malloc_message() signature to only take one string argument, + and incorporate an opaque data pointer argument for use by the application + in combination with malloc_stats_print(). + - Add support for allocation backed by one or more swap files, and allow the + application to disable over-commit if swap files are in use. + - Implement allocation profiling and leak checking. + + Removed features: + - Remove the dynamic arena rebalancing code, since thread-specific caching + reduces its utility. + + Bug fixes: + - Modify chunk allocation to work when address space layout randomization + (ASLR) is in use. + - Fix thread cleanup bugs related to TLS destruction. + - Handle 0-size allocation requests in posix_memalign(). + - Fix a chunk leak. The leaked chunks were never touched, so this impacted + virtual memory usage, but not physical memory usage. + +* linux_20080828a, linux_20080827a + + These snapshot releases are the simple result of incorporating Linux-specific + support into the FreeBSD malloc sources. 
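The experimental {,r,s,d}allocm() API highlighted under 2.0.0 above is perhaps clearest from a small example. This is an illustrative sketch only, assuming an unprefixed build and the ALLOCM_* flag/return macros provided by jemalloc/jemalloc.h in this release (ALLOCM_ALIGN(), ALLOCM_ZERO, ALLOCM_NO_MOVE, ALLOCM_SUCCESS, ALLOCM_ERR_NOT_MOVED); treat those names as assumptions if your copy differs.

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        void *p;
        size_t rsize, usize;

        /* At least 1000 bytes, 4 KiB-aligned and zeroed; rsize receives the
         * real (usable) size of the allocation. */
        if (allocm(&p, &rsize, 1000, ALLOCM_ALIGN(4096) | ALLOCM_ZERO)
            != ALLOCM_SUCCESS)
            return (1);

        /* Try to grow to 8000 bytes (plus up to 2000 optional extra bytes),
         * but fail rather than move the object. */
        if (rallocm(&p, &rsize, 8000, 2000, ALLOCM_NO_MOVE)
            == ALLOCM_ERR_NOT_MOVED)
            printf("could not grow in place; usable size is still %zu\n", rsize);

        /* sallocm() reports the usable size, like malloc_usable_size(). */
        (void)sallocm(p, &usize, 0);
        printf("usable size: %zu\n", usize);

        dallocm(p, 0);
        return (0);
    }

The test/allocm.c and test/rallocm.c programs added in this patch cover the same entry points more thoroughly.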
+ +-------------------------------------------------------------------------------- +vim:filetype=text:textwidth=80 diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index eec3b37e..fafd7883 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -27,26 +27,42 @@ any of the following arguments (not a definitive list) to 'configure': it is linked to. This works only on ELF-based systems. --with-jemalloc-prefix=<prefix> - Prefix all public APIs with <prefix>, so that, for example, malloc() - becomes <prefix>malloc(). This makes it possible to use jemalloc at the - same time as the system allocator. + Prefix all public APIs with <prefix>. For example, if <prefix> is + "prefix_", the API changes like the following occur: + + malloc() --> prefix_malloc() + malloc_conf --> prefix_malloc_conf + /etc/malloc.conf --> /etc/prefix_malloc.conf + MALLOC_CONF --> PREFIX_MALLOC_CONF + + This makes it possible to use jemalloc at the same time as the system + allocator, or even to use multiple copies of jemalloc simultaneously. + + By default, the prefix is "", except on OS X, where it is "je_". On OS X, + jemalloc overlays the default malloc zone, but makes no attempt to actually + replace the "malloc", "calloc", etc. symbols. --with-install-suffix=<suffix> Append <suffix> to the base name of all installed files, such that multiple versions of jemalloc can coexist in the same installation directory. For example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0. +--enable-cc-silence + Enable code that silences unuseful compiler warnings. This is helpful when + trying to tell serious warnings from those due to compiler limitations, but + it potentially incurs a performance penalty. + --enable-debug Enable assertions and validation code. This incurs a substantial performance hit, but is very useful during application development. --enable-stats - Enable statistics gathering functionality. Use the 'P' option to print - detailed allocation statistics at exit. + Enable statistics gathering functionality. See the "opt.stats_print" + option documentation for usage details. --enable-prof - Enable heap profiling and leak detection functionality. Use the 'B', 'E', - 'F', 'I', 'L', and 'U' options to control these features. + Enable heap profiling and leak detection functionality. See the "opt.prof" + option documentation for usage details. --disable-prof-libgcc Disable the use of libgcc's backtracing functionality. Ordinarily, libgcc's @@ -72,8 +88,8 @@ any of the following arguments (not a definitive list) to 'configure': --disable-tcache Disable thread-specific caches for small objects. Objects are cached and - released in bulk, thus reducing the total number of mutex operations. Use - the 'H', 'G', and 'M' options to control thread-specific caching. + released in bulk, thus reducing the total number of mutex operations. See + the "opt.tcache" option for usage details. --enable-swap Enable mmap()ed swap file support. When this feature is built in, it is @@ -85,18 +101,18 @@ any of the following arguments (not a definitive list) to 'configure': mmap(2). --enable-fill - Enable support for junk/zero filling of memory. Use the 'J' option to - control junk filling, or the 'Z' option to control zero filling. + Enable support for junk/zero filling of memory. See the "opt.junk"/ + "opt.zero" option documentation for usage details. --enable-xmalloc Enable support for optional immediate termination due to out-of-memory errors, as is commonly implemented by "xmalloc" wrapper function for malloc. - Use the 'X' option to control termination behavior. 
+ See the "opt.xmalloc" option documentation for usage details. --enable-sysv Enable support for System V semantics, wherein malloc(0) returns NULL - rather than a minimal allocation. Use the 'V' option to control System V - compatibility. + rather than a minimal allocation. See the "opt.sysv" option documentation + for usage details. --enable-dynamic-page-shift Under most conditions, the system page size never changes (usually 4KiB or @@ -213,3 +229,14 @@ directory, issue configuration and build commands: cd obj ../configure --enable-autogen make + +=== Documentation ============================================================== + +The manual page that the configure script generates can be manually formatted +prior to installation via any of the following commands: + + nroff -man -man-ext -t doc/jemalloc.3 + + groff -man -man-ext -t -Tps doc/jemalloc.3 | ps2pdf - doc/jemalloc.3.pdf + + (cd doc; groff -man -man-ext -t -Thtml jemalloc.3 > jemalloc.3.html) diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index ac9b7822..46eddf46 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -28,10 +28,17 @@ LIBS := @LIBS@ RPATH_EXTRA := @RPATH_EXTRA@ ifeq (macho, @abi@) SO := dylib +WL_SONAME := dylib_install_name else SO := so +WL_SONAME := soname +endif +REV := 1 +ifeq (macho, @abi@) +TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH=@objroot@lib +else +TEST_LIBRARY_PATH := endif -REV := 0 # Lists of files. BINS := @srcroot@bin/pprof @@ -42,11 +49,18 @@ CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \ @srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \ @srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \ @srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \ - @srcroot@src/prof.c @srcroot@src/stats.c @srcroot@src/tcache.c -DSOS := @objroot@lib/libjemalloc@install_suffix@.so.$(REV) \ - @objroot@lib/libjemalloc@install_suffix@.so \ + @srcroot@src/prof.c @srcroot@src/rtree.c \ + @srcroot@src/stats.c @srcroot@src/tcache.c +ifeq (macho, @abi@) +CSRCS += @srcroot@src/zone.c +endif +DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \ + @objroot@lib/libjemalloc@install_suffix@.$(SO) \ @objroot@lib/libjemalloc@install_suffix@_pic.a MAN3 := @objroot@doc/jemalloc@install_suffix@.3 +CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \ + @srcroot@test/posix_memalign.c \ + @srcroot@test/rallocm.c @srcroot@test/thread_arena.c .PHONY: all dist install check clean distclean relclean @@ -63,18 +77,32 @@ all: $(DSOS) $(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $< @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" -%.so : %.so.$(REV) +%.$(SO) : %.$(SO).$(REV) @mkdir -p $(@D) ln -sf $( $(@:%.o=%.d)" + +@objroot@test/%: @objroot@test/%.o \ + @objroot@lib/libjemalloc@install_suffix@.$(SO) + @mkdir -p $(@D) +ifneq (@RPATH@, ) + $(CC) -o $@ $< @RPATH@@objroot@lib -L@objroot@lib -ljemalloc@install_suffix@ +else + $(CC) -o $@ $< -L@objroot@lib -ljemalloc@install_suffix@ +endif + install_bin: install -d $(BINDIR) @for b in $(BINS); do \ @@ -91,8 +119,8 @@ done install_lib: $(DSOS) install -d $(LIBDIR) - install -m 755 @objroot@lib/libjemalloc@install_suffix@.so.$(REV) $(LIBDIR) - ln -sf libjemalloc@install_suffix@.so.$(REV) $(LIBDIR)/libjemalloc@install_suffix@.so + install -m 755 @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR) + ln -sf libjemalloc@install_suffix@.$(SO).$(REV) 
$(LIBDIR)/libjemalloc@install_suffix@.$(SO) install -m 755 @objroot@lib/libjemalloc@install_suffix@_pic.a $(LIBDIR) install_man: @@ -104,19 +132,50 @@ done install: install_bin install_include install_lib install_man -check: +tests: $(CTESTS:@srcroot@%.c=@objroot@%) + +check: tests + @mkdir -p @objroot@test + @$(SHELL) -c 'total=0; \ + failures=0; \ + echo "========================================="; \ + for t in $(CTESTS:@srcroot@%.c=@objroot@%); do \ + total=`expr $$total + 1`; \ + /bin/echo -n "$${t} ... "; \ + $(TEST_LIBRARY_PATH) $${t} @abs_srcroot@ @abs_objroot@ \ + > @objroot@$${t}.out 2>&1; \ + if test -e "@srcroot@$${t}.exp"; then \ + diff -u @srcroot@$${t}.exp \ + @objroot@$${t}.out >/dev/null 2>&1; \ + fail=$$?; \ + if test "$${fail}" -eq "1" ; then \ + failures=`expr $${failures} + 1`; \ + echo "*** FAIL ***"; \ + else \ + echo "pass"; \ + fi; \ + else \ + echo "*** FAIL *** (.exp file is missing)"; \ + failures=`expr $${failures} + 1`; \ + fi; \ + done; \ + echo "========================================="; \ + echo "Failures: $${failures}/$${total}"' clean: rm -f $(CSRCS:@srcroot@%.c=@objroot@%.o) rm -f $(CSRCS:@srcroot@%.c=@objroot@%.d) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%.o) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%.d) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%.out) rm -f $(DSOS) distclean: clean rm -rf @objroot@autom4te.cache rm -f @objroot@config.log rm -f @objroot@config.status - rm -f @objroot@cfghdrs.stamp - rm -f @objroot@cfgoutputs.stamp + rm -f @objroot@config.stamp rm -f @cfghdrs_out@ rm -f @cfgoutputs_out@ diff --git a/jemalloc/README b/jemalloc/README index 2ff36ef6..4d7b552b 100644 --- a/jemalloc/README +++ b/jemalloc/README @@ -1,9 +1,9 @@ jemalloc is a general-purpose scalable concurrent malloc(3) implementation. This distribution is a stand-alone "portable" implementation that currently -targets only Linux. jemalloc is included as the default allocator in the -FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox web -browser on Microsoft Windows-related platforms. Depending on your needs, one -of the other divergent versions may suit your needs better than this +targets Linux and Apple OS X. jemalloc is included as the default allocator in +the FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox +web browser on Microsoft Windows-related platforms. Depending on your needs, +one of the other divergent versions may suit your needs better than this distribution. The COPYING file contains copyright and licensing information. @@ -11,4 +11,6 @@ The COPYING file contains copyright and licensing information. The INSTALL file contains information on how to configure, build, and install jemalloc. +The ChangeLog file contains a brief summary of changes for each release. 
+ URL: http://www.canonware.com/jemalloc/ diff --git a/jemalloc/bin/pprof b/jemalloc/bin/pprof index 57c0600e..1655f07c 100755 --- a/jemalloc/bin/pprof +++ b/jemalloc/bin/pprof @@ -92,9 +92,7 @@ my $GV = "gv"; my $KCACHEGRIND = "kcachegrind"; my $PS2PDF = "ps2pdf"; # These are used for dynamic profiles -my $WGET = "wget"; -my $WGET_FLAGS = "--no-http-keep-alive"; # only supported by some wgets -my $CURL = "curl"; +my $URL_FETCHER = "curl -s"; # These are the web pages that servers need to support for dynamic profiles my $HEAP_PAGE = "/pprof/heap"; @@ -108,6 +106,12 @@ my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . + "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . + "$FILTEREDPROFILE_PAGE)"; + # default binary name my $UNKNOWN_BINARY = "(unknown)"; @@ -176,12 +180,14 @@ Output type: --text Generate text report --callgrind Generate callgrind format to stdout --gv Generate Postscript and display + --web Generate SVG and display --list= Generate source listing of matching routines --disasm= Generate disassembly of matching routines --symbols Print demangled symbol names found at given addresses --dot Generate DOT file to stdout --ps Generate Postcript to stdout --pdf Generate PDF to stdout + --svg Generate SVG to stdout --gif Generate GIF to stdout --raw Generate symbolized pprof data (useful with remote fetch) @@ -209,7 +215,7 @@ Call-graph Options: (i.e. direct leak generators) more visible Miscellaneous: - --tools= Prefix for object tool pathnames + --tools=[,...] \$PATH for object tool pathnames --test Run unit tests --help This message --version Version information @@ -224,6 +230,8 @@ pprof /bin/ls ls.prof Enters "interactive" mode pprof --text /bin/ls ls.prof Outputs one line per procedure +pprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser pprof --gv /bin/ls ls.prof Displays annotated call-graph via 'gv' pprof --gv --focus=Mutex /bin/ls ls.prof @@ -234,6 +242,9 @@ pprof --list=getdir /bin/ls ls.prof (Per-line) annotated source listing for getdir() pprof --disasm=getdir /bin/ls ls.prof (Per-PC) annotated disassembly for getdir() + +pprof http://localhost:1234/ + Enters "interactive" mode pprof --text localhost:1234 Outputs one line per procedure for localhost:1234 pprof --raw localhost:1234 > ./local.raw @@ -293,10 +304,12 @@ sub Init() { $main::opt_disasm = ""; $main::opt_symbols = 0; $main::opt_gv = 0; + $main::opt_web = 0; $main::opt_dot = 0; $main::opt_ps = 0; $main::opt_pdf = 0; $main::opt_gif = 0; + $main::opt_svg = 0; $main::opt_raw = 0; $main::opt_nodecount = 80; @@ -331,6 +344,9 @@ sub Init() { # Are we using $SYMBOL_PAGE? $main::use_symbol_page = 0; + # Files returned by TempName. + %main::tempnames = (); + # Type of profile we are dealing with # Supported types: # cpu @@ -356,9 +372,11 @@ sub Init() { "disasm=s" => \$main::opt_disasm, "symbols!" => \$main::opt_symbols, "gv!" => \$main::opt_gv, + "web!" => \$main::opt_web, "dot!" => \$main::opt_dot, "ps!" => \$main::opt_ps, "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, "gif!" => \$main::opt_gif, "raw!" => \$main::opt_raw, "interactive!" => \$main::opt_interactive, @@ -434,9 +452,11 @@ sub Init() { ($main::opt_disasm eq '' ? 0 : 1) + ($main::opt_symbols == 0 ? 
0 : 1) + $main::opt_gv + + $main::opt_web + $main::opt_dot + $main::opt_ps + $main::opt_pdf + + $main::opt_svg + $main::opt_gif + $main::opt_raw + $main::opt_interactive + @@ -511,20 +531,6 @@ sub Init() { ConfigureObjTools($main::prog) } - # Check what flags our commandline utilities support - if (open(TFILE, "$WGET $WGET_FLAGS -V 2>&1 |")) { - my @lines = ; - if (grep(/unrecognized/, @lines) > 0) { - # grep found 'unrecognized' token from WGET, clear WGET flags - $WGET_FLAGS = ""; - } - close(TFILE); - } - # TODO(csilvers): check all the other binaries and objtools to see - # if they are installed and what flags they support, and store that - # in a data structure here, rather than scattering these tests about. - # Then, ideally, rewrite code to use wget OR curl OR GET or ... - # Break the opt_list_prefix into the prefix_list array @prefix_list = split (',', $main::opt_lib_prefix); @@ -588,6 +594,10 @@ sub Main() { } elsif ($main::use_symbol_page) { $symbols = FetchSymbols($pcs); } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. $symbols = ExtractSymbols($libs, $pcs); } @@ -635,9 +645,24 @@ sub Main() { } else { if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { - RunGV(PsTempName($main::next_tmpfile), ""); + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. + delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } } } else { + cleanup(); exit(1); } } @@ -683,6 +708,34 @@ sub RunGV { } } +sub RunWeb { + my $fname = shift; + print STDERR "Loading web page file:///$fname\n"; + + if (`uname` =~ /Darwin/) { + # OS X: open will use standard preference for SVG files. + system("/usr/bin/open", $fname); + return; + } + + # Some kind of Unix; try generic symlinks, then specific browsers. + # (Stop once we find one.) + # Works best if the browser is already running. + my @alt = ( + "/etc/alternatives/gnome-www-browser", + "/etc/alternatives/x-www-browser", + "google-chrome", + "firefox", + ); + foreach my $b (@alt) { + if (system($b, $fname) == 0) { + return; + } + } + + print STDERR "Could not load web browser.\n"; +} + sub RunKcachegrind { my $fname = shift; my $bg = shift; # "" or " &" if we should run in background @@ -739,10 +792,10 @@ sub InteractiveCommand { print STDERR "\n"; return 0; } - if (m/^ *quit/) { + if (m/^\s*quit/) { return 0; } - if (m/^ *help/) { + if (m/^\s*help/) { InteractiveHelpMessage(); return 1; } @@ -754,7 +807,7 @@ sub InteractiveCommand { $main::opt_gv = 0; $main::opt_cum = 0; - if (m/^ *(text|top)(\d*) *(.*)/) { + if (m/^\s*(text|top)(\d*)\s*(.*)/) { $main::opt_text = 1; my $line_limit = ($2 ne "") ? 
int($2) : 10; @@ -773,14 +826,14 @@ sub InteractiveCommand { PrintText($symbols, $flat, $cumulative, $total, $line_limit); return 1; } - if (m/^ *callgrind *([^ \n]*)/) { + if (m/^\s*callgrind\s*([^ \n]*)/) { $main::opt_callgrind = 1; # Get derived profiles my $calls = ExtractCalls($symbols, $orig_profile); my $filename = $1; if ( $1 eq '' ) { - $filename = CallgrindTempName($main::next_tmpfile); + $filename = TempName($main::next_tmpfile, "callgrind"); } PrintCallgrind($calls, $filename); if ( $1 eq '' ) { @@ -790,7 +843,7 @@ sub InteractiveCommand { return 1; } - if (m/^ *list *(.+)/) { + if (m/^\s*list\s*(.+)/) { $main::opt_list = 1; my $routine; @@ -807,7 +860,7 @@ sub InteractiveCommand { PrintListing($libs, $flat, $cumulative, $routine); return 1; } - if (m/^ *disasm *(.+)/) { + if (m/^\s*disasm\s*(.+)/) { $main::opt_disasm = 1; my $routine; @@ -825,12 +878,18 @@ sub InteractiveCommand { PrintDisassembly($libs, $flat, $cumulative, $routine, $total); return 1; } - if (m/^ *gv *(.*)/) { - $main::opt_gv = 1; + if (m/^\s*(gv|web)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } my $focus; my $ignore; - ($focus, $ignore) = ParseInteractiveArgs($1); + ($focus, $ignore) = ParseInteractiveArgs($2); # Process current profile to account for various settings my $profile = ProcessProfile($orig_profile, $symbols, $focus, $ignore); @@ -841,11 +900,19 @@ sub InteractiveCommand { my $cumulative = CumulativeProfile($reduced); if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { - RunGV(PsTempName($main::next_tmpfile), " &"); + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } $main::next_tmpfile++; } return 1; } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; return 1; } @@ -894,6 +961,14 @@ Commands: the "focus" regular expression matches a routine name on the stack trace. + web + web [focus] [-ignore1] [-ignore2] + Like GV, but displays profile in your web browser instead of using + Ghostview. Works best if your web browser is already running. + To change the browser that gets used: + On Linux, set the /etc/alternatives/gnome-www-browser symlink. + On OS X, change the Finder association for SVG files. + list [routine_regexp] [-ignore1] [-ignore2] Show source listing of routines whose names match "routine_regexp" @@ -950,14 +1025,12 @@ sub ParseInteractiveArgs { ##### Output code ##### -sub PsTempName { +sub TempName { my $fnum = shift; - return "$main::tmpfile_ps" . "." . "$fnum" . ".ps"; -} - -sub CallgrindTempName { - my $fnum = shift; - return "$main::tmpfile_ps" . "." . "$fnum" . ".callgrind"; + my $ext = shift; + my $file = "$main::tmpfile_ps.$fnum.$ext"; + $main::tempnames{$file} = 1; + return $file; } # Print profile data in packed binary format (64-bit) to standard out @@ -1599,7 +1672,6 @@ sub PrintDot { } if ($last < 0) { print STDERR "No nodes to print\n"; - cleanup(); return 0; } @@ -1612,11 +1684,14 @@ sub PrintDot { # Open DOT output file my $output; if ($main::opt_gv) { - $output = "| $DOT -Tps2 >" . PsTempName($main::next_tmpfile); + $output = "| $DOT -Tps2 >" . 
TempName($main::next_tmpfile, "ps"); } elsif ($main::opt_ps) { $output = "| $DOT -Tps2"; } elsif ($main::opt_pdf) { $output = "| $DOT -Tps2 | $PS2PDF - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. + $output = "| $DOT -Tsvg >" . TempName($main::next_tmpfile, "svg"); } elsif ($main::opt_gif) { $output = "| $DOT -Tgif"; } else { @@ -1727,7 +1802,10 @@ sub PrintDot { my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); if ($fraction > 1) { $fraction = 1; } my $w = $fraction * 2; - #if ($w < 1) { $w = 1; } + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. + $w = 1; + } # Dot sometimes segfaults if given edge weights that are too large, so # we cap the weights at a large value @@ -1751,11 +1829,312 @@ sub PrintDot { } print DOT ("}\n"); - close(DOT); + + if ($main::opt_web || $main::opt_svg) { + # Rewrite SVG to be more usable inside web browser. + RewriteSvg(TempName($main::next_tmpfile, "svg")); + } + return 1; } +sub RewriteSvg { + my $svgfile = shift; + + open(SVG, $svgfile) || die "open temp svg: $!"; + my @svg = ; + close(SVG); + unlink $svgfile; + my $svg = join('', @svg); + + # Dot's SVG output is + # + # + # + # ... + # + # + # + # Change it to + # + # + # $svg_javascript + # + # + # ... + # + # + # + + # Fix width, height; drop viewBox. + $svg =~ s/(?s) above first + my $svg_javascript = SvgJavascript(); + my $viewport = "\n"; + $svg =~ s/ above . + $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; + $svg =~ s/$svgfile") || die "open $svgfile: $!"; + print SVG $svg; + close(SVG); + } +} + +sub SvgJavascript { + return <<'EOF'; + +EOF +} + # Translate a stack of addresses into a stack of symbols sub TranslateStack { my $symbols = shift; @@ -2310,28 +2689,11 @@ sub AddEntries { AddEntry($profile, (join "\n", @k), $count); } -sub IsSymbolizedProfileFile { - my $file_name = shift; - - if (!(-e $file_name) || !(-r $file_name)) { - return 0; - } - - $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $symbol_marker = $&; - # Check if the file contains a symbol-section marker. - open(TFILE, "<$file_name"); - my @lines = ; - my $result = grep(/^--- *$symbol_marker/, @lines); - close(TFILE); - return $result > 0; -} - ##### Code to profile a server dynamically ##### sub CheckSymbolPage { my $url = SymbolPageURL(); - open(SYMBOL, "$WGET $WGET_FLAGS -qO- '$url' |"); + open(SYMBOL, "$URL_FETCHER '$url' |"); my $line = ; $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines close(SYMBOL); @@ -2350,33 +2712,45 @@ sub CheckSymbolPage { sub IsProfileURL { my $profile_name = shift; - my ($host, $port, $path) = ParseProfileURL($profile_name); - return defined($host) and defined($port) and defined($path); + if (-f $profile_name) { + printf STDERR "Using local file $profile_name.\n"; + return 0; + } + return 1; } sub ParseProfileURL { my $profile_name = shift; - if (defined($profile_name) && - $profile_name =~ m,^(http://|)([^/:]+):(\d+)(|\@\d+)(|/|.*($PROFILE_PAGE|$PMUPROFILE_PAGE|$HEAP_PAGE|$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|$FILTEREDPROFILE_PAGE))$,o) { - # $6 is $PROFILE_PAGE/$HEAP_PAGE/etc. $5 is *everything* after - # the hostname, as long as that everything is the empty string, - # a slash, or something ending in $PROFILE_PAGE/$HEAP_PAGE/etc. - # So "$6 || $5" is $PROFILE_PAGE/etc if there, or else it's "/" or "". 
- return ($2, $3, $6 || $5); + + if (!defined($profile_name) || $profile_name eq "") { + return (); } - return (); + + # Split profile URL - matches all non-empty strings, so no test. + $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,; + + my $proto = $1 || "http://"; + my $hostport = $2; + my $prefix = $3; + my $profile = $4 || "/"; + + my $host = $hostport; + $host =~ s/:.*//; + + my $baseurl = "$proto$hostport$prefix"; + return ($host, $baseurl, $profile); } # We fetch symbols from the first profile argument. sub SymbolPageURL { - my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]); - return "http://$host:$port$SYMBOL_PAGE"; + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + return "$baseURL$SYMBOL_PAGE"; } sub FetchProgramName() { - my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]); - my $url = "http://$host:$port$PROGRAM_NAME_PAGE"; - my $command_line = "$WGET $WGET_FLAGS -qO- '$url'"; + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + my $url = "$baseURL$PROGRAM_NAME_PAGE"; + my $command_line = "$URL_FETCHER '$url'"; open(CMDLINE, "$command_line |") or error($command_line); my $cmdline = ; $cmdline =~ s/\r//g; # turn windows-looking lines into unix-looking lines @@ -2393,7 +2767,7 @@ sub FetchProgramName() { # curl. Redirection happens on borg hosts. sub ResolveRedirectionForCurl { my $url = shift; - my $command_line = "$CURL -s --head '$url'"; + my $command_line = "$URL_FETCHER --head '$url'"; open(CMDLINE, "$command_line |") or error($command_line); while () { s/\r//g; # turn windows-looking lines into unix-looking lines @@ -2405,6 +2779,20 @@ sub ResolveRedirectionForCurl { return $url; } +# Add a timeout flat to URL_FETCHER +sub AddFetchTimeout { + my $fetcher = shift; + my $timeout = shift; + if (defined($timeout)) { + if ($fetcher =~ m/\bcurl -s/) { + $fetcher .= sprintf(" --max-time %d", $timeout); + } elsif ($fetcher =~ m/\brpcget\b/) { + $fetcher .= sprintf(" --deadline=%d", $timeout); + } + } + return $fetcher; +} + # Reads a symbol map from the file handle name given as $1, returning # the resulting symbol map. Also processes variables relating to symbols. # Currently, the only variable processed is 'binary=' which updates @@ -2460,10 +2848,14 @@ sub FetchSymbols { close(POSTFILE); my $url = SymbolPageURL(); - # Here we use curl for sending data via POST since old - # wget doesn't have --post-file option. - $url = ResolveRedirectionForCurl($url); - my $command_line = "$CURL -sd '\@$main::tmpfile_sym' '$url'"; + + my $command_line; + if ($URL_FETCHER =~ m/\bcurl -s/) { + $url = ResolveRedirectionForCurl($url); + $command_line = "$URL_FETCHER -d '\@$main::tmpfile_sym' '$url'"; + } else { + $command_line = "$URL_FETCHER --post '$url' < '$main::tmpfile_sym'"; + } # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. 
my $cppfilt = $obj_tool_map{"c++filt"}; open(SYMBOL, "$command_line | $cppfilt |") or error($command_line); @@ -2508,10 +2900,10 @@ sub BaseName { sub MakeProfileBaseName { my ($binary_name, $profile_name) = @_; - my ($host, $port, $path) = ParseProfileURL($profile_name); + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); my $binary_shortname = BaseName($binary_name); - return sprintf("%s.%s.%s-port%s", - $binary_shortname, $main::op_time, $host, $port); + return sprintf("%s.%s.%s", + $binary_shortname, $main::op_time, $host); } sub FetchDynamicProfile { @@ -2523,7 +2915,7 @@ sub FetchDynamicProfile { if (!IsProfileURL($profile_name)) { return $profile_name; } else { - my ($host, $port, $path) = ParseProfileURL($profile_name); + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); if ($path eq "" || $path eq "/") { # Missing type specifier defaults to cpu-profile $path = $PROFILE_PAGE; @@ -2531,35 +2923,26 @@ sub FetchDynamicProfile { my $profile_file = MakeProfileBaseName($binary_name, $profile_name); - my $url; - my $wget_timeout; - if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)) { - if ($path =~ m/$PROFILE_PAGE/) { - $url = sprintf("http://$host:$port$path?seconds=%d", - $main::opt_seconds); + my $url = "$baseURL$path"; + my $fetch_timeout = undef; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { + if ($path =~ m/[?]/) { + $url .= "&"; } else { - if ($profile_name =~ m/[?]/) { - $profile_name .= "&" - } else { - $profile_name .= "?" - } - $url = sprintf("http://$profile_name" . "seconds=%d", - $main::opt_seconds); + $url .= "?"; } - $wget_timeout = sprintf("--timeout=%d", - int($main::opt_seconds * 1.01 + 60)); + $url .= sprintf("seconds=%d", $main::opt_seconds); + $fetch_timeout = $main::opt_seconds * 1.01 + 60; } else { # For non-CPU profiles, we add a type-extension to # the target profile file name. my $suffix = $path; $suffix =~ s,/,.,g; - $profile_file .= "$suffix"; - $url = "http://$host:$port$path"; - $wget_timeout = ""; + $profile_file .= $suffix; } my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof"); - if (!(-d $profile_dir)) { + if (! -d $profile_dir) { mkdir($profile_dir) || die("Unable to create profile directory $profile_dir: $!\n"); } @@ -2570,14 +2953,15 @@ sub FetchDynamicProfile { return $real_profile; } - my $cmd = "$WGET $WGET_FLAGS $wget_timeout -q -O $tmp_profile '$url'"; - if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)){ + my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout); + my $cmd = "$fetcher '$url' > '$tmp_profile'"; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){ print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; if ($encourage_patience) { print STDERR "Be patient...\n"; } } else { - print STDERR "Fetching $path profile from $host:$port to\n ${real_profile}\n"; + print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; } (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); @@ -2624,6 +3008,7 @@ sub FetchDynamicProfilesRecurse { } else { $position = 1 | ($position << 1); TryCollectProfile($maxlevel, $level, $position); + cleanup(); exit(0); } } @@ -2662,6 +3047,7 @@ BEGIN { stride => 512 * 1024, # must be a multiple of bitsize/8 slots => [], unpack_code => "", # N for big-endian, V for little + perl_is_64bit => 1, # matters if profile is 64-bit }; bless $self, $class; # Let unittests adjust the stride @@ -2685,17 +3071,15 @@ BEGIN { } @$slots = unpack($self->{unpack_code} . 
"*", $str); } else { - # If we're a 64-bit profile, make sure we're a 64-bit-capable + # If we're a 64-bit profile, check if we're a 64-bit-capable # perl. Otherwise, each slot will be represented as a float # instead of an int64, losing precision and making all the - # 64-bit addresses right. We *could* try to handle this with - # software emulation of 64-bit ints, but that's added complexity - # for no clear benefit (yet). We use 'Q' to test for 64-bit-ness; - # perl docs say it's only available on 64-bit perl systems. + # 64-bit addresses wrong. We won't complain yet, but will + # later if we ever see a value that doesn't fit in 32 bits. my $has_q = 0; eval { $has_q = pack("Q", "1") ? 1 : 1; }; if (!$has_q) { - ::error("$fname: need a 64-bit perl to process this 64-bit profile.\n"); + $self->{perl_is_64bit} = 0; } read($self->{file}, $str, 8); if (substr($str, 4, 4) eq chr(0)x4) { @@ -2731,11 +3115,17 @@ BEGIN { # TODO(csilvers): if this is a 32-bit perl, the math below # could end up in a too-large int, which perl will promote # to a double, losing necessary precision. Deal with that. - if ($self->{unpack_code} eq 'V') { # little-endian - push(@b64_values, $b32_values[$i] + $b32_values[$i+1] * (2**32)); - } else { - push(@b64_values, $b32_values[$i] * (2**32) + $b32_values[$i+1]); - } + # Right now, we just die. + my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); + if ($self->{unpack_code} eq 'N') { # big-endian + ($lo, $hi) = ($hi, $lo); + } + my $value = $lo + $hi * (2**32); + if (!$self->{perl_is_64bit} && # check value is exactly represented + (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { + ::error("Need a 64-bit perl to process this 64-bit profile.\n"); + } + push(@b64_values, $value); } @$slots = @b64_values; } @@ -2764,6 +3154,44 @@ BEGIN { } } +# Return the next line from the profile file, assuming it's a text +# line (which in this case means, doesn't start with a NUL byte). If +# it's not a text line, return "". At EOF, return undef, like perl does. +# Input file should be in binmode. +sub ReadProfileLine { + local *PROFILE = shift; + my $firstchar = ""; + my $line = ""; + read(PROFILE, $firstchar, 1); + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar eq "\0") { + return ""; + } + $line = ; + if (defined($line)) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + } + return $line; +} + +sub IsSymbolizedProfileFile { + my $file_name = shift; + if (!(-e $file_name) || !(-r $file_name)) { + return 0; + } + # Check if the file contains a symbol-section marker. + open(TFILE, "<$file_name"); + binmode TFILE; + my $firstline = ReadProfileLine(*TFILE); + close(TFILE); + if (!$firstline) { + return 0; + } + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + return $firstline =~ /^--- *$symbol_marker/; +} + # Parse profile generated by common/profiler.cc and return a reference # to a map: # $result->{version} Version number of profile file @@ -2798,28 +3226,17 @@ sub ReadProfile { # whole firstline, since it may be gigabytes(!) of data. 
open(PROFILE, "<$fname") || error("$fname: $!\n"); binmode PROFILE; # New perls do UTF-8 processing - my $firstchar = ""; - my $header = ""; - read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar ne "\0") { - $header = ; - $header =~ s/\r//g; # turn windows-looking lines into unix-looking lines + my $header = ReadProfileLine(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); } my $symbols; if ($header =~ m/^--- *$symbol_marker/o) { - # read the symbol section of the symbolized profile file + # Read the symbol section of the symbolized profile file. $symbols = ReadSymbols(*PROFILE{IO}); - - # read the next line to get the header for the remaining profile - $header = ""; - read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar ne "\0") { - $header = ; - $header =~ s/\r//g; - } + # Read the next line to get the header for the remaining profile. + $header = ReadProfileLine(*PROFILE) || ""; } my $result; @@ -3114,18 +3531,18 @@ sub ReadHeapProfile { # The sampling frequency is the rate of a Poisson process. # This means that the probability of sampling an allocation of # size X with sampling rate Y is 1 - exp(-X/Y) - my $ratio; - my $scale_factor; - if ($n1 != 0) { - $ratio = (($s1*1.0)/$n1)/($sample_adjustment); - $scale_factor = 1/(1 - exp(-$ratio)); - $n1 *= $scale_factor; - $s1 *= $scale_factor; - } - $ratio = (($s2*1.0)/$n2)/($sample_adjustment); - $scale_factor = 1/(1 - exp(-$ratio)); + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); $n2 *= $scale_factor; $s2 *= $scale_factor; + } } else { # Remote-heap version 1 my $ratio; @@ -3676,35 +4093,34 @@ sub ExtractSymbols { my $symbols = {}; - # Map each PC value to the containing library + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (binary will have the lowest start addr). my @pcs = (sort { $a cmp $b } keys(%{$pcset})); - foreach my $lib (reverse sort {$a->[1] cmp $b->[1]} @{$libs}) { + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { my $libname = $lib->[0]; my $start = $lib->[1]; my $finish = $lib->[2]; my $offset = $lib->[3]; # Get list of pcs that belong in this library. 
- my $pc = pop(@pcs); - my @pcs2 = (); my $contained = []; - while (defined $pc && $pc gt $finish) { - unshift(@pcs2, $pc); - $pc = pop(@pcs); + my ($start_pc_index, $finish_pc_index); + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; } - while (defined $pc && $pc ge $start) { - push(@{$contained}, $pc); - $pc = pop(@pcs); + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; } - if (defined $pc) { - push(@pcs, $pc); - } - @pcs = (@pcs, @pcs2); + @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); # Map to symbols MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); - if (scalar(@pcs) == 0) { - last; - } } return $symbols; @@ -3732,8 +4148,7 @@ sub MapToSymbols { # If "addr2line" isn't installed on the system at all, just use # nm to get what info we can (function names, but not line numbers). - if ($main::opt_lines == 0 || system("$addr2line --help >/dev/null 2>&1") - != 0) { + if (system("$addr2line --help >/dev/null 2>&1") != 0) { MapSymbolsWithNM($image, $offset, $pclist, $symbols); return; } @@ -3919,6 +4334,8 @@ sub ConfigureObjTools { if ($file_type =~ /Mach-O/) { # OS X uses otool to examine Mach-O files, rather than objdump. $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump } # Go fill in %obj_tool_map with the pathnames to use: @@ -3935,18 +4352,27 @@ sub ConfigureTool { my $tool = shift; my $path; - if ($main::opt_tools ne "") { - # Use a prefix specified by the --tools option... - $path = $main::opt_tools . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by --tools $main::opt_tools\n"); + # --tools (or $PPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } } - } elsif (exists $ENV{"PPROF_TOOLS"} && - $ENV{"PPROF_TOOLS"} ne "") { - #... or specified with the PPROF_TOOLS environment variable... - $path = $ENV{"PPROF_TOOLS"} . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by PPROF_TOOLS=$ENV{PPROF_TOOLS}\n"); + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$PPROF_TOOLS) '$tools'\n"); } } else { # ... otherwise use the version that exists in the same directory as @@ -3965,9 +4391,8 @@ sub ConfigureTool { sub cleanup { unlink($main::tmpfile_sym); - for (my $i = 0; $i < $main::next_tmpfile; $i++) { - unlink(PsTempName($i)); - } + unlink(keys %main::tempnames); + # We leave any collected profiles in $HOME/pprof in case the user wants # to look at them later. We print a message informing them of this. 
if ((scalar(@main::profile_files) > 0) && @@ -4010,7 +4435,7 @@ sub GetProcedureBoundariesViaNm { my $routine = ""; while () { s/\r//g; # turn windows-looking lines into unix-looking lines - if (m/^([0-9a-f]+) (.) (..*)/) { + if (m/^\s*([0-9a-f]+) (.) (..*)/) { my $start_val = $1; my $type = $2; my $this_routine = $3; @@ -4072,7 +4497,6 @@ sub GetProcedureBoundariesViaNm { $symbol_table->{$routine} = [HexExtend($last_start), HexExtend($last_start)]; } - return $symbol_table; } @@ -4120,7 +4544,11 @@ sub GetProcedureBoundaries { my @nm_commands = ("$nm -n $flatten_flag $demangle_flag" . " $image 2>/dev/null $cppfilt_flag", "$nm -D -n $flatten_flag $demangle_flag" . - " $image 2>/dev/null $cppfilt_flag"); + " $image 2>/dev/null $cppfilt_flag", + # 6nm is for Go binaries + "6nm $image 2>/dev/null | sort", + ); + # If the executable is an MS Windows PDB-format executable, we'll # have set up obj_tool_map("nm_pdb"). In this case, we actually # want to use both unix nm and windows-specific nm_pdb, since diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index ce6e6799..0ed13739 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -150,7 +150,7 @@ JE_COMPILABLE([__attribute__ syntax], [attribute]) if test "x${attribute}" = "xyes" ; then AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ]) - if test "x$GCC" = "xyes" ; then + if test "x$GCC" = "xyes" -a "${abi}" = "xelf"; then JE_CFLAGS_APPEND([-fvisibility=internal]) fi fi @@ -166,17 +166,20 @@ case "${host}" in *-*-darwin*) CFLAGS="$CFLAGS -fno-common -no-cpp-precomp" abi="macho" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE]) RPATH="" ;; *-*-freebsd*) CFLAGS="$CFLAGS" abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE]) RPATH="-Wl,-rpath," ;; *-*-linux*) CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE" abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED]) RPATH="-Wl,-rpath," ;; *-*-netbsd*) @@ -191,6 +194,7 @@ case "${host}" in [CFLAGS="$CFLAGS"; abi="elf"], [abi="aout"]) AC_MSG_RESULT([$abi]) + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE]) RPATH="-Wl,-rpath," ;; *-*-solaris2*) @@ -245,12 +249,20 @@ dnl Do not prefix public APIs by default. 
AC_ARG_WITH([jemalloc_prefix], [AS_HELP_STRING([--with-jemalloc-prefix=], [Prefix to prepend to all public APIs])], [JEMALLOC_PREFIX="$with_jemalloc_prefix"], - [JEMALLOC_PREFIX=] + [if test "x$abi" != "xmacho" ; then + JEMALLOC_PREFIX="" +else + JEMALLOC_PREFIX="je_" +fi] ) if test "x$JEMALLOC_PREFIX" != "x" ; then - AC_DEFINE([JEMALLOC_PREFIX], [ ]) + JEMALLOC_CPREFIX=`echo ${JEMALLOC_PREFIX} | tr "a-z" "A-Z"` + AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"]) + AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"]) jemalloc_prefix="$JEMALLOC_PREFIX" + jemalloc_cprefix="$JEMALLOC_CPREFIX" AC_SUBST([jemalloc_prefix]) + AC_SUBST([jemalloc_cprefix]) AC_DEFINE_UNQUOTED([JEMALLOC_P(string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix)], [${JEMALLOC_PREFIX}##string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix]) fi @@ -266,14 +278,17 @@ AC_SUBST([install_suffix]) cfgoutputs_in="${srcroot}Makefile.in ${srcroot}doc/jemalloc.3.in" cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc.h.in" cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in" +cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/jemalloc_test.h.in" cfgoutputs_out="Makefile doc/jemalloc${install_suffix}.3" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc${install_suffix}.h" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h" +cfgoutputs_out="${cfgoutputs_out} test/jemalloc_test.h" cfgoutputs_tup="Makefile doc/jemalloc${install_suffix}.3:doc/jemalloc.3.in" cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc${install_suffix}.h:include/jemalloc/jemalloc.h.in" cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h" +cfgoutputs_tup="${cfgoutputs_tup} test/jemalloc_test.h:test/jemalloc_test.h.in" cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in" @@ -281,6 +296,23 @@ cfghdrs_out="include/jemalloc/jemalloc_defs${install_suffix}.h" cfghdrs_tup="include/jemalloc/jemalloc_defs${install_suffix}.h:include/jemalloc/jemalloc_defs.h.in" +dnl Do not silence irrelevant compiler warnings by default, since enabling this +dnl option incurs a performance penalty. +AC_ARG_ENABLE([cc-silence], + [AS_HELP_STRING([--enable-cc-silence], + [Silence irrelevant compiler warnings])], +[if test "x$enable_cc_silence" = "xno" ; then + enable_cc_silence="0" +else + enable_cc_silence="1" +fi +], +[enable_cc_silence="0"] +) +if test "x$enable_cc_silence" = "x1" ; then + AC_DEFINE([JEMALLOC_CC_SILENCE]) +fi + dnl Do not compile with debugging by default. AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Build debugging code])], @@ -294,8 +326,18 @@ fi ) if test "x$enable_debug" = "x1" ; then AC_DEFINE([JEMALLOC_DEBUG], [ ]) + AC_DEFINE([JEMALLOC_IVSALLOC], [ ]) fi AC_SUBST([enable_debug]) +if test "x$enable_debug" = "x0" ; then + roff_debug=".\\\" " + roff_no_debug="" +else + roff_debug="" + roff_no_debug=".\\\" " +fi +AC_SUBST([roff_debug]) +AC_SUBST([roff_no_debug]) dnl Only optimize if not debugging. if test "x$enable_debug" = "x0" -a "x$no_CFLAGS" = "xyes" ; then @@ -379,7 +421,47 @@ else fi, LUNWIND="-lunwind" ) -dnl Finish prof-related definitions below, once TLS configuration is done. 
+if test "x$enable_prof" = "x1" ; then + LIBS="$LIBS -lm" + AC_DEFINE([JEMALLOC_PROF], [ ]) + if test "x$enable_prof_libunwind" = "x1" ; then + AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) + if test "x$LUNWIND" = "x-lunwind" ; then + AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], + [enable_prof_libunwind="0"]) + else + LIBS="$LIBS $LUNWIND" + fi + if test "x${enable_prof_libunwind}" = "x1" ; then + AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) + fi + fi +fi +AC_SUBST([enable_prof]) +if test "x$enable_prof" = "x0" ; then + roff_prof=".\\\" " + roff_no_prof="" +else + roff_prof="" + roff_no_prof=".\\\" " +fi +AC_SUBST([roff_prof]) +AC_SUBST([roff_no_prof]) + +dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics +dnl for backtracing. +if test "x$enable_prof" = "x1" -a "x$enable_prof_libgcc" = "x1" ; then + if test "x$enable_prof_libunwind" = "x0" -a "x$GCC" = "xyes" ; then + enable_prof_libgcc="1" + AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) + AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) + if test "x${enable_prof_libgcc}" = "x1" ; then + AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) + fi + else + enable_prof_libgcc="0" + fi +fi dnl Enable tiny allocations by default. AC_ARG_ENABLE([tiny], @@ -417,7 +499,19 @@ fi ], [enable_tcache="1"] ) -dnl Finish tcache-related definitions below, once TLS configuration is done. +if test "x$enable_tcache" = "x1" ; then + AC_DEFINE([JEMALLOC_TCACHE], [ ]) +fi +AC_SUBST([enable_tcache]) +if test "x$enable_tcache" = "x0" ; then + roff_tcache=".\\\" " + roff_no_tcache="" +else + roff_tcache="" + roff_no_tcache=".\\\" " +fi +AC_SUBST([roff_tcache]) +AC_SUBST([roff_no_tcache]) dnl Do not enable mmap()ped swap files by default. AC_ARG_ENABLE([swap], @@ -579,7 +673,7 @@ dnl dnl Set VERSION if source directory has an embedded git repository. if test -d "${srcroot}../.git" ; then - git describe --long > ${srcroot}VERSION + git describe --long --abbrev=40 > ${srcroot}VERSION fi jemalloc_version=`cat ${srcroot}VERSION` jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` @@ -647,69 +741,63 @@ AC_RUN_IFELSE([AC_LANG_PROGRAM( AC_MSG_RESULT([no]) enable_tls="0") fi +AC_SUBST([enable_tls]) if test "x${enable_tls}" = "x0" ; then AC_DEFINE_UNQUOTED([NO_TLS], [ ]) fi -dnl Finish tcache-related definitions, now that TLS configuration is done. -if test "x$enable_tls" = "x0" ; then - enable_tcache="0" -fi -if test "x$enable_tcache" = "x1" ; then - AC_DEFINE([JEMALLOC_TCACHE], [ ]) -fi -AC_SUBST([enable_tcache]) -if test "x$enable_tcache" = "x0" ; then - roff_tcache=".\\\" " - roff_no_tcache="" -else - roff_tcache="" - roff_no_tcache=".\\\" " -fi -AC_SUBST([roff_tcache]) -AC_SUBST([roff_no_tcache]) +dnl ============================================================================ +dnl Check for allocator-related functions that should be wrapped. -dnl Finish prof-related definitions, now that TLS configuration is done. 
-if test "x$enable_tls" = "x0" ; then - enable_prof="0" -fi -if test "x$enable_prof" = "x1" ; then - LIBS="$LIBS -lm" - AC_DEFINE([JEMALLOC_PROF], [ ]) - if test "x$enable_prof_libunwind" = "x1" ; then - AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) - if test "x$LUNWIND" = "x-lunwind" ; then - AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], - [enable_prof_libunwind="0"]) - else - LIBS="$LIBS $LUNWIND" - fi - if test "x${enable_prof_libunwind}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) - fi - fi -fi -AC_SUBST([enable_prof]) -if test "x$enable_prof" = "x0" ; then - roff_prof=".\\\" " - roff_no_prof="" -else - roff_prof="" - roff_no_prof=".\\\" " -fi -AC_SUBST([roff_prof]) -AC_SUBST([roff_no_prof]) +AC_CHECK_FUNC([memalign], + [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN])]) +AC_CHECK_FUNC([valloc], + [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC])]) -dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics -dnl for backtracing. -if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \ - -a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then - enable_prof_libgcc="1" - AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) - AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) - if test "x${enable_prof_libgcc}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) - fi +dnl ============================================================================ +dnl Darwin-related configuration. + +if test "x${abi}" = "xmacho" ; then + AC_DEFINE([JEMALLOC_IVSALLOC]) + AC_DEFINE([JEMALLOC_ZONE]) + + dnl The szone version jumped from 3 to 6 between the OS X 10.5.x and 10.6 + dnl releases. malloc_zone_t and malloc_introspection_t have new fields in + dnl 10.6, which is the only source-level indication of the change. 
+ AC_MSG_CHECKING([malloc zone version]) + AC_TRY_COMPILE([#include +#include ], [ + static malloc_zone_t zone; + static struct malloc_introspection_t zone_introspect; + + zone.size = NULL; + zone.malloc = NULL; + zone.calloc = NULL; + zone.valloc = NULL; + zone.free = NULL; + zone.realloc = NULL; + zone.destroy = NULL; + zone.zone_name = "jemalloc_zone"; + zone.batch_malloc = NULL; + zone.batch_free = NULL; + zone.introspect = &zone_introspect; + zone.version = 6; + zone.memalign = NULL; + zone.free_definite_size = NULL; + + zone_introspect.enumerator = NULL; + zone_introspect.good_size = NULL; + zone_introspect.check = NULL; + zone_introspect.print = NULL; + zone_introspect.log = NULL; + zone_introspect.force_lock = NULL; + zone_introspect.force_unlock = NULL; + zone_introspect.statistics = NULL; + zone_introspect.zone_locked = NULL; +], [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [6]) + AC_MSG_RESULT([6])], + [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [3]) + AC_MSG_RESULT([3])]) fi dnl ============================================================================ @@ -755,9 +843,11 @@ AC_MSG_RESULT([]) AC_MSG_RESULT([JEMALLOC_PREFIX : ${JEMALLOC_PREFIX}]) AC_MSG_RESULT([install_suffix : ${install_suffix}]) AC_MSG_RESULT([autogen : ${enable_autogen}]) +AC_MSG_RESULT([cc-silence : ${enable_cc_silence}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) AC_MSG_RESULT([prof : ${enable_prof}]) +AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}]) AC_MSG_RESULT([tiny : ${enable_tiny}]) AC_MSG_RESULT([tcache : ${enable_tcache}]) @@ -768,4 +858,5 @@ AC_MSG_RESULT([swap : ${enable_swap}]) AC_MSG_RESULT([dss : ${enable_dss}]) AC_MSG_RESULT([dynamic_page_shift : ${enable_dynamic_page_shift}]) AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) +AC_MSG_RESULT([tls : ${enable_tls}]) AC_MSG_RESULT([===============================================================================]) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index cf5cb5e0..62866642 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,8 +38,8 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd April 2, 2010 -.Dt JEMALLOC 3 +.Dd October 24, 2010 +.Dt jemalloc 3 .Os .Sh NAME .Nm @jemalloc_prefix@malloc , @@ -51,13 +51,24 @@ .Nm @jemalloc_prefix@malloc_stats_print , .Nm @jemalloc_prefix@mallctl , .Nm @jemalloc_prefix@mallctlnametomib , -.Nm @jemalloc_prefix@mallctlbymib +.Nm @jemalloc_prefix@mallctlbymib , +.Nm @jemalloc_prefix@allocm , +.Nm @jemalloc_prefix@rallocm , +.Nm @jemalloc_prefix@sallocm , +.Nm @jemalloc_prefix@dallocm .Nd general purpose memory allocation functions .Sh LIBRARY .Sy libjemalloc@install_suffix@ +.Pp +This manual describes jemalloc @jemalloc_version@. +More information can be found at the +.UR http://\:www.canonware.com/\:jemalloc/ +jemalloc website +.UE . 
.Sh SYNOPSIS .In stdlib.h .In jemalloc/jemalloc@install_suffix@.h +.Ss Standard API .Ft void * .Fn @jemalloc_prefix@malloc "size_t size" .Ft void * @@ -68,6 +79,7 @@ .Fn @jemalloc_prefix@realloc "void *ptr" "size_t size" .Ft void .Fn @jemalloc_prefix@free "void *ptr" +.Ss Non-standard API .Ft size_t .Fn @jemalloc_prefix@malloc_usable_size "const void *ptr" .Ft void @@ -79,10 +91,20 @@ .Ft int .Fn @jemalloc_prefix@mallctlbymib "const size_t *mib" "size_t miblen" "void *oldp" "size_t *oldlenp" "void *newp" "size_t newlen" .Ft const char * -.Va @jemalloc_prefix@malloc_options ; +.Va @jemalloc_prefix@malloc_conf ; .Ft void .Fn \*(lp*@jemalloc_prefix@malloc_message\*(rp "void *cbopaque" "const char *s" +.Ss Experimental API +.Ft int +.Fn @jemalloc_prefix@allocm "void **ptr" "size_t *rsize" "size_t size" "int flags" +.Ft int +.Fn @jemalloc_prefix@rallocm "void **ptr" "size_t *rsize" "size_t size" "size_t extra" "int flags" +.Ft int +.Fn @jemalloc_prefix@sallocm "const void *ptr" "size_t *rsize" "int flags" +.Ft int +.Fn @jemalloc_prefix@dallocm "void *ptr" "int flags" .Sh DESCRIPTION +.Ss Standard API The .Fn @jemalloc_prefix@malloc function allocates @@ -158,7 +180,7 @@ If is .Dv NULL , no action occurs. -.Pp +.Ss Non-standard API The .Fn @jemalloc_prefix@malloc_usable_size function returns the usable size of the allocation pointed to by @@ -289,255 +311,130 @@ for (i = 0; i < nbins; i++) { /* Do something with bin_size... */ } .Ed -.Sh TUNING -Once, when the first call is made to one of these memory allocation -routines, various flags will be set or reset, which affects the -workings of this allocator implementation. +.Ss Experimental API +The experimental API is subject to change or removal without regard for +backward compatibility. .Pp The -.Dq name -of the file referenced by the symbolic link named -.Pa /etc/jemalloc.conf , -the value of the environment variable -.Ev JEMALLOC_OPTIONS , -and the string pointed to by the global variable -.Va @jemalloc_prefix@malloc_options -will be interpreted, in that order, from left to right as flags. -.Pp -Each flag is a single letter, optionally prefixed by a non-negative base 10 -integer repetition count. -For example, -.Dq 3N -is equivalent to -.Dq NNN . -Some flags control parameter magnitudes, where uppercase increases the -magnitude, and lowercase decreases the magnitude. -Other flags control boolean parameters, where uppercase indicates that a -behavior is set, or on, and lowercase means that a behavior is not set, or off. -.Bl -tag -width indent -.It A -All warnings (except for the warning about unknown -flags being set) become fatal. -The process will call -.Xr abort 3 -in these cases. -@roff_prof@.It B -@roff_prof@Double/halve the maximum backtrace depth when profiling memory -@roff_prof@allocation activity. -@roff_prof@The default is 4. -.It C -Double/halve the size of the maximum size class that is a multiple of the -cacheline size (64). -Above this size, subpage spacing (256 bytes) is used for size classes. -The default value is 512 bytes. -.It D -Halve/double the per-arena minimum ratio of active to dirty pages. -Some dirty unused pages may be allowed to accumulate, within the limit set by -the ratio (or one chunk worth of dirty pages, whichever is greater), before -informing the kernel about some of those pages via -.Xr madvise 2 . -This provides the kernel with sufficient information to recycle dirty pages if -physical memory becomes scarce and the pages remain unused. 
-The default minimum ratio is 32:1; -.Ev JEMALLOC_OPTIONS=6D -will disable dirty page purging. -@roff_prof@.It E -@roff_prof@Activate/deactivate profiling. -@roff_prof@This is a secondary control mechanism that makes it possible to -@roff_prof@start the application with profiling enabled (see the -@roff_prof@.Dq F -@roff_prof@option) but inactive, then toggle profiling at any time during -@roff_prof@program execution with the -@roff_prof@.Dq prof.active -@roff_prof@mallctl. -@roff_prof@This option is enabled by default. -@roff_prof@.It F -@roff_prof@Profile memory allocation activity, and use an -@roff_prof@.Xr atexit 3 -@roff_prof@function to dump final memory usage to a file named according to -@roff_prof@the pattern -@roff_prof@.Pa ...f.heap , -@roff_prof@where -@roff_prof@.Pa -@roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. -@roff_prof@See the -@roff_prof@.Dq B -@roff_prof@option for backtrace depth control. -@roff_prof@See the -@roff_prof@.Dq E -@roff_prof@option for on-the-fly activation/deactivation. -@roff_prof@See the -@roff_prof@.Dq S -@roff_prof@option for probabilistic sampling control. -@roff_prof@See the -@roff_prof@.Dq I -@roff_prof@option for information on interval-triggered profile dumping, and the -@roff_prof@.Dq U -@roff_prof@option for information on high-water-triggered profile dumping. -@roff_prof@Profile output is compatible with the included pprof Perl script, -@roff_prof@which originates from the google-perftools package -@roff_prof@(http://code.google.com/p/google-perftools/). -@roff_tcache@.It G -@roff_tcache@Double/halve the approximate interval (counted in terms of -@roff_tcache@thread-specific cache allocation/deallocation events) between full -@roff_tcache@thread-specific cache garbage collection sweeps. -@roff_tcache@Garbage collection is actually performed incrementally, one size -@roff_tcache@class at a time, in order to avoid large collection pauses. -@roff_tcache@The default sweep interval is 8192; -@roff_tcache@.Ev JEMALLOC_OPTIONS=14g -@roff_tcache@will disable garbage collection. -@roff_tcache@.It H -@roff_tcache@Enable/disable thread-specific caching. -@roff_tcache@When there are multiple threads, each thread uses a -@roff_tcache@thread-specific cache for objects up to a certain size. -@roff_tcache@Thread-specific caching allows many allocations to be satisfied -@roff_tcache@without performing any thread synchronization, at the cost of -@roff_tcache@increased memory use. -@roff_tcache@See the -@roff_tcache@.Dq G -@roff_tcache@and -@roff_tcache@.Dq M -@roff_tcache@options for related tuning information. -@roff_tcache@This option is enabled by default. -@roff_prof@.It I -@roff_prof@Double/halve the average interval between memory profile dumps, as -@roff_prof@measured in bytes of allocation activity. -@roff_prof@The actual interval between dumps may be sporadic because -@roff_prof@decentralized allocation counters are used to avoid synchronization -@roff_prof@bottlenecks. -@roff_prof@Profiles are dumped to files named according to the pattern -@roff_prof@.Pa ...i.heap , -@roff_prof@where -@roff_prof@.Pa -@roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. -@roff_prof@The default average interval is 1 GiB; -@roff_prof@.Ev JEMALLOC_OPTIONS=31i -@roff_prof@will disable interval-triggered profile dumping. 
-@roff_fill@.It J -@roff_fill@Each byte of new memory allocated by -@roff_fill@.Fn @jemalloc_prefix@malloc -@roff_fill@or -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@will be initialized to 0xa5. -@roff_fill@All memory returned by -@roff_fill@.Fn @jemalloc_prefix@free -@roff_fill@or -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@will be initialized to 0x5a. -@roff_fill@This is intended for debugging and will impact performance -@roff_fill@negatively. -.It K -Double/halve the virtual memory chunk size. -The default chunk size is 4 MiB. -@roff_prof@.It L -@roff_prof@Use an -@roff_prof@.Xr atexit 3 -@roff_prof@function to report memory leaks. -@roff_prof@See the -@roff_prof@.Dq B -@roff_prof@option for backtrace depth control. -@roff_prof@See the -@roff_prof@.Dq F option for information on analyzing heap profile output. -@roff_prof@This option is disabled by default. -@roff_tcache@.It M -@roff_tcache@Double/halve the maximum size class to cache. -@roff_tcache@At a minimum, all small size classes are cached, and at a maximum -@roff_tcache@all large size classes are cached. -@roff_tcache@The default maximum is 32 KiB. -.It N -Double/halve the number of arenas. -The default number of arenas is four times the number of CPUs, or one if there -is a single CPU. -@roff_swap@.It O -@roff_swap@Over-commit memory as a side effect of using anonymous -@roff_swap@.Xr mmap 2 -@roff_swap@@roff_dss@ and -@roff_swap@@roff_dss@.Xr sbrk 2 -@roff_swap@for virtual memory allocation. -@roff_swap@In order for overcommit to be disabled, the -@roff_swap@.Dq swap.fds -@roff_swap@mallctl must have been successfully written to. -@roff_swap@This option is enabled by default. -.It P -The -.Fn malloc_stats_print -function is called at program exit via an -.Xr atexit 3 -function. -@roff_stats@This has the potential to cause deadlock for a multi-threaded -@roff_stats@process that exits while one or more threads are executing in the -@roff_stats@memory allocation functions. -@roff_stats@Therefore, this option should only be used with care; it is -@roff_stats@primarily intended as a performance tuning aid during application -@roff_stats@development. -.It Q -Double/halve the size of the maximum size class that is a multiple of the -quantum (8 or 16 bytes, depending on architecture). -Above this size, cacheline spacing is used for size classes. -The default value is 128 bytes. -@roff_prof@.It S -@roff_prof@Double/halve the average interval between allocation samples, as -@roff_prof@measured in bytes of allocation activity. -@roff_prof@Increasing the sampling interval decreases profile fidelity, but -@roff_prof@also decreases the computational overhead. -@roff_prof@The default sample interval is one (i.e. all allocations are -@roff_prof@sampled). -@roff_prof@.It U -@roff_prof@Trigger a memory profile dump every time the total virtual memory -@roff_prof@exceeds the previous maximum. -@roff_prof@Profiles are dumped to files named according to the pattern -@roff_prof@.Pa ...u.heap , -@roff_prof@where -@roff_prof@.Pa -@roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. -@roff_prof@This option is disabled by default. -@roff_sysv@.It V -@roff_sysv@Attempting to allocate zero bytes will return a -@roff_sysv@.Dv NULL -@roff_sysv@pointer instead of a valid pointer. -@roff_sysv@(The default behavior is to make a minimal allocation and return a -@roff_sysv@pointer to it.) -@roff_sysv@This option is provided for System V compatibility. 
-@roff_sysv@@roff_xmalloc@This option is incompatible with the -@roff_sysv@@roff_xmalloc@.Dq X -@roff_sysv@@roff_xmalloc@option. -@roff_xmalloc@.It X -@roff_xmalloc@Rather than return failure for any allocation function, display a -@roff_xmalloc@diagnostic message on -@roff_xmalloc@.Dv STDERR_FILENO -@roff_xmalloc@and cause the program to drop core (using -@roff_xmalloc@.Xr abort 3 ) . -@roff_xmalloc@This option should be set at compile time by including the -@roff_xmalloc@following in the source code: -@roff_xmalloc@.Bd -literal -offset indent -@roff_xmalloc@@jemalloc_prefix@malloc_options = "X"; -@roff_xmalloc@.Ed -@roff_fill@.It Z -@roff_fill@Each byte of new memory allocated by -@roff_fill@.Fn @jemalloc_prefix@malloc -@roff_fill@or -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@will be initialized to 0. -@roff_fill@Note that this initialization only happens once for each byte, so -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@calls do not zero memory that was previously allocated. -@roff_fill@This is intended for debugging and will impact performance -@roff_fill@negatively. +.Fn @jemalloc_prefix@allocm , +.Fn @jemalloc_prefix@rallocm , +.Fn @jemalloc_prefix@sallocm , +and +.Fn @jemalloc_prefix@dallocm +functions all have a +.Fa flags +argument that can be used to specify options. +The functions only check the options that are contextually relevant. +Use bitwise or (|) operations to specify one or more of the following: +.Bl -tag -width ".Dv ALLOCM_LG_ALIGN(la)" +.It ALLOCM_LG_ALIGN(la) +Align the memory allocation to start at an address that is a multiple of +(1 << +.Fa la ) . +This macro does not validate that +.Fa la +is within the valid range. +.It ALLOCM_ALIGN(a) +Align the memory allocation to start at an address that is a multiple of +.Fa a , +where +.Fa a +is a power of two. +This macro does not validate that +.Fa a +is a power of 2. +.It ALLOCM_ZERO +Initialize newly allocated memory to contain zero bytes. +In the growing reallocation case, the real size prior to reallocation defines +the boundary between untouched bytes and those that are initialized to contain +zero bytes. +If this option is absent, newly allocated memory is uninitialized. +.It ALLOCM_NO_MOVE +For reallocation, fail rather than moving the object. +This constraint can apply to both growth and shrinkage. .El .Pp -@roff_fill@The -@roff_fill@.Dq J -@roff_fill@and -@roff_fill@.Dq Z -@roff_fill@options are intended for testing and debugging. -@roff_fill@An application which changes its behavior when these options are used -@roff_fill@is flawed. +The +.Fn @jemalloc_prefix@allocm +function allocates at least +.Fa size +bytes of memory, sets +.Fa *ptr +to the base address of the allocation, and sets +.Fa *rsize +to the real size of the allocation if +.Fa rsize +is not +.Dv NULL . +.Pp +The +.Fn @jemalloc_prefix@rallocm +function resizes the allocation at +.Fa *ptr +to be at least +.Fa size +bytes, sets +.Fa *ptr +to the base address of the allocation if it moved, and sets +.Fa *rsize +to the real size of the allocation if +.Fa rsize +is not +.Dv NULL . +If +.Fa extra +is non-zero, an attempt is made to resize the allocation to be at least +.Fa ( size ++ +.Fa extra ) +bytes, though inability to allocate the extra byte(s) will not by itself result +in failure. +Behavior is undefined if +.Fa ( size ++ +.Fa extra +> +.Dv SIZE_T_MAX ) . +.Pp +The +.Fn @jemalloc_prefix@sallocm +function sets +.Fa *rsize +to the real size of the allocation. 
+.Pp +The +.Fn @jemalloc_prefix@dallocm +function causes the memory referenced by +.Fa ptr +to be made available for future allocations. +.Sh TUNING +Once, when the first call is made to one of the memory allocation routines, the +allocator initializes its internals based in part on various options that can +be specified at compile- or run-time. +.Pp +The string pointed to by the global variable +.Va @jemalloc_prefix@malloc_conf , +the +.Dq name +of the file referenced by the symbolic link named +.Pa /etc/@jemalloc_prefix@malloc.conf , +and the value of the environment variable +.Ev @jemalloc_cprefix@MALLOC_CONF , +will be interpreted, in that order, from left to right as options. +.Pp +An options string is a comma-separated list of option:value pairs. +There is one key corresponding to each +.Dq opt.* +mallctl. +For example, +.Dq abort:true,narenas:1 +sets the +.Dq opt.abort +and +.Dq opt.narenas +options. +Some options have boolean values (true/false), others have integer values (base +8, 10, or 16, depending on prefix), and yet others have raw string values. .Sh IMPLEMENTATION NOTES @roff_dss@Traditionally, allocators have used @roff_dss@.Xr sbrk 2 @@ -564,8 +461,8 @@ However, it may make sense to reduce the number of arenas if an application does not make much use of the allocation functions. .Pp @roff_tcache@In addition to multiple arenas, this allocator supports -@roff_tcache@thread-specific caching for small objects, in order to make it -@roff_tcache@possible to completely avoid synchronization for most small +@roff_tcache@thread-specific caching for small and large objects, in order to +@roff_tcache@make it possible to completely avoid synchronization for most @roff_tcache@allocation requests. @roff_tcache@Such caching allows very fast allocation in the common case, but it @roff_tcache@increases memory usage and fragmentation, since a bounded number of @@ -594,27 +491,27 @@ The combination of chunk alignment and chunk page maps makes it possible to determine all metadata regarding small and large allocations in constant time. .Pp Small objects are managed in groups by page runs. -Each run maintains a bitmap that tracks which regions are in use. +Each run maintains a frontier and free list to track which regions are in use. @roff_tiny@Allocation requests that are no more than half the quantum (8 or 16, @roff_tiny@depending on architecture) are rounded up to the nearest power of @roff_tiny@two. Allocation requests that are @roff_tiny@more than half the quantum, but no more than the minimum cacheline-multiple size class (see the -.Dq Q +.Dq opt.lg_qspace_max option) are rounded up to the nearest multiple of the @roff_tiny@quantum. @roff_no_tiny@quantum (8 or 16, depending on architecture). Allocation requests that are more than the minimum cacheline-multiple size class, but no more than the minimum subpage-multiple size class (see the -.Dq C +.Dq opt.lg_cspace_max option) are rounded up to the nearest multiple of the cacheline size (64). Allocation requests that are more than the minimum subpage-multiple size class, but no more than the maximum subpage-multiple size class are rounded up to the nearest multiple of the subpage size (256). Allocation requests that are more than the maximum subpage-multiple size class, but small enough to fit in an arena-managed chunk (see the -.Dq K +.Dq opt.lg_chunk option), are rounded up to the nearest run size. Allocation requests that are too large to fit in an arena-managed chunk are rounded up to the nearest multiple of the chunk size. 
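[Editorial illustration, not part of the patch.] The experimental calls documented above are easiest to see end to end in a short program. The sketch below assumes an empty @jemalloc_prefix@ and the default install suffix, so the header is <jemalloc/jemalloc.h> and the functions keep their plain names; the flags and return codes are the ones described in this manual.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
    void *p;
    size_t rsize;

    /* At least 100 bytes, 64-byte-aligned and zero-filled. */
    if (allocm(&p, &rsize, 100, ALLOCM_ALIGN(64) | ALLOCM_ZERO)
        != ALLOCM_SUCCESS) {
        fprintf(stderr, "allocm() failed\n");
        return (1);
    }
    printf("usable size: %zu\n", rsize);

    /*
     * Grow to at least one page, opportunistically up to four pages;
     * failure to obtain the extra bytes is not itself an error.
     */
    if (rallocm(&p, &rsize, 4096, 3 * 4096, 0) != ALLOCM_SUCCESS) {
        fprintf(stderr, "rallocm() failed\n");
        return (1);
    }

    /* Grow further only if it can be done without moving the object. */
    if (rallocm(&p, &rsize, rsize + 4096, 0, ALLOCM_NO_MOVE) ==
        ALLOCM_ERR_NOT_MOVED)
        printf("in-place growth not possible\n");

    sallocm(p, &rsize, 0);  /* Reports the same real size as *rsize above. */
    dallocm(p, 0);
    return (0);
}

Under the same assumptions this would be built with something along the lines of cc example.c -ljemalloc.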
@@ -623,10 +520,33 @@ Allocations are packed tightly together, which can be an issue for multi-threaded applications. If you need to assure that allocations do not suffer from cacheline sharing, round your allocation requests up to the nearest multiple of the cacheline -size. +size, or specify cacheline alignment when allocating. +.Pp +Assuming 4 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit system, +the size classes in each category are as follows: +.\"----------------------------------------------------------------------------- +.TS +allbox tab(;); +LLL +LLL +^LL +^LL +^LL +LsL +LsL. +Category;Subcategory;Size +@roff_tiny@Small;Tiny;[8] +@roff_no_tiny@Small;Tiny;[disabled] +;Quantum-spaced;[16, 32, 48, ..., 128] +;Cacheline-spaced;[192, 256, 320, ..., 512] +;Sub-page-spaced;[768, 1024, 1280, ..., 3840] +Large;[4 KiB, 8 KiB, 12 KiB, ..., 4072 KiB] +Huge;[4 MiB, 8 MiB, 12 MiB, ...] +.TE +.\"----------------------------------------------------------------------------- .Sh MALLCTL NAMESPACE The following names are defined in the namespace accessible via the -.Fn mallctl* +.Fn @jemalloc_prefix@mallctl* functions. Value types are specified in parentheses, and their readable/writable statuses are encoded as rw, r-, -w, or --. @@ -638,6 +558,10 @@ introspection. @roff_stats@ equal to @roff_stats@.Dq arenas.narenas @roff_stats@can be used to access the summation of statistics from all arenas. +.Pp +Take special note of the +.Dq epoch +mallctl, which controls refreshing of cached dynamic statistics. .Bl -ohang .\"----------------------------------------------------------------------------- .It Sy "version (const char *) r-" @@ -648,27 +572,12 @@ Return the jemalloc version string. .It Sy "epoch (uint64_t) rw" .Bd -ragged -offset indent -compact If a value is passed in, refresh the data from which the -.Fn mallctl* +.Fn @jemalloc_prefix@mallctl* functions report values, and increment the epoch. Return the current epoch. This is useful for detecting whether another thread caused a refresh. .Ed .\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "tcache.flush (void) --" -@roff_tcache@.Bd -ragged -offset indent -compact -@roff_tcache@Flush calling thread's tcache. -@roff_tcache@This interface releases all cached objects and internal data -@roff_tcache@structures associated with the calling thread's thread-specific -@roff_tcache@cache. -@roff_tcache@Ordinarily, this interface need not be called, since automatic -@roff_tcache@periodic incremental garbage collection occurs, and the thread -@roff_tcache@cache is automatically discarded when a thread exits. -@roff_tcache@However, garbage collection is triggered by allocation activity, -@roff_tcache@so it is possible for a thread that stops allocating/deallocating -@roff_tcache@to retain its cache indefinitely, in which case the developer may -@roff_tcache@find manual flushing useful. -.Ed -.\"----------------------------------------------------------------------------- .It Sy "config.debug (bool) r-" .Bd -ragged -offset indent -compact --enable-debug was specified during build configuration. @@ -746,129 +655,386 @@ This is useful for detecting whether another thread caused a refresh. .\"----------------------------------------------------------------------------- .It Sy "opt.abort (bool) r-" .Bd -ragged -offset indent -compact -See the -.Dq A -option. +Abort-on-warning enabled/disabled. +If true, most warnings are fatal. +The process will call +.Xr abort 3 +in these cases. 
+This option is +@roff_debug@enabled +@roff_no_debug@disabled +by default. .Ed .\"----------------------------------------------------------------------------- -@roff_fill@.It Sy "opt.junk (bool) r-" -@roff_fill@.Bd -ragged -offset indent -compact -@roff_fill@See the -@roff_fill@.Dq J -@roff_fill@option. -@roff_fill@.Ed -.\"----------------------------------------------------------------------------- -@roff_fill@.It Sy "opt.zero (bool) r-" -@roff_fill@.Bd -ragged -offset indent -compact -@roff_fill@See the -@roff_fill@.Dq Z -@roff_fill@option. -@roff_fill@.Ed -.\"----------------------------------------------------------------------------- -@roff_xmalloc@.It Sy "opt.xmalloc (bool) r-" -@roff_xmalloc@.Bd -ragged -offset indent -compact -@roff_xmalloc@See the -@roff_xmalloc@.Dq X -@roff_xmalloc@option. -@roff_xmalloc@.Ed -.\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "opt.tcache (bool) r-" -@roff_tcache@.Bd -ragged -offset indent -compact -@roff_tcache@See the -@roff_tcache@.Dq H -@roff_tcache@option. -@roff_tcache@.Ed -.\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "opt.lg_tcache_gc_sweep (ssize_t) r-" -@roff_tcache@.Bd -ragged -offset indent -compact -@roff_tcache@See the -@roff_tcache@.Dq G -@roff_tcache@option. -@roff_tcache@.Ed -.\"----------------------------------------------------------------------------- -.It Sy "opt.stats_print (bool) r-" -.Bd -ragged -offset indent -compact -See the -.Dq P -option. -.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq F -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.lg_prof_bt_max (size_t) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq B -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq I -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof_udump (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq U -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof_leak (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq L -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- .It Sy "opt.lg_qspace_max (size_t) r-" .Bd -ragged -offset indent -compact -See the -.Dq Q -option. +Size (log base 2) of the maximum size class that is a multiple of the quantum +(8 or 16 bytes, depending on architecture). +Above this size, cacheline spacing is used for size classes. +The default value is 128 bytes (2^7). .Ed .\"----------------------------------------------------------------------------- .It Sy "opt.lg_cspace_max (size_t) r-" .Bd -ragged -offset indent -compact -See the -.Dq C -option. 
-.Ed -.\"----------------------------------------------------------------------------- -.It Sy "opt.lg_dirty_mult (ssize_t) r-" -.Bd -ragged -offset indent -compact -See the -.Dq D -option. +Size (log base 2) of the maximum size class that is a multiple of the cacheline +size (64). +Above this size, subpage spacing (256 bytes) is used for size classes. +The default value is 512 bytes (2^9). .Ed .\"----------------------------------------------------------------------------- .It Sy "opt.lg_chunk (size_t) r-" .Bd -ragged -offset indent -compact -See the -.Dq K -option. +Virtual memory chunk size (log base 2). +The default chunk size is 4 MiB (2^22). .Ed .\"----------------------------------------------------------------------------- +.It Sy "opt.narenas (size_t) r-" +.Bd -ragged -offset indent -compact +Maximum number of arenas to use. +The default maximum number of arenas is four times the number of CPUs, or one +if there is a single CPU. +.Ed +.\"----------------------------------------------------------------------------- +.It Sy "opt.lg_dirty_mult (ssize_t) r-" +.Bd -ragged -offset indent -compact +Per-arena minimum ratio (log base 2) of active to dirty pages. +Some dirty unused pages may be allowed to accumulate, within the limit set by +the ratio (or one chunk worth of dirty pages, whichever is greater), before +informing the kernel about some of those pages via +.Xr madvise 2 +or a similar system call. +This provides the kernel with sufficient information to recycle dirty pages if +physical memory becomes scarce and the pages remain unused. +The default minimum ratio is 32:1 (2^5:1); an option value of -1 will disable +dirty page purging. +.Ed +.\"----------------------------------------------------------------------------- +.It Sy "opt.stats_print (bool) r-" +.Bd -ragged -offset indent -compact +Enable/disable statistics printing at exit. +If enabled, the +.Fn @jemalloc_prefix@malloc_stats_print +function is called at program exit via an +.Xr atexit 3 +function. +@roff_stats@This has the potential to cause deadlock for a multi-threaded +@roff_stats@process that exits while one or more threads are executing in the +@roff_stats@memory allocation functions. +@roff_stats@Therefore, this option should only be used with care; it is +@roff_stats@primarily intended as a performance tuning aid during application +@roff_stats@development. +This option is disabled by default. +.Ed +.\"----------------------------------------------------------------------------- +@roff_fill@.It Sy "opt.junk (bool) r-" +@roff_fill@.Bd -ragged -offset indent -compact +@roff_fill@Junk filling enabled/disabled. +@roff_fill@If enabled, each byte of uninitialized allocated memory will be +@roff_fill@initialized to 0xa5. +@roff_fill@All deallocated memory will be initialized to 0x5a. +@roff_fill@This is intended for debugging and will impact performance +@roff_fill@negatively. +@roff_fill@This option is +@roff_fill@@roff_debug@enabled +@roff_fill@@roff_no_debug@disabled +@roff_fill@by default. +@roff_fill@.Ed +.\"----------------------------------------------------------------------------- +@roff_fill@.It Sy "opt.zero (bool) r-" +@roff_fill@.Bd -ragged -offset indent -compact +@roff_fill@Zero filling enabled/disabled. +@roff_fill@If enabled, each byte of uninitialized allocated memory will be +@roff_fill@initialized to 0. +@roff_fill@Note that this initialization only happens once for each byte, so +@roff_fill@.Fn @jemalloc_prefix@realloc +@roff_fill@calls do not zero memory that was previously allocated. 
+@roff_fill@This is intended for debugging and will impact performance +@roff_fill@negatively. +@roff_fill@This option is disabled by default. +@roff_fill@.Ed +.\"----------------------------------------------------------------------------- +@roff_sysv@.It Sy "opt.sysv (bool) r-" +@roff_sysv@.Bd -ragged -offset indent -compact +@roff_sysv@If enabled, attempting to allocate zero bytes will return a +@roff_sysv@.Dv NULL +@roff_sysv@pointer instead of a valid pointer. +@roff_sysv@(The default behavior is to make a minimal allocation and return a +@roff_sysv@pointer to it.) +@roff_sysv@This option is provided for System V compatibility. +@roff_sysv@@roff_xmalloc@This option is incompatible with the +@roff_sysv@@roff_xmalloc@.Dq opt.xmalloc +@roff_sysv@@roff_xmalloc@option. +@roff_sysv@This option is disabled by default. +@roff_sysv@.Ed +.\"----------------------------------------------------------------------------- +@roff_xmalloc@.It Sy "opt.xmalloc (bool) r-" +@roff_xmalloc@.Bd -ragged -offset indent -compact +@roff_xmalloc@Abort-on-out-of-memory enabled/disabled. +@roff_xmalloc@If enabled, rather than returning failure for any allocation +@roff_xmalloc@function, display a diagnostic message on +@roff_xmalloc@.Dv STDERR_FILENO +@roff_xmalloc@and cause the program to drop core (using +@roff_xmalloc@.Xr abort 3 ) . +@roff_xmalloc@If an application is designed to depend on this behavior, set the +@roff_xmalloc@option at compile time by including the following in the source +@roff_xmalloc@code: +@roff_xmalloc@.Bd -literal -offset indent +@roff_xmalloc@@jemalloc_prefix@malloc_conf = "xmalloc:true"; +@roff_xmalloc@.Ed +@roff_xmalloc@.Pp +@roff_xmalloc@This option is disabled by default. +@roff_xmalloc@.Ed +.\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "opt.tcache (bool) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Thread-specific caching enabled/disabled. +@roff_tcache@When there are multiple threads, each thread uses a +@roff_tcache@thread-specific cache for objects up to a certain size. +@roff_tcache@Thread-specific caching allows many allocations to be satisfied +@roff_tcache@without performing any thread synchronization, at the cost of +@roff_tcache@increased memory use. +@roff_tcache@See the +@roff_tcache@.Dq opt.lg_tcache_gc_sweep +@roff_tcache@and +@roff_tcache@.Dq opt.tcache_max +@roff_tcache@options for related tuning information. +@roff_tcache@This option is enabled by default. +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "opt.lg_tcache_gc_sweep (ssize_t) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Approximate interval (log base 2) between full thread-specific +@roff_tcache@cache garbage collection sweeps, counted in terms of +@roff_tcache@thread-specific cache allocation/deallocation events. +@roff_tcache@Garbage collection is actually performed incrementally, one size +@roff_tcache@class at a time, in order to avoid large collection pauses. +@roff_tcache@The default sweep interval is 8192 (2^13); setting this option to +@roff_tcache@-1 will disable garbage collection. +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "opt.lg_tcache_max (size_t) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Maximum size class (log base 2) to cache in the thread-specific +@roff_tcache@cache. 
+@roff_tcache@At a minimum, all small size classes are cached, and at a maximum +@roff_tcache@all large size classes are cached. +@roff_tcache@The default maximum is 32 KiB (2^15). +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Memory profiling enabled/disabled. +@roff_prof@If enabled, profile memory allocation activity, and use an +@roff_prof@.Xr atexit 3 +@roff_prof@function to dump final memory usage to a file named according to +@roff_prof@the pattern +@roff_prof@.Pa ...f.heap , +@roff_prof@where +@roff_prof@.Pa +@roff_prof@is controlled by the +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_bt_max +@roff_prof@option for backtrace depth control. +@roff_prof@See the +@roff_prof@.Dq opt.prof_active +@roff_prof@option for on-the-fly activation/deactivation. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_sample +@roff_prof@option for probabilistic sampling control. +@roff_prof@See the +@roff_prof@.Dq opt.prof_accum +@roff_prof@option for control of cumulative sample reporting. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_tcmax +@roff_prof@option for control of per thread backtrace caching. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_interval +@roff_prof@option for information on interval-triggered profile dumping, and the +@roff_prof@.Dq opt.prof_gdump +@roff_prof@option for information on high-water-triggered profile dumping. +@roff_prof@Profile output is compatible with the included pprof Perl script, +@roff_prof@which originates from the +@roff_prof@.UR http://\:code.google.com/\:p/\:google-perftools/ +@roff_prof@google-perftools package +@roff_prof@.UE . +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_prefix (const char *) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Filename prefix for profile dumps. +@roff_prof@If the prefix is set to the empty string, no automatic dumps will +@roff_prof@occur; this is primarily useful for disabling the automatic final +@roff_prof@heap dump (which also disables leak reporting, if enabled). +@roff_prof@The default prefix is +@roff_prof@.Pa jeprof . +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_bt_max (size_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Maximum backtrace depth (log base 2) when profiling memory +@roff_prof@allocation activity. +@roff_prof@The default is 128 (2^7). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_active (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Profiling activated/deactivated. +@roff_prof@This is a secondary control mechanism that makes it possible to +@roff_prof@start the application with profiling enabled (see the +@roff_prof@.Dq opt.prof +@roff_prof@option) but inactive, then toggle profiling at any time during +@roff_prof@program execution with the +@roff_prof@.Dq prof.active +@roff_prof@mallctl. +@roff_prof@This option is enabled by default. 
+@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Average interval (log base 2) between allocation samples, as +@roff_prof@measured in bytes of allocation activity. +@roff_prof@Increasing the sampling interval decreases profile fidelity, but +@roff_prof@also decreases the computational overhead. +@roff_prof@The default sample interval is 1 (2^0) (i.e. all allocations are +@roff_prof@sampled). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_accum (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Reporting of cumulative object/byte counts in profile dumps +@roff_prof@enabled/disabled. +@roff_prof@If this option is enabled, every unique backtrace must be stored for +@roff_prof@the duration of execution. +@roff_prof@Depending on the application, this can impose a large memory +@roff_prof@overhead, and the cumulative counts are not always of interest. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_tcmax +@roff_prof@option for control of per thread backtrace caching, which has +@roff_prof@important interactions. +@roff_prof@This option is enabled by default. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Maximum per thread backtrace cache (log base 2) used for heap +@roff_prof@profiling. +@roff_prof@A backtrace can only be discarded if the +@roff_prof@.Dq opt.prof_accum +@roff_prof@option is disabled, and no thread caches currently refer to the +@roff_prof@backtrace. +@roff_prof@Therefore, a backtrace cache limit should be imposed if the +@roff_prof@intention is to limit how much memory is used by backtraces. +@roff_prof@By default, no limit is imposed (encoded as -1). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Average interval (log base 2) between memory profile dumps, as +@roff_prof@measured in bytes of allocation activity. +@roff_prof@The actual interval between dumps may be sporadic because +@roff_prof@decentralized allocation counters are used to avoid synchronization +@roff_prof@bottlenecks. +@roff_prof@Profiles are dumped to files named according to the pattern +@roff_prof@.Pa ...i.heap , +@roff_prof@where +@roff_prof@.Pa +@roff_prof@is controlled by the +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. +@roff_prof@By default, interval-triggered profile dumping is disabled (encoded +@roff_prof@as -1). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_gdump (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Trigger a memory profile dump every time the total virtual memory +@roff_prof@exceeds the previous maximum. +@roff_prof@Profiles are dumped to files named according to the pattern +@roff_prof@.Pa ...u.heap , +@roff_prof@where +@roff_prof@.Pa +@roff_prof@is controlled by the +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. +@roff_prof@This option is disabled by default. 
+@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_leak (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Leak reporting enabled/disabled. +@roff_prof@If enabled, use an +@roff_prof@.Xr atexit 3 +@roff_prof@function to report memory leaks detected by allocation sampling. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_bt_max +@roff_prof@option for backtrace depth control. +@roff_prof@See the +@roff_prof@.Dq opt.prof +@roff_prof@option for information on analyzing heap profile output. +@roff_prof@This option is disabled by default. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- .It Sy "opt.overcommit (bool) r-" .Bd -ragged -offset indent -compact -See the -.Dq O -option. +@roff_swap@Over-commit enabled/disabled. +@roff_swap@If enabled, over-commit memory as a side effect of using anonymous +@roff_swap@.Xr mmap 2 +@roff_swap@@roff_dss@ and +@roff_swap@@roff_dss@.Xr sbrk 2 +@roff_swap@for virtual memory allocation. +@roff_swap@In order for overcommit to be disabled, the +@roff_swap@.Dq swap.fds +@roff_swap@mallctl must have been successfully written to. +@roff_swap@This option is enabled by default. .Ed .\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "tcache.flush (void) --" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Flush calling thread's tcache. +@roff_tcache@This interface releases all cached objects and internal data +@roff_tcache@structures associated with the calling thread's thread-specific +@roff_tcache@cache. +@roff_tcache@Ordinarily, this interface need not be called, since automatic +@roff_tcache@periodic incremental garbage collection occurs, and the thread +@roff_tcache@cache is automatically discarded when a thread exits. +@roff_tcache@However, garbage collection is triggered by allocation activity, +@roff_tcache@so it is possible for a thread that stops allocating/deallocating +@roff_tcache@to retain its cache indefinitely, in which case the developer may +@roff_tcache@find manual flushing useful. +.Ed +.\"----------------------------------------------------------------------------- +.It Sy "thread.arena (unsigned) rw" +.Bd -ragged -offset indent -compact +Get or set the arena associated with the calling thread. +The arena index must be less than the maximum number of arenas (see the +.Dq arenas.narenas +mallctl). +If the specified arena was not initialized beforehand (see the +.Dq arenas.initialized +mallctl), it will be automatically initialized as a side effect of calling this +interface. +.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "thread.allocated (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Get the total number of bytes ever allocated by the calling thread. +@roff_stats@This counter has the potential to wrap around; it is up to the +@roff_stats@application to appropriately interpret the counter in such cases. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "thread.deallocated (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Get the total number of bytes ever deallocated by the calling +@roff_stats@thread. +@roff_stats@This counter has the potential to wrap around; it is up to the +@roff_stats@application to appropriately interpret the counter in such cases. 
+@roff_stats@.Ed +.\"----------------------------------------------------------------------------- .It Sy "arenas.narenas (unsigned) r-" .Bd -ragged -offset indent -compact Maximum number of arenas. -See the -.Dq N -option. .Ed .\"----------------------------------------------------------------------------- .It Sy "arenas.initialized (bool *) r-" @@ -1004,11 +1170,17 @@ Total number of large size classes. Maximum size supported by this large size class. .Ed .\"----------------------------------------------------------------------------- +.It Sy "arenas.purge (unsigned) -w" +.Bd -ragged -offset indent -compact +Purge unused dirty pages for the specified arena, or for all arenas if none is +specified. +.Ed +.\"----------------------------------------------------------------------------- @roff_prof@.It Sy "prof.active (bool) rw" @roff_prof@.Bd -ragged -offset indent -compact @roff_prof@Control whether sampling is currently active. @roff_prof@See the -@roff_prof@.Dq E +@roff_prof@.Dq opt.prof_active @roff_prof@option for additional information. @roff_prof@.Ed .\"----------------------------------------------------------------------------- @@ -1020,8 +1192,8 @@ Maximum size supported by this large size class. @roff_prof@where @roff_prof@.Pa @roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. @roff_prof@.Ed .\"----------------------------------------------------------------------------- @roff_prof@.It Sy "prof.interval (uint64_t) r-" @@ -1029,7 +1201,7 @@ Maximum size supported by this large size class. @roff_prof@Average number of bytes allocated between inverval-based profile @roff_prof@dumps. @roff_prof@See the -@roff_prof@.Dq I +@roff_prof@.Dq opt.lg_prof_interval @roff_prof@option for additional information. @roff_prof@.Ed .\"----------------------------------------------------------------------------- @@ -1283,10 +1455,9 @@ has not been called. .\"----------------------------------------------------------------------------- .El .Sh DEBUGGING MALLOC PROBLEMS -The first thing to do is to set the -.Dq A -option. -This option forces a coredump (if possible) at the first sign of trouble, +Start by setting the +.Dq opt.abort +option, which forces a coredump (if possible) at the first sign of trouble, rather than the normal policy of trying to continue if at all possible. .Pp It is probably also a good idea to recompile the program with suitable @@ -1294,22 +1465,22 @@ options and symbols for debugger support. .Pp @roff_fill@If the program starts to give unusual results, coredump or generally @roff_fill@behave differently without emitting any of the messages mentioned in -@roff_fill@the next section, it is likely because it depends on the storage -@roff_fill@being filled with zero bytes. +@roff_fill@the next section, it is likely because the program depends on the +@roff_fill@storage being filled with zero bytes. @roff_fill@Try running it with the -@roff_fill@.Dq Z +@roff_fill@.Dq opt.zero @roff_fill@option set; @roff_fill@if that improves the situation, this diagnosis has been confirmed. @roff_fill@If the program still misbehaves, @roff_fill@the likely problem is accessing memory outside the allocated area. @roff_fill@.Pp @roff_fill@Alternatively, if the symptoms are not easy to reproduce, setting the -@roff_fill@.Dq J +@roff_fill@.Dq opt.junk @roff_fill@option may help provoke the problem. 
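[Editorial illustration, not part of the patch.] As a concrete form of the debugging advice above, the relevant options can be baked into a test build via the malloc_conf global, mirroring the man page's own "xmalloc:true" and "lg_chunk:24" examples. This assumes an empty @jemalloc_prefix@ and a build configured with --enable-fill so that the junk/zero options exist; the fill bytes named in the comments are those documented for opt.junk.

#include <stdlib.h>
#include <jemalloc/jemalloc.h>

/*
 * Make most warnings fatal and junk-fill allocations from program start.
 * Substitute "zero:true" for "junk:true" when checking whether the program
 * depends on storage being zero-filled.
 */
const char *malloc_conf = "abort:true,junk:true";

int
main(void)
{
    void *p = malloc(1);  /* With opt.junk, new bytes are set to 0xa5. */

    free(p);              /* Deallocated bytes are set to 0x5a. */
    return (0);
}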
@roff_fill@.Pp -Unfortunately this implementation does not provide much detail about -the problems it detects; the performance impact for storing such information -would be prohibitive. +This implementation does not provide much detail about the problems it detects, +because the performance impact for storing such information would be +prohibitive. There are a number of allocator implementations available on the Internet which focus on detecting and pinpointing problems by trading performance for extra sanity checks and detailed diagnostics. @@ -1319,8 +1490,8 @@ warning condition, a message will be printed to file descriptor .Dv STDERR_FILENO . Errors will result in the process dumping core. If the -.Dq A -option is set, all warnings are treated as errors. +.Dq opt.abort +option is set, most warnings are treated as errors. .Pp The .Va @jemalloc_prefix@malloc_message @@ -1342,6 +1513,7 @@ is likely to result in a crash or deadlock. All messages are prefixed by .Dq : . .Sh RETURN VALUES +.Ss Standard API The .Fn @jemalloc_prefix@malloc and @@ -1390,7 +1562,7 @@ when an error occurs. The .Fn @jemalloc_prefix@free function returns no value. -.Pp +.Ss Non-standard API The .Fn @jemalloc_prefix@malloc_usable_size function returns the usable size of the allocation pointed to by @@ -1429,37 +1601,69 @@ Attempt to read or write void value, or attempt to write read-only value. A memory allocation failure occurred. .It Bq Er EFAULT An interface with side effects failed in some way not directly related to -.Fn mallctl* +.Fn @jemalloc_prefix@mallctl* read/write processing. .El +.Ss Experimental API +The +.Fn @jemalloc_prefix@allocm , +.Fn @jemalloc_prefix@rallocm , +.Fn @jemalloc_prefix@sallocm , +and +.Fn @jemalloc_prefix@dallocm +functions return +.Dv ALLOCM_SUCCESS +on success; otherwise they return an error value. +The +.Fn @jemalloc_prefix@allocm +and +.Fn @jemalloc_prefix@rallocm +functions will fail if: +.Bl -tag -width ".Bq Er ALLOCM_ERR_OOM" +.It Bq Er ALLOCM_ERR_OOM +Out of memory. +Insufficient contiguous memory was available to service the allocation request. +The +.Fn @jemalloc_prefix@allocm +function additionally sets +.Fa *ptr +to +.Dv NULL , +whereas the +.Fn @jemalloc_prefix@rallocm +function leaves +.Fa *ptr +unmodified. +.El +.Pp +The +.Fn @jemalloc_prefix@rallocm +function will also fail if: +.Bl -tag -width ".Bq Er ALLOCM_ERR_NOT_MOVED" +.It Bq Er ALLOCM_ERR_NOT_MOVED +.Dv ALLOCM_NO_MOVE +was specified, but the reallocation request could not be serviced without +moving the object. +.El .Sh ENVIRONMENT -The following environment variables affect the execution of the allocation +The following environment variable affects the execution of the allocation functions: -@roff_prof@.Bl -tag -width ".Ev JEMALLOC_PROF_PREFIX" -@roff_no_prof@.Bl -tag -width ".Ev JEMALLOC_OPTIONS" -.It Ev JEMALLOC_OPTIONS +.Bl -tag -width ".Ev @jemalloc_cprefix@MALLOC_CONF" +.It Ev @jemalloc_cprefix@MALLOC_CONF If the environment variable -.Ev JEMALLOC_OPTIONS -is set, the characters it contains will be interpreted as flags to the -allocation functions. -@roff_prof@.It Ev JEMALLOC_PROF_PREFIX -@roff_prof@If the environment variable -@roff_prof@.Ev JEMALLOC_PROF_PREFIX -@roff_prof@is set, use it as the filename prefix for profile dumps; otherwise -@roff_prof@use -@roff_prof@.Pa jeprof -@roff_prof@as the prefix. +.Ev @jemalloc_cprefix@MALLOC_CONF +is set, the characters it contains will be interpreted as options. 
.El .Sh EXAMPLES To dump core whenever a problem occurs: .Pp .Bd -literal -offset indent -ln -s 'A' /etc/jemalloc.conf +ln -s 'abort:true' /etc/@jemalloc_prefix@malloc.conf .Ed .Pp -To specify in the source a chunk size that is twice the default: +To specify in the source a chunk size that is 16 MiB: .Bd -literal -offset indent -@jemalloc_prefix@malloc_options = "K"; +@jemalloc_prefix@malloc_conf = "lg_chunk:24"; .Ed .Sh SEE ALSO .Xr madvise 2 , diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index c1955f19..9556c2c6 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -121,17 +121,17 @@ struct arena_chunk_map_s { * * p : run page offset * s : run size - * c : size class (used only if prof_promote is true) + * c : (binind+1) for size class (used only if prof_promote is true) * x : don't care * - : 0 * + : 1 - * [DZLA] : bit set - * [dzla] : bit unset + * [DULA] : bit set + * [dula] : bit unset * * Unallocated (clean): - * ssssssss ssssssss ssss---- ----dz-- - * xxxxxxxx xxxxxxxx xxxx---- -----Zxx - * ssssssss ssssssss ssss---- ----dZ-- + * ssssssss ssssssss ssss---- ----du-- + * xxxxxxxx xxxxxxxx xxxx---- -----Uxx + * ssssssss ssssssss ssss---- ----dU-- * * Unallocated (dirty): * ssssssss ssssssss ssss---- ----D--- @@ -144,7 +144,7 @@ struct arena_chunk_map_s { * pppppppp pppppppp pppp---- ----d--a * * Large: - * ssssssss ssssssss ssss++++ ++++D-la + * ssssssss ssssssss ssss---- ----D-la * xxxxxxxx xxxxxxxx xxxx---- ----xxxx * -------- -------- -------- ----D-la * @@ -152,7 +152,7 @@ struct arena_chunk_map_s { * ssssssss ssssssss sssscccc ccccD-la * * Large (not sampled, size == PAGE_SIZE): - * ssssssss ssssssss ssss++++ ++++D-la + * ssssssss ssssssss ssss---- ----D-la */ size_t bits; #ifdef JEMALLOC_PROF @@ -161,7 +161,7 @@ struct arena_chunk_map_s { #endif #define CHUNK_MAP_FLAGS_MASK ((size_t)0xfU) #define CHUNK_MAP_DIRTY ((size_t)0x8U) -#define CHUNK_MAP_ZEROED ((size_t)0x4U) +#define CHUNK_MAP_UNZEROED ((size_t)0x4U) #define CHUNK_MAP_LARGE ((size_t)0x2U) #define CHUNK_MAP_ALLOCATED ((size_t)0x1U) #define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED @@ -187,7 +187,12 @@ struct arena_chunk_s { /* Number of dirty pages. */ size_t ndirty; - /* Map of pages within chunk that keeps track of free/large/small. */ + /* + * Map of pages within chunk that keeps track of free/large/small. The + * first map_bias entries are omitted, since the chunk header does not + * need to be tracked in the map. This omission saves a header page + * for common chunk sizes (e.g. 4 MiB). + */ arena_chunk_map_t map[1]; /* Dynamically sized. 
*/ }; typedef rb_tree(arena_chunk_t) arena_chunk_tree_t; @@ -416,8 +421,12 @@ extern size_t sspace_min; extern size_t sspace_max; #define small_maxclass sspace_max -#define nlclasses (chunk_npages - arena_chunk_header_npages) +#define nlclasses (chunk_npages - map_bias) +void arena_purge_all(arena_t *arena); +#ifdef JEMALLOC_PROF +void arena_prof_accum(arena_t *arena, uint64_t accumbytes); +#endif #ifdef JEMALLOC_TCACHE void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind @@ -426,20 +435,15 @@ void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, # endif ); #endif -#ifdef JEMALLOC_PROF -void arena_prof_accum(arena_t *arena, uint64_t accumbytes); -#endif void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_malloc(size_t size, bool zero); -void *arena_palloc(arena_t *arena, size_t alignment, size_t size, - size_t alloc_size); +void *arena_palloc(arena_t *arena, size_t size, size_t alloc_size, + size_t alignment, bool zero); size_t arena_salloc(const void *ptr); #ifdef JEMALLOC_PROF void arena_prof_promoted(const void *ptr, size_t size); size_t arena_salloc_demote(const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); #endif void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_chunk_map_t *mapelm); @@ -449,7 +453,10 @@ void arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats); #endif -void *arena_ralloc(void *ptr, size_t size, size_t oldsize); +void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero); +void *arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); bool arena_new(arena_t *arena, unsigned ind); bool arena_boot(void); @@ -458,10 +465,149 @@ bool arena_boot(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin, + const void *ptr, size_t size); +# ifdef JEMALLOC_PROF +prof_ctx_t *arena_prof_ctx_get(const void *ptr); +void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +# endif void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) +JEMALLOC_INLINE unsigned +arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, + size_t size) +{ + unsigned shift, diff, regind; + + assert(run->magic == ARENA_RUN_MAGIC); + + /* + * Avoid doing division with a variable divisor if possible. Using + * actual division here can reduce allocator throughput by over 20%! + */ + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); + + /* Rescale (factor powers of 2 out of the numerator and denominator). */ + shift = ffs(size) - 1; + diff >>= shift; + size >>= shift; + + if (size == 1) { + /* The divisor was a power of 2. */ + regind = diff; + } else { + /* + * To divide by a number D that is not a power of two we + * multiply by (2^21 / D) and then right shift by 21 positions. + * + * X / D + * + * becomes + * + * (X * size_invs[D - 3]) >> SIZE_INV_SHIFT + * + * We can omit the first three elements, because we never + * divide by 0, and 1 and 2 are both powers of two, which are + * handled above. 
+ */ +#define SIZE_INV_SHIFT 21 +#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) + static const unsigned size_invs[] = { + SIZE_INV(3), + SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), + SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11), + SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15), + SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19), + SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23), + SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27), + SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) + }; + + if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2)) + regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT; + else + regind = diff / size; +#undef SIZE_INV +#undef SIZE_INV_SHIFT + } + assert(diff == regind * size); + assert(regind < bin->nregs); + + return (regind); +} + +#ifdef JEMALLOC_PROF +JEMALLOC_INLINE prof_ctx_t * +arena_prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (prof_promote) + ret = (prof_ctx_t *)(uintptr_t)1U; + else { + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << + PAGE_SHIFT)); + arena_bin_t *bin = run->bin; + unsigned regind; + + assert(run->magic == ARENA_RUN_MAGIC); + regind = arena_run_regind(run, bin, ptr, bin->reg_size); + ret = *(prof_ctx_t **)((uintptr_t)run + + bin->ctx0_offset + (regind * + sizeof(prof_ctx_t *))); + } + } else + ret = chunk->map[pageind-map_bias].prof_ctx; + + return (ret); +} + +JEMALLOC_INLINE void +arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (prof_promote == false) { + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << + PAGE_SHIFT)); + arena_bin_t *bin = run->bin; + unsigned regind; + + assert(run->magic == ARENA_RUN_MAGIC); + regind = arena_run_regind(run, bin, ptr, bin->reg_size); + + *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset + + (regind * sizeof(prof_ctx_t *)))) = ctx; + } else + assert((uintptr_t)ctx == (uintptr_t)1U); + } else + chunk->map[pageind-map_bias].prof_ctx = ctx; +} +#endif + JEMALLOC_INLINE void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) { @@ -474,8 +620,8 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0); if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) { /* Small allocation. 
*/ diff --git a/jemalloc/include/jemalloc/internal/chunk.h b/jemalloc/include/jemalloc/internal/chunk.h index 1f6abf78..a60f0ad7 100644 --- a/jemalloc/include/jemalloc/internal/chunk.h +++ b/jemalloc/include/jemalloc/internal/chunk.h @@ -39,13 +39,17 @@ extern malloc_mutex_t chunks_mtx; extern chunk_stats_t stats_chunks; #endif +#ifdef JEMALLOC_IVSALLOC +extern rtree_t *chunks_rtree; +#endif + extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; -extern size_t arena_chunk_header_npages; +extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t arena_maxclass; /* Max size class for arenas. */ -void *chunk_alloc(size_t size, bool *zero); +void *chunk_alloc(size_t size, bool base, bool *zero); void chunk_dealloc(void *chunk, size_t size); bool chunk_boot(void); diff --git a/jemalloc/include/jemalloc/internal/chunk_mmap.h b/jemalloc/include/jemalloc/internal/chunk_mmap.h index dc52448c..07b50a4d 100644 --- a/jemalloc/include/jemalloc/internal/chunk_mmap.h +++ b/jemalloc/include/jemalloc/internal/chunk_mmap.h @@ -13,6 +13,8 @@ void *chunk_alloc_mmap(size_t size); void *chunk_alloc_mmap_noreserve(size_t size); void chunk_dealloc_mmap(void *chunk, size_t size); +bool chunk_mmap_boot(void); + #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES diff --git a/jemalloc/include/jemalloc/internal/ckh.h b/jemalloc/include/jemalloc/internal/ckh.h index c39ea5c7..d4e391b6 100644 --- a/jemalloc/include/jemalloc/internal/ckh.h +++ b/jemalloc/include/jemalloc/internal/ckh.h @@ -45,7 +45,7 @@ struct ckh_s { #endif /* Used for pseudo-random number generation. */ -#define CKH_A 12345 +#define CKH_A 1103515241 #define CKH_C 12347 uint32_t prn_state; diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h index 7bbf21e0..8776ad13 100644 --- a/jemalloc/include/jemalloc/internal/ctl.h +++ b/jemalloc/include/jemalloc/internal/ctl.h @@ -82,9 +82,9 @@ bool ctl_boot(void); #define xmallctl(name, oldp, oldlenp, newp, newlen) do { \ if (JEMALLOC_P(mallctl)(name, oldp, oldlenp, newp, newlen) \ != 0) { \ - malloc_write(": Invalid xmallctl(\""); \ + malloc_write(": Failure in xmallctl(\""); \ malloc_write(name); \ - malloc_write("\", ...) call\n"); \ + malloc_write("\", ...)\n"); \ abort(); \ } \ } while (0) @@ -92,9 +92,9 @@ bool ctl_boot(void); #define xmallctlnametomib(name, mibp, miblenp) do { \ if (JEMALLOC_P(mallctlnametomib)(name, mibp, miblenp) != 0) { \ malloc_write( \ - ": Invalid xmallctlnametomib(\""); \ + ": Failure in xmallctlnametomib(\""); \ malloc_write(name); \ - malloc_write("\", ...) 
call\n"); \ + malloc_write("\", ...)\n"); \ abort(); \ } \ } while (0) @@ -103,7 +103,7 @@ bool ctl_boot(void); if (JEMALLOC_P(mallctlbymib)(mib, miblen, oldp, oldlenp, newp, \ newlen) != 0) { \ malloc_write( \ - ": Invalid xmallctlbymib() call\n"); \ + ": Failure in xmallctlbymib()\n"); \ abort(); \ } \ } while (0) diff --git a/jemalloc/include/jemalloc/internal/huge.h b/jemalloc/include/jemalloc/internal/huge.h index 0c0582f2..bf231274 100644 --- a/jemalloc/include/jemalloc/internal/huge.h +++ b/jemalloc/include/jemalloc/internal/huge.h @@ -20,8 +20,11 @@ extern size_t huge_allocated; extern malloc_mutex_t huge_mtx; void *huge_malloc(size_t size, bool zero); -void *huge_palloc(size_t alignment, size_t size); -void *huge_ralloc(void *ptr, size_t size, size_t oldsize); +void *huge_palloc(size_t size, size_t alignment, bool zero); +void *huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, + size_t extra); +void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); #ifdef JEMALLOC_PROF diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 2c3f32f1..3d253001 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -17,16 +17,29 @@ #include #include #include +#include +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif #include #include #include +#include #include #include #include +#include #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" +#ifdef JEMALLOC_ZONE +#include +#include +#include +#include +#endif + #ifdef JEMALLOC_LAZY_LOCK #include #endif @@ -49,7 +62,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); malloc_write(": "); \ malloc_write(__FILE__); \ malloc_write(":"); \ - malloc_write(umax2s(__LINE__, 10, line_buf)); \ + malloc_write(u2s(__LINE__, 10, line_buf)); \ malloc_write(": Failed assertion: "); \ malloc_write("\""); \ malloc_write(#e); \ @@ -77,6 +90,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); /******************************************************************************/ #define JEMALLOC_H_TYPES +#define ALLOCM_LG_ALIGN_MASK ((int)0x3f) + #define ZU(z) ((size_t)z) #ifndef __DECONST @@ -92,8 +107,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); # define JEMALLOC_INLINE static inline #endif -/* Size of stack-allocated buffer passed to strerror_r(). */ -#define STRERROR_BUF 64 +/* Size of stack-allocated buffer passed to buferror(). */ +#define BUFERROR_BUF 64 /* Minimum alignment of allocations is 2^LG_QUANTUM bytes. 
*/ #ifdef __i386__ @@ -159,6 +174,16 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT)) #define STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1)) +#ifdef PAGE_SHIFT +# undef PAGE_SHIFT +#endif +#ifdef PAGE_SIZE +# undef PAGE_SIZE +#endif +#ifdef PAGE_MASK +# undef PAGE_MASK +#endif + #ifdef DYNAMIC_PAGE_SHIFT # define PAGE_SHIFT lg_pagesize # define PAGE_SIZE pagesize @@ -184,8 +209,12 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #include "jemalloc/internal/prof.h" #undef JEMALLOC_H_TYPES @@ -203,8 +232,12 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #include "jemalloc/internal/prof.h" #undef JEMALLOC_H_STRUCTS @@ -224,6 +257,7 @@ extern bool opt_xmalloc; #ifdef JEMALLOC_FILL extern bool opt_zero; #endif +extern size_t opt_narenas; #ifdef DYNAMIC_PAGE_SHIFT extern size_t pagesize; @@ -240,8 +274,19 @@ extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */ * Map of pthread_self() --> arenas[???], used for selecting an arena to use * for allocations. */ -extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); +extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define ARENA_GET() arenas_tls +# define ARENA_SET(v) do { \ + arenas_tls = (v); \ +} while (0) +#else +extern pthread_key_t arenas_tsd; +# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) +# define ARENA_SET(v) do { \ + pthread_setspecific(arenas_tsd, (void *)(v)); \ +} while (0) #endif + /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. @@ -249,11 +294,56 @@ extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); extern arena_t **arenas; extern unsigned narenas; -arena_t *arenas_extend(unsigned ind); -#ifndef NO_TLS -arena_t *choose_arena_hard(void); +#ifdef JEMALLOC_STATS +typedef struct { + uint64_t allocated; + uint64_t deallocated; +} thread_allocated_t; +# ifndef NO_TLS +extern __thread thread_allocated_t thread_allocated_tls; +# define ALLOCATED_GET() thread_allocated_tls.allocated +# define DEALLOCATED_GET() thread_allocated_tls.deallocated +# define ALLOCATED_ADD(a, d) do { \ + thread_allocated_tls.allocated += a; \ + thread_allocated_tls.deallocated += d; \ +} while (0) +# else +extern pthread_key_t thread_allocated_tsd; +# define ALLOCATED_GET() \ + (uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \ + ? ((thread_allocated_t *) \ + pthread_getspecific(thread_allocated_tsd))->allocated : 0) +# define DEALLOCATED_GET() \ + (uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \ + ? 
((thread_allocated_t \ + *)pthread_getspecific(thread_allocated_tsd))->deallocated : \ + 0) +# define ALLOCATED_ADD(a, d) do { \ + thread_allocated_t *thread_allocated = (thread_allocated_t *) \ + pthread_getspecific(thread_allocated_tsd); \ + if (thread_allocated != NULL) { \ + thread_allocated->allocated += (a); \ + thread_allocated->deallocated += (d); \ + } else { \ + thread_allocated = (thread_allocated_t *) \ + imalloc(sizeof(thread_allocated_t)); \ + if (thread_allocated != NULL) { \ + pthread_setspecific(thread_allocated_tsd, \ + thread_allocated); \ + thread_allocated->allocated = (a); \ + thread_allocated->deallocated = (d); \ + } \ + } \ +} while (0) +# endif #endif +arena_t *arenas_extend(unsigned ind); +arena_t *choose_arena_hard(void); +int buferror(int errnum, char *buf, size_t buflen); +void jemalloc_prefork(void); +void jemalloc_postfork(void); + #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -265,8 +355,12 @@ arena_t *choose_arena_hard(void); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #include "jemalloc/internal/prof.h" #undef JEMALLOC_H_EXTERNS @@ -285,11 +379,143 @@ arena_t *choose_arena_hard(void); #include "jemalloc/internal/huge.h" #ifndef JEMALLOC_ENABLE_INLINE +size_t pow2_ceil(size_t x); +size_t s2u(size_t size); +size_t sa2u(size_t size, size_t alignment, size_t *run_size_p); void malloc_write(const char *s); arena_t *choose_arena(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +/* Compute the smallest power of 2 that is >= x. */ +JEMALLOC_INLINE size_t +pow2_ceil(size_t x) +{ + + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; +#if (LG_SIZEOF_PTR == 3) + x |= x >> 32; +#endif + x++; + return (x); +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size. + */ +JEMALLOC_INLINE size_t +s2u(size_t size) +{ + + if (size <= small_maxclass) + return arenas[0]->bins[small_size2bin[size]].reg_size; + if (size <= arena_maxclass) + return PAGE_CEILING(size); + return CHUNK_CEILING(size); +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size and alignment. + */ +JEMALLOC_INLINE size_t +sa2u(size_t size, size_t alignment, size_t *run_size_p) +{ + size_t usize; + + /* + * Round size up to the nearest multiple of alignment. + * + * This done, we can take advantage of the fact that for each small + * size class, every object is aligned at the smallest power of two + * that is non-zero in the base two representation of the size. For + * example: + * + * Size | Base 2 | Minimum alignment + * -----+----------+------------------ + * 96 | 1100000 | 32 + * 144 | 10100000 | 32 + * 192 | 11000000 | 64 + * + * Depending on runtime settings, it is possible that arena_malloc() + * will further round up to a power of two, but that never causes + * correctness issues. + */ + usize = (size + (alignment - 1)) & (-alignment); + /* + * (usize < size) protects against the combination of maximal + * alignment and size greater than maximal alignment. + */ + if (usize < size) { + /* size_t overflow. 
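
pow2_ceil() above rounds up to a power of two by smearing the highest set bit of (x - 1) into all lower positions and then adding one. A standalone spot check of that bit trick, written against uint64_t so the 32-bit shift step is unconditionally valid (the original guards it with LG_SIZEOF_PTR):

#include <assert.h>
#include <stdint.h>

static uint64_t
pow2_ceil64(uint64_t x)
{
        /* Smear the highest set bit of (x - 1) downward, then add one. */
        x--;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        x |= x >> 32;
        x++;
        return (x);
}

int
main(void)
{
        assert(pow2_ceil64(1) == 1);
        assert(pow2_ceil64(17) == 32);
        assert(pow2_ceil64(4096) == 4096);
        assert(pow2_ceil64(4097) == 8192);
        return (0);
}
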
*/ + return (0); + } + + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { + if (usize <= small_maxclass) { + return + (arenas[0]->bins[small_size2bin[usize]].reg_size); + } + return (PAGE_CEILING(usize)); + } else { + size_t run_size; + + /* + * We can't achieve subpage alignment, so round up alignment + * permanently; it makes later calculations simpler. + */ + alignment = PAGE_CEILING(alignment); + usize = PAGE_CEILING(size); + /* + * (usize < size) protects against very large sizes within + * PAGE_SIZE of SIZE_T_MAX. + * + * (usize + alignment < usize) protects against the + * combination of maximal alignment and usize large enough + * to cause overflow. This is similar to the first overflow + * check above, but it needs to be repeated due to the new + * usize value, which may now be *equal* to maximal + * alignment, whereas before we only detected overflow if the + * original size was *greater* than maximal alignment. + */ + if (usize < size || usize + alignment < usize) { + /* size_t overflow. */ + return (0); + } + + /* + * Calculate the size of the over-size run that arena_palloc() + * would need to allocate in order to guarantee the alignment. + */ + if (usize >= alignment) + run_size = usize + alignment - PAGE_SIZE; + else { + /* + * It is possible that (alignment << 1) will cause + * overflow, but it doesn't matter because we also + * subtract PAGE_SIZE, which in the case of overflow + * leaves us with a very large run_size. That causes + * the first conditional below to fail, which means + * that the bogus run_size value never gets used for + * anything important. + */ + run_size = (alignment << 1) - PAGE_SIZE; + } + if (run_size_p != NULL) + *run_size_p = run_size; + + if (run_size <= arena_maxclass) + return (PAGE_CEILING(usize)); + return (CHUNK_CEILING(usize)); + } +} + /* * Wrapper around malloc_message() that avoids the need for * JEMALLOC_P(malloc_message)(...) throughout the code. @@ -310,78 +536,35 @@ choose_arena(void) { arena_t *ret; - /* - * We can only use TLS if this is a PIC library, since for the static - * library version, libc's malloc is used by TLS allocation, which - * introduces a bootstrapping issue. - */ -#ifndef NO_TLS - ret = arenas_map; + ret = ARENA_GET(); if (ret == NULL) { ret = choose_arena_hard(); assert(ret != NULL); } -#else - if (isthreaded && narenas > 1) { - unsigned long ind; - /* - * Hash pthread_self() to one of the arenas. There is a prime - * number of arenas, so this has a reasonable chance of - * working. Even so, the hashing can be easily thwarted by - * inconvenient pthread_self() values. Without specific - * knowledge of how pthread_self() calculates values, we can't - * easily do much better than this. - */ - ind = (unsigned long) pthread_self() % narenas; - - /* - * Optimistially assume that arenas[ind] has been initialized. - * At worst, we find out that some other thread has already - * done so, after acquiring the lock in preparation. Note that - * this lazy locking also has the effect of lazily forcing - * cache coherency; without the lock acquisition, there's no - * guarantee that modification of arenas[ind] by another thread - * would be seen on this CPU for an arbitrary amount of time. - * - * In general, this approach to modifying a synchronized value - * isn't a good idea, but in this case we only ever modify the - * value once, so things work out well. - */ - ret = arenas[ind]; - if (ret == NULL) { - /* - * Avoid races with another thread that may have already - * initialized arenas[ind]. 
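
The arithmetic in sa2u() above is easier to see with concrete numbers. The sketch below assumes 4 KiB pages purely for illustration (PAGE_SIZE is determined by the build or at runtime in jemalloc):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE       ((size_t)4096)
#define PAGE_MASK       (PAGE_SIZE - 1)
#define PAGE_CEILING(s) (((s) + PAGE_MASK) & ~PAGE_MASK)

int
main(void)
{
        /* Sub-page alignment: round size up to a multiple of alignment. */
        size_t size = 100, alignment = 64;
        size_t usize = (size + (alignment - 1)) & (-alignment);

        assert(usize == 128);

        /*
         * Page-or-larger alignment: size an over-large run so that some
         * page within it is guaranteed to satisfy the alignment.
         */
        size = 10000;
        alignment = 8192;
        usize = PAGE_CEILING(size);             /* 12288 */
        size_t run_size = (usize >= alignment) ?
            usize + alignment - PAGE_SIZE :     /* 16384 */
            (alignment << 1) - PAGE_SIZE;

        printf("usize = %zu, run_size = %zu\n", usize, run_size);
        return (0);
}
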
- */ - malloc_mutex_lock(&arenas_lock); - if (arenas[ind] == NULL) - ret = arenas_extend((unsigned)ind); - else - ret = arenas[ind]; - malloc_mutex_unlock(&arenas_lock); - } - } else - ret = arenas[0]; -#endif - - assert(ret != NULL); return (ret); } #endif +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #ifndef JEMALLOC_ENABLE_INLINE void *imalloc(size_t size); void *icalloc(size_t size); -void *ipalloc(size_t alignment, size_t size); +void *ipalloc(size_t size, size_t alignment, bool zero); size_t isalloc(const void *ptr); -void *iralloc(void *ptr, size_t size); +# ifdef JEMALLOC_IVSALLOC +size_t ivsalloc(const void *ptr); +# endif void idalloc(void *ptr); +void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, + bool zero, bool no_move); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) @@ -408,95 +591,28 @@ icalloc(size_t size) } JEMALLOC_INLINE void * -ipalloc(size_t alignment, size_t size) +ipalloc(size_t size, size_t alignment, bool zero) { void *ret; - size_t ceil_size; + size_t usize; + size_t run_size +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; - /* - * Round size up to the nearest multiple of alignment. - * - * This done, we can take advantage of the fact that for each small - * size class, every object is aligned at the smallest power of two - * that is non-zero in the base two representation of the size. For - * example: - * - * Size | Base 2 | Minimum alignment - * -----+----------+------------------ - * 96 | 1100000 | 32 - * 144 | 10100000 | 32 - * 192 | 11000000 | 64 - * - * Depending on runtime settings, it is possible that arena_malloc() - * will further round up to a power of two, but that never causes - * correctness issues. - */ - ceil_size = (size + (alignment - 1)) & (-alignment); - /* - * (ceil_size < size) protects against the combination of maximal - * alignment and size greater than maximal alignment. - */ - if (ceil_size < size) { - /* size_t overflow. */ + usize = sa2u(size, alignment, &run_size); + if (usize == 0) return (NULL); - } - - if (ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE - && ceil_size <= arena_maxclass)) - ret = arena_malloc(ceil_size, false); - else { - size_t run_size; - - /* - * We can't achieve subpage alignment, so round up alignment - * permanently; it makes later calculations simpler. - */ - alignment = PAGE_CEILING(alignment); - ceil_size = PAGE_CEILING(size); - /* - * (ceil_size < size) protects against very large sizes within - * PAGE_SIZE of SIZE_T_MAX. - * - * (ceil_size + alignment < ceil_size) protects against the - * combination of maximal alignment and ceil_size large enough - * to cause overflow. This is similar to the first overflow - * check above, but it needs to be repeated due to the new - * ceil_size value, which may now be *equal* to maximal - * alignment, whereas before we only detected overflow if the - * original size was *greater* than maximal alignment. - */ - if (ceil_size < size || ceil_size + alignment < ceil_size) { - /* size_t overflow. */ - return (NULL); - } - - /* - * Calculate the size of the over-size run that arena_palloc() - * would need to allocate in order to guarantee the alignment. 
- */ - if (ceil_size >= alignment) - run_size = ceil_size + alignment - PAGE_SIZE; - else { - /* - * It is possible that (alignment << 1) will cause - * overflow, but it doesn't matter because we also - * subtract PAGE_SIZE, which in the case of overflow - * leaves us with a very large run_size. That causes - * the first conditional below to fail, which means - * that the bogus run_size value never gets used for - * anything important. - */ - run_size = (alignment << 1) - PAGE_SIZE; - } - - if (run_size <= arena_maxclass) { - ret = arena_palloc(choose_arena(), alignment, ceil_size, - run_size); - } else if (alignment <= chunksize) - ret = huge_malloc(ceil_size, false); - else - ret = huge_palloc(alignment, ceil_size); - } + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) + ret = arena_malloc(usize, zero); + else if (run_size <= arena_maxclass) { + ret = arena_palloc(choose_arena(), usize, run_size, alignment, + zero); + } else if (alignment <= chunksize) + ret = huge_malloc(usize, zero); + else + ret = huge_palloc(usize, alignment, zero); assert(((uintptr_t)ret & (alignment - 1)) == 0); return (ret); @@ -526,21 +642,18 @@ isalloc(const void *ptr) return (ret); } -JEMALLOC_INLINE void * -iralloc(void *ptr, size_t size) +#ifdef JEMALLOC_IVSALLOC +JEMALLOC_INLINE size_t +ivsalloc(const void *ptr) { - size_t oldsize; - assert(ptr != NULL); - assert(size != 0); + /* Return 0 if ptr is not within a chunk managed by jemalloc. */ + if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == NULL) + return (0); - oldsize = isalloc(ptr); - - if (size <= arena_maxclass) - return (arena_ralloc(ptr, size, oldsize)); - else - return (huge_ralloc(ptr, size, oldsize)); + return (isalloc(ptr)); } +#endif JEMALLOC_INLINE void idalloc(void *ptr) @@ -555,7 +668,70 @@ idalloc(void *ptr) else huge_dalloc(ptr); } + +JEMALLOC_INLINE void * +iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, + bool no_move) +{ + void *ret; + size_t oldsize; + + assert(ptr != NULL); + assert(size != 0); + + oldsize = isalloc(ptr); + + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + size_t copysize; + + /* + * Existing object alignment is inadquate; allocate new space + * and copy. + */ + if (no_move) + return (NULL); + ret = ipalloc(size + extra, alignment, zero); + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, without extra this time. */ + ret = ipalloc(size, alignment, zero); + if (ret == NULL) + return (NULL); + } + /* + * Copy at most size bytes (not size+extra), since the caller + * has no expectation that the extra bytes will be reliably + * preserved. + */ + copysize = (size < oldsize) ? 
size : oldsize; + memcpy(ret, ptr, copysize); + idalloc(ptr); + return (ret); + } + + if (no_move) { + if (size <= arena_maxclass) { + return (arena_ralloc_no_move(ptr, oldsize, size, + extra, zero)); + } else { + return (huge_ralloc_no_move(ptr, oldsize, size, + extra)); + } + } else { + if (size + extra <= arena_maxclass) { + return (arena_ralloc(ptr, oldsize, size, extra, + alignment, zero)); + } else { + return (huge_ralloc(ptr, oldsize, size, extra, + alignment, zero)); + } + } +} #endif +#include "jemalloc/internal/prof.h" + #undef JEMALLOC_H_INLINES /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index 108bfa8a..dcca01ed 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -3,6 +3,12 @@ typedef pthread_mutex_t malloc_mutex_t; +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +# define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +#else +# define MALLOC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +#endif + #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS @@ -18,6 +24,7 @@ extern bool isthreaded; #endif bool malloc_mutex_init(malloc_mutex_t *mutex); +void malloc_mutex_destroy(malloc_mutex_t *mutex); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index fb55fb90..7864000b 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -6,20 +6,25 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; -typedef struct prof_s prof_t; +typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ -#define LG_PROF_BT_MAX_DEFAULT 2 +#define PROF_PREFIX_DEFAULT "jeprof" +#define LG_PROF_BT_MAX_DEFAULT 7 #define LG_PROF_SAMPLE_DEFAULT 0 -#define LG_PROF_INTERVAL_DEFAULT 30 +#define LG_PROF_INTERVAL_DEFAULT -1 +#define LG_PROF_TCMAX_DEFAULT -1 /* * Hard limit on stack backtrace depth. Note that the version of * prof_backtrace() that is based on __builtin_return_address() necessarily has - * a hard-coded number of backtrace frame handlers, so increasing - * LG_PROF_BT_MAX requires changing prof_backtrace(). + * a hard-coded number of backtrace frame handlers. */ -#define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ +#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND)) +# define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1) +#else +# define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ +#endif #define PROF_BT_MAX (1U << LG_PROF_BT_MAX) /* Initial hash table size. */ @@ -34,16 +39,16 @@ typedef struct prof_s prof_t; struct prof_bt_s { /* Backtrace, stored as len program counters. */ - void **vec; - unsigned len; + void **vec; + unsigned len; }; #ifdef JEMALLOC_PROF_LIBGCC /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ typedef struct { - prof_bt_t *bt; - unsigned nignore; - unsigned max; + prof_bt_t *bt; + unsigned nignore; + unsigned max; } prof_unwind_data_t; #endif @@ -51,11 +56,11 @@ struct prof_cnt_s { /* * Profiling counters. 
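
The alignment-mismatch path in iralloc() above (allocate a suitably aligned object, copy at most min(size, oldsize) bytes, free the original) follows the same pattern one would write against the standard C API. A sketch using a hypothetical realloc_aligned() helper, with posix_memalign()/realloc() standing in for ipalloc() and arena reallocation:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void *
realloc_aligned(void *ptr, size_t oldsize, size_t size, size_t alignment)
{

        if (((uintptr_t)ptr & (alignment - 1)) == 0) {
                /* Existing alignment is adequate; try to resize in place. */
                return (realloc(ptr, size));
        } else {
                void *ret;

                if (posix_memalign(&ret, alignment, size) != 0)
                        return (NULL);
                /* The caller cannot rely on bytes past min(size, oldsize). */
                memcpy(ret, ptr, size < oldsize ? size : oldsize);
                free(ptr);
                return (ret);
        }
}

int
main(void)
{
        void *p = malloc(100);
        void *q = realloc_aligned(p, 100, 200, 4096);

        free(q);
        return (0);
}
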
An allocation/deallocation pair can operate on * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t sets_ql, so it is possible for the cur* counters to go + * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require some form - * of 128-bit counter solution; this implementation doesn't bother to - * solve that problem. + * overflow/underflow, but a general solution would require something + * like 128-bit counters; this implementation doesn't bother to solve + * that problem. */ int64_t curobjs; int64_t curbytes; @@ -64,15 +69,18 @@ struct prof_cnt_s { }; struct prof_thr_cnt_s { - /* Linkage into prof_ctx_t's sets_ql. */ - ql_elm(prof_thr_cnt_t) link; + /* Linkage into prof_ctx_t's cnts_ql. */ + ql_elm(prof_thr_cnt_t) cnts_link; + + /* Linkage into thread's LRU. */ + ql_elm(prof_thr_cnt_t) lru_link; /* * Associated context. If a thread frees an object that it did not * allocate, it is possible that the context is not cached in the * thread's hash table, in which case it must be able to look up the * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's sets_ql. + * and link it into the prof_ctx_t's cnts_ql. */ prof_ctx_t *ctx; @@ -101,11 +109,11 @@ struct prof_ctx_s { /* Associated backtrace. */ prof_bt_t *bt; - /* Protects cnt_merged and sets_ql. */ + /* Protects cnt_merged and cnts_ql. */ malloc_mutex_t lock; - /* Temporary storage for aggregation during dump. */ - prof_cnt_t cnt_dump; + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* When threads exit, they merge their stats into cnt_merged. */ prof_cnt_t cnt_merged; @@ -117,6 +125,31 @@ struct prof_ctx_s { ql_head(prof_thr_cnt_t) cnts_ql; }; +struct prof_tdata_s { + /* + * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a + * cache of backtraces, with associated thread-specific prof_thr_cnt_t + * objects. Other threads may read the prof_thr_cnt_t contents, but no + * others will ever write them. + * + * Upon thread exit, the thread must merge all the prof_thr_cnt_t + * counter data into the associated prof_ctx_t objects, and unlink/free + * the prof_thr_cnt_t objects. + */ + ckh_t bt2cnt; + + /* LRU for contents of bt2cnt. */ + ql_head(prof_thr_cnt_t) lru_ql; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void **vec; + + /* Sampling state. */ + uint64_t prn_state; + uint64_t threshold; + uint64_t accum; +}; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -129,11 +162,14 @@ extern bool opt_prof; * to notice state changes. */ extern bool opt_prof_active; -extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ -extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ +extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ -extern bool opt_prof_udump; /* High-water memory dumping. */ -extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_gdump; /* High-water memory dumping. */ +extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_accum; /* Report cumulative bytes. 
*/ +extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread bactrace cache) */ +extern char opt_prof_prefix[PATH_MAX + 1]; /* * Profile dump interval, measured in bytes allocated. Each arena triggers a @@ -150,25 +186,362 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -bool prof_init(prof_t *prof, bool master); -void prof_destroy(prof_t *prof); +/* (1U << opt_lg_prof_bt_max). */ +extern unsigned prof_bt_max; -prof_thr_cnt_t *prof_alloc_prep(size_t size); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, - size_t old_size, prof_ctx_t *old_ctx); -void prof_free(const void *ptr); +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +#ifndef NO_TLS +extern __thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_TCACHE_GET() prof_tdata_tls +# define PROF_TCACHE_SET(v) do { \ + prof_tdata_tls = (v); \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#else +# define PROF_TCACHE_GET() \ + ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd)) +# define PROF_TCACHE_SET(v) do { \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#endif +/* + * Same contents as b2cnt_tls, but initialized such that the TSD destructor is + * called when a thread exits, so that prof_tdata_tls contents can be merged, + * unlinked, and deallocated. + */ +extern pthread_key_t prof_tdata_tsd; + +void bt_init(prof_bt_t *bt, void **vec); +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); +prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); void prof_idump(void); bool prof_mdump(const char *filename); -void prof_udump(void); +void prof_gdump(void); +prof_tdata_t *prof_tdata_init(void); void prof_boot0(void); -bool prof_boot1(void); +void prof_boot1(void); +bool prof_boot2(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +prof_thr_cnt_t *prof_alloc_prep(size_t size); +prof_ctx_t *prof_ctx_get(const void *ptr); +void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +bool prof_sample_accum_update(size_t size); +void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); +void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + size_t old_size, prof_ctx_t *old_ctx); +void prof_free(const void *ptr, size_t size); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +JEMALLOC_INLINE void +prof_sample_threshold_update(prof_tdata_t *prof_tdata) +{ + uint64_t r; + double u; + + /* + * Compute prof_sample_threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + */ + prn64(r, 53, prof_tdata->prn_state, + (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); + u = (double)r * (1.0/9007199254740992.0L); + prof_tdata->threshold = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +} + +JEMALLOC_INLINE prof_thr_cnt_t * +prof_alloc_prep(size_t size) +{ +#ifdef JEMALLOC_ENABLE_INLINE + /* This function does not have its own stack frame, because it is inlined. 
*/ +# define NIGNORE 1 +#else +# define NIGNORE 2 +#endif + prof_thr_cnt_t *ret; + prof_tdata_t *prof_tdata; + prof_bt_t bt; + + assert(size == s2u(size)); + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) + return (NULL); + } + + if (opt_prof_active == false) { + /* Sampling is currently inactive, so avoid sampling. */ + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } else if (opt_lg_prof_sample == 0) { + /* + * Don't bother with sampling logic, since sampling interval is + * 1. + */ + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else { + if (prof_tdata->threshold == 0) { + /* + * Initialize. Seed the prng differently for each + * thread. + */ + prof_tdata->prn_state = (uint64_t)(uintptr_t)&size; + prof_sample_threshold_update(prof_tdata); + } + + /* + * Determine whether to capture a backtrace based on whether + * size is enough for prof_accum to reach + * prof_tdata->threshold. However, delay updating these + * variables until prof_{m,re}alloc(), because we don't know + * for sure that the allocation will succeed. + * + * Use subtraction rather than addition to avoid potential + * integer overflow. + */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } + + return (ret); +#undef NIGNORE +} + +JEMALLOC_INLINE prof_ctx_t * +prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. */ + assert(chunk->arena->magic == ARENA_MAGIC); + + ret = arena_prof_ctx_get(ptr); + } else + ret = huge_prof_ctx_get(ptr); + + return (ret); +} + +JEMALLOC_INLINE void +prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. */ + assert(chunk->arena->magic == ARENA_MAGIC); + + arena_prof_ctx_set(ptr, ctx); + } else + huge_prof_ctx_set(ptr, ctx); +} + +JEMALLOC_INLINE bool +prof_sample_accum_update(size_t size) +{ + prof_tdata_t *prof_tdata; + + /* Sampling logic is unnecessary if the interval is 1. */ + assert(opt_lg_prof_sample != 0); + + prof_tdata = PROF_TCACHE_GET(); + assert(prof_tdata != NULL); + + /* Take care to avoid integer overflow. */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + prof_tdata->accum -= (prof_tdata->threshold - size); + /* Compute new prof_sample_threshold. */ + prof_sample_threshold_update(prof_tdata); + while (prof_tdata->accum >= prof_tdata->threshold) { + prof_tdata->accum -= prof_tdata->threshold; + prof_sample_threshold_update(prof_tdata); + } + return (false); + } else { + prof_tdata->accum += size; + return (true); + } +} + +JEMALLOC_INLINE void +prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) +{ + + assert(ptr != NULL); + assert(size == isalloc(ptr)); + + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. For malloc()-like allocation, it is + * always possible to tell in advance how large an + * object's usable size will be, so there should never + * be a difference between the size passed to + * prof_alloc_prep() and prof_malloc(). 
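
prof_sample_threshold_update() above draws the next byte threshold from a geometric distribution with mean 2^opt_lg_prof_sample by inverting the CDF. The same computation is sketched below with drand48() standing in for jemalloc's prn64(); the sample interval and the averaging loop are illustrative only.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t
sample_threshold(unsigned lg_sample)
{
        double u;

        do {
                u = drand48();  /* uniform in [0, 1); reject exact 0 */
        } while (u == 0.0);

        return ((uint64_t)(log(u) /
            log(1.0 - (1.0 / (double)((uint64_t)1U << lg_sample)))) + 1);
}

int
main(void)
{
        uint64_t sum = 0;
        unsigned i, n = 100000;

        for (i = 0; i < n; i++)
                sum += sample_threshold(19);
        /* The empirical mean should be close to 2^19 (512 KiB). */
        printf("mean threshold: %.1f bytes\n", (double)sum / n);
        return (0);
}
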
+ */ + assert((uintptr_t)cnt == (uintptr_t)1U); + } + } + + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); +} + +JEMALLOC_INLINE void +prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + size_t old_size, prof_ctx_t *old_ctx) +{ + prof_thr_cnt_t *told_cnt; + + assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + + if (ptr != NULL) { + assert(size == isalloc(ptr)); + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. The size passed to + * prof_alloc_prep() was larger than what + * actually got allocated, so a backtrace was + * captured for this allocation, even though + * its actual size was insufficient to cross + * the sample threshold. + */ + cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + } + } + } + + if ((uintptr_t)old_ctx > (uintptr_t)1U) { + told_cnt = prof_lookup(old_ctx->bt); + if (told_cnt == NULL) { + /* + * It's too late to propagate OOM for this realloc(), + * so operate directly on old_cnt->ctx->cnt_merged. + */ + malloc_mutex_lock(&old_ctx->lock); + old_ctx->cnt_merged.curobjs--; + old_ctx->cnt_merged.curbytes -= old_size; + malloc_mutex_unlock(&old_ctx->lock); + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + } + } else + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + cnt->epoch++; + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) { + told_cnt->cnts.curobjs--; + told_cnt->cnts.curbytes -= old_size; + } + if ((uintptr_t)cnt > (uintptr_t)1U) { + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + } + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) + cnt->epoch++; + /*********/ + mb_write(); /* Not strictly necessary. */ +} + +JEMALLOC_INLINE void +prof_free(const void *ptr, size_t size) +{ + prof_ctx_t *ctx = prof_ctx_get(ptr); + + if ((uintptr_t)ctx > (uintptr_t)1) { + assert(size == isalloc(ptr)); + prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); + + if (tcnt != NULL) { + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + tcnt->cnts.curobjs--; + tcnt->cnts.curbytes -= size; + /*********/ + mb_write(); + /*********/ + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else { + /* + * OOM during free() cannot be propagated, so operate + * directly on cnt->ctx->cnt_merged. 
+ */ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs--; + ctx->cnt_merged.curbytes -= size; + malloc_mutex_unlock(&ctx->lock); + } + } +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ #endif /* JEMALLOC_PROF */ diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h new file mode 100644 index 00000000..9d58ebac --- /dev/null +++ b/jemalloc/include/jemalloc/internal/rtree.h @@ -0,0 +1,161 @@ +/* + * This radix tree implementation is tailored to the singular purpose of + * tracking which chunks are currently owned by jemalloc. This functionality + * is mandatory for OS X, where jemalloc must be able to respond to object + * ownership queries. + * + ******************************************************************************* + */ +#ifdef JEMALLOC_H_TYPES + +typedef struct rtree_s rtree_t; + +/* + * Size of each radix tree node (must be a power of 2). This impacts tree + * depth. + */ +#if (LG_SIZEOF_PTR == 2) +# define RTREE_NODESIZE (1U << 14) +#else +# define RTREE_NODESIZE CACHELINE +#endif + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +struct rtree_s { + malloc_mutex_t mutex; + void **root; + unsigned height; + unsigned level2bits[1]; /* Dynamically sized. */ +}; + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +rtree_t *rtree_new(unsigned bits); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +#ifndef JEMALLOC_DEBUG +void *rtree_get_locked(rtree_t *rtree, uintptr_t key); +#endif +void *rtree_get(rtree_t *rtree, uintptr_t key); +bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_)) +#define RTREE_GET_GENERATE(f) \ +/* The least significant bits of the key are ignored. */ \ +JEMALLOC_INLINE void * \ +f(rtree_t *rtree, uintptr_t key) \ +{ \ + void *ret; \ + uintptr_t subkey; \ + unsigned i, lshift, height, bits; \ + void **node, **child; \ + \ + RTREE_LOCK(&rtree->mutex); \ + for (i = lshift = 0, height = rtree->height, node = rtree->root;\ + i < height - 1; \ + i++, lshift += bits, node = child) { \ + bits = rtree->level2bits[i]; \ + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \ + 3)) - bits); \ + child = (void**)node[subkey]; \ + if (child == NULL) { \ + RTREE_UNLOCK(&rtree->mutex); \ + return (NULL); \ + } \ + } \ + \ + /* \ + * node is a leaf, so it contains values rather than node \ + * pointers. 
\ + */ \ + bits = rtree->level2bits[i]; \ + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \ + bits); \ + ret = node[subkey]; \ + RTREE_UNLOCK(&rtree->mutex); \ + \ + RTREE_GET_VALIDATE \ + return (ret); \ +} + +#ifdef JEMALLOC_DEBUG +# define RTREE_LOCK(l) malloc_mutex_lock(l) +# define RTREE_UNLOCK(l) malloc_mutex_unlock(l) +# define RTREE_GET_VALIDATE +RTREE_GET_GENERATE(rtree_get_locked) +# undef RTREE_LOCK +# undef RTREE_UNLOCK +# undef RTREE_GET_VALIDATE +#endif + +#define RTREE_LOCK(l) +#define RTREE_UNLOCK(l) +#ifdef JEMALLOC_DEBUG + /* + * Suppose that it were possible for a jemalloc-allocated chunk to be + * munmap()ped, followed by a different allocator in another thread re-using + * overlapping virtual memory, all without invalidating the cached rtree + * value. The result would be a false positive (the rtree would claim that + * jemalloc owns memory that it had actually discarded). This scenario + * seems impossible, but the following assertion is a prudent sanity check. + */ +# define RTREE_GET_VALIDATE \ + assert(rtree_get_locked(rtree, key) == ret); +#else +# define RTREE_GET_VALIDATE +#endif +RTREE_GET_GENERATE(rtree_get) +#undef RTREE_LOCK +#undef RTREE_UNLOCK +#undef RTREE_GET_VALIDATE + +JEMALLOC_INLINE bool +rtree_set(rtree_t *rtree, uintptr_t key, void *val) +{ + uintptr_t subkey; + unsigned i, lshift, height, bits; + void **node, **child; + + malloc_mutex_lock(&rtree->mutex); + for (i = lshift = 0, height = rtree->height, node = rtree->root; + i < height - 1; + i++, lshift += bits, node = child) { + bits = rtree->level2bits[i]; + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - + bits); + child = (void**)node[subkey]; + if (child == NULL) { + child = (void**)base_alloc(sizeof(void *) << + rtree->level2bits[i+1]); + if (child == NULL) { + malloc_mutex_unlock(&rtree->mutex); + return (true); + } + memset(child, 0, sizeof(void *) << + rtree->level2bits[i+1]); + node[subkey] = child; + } + } + + /* node is a leaf, so it contains values rather than node pointers. */ + bits = rtree->level2bits[i]; + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits); + node[subkey] = val; + malloc_mutex_unlock(&rtree->mutex); + + return (false); +} +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h index cbf035ff..3fc2080a 100644 --- a/jemalloc/include/jemalloc/internal/stats.h +++ b/jemalloc/include/jemalloc/internal/stats.h @@ -154,7 +154,7 @@ struct chunk_stats_s { extern bool opt_stats_print; -char *umax2s(uintmax_t x, unsigned base, char *s); +char *u2s(uint64_t x, unsigned base, char *s); #ifdef JEMALLOC_STATS void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4)); diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index a8be436d..1ad91a9b 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -17,7 +17,7 @@ typedef struct tcache_s tcache_t; /* Number of cache slots for large size classes. */ #define TCACHE_NSLOTS_LARGE 20 -/* (1U << opt_lg_tcache_maxclass) is used to compute tcache_maxclass. */ +/* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. 
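
rtree_get()/rtree_set() above walk the tree by consuming a few of the key's most significant bits per level. The subkey extraction can be seen in isolation below; the two 21-bit levels, the 4 MiB (2^22) chunk size, and the LP64 pointer width are assumptions for illustration only, since the real tree sizes its levels when rtree_new() is called.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /* A chunk-aligned address on an LP64 system (illustrative). */
        uintptr_t key = (uintptr_t)0x00007f3a2c400000ULL;
        unsigned level2bits[] = {21, 21};       /* covers 64 - 22 key bits */
        unsigned ptr_bits = sizeof(uintptr_t) * 8;
        unsigned i, lshift = 0;

        for (i = 0; i < 2; i++) {
                unsigned bits = level2bits[i];
                uintptr_t subkey = (key << lshift) >> (ptr_bits - bits);

                /* subkey indexes this level's node array. */
                printf("level %u: subkey %#lx\n", i, (unsigned long)subkey);
                lshift += bits;
        }
        return (0);
}
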
*/ #define LG_TCACHE_MAXCLASS_DEFAULT 15 /* @@ -61,12 +61,25 @@ struct tcache_s { #ifdef JEMALLOC_H_EXTERNS extern bool opt_tcache; -extern ssize_t opt_lg_tcache_maxclass; +extern ssize_t opt_lg_tcache_max; extern ssize_t opt_lg_tcache_gc_sweep; /* Map of thread-specific caches. */ +#ifndef NO_TLS extern __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define TCACHE_GET() tcache_tls +# define TCACHE_SET(v) do { \ + tcache_tls = (tcache_t *)(v); \ + pthread_setspecific(tcache_tsd, (void *)(v)); \ +} while (0) +#else +# define TCACHE_GET() ((tcache_t *)pthread_getspecific(tcache_tsd)) +# define TCACHE_SET(v) do { \ + pthread_setspecific(tcache_tsd, (void *)(v)); \ +} while (0) +#endif +extern pthread_key_t tcache_tsd; /* * Number of tcache bins. There are nbins small-object bins, plus 0 or more @@ -122,14 +135,23 @@ tcache_get(void) if ((isthreaded & opt_tcache) == false) return (NULL); - tcache = tcache_tls; - if ((uintptr_t)tcache <= (uintptr_t)1) { + tcache = TCACHE_GET(); + if ((uintptr_t)tcache <= (uintptr_t)2) { if (tcache == NULL) { tcache = tcache_create(choose_arena()); if (tcache == NULL) return (NULL); - } else + } else { + if (tcache == (void *)(uintptr_t)1) { + /* + * Make a note that an allocator function was + * called after the tcache_thread_cleanup() was + * called. + */ + TCACHE_SET((uintptr_t)2); + } return (NULL); + } } return (tcache); @@ -258,9 +280,9 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) } else { #ifdef JEMALLOC_PROF arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); - size_t pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk) - >> PAGE_SHIFT); - chunk->map[pageind].bits |= CHUNK_MAP_CLASS_MASK; + size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> + PAGE_SHIFT); + chunk->map[pageind-map_bias].bits &= ~CHUNK_MAP_CLASS_MASK; #endif if (zero == false) { #ifdef JEMALLOC_FILL @@ -299,8 +321,8 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); @@ -339,7 +361,6 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) arena_chunk_t *chunk; size_t pageind, binind; tcache_bin_t *tbin; - arena_chunk_map_t *mapelm; assert((size & PAGE_MASK) == 0); assert(arena_salloc(ptr) > small_maxclass); @@ -347,8 +368,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; binind = nbins + (size >> PAGE_SHIFT) - 1; #ifdef JEMALLOC_FILL diff --git a/jemalloc/include/jemalloc/internal/zone.h b/jemalloc/include/jemalloc/internal/zone.h new file mode 100644 index 00000000..859b529d --- /dev/null +++ b/jemalloc/include/jemalloc/internal/zone.h @@ -0,0 +1,23 @@ +#ifndef JEMALLOC_ZONE +# error "This source file is for zones on Darwin (OS X)." 
+#endif +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +malloc_zone_t *create_zone(void); +void szone2ozone(malloc_zone_t *zone); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in index 8ef81836..4dd3981a 100644 --- a/jemalloc/include/jemalloc/jemalloc.h.in +++ b/jemalloc/include/jemalloc/jemalloc.h.in @@ -4,6 +4,9 @@ extern "C" { #endif +#include +#include + #define JEMALLOC_VERSION "@jemalloc_version@" #define JEMALLOC_VERSION_MAJOR @jemalloc_version_major@ #define JEMALLOC_VERSION_MINOR @jemalloc_version_minor@ @@ -16,7 +19,20 @@ extern "C" { # define JEMALLOC_P(s) s #endif -extern const char *JEMALLOC_P(malloc_options); +#define ALLOCM_LG_ALIGN ((int)0x3f) +#if LG_SIZEOF_PTR == 2 +#define ALLOCM_ALIGN(a) (ffs(a)-1) +#else +#define ALLOCM_ALIGN(a) ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31) +#endif +#define ALLOCM_ZERO ((int)0x40) +#define ALLOCM_NO_MOVE ((int)0x80) + +#define ALLOCM_SUCCESS 0 +#define ALLOCM_ERR_OOM 1 +#define ALLOCM_ERR_NOT_MOVED 2 + +extern const char *JEMALLOC_P(malloc_conf); extern void (*JEMALLOC_P(malloc_message))(void *, const char *); void *JEMALLOC_P(malloc)(size_t size) JEMALLOC_ATTR(malloc); @@ -36,6 +52,14 @@ int JEMALLOC_P(mallctlnametomib)(const char *name, size_t *mibp, int JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) + JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, + size_t extra, int flags) JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) + JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(dallocm)(void *ptr, int flags) JEMALLOC_ATTR(nonnull(1)); + #ifdef __cplusplus }; #endif diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 8b98d670..b8f3f36b 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -13,6 +13,7 @@ * the API prefixing. */ #undef JEMALLOC_PREFIX +#undef JEMALLOC_CPREFIX #if (defined(JEMALLOC_PREFIX) && defined(JEMALLOC_MANGLE)) #undef JEMALLOC_P #endif @@ -31,6 +32,9 @@ # define JEMALLOC_ATTR(s) #endif +/* JEMALLOC_CC_SILENCE enables code that silences unuseful compiler warnings. */ +#undef JEMALLOC_CC_SILENCE + /* * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables * inline functions. @@ -92,6 +96,38 @@ /* TLS is used to map arenas and magazine caches to threads. */ #undef NO_TLS +/* + * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside + * within jemalloc-owned chunks before dereferencing them. + */ +#undef JEMALLOC_IVSALLOC + +/* + * Define overrides for non-standard allocator-related functions if they + * are present on the system. 
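
The experimental *allocm() entry points declared above combine allocation, alignment, zeroing, and move suppression behind one flags argument. A usage sketch, again assuming an unprefixed build and the default include path:

#include <stdio.h>
#include "jemalloc/jemalloc.h"

int
main(void)
{
        void *p;
        size_t rsize;

        /* 4 KiB request, 64-byte aligned and zero-filled. */
        if (allocm(&p, &rsize, 4096, ALLOCM_ALIGN(64) | ALLOCM_ZERO) !=
            ALLOCM_SUCCESS)
                return (1);
        printf("usable size: %zu\n", rsize);

        /* Ask to grow to 8 KiB, but only if the object need not move. */
        if (rallocm(&p, &rsize, 8192, 0, ALLOCM_NO_MOVE) ==
            ALLOCM_ERR_NOT_MOVED)
                printf("could not grow in place\n");

        dallocm(p, 0);
        return (0);
}
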
+ */ +#undef JEMALLOC_OVERRIDE_MEMALIGN +#undef JEMALLOC_OVERRIDE_VALLOC + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#undef JEMALLOC_ZONE +#undef JEMALLOC_ZONE_VERSION + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched. + * madvise(..., MADV_FREE) : On FreeBSD and Darwin, this marks pages as being + * unused, such that they will be discarded rather + * than swapped out. + */ +#undef JEMALLOC_PURGE_MADVISE_DONTNEED +#undef JEMALLOC_PURGE_MADVISE_FREE + /* sizeof(void *) == 2^LG_SIZEOF_PTR. */ #undef LG_SIZEOF_PTR diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index ee859fcb..3d4f8888 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -165,7 +165,7 @@ static arena_chunk_t *arena_chunk_alloc(arena_t *arena); static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk); static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero); -static void arena_purge(arena_t *arena); +static void arena_purge(arena_t *arena, bool all); static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty); static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize); @@ -174,16 +174,18 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); +static void arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, + arena_bin_t *bin); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); +static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, + arena_run_t *run, arena_bin_t *bin); static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t size, size_t oldsize); + void *ptr, size_t oldsize, size_t size); static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t size, size_t oldsize); -static bool arena_ralloc_large(void *ptr, size_t size, size_t oldsize); -#ifdef JEMALLOC_TINY -static size_t pow2_ceil(size_t x); -#endif + void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); +static bool arena_ralloc_large(void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero); static bool small_size2bin_init(void); #ifdef JEMALLOC_DEBUG static void small_size2bin_validate(void); @@ -281,12 +283,33 @@ arena_run_reg_dalloc(arena_run_t *run, void *ptr) assert(((uintptr_t)ptr - ((uintptr_t)run + (uintptr_t)run->bin->reg0_offset)) % (uintptr_t)run->bin->reg_size == 0); + /* + * Freeing a pointer lower than region zero can cause assertion + * failure. + */ + assert((uintptr_t)ptr >= (uintptr_t)run + + (uintptr_t)run->bin->reg0_offset); + /* + * Freeing a pointer past in the run's frontier can cause assertion + * failure. 
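
The two purge methods documented in jemalloc_defs.h.in above differ only in the madvise() advice value; which one configure selects depends on the operating system. A hypothetical helper showing the shape of the call (the name pages_purge and the bare #if/#elif switch are illustrative, not part of the patch):

#include <stddef.h>
#include <sys/mman.h>

static void
pages_purge(void *addr, size_t length)
{

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED)
        /* Linux: pages are discarded and demand-zeroed on next touch. */
        madvise(addr, length, MADV_DONTNEED);
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
        /* FreeBSD/Darwin: pages are marked reusable rather than swapped. */
        madvise(addr, length, MADV_FREE);
#else
        /* Purging unavailable; the pages simply remain resident. */
        (void)addr;
        (void)length;
#endif
}

int
main(void)
{
        size_t sz = 1 << 20;
        void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANON, -1, 0);

        if (p == MAP_FAILED)
                return (1);
        pages_purge(p, sz);
        munmap(p, sz);
        return (0);
}
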
+ */ + assert((uintptr_t)ptr < (uintptr_t)run->next); *(void **)ptr = run->avail; run->avail = ptr; run->nfree++; } +#ifdef JEMALLOC_DEBUG +static inline void +arena_chunk_validate_zeroed(arena_chunk_t *chunk, size_t run_ind) +{ + size_t *p = (size_t *)((uintptr_t)chunk + (run_ind << PAGE_SHIFT)); + for (size_t i = 0; i < PAGE_SIZE / sizeof(size_t); i++) + assert(p[i] == 0); +} +#endif + static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, bool zero) @@ -300,39 +323,40 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, old_ndirty = chunk->ndirty; run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - flag_dirty = chunk->map[run_ind].bits & CHUNK_MAP_DIRTY; + flag_dirty = chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY; runs_avail = (flag_dirty != 0) ? &arena->runs_avail_dirty : &arena->runs_avail_clean; - total_pages = (chunk->map[run_ind].bits & ~PAGE_MASK) >> + total_pages = (chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) >> PAGE_SHIFT; - assert((chunk->map[run_ind+total_pages-1].bits & CHUNK_MAP_DIRTY) == - flag_dirty); + assert((chunk->map[run_ind+total_pages-1-map_bias].bits & + CHUNK_MAP_DIRTY) == flag_dirty); need_pages = (size >> PAGE_SHIFT); assert(need_pages > 0); assert(need_pages <= total_pages); rem_pages = total_pages - need_pages; - arena_avail_tree_remove(runs_avail, &chunk->map[run_ind]); + arena_avail_tree_remove(runs_avail, &chunk->map[run_ind-map_bias]); arena->nactive += need_pages; /* Keep track of trailing unused pages for later use. */ if (rem_pages > 0) { if (flag_dirty != 0) { - chunk->map[run_ind+need_pages].bits = (rem_pages << - PAGE_SHIFT) | CHUNK_MAP_DIRTY; - chunk->map[run_ind+total_pages-1].bits = (rem_pages << - PAGE_SHIFT) | CHUNK_MAP_DIRTY; + chunk->map[run_ind+need_pages-map_bias].bits = + (rem_pages << PAGE_SHIFT) | CHUNK_MAP_DIRTY; + chunk->map[run_ind+total_pages-1-map_bias].bits = + (rem_pages << PAGE_SHIFT) | CHUNK_MAP_DIRTY; } else { - chunk->map[run_ind+need_pages].bits = (rem_pages << - PAGE_SHIFT) | (chunk->map[run_ind+need_pages].bits & - CHUNK_MAP_ZEROED); - chunk->map[run_ind+total_pages-1].bits = (rem_pages << - PAGE_SHIFT) | - (chunk->map[run_ind+total_pages-1].bits & - CHUNK_MAP_ZEROED); + chunk->map[run_ind+need_pages-map_bias].bits = + (rem_pages << PAGE_SHIFT) | + (chunk->map[run_ind+need_pages-map_bias].bits & + CHUNK_MAP_UNZEROED); + chunk->map[run_ind+total_pages-1-map_bias].bits = + (rem_pages << PAGE_SHIFT) | + (chunk->map[run_ind+total_pages-1-map_bias].bits & + CHUNK_MAP_UNZEROED); } arena_avail_tree_insert(runs_avail, - &chunk->map[run_ind+need_pages]); + &chunk->map[run_ind+need_pages-map_bias]); } /* Update dirty page accounting. */ @@ -353,13 +377,19 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, * zeroed (i.e. never before touched). */ for (i = 0; i < need_pages; i++) { - if ((chunk->map[run_ind + i].bits & - CHUNK_MAP_ZEROED) == 0) { + if ((chunk->map[run_ind+i-map_bias].bits + & CHUNK_MAP_UNZEROED) != 0) { memset((void *)((uintptr_t) - chunk + ((run_ind + i) << + chunk + ((run_ind+i) << PAGE_SHIFT)), 0, PAGE_SIZE); } +#ifdef JEMALLOC_DEBUG + else { + arena_chunk_validate_zeroed( + chunk, run_ind+i); + } +#endif } } else { /* @@ -376,27 +406,54 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, * Set the last element first, in case the run only contains one * page (i.e. both statements set the same element). 
*/ - chunk->map[run_ind+need_pages-1].bits = CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED | flag_dirty; - chunk->map[run_ind].bits = size | CHUNK_MAP_LARGE | -#ifdef JEMALLOC_PROF - CHUNK_MAP_CLASS_MASK | -#endif - CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind+need_pages-1-map_bias].bits = + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind-map_bias].bits = size | flag_dirty | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; } else { assert(zero == false); /* - * Propagate the dirty flag to the allocated small run, so that - * arena_dalloc_bin_run() has the ability to conditionally trim - * clean pages. + * Propagate the dirty and unzeroed flags to the allocated + * small run, so that arena_dalloc_bin_run() has the ability to + * conditionally trim clean pages. */ - chunk->map[run_ind].bits = CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind-map_bias].bits = + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) | + CHUNK_MAP_ALLOCATED | flag_dirty; +#ifdef JEMALLOC_DEBUG + /* + * The first page will always be dirtied during small run + * initialization, so a validation failure here would not + * actually cause an observable failure. + */ + if (flag_dirty == 0 && + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) + == 0) + arena_chunk_validate_zeroed(chunk, run_ind); +#endif for (i = 1; i < need_pages - 1; i++) { - chunk->map[run_ind + i].bits = (i << PAGE_SHIFT) - | CHUNK_MAP_ALLOCATED; + chunk->map[run_ind+i-map_bias].bits = (i << PAGE_SHIFT) + | (chunk->map[run_ind+i-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED; +#ifdef JEMALLOC_DEBUG + if (flag_dirty == 0 && + (chunk->map[run_ind+i-map_bias].bits & + CHUNK_MAP_UNZEROED) == 0) + arena_chunk_validate_zeroed(chunk, run_ind+i); +#endif } - chunk->map[run_ind + need_pages - 1].bits = ((need_pages - 1) << - PAGE_SHIFT) | CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind+need_pages-1-map_bias].bits = ((need_pages + - 1) << PAGE_SHIFT) | + (chunk->map[run_ind+need_pages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED | flag_dirty; +#ifdef JEMALLOC_DEBUG + if (flag_dirty == 0 && + (chunk->map[run_ind+need_pages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) == 0) { + arena_chunk_validate_zeroed(chunk, + run_ind+need_pages-1); + } +#endif } } @@ -413,20 +470,24 @@ arena_chunk_alloc(arena_t *arena) arena->spare = NULL; /* Insert the run into the appropriate runs_avail_* tree. */ - if ((chunk->map[arena_chunk_header_npages].bits & - CHUNK_MAP_DIRTY) == 0) + if ((chunk->map[0].bits & CHUNK_MAP_DIRTY) == 0) runs_avail = &arena->runs_avail_clean; else runs_avail = &arena->runs_avail_dirty; - arena_avail_tree_insert(runs_avail, - &chunk->map[arena_chunk_header_npages]); + assert((chunk->map[0].bits & ~PAGE_MASK) == arena_maxclass); + assert((chunk->map[chunk_npages-1-map_bias].bits & ~PAGE_MASK) + == arena_maxclass); + assert((chunk->map[0].bits & CHUNK_MAP_DIRTY) == + (chunk->map[chunk_npages-1-map_bias].bits & + CHUNK_MAP_DIRTY)); + arena_avail_tree_insert(runs_avail, &chunk->map[0]); } else { bool zero; - size_t zeroed; + size_t unzeroed; zero = false; malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc(chunksize, &zero); + chunk = (arena_chunk_t *)chunk_alloc(chunksize, false, &zero); malloc_mutex_lock(&arena->lock); if (chunk == NULL) return (NULL); @@ -449,17 +510,28 @@ arena_chunk_alloc(arena_t *arena) * Mark the pages as zeroed iff chunk_alloc() returned a zeroed * chunk. */ - zeroed = zero ? 
CHUNK_MAP_ZEROED : 0; - for (i = 0; i < arena_chunk_header_npages; i++) - chunk->map[i].bits = 0; - chunk->map[i].bits = arena_maxclass | zeroed; - for (i++; i < chunk_npages-1; i++) - chunk->map[i].bits = zeroed; - chunk->map[chunk_npages-1].bits = arena_maxclass | zeroed; + unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED; + chunk->map[0].bits = arena_maxclass | unzeroed; + /* + * There is no need to initialize the internal page map entries + * unless the chunk is not zeroed. + */ + if (zero == false) { + for (i = map_bias+1; i < chunk_npages-1; i++) + chunk->map[i-map_bias].bits = unzeroed; + } +#ifdef JEMALLOC_DEBUG + else { + for (i = map_bias+1; i < chunk_npages-1; i++) + assert(chunk->map[i-map_bias].bits == unzeroed); + } +#endif + chunk->map[chunk_npages-1-map_bias].bits = arena_maxclass | + unzeroed; /* Insert the run into the runs_avail_clean tree. */ arena_avail_tree_insert(&arena->runs_avail_clean, - &chunk->map[arena_chunk_header_npages]); + &chunk->map[0]); } return (chunk); @@ -474,13 +546,11 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) * Remove run from the appropriate runs_avail_* tree, so that the arena * does not use it. */ - if ((chunk->map[arena_chunk_header_npages].bits & - CHUNK_MAP_DIRTY) == 0) + if ((chunk->map[0].bits & CHUNK_MAP_DIRTY) == 0) runs_avail = &arena->runs_avail_clean; else runs_avail = &arena->runs_avail_dirty; - arena_avail_tree_remove(runs_avail, - &chunk->map[arena_chunk_header_npages]); + arena_avail_tree_remove(runs_avail, &chunk->map[0]); if (arena->spare != NULL) { arena_chunk_t *spare = arena->spare; @@ -516,8 +586,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -527,8 +598,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -541,8 +613,8 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) */ chunk = arena_chunk_alloc(arena); if (chunk != NULL) { - run = (arena_run_t *)((uintptr_t)chunk + - (arena_chunk_header_npages << PAGE_SHIFT)); + run = (arena_run_t *)((uintptr_t)chunk + (map_bias << + PAGE_SHIFT)); arena_run_split(arena, run, size, large, zero); return (run); } @@ -555,8 +627,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -566,8 +639,9 @@ arena_run_alloc(arena_t *arena, size_t size, 
bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -587,7 +661,7 @@ arena_maybe_purge(arena_t *arena) (arena->ndirty - arena->npurgatory) > chunk_npages && (arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty - arena->npurgatory)) - arena_purge(arena); + arena_purge(arena, false); } static inline void @@ -595,7 +669,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) { ql_head(arena_chunk_map_t) mapelms; arena_chunk_map_t *mapelm; - size_t pageind, flag_zeroed; + size_t pageind, flag_unzeroed; #ifdef JEMALLOC_DEBUG size_t ndirty; #endif @@ -605,11 +679,19 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) ql_new(&mapelms); - flag_zeroed = -#ifdef JEMALLOC_SWAP - swap_enabled ? 0 : + flag_unzeroed = +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED + /* + * madvise(..., MADV_DONTNEED) results in zero-filled pages for anonymous + * mappings, but not for file-backed mappings. + */ +# ifdef JEMALLOC_SWAP + swap_enabled ? CHUNK_MAP_UNZEROED : +# endif + 0; +#else + CHUNK_MAP_UNZEROED; #endif - CHUNK_MAP_ZEROED; /* * If chunk is the spare, temporarily re-allocate it, 1) so that its @@ -627,14 +709,13 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) * run. */ if (chunk == arena->spare) { - assert((chunk->map[arena_chunk_header_npages].bits & - CHUNK_MAP_DIRTY) != 0); + assert((chunk->map[0].bits & CHUNK_MAP_DIRTY) != 0); arena_chunk_alloc(arena); } /* Temporarily allocate all free dirty runs within chunk. */ - for (pageind = arena_chunk_header_npages; pageind < chunk_npages;) { - mapelm = &chunk->map[pageind]; + for (pageind = map_bias; pageind < chunk_npages;) { + mapelm = &chunk->map[pageind-map_bias]; if ((mapelm->bits & CHUNK_MAP_ALLOCATED) == 0) { size_t npages; @@ -646,25 +727,22 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) arena_avail_tree_remove( &arena->runs_avail_dirty, mapelm); + mapelm->bits = (npages << PAGE_SHIFT) | + flag_unzeroed | CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED; /* * Update internal elements in the page map, so - * that CHUNK_MAP_ZEROED is properly set. - * madvise(..., MADV_DONTNEED) results in - * zero-filled pages for anonymous mappings, - * but not for file-backed mappings. + * that CHUNK_MAP_UNZEROED is properly set. 
*/ - mapelm->bits = (npages << PAGE_SHIFT) | - CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | - flag_zeroed; for (i = 1; i < npages - 1; i++) { - chunk->map[pageind + i].bits = - flag_zeroed; + chunk->map[pageind+i-map_bias].bits = + flag_unzeroed; } if (npages > 1) { - chunk->map[pageind + npages - 1].bits = - (npages << PAGE_SHIFT) | - CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | - flag_zeroed; + chunk->map[ + pageind+npages-1-map_bias].bits = + flag_unzeroed | CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED; } arena->nactive += npages; @@ -706,8 +784,8 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) nmadvise = 0; #endif ql_foreach(mapelm, &mapelms, u.ql_link) { - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t)) + map_bias; size_t npages = mapelm->bits >> PAGE_SHIFT; assert(pageind + npages <= chunk_npages); @@ -715,8 +793,17 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) assert(ndirty >= npages); ndirty -= npages; #endif + +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), (npages << PAGE_SHIFT), MADV_DONTNEED); +#elif defined(JEMALLOC_PURGE_MADVISE_FREE) + madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), + (npages << PAGE_SHIFT), MADV_FREE); +#else +# error "No method defined for purging unused dirty pages." +#endif + #ifdef JEMALLOC_STATS nmadvise++; #endif @@ -732,8 +819,8 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) /* Deallocate runs. */ for (mapelm = ql_first(&mapelms); mapelm != NULL; mapelm = ql_first(&mapelms)) { - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t)) + map_bias; arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << PAGE_SHIFT)); @@ -743,7 +830,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) } static void -arena_purge(arena_t *arena) +arena_purge(arena_t *arena, bool all) { arena_chunk_t *chunk; size_t npurgatory; @@ -757,8 +844,8 @@ arena_purge(arena_t *arena) assert(ndirty == arena->ndirty); #endif assert(arena->ndirty > arena->npurgatory); - assert(arena->ndirty > chunk_npages); - assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty); + assert(arena->ndirty > chunk_npages || all); + assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty || all); #ifdef JEMALLOC_STATS arena->stats.npurge++; @@ -769,8 +856,9 @@ arena_purge(arena_t *arena) * purge, and add the result to arena->npurgatory. This will keep * multiple threads from racing to reduce ndirty below the threshold. 
*/ - npurgatory = (arena->ndirty - arena->npurgatory) - (arena->nactive >> - opt_lg_dirty_mult); + npurgatory = arena->ndirty - arena->npurgatory; + if (all == false) + npurgatory -= arena->nactive >> opt_lg_dirty_mult; arena->npurgatory += npurgatory; while (npurgatory > 0) { @@ -826,6 +914,15 @@ arena_purge(arena_t *arena) } } +void +arena_purge_all(arena_t *arena) +{ + + malloc_mutex_lock(&arena->lock); + arena_purge(arena, true); + malloc_mutex_unlock(&arena->lock); +} + static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) { @@ -836,11 +933,18 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - assert(run_ind >= arena_chunk_header_npages); + assert(run_ind >= map_bias); assert(run_ind < chunk_npages); - if ((chunk->map[run_ind].bits & CHUNK_MAP_LARGE) != 0) - size = chunk->map[run_ind].bits & ~PAGE_MASK; - else + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_LARGE) != 0) { + size = chunk->map[run_ind-map_bias].bits & ~PAGE_MASK; + assert(size == PAGE_SIZE || + (chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & + ~PAGE_MASK) == 0); + assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & + CHUNK_MAP_LARGE) != 0); + assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & + CHUNK_MAP_ALLOCATED) != 0); + } else size = run->bin->run_size; run_pages = (size >> PAGE_SHIFT); arena->nactive -= run_pages; @@ -849,7 +953,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) * The run is dirty if the caller claims to have dirtied it, as well as * if it was already dirty before being allocated. */ - if ((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) != 0) + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) != 0) dirty = true; flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0; runs_avail = dirty ? &arena->runs_avail_dirty : @@ -857,72 +961,91 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) /* Mark pages as unallocated in the chunk map. */ if (dirty) { - chunk->map[run_ind].bits = size | flag_dirty; - chunk->map[run_ind+run_pages-1].bits = size | flag_dirty; + chunk->map[run_ind-map_bias].bits = size | CHUNK_MAP_DIRTY; + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + CHUNK_MAP_DIRTY; chunk->ndirty += run_pages; arena->ndirty += run_pages; } else { - chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits & - CHUNK_MAP_ZEROED); - chunk->map[run_ind+run_pages-1].bits = size | - (chunk->map[run_ind+run_pages-1].bits & CHUNK_MAP_ZEROED); + chunk->map[run_ind-map_bias].bits = size | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED); + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + (chunk->map[run_ind+run_pages-1-map_bias].bits & + CHUNK_MAP_UNZEROED); } /* Try to coalesce forward. */ if (run_ind + run_pages < chunk_npages && - (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_ALLOCATED) == 0 && - (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_DIRTY) == - flag_dirty) { - size_t nrun_size = chunk->map[run_ind+run_pages].bits & + (chunk->map[run_ind+run_pages-map_bias].bits & CHUNK_MAP_ALLOCATED) + == 0 && (chunk->map[run_ind+run_pages-map_bias].bits & + CHUNK_MAP_DIRTY) == flag_dirty) { + size_t nrun_size = chunk->map[run_ind+run_pages-map_bias].bits & ~PAGE_MASK; + size_t nrun_pages = nrun_size >> PAGE_SHIFT; /* * Remove successor from runs_avail; the coalesced run is * inserted later. 
*/ + assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits + & ~PAGE_MASK) == nrun_size); + assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits + & CHUNK_MAP_ALLOCATED) == 0); + assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits + & CHUNK_MAP_DIRTY) == flag_dirty); arena_avail_tree_remove(runs_avail, - &chunk->map[run_ind+run_pages]); + &chunk->map[run_ind+run_pages-map_bias]); size += nrun_size; - run_pages = size >> PAGE_SHIFT; + run_pages += nrun_pages; - assert((chunk->map[run_ind+run_pages-1].bits & ~PAGE_MASK) - == nrun_size); - chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits & - CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind+run_pages-1].bits = size | - (chunk->map[run_ind+run_pages-1].bits & + chunk->map[run_ind-map_bias].bits = size | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_FLAGS_MASK); } /* Try to coalesce backward. */ - if (run_ind > arena_chunk_header_npages && (chunk->map[run_ind-1].bits & - CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[run_ind-1].bits & + if (run_ind > map_bias && (chunk->map[run_ind-1-map_bias].bits & + CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[run_ind-1-map_bias].bits & CHUNK_MAP_DIRTY) == flag_dirty) { - size_t prun_size = chunk->map[run_ind-1].bits & ~PAGE_MASK; + size_t prun_size = chunk->map[run_ind-1-map_bias].bits & + ~PAGE_MASK; + size_t prun_pages = prun_size >> PAGE_SHIFT; - run_ind -= prun_size >> PAGE_SHIFT; + run_ind -= prun_pages; /* * Remove predecessor from runs_avail; the coalesced run is * inserted later. */ - arena_avail_tree_remove(runs_avail, &chunk->map[run_ind]); + assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) + == prun_size); + assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_ALLOCATED) + == 0); + assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) + == flag_dirty); + arena_avail_tree_remove(runs_avail, + &chunk->map[run_ind-map_bias]); size += prun_size; - run_pages = size >> PAGE_SHIFT; + run_pages += prun_pages; - assert((chunk->map[run_ind].bits & ~PAGE_MASK) == prun_size); - chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits & - CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind+run_pages-1].bits = size | - (chunk->map[run_ind+run_pages-1].bits & + chunk->map[run_ind-map_bias].bits = size | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_FLAGS_MASK); } /* Insert into runs_avail, now that coalescing is complete. */ - arena_avail_tree_insert(runs_avail, &chunk->map[run_ind]); + assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) == + (chunk->map[run_ind+run_pages-1-map_bias].bits & ~PAGE_MASK)); + assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) == + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_DIRTY)); + arena_avail_tree_insert(runs_avail, &chunk->map[run_ind-map_bias]); if (dirty) { /* @@ -941,8 +1064,8 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) * manipulation checks whether the first run is unallocated and extends * to the end of the chunk. 
*/ - if ((chunk->map[arena_chunk_header_npages].bits & (~PAGE_MASK | - CHUNK_MAP_ALLOCATED)) == arena_maxclass) + if ((chunk->map[0].bits & (~PAGE_MASK | CHUNK_MAP_ALLOCATED)) == + arena_maxclass) arena_chunk_dealloc(arena, chunk); /* @@ -962,18 +1085,40 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, { size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT; size_t head_npages = (oldsize - newsize) >> PAGE_SHIFT; - size_t flags = chunk->map[pageind].bits & CHUNK_MAP_FLAGS_MASK; + size_t flag_dirty = chunk->map[pageind-map_bias].bits & CHUNK_MAP_DIRTY; assert(oldsize > newsize); /* * Update the chunk map so that arena_run_dalloc() can treat the - * leading run as separately allocated. + * leading run as separately allocated. Set the last element of each + * run first, in case of single-page runs. */ - assert(chunk->map[pageind].bits & CHUNK_MAP_LARGE); - assert(chunk->map[pageind].bits & CHUNK_MAP_ALLOCATED); - chunk->map[pageind].bits = (oldsize - newsize) | flags; - chunk->map[pageind+head_npages].bits = newsize | flags; + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0); + chunk->map[pageind+head_npages-1-map_bias].bits = flag_dirty | + (chunk->map[pageind+head_npages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind-map_bias].bits = (oldsize - newsize) + | flag_dirty | (chunk->map[pageind-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + +#ifdef JEMALLOC_DEBUG + { + size_t tail_npages = newsize >> PAGE_SHIFT; + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & ~PAGE_MASK) == 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & CHUNK_MAP_DIRTY) == flag_dirty); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & CHUNK_MAP_ALLOCATED) != 0); + } +#endif + chunk->map[pageind+head_npages-map_bias].bits = newsize | flag_dirty | + (chunk->map[pageind+head_npages-map_bias].bits & + CHUNK_MAP_FLAGS_MASK) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; arena_run_dalloc(arena, run, false); } @@ -983,20 +1128,40 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize, bool dirty) { size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT; - size_t npages = newsize >> PAGE_SHIFT; - size_t flags = chunk->map[pageind].bits & CHUNK_MAP_FLAGS_MASK; + size_t head_npages = newsize >> PAGE_SHIFT; + size_t tail_npages = (oldsize - newsize) >> PAGE_SHIFT; + size_t flag_dirty = chunk->map[pageind-map_bias].bits & + CHUNK_MAP_DIRTY; assert(oldsize > newsize); /* * Update the chunk map so that arena_run_dalloc() can treat the - * trailing run as separately allocated. + * trailing run as separately allocated. Set the last element of each + * run first, in case of single-page runs. 
*/ - assert(chunk->map[pageind].bits & CHUNK_MAP_LARGE); - assert(chunk->map[pageind].bits & CHUNK_MAP_ALLOCATED); - chunk->map[pageind].bits = newsize | flags; - chunk->map[pageind+npages-1].bits = newsize | flags; - chunk->map[pageind+npages].bits = (oldsize - newsize) | flags; + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0); + chunk->map[pageind+head_npages-1-map_bias].bits = flag_dirty | + (chunk->map[pageind+head_npages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind-map_bias].bits = newsize | flag_dirty | + (chunk->map[pageind-map_bias].bits & CHUNK_MAP_UNZEROED) | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + ~PAGE_MASK) == 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + CHUNK_MAP_ALLOCATED) != 0); + chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits = + flag_dirty | + (chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind+head_npages-map_bias].bits = (oldsize - newsize) | + flag_dirty | (chunk->map[pageind+head_npages-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize), dirty); @@ -1018,8 +1183,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) arena_run_tree_remove(&bin->runs, mapelm); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t)); + pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t))) + map_bias; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); @@ -1039,7 +1204,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) /* Initialize run internals. */ run->bin = bin; run->avail = NULL; - run->next = (void *)(((uintptr_t)run) + + run->next = (void *)((uintptr_t)run + (uintptr_t)bin->reg0_offset); run->nfree = bin->nregs; #ifdef JEMALLOC_DEBUG @@ -1061,7 +1226,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) /* * arena_run_alloc() failed, but another thread may have made - * sufficient memory available while this one dopped bin->lock above, + * sufficient memory available while this one dropped bin->lock above, * so search one more time. 
*/ mapelm = arena_run_tree_first(&bin->runs); @@ -1073,8 +1238,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) arena_run_tree_remove(&bin->runs, mapelm); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t)); + pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t))) + map_bias; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); @@ -1105,11 +1270,21 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) assert(bin->runcur->nfree > 0); ret = arena_run_reg_alloc(bin->runcur, bin); if (run != NULL) { - malloc_mutex_unlock(&bin->lock); - malloc_mutex_lock(&arena->lock); - arena_run_dalloc(arena, run, false); - malloc_mutex_unlock(&arena->lock); - malloc_mutex_lock(&bin->lock); + arena_chunk_t *chunk; + + /* + * arena_run_alloc() may have allocated run, or it may + * have pulled it from the bin's run tree. Therefore + * it is unsafe to make any assumptions about how run + * has previously been used, and arena_bin_lower_run() + * must be called, as if a region were just deallocated + * from the run. + */ + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + if (run->nfree == bin->nregs) + arena_dalloc_bin_run(arena, chunk, run, bin); + else + arena_bin_lower_run(arena, chunk, run, bin); } return (ret); } @@ -1424,17 +1599,19 @@ arena_malloc(size_t size, bool zero) /* Only handles large allocations that require more than page alignment. */ void * -arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) +arena_palloc(arena_t *arena, size_t size, size_t alloc_size, size_t alignment, + bool zero) { void *ret; size_t offset; arena_chunk_t *chunk; assert((size & PAGE_MASK) == 0); - assert((alignment & PAGE_MASK) == 0); + + alignment = PAGE_CEILING(alignment); malloc_mutex_lock(&arena->lock); - ret = (void *)arena_run_alloc(arena, alloc_size, true, false); + ret = (void *)arena_run_alloc(arena, alloc_size, true, zero); if (ret == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); @@ -1482,10 +1659,12 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) malloc_mutex_unlock(&arena->lock); #ifdef JEMALLOC_FILL - if (opt_junk) - memset(ret, 0xa5, size); - else if (opt_zero) - memset(ret, 0, size); + if (zero == false) { + if (opt_junk) + memset(ret, 0xa5, size); + else if (opt_zero) + memset(ret, 0, size); + } #endif return (ret); } @@ -1502,8 +1681,8 @@ arena_salloc(const void *ptr) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) { arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + @@ -1535,11 +1714,11 @@ arena_prof_promoted(const void *ptr, size_t size) assert(isalloc(ptr) == PAGE_SIZE); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; binind = small_size2bin[size]; assert(binind < nbins); - chunk->map[pageind].bits = (chunk->map[pageind].bits & - ~CHUNK_MAP_CLASS_MASK) | (binind << CHUNK_MAP_CLASS_SHIFT); + chunk->map[pageind-map_bias].bits = (chunk->map[pageind-map_bias].bits & + ~CHUNK_MAP_CLASS_MASK) 
| ((binind+1) << CHUNK_MAP_CLASS_SHIFT); } size_t @@ -1553,8 +1732,8 @@ arena_salloc_demote(const void *ptr) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) { arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + @@ -1569,9 +1748,9 @@ arena_salloc_demote(const void *ptr) assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = mapbits & ~PAGE_MASK; if (prof_promote && ret == PAGE_SIZE && (mapbits & - CHUNK_MAP_CLASS_MASK) != CHUNK_MAP_CLASS_MASK) { + CHUNK_MAP_CLASS_MASK) != 0) { size_t binind = ((mapbits & CHUNK_MAP_CLASS_MASK) >> - CHUNK_MAP_CLASS_SHIFT); + CHUNK_MAP_CLASS_SHIFT) - 1; assert(binind < nbins); ret = chunk->arena->bins[binind].reg_size; } @@ -1580,144 +1759,12 @@ arena_salloc_demote(const void *ptr) return (ret); } - -static inline unsigned -arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, - size_t size) -{ - unsigned shift, diff, regind; - - assert(run->magic == ARENA_RUN_MAGIC); - - /* - * Avoid doing division with a variable divisor if possible. Using - * actual division here can reduce allocator throughput by over 20%! - */ - diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); - - /* Rescale (factor powers of 2 out of the numerator and denominator). */ - shift = ffs(size) - 1; - diff >>= shift; - size >>= shift; - - if (size == 1) { - /* The divisor was a power of 2. */ - regind = diff; - } else { - /* - * To divide by a number D that is not a power of two we - * multiply by (2^21 / D) and then right shift by 21 positions. - * - * X / D - * - * becomes - * - * (X * size_invs[D - 3]) >> SIZE_INV_SHIFT - * - * We can omit the first three elements, because we never - * divide by 0, and 1 and 2 are both powers of two, which are - * handled above. 
- */ -#define SIZE_INV_SHIFT 21 -#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) - static const unsigned size_invs[] = { - SIZE_INV(3), - SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), - SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11), - SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15), - SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19), - SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23), - SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27), - SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) - }; - - if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2)) - regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT; - else - regind = diff / size; -#undef SIZE_INV -#undef SIZE_INV_SHIFT - } - assert(diff == regind * size); - assert(regind < bin->nregs); - - return (regind); -} - -prof_ctx_t * -arena_prof_ctx_get(const void *ptr) -{ - prof_ctx_t *ret; - arena_chunk_t *chunk; - size_t pageind, mapbits; - - assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - if (prof_promote) - ret = (prof_ctx_t *)(uintptr_t)1U; - else { - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << - PAGE_SHIFT)); - arena_bin_t *bin = run->bin; - unsigned regind; - - assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); - ret = *(prof_ctx_t **)((uintptr_t)run + - bin->ctx0_offset + (regind * - sizeof(prof_ctx_t *))); - } - } else - ret = chunk->map[pageind].prof_ctx; - - return (ret); -} - -void -arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) -{ - arena_chunk_t *chunk; - size_t pageind, mapbits; - - assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - if (prof_promote == false) { - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << - PAGE_SHIFT)); - arena_bin_t *bin = run->bin; - unsigned regind; - - assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); - - *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset - + (regind * sizeof(prof_ctx_t *)))) = ctx; - } else - assert((uintptr_t)ctx == (uintptr_t)1U); - } else - chunk->map[pageind].prof_ctx = ctx; -} #endif static void -arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, +arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { - size_t npages, run_ind, past; /* Dissociate run from bin. 
*/ if (run == bin->runcur) @@ -1725,7 +1772,8 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, else if (bin->nregs != 1) { size_t run_pageind = (((uintptr_t)run - (uintptr_t)chunk)) >> PAGE_SHIFT; - arena_chunk_map_t *run_mapelm = &chunk->map[run_pageind]; + arena_chunk_map_t *run_mapelm = + &chunk->map[run_pageind-map_bias]; /* * This block's conditional is necessary because if the run * only contains one region, then it never gets inserted into @@ -1733,13 +1781,24 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, */ arena_run_tree_remove(&bin->runs, run_mapelm); } +} + +static void +arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, + arena_bin_t *bin) +{ + size_t npages, run_ind, past; + + assert(run != bin->runcur); + assert(arena_run_tree_search(&bin->runs, &chunk->map[ + (((uintptr_t)run-(uintptr_t)chunk)>>PAGE_SHIFT)-map_bias]) == NULL); malloc_mutex_unlock(&bin->lock); /******************************/ npages = bin->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - past = (size_t)(((uintptr_t)run->next - (uintptr_t)1U - - (uintptr_t)chunk) >> PAGE_SHIFT) + 1; + past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) + >> PAGE_SHIFT); malloc_mutex_lock(&arena->lock); /* @@ -1747,19 +1806,21 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * trim the clean pages before deallocating the dirty portion of the * run. */ - if ((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) == 0 && past - run_ind - < npages) { + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) == 0 && past + - run_ind < npages) { /* * Trim clean pages. Convert to large run beforehand. Set the * last map element first, in case this is a one-page run. */ - chunk->map[run_ind+npages-1].bits = CHUNK_MAP_LARGE | - (chunk->map[run_ind].bits & CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind].bits = bin->run_size | CHUNK_MAP_LARGE | - (chunk->map[run_ind].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind+npages-1-map_bias].bits = CHUNK_MAP_LARGE | + (chunk->map[run_ind+npages-1-map_bias].bits & + CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind-map_bias].bits = bin->run_size | + CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits & + CHUNK_MAP_FLAGS_MASK); arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT), - ((npages - (past - run_ind)) << PAGE_SHIFT), false); - npages = past - run_ind; + ((past - run_ind) << PAGE_SHIFT), false); + /* npages = past - run_ind; */ } #ifdef JEMALLOC_DEBUG run->magic = 0; @@ -1773,6 +1834,42 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, #endif } +static void +arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, + arena_bin_t *bin) +{ + + /* + * Make sure that bin->runcur always refers to the lowest non-full run, + * if one exists. + */ + if (bin->runcur == NULL) + bin->runcur = run; + else if ((uintptr_t)run < (uintptr_t)bin->runcur) { + /* Switch runcur. */ + if (bin->runcur->nfree > 0) { + arena_chunk_t *runcur_chunk = + CHUNK_ADDR2BASE(bin->runcur); + size_t runcur_pageind = (((uintptr_t)bin->runcur - + (uintptr_t)runcur_chunk)) >> PAGE_SHIFT; + arena_chunk_map_t *runcur_mapelm = + &runcur_chunk->map[runcur_pageind-map_bias]; + + /* Insert runcur. 
*/ + arena_run_tree_insert(&bin->runs, runcur_mapelm); + } + bin->runcur = run; + } else { + size_t run_pageind = (((uintptr_t)run - + (uintptr_t)chunk)) >> PAGE_SHIFT; + arena_chunk_map_t *run_mapelm = + &chunk->map[run_pageind-map_bias]; + + assert(arena_run_tree_search(&bin->runs, run_mapelm) == NULL); + arena_run_tree_insert(&bin->runs, run_mapelm); + } +} + void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_chunk_map_t *mapelm) @@ -1784,7 +1881,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t size; #endif - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); @@ -1799,43 +1896,11 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, #endif arena_run_reg_dalloc(run, ptr); - - if (run->nfree == bin->nregs) + if (run->nfree == bin->nregs) { + arena_dissociate_bin_run(chunk, run, bin); arena_dalloc_bin_run(arena, chunk, run, bin); - else if (run->nfree == 1 && run != bin->runcur) { - /* - * Make sure that bin->runcur always refers to the lowest - * non-full run, if one exists. - */ - if (bin->runcur == NULL) - bin->runcur = run; - else if ((uintptr_t)run < (uintptr_t)bin->runcur) { - /* Switch runcur. */ - if (bin->runcur->nfree > 0) { - arena_chunk_t *runcur_chunk = - CHUNK_ADDR2BASE(bin->runcur); - size_t runcur_pageind = - (((uintptr_t)bin->runcur - - (uintptr_t)runcur_chunk)) >> PAGE_SHIFT; - arena_chunk_map_t *runcur_mapelm = - &runcur_chunk->map[runcur_pageind]; - - /* Insert runcur. */ - arena_run_tree_insert(&bin->runs, - runcur_mapelm); - } - bin->runcur = run; - } else { - size_t run_pageind = (((uintptr_t)run - - (uintptr_t)chunk)) >> PAGE_SHIFT; - arena_chunk_map_t *run_mapelm = - &chunk->map[run_pageind]; - - assert(arena_run_tree_search(&bin->runs, run_mapelm) == - NULL); - arena_run_tree_insert(&bin->runs, run_mapelm); - } - } + } else if (run->nfree == 1 && run != bin->runcur) + arena_bin_lower_run(arena, chunk, run, bin); #ifdef JEMALLOC_STATS bin->stats.allocated -= size; @@ -1908,7 +1973,7 @@ arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) #if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - size_t size = chunk->map[pageind].bits & ~PAGE_MASK; + size_t size = chunk->map[pageind-map_bias].bits & ~PAGE_MASK; #endif #ifdef JEMALLOC_FILL @@ -1930,7 +1995,7 @@ arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t size, size_t oldsize) + size_t oldsize, size_t size) { assert(size < oldsize); @@ -1965,50 +2030,71 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t size, size_t oldsize) + size_t oldsize, size_t size, size_t extra, bool zero) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; size_t npages = oldsize >> PAGE_SHIFT; + size_t followsize; - assert(oldsize == (chunk->map[pageind].bits & ~PAGE_MASK)); + assert(oldsize == (chunk->map[pageind-map_bias].bits & ~PAGE_MASK)); /* Try to extend the run. 
*/ - assert(size > oldsize); + assert(size + extra > oldsize); malloc_mutex_lock(&arena->lock); - if (pageind + npages < chunk_npages && (chunk->map[pageind+npages].bits - & CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[pageind+npages].bits & - ~PAGE_MASK) >= size - oldsize) { + if (pageind + npages < chunk_npages && + (chunk->map[pageind+npages-map_bias].bits + & CHUNK_MAP_ALLOCATED) == 0 && (followsize = + chunk->map[pageind+npages-map_bias].bits & ~PAGE_MASK) >= size - + oldsize) { /* * The next run is available and sufficiently large. Split the * following run, then merge the first part with the existing * allocation. */ + size_t flag_dirty; + size_t splitsize = (oldsize + followsize <= size + extra) + ? followsize : size + extra - oldsize; arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk + - ((pageind+npages) << PAGE_SHIFT)), size - oldsize, true, - false); + ((pageind+npages) << PAGE_SHIFT)), splitsize, true, zero); - chunk->map[pageind].bits = size | CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED; - chunk->map[pageind+npages].bits = CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED; + size = oldsize + splitsize; + npages = size >> PAGE_SHIFT; + + /* + * Mark the extended run as dirty if either portion of the run + * was dirty before allocation. This is rather pedantic, + * because there's not actually any sequence of events that + * could cause the resulting run to be passed to + * arena_run_dalloc() with the dirty argument set to false + * (which is when dirty flag consistency would really matter). + */ + flag_dirty = (chunk->map[pageind-map_bias].bits & + CHUNK_MAP_DIRTY) | + (chunk->map[pageind+npages-1-map_bias].bits & + CHUNK_MAP_DIRTY); + chunk->map[pageind-map_bias].bits = size | flag_dirty + | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind+npages-1-map_bias].bits = flag_dirty | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; #ifdef JEMALLOC_STATS - arena->stats.ndalloc_large++; - arena->stats.allocated_large -= oldsize; - arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++; - arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--; + arena->stats.ndalloc_large++; + arena->stats.allocated_large -= oldsize; + arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++; + arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--; - arena->stats.nmalloc_large++; - arena->stats.nrequests_large++; - arena->stats.allocated_large += size; - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; - if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) { - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns = - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns; - } + arena->stats.nmalloc_large++; + arena->stats.nrequests_large++; + arena->stats.allocated_large += size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; + if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) { + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns = + arena->stats.lstats[(size >> PAGE_SHIFT) - + 1].curruns; + } #endif malloc_mutex_unlock(&arena->lock); return (false); @@ -2023,11 +2109,12 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, * always fail if growing an object, and the following run is 
already in use. */ static bool -arena_ralloc_large(void *ptr, size_t size, size_t oldsize) +arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, + bool zero) { size_t psize; - psize = PAGE_CEILING(size); + psize = PAGE_CEILING(size + extra); if (psize == oldsize) { /* Same size class. */ #ifdef JEMALLOC_FILL @@ -2053,14 +2140,15 @@ arena_ralloc_large(void *ptr, size_t size, size_t oldsize) oldsize - size); } #endif - arena_ralloc_large_shrink(arena, chunk, ptr, psize, - oldsize); + arena_ralloc_large_shrink(arena, chunk, ptr, oldsize, + psize); return (false); } else { bool ret = arena_ralloc_large_grow(arena, chunk, ptr, - psize, oldsize); + oldsize, PAGE_CEILING(size), + psize - PAGE_CEILING(size), zero); #ifdef JEMALLOC_FILL - if (ret == false && opt_zero) { + if (ret == false && zero == false && opt_zero) { memset((void *)((uintptr_t)ptr + oldsize), 0, size - oldsize); } @@ -2071,49 +2159,89 @@ arena_ralloc_large(void *ptr, size_t size, size_t oldsize) } void * -arena_ralloc(void *ptr, size_t size, size_t oldsize) +arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, + bool zero) { - void *ret; - size_t copysize; - /* Try to avoid moving the allocation. */ + /* + * Avoid moving the allocation if the size class can be left the same. + */ if (oldsize <= arena_maxclass) { if (oldsize <= small_maxclass) { - if (size <= small_maxclass && small_size2bin[size] == - small_size2bin[oldsize]) - goto IN_PLACE; + assert(choose_arena()->bins[small_size2bin[ + oldsize]].reg_size == oldsize); + if ((size + extra <= small_maxclass && + small_size2bin[size + extra] == + small_size2bin[oldsize]) || (size <= oldsize && + size + extra >= oldsize)) { +#ifdef JEMALLOC_FILL + if (opt_junk && size < oldsize) { + memset((void *)((uintptr_t)ptr + size), + 0x5a, oldsize - size); + } +#endif + return (ptr); + } } else { assert(size <= arena_maxclass); - if (size > small_maxclass) { - if (arena_ralloc_large(ptr, size, oldsize) == - false) + if (size + extra > small_maxclass) { + if (arena_ralloc_large(ptr, oldsize, size, + extra, zero) == false) return (ptr); } } } - /* - * If we get here, then size and oldsize are different enough that we - * need to move the object. In that case, fall back to allocating new - * space and copying. - */ - ret = arena_malloc(size, false); - if (ret == NULL) - return (NULL); + /* Reallocation would require a move. */ + return (NULL); +} - /* Junk/zero-filling were already done by arena_malloc(). */ +void * +arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero) +{ + void *ret; + size_t copysize; + + /* Try to avoid moving the allocation. */ + ret = arena_ralloc_no_move(ptr, oldsize, size, extra, zero); + if (ret != NULL) + return (ret); + + + /* + * size and oldsize are different enough that we need to move the + * object. In that case, fall back to allocating new space and + * copying. + */ + if (alignment != 0) + ret = ipalloc(size + extra, alignment, zero); + else + ret = arena_malloc(size + extra, zero); + + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, this time without extra. */ + if (alignment != 0) + ret = ipalloc(size, alignment, zero); + else + ret = arena_malloc(size, zero); + + if (ret == NULL) + return (NULL); + } + + /* Junk/zero-filling were already done by ipalloc()/arena_malloc(). */ + + /* + * Copy at most size bytes (not size+extra), since the caller has no + * expectation that the extra bytes will be reliably preserved. + */ copysize = (size < oldsize) ? 
size : oldsize; memcpy(ret, ptr, copysize); idalloc(ptr); return (ret); -IN_PLACE: -#ifdef JEMALLOC_FILL - if (opt_junk && size < oldsize) - memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize - size); - else if (opt_zero && size > oldsize) - memset((void *)((uintptr_t)ptr + oldsize), 0, size - oldsize); -#endif - return (ptr); } bool @@ -2239,26 +2367,6 @@ arena_new(arena_t *arena, unsigned ind) return (false); } -#ifdef JEMALLOC_TINY -/* Compute the smallest power of 2 that is >= x. */ -static size_t -pow2_ceil(size_t x) -{ - - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if (SIZEOF_PTR == 8) - x |= x >> 32; -#endif - x++; - return (x); -} -#endif - #ifdef JEMALLOC_DEBUG static void small_size2bin_validate(void) @@ -2381,6 +2489,7 @@ bool arena_boot(void) { size_t header_size; + unsigned i; /* Set variables according to the value of opt_lg_[qc]space_max. */ qspace_max = (1U << opt_lg_qspace_max); @@ -2420,7 +2529,7 @@ arena_boot(void) if (nbins > 255) { char line_buf[UMAX2S_BUFSIZE]; malloc_write(": Too many small size classes ("); - malloc_write(umax2s(nbins, 10, line_buf)); + malloc_write(u2s(nbins, 10, line_buf)); malloc_write(" > max 255)\n"); abort(); } @@ -2429,7 +2538,7 @@ arena_boot(void) if (nbins > 256) { char line_buf[UMAX2S_BUFSIZE]; malloc_write(": Too many small size classes ("); - malloc_write(umax2s(nbins, 10, line_buf)); + malloc_write(u2s(nbins, 10, line_buf)); malloc_write(" > max 256)\n"); abort(); } @@ -2439,13 +2548,26 @@ arena_boot(void) /* * Compute the header size such that it is large enough to contain the - * page map. + * page map. The page map is biased to omit entries for the header + * itself, so some iteration is necessary to compute the map bias. + * + * 1) Compute safe header_size and map_bias values that include enough + * space for an unbiased page map. + * 2) Refine map_bias based on (1) to omit the header pages in the page + * map. The resulting map_bias may be one too small. + * 3) Refine map_bias based on (2). The result will be >= the result + * from (2), and will always be correct. */ - header_size = sizeof(arena_chunk_t) + - (sizeof(arena_chunk_map_t) * (chunk_npages - 1)); - arena_chunk_header_npages = (header_size >> PAGE_SHIFT) + - ((header_size & PAGE_MASK) != 0); - arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT); + map_bias = 0; + for (i = 0; i < 3; i++) { + header_size = offsetof(arena_chunk_t, map) + + (sizeof(arena_chunk_map_t) * (chunk_npages-map_bias)); + map_bias = (header_size >> PAGE_SHIFT) + ((header_size & + PAGE_MASK) != 0); + } + assert(map_bias > 0); + + arena_maxclass = chunksize - (map_bias << PAGE_SHIFT); return (false); } diff --git a/jemalloc/src/base.c b/jemalloc/src/base.c index 605197ea..cc85e849 100644 --- a/jemalloc/src/base.c +++ b/jemalloc/src/base.c @@ -32,7 +32,7 @@ base_pages_alloc(size_t minsize) assert(minsize != 0); csize = CHUNK_CEILING(minsize); zero = false; - base_pages = chunk_alloc(csize, &zero); + base_pages = chunk_alloc(csize, true, &zero); if (base_pages == NULL) return (true); base_next_addr = base_pages; diff --git a/jemalloc/src/chunk.c b/jemalloc/src/chunk.c index e6e3bcd1..00bf50a0 100644 --- a/jemalloc/src/chunk.c +++ b/jemalloc/src/chunk.c @@ -14,11 +14,15 @@ malloc_mutex_t chunks_mtx; chunk_stats_t stats_chunks; #endif +#ifdef JEMALLOC_IVSALLOC +rtree_t *chunks_rtree; +#endif + /* Various chunk-related settings. */ size_t chunksize; size_t chunksize_mask; /* (chunksize - 1). 
*/ size_t chunk_npages; -size_t arena_chunk_header_npages; +size_t map_bias; size_t arena_maxclass; /* Max size class for arenas. */ /******************************************************************************/ @@ -30,7 +34,7 @@ size_t arena_maxclass; /* Max size class for arenas. */ * advantage of them if they are returned. */ void * -chunk_alloc(size_t size, bool *zero) +chunk_alloc(size_t size, bool base, bool *zero) { void *ret; @@ -63,10 +67,18 @@ chunk_alloc(size_t size, bool *zero) /* All strategies for allocation failed. */ ret = NULL; RETURN: +#ifdef JEMALLOC_IVSALLOC + if (base == false && ret != NULL) { + if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) { + chunk_dealloc(ret, size); + return (NULL); + } + } +#endif #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) if (ret != NULL) { # ifdef JEMALLOC_PROF - bool udump; + bool gdump; # endif malloc_mutex_lock(&chunks_mtx); # ifdef JEMALLOC_STATS @@ -76,17 +88,17 @@ RETURN: if (stats_chunks.curchunks > stats_chunks.highchunks) { stats_chunks.highchunks = stats_chunks.curchunks; # ifdef JEMALLOC_PROF - udump = true; + gdump = true; # endif } # ifdef JEMALLOC_PROF else - udump = false; + gdump = false; # endif malloc_mutex_unlock(&chunks_mtx); # ifdef JEMALLOC_PROF - if (opt_prof && opt_prof_udump && udump) - prof_udump(); + if (opt_prof && opt_prof_gdump && gdump) + prof_gdump(); # endif } #endif @@ -104,6 +116,9 @@ chunk_dealloc(void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); +#ifdef JEMALLOC_IVSALLOC + rtree_set(chunks_rtree, (uintptr_t)chunk, NULL); +#endif #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) malloc_mutex_lock(&chunks_mtx); stats_chunks.curchunks -= (size / chunksize); @@ -126,21 +141,27 @@ chunk_boot(void) { /* Set variables according to the value of opt_lg_chunk. */ - chunksize = (1LU << opt_lg_chunk); + chunksize = (ZU(1) << opt_lg_chunk); assert(chunksize >= PAGE_SIZE); chunksize_mask = chunksize - 1; chunk_npages = (chunksize >> PAGE_SHIFT); +#ifdef JEMALLOC_IVSALLOC + chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk); + if (chunks_rtree == NULL) + return (true); +#endif #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) if (malloc_mutex_init(&chunks_mtx)) return (true); memset(&stats_chunks, 0, sizeof(chunk_stats_t)); #endif - #ifdef JEMALLOC_SWAP if (chunk_swap_boot()) return (true); #endif + if (chunk_mmap_boot()) + return (true); #ifdef JEMALLOC_DSS if (chunk_dss_boot()) return (true); diff --git a/jemalloc/src/chunk_mmap.c b/jemalloc/src/chunk_mmap.c index d9f9e86d..bc367559 100644 --- a/jemalloc/src/chunk_mmap.c +++ b/jemalloc/src/chunk_mmap.c @@ -6,26 +6,30 @@ /* * Used by chunk_alloc_mmap() to decide whether to attempt the fast path and - * potentially avoid some system calls. We can get away without TLS here, - * since the state of mmap_unaligned only affects performance, rather than - * correct function. + * potentially avoid some system calls. 
*/ -static #ifndef NO_TLS - __thread +static __thread bool mmap_unaligned_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +#define MMAP_UNALIGNED_GET() mmap_unaligned_tls +#define MMAP_UNALIGNED_SET(v) do { \ + mmap_unaligned_tls = (v); \ +} while (0) +#else +static pthread_key_t mmap_unaligned_tsd; +#define MMAP_UNALIGNED_GET() ((bool)pthread_getspecific(mmap_unaligned_tsd)) +#define MMAP_UNALIGNED_SET(v) do { \ + pthread_setspecific(mmap_unaligned_tsd, (void *)(v)); \ +} while (0) #endif - bool mmap_unaligned -#ifndef NO_TLS - JEMALLOC_ATTR(tls_model("initial-exec")) -#endif - ; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ static void *pages_map(void *addr, size_t size, bool noreserve); static void pages_unmap(void *addr, size_t size); -static void *chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve); +static void *chunk_alloc_mmap_slow(size_t size, bool unaligned, + bool noreserve); static void *chunk_alloc_mmap_internal(size_t size, bool noreserve); /******************************************************************************/ @@ -54,9 +58,9 @@ pages_map(void *addr, size_t size, bool noreserve) * We succeeded in mapping memory, but not in the right place. */ if (munmap(ret, size) == -1) { - char buf[STRERROR_BUF]; + char buf[BUFERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); + buferror(errno, buf, sizeof(buf)); malloc_write(": Error in munmap(): "); malloc_write(buf); malloc_write("\n"); @@ -76,9 +80,9 @@ pages_unmap(void *addr, size_t size) { if (munmap(addr, size) == -1) { - char buf[STRERROR_BUF]; + char buf[BUFERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); + buferror(errno, buf, sizeof(buf)); malloc_write(": Error in munmap(): "); malloc_write(buf); malloc_write("\n"); @@ -128,7 +132,7 @@ chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve) * method. */ if (unaligned == false) - mmap_unaligned = false; + MMAP_UNALIGNED_SET(false); return (ret); } @@ -166,7 +170,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) * fast method next time. */ - if (mmap_unaligned == false) { + if (MMAP_UNALIGNED_GET() == false) { size_t offset; ret = pages_map(NULL, size, noreserve); @@ -175,7 +179,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) offset = CHUNK_ADDR2OFFSET(ret); if (offset != 0) { - mmap_unaligned = true; + MMAP_UNALIGNED_SET(true); /* Try to extend chunk boundary. */ if (pages_map((void *)((uintptr_t)ret + size), chunksize - offset, noreserve) == NULL) { @@ -184,7 +188,8 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) * the reliable-but-expensive method. */ pages_unmap(ret, size); - ret = chunk_alloc_mmap_slow(size, true, noreserve); + ret = chunk_alloc_mmap_slow(size, true, + noreserve); } else { /* Clean up unneeded leading space. 
*/ pages_unmap(ret, chunksize - offset); @@ -216,3 +221,17 @@ chunk_dealloc_mmap(void *chunk, size_t size) pages_unmap(chunk, size); } + +bool +chunk_mmap_boot(void) +{ + +#ifdef NO_TLS + if (pthread_key_create(&mmap_unaligned_tsd, NULL) != 0) { + malloc_write(": Error in pthread_key_create()\n"); + return (true); + } +#endif + + return (false); +} diff --git a/jemalloc/src/chunk_swap.c b/jemalloc/src/chunk_swap.c index ed9e414d..ee038ba9 100644 --- a/jemalloc/src/chunk_swap.c +++ b/jemalloc/src/chunk_swap.c @@ -294,9 +294,10 @@ chunk_swap_enable(const int *fds, unsigned nfds, bool prezeroed) void *addr = mmap((void *)((uintptr_t)vaddr + voff), sizes[i], PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fds[i], 0); if (addr == MAP_FAILED) { - char buf[STRERROR_BUF]; + char buf[BUFERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); + + buferror(errno, buf, sizeof(buf)); malloc_write( ": Error in mmap(..., MAP_FIXED, ...): "); malloc_write(buf); @@ -304,7 +305,7 @@ chunk_swap_enable(const int *fds, unsigned nfds, bool prezeroed) if (opt_abort) abort(); if (munmap(vaddr, voff) == -1) { - strerror_r(errno, buf, sizeof(buf)); + buferror(errno, buf, sizeof(buf)); malloc_write(": Error in munmap(): "); malloc_write(buf); malloc_write("\n"); diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index a0c4162a..682a8db6 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -263,13 +263,12 @@ ckh_grow(ckh_t *ckh) lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS; while (true) { lg_curcells++; - tab = (ckhc_t *) ipalloc((ZU(1) << LG_CACHELINE), - sizeof(ckhc_t) << lg_curcells); + tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, + ZU(1) << LG_CACHELINE, true); if (tab == NULL) { ret = true; goto RETURN; } - memset(tab, 0, sizeof(ckhc_t) << lg_curcells); /* Swap in new table. */ ttab = ckh->tab; ckh->tab = tab; @@ -305,8 +304,8 @@ ckh_shrink(ckh_t *ckh) */ lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; - tab = (ckhc_t *)ipalloc((ZU(1) << LG_CACHELINE), - sizeof(ckhc_t) << lg_curcells); + tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, + ZU(1) << LG_CACHELINE, true); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -314,7 +313,6 @@ ckh_shrink(ckh_t *ckh) */ return; } - memset(tab, 0, sizeof(ckhc_t) << lg_curcells); /* Swap in new table. 
*/ ttab = ckh->tab; ckh->tab = tab; @@ -377,13 +375,12 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) ckh->hash = hash; ckh->keycomp = keycomp; - ckh->tab = (ckhc_t *)ipalloc((ZU(1) << LG_CACHELINE), - sizeof(ckhc_t) << lg_mincells); + ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells, + (ZU(1) << LG_CACHELINE), true); if (ckh->tab == NULL) { ret = true; goto RETURN; } - memset(ckh->tab, 0, sizeof(ckhc_t) << lg_mincells); #ifdef JEMALLOC_DEBUG ckh->magic = CKH_MAGIG; @@ -570,12 +567,21 @@ ckh_pointer_hash(const void *key, unsigned minbits, size_t *hash1, { size_t ret1, ret2; uint64_t h; + union { + const void *v; + uint64_t i; + } u; assert(minbits <= 32 || (SIZEOF_PTR == 8 && minbits <= 64)); assert(hash1 != NULL); assert(hash2 != NULL); - h = hash(&key, sizeof(void *), 0xd983396e68886082LLU); + assert(sizeof(u.v) == sizeof(u.i)); +#if (LG_SIZEOF_PTR != LG_SIZEOF_INT) + u.i = 0; +#endif + u.v = key; + h = hash(&u.i, sizeof(u.i), 0xd983396e68886082LLU); if (minbits <= 32) { /* * Avoid doing multiple hashes, since a single hash provides @@ -586,7 +592,7 @@ ckh_pointer_hash(const void *key, unsigned minbits, size_t *hash1, } else { assert(SIZEOF_PTR == 8); ret1 = h; - ret2 = hash(&key, sizeof(void *), 0x5e2be9aff8709a5dLLU); + ret2 = hash(&u.i, sizeof(u.i), 0x5e2be9aff8709a5dLLU); } *hash1 = ret1; diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index ffb732d5..c83ee4f1 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -41,6 +41,11 @@ CTL_PROTO(epoch) #ifdef JEMALLOC_TCACHE CTL_PROTO(tcache_flush) #endif +CTL_PROTO(thread_arena) +#ifdef JEMALLOC_STATS +CTL_PROTO(thread_allocated) +CTL_PROTO(thread_deallocated) +#endif CTL_PROTO(config_debug) CTL_PROTO(config_dss) CTL_PROTO(config_dynamic_page_shift) @@ -57,8 +62,15 @@ CTL_PROTO(config_tiny) CTL_PROTO(config_tls) CTL_PROTO(config_xmalloc) CTL_PROTO(opt_abort) +CTL_PROTO(opt_lg_qspace_max) +CTL_PROTO(opt_lg_cspace_max) +CTL_PROTO(opt_lg_chunk) +CTL_PROTO(opt_narenas) +CTL_PROTO(opt_lg_dirty_mult) +CTL_PROTO(opt_stats_print) #ifdef JEMALLOC_FILL CTL_PROTO(opt_junk) +CTL_PROTO(opt_zero) #endif #ifdef JEMALLOC_SYSV CTL_PROTO(opt_sysv) @@ -66,27 +78,22 @@ CTL_PROTO(opt_sysv) #ifdef JEMALLOC_XMALLOC CTL_PROTO(opt_xmalloc) #endif -#ifdef JEMALLOC_ZERO -CTL_PROTO(opt_zero) -#endif #ifdef JEMALLOC_TCACHE CTL_PROTO(opt_tcache) CTL_PROTO(opt_lg_tcache_gc_sweep) #endif #ifdef JEMALLOC_PROF CTL_PROTO(opt_prof) +CTL_PROTO(opt_prof_prefix) CTL_PROTO(opt_prof_active) CTL_PROTO(opt_lg_prof_bt_max) CTL_PROTO(opt_lg_prof_sample) CTL_PROTO(opt_lg_prof_interval) -CTL_PROTO(opt_prof_udump) +CTL_PROTO(opt_prof_gdump) CTL_PROTO(opt_prof_leak) +CTL_PROTO(opt_prof_accum) +CTL_PROTO(opt_lg_prof_tcmax) #endif -CTL_PROTO(opt_stats_print) -CTL_PROTO(opt_lg_qspace_max) -CTL_PROTO(opt_lg_cspace_max) -CTL_PROTO(opt_lg_dirty_mult) -CTL_PROTO(opt_lg_chunk) #ifdef JEMALLOC_SWAP CTL_PROTO(opt_overcommit) #endif @@ -125,6 +132,7 @@ CTL_PROTO(arenas_nbins) CTL_PROTO(arenas_nhbins) #endif CTL_PROTO(arenas_nlruns) +CTL_PROTO(arenas_purge) #ifdef JEMALLOC_PROF CTL_PROTO(prof_active) CTL_PROTO(prof_dump) @@ -210,6 +218,15 @@ static const ctl_node_t tcache_node[] = { }; #endif +static const ctl_node_t thread_node[] = { + {NAME("arena"), CTL(thread_arena)} +#ifdef JEMALLOC_STATS + , + {NAME("allocated"), CTL(thread_allocated)}, + {NAME("deallocated"), CTL(thread_deallocated)} +#endif +}; + static const ctl_node_t config_node[] = { {NAME("debug"), CTL(config_debug)}, {NAME("dss"), CTL(config_dss)}, @@ -230,36 +247,43 @@ static 
const ctl_node_t config_node[] = { static const ctl_node_t opt_node[] = { {NAME("abort"), CTL(opt_abort)}, + {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, + {NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)}, + {NAME("lg_chunk"), CTL(opt_lg_chunk)}, + {NAME("narenas"), CTL(opt_narenas)}, + {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, + {NAME("stats_print"), CTL(opt_stats_print)} #ifdef JEMALLOC_FILL + , {NAME("junk"), CTL(opt_junk)}, + {NAME("zero"), CTL(opt_zero)} #endif #ifdef JEMALLOC_SYSV - {NAME("sysv"), CTL(opt_sysv)}, + , + {NAME("sysv"), CTL(opt_sysv)} #endif #ifdef JEMALLOC_XMALLOC - {NAME("xmalloc"), CTL(opt_xmalloc)}, -#endif -#ifdef JEMALLOC_ZERO - {NAME("zero"), CTL(opt_zero)}, + , + {NAME("xmalloc"), CTL(opt_xmalloc)} #endif #ifdef JEMALLOC_TCACHE + , {NAME("tcache"), CTL(opt_tcache)}, - {NAME("lg_tcache_gc_sweep"), CTL(opt_lg_tcache_gc_sweep)}, + {NAME("lg_tcache_gc_sweep"), CTL(opt_lg_tcache_gc_sweep)} #endif #ifdef JEMALLOC_PROF + , {NAME("prof"), CTL(opt_prof)}, + {NAME("prof_prefix"), CTL(opt_prof_prefix)}, {NAME("prof_active"), CTL(opt_prof_active)}, {NAME("lg_prof_bt_max"), CTL(opt_lg_prof_bt_max)}, {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, - {NAME("prof_udump"), CTL(opt_prof_udump)}, + {NAME("prof_gdump"), CTL(opt_prof_gdump)}, {NAME("prof_leak"), CTL(opt_prof_leak)}, + {NAME("prof_accum"), CTL(opt_prof_accum)}, + {NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)} #endif - {NAME("stats_print"), CTL(opt_stats_print)}, - {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, - {NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)}, - {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, - {NAME("lg_chunk"), CTL(opt_lg_chunk)} #ifdef JEMALLOC_SWAP , {NAME("overcommit"), CTL(opt_overcommit)} @@ -321,7 +345,8 @@ static const ctl_node_t arenas_node[] = { #endif {NAME("bin"), CHILD(arenas_bin)}, {NAME("nlruns"), CTL(arenas_nlruns)}, - {NAME("lrun"), CHILD(arenas_lrun)} + {NAME("lrun"), CHILD(arenas_lrun)}, + {NAME("purge"), CTL(arenas_purge)} }; #ifdef JEMALLOC_PROF @@ -448,6 +473,7 @@ static const ctl_node_t root_node[] = { #ifdef JEMALLOC_TCACHE {NAME("tcache"), CHILD(tcache)}, #endif + {NAME("thread"), CHILD(thread)}, {NAME("config"), CHILD(config)}, {NAME("opt"), CHILD(opt)}, {NAME("arenas"), CHILD(arenas)}, @@ -1028,13 +1054,13 @@ tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, VOID(); - tcache = tcache_tls; + tcache = TCACHE_GET(); if (tcache == NULL) { ret = 0; goto RETURN; } tcache_destroy(tcache); - tcache_tls = NULL; + TCACHE_SET(NULL); ret = 0; RETURN: @@ -1042,6 +1068,49 @@ RETURN: } #endif +static int +thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + unsigned newind, oldind; + + newind = oldind = choose_arena()->ind; + WRITE(oldind, unsigned); + READ(newind, unsigned); + if (newind != oldind) { + arena_t *arena; + + if (newind >= narenas) { + /* New arena index is out of range. */ + ret = EFAULT; + goto RETURN; + } + + /* Initialize arena if necessary. */ + malloc_mutex_lock(&arenas_lock); + if ((arena = arenas[newind]) == NULL) + arena = arenas_extend(newind); + malloc_mutex_unlock(&arenas_lock); + if (arena == NULL) { + ret = EAGAIN; + goto RETURN; + } + + /* Set new arena association. 
*/ + ARENA_SET(arena); + } + + ret = 0; +RETURN: + return (ret); +} + +#ifdef JEMALLOC_STATS +CTL_RO_GEN(thread_allocated, ALLOCATED_GET(), uint64_t); +CTL_RO_GEN(thread_deallocated, DEALLOCATED_GET(), uint64_t); +#endif + /******************************************************************************/ #ifdef JEMALLOC_DEBUG @@ -1137,8 +1206,15 @@ CTL_RO_FALSE_GEN(config_xmalloc) /******************************************************************************/ CTL_RO_GEN(opt_abort, opt_abort, bool) +CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) +CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t) +CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t) +CTL_RO_GEN(opt_narenas, opt_narenas, size_t) +CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) +CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) #ifdef JEMALLOC_FILL CTL_RO_GEN(opt_junk, opt_junk, bool) +CTL_RO_GEN(opt_zero, opt_zero, bool) #endif #ifdef JEMALLOC_SYSV CTL_RO_GEN(opt_sysv, opt_sysv, bool) @@ -1146,27 +1222,22 @@ CTL_RO_GEN(opt_sysv, opt_sysv, bool) #ifdef JEMALLOC_XMALLOC CTL_RO_GEN(opt_xmalloc, opt_xmalloc, bool) #endif -#ifdef JEMALLOC_ZERO -CTL_RO_GEN(opt_zero, opt_zero, bool) -#endif #ifdef JEMALLOC_TCACHE CTL_RO_GEN(opt_tcache, opt_tcache, bool) CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t) #endif #ifdef JEMALLOC_PROF CTL_RO_GEN(opt_prof, opt_prof, bool) +CTL_RO_GEN(opt_prof_prefix, opt_prof_prefix, const char *) CTL_RO_GEN(opt_prof_active, opt_prof_active, bool) CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t) CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t) CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) -CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool) +CTL_RO_GEN(opt_prof_gdump, opt_prof_gdump, bool) CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) +CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool) +CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t) #endif -CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) -CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) -CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t) -CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) -CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t) #ifdef JEMALLOC_SWAP CTL_RO_GEN(opt_overcommit, opt_overcommit, bool) #endif @@ -1249,6 +1320,44 @@ CTL_RO_GEN(arenas_nhbins, nhbins, unsigned) #endif CTL_RO_GEN(arenas_nlruns, nlclasses, size_t) +static int +arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + unsigned arena; + + WRITEONLY(); + arena = UINT_MAX; + WRITE(arena, unsigned); + if (newp != NULL && arena >= narenas) { + ret = EFAULT; + goto RETURN; + } else { + arena_t *tarenas[narenas]; + + malloc_mutex_lock(&arenas_lock); + memcpy(tarenas, arenas, sizeof(arena_t *) * narenas); + malloc_mutex_unlock(&arenas_lock); + + if (arena == UINT_MAX) { + unsigned i; + for (i = 0; i < narenas; i++) { + if (tarenas[i] != NULL) + arena_purge_all(tarenas[i]); + } + } else { + assert(arena < narenas); + if (tarenas[arena] != NULL) + arena_purge_all(tarenas[arena]); + } + } + + ret = 0; +RETURN: + return (ret); +} + /******************************************************************************/ #ifdef JEMALLOC_PROF diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c index 49962ea0..a0351975 100644 --- a/jemalloc/src/huge.c +++ b/jemalloc/src/huge.c @@ -37,7 +37,7 @@ huge_malloc(size_t size, bool zero) if (node == NULL) return (NULL); - ret = chunk_alloc(csize, &zero); + ret = chunk_alloc(csize, 
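For reference, the "thread.arena", "thread.allocated", and "arenas.purge" mallctl entries wired up in ctl.c above can be driven from application code roughly as follows. This is a minimal sketch, assuming an unprefixed build (no JEMALLOC_PREFIX) and the installed <jemalloc/jemalloc.h> header; the hypothetical helper name is illustrative only.

    #include <limits.h>
    #include <stdint.h>
    #include <jemalloc/jemalloc.h>

    static void
    tune_thread_and_purge(void)
    {
        unsigned old_arena, new_arena = 0, all = UINT_MAX;
        uint64_t allocated;
        size_t sz;

        /* Bind the calling thread to arena 0; the previous binding is read back. */
        sz = sizeof(old_arena);
        mallctl("thread.arena", &old_arena, &sz, &new_arena, sizeof(new_arena));

        /* Read this thread's cumulative allocation counter (JEMALLOC_STATS builds). */
        sz = sizeof(allocated);
        mallctl("thread.allocated", &allocated, &sz, NULL, 0);

        /* Synchronously purge dirty pages from all arenas (UINT_MAX selects all). */
        mallctl("arenas.purge", NULL, NULL, &all, sizeof(all));
    }
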
false, &zero); if (ret == NULL) { base_node_dealloc(node); return (NULL); @@ -69,12 +69,11 @@ huge_malloc(size_t size, bool zero) /* Only handles large allocations that require more than chunk alignment. */ void * -huge_palloc(size_t alignment, size_t size) +huge_palloc(size_t size, size_t alignment, bool zero) { void *ret; size_t alloc_size, chunk_size, offset; extent_node_t *node; - bool zero; /* * This allocation requires alignment that is even larger than chunk @@ -98,8 +97,7 @@ huge_palloc(size_t alignment, size_t size) if (node == NULL) return (NULL); - zero = false; - ret = chunk_alloc(alloc_size, &zero); + ret = chunk_alloc(alloc_size, false, &zero); if (ret == NULL) { base_node_dealloc(node); return (NULL); @@ -142,45 +140,80 @@ huge_palloc(size_t alignment, size_t size) malloc_mutex_unlock(&huge_mtx); #ifdef JEMALLOC_FILL - if (opt_junk) - memset(ret, 0xa5, chunk_size); - else if (opt_zero) - memset(ret, 0, chunk_size); + if (zero == false) { + if (opt_junk) + memset(ret, 0xa5, chunk_size); + else if (opt_zero) + memset(ret, 0, chunk_size); + } #endif return (ret); } void * -huge_ralloc(void *ptr, size_t size, size_t oldsize) +huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) { - void *ret; - size_t copysize; - /* Avoid moving the allocation if the size class would not change. */ - if (oldsize > arena_maxclass && - CHUNK_CEILING(size) == CHUNK_CEILING(oldsize)) { + /* + * Avoid moving the allocation if the size class can be left the same. + */ + if (oldsize > arena_maxclass + && CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) + && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { + assert(CHUNK_CEILING(oldsize) == oldsize); #ifdef JEMALLOC_FILL if (opt_junk && size < oldsize) { - memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize - - size); - } else if (opt_zero && size > oldsize) { - memset((void *)((uintptr_t)ptr + oldsize), 0, size - - oldsize); + memset((void *)((uintptr_t)ptr + size), 0x5a, + oldsize - size); } #endif return (ptr); } - /* - * If we get here, then size and oldsize are different enough that we - * need to use a different size class. In that case, fall back to - * allocating new space and copying. - */ - ret = huge_malloc(size, false); - if (ret == NULL) - return (NULL); + /* Reallocation would require a move. */ + return (NULL); +} +void * +huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero) +{ + void *ret; + size_t copysize; + + /* Try to avoid moving the allocation. */ + ret = huge_ralloc_no_move(ptr, oldsize, size, extra); + if (ret != NULL) + return (ret); + + /* + * size and oldsize are different enough that we need to use a + * different size class. In that case, fall back to allocating new + * space and copying. + */ + if (alignment != 0) + ret = huge_palloc(size + extra, alignment, zero); + else + ret = huge_malloc(size + extra, zero); + + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, this time without extra. */ + if (alignment != 0) + ret = huge_palloc(size, alignment, zero); + else + ret = huge_malloc(size, zero); + + if (ret == NULL) + return (NULL); + } + + /* + * Copy at most size bytes (not size+extra), since the caller has no + * expectation that the extra bytes will be reliably preserved. + */ copysize = (size < oldsize) ? 
size : oldsize; memcpy(ret, ptr, copysize); idalloc(ptr); diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index b36590dd..2aebc51d 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1,85 +1,3 @@ -/*- - * This allocator implementation is designed to provide scalable performance - * for multi-threaded programs on multi-processor systems. The following - * features are included for this purpose: - * - * + Multiple arenas are used if there are multiple CPUs, which reduces lock - * contention and cache sloshing. - * - * + Thread-specific caching is used if there are multiple threads, which - * reduces the amount of locking. - * - * + Cache line sharing between arenas is avoided for internal data - * structures. - * - * + Memory is managed in chunks and runs (chunks can be split into runs), - * rather than as individual pages. This provides a constant-time - * mechanism for associating allocations with particular arenas. - * - * Allocation requests are rounded up to the nearest size class, and no record - * of the original request size is maintained. Allocations are broken into - * categories according to size class. Assuming 1 MiB chunks, 4 KiB pages and - * a 16 byte quantum on a 32-bit system, the size classes in each category are - * as follows: - * - * |========================================| - * | Category | Subcategory | Size | - * |========================================| - * | Small | Tiny | 2 | - * | | | 4 | - * | | | 8 | - * | |------------------+----------| - * | | Quantum-spaced | 16 | - * | | | 32 | - * | | | 48 | - * | | | ... | - * | | | 96 | - * | | | 112 | - * | | | 128 | - * | |------------------+----------| - * | | Cacheline-spaced | 192 | - * | | | 256 | - * | | | 320 | - * | | | 384 | - * | | | 448 | - * | | | 512 | - * | |------------------+----------| - * | | Sub-page | 760 | - * | | | 1024 | - * | | | 1280 | - * | | | ... | - * | | | 3328 | - * | | | 3584 | - * | | | 3840 | - * |========================================| - * | Large | 4 KiB | - * | | 8 KiB | - * | | 12 KiB | - * | | ... | - * | | 1012 KiB | - * | | 1016 KiB | - * | | 1020 KiB | - * |========================================| - * | Huge | 1 MiB | - * | | 2 MiB | - * | | 3 MiB | - * | | ... | - * |========================================| - * - * Different mechanisms are used accoding to category: - * - * Small: Each size class is segregated into its own set of runs. Each run - * maintains a bitmap of which regions are free/allocated. - * - * Large : Each allocation is backed by a dedicated run. Metadata are stored - * in the associated arena chunk header maps. - * - * Huge : Each allocation is backed by a dedicated contiguous set of chunks. - * Metadata are stored in a separate red-black tree. - * - ******************************************************************************* - */ - #define JEMALLOC_C_ #include "jemalloc/internal/jemalloc_internal.h" @@ -89,22 +7,30 @@ malloc_mutex_t arenas_lock; arena_t **arenas; unsigned narenas; -#ifndef NO_TLS static unsigned next_arena; -#endif #ifndef NO_TLS -__thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); +__thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); +#else +pthread_key_t arenas_tsd; +#endif + +#ifdef JEMALLOC_STATS +# ifndef NO_TLS +__thread thread_allocated_t thread_allocated_tls; +# else +pthread_key_t thread_allocated_tsd; +# endif #endif /* Set to true once the allocator has been initialized. 
*/ -static bool malloc_initialized = false; +static bool malloc_initialized = false; /* Used to let the initializing thread recursively allocate. */ -static pthread_t malloc_initializer = (unsigned long)0; +static pthread_t malloc_initializer = (unsigned long)0; /* Used to avoid initialization races. */ -static malloc_mutex_t init_lock = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP; +static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; #ifdef DYNAMIC_PAGE_SHIFT size_t pagesize; @@ -115,8 +41,7 @@ size_t lg_pagesize; unsigned ncpus; /* Runtime configuration options. */ -const char *JEMALLOC_P(malloc_options) - JEMALLOC_ATTR(visibility("default")); +const char *JEMALLOC_P(malloc_conf) JEMALLOC_ATTR(visibility("default")); #ifdef JEMALLOC_DEBUG bool opt_abort = true; # ifdef JEMALLOC_FILL @@ -137,7 +62,7 @@ bool opt_xmalloc = false; #ifdef JEMALLOC_FILL bool opt_zero = false; #endif -static int opt_narenas_lshift = 0; +size_t opt_narenas = 0; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -145,9 +70,15 @@ static int opt_narenas_lshift = 0; static void wrtmessage(void *cbopaque, const char *s); static void stats_print_atexit(void); static unsigned malloc_ncpus(void); +#if (defined(JEMALLOC_STATS) && defined(NO_TLS)) +static void thread_allocated_cleanup(void *arg); +#endif +static bool malloc_conf_next(char const **opts_p, char const **k_p, + size_t *klen_p, char const **v_p, size_t *vlen_p); +static void malloc_conf_error(const char *msg, const char *k, size_t klen, + const char *v, size_t vlen); +static void malloc_conf_init(void); static bool malloc_init_hard(void); -static void jemalloc_prefork(void); -static void jemalloc_postfork(void); /******************************************************************************/ /* malloc_message() setup. */ @@ -160,8 +91,14 @@ static void wrtmessage(void *cbopaque, const char *s) { - - write(STDERR_FILENO, s, strlen(s)); +#ifdef JEMALLOC_CC_SILENCE + int result = +#endif + write(STDERR_FILENO, s, strlen(s)); +#ifdef JEMALLOC_CC_SILENCE + if (result < 0) + result = errno; +#endif } void (*JEMALLOC_P(malloc_message))(void *, const char *s) @@ -179,8 +116,8 @@ arenas_extend(unsigned ind) arena_t *ret; /* Allocate enough space for trailing bins. */ - ret = (arena_t *)base_alloc(sizeof(arena_t) - + (sizeof(arena_bin_t) * (nbins - 1))); + ret = (arena_t *)base_alloc(offsetof(arena_t, bins) + + (sizeof(arena_bin_t) * nbins)); if (ret != NULL && arena_new(ret, ind) == false) { arenas[ind] = ret; return (ret); @@ -200,7 +137,6 @@ arenas_extend(unsigned ind) return (arenas[0]); } -#ifndef NO_TLS /* * Choose an arena based on a per-thread value (slow-path code only, called * only by choose_arena()). @@ -219,11 +155,29 @@ choose_arena_hard(void) } else ret = arenas[0]; - arenas_map = ret; + ARENA_SET(ret); return (ret); } + +/* + * glibc provides a non-standard strerror_r() when _GNU_SOURCE is defined, so + * provide a wrapper. 
+ */ +int +buferror(int errnum, char *buf, size_t buflen) +{ +#ifdef _GNU_SOURCE + char *b = strerror_r(errno, buf, buflen); + if (b != buf) { + strncpy(buf, b, buflen); + buf[buflen-1] = '\0'; + } + return (0); +#else + return (strerror_r(errno, buf, buflen)); #endif +} static void stats_print_atexit(void) @@ -283,6 +237,17 @@ malloc_ncpus(void) return (ret); } +#if (defined(JEMALLOC_STATS) && defined(NO_TLS)) +static void +thread_allocated_cleanup(void *arg) +{ + uint64_t *allocated = (uint64_t *)arg; + + if (allocated != NULL) + idalloc(allocated); +} +#endif + /* * FreeBSD's pthreads implementation calls malloc(3), so the malloc * implementation has to take pains to avoid infinite recursion during @@ -299,12 +264,323 @@ malloc_init(void) } static bool -malloc_init_hard(void) +malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, + char const **v_p, size_t *vlen_p) +{ + bool accept; + const char *opts = *opts_p; + + *k_p = opts; + + for (accept = false; accept == false;) { + switch (*opts) { + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case '_': + opts++; + break; + case ':': + opts++; + *klen_p = (uintptr_t)opts - 1 - (uintptr_t)*k_p; + *v_p = opts; + accept = true; + break; + case '\0': + if (opts != *opts_p) { + malloc_write(": Conf string " + "ends with key\n"); + } + return (true); + default: + malloc_write(": Malformed conf " + "string\n"); + return (true); + } + } + + for (accept = false; accept == false;) { + switch (*opts) { + case ',': + opts++; + /* + * Look ahead one character here, because the + * next time this function is called, it will + * assume that end of input has been cleanly + * reached if no input remains, but we have + * optimistically already consumed the comma if + * one exists. + */ + if (*opts == '\0') { + malloc_write(": Conf string " + "ends with comma\n"); + } + *vlen_p = (uintptr_t)opts - 1 - (uintptr_t)*v_p; + accept = true; + break; + case '\0': + *vlen_p = (uintptr_t)opts - (uintptr_t)*v_p; + accept = true; + break; + default: + opts++; + break; + } + } + + *opts_p = opts; + return (false); +} + +static void +malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v, + size_t vlen) +{ + char buf[PATH_MAX + 1]; + + malloc_write(": "); + malloc_write(msg); + malloc_write(": "); + memcpy(buf, k, klen); + memcpy(&buf[klen], ":", 1); + memcpy(&buf[klen+1], v, vlen); + buf[klen+1+vlen] = '\0'; + malloc_write(buf); + malloc_write("\n"); +} + +static void +malloc_conf_init(void) { unsigned i; - int linklen; char buf[PATH_MAX + 1]; - const char *opts; + const char *opts, *k, *v; + size_t klen, vlen; + + for (i = 0; i < 3; i++) { + /* Get runtime configuration. */ + switch (i) { + case 0: + if (JEMALLOC_P(malloc_conf) != NULL) { + /* + * Use options that were compiled into the + * program. + */ + opts = JEMALLOC_P(malloc_conf); + } else { + /* No configuration specified. 
*/ + buf[0] = '\0'; + opts = buf; + } + break; + case 1: { + int linklen; + const char *linkname = +#ifdef JEMALLOC_PREFIX + "/etc/"JEMALLOC_PREFIX"malloc.conf" +#else + "/etc/malloc.conf" +#endif + ; + + if ((linklen = readlink(linkname, buf, + sizeof(buf) - 1)) != -1) { + /* + * Use the contents of the "/etc/malloc.conf" + * symbolic link's name. + */ + buf[linklen] = '\0'; + opts = buf; + } else { + /* No configuration specified. */ + buf[0] = '\0'; + opts = buf; + } + break; + } + case 2: { + const char *envname = +#ifdef JEMALLOC_PREFIX + JEMALLOC_CPREFIX"MALLOC_CONF" +#else + "MALLOC_CONF" +#endif + ; + + if ((opts = getenv(envname)) != NULL) { + /* + * Do nothing; opts is already initialized to + * the value of the JEMALLOC_OPTIONS + * environment variable. + */ + } else { + /* No configuration specified. */ + buf[0] = '\0'; + opts = buf; + } + break; + } + default: + /* NOTREACHED */ + assert(false); + buf[0] = '\0'; + opts = buf; + } + + while (*opts != '\0' && malloc_conf_next(&opts, &k, &klen, &v, + &vlen) == false) { +#define CONF_HANDLE_BOOL(n) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + if (strncmp("true", v, vlen) == 0 && \ + vlen == sizeof("true")-1) \ + opt_##n = true; \ + else if (strncmp("false", v, vlen) == \ + 0 && vlen == sizeof("false")-1) \ + opt_##n = false; \ + else { \ + malloc_conf_error( \ + "Invalid conf value", \ + k, klen, v, vlen); \ + } \ + continue; \ + } +#define CONF_HANDLE_SIZE_T(n, min, max) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + unsigned long ul; \ + char *end; \ + \ + errno = 0; \ + ul = strtoul(v, &end, 0); \ + if (errno != 0 || (uintptr_t)end - \ + (uintptr_t)v != vlen) { \ + malloc_conf_error( \ + "Invalid conf value", \ + k, klen, v, vlen); \ + } else if (ul < min || ul > max) { \ + malloc_conf_error( \ + "Out-of-range conf value", \ + k, klen, v, vlen); \ + } else \ + opt_##n = ul; \ + continue; \ + } +#define CONF_HANDLE_SSIZE_T(n, min, max) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + long l; \ + char *end; \ + \ + errno = 0; \ + l = strtol(v, &end, 0); \ + if (errno != 0 || (uintptr_t)end - \ + (uintptr_t)v != vlen) { \ + malloc_conf_error( \ + "Invalid conf value", \ + k, klen, v, vlen); \ + } else if (l < (ssize_t)min || l > \ + (ssize_t)max) { \ + malloc_conf_error( \ + "Out-of-range conf value", \ + k, klen, v, vlen); \ + } else \ + opt_##n = l; \ + continue; \ + } +#define CONF_HANDLE_CHAR_P(n, d) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + size_t cpylen = (vlen <= \ + sizeof(opt_##n)-1) ? vlen : \ + sizeof(opt_##n)-1; \ + strncpy(opt_##n, v, cpylen); \ + opt_##n[cpylen] = '\0'; \ + continue; \ + } + + CONF_HANDLE_BOOL(abort) + CONF_HANDLE_SIZE_T(lg_qspace_max, LG_QUANTUM, + PAGE_SHIFT-1) + CONF_HANDLE_SIZE_T(lg_cspace_max, LG_QUANTUM, + PAGE_SHIFT-1) + /* + * Chunks always require at least one * header page, + * plus one data page. 
+ */ + CONF_HANDLE_SIZE_T(lg_chunk, PAGE_SHIFT+1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_SIZE_T(narenas, 1, SIZE_T_MAX) + CONF_HANDLE_SSIZE_T(lg_dirty_mult, -1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_BOOL(stats_print) +#ifdef JEMALLOC_FILL + CONF_HANDLE_BOOL(junk) + CONF_HANDLE_BOOL(zero) +#endif +#ifdef JEMALLOC_SYSV + CONF_HANDLE_BOOL(sysv) +#endif +#ifdef JEMALLOC_XMALLOC + CONF_HANDLE_BOOL(xmalloc) +#endif +#ifdef JEMALLOC_TCACHE + CONF_HANDLE_BOOL(tcache) + CONF_HANDLE_SSIZE_T(lg_tcache_gc_sweep, -1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_SSIZE_T(lg_tcache_max, -1, + (sizeof(size_t) << 3) - 1) +#endif +#ifdef JEMALLOC_PROF + CONF_HANDLE_BOOL(prof) + CONF_HANDLE_CHAR_P(prof_prefix, "jeprof") + CONF_HANDLE_SIZE_T(lg_prof_bt_max, 0, LG_PROF_BT_MAX) + CONF_HANDLE_BOOL(prof_active) + CONF_HANDLE_SSIZE_T(lg_prof_sample, 0, + (sizeof(uint64_t) << 3) - 1) + CONF_HANDLE_BOOL(prof_accum) + CONF_HANDLE_SSIZE_T(lg_prof_tcmax, -1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_SSIZE_T(lg_prof_interval, -1, + (sizeof(uint64_t) << 3) - 1) + CONF_HANDLE_BOOL(prof_gdump) + CONF_HANDLE_BOOL(prof_leak) +#endif +#ifdef JEMALLOC_SWAP + CONF_HANDLE_BOOL(overcommit) +#endif + malloc_conf_error("Invalid conf pair", k, klen, v, + vlen); +#undef CONF_HANDLE_BOOL +#undef CONF_HANDLE_SIZE_T +#undef CONF_HANDLE_SSIZE_T +#undef CONF_HANDLE_CHAR_P + } + + /* Validate configuration of options that are inter-related. */ + if (opt_lg_qspace_max+1 >= opt_lg_cspace_max) { + malloc_write(": Invalid lg_[qc]space_max " + "relationship; restoring defaults\n"); + opt_lg_qspace_max = LG_QSPACE_MAX_DEFAULT; + opt_lg_cspace_max = LG_CSPACE_MAX_DEFAULT; + } + } +} + +static bool +malloc_init_hard(void) +{ arena_t *init_arenas[1]; malloc_mutex_lock(&init_lock); @@ -347,287 +623,11 @@ malloc_init_hard(void) } #endif - for (i = 0; i < 3; i++) { - unsigned j; - - /* Get runtime configuration. */ - switch (i) { - case 0: - if ((linklen = readlink("/etc/jemalloc.conf", buf, - sizeof(buf) - 1)) != -1) { - /* - * Use the contents of the "/etc/jemalloc.conf" - * symbolic link's name. - */ - buf[linklen] = '\0'; - opts = buf; - } else { - /* No configuration specified. */ - buf[0] = '\0'; - opts = buf; - } - break; - case 1: - if ((opts = getenv("JEMALLOC_OPTIONS")) != NULL) { - /* - * Do nothing; opts is already initialized to - * the value of the JEMALLOC_OPTIONS - * environment variable. - */ - } else { - /* No configuration specified. */ - buf[0] = '\0'; - opts = buf; - } - break; - case 2: - if (JEMALLOC_P(malloc_options) != NULL) { - /* - * Use options that were compiled into the - * program. - */ - opts = JEMALLOC_P(malloc_options); - } else { - /* No configuration specified. */ - buf[0] = '\0'; - opts = buf; - } - break; - default: - /* NOTREACHED */ - assert(false); - buf[0] = '\0'; - opts = buf; - } - - for (j = 0; opts[j] != '\0'; j++) { - unsigned k, nreps; - bool nseen; - - /* Parse repetition count, if any. 
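The conf-string parser above accepts comma-separated key:value pairs from three sources, parsed in increasing priority: the compile-time malloc_conf string, the /etc/malloc.conf symbolic link, and the MALLOC_CONF environment variable (the latter two gain the configured prefix when JEMALLOC_PREFIX is set). A minimal sketch of the compile-time variant, assuming no JEMALLOC_PREFIX so the symbol keeps its plain name:

    /* Compile-time defaults; /etc/malloc.conf and MALLOC_CONF can still override. */
    const char *malloc_conf = "narenas:4,lg_chunk:22,stats_print:true";
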
*/ - for (nreps = 0, nseen = false;; j++, nseen = true) { - switch (opts[j]) { - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': - nreps *= 10; - nreps += opts[j] - '0'; - break; - default: - goto MALLOC_OUT; - } - } -MALLOC_OUT: - if (nseen == false) - nreps = 1; - - for (k = 0; k < nreps; k++) { - switch (opts[j]) { - case 'a': - opt_abort = false; - break; - case 'A': - opt_abort = true; - break; #ifdef JEMALLOC_PROF - case 'b': - if (opt_lg_prof_bt_max > 0) - opt_lg_prof_bt_max--; - break; - case 'B': - if (opt_lg_prof_bt_max < LG_PROF_BT_MAX) - opt_lg_prof_bt_max++; - break; + prof_boot0(); #endif - case 'c': - if (opt_lg_cspace_max - 1 > - opt_lg_qspace_max && - opt_lg_cspace_max > - LG_CACHELINE) - opt_lg_cspace_max--; - break; - case 'C': - if (opt_lg_cspace_max < PAGE_SHIFT - - 1) - opt_lg_cspace_max++; - break; - case 'd': - if (opt_lg_dirty_mult + 1 < - (sizeof(size_t) << 3)) - opt_lg_dirty_mult++; - break; - case 'D': - if (opt_lg_dirty_mult >= 0) - opt_lg_dirty_mult--; - break; -#ifdef JEMALLOC_PROF - case 'e': - opt_prof_active = false; - break; - case 'E': - opt_prof_active = true; - break; - case 'f': - opt_prof = false; - break; - case 'F': - opt_prof = true; - break; -#endif -#ifdef JEMALLOC_TCACHE - case 'g': - if (opt_lg_tcache_gc_sweep >= 0) - opt_lg_tcache_gc_sweep--; - break; - case 'G': - if (opt_lg_tcache_gc_sweep + 1 < - (sizeof(size_t) << 3)) - opt_lg_tcache_gc_sweep++; - break; - case 'h': - opt_tcache = false; - break; - case 'H': - opt_tcache = true; - break; -#endif -#ifdef JEMALLOC_PROF - case 'i': - if (opt_lg_prof_interval >= 0) - opt_lg_prof_interval--; - break; - case 'I': - if (opt_lg_prof_interval + 1 < - (sizeof(uint64_t) << 3)) - opt_lg_prof_interval++; - break; -#endif -#ifdef JEMALLOC_FILL - case 'j': - opt_junk = false; - break; - case 'J': - opt_junk = true; - break; -#endif - case 'k': - /* - * Chunks always require at least one - * header page, plus one data page. 
- */ - if ((1U << (opt_lg_chunk - 1)) >= - (2U << PAGE_SHIFT)) - opt_lg_chunk--; - break; - case 'K': - if (opt_lg_chunk + 1 < - (sizeof(size_t) << 3)) - opt_lg_chunk++; - break; -#ifdef JEMALLOC_PROF - case 'l': - opt_prof_leak = false; - break; - case 'L': - opt_prof_leak = true; - break; -#endif -#ifdef JEMALLOC_TCACHE - case 'm': - if (opt_lg_tcache_maxclass >= 0) - opt_lg_tcache_maxclass--; - break; - case 'M': - if (opt_lg_tcache_maxclass + 1 < - (sizeof(size_t) << 3)) - opt_lg_tcache_maxclass++; - break; -#endif - case 'n': - opt_narenas_lshift--; - break; - case 'N': - opt_narenas_lshift++; - break; -#ifdef JEMALLOC_SWAP - case 'o': - opt_overcommit = false; - break; - case 'O': - opt_overcommit = true; - break; -#endif - case 'p': - opt_stats_print = false; - break; - case 'P': - opt_stats_print = true; - break; - case 'q': - if (opt_lg_qspace_max > LG_QUANTUM) - opt_lg_qspace_max--; - break; - case 'Q': - if (opt_lg_qspace_max + 1 < - opt_lg_cspace_max) - opt_lg_qspace_max++; - break; -#ifdef JEMALLOC_PROF - case 's': - if (opt_lg_prof_sample > 0) - opt_lg_prof_sample--; - break; - case 'S': - if (opt_lg_prof_sample + 1 < - (sizeof(uint64_t) << 3)) - opt_lg_prof_sample++; - break; - case 'u': - opt_prof_udump = false; - break; - case 'U': - opt_prof_udump = true; - break; -#endif -#ifdef JEMALLOC_SYSV - case 'v': - opt_sysv = false; - break; - case 'V': - opt_sysv = true; - break; -#endif -#ifdef JEMALLOC_XMALLOC - case 'x': - opt_xmalloc = false; - break; - case 'X': - opt_xmalloc = true; - break; -#endif -#ifdef JEMALLOC_FILL - case 'z': - opt_zero = false; - break; - case 'Z': - opt_zero = true; - break; -#endif - default: { - char cbuf[2]; - cbuf[0] = opts[j]; - cbuf[1] = '\0'; - malloc_write( - ": Unsupported character " - "in malloc options: '"); - malloc_write(cbuf); - malloc_write("'\n"); - } - } - } - } - } + malloc_conf_init(); /* Register fork handlers. */ if (pthread_atfork(jemalloc_prefork, jemalloc_postfork, @@ -662,7 +662,7 @@ MALLOC_OUT: } #ifdef JEMALLOC_PROF - prof_boot0(); + prof_boot1(); #endif if (arena_boot()) { @@ -679,6 +679,15 @@ MALLOC_OUT: return (true); } +#if (defined(JEMALLOC_STATS) && defined(NO_TLS)) + /* Initialize allocation counters before any allocations can occur. */ + if (pthread_key_create(&thread_allocated_tsd, thread_allocated_cleanup) + != 0) { + malloc_mutex_unlock(&init_lock); + return (true); + } +#endif + /* * Create enough scaffolding to allow recursive allocation in * malloc_ncpus(). @@ -697,19 +706,17 @@ MALLOC_OUT: return (true); } -#ifndef NO_TLS /* * Assign the initial arena to the initial thread, in order to avoid * spurious creation of an extra arena if the application switches to * threaded mode. */ - arenas_map = arenas[0]; -#endif + ARENA_SET(arenas[0]); malloc_mutex_init(&arenas_lock); #ifdef JEMALLOC_PROF - if (prof_boot1()) { + if (prof_boot2()) { malloc_mutex_unlock(&init_lock); return (true); } @@ -721,64 +728,40 @@ MALLOC_OUT: ncpus = malloc_ncpus(); malloc_mutex_lock(&init_lock); - if (ncpus > 1) { + if (opt_narenas == 0) { /* * For SMP systems, create more than one arena per CPU by * default. */ - opt_narenas_lshift += 2; + if (ncpus > 1) + opt_narenas = ncpus << 2; + else + opt_narenas = 1; + } + narenas = opt_narenas; + /* + * Make sure that the arenas array can be allocated. In practice, this + * limit is enough to allow the allocator to function, but the ctl + * machinery will fail to allocate memory at far lower limits. 
+ */ + if (narenas > chunksize / sizeof(arena_t *)) { + char buf[UMAX2S_BUFSIZE]; + + narenas = chunksize / sizeof(arena_t *); + malloc_write(": Reducing narenas to limit ("); + malloc_write(u2s(narenas, 10, buf)); + malloc_write(")\n"); } - /* Determine how many arenas to use. */ - narenas = ncpus; - if (opt_narenas_lshift > 0) { - if ((narenas << opt_narenas_lshift) > narenas) - narenas <<= opt_narenas_lshift; - /* - * Make sure not to exceed the limits of what base_alloc() can - * handle. - */ - if (narenas * sizeof(arena_t *) > chunksize) - narenas = chunksize / sizeof(arena_t *); - } else if (opt_narenas_lshift < 0) { - if ((narenas >> -opt_narenas_lshift) < narenas) - narenas >>= -opt_narenas_lshift; - /* Make sure there is at least one arena. */ - if (narenas == 0) - narenas = 1; - } + next_arena = (narenas > 0) ? 1 : 0; #ifdef NO_TLS - if (narenas > 1) { - static const unsigned primes[] = {1, 3, 5, 7, 11, 13, 17, 19, - 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, - 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, - 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, - 223, 227, 229, 233, 239, 241, 251, 257, 263}; - unsigned nprimes, parenas; - - /* - * Pick a prime number of hash arenas that is more than narenas - * so that direct hashing of pthread_self() pointers tends to - * spread allocations evenly among the arenas. - */ - assert((narenas & 1) == 0); /* narenas must be even. */ - nprimes = (sizeof(primes) >> LG_SIZEOF_INT); - parenas = primes[nprimes - 1]; /* In case not enough primes. */ - for (i = 1; i < nprimes; i++) { - if (primes[i] > narenas) { - parenas = primes[i]; - break; - } - } - narenas = parenas; + if (pthread_key_create(&arenas_tsd, NULL) != 0) { + malloc_mutex_unlock(&init_lock); + return (true); } #endif -#ifndef NO_TLS - next_arena = (narenas > 0) ? 1 : 0; -#endif - /* Allocate and initialize arenas. */ arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas); if (arenas == NULL) { @@ -793,11 +776,35 @@ MALLOC_OUT: /* Copy the pointer to the one arena that was already initialized. */ arenas[0] = init_arenas[0]; +#ifdef JEMALLOC_ZONE + /* Register the custom zone. */ + malloc_zone_register(create_zone()); + + /* + * Convert the default szone to an "overlay zone" that is capable of + * deallocating szone-allocated objects, but allocating new objects + * from jemalloc. + */ + szone2ozone(malloc_default_zone()); +#endif + malloc_initialized = true; malloc_mutex_unlock(&init_lock); return (false); } + +#ifdef JEMALLOC_ZONE +JEMALLOC_ATTR(constructor) +void +jemalloc_darwin_init(void) +{ + + if (malloc_init_hard()) + abort(); +} +#endif + /* * End initialization functions. 
*/ @@ -812,8 +819,19 @@ void * JEMALLOC_P(malloc)(size_t size) { void *ret; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; +#endif #ifdef JEMALLOC_PROF - prof_thr_cnt_t *cnt; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (malloc_init()) { @@ -843,20 +861,26 @@ JEMALLOC_P(malloc)(size_t size) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) { + usize = s2u(size); + if ((cnt = prof_alloc_prep(usize)) == NULL) { ret = NULL; goto OOM; } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { ret = imalloc(small_maxclass+1); if (ret != NULL) - arena_prof_promoted(ret, size); + arena_prof_promoted(ret, usize); } else ret = imalloc(size); } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(size); #endif ret = imalloc(size); + } OOM: if (ret == NULL) { @@ -875,7 +899,13 @@ RETURN: #endif #ifdef JEMALLOC_PROF if (opt_prof && ret != NULL) - prof_malloc(ret, cnt); + prof_malloc(ret, usize, cnt); +#endif +#ifdef JEMALLOC_STATS + if (ret != NULL) { + assert(usize == isalloc(ret)); + ALLOCATED_ADD(usize, 0); + } #endif return (ret); } @@ -887,8 +917,19 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) { int ret; void *result; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; +#endif #ifdef JEMALLOC_PROF - prof_thr_cnt_t *cnt; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (malloc_init()) @@ -934,24 +975,32 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) { + usize = sa2u(size, alignment, NULL); + if ((cnt = prof_alloc_prep(usize)) == NULL) { result = NULL; ret = EINVAL; } else { if (prof_promote && (uintptr_t)cnt != - (uintptr_t)1U && size <= small_maxclass) { - result = ipalloc(alignment, - small_maxclass+1); + (uintptr_t)1U && usize <= small_maxclass) { + result = ipalloc(small_maxclass+1, + alignment, false); if (result != NULL) { arena_prof_promoted(result, - size); + usize); } - } else - result = ipalloc(alignment, size); + } else { + result = ipalloc(size, alignment, + false); + } } } else #endif - result = ipalloc(alignment, size); + { +#ifdef JEMALLOC_STATS + usize = sa2u(size, alignment, NULL); +#endif + result = ipalloc(size, alignment, false); + } } if (result == NULL) { @@ -970,9 +1019,15 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) ret = 0; RETURN: +#ifdef JEMALLOC_STATS + if (result != NULL) { + assert(usize == isalloc(result)); + ALLOCATED_ADD(usize, 0); + } +#endif #ifdef JEMALLOC_PROF if (opt_prof && result != NULL) - prof_malloc(result, cnt); + prof_malloc(result, usize, cnt); #endif return (ret); } @@ -984,8 +1039,19 @@ JEMALLOC_P(calloc)(size_t num, size_t size) { void *ret; size_t num_size; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; +#endif #ifdef JEMALLOC_PROF - prof_thr_cnt_t *cnt; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (malloc_init()) { @@ -1020,20 +1086,26 @@ JEMALLOC_P(calloc)(size_t num, size_t size) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(num_size)) == NULL) { + usize = s2u(num_size); + if ((cnt = prof_alloc_prep(usize)) == NULL) 
{ ret = NULL; goto RETURN; } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && num_size + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { ret = icalloc(small_maxclass+1); if (ret != NULL) - arena_prof_promoted(ret, num_size); + arena_prof_promoted(ret, usize); } else ret = icalloc(num_size); } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(num_size); #endif ret = icalloc(num_size); + } RETURN: if (ret == NULL) { @@ -1049,7 +1121,13 @@ RETURN: #ifdef JEMALLOC_PROF if (opt_prof && ret != NULL) - prof_malloc(ret, cnt); + prof_malloc(ret, usize, cnt); +#endif +#ifdef JEMALLOC_STATS + if (ret != NULL) { + assert(usize == isalloc(ret)); + ALLOCATED_ADD(usize, 0); + } #endif return (ret); } @@ -1059,10 +1137,25 @@ void * JEMALLOC_P(realloc)(void *ptr, size_t size) { void *ret; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; + size_t old_size = 0; +#endif #ifdef JEMALLOC_PROF - size_t old_size; - prof_thr_cnt_t *cnt; - prof_ctx_t *old_ctx; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; + prof_ctx_t *old_ctx +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (size == 0) { @@ -1073,9 +1166,11 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) #ifdef JEMALLOC_SYSV else { if (ptr != NULL) { +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + old_size = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF if (opt_prof) { - old_size = isalloc(ptr); old_ctx = prof_ctx_get(ptr); cnt = NULL; } @@ -1084,7 +1179,6 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) } #ifdef JEMALLOC_PROF else if (opt_prof) { - old_size = 0; old_ctx = NULL; cnt = NULL; } @@ -1099,24 +1193,33 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) assert(malloc_initialized || malloc_initializer == pthread_self()); +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + old_size = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF if (opt_prof) { - old_size = isalloc(ptr); + usize = s2u(size); old_ctx = prof_ctx_get(ptr); - if ((cnt = prof_alloc_prep(size)) == NULL) { + if ((cnt = prof_alloc_prep(usize)) == NULL) { ret = NULL; goto OOM; } if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && - size <= small_maxclass) { - ret = iralloc(ptr, small_maxclass+1); + usize <= small_maxclass) { + ret = iralloc(ptr, small_maxclass+1, 0, 0, + false, false); if (ret != NULL) - arena_prof_promoted(ret, size); + arena_prof_promoted(ret, usize); } else - ret = iralloc(ptr, size); + ret = iralloc(ptr, size, 0, 0, false, false); } else #endif - ret = iralloc(ptr, size); + { +#ifdef JEMALLOC_STATS + usize = s2u(size); +#endif + ret = iralloc(ptr, size, 0, 0, false, false); + } #ifdef JEMALLOC_PROF OOM: @@ -1133,10 +1236,8 @@ OOM: } } else { #ifdef JEMALLOC_PROF - if (opt_prof) { - old_size = 0; + if (opt_prof) old_ctx = NULL; - } #endif if (malloc_init()) { #ifdef JEMALLOC_PROF @@ -1147,23 +1248,29 @@ OOM: } else { #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) + usize = s2u(size); + if ((cnt = prof_alloc_prep(usize)) == NULL) ret = NULL; else { if (prof_promote && (uintptr_t)cnt != - (uintptr_t)1U && size <= + (uintptr_t)1U && usize <= small_maxclass) { ret = imalloc(small_maxclass+1); if (ret != NULL) { arena_prof_promoted(ret, - size); + usize); } } else ret = imalloc(size); } } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(size); #endif ret = imalloc(size); + } } if (ret == NULL) { @@ -1183,7 +1290,13 @@ RETURN: #endif #ifdef JEMALLOC_PROF if (opt_prof) - 
prof_realloc(ret, cnt, ptr, old_size, old_ctx); + prof_realloc(ret, usize, cnt, old_size, old_ctx); +#endif +#ifdef JEMALLOC_STATS + if (ret != NULL) { + assert(usize == isalloc(ret)); + ALLOCATED_ADD(usize, old_size); + } #endif return (ret); } @@ -1194,12 +1307,26 @@ JEMALLOC_P(free)(void *ptr) { if (ptr != NULL) { +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize; +#endif + assert(malloc_initialized || malloc_initializer == pthread_self()); +#ifdef JEMALLOC_STATS + usize = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF - if (opt_prof) - prof_free(ptr); + if (opt_prof) { +# ifndef JEMALLOC_STATS + usize = isalloc(ptr); +# endif + prof_free(ptr, usize); + } +#endif +#ifdef JEMALLOC_STATS + ALLOCATED_ADD(0, usize); #endif idalloc(ptr); } @@ -1209,6 +1336,57 @@ JEMALLOC_P(free)(void *ptr) * End malloc(3)-compatible functions. */ /******************************************************************************/ +/* + * Begin non-standard override functions. + * + * These overrides are omitted if the JEMALLOC_PREFIX is defined, since the + * entire point is to avoid accidental mixed allocator usage. + */ +#ifndef JEMALLOC_PREFIX + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_ATTR(malloc) +JEMALLOC_ATTR(visibility("default")) +void * +JEMALLOC_P(memalign)(size_t alignment, size_t size) +{ + void *ret; +#ifdef JEMALLOC_CC_SILENCE + int result = +#endif + JEMALLOC_P(posix_memalign)(&ret, alignment, size); +#ifdef JEMALLOC_CC_SILENCE + if (result != 0) + return (NULL); +#endif + return (ret); +} +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_ATTR(malloc) +JEMALLOC_ATTR(visibility("default")) +void * +JEMALLOC_P(valloc)(size_t size) +{ + void *ret; +#ifdef JEMALLOC_CC_SILENCE + int result = +#endif + JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size); +#ifdef JEMALLOC_CC_SILENCE + if (result != 0) + return (NULL); +#endif + return (ret); +} +#endif + +#endif /* JEMALLOC_PREFIX */ +/* + * End non-standard override functions. + */ +/******************************************************************************/ /* * Begin non-standard functions. */ @@ -1219,29 +1397,18 @@ JEMALLOC_P(malloc_usable_size)(const void *ptr) { size_t ret; + assert(malloc_initialized || malloc_initializer == pthread_self()); + +#ifdef JEMALLOC_IVSALLOC + ret = ivsalloc(ptr); +#else assert(ptr != NULL); ret = isalloc(ptr); +#endif return (ret); } -#ifdef JEMALLOC_SWAP -JEMALLOC_ATTR(visibility("default")) -int -JEMALLOC_P(malloc_swap_enable)(const int *fds, unsigned nfds, int prezeroed) -{ - - /* - * Make sure malloc is initialized, because we need page size, chunk - * size, etc. - */ - if (malloc_init()) - return (-1); - - return (chunk_swap_enable(fds, nfds, (prezeroed != 0)) ? 
-1 : 0); -} -#endif - JEMALLOC_ATTR(visibility("default")) void JEMALLOC_P(malloc_stats_print)(void (*write_cb)(void *, const char *), @@ -1286,6 +1453,247 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); } +JEMALLOC_INLINE void * +iallocm(size_t size, size_t alignment, bool zero) +{ + + if (alignment != 0) + return (ipalloc(size, alignment, zero)); + else if (zero) + return (icalloc(size)); + else + return (imalloc(size)); +} + +JEMALLOC_ATTR(nonnull(1)) +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) +{ + void *p; + size_t usize; + size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + & (SIZE_T_MAX-1)); + bool zero = flags & ALLOCM_ZERO; +#ifdef JEMALLOC_PROF + prof_thr_cnt_t *cnt; +#endif + + assert(ptr != NULL); + assert(size != 0); + + if (malloc_init()) + goto OOM; + +#ifdef JEMALLOC_PROF + if (opt_prof) { + usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, + NULL); + if ((cnt = prof_alloc_prep(usize)) == NULL) + goto OOM; + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= + small_maxclass) { + p = iallocm(small_maxclass+1, alignment, zero); + if (p == NULL) + goto OOM; + arena_prof_promoted(p, usize); + } else { + p = iallocm(size, alignment, zero); + if (p == NULL) + goto OOM; + } + + if (rsize != NULL) + *rsize = usize; + } else +#endif + { + p = iallocm(size, alignment, zero); + if (p == NULL) + goto OOM; +#ifndef JEMALLOC_STATS + if (rsize != NULL) +#endif + { + usize = (alignment == 0) ? s2u(size) : sa2u(size, + alignment, NULL); +#ifdef JEMALLOC_STATS + if (rsize != NULL) +#endif + *rsize = usize; + } + } + + *ptr = p; +#ifdef JEMALLOC_STATS + assert(usize == isalloc(p)); + ALLOCATED_ADD(usize, 0); +#endif + return (ALLOCM_SUCCESS); +OOM: +#ifdef JEMALLOC_XMALLOC + if (opt_xmalloc) { + malloc_write(": Error in allocm(): " + "out of memory\n"); + abort(); + } +#endif + *ptr = NULL; + return (ALLOCM_ERR_OOM); +} + +JEMALLOC_ATTR(nonnull(1)) +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, + int flags) +{ + void *p, *q; + size_t usize; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t old_size; +#endif + size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + & (SIZE_T_MAX-1)); + bool zero = flags & ALLOCM_ZERO; + bool no_move = flags & ALLOCM_NO_MOVE; +#ifdef JEMALLOC_PROF + prof_thr_cnt_t *cnt; + prof_ctx_t *old_ctx; +#endif + + assert(ptr != NULL); + assert(*ptr != NULL); + assert(size != 0); + assert(SIZE_T_MAX - size >= extra); + assert(malloc_initialized || malloc_initializer == pthread_self()); + + p = *ptr; +#ifdef JEMALLOC_PROF + if (opt_prof) { + /* + * usize isn't knowable before iralloc() returns when extra is + * non-zero. Therefore, compute its maximum possible value and + * use that in prof_alloc_prep() to decide whether to capture a + * backtrace. prof_realloc() will use the actual usize to + * decide whether to sample. + */ + size_t max_usize = (alignment == 0) ? s2u(size+extra) : + sa2u(size+extra, alignment, NULL); + old_size = isalloc(p); + old_ctx = prof_ctx_get(p); + if ((cnt = prof_alloc_prep(max_usize)) == NULL) + goto OOM; + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && max_usize + <= small_maxclass) { + q = iralloc(p, small_maxclass+1, (small_maxclass+1 >= + size+extra) ? 
0 : size+extra - (small_maxclass+1), + alignment, zero, no_move); + if (q == NULL) + goto ERR; + usize = isalloc(q); + arena_prof_promoted(q, usize); + } else { + q = iralloc(p, size, extra, alignment, zero, no_move); + if (q == NULL) + goto ERR; + usize = isalloc(q); + } + prof_realloc(q, usize, cnt, old_size, old_ctx); + } else +#endif + { +#ifdef JEMALLOC_STATS + old_size = isalloc(p); +#endif + q = iralloc(p, size, extra, alignment, zero, no_move); + if (q == NULL) + goto ERR; +#ifndef JEMALLOC_STATS + if (rsize != NULL) +#endif + { + usize = isalloc(q); +#ifdef JEMALLOC_STATS + if (rsize != NULL) +#endif + *rsize = usize; + } + } + + *ptr = q; +#ifdef JEMALLOC_STATS + ALLOCATED_ADD(usize, old_size); +#endif + return (ALLOCM_SUCCESS); +ERR: + if (no_move) + return (ALLOCM_ERR_NOT_MOVED); +#ifdef JEMALLOC_PROF +OOM: +#endif +#ifdef JEMALLOC_XMALLOC + if (opt_xmalloc) { + malloc_write(": Error in rallocm(): " + "out of memory\n"); + abort(); + } +#endif + return (ALLOCM_ERR_OOM); +} + +JEMALLOC_ATTR(nonnull(1)) +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) +{ + size_t sz; + + assert(malloc_initialized || malloc_initializer == pthread_self()); + +#ifdef JEMALLOC_IVSALLOC + sz = ivsalloc(ptr); +#else + assert(ptr != NULL); + sz = isalloc(ptr); +#endif + assert(rsize != NULL); + *rsize = sz; + + return (ALLOCM_SUCCESS); +} + +JEMALLOC_ATTR(nonnull(1)) +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(dallocm)(void *ptr, int flags) +{ +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize; +#endif + + assert(ptr != NULL); + assert(malloc_initialized || malloc_initializer == pthread_self()); + +#ifdef JEMALLOC_STATS + usize = isalloc(ptr); +#endif +#ifdef JEMALLOC_PROF + if (opt_prof) { +# ifndef JEMALLOC_STATS + usize = isalloc(ptr); +# endif + prof_free(ptr, usize); + } +#endif +#ifdef JEMALLOC_STATS + ALLOCATED_ADD(0, usize); +#endif + idalloc(ptr); + + return (ALLOCM_SUCCESS); +} + /* * End non-standard functions. */ @@ -1293,12 +1701,10 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, /* * The following functions are used by threading libraries for protection of - * malloc during fork(). These functions are only called if the program is - * running in threaded mode, so there is no need to check whether the program - * is threaded here. + * malloc during fork(). 
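The experimental *allocm() entry points implemented above can be exercised as in the following minimal sketch (assuming an unprefixed build, so the functions keep their plain names; the demo function itself is hypothetical):

    #include <stddef.h>
    #include <jemalloc/jemalloc.h>

    static int
    grow_in_place_demo(void)
    {
        void *p;
        size_t usable;

        /* Zeroed allocation of at least 4096 bytes; usable size is reported. */
        if (allocm(&p, &usable, 4096, ALLOCM_ZERO) != ALLOCM_SUCCESS)
            return (1);

        /*
         * Ask for 8192 bytes plus up to 4096 optional extra bytes, but refuse
         * to move the object.  On failure the allocation is left untouched.
         */
        if (rallocm(&p, &usable, 8192, 4096, ALLOCM_NO_MOVE) ==
            ALLOCM_ERR_NOT_MOVED) {
            /* Still usable at its original size. */
        }

        dallocm(p, 0);
        return (0);
    }
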
*/ -static void +void jemalloc_prefork(void) { unsigned i; @@ -1324,7 +1730,7 @@ jemalloc_prefork(void) #endif } -static void +void jemalloc_postfork(void) { unsigned i; @@ -1349,3 +1755,5 @@ jemalloc_postfork(void) } malloc_mutex_unlock(&arenas_lock); } + +/******************************************************************************/ diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c index 3b6081a4..3ecb18a3 100644 --- a/jemalloc/src/mutex.c +++ b/jemalloc/src/mutex.c @@ -59,7 +59,11 @@ malloc_mutex_init(malloc_mutex_t *mutex) if (pthread_mutexattr_init(&attr) != 0) return (true); +#ifdef PTHREAD_MUTEX_ADAPTIVE_NP pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); +#else + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT); +#endif if (pthread_mutex_init(mutex, &attr) != 0) { pthread_mutexattr_destroy(&attr); return (true); @@ -68,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex) return (false); } + +void +malloc_mutex_destroy(malloc_mutex_t *mutex) +{ + + if (pthread_mutex_destroy(mutex) != 0) { + malloc_write(": Error in pthread_mutex_destroy()\n"); + abort(); + } +} diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 6d6910ed..84ce1ba0 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -12,8 +12,6 @@ #include #endif -#include - /******************************************************************************/ /* Data. */ @@ -22,48 +20,30 @@ bool opt_prof_active = true; size_t opt_lg_prof_bt_max = LG_PROF_BT_MAX_DEFAULT; size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; -bool opt_prof_udump = false; +bool opt_prof_gdump = false; bool opt_prof_leak = false; +bool opt_prof_accum = true; +ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; +char opt_prof_prefix[PATH_MAX + 1]; uint64_t prof_interval; bool prof_promote; +unsigned prof_bt_max; + +#ifndef NO_TLS +__thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +#endif +pthread_key_t prof_tdata_tsd; + /* * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data - * structure that knows about all backtraces ever captured. + * structure that knows about all backtraces currently captured. */ static ckh_t bt2ctx; static malloc_mutex_t bt2ctx_mtx; -/* - * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread - * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. Other threads may read the prof_thr_cnt_t contents, but no others - * will ever write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data - * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t - * objects. - */ -static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec")); - -/* - * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that bt2cnt_tls contents can be merged, - * unlinked, and deallocated. - */ -static pthread_key_t bt2cnt_tsd; - -/* (1U << opt_lg_prof_bt_max). 
*/ -static unsigned prof_bt_max; - -static __thread uint64_t prof_sample_prn_state - JEMALLOC_ATTR(tls_model("initial-exec")); -static __thread uint64_t prof_sample_threshold - JEMALLOC_ATTR(tls_model("initial-exec")); -static __thread uint64_t prof_sample_accum - JEMALLOC_ATTR(tls_model("initial-exec")); - static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; static uint64_t prof_dump_iseq; @@ -85,26 +65,25 @@ static bool prof_booted = false; static malloc_mutex_t enq_mtx; static bool enq; static bool enq_idump; -static bool enq_udump; +static bool enq_gdump; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ static prof_bt_t *bt_dup(prof_bt_t *bt); -static void bt_init(prof_bt_t *bt, void **vec); +static void bt_destroy(prof_bt_t *bt); #ifdef JEMALLOC_PROF_LIBGCC static _Unwind_Reason_Code prof_unwind_init_callback( struct _Unwind_Context *context, void *arg); static _Unwind_Reason_Code prof_unwind_callback( struct _Unwind_Context *context, void *arg); #endif -static void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); -static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); -static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); static bool prof_flush(bool propagate_err); static bool prof_write(const char *s, bool propagate_err); -static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, +static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx); +static void prof_ctx_destroy(prof_ctx_t *ctx); +static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt); static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err); static bool prof_dump_maps(bool propagate_err); @@ -115,11 +94,11 @@ static void prof_fdump(void); static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); -static void bt2cnt_thread_cleanup(void *arg); +static void prof_tdata_cleanup(void *arg); /******************************************************************************/ -static void +void bt_init(prof_bt_t *bt, void **vec) { @@ -127,6 +106,13 @@ bt_init(prof_bt_t *bt, void **vec) bt->len = 0; } +static void +bt_destroy(prof_bt_t *bt) +{ + + idalloc(bt); +} + static prof_bt_t * bt_dup(prof_bt_t *bt) { @@ -165,7 +151,7 @@ prof_enter(void) static inline void prof_leave(void) { - bool idump, udump; + bool idump, gdump; malloc_mutex_unlock(&bt2ctx_mtx); @@ -173,14 +159,14 @@ prof_leave(void) enq = false; idump = enq_idump; enq_idump = false; - udump = enq_udump; - enq_udump = false; + gdump = enq_gdump; + enq_gdump = false; malloc_mutex_unlock(&enq_mtx); if (idump) prof_idump(); - if (udump) - prof_udump(); + if (gdump) + prof_gdump(); } #ifdef JEMALLOC_PROF_LIBGCC @@ -208,7 +194,7 @@ prof_unwind_callback(struct _Unwind_Context *context, void *arg) return (_URC_NO_REASON); } -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { prof_unwind_data_t data = {bt, nignore, max}; @@ -216,7 +202,7 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) _Unwind_Backtrace(prof_unwind_callback, &data); } #elif defined(JEMALLOC_PROF_LIBUNWIND) -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { unw_context_t uc; @@ -251,41 +237,29 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) } } #else -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { -#define NIGNORE 3 
#define BT_FRAME(i) \ - if ((i) < NIGNORE + max) { \ + if ((i) < nignore + max) { \ void *p; \ if (__builtin_frame_address(i) == 0) \ return; \ p = __builtin_return_address(i); \ if (p == NULL) \ return; \ - if (i >= NIGNORE) { \ - bt->vec[(i) - NIGNORE] = p; \ - bt->len = (i) - NIGNORE + 1; \ + if (i >= nignore) { \ + bt->vec[(i) - nignore] = p; \ + bt->len = (i) - nignore + 1; \ } \ } else \ return; assert(max <= (1U << opt_lg_prof_bt_max)); - /* - * Ignore the first three frames, since they are: - * - * 0: prof_backtrace() - * 1: prof_alloc_prep() - * 2: malloc(), calloc(), etc. - */ -#if 1 - assert(nignore + 1 == NIGNORE); -#else BT_FRAME(0) BT_FRAME(1) BT_FRAME(2) -#endif BT_FRAME(3) BT_FRAME(4) BT_FRAME(5) @@ -432,345 +406,119 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) } #endif -static prof_thr_cnt_t * +prof_thr_cnt_t * prof_lookup(prof_bt_t *bt) { - prof_thr_cnt_t *ret; - ckh_t *bt2cnt = bt2cnt_tls; + union { + prof_thr_cnt_t *p; + void *v; + } ret; + prof_tdata_t *prof_tdata; - if (bt2cnt == NULL) { - /* Initialize an empty cache for this thread. */ - bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t)); - if (bt2cnt == NULL) + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) return (NULL); - if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash, - prof_bt_keycomp)) { - idalloc(bt2cnt); - return (NULL); - } - bt2cnt_tls = bt2cnt; - pthread_setspecific(bt2cnt_tsd, bt2cnt); } - if (ckh_search(bt2cnt, bt, NULL, (void **)&ret)) { - prof_bt_t *btkey; - prof_ctx_t *ctx; + if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { + union { + prof_bt_t *p; + void *v; + } btkey; + union { + prof_ctx_t *p; + void *v; + } ctx; /* * This thread's cache lacks bt. Look for it in the global * cache. */ prof_enter(); - if (ckh_search(&bt2ctx, bt, (void **)&btkey, (void **)&ctx)) { - + if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { /* bt has never been seen before. Insert it. */ - ctx = (prof_ctx_t *)imalloc(sizeof(prof_ctx_t)); - if (ctx == NULL) { + ctx.v = imalloc(sizeof(prof_ctx_t)); + if (ctx.v == NULL) { prof_leave(); return (NULL); } - btkey = bt_dup(bt); - if (btkey == NULL) { + btkey.p = bt_dup(bt); + if (btkey.v == NULL) { prof_leave(); - idalloc(ctx); + idalloc(ctx.v); return (NULL); } - ctx->bt = btkey; - if (malloc_mutex_init(&ctx->lock)) { + ctx.p->bt = btkey.p; + if (malloc_mutex_init(&ctx.p->lock)) { prof_leave(); - idalloc(btkey); - idalloc(ctx); + idalloc(btkey.v); + idalloc(ctx.v); return (NULL); } - memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); - ql_new(&ctx->cnts_ql); - if (ckh_insert(&bt2ctx, btkey, ctx)) { + memset(&ctx.p->cnt_merged, 0, sizeof(prof_cnt_t)); + ql_new(&ctx.p->cnts_ql); + if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { /* OOM. */ prof_leave(); - idalloc(btkey); - idalloc(ctx); + malloc_mutex_destroy(&ctx.p->lock); + idalloc(btkey.v); + idalloc(ctx.v); return (NULL); } } + /* + * Acquire ctx's lock before releasing bt2ctx_mtx, in order to + * avoid a race condition with prof_ctx_destroy(). + */ + malloc_mutex_lock(&ctx.p->lock); prof_leave(); /* Link a prof_thd_cnt_t into ctx for this thread. 
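		 * If opt.lg_prof_tcmax bounds the per thread cache and bt2cnt
		 * is already full, the least recently used cnt is flushed into
		 * its ctx via prof_ctx_merge() and recycled; otherwise a new
		 * cnt is allocated.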
*/ - ret = (prof_thr_cnt_t *)imalloc(sizeof(prof_thr_cnt_t)); - if (ret == NULL) - return (NULL); - ql_elm_new(ret, link); - ret->ctx = ctx; - ret->epoch = 0; - memset(&ret->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(bt2cnt, btkey, ret)) { - idalloc(ret); - return (NULL); - } - malloc_mutex_lock(&ctx->lock); - ql_tail_insert(&ctx->cnts_ql, ret, link); - malloc_mutex_unlock(&ctx->lock); - } - - return (ret); -} - -static inline void -prof_sample_threshold_update(void) -{ - uint64_t r; - double u; - - /* - * Compute prof_sample_threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). - */ - prn64(r, 53, prof_sample_prn_state, (uint64_t)1125899906842625LLU, - 1058392653243283975); - u = (double)r * (1.0/9007199254740992.0L); - prof_sample_threshold = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) - + (uint64_t)1U; -} - -prof_thr_cnt_t * -prof_alloc_prep(size_t size) -{ - prof_thr_cnt_t *ret; - void *vec[prof_bt_max]; - prof_bt_t bt; - - if (opt_prof_active == false) { - /* Sampling is currently inactive, so avoid sampling. */ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; - } else if (opt_lg_prof_sample == 0) { - /* - * Don't bother with sampling logic, since sampling interval is - * 1. - */ - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); - } else { - if (prof_sample_threshold == 0) { + if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tdata->bt2cnt) + == (ZU(1) << opt_lg_prof_tcmax)) { + assert(ckh_count(&prof_tdata->bt2cnt) > 0); /* - * Initialize. Seed the prng differently for each - * thread. + * Flush the least recently used cnt in order to keep + * bt2cnt from becoming too large. */ - prof_sample_prn_state = (uint64_t)(uintptr_t)&size; - prof_sample_threshold_update(); - } - - /* - * Determine whether to capture a backtrace based on whether - * size is enough for prof_accum to reach - * prof_sample_threshold. However, delay updating these - * variables until prof_{m,re}alloc(), because we don't know - * for sure that the allocation will succeed. - * - * Use subtraction rather than addition to avoid potential - * integer overflow. - */ - if (size >= prof_sample_threshold - prof_sample_accum) { - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); - } else - ret = (prof_thr_cnt_t *)(uintptr_t)1U; - } - - return (ret); -} - -prof_ctx_t * -prof_ctx_get(const void *ptr) -{ - prof_ctx_t *ret; - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); - - ret = arena_prof_ctx_get(ptr); - } else - ret = huge_prof_ctx_get(ptr); - - return (ret); -} - -static void -prof_ctx_set(const void *ptr, prof_ctx_t *ctx) -{ - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); - - arena_prof_ctx_set(ptr, ctx); - } else - huge_prof_ctx_set(ptr, ctx); -} - -static inline void -prof_sample_accum_update(size_t size) -{ - - /* Sampling logic is unnecessary if the interval is 1. */ - assert(opt_lg_prof_sample != 0); - - /* Take care to avoid integer overflow. */ - if (size >= prof_sample_threshold - prof_sample_accum) { - prof_sample_accum -= (prof_sample_threshold - size); - /* Compute new prof_sample_threshold. 
*/ - prof_sample_threshold_update(); - while (prof_sample_accum >= prof_sample_threshold) { - prof_sample_accum -= prof_sample_threshold; - prof_sample_threshold_update(); - } - } else - prof_sample_accum += size; -} - -void -prof_malloc(const void *ptr, prof_thr_cnt_t *cnt) -{ - size_t size; - - assert(ptr != NULL); - - if (opt_lg_prof_sample != 0) { - size = isalloc(ptr); - prof_sample_accum_update(size); - } else if ((uintptr_t)cnt > (uintptr_t)1U) - size = isalloc(ptr); - - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); -} - -void -prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, - size_t old_size, prof_ctx_t *old_ctx) -{ - size_t size; - prof_thr_cnt_t *told_cnt; - - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); - - if (ptr != NULL) { - if (opt_lg_prof_sample != 0) { - size = isalloc(ptr); - prof_sample_accum_update(size); - } else if ((uintptr_t)cnt > (uintptr_t)1U) - size = isalloc(ptr); - } - - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(old_ctx->bt); - if (told_cnt == NULL) { - /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. - */ - malloc_mutex_lock(&old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_size; - malloc_mutex_unlock(&old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - cnt->epoch++; - } else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_size; - } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. */ -} - -void -prof_free(const void *ptr) -{ - prof_ctx_t *ctx = prof_ctx_get(ptr); - - if ((uintptr_t)ctx > (uintptr_t)1) { - size_t size = isalloc(ptr); - prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ + ret.p = ql_last(&prof_tdata->lru_ql, lru_link); + assert(ret.v != NULL); + ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, NULL, + NULL); + ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); + prof_ctx_merge(ret.p->ctx, ret.p); + /* ret can now be re-used. */ } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(&ctx->lock); + assert(opt_lg_prof_tcmax < 0 || + ckh_count(&prof_tdata->bt2cnt) < (ZU(1) << + opt_lg_prof_tcmax)); + /* Allocate and partially initialize a new cnt. */ + ret.v = imalloc(sizeof(prof_thr_cnt_t)); + if (ret.p == NULL) + return (NULL); + ql_elm_new(ret.p, cnts_link); + ql_elm_new(ret.p, lru_link); } + /* Finish initializing ret. */ + ret.p->ctx = ctx.p; + ret.p->epoch = 0; + memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); + if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) { + idalloc(ret.v); + return (NULL); + } + ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); + ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link); + malloc_mutex_unlock(&ctx.p->lock); + } else { + /* Move ret to the front of the LRU. */ + ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); } + + return (ret.p); } static bool @@ -823,15 +571,15 @@ prof_write(const char *s, bool propagate_err) } static void -prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) +prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) { prof_thr_cnt_t *thr_cnt; prof_cnt_t tcnt; malloc_mutex_lock(&ctx->lock); - memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t)); - ql_foreach(thr_cnt, &ctx->cnts_ql, link) { + memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); + ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) { volatile unsigned *epoch = &thr_cnt->epoch; while (true) { @@ -848,46 +596,108 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) break; } - ctx->cnt_dump.curobjs += tcnt.curobjs; - ctx->cnt_dump.curbytes += tcnt.curbytes; - ctx->cnt_dump.accumobjs += tcnt.accumobjs; - ctx->cnt_dump.accumbytes += tcnt.accumbytes; - - if (tcnt.curobjs != 0) - (*leak_nctx)++; + ctx->cnt_summed.curobjs += tcnt.curobjs; + ctx->cnt_summed.curbytes += tcnt.curbytes; + if (opt_prof_accum) { + ctx->cnt_summed.accumobjs += tcnt.accumobjs; + ctx->cnt_summed.accumbytes += tcnt.accumbytes; + } } - /* Merge into cnt_all. */ - cnt_all->curobjs += ctx->cnt_dump.curobjs; - cnt_all->curbytes += ctx->cnt_dump.curbytes; - cnt_all->accumobjs += ctx->cnt_dump.accumobjs; - cnt_all->accumbytes += ctx->cnt_dump.accumbytes; + if (ctx->cnt_summed.curobjs != 0) + (*leak_nctx)++; + + /* Add to cnt_all. */ + cnt_all->curobjs += ctx->cnt_summed.curobjs; + cnt_all->curbytes += ctx->cnt_summed.curbytes; + if (opt_prof_accum) { + cnt_all->accumobjs += ctx->cnt_summed.accumobjs; + cnt_all->accumbytes += ctx->cnt_summed.accumbytes; + } malloc_mutex_unlock(&ctx->lock); } +static void +prof_ctx_destroy(prof_ctx_t *ctx) +{ + + /* + * Check that ctx is still unused by any thread cache before destroying + * it. prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to + * avoid a race condition with this function. + */ + prof_enter(); + malloc_mutex_lock(&ctx->lock); + if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) { + assert(ctx->cnt_merged.curbytes == 0); + assert(ctx->cnt_merged.accumobjs == 0); + assert(ctx->cnt_merged.accumbytes == 0); + /* Remove ctx from bt2ctx. */ + ckh_remove(&bt2ctx, ctx->bt, NULL, NULL); + prof_leave(); + /* Destroy ctx. 
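+		 * ctx was removed from bt2ctx while both bt2ctx_mtx and
+		 * ctx->lock were held, so prof_lookup() can no longer find it;
+		 * it is now safe to unlock ctx, destroy its lock, and free it.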
*/ + malloc_mutex_unlock(&ctx->lock); + bt_destroy(ctx->bt); + malloc_mutex_destroy(&ctx->lock); + idalloc(ctx); + } else { + malloc_mutex_unlock(&ctx->lock); + prof_leave(); + } +} + +static void +prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +{ + bool destroy; + + /* Merge cnt stats and detach from ctx. */ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs += cnt->cnts.curobjs; + ctx->cnt_merged.curbytes += cnt->cnts.curbytes; + ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; + ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; + ql_remove(&ctx->cnts_ql, cnt, cnts_link); + if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL && + ctx->cnt_merged.curobjs == 0) + destroy = true; + else + destroy = false; + malloc_mutex_unlock(&ctx->lock); + if (destroy) + prof_ctx_destroy(ctx); +} + static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err) { char buf[UMAX2S_BUFSIZE]; unsigned i; - if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err) + if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) { + assert(ctx->cnt_summed.curbytes == 0); + assert(ctx->cnt_summed.accumobjs == 0); + assert(ctx->cnt_summed.accumbytes == 0); + return (false); + } + + if (prof_write(u2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf), + || prof_write(u2s(ctx->cnt_summed.curbytes, 10, buf), propagate_err) || prof_write(" [", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf), + || prof_write(u2s(ctx->cnt_summed.accumobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf), + || prof_write(u2s(ctx->cnt_summed.accumbytes, 10, buf), propagate_err) || prof_write("] @", propagate_err)) return (true); for (i = 0; i < bt->len; i++) { if (prof_write(" 0x", propagate_err) - || prof_write(umax2s((uintptr_t)bt->vec[i], 16, buf), + || prof_write(u2s((uintptr_t)bt->vec[i], 16, buf), propagate_err)) return (true); } @@ -916,7 +726,7 @@ prof_dump_maps(bool propagate_err) memcpy(&mpath[i], s, slen); i += slen; - s = umax2s(getpid(), 10, buf); + s = u2s(getpid(), 10, buf); slen = strlen(s); memcpy(&mpath[i], s, slen); i += slen; @@ -958,8 +768,14 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) { prof_cnt_t cnt_all; size_t tabind; - prof_bt_t *bt; - prof_ctx_t *ctx; + union { + prof_bt_t *p; + void *v; + } bt; + union { + prof_ctx_t *p; + void *v; + } ctx; char buf[UMAX2S_BUFSIZE]; size_t leak_nctx; @@ -979,20 +795,18 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) /* Merge per thread profile stats, and sum them in cnt_all. */ memset(&cnt_all, 0, sizeof(prof_cnt_t)); leak_nctx = 0; - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, (void **)&ctx) - == false;) { - prof_ctx_merge(ctx, &cnt_all, &leak_nctx); - } + for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) + prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx); /* Dump profile header. 
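	 * With sampling enabled (opt.lg_prof_sample != 0), the header ends in
	 * "] @ heap_v2/<2^lg_prof_sample>", recording the average sample
	 * interval alongside the cumulative counts.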
*/ if (prof_write("heap profile: ", propagate_err) - || prof_write(umax2s(cnt_all.curobjs, 10, buf), propagate_err) + || prof_write(u2s(cnt_all.curobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(cnt_all.curbytes, 10, buf), propagate_err) + || prof_write(u2s(cnt_all.curbytes, 10, buf), propagate_err) || prof_write(" [", propagate_err) - || prof_write(umax2s(cnt_all.accumobjs, 10, buf), propagate_err) + || prof_write(u2s(cnt_all.accumobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(cnt_all.accumbytes, 10, buf), propagate_err)) + || prof_write(u2s(cnt_all.accumbytes, 10, buf), propagate_err)) goto ERROR; if (opt_lg_prof_sample == 0) { @@ -1000,16 +814,16 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) goto ERROR; } else { if (prof_write("] @ heap_v2/", propagate_err) - || prof_write(umax2s((uint64_t)1U << opt_lg_prof_sample, 10, + || prof_write(u2s((uint64_t)1U << opt_lg_prof_sample, 10, buf), propagate_err) || prof_write("\n", propagate_err)) goto ERROR; } /* Dump per ctx profile stats. */ - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, (void **)&bt, (void **)&ctx) + for (tabind = 0; ckh_iter(&bt2ctx, &tabind, &bt.v, &ctx.v) == false;) { - if (prof_dump_ctx(ctx, bt, propagate_err)) + if (prof_dump_ctx(ctx.p, bt.p, propagate_err)) goto ERROR; } @@ -1024,12 +838,12 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) if (leakcheck && cnt_all.curbytes != 0) { malloc_write(": Leak summary: "); - malloc_write(umax2s(cnt_all.curbytes, 10, buf)); + malloc_write(u2s(cnt_all.curbytes, 10, buf)); malloc_write((cnt_all.curbytes != 1) ? " bytes, " : " byte, "); - malloc_write(umax2s(cnt_all.curobjs, 10, buf)); + malloc_write(u2s(cnt_all.curobjs, 10, buf)); malloc_write((cnt_all.curobjs != 1) ? " objects, " : " object, "); - malloc_write(umax2s(leak_nctx, 10, buf)); + malloc_write(u2s(leak_nctx, 10, buf)); malloc_write((leak_nctx != 1) ? " contexts\n" : " context\n"); malloc_write(": Run pprof on \""); malloc_write(filename); @@ -1059,31 +873,11 @@ prof_dump_filename(char *filename, char v, int64_t vseq) * Construct a filename of the form: * * ...v.heap\0 - * or - * jeprof...v.heap\0 */ i = 0; - /* - * Use JEMALLOC_PROF_PREFIX if it's set, and if it is short enough to - * avoid overflowing DUMP_FILENAME_BUFSIZE. The result may exceed - * PATH_MAX, but creat(2) will catch that problem. 
- */ - if ((s = getenv("JEMALLOC_PROF_PREFIX")) != NULL - && strlen(s) + (DUMP_FILENAME_BUFSIZE - PATH_MAX) <= PATH_MAX) { - slen = strlen(s); - memcpy(&filename[i], s, slen); - i += slen; - - s = "."; - } else - s = "jeprof."; - slen = strlen(s); - memcpy(&filename[i], s, slen); - i += slen; - - s = umax2s(getpid(), 10, buf); + s = opt_prof_prefix; slen = strlen(s); memcpy(&filename[i], s, slen); i += slen; @@ -1093,7 +887,17 @@ prof_dump_filename(char *filename, char v, int64_t vseq) memcpy(&filename[i], s, slen); i += slen; - s = umax2s(prof_dump_seq, 10, buf); + s = u2s(getpid(), 10, buf); + slen = strlen(s); + memcpy(&filename[i], s, slen); + i += slen; + + s = "."; + slen = strlen(s); + memcpy(&filename[i], s, slen); + i += slen; + + s = u2s(prof_dump_seq, 10, buf); prof_dump_seq++; slen = strlen(s); memcpy(&filename[i], s, slen); @@ -1108,7 +912,7 @@ prof_dump_filename(char *filename, char v, int64_t vseq) i++; if (vseq != 0xffffffffffffffffLLU) { - s = umax2s(vseq, 10, buf); + s = u2s(vseq, 10, buf); slen = strlen(s); memcpy(&filename[i], s, slen); i += slen; @@ -1130,10 +934,12 @@ prof_fdump(void) if (prof_booted == false) return; - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'f', 0xffffffffffffffffLLU); - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(filename, opt_prof_leak, false); + if (opt_prof_prefix[0] != '\0') { + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'f', 0xffffffffffffffffLLU); + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(filename, opt_prof_leak, false); + } } void @@ -1151,11 +957,13 @@ prof_idump(void) } malloc_mutex_unlock(&enq_mtx); - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'i', prof_dump_iseq); - prof_dump_iseq++; - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(filename, false, false); + if (opt_prof_prefix[0] != '\0') { + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'i', prof_dump_iseq); + prof_dump_iseq++; + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(filename, false, false); + } } bool @@ -1168,6 +976,8 @@ prof_mdump(const char *filename) if (filename == NULL) { /* No filename specified, so automatically generate one. 
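		 * The generated name has the form
		 * <prefix>.<pid>.<seq>.m<mseq>.heap (e.g. jeprof.1234.0.m0.heap
		 * when the prefix is "jeprof"); if opt.prof_prefix is empty, no
		 * dump is performed and true (failure) is returned.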
*/ + if (opt_prof_prefix[0] == '\0') + return (true); malloc_mutex_lock(&prof_dump_seq_mtx); prof_dump_filename(filename_buf, 'm', prof_dump_mseq); prof_dump_mseq++; @@ -1178,7 +988,7 @@ prof_mdump(const char *filename) } void -prof_udump(void) +prof_gdump(void) { char filename[DUMP_FILENAME_BUFSIZE]; @@ -1186,17 +996,19 @@ prof_udump(void) return; malloc_mutex_lock(&enq_mtx); if (enq) { - enq_udump = true; + enq_gdump = true; malloc_mutex_unlock(&enq_mtx); return; } malloc_mutex_unlock(&enq_mtx); - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'u', prof_dump_useq); - prof_dump_useq++; - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(filename, false, false); + if (opt_prof_prefix[0] != '\0') { + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'u', prof_dump_useq); + prof_dump_useq++; + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(filename, false, false); + } } static void @@ -1239,57 +1051,82 @@ prof_bt_keycomp(const void *k1, const void *k2) return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } -static void -bt2cnt_thread_cleanup(void *arg) +prof_tdata_t * +prof_tdata_init(void) { - ckh_t *bt2cnt; + prof_tdata_t *prof_tdata; - bt2cnt = bt2cnt_tls; - if (bt2cnt != NULL) { - ql_head(prof_thr_cnt_t) cnts_ql; - size_t tabind; + /* Initialize an empty cache for this thread. */ + prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + if (prof_tdata == NULL) + return (NULL); + + if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp)) { + idalloc(prof_tdata); + return (NULL); + } + ql_new(&prof_tdata->lru_ql); + + prof_tdata->vec = imalloc(sizeof(void *) * prof_bt_max); + if (prof_tdata->vec == NULL) { + + ckh_delete(&prof_tdata->bt2cnt); + idalloc(prof_tdata); + return (NULL); + } + + prof_tdata->prn_state = 0; + prof_tdata->threshold = 0; + prof_tdata->accum = 0; + + PROF_TCACHE_SET(prof_tdata); + + return (prof_tdata); +} + +static void +prof_tdata_cleanup(void *arg) +{ + prof_tdata_t *prof_tdata; + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata != NULL) { prof_thr_cnt_t *cnt; - /* Iteratively merge cnt's into the global stats. */ - ql_new(&cnts_ql); - tabind = 0; - while (ckh_iter(bt2cnt, &tabind, NULL, (void **)&cnt) == - false) { - prof_ctx_t *ctx = cnt->ctx; - /* Merge stats and detach from ctx. */ - malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs += cnt->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - ql_remove(&ctx->cnts_ql, cnt, link); - malloc_mutex_unlock(&ctx->lock); - - /* - * Stash cnt for deletion after finishing with - * ckh_iter(). - */ - ql_tail_insert(&cnts_ql, cnt, link); - } + /* + * Delete the hash table. All of its contents can still be + * iterated over via the LRU. + */ + ckh_delete(&prof_tdata->bt2cnt); /* - * Delete the hash table now that cnts_ql has a list of all - * cnt's. + * Iteratively merge cnt's into the global stats and delete + * them. */ - ckh_delete(bt2cnt); - idalloc(bt2cnt); - bt2cnt_tls = NULL; - - /* Delete cnt's. 
*/ - while ((cnt = ql_last(&cnts_ql, link)) != NULL) { - ql_remove(&cnts_ql, cnt, link); + while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) { + prof_ctx_merge(cnt->ctx, cnt); + ql_remove(&prof_tdata->lru_ql, cnt, lru_link); idalloc(cnt); } + + idalloc(prof_tdata->vec); + + idalloc(prof_tdata); + PROF_TCACHE_SET(NULL); } } void prof_boot0(void) +{ + + memcpy(opt_prof_prefix, PROF_PREFIX_DEFAULT, + sizeof(PROF_PREFIX_DEFAULT)); +} + +void +prof_boot1(void) { /* @@ -1303,7 +1140,7 @@ prof_boot0(void) * automatically dumped. */ opt_prof = true; - opt_prof_udump = false; + opt_prof_gdump = false; prof_interval = 0; } else if (opt_prof) { if (opt_lg_prof_interval >= 0) { @@ -1317,7 +1154,7 @@ prof_boot0(void) } bool -prof_boot1(void) +prof_boot2(void) { if (opt_prof) { @@ -1326,7 +1163,7 @@ prof_boot1(void) return (true); if (malloc_mutex_init(&bt2ctx_mtx)) return (true); - if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup) + if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup) != 0) { malloc_write( ": Error in pthread_key_create()\n"); @@ -1341,7 +1178,7 @@ prof_boot1(void) return (true); enq = false; enq_idump = false; - enq_udump = false; + enq_gdump = false; if (atexit(prof_fdump) != 0) { malloc_write(": Error in atexit()\n"); diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c new file mode 100644 index 00000000..7753743c --- /dev/null +++ b/jemalloc/src/rtree.c @@ -0,0 +1,43 @@ +#define RTREE_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +rtree_t * +rtree_new(unsigned bits) +{ + rtree_t *ret; + unsigned bits_per_level, height, i; + + bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1; + height = bits / bits_per_level; + if (height * bits_per_level != bits) + height++; + assert(height * bits_per_level >= bits); + + ret = (rtree_t*)base_alloc(offsetof(rtree_t, level2bits) + + (sizeof(unsigned) * height)); + if (ret == NULL) + return (NULL); + memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) * + height)); + + malloc_mutex_init(&ret->mutex); + ret->height = height; + if (bits_per_level * height > bits) + ret->level2bits[0] = bits % bits_per_level; + else + ret->level2bits[0] = bits_per_level; + for (i = 1; i < height; i++) + ret->level2bits[i] = bits_per_level; + + ret->root = (void**)base_alloc(sizeof(void *) << ret->level2bits[0]); + if (ret->root == NULL) { + /* + * We leak the rtree here, since there's no generic base + * deallocation. + */ + return (NULL); + } + memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]); + + return (ret); +} diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 9dc75293..3dfe0d23 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -57,12 +57,12 @@ static void stats_arena_print(void (*write_cb)(void *, const char *), /* * We don't want to depend on vsnprintf() for production builds, since that can - * cause unnecessary bloat for static binaries. umax2s() provides minimal - * integer printing functionality, so that malloc_printf() use can be limited to + * cause unnecessary bloat for static binaries. u2s() provides minimal integer + * printing functionality, so that malloc_printf() use can be limited to * JEMALLOC_STATS code. 
*/ char * -umax2s(uintmax_t x, unsigned base, char *s) +u2s(uint64_t x, unsigned base, char *s) { unsigned i; @@ -72,8 +72,8 @@ umax2s(uintmax_t x, unsigned base, char *s) case 10: do { i--; - s[i] = "0123456789"[x % 10]; - x /= 10; + s[i] = "0123456789"[x % (uint64_t)10]; + x /= (uint64_t)10; } while (x > 0); break; case 16: @@ -86,8 +86,9 @@ umax2s(uintmax_t x, unsigned base, char *s) default: do { i--; - s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x % base]; - x /= base; + s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x % + (uint64_t)base]; + x /= (uint64_t)base; } while (x > 0); } @@ -374,6 +375,7 @@ void stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts) { + int err; uint64_t epoch; size_t u64sz; char s[UMAX2S_BUFSIZE]; @@ -383,10 +385,27 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bool bins = true; bool large = true; - /* Refresh stats, in case mallctl() was called by the application. */ + /* + * Refresh stats, in case mallctl() was called by the application. + * + * Check for OOM here, since refreshing the ctl cache can trigger + * allocation. In practice, none of the subsequent mallctl()-related + * calls in this function will cause OOM if this one succeeds. + * */ epoch = 1; u64sz = sizeof(uint64_t); - xmallctl("epoch", &epoch, &u64sz, &epoch, sizeof(uint64_t)); + err = JEMALLOC_P(mallctl)("epoch", &epoch, &u64sz, &epoch, + sizeof(uint64_t)); + if (err != 0) { + if (err == EAGAIN) { + malloc_write(": Memory allocation failure in " + "mallctl(\"epoch\", ...)\n"); + return; + } + malloc_write(": Failure in mallctl(\"epoch\", " + "...)\n"); + abort(); + } if (write_cb == NULL) { /* @@ -430,10 +449,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bool bv; unsigned uv; ssize_t ssv; - size_t sv, bsz, ssz; + size_t sv, bsz, ssz, sssz, cpsz; bsz = sizeof(bool); ssz = sizeof(size_t); + sssz = sizeof(ssize_t); + cpsz = sizeof(const char *); CTL_GET("version", &cpv, const char *); write_cb(cbopaque, "Version: "); @@ -444,113 +465,140 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, bv ? "enabled" : "disabled"); write_cb(cbopaque, "\n"); - write_cb(cbopaque, "Boolean JEMALLOC_OPTIONS: "); - if ((err = JEMALLOC_P(mallctl)("opt.abort", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "A" : "a"); - if ((err = JEMALLOC_P(mallctl)("prof.active", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "E" : "e"); - if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "F" : "f"); - if ((err = JEMALLOC_P(mallctl)("opt.tcache", &bv, &bsz, NULL, - 0)) == 0) - write_cb(cbopaque, bv ? "H" : "h"); - if ((err = JEMALLOC_P(mallctl)("opt.junk", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "J" : "j"); - if ((err = JEMALLOC_P(mallctl)("opt.prof_leak", &bv, &bsz, NULL, - 0)) == 0) - write_cb(cbopaque, bv ? "L" : "l"); - if ((err = JEMALLOC_P(mallctl)("opt.overcommit", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "O" : "o"); - if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "P" : "p"); - if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "U" : "u"); - if ((err = JEMALLOC_P(mallctl)("opt.sysv", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "V" : "v"); - if ((err = JEMALLOC_P(mallctl)("opt.xmalloc", &bv, &bsz, NULL, - 0)) == 0) - write_cb(cbopaque, bv ? 
"X" : "x"); - if ((err = JEMALLOC_P(mallctl)("opt.zero", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "Z" : "z"); - write_cb(cbopaque, "\n"); +#define OPT_WRITE_BOOL(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &bv, &bsz, \ + NULL, 0)) == 0) { \ + write_cb(cbopaque, " opt."#n": "); \ + write_cb(cbopaque, bv ? "true" : "false"); \ + write_cb(cbopaque, "\n"); \ + } +#define OPT_WRITE_SIZE_T(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &sv, &ssz, \ + NULL, 0)) == 0) { \ + write_cb(cbopaque, " opt."#n": "); \ + write_cb(cbopaque, u2s(sv, 10, s)); \ + write_cb(cbopaque, "\n"); \ + } +#define OPT_WRITE_SSIZE_T(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &ssv, &sssz, \ + NULL, 0)) == 0) { \ + if (ssv >= 0) { \ + write_cb(cbopaque, " opt."#n": "); \ + write_cb(cbopaque, u2s(ssv, 10, s)); \ + } else { \ + write_cb(cbopaque, " opt."#n": -"); \ + write_cb(cbopaque, u2s(-ssv, 10, s)); \ + } \ + write_cb(cbopaque, "\n"); \ + } +#define OPT_WRITE_CHAR_P(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &cpv, &cpsz, \ + NULL, 0)) == 0) { \ + write_cb(cbopaque, " opt."#n": \""); \ + write_cb(cbopaque, cpv); \ + write_cb(cbopaque, "\"\n"); \ + } + + write_cb(cbopaque, "Run-time option settings:\n"); + OPT_WRITE_BOOL(abort) + OPT_WRITE_SIZE_T(lg_qspace_max) + OPT_WRITE_SIZE_T(lg_cspace_max) + OPT_WRITE_SIZE_T(lg_chunk) + OPT_WRITE_SIZE_T(narenas) + OPT_WRITE_SSIZE_T(lg_dirty_mult) + OPT_WRITE_BOOL(stats_print) + OPT_WRITE_BOOL(junk) + OPT_WRITE_BOOL(zero) + OPT_WRITE_BOOL(sysv) + OPT_WRITE_BOOL(xmalloc) + OPT_WRITE_BOOL(tcache) + OPT_WRITE_SSIZE_T(lg_tcache_gc_sweep) + OPT_WRITE_SSIZE_T(lg_tcache_max) + OPT_WRITE_BOOL(prof) + OPT_WRITE_CHAR_P(prof_prefix) + OPT_WRITE_SIZE_T(lg_prof_bt_max) + OPT_WRITE_BOOL(prof_active) + OPT_WRITE_SSIZE_T(lg_prof_sample) + OPT_WRITE_BOOL(prof_accum) + OPT_WRITE_SSIZE_T(lg_prof_tcmax) + OPT_WRITE_SSIZE_T(lg_prof_interval) + OPT_WRITE_BOOL(prof_gdump) + OPT_WRITE_BOOL(prof_leak) + OPT_WRITE_BOOL(overcommit) + +#undef OPT_WRITE_BOOL +#undef OPT_WRITE_SIZE_T +#undef OPT_WRITE_SSIZE_T +#undef OPT_WRITE_CHAR_P write_cb(cbopaque, "CPUs: "); - write_cb(cbopaque, umax2s(ncpus, 10, s)); + write_cb(cbopaque, u2s(ncpus, 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.narenas", &uv, unsigned); write_cb(cbopaque, "Max arenas: "); - write_cb(cbopaque, umax2s(uv, 10, s)); + write_cb(cbopaque, u2s(uv, 10, s)); write_cb(cbopaque, "\n"); write_cb(cbopaque, "Pointer size: "); - write_cb(cbopaque, umax2s(sizeof(void *), 10, s)); + write_cb(cbopaque, u2s(sizeof(void *), 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.quantum", &sv, size_t); write_cb(cbopaque, "Quantum size: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.cacheline", &sv, size_t); write_cb(cbopaque, "Cacheline size (assumed): "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.subpage", &sv, size_t); write_cb(cbopaque, "Subpage spacing: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); if ((err = JEMALLOC_P(mallctl)("arenas.tspace_min", &sv, &ssz, NULL, 0)) == 0) { write_cb(cbopaque, "Tiny 2^n-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.tspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); } CTL_GET("arenas.qspace_min", &sv, 
size_t); write_cb(cbopaque, "Quantum-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.qspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); CTL_GET("arenas.cspace_min", &sv, size_t); write_cb(cbopaque, "Cacheline-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.cspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); CTL_GET("arenas.sspace_min", &sv, size_t); write_cb(cbopaque, "Subpage-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.sspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t); if (ssv >= 0) { write_cb(cbopaque, "Min active:dirty page ratio per arena: "); - write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, u2s((1U << ssv), 10, s)); write_cb(cbopaque, ":1\n"); } else { write_cb(cbopaque, @@ -560,7 +608,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, &ssz, NULL, 0)) == 0) { write_cb(cbopaque, "Maximum thread-cached size class: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); } if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv, @@ -570,39 +618,51 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("opt.tcache", &tcache_enabled, bool); write_cb(cbopaque, "Thread cache GC sweep interval: "); write_cb(cbopaque, tcache_enabled && ssv >= 0 ? 
- umax2s(tcache_gc_sweep, 10, s) : "N/A"); + u2s(tcache_gc_sweep, 10, s) : "N/A"); write_cb(cbopaque, "\n"); } if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0)) == 0 && bv) { CTL_GET("opt.lg_prof_bt_max", &sv, size_t); write_cb(cbopaque, "Maximum profile backtrace depth: "); - write_cb(cbopaque, umax2s((1U << sv), 10, s)); + write_cb(cbopaque, u2s((1U << sv), 10, s)); write_cb(cbopaque, "\n"); + CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t); + write_cb(cbopaque, + "Maximum per thread backtrace cache: "); + if (ssv >= 0) { + write_cb(cbopaque, u2s((1U << ssv), 10, s)); + write_cb(cbopaque, " (2^"); + write_cb(cbopaque, u2s(ssv, 10, s)); + write_cb(cbopaque, ")\n"); + } else + write_cb(cbopaque, "N/A\n"); + CTL_GET("opt.lg_prof_sample", &sv, size_t); write_cb(cbopaque, "Average profile sample interval: "); - write_cb(cbopaque, umax2s((1U << sv), 10, s)); + write_cb(cbopaque, u2s((((uint64_t)1U) << sv), 10, s)); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ")\n"); CTL_GET("opt.lg_prof_interval", &ssv, ssize_t); write_cb(cbopaque, "Average profile dump interval: "); if (ssv >= 0) { - write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, u2s((((uint64_t)1U) << ssv), + 10, s)); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(ssv, 10, s)); + write_cb(cbopaque, u2s(ssv, 10, s)); write_cb(cbopaque, ")\n"); } else write_cb(cbopaque, "N/A\n"); } CTL_GET("arenas.chunksize", &sv, size_t); write_cb(cbopaque, "Chunk size: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); CTL_GET("opt.lg_chunk", &sv, size_t); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ")\n"); } diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index ace24cea..cbbe7a11 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -5,17 +5,19 @@ /* Data. */ bool opt_tcache = true; -ssize_t opt_lg_tcache_maxclass = LG_TCACHE_MAXCLASS_DEFAULT; +ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; /* Map of thread-specific caches. */ +#ifndef NO_TLS __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); +#endif /* * Same contents as tcache, but initialized such that the TSD destructor is * called when a thread exits, so that the cache can be cleaned up. */ -static pthread_key_t tcache_tsd; +pthread_key_t tcache_tsd; size_t nhbins; size_t tcache_maxclass; @@ -93,10 +95,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { - size_t pageind = (((uintptr_t)ptr - - (uintptr_t)chunk) >> PAGE_SHIFT); + size_t pageind = ((uintptr_t)ptr - + (uintptr_t)chunk) >> PAGE_SHIFT; arena_chunk_map_t *mapelm = - &chunk->map[pageind]; + &chunk->map[pageind-map_bias]; arena_dalloc_bin(arena, chunk, ptr, mapelm); } else { /* @@ -202,12 +204,14 @@ tcache_create(arena_t *arena) size_t size; unsigned i; - size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nhbins - 1)); + size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. * - * That this works relies on the same logic as in ipalloc(). 
+ * That this works relies on the same logic as in ipalloc(), but we + * cannot directly call ipalloc() here due to tcache bootstrapping + * issues. */ size = (size + CACHELINE_MASK) & (-CACHELINE); @@ -239,8 +243,7 @@ tcache_create(arena_t *arena) for (; i < nhbins; i++) tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; - tcache_tls = tcache; - pthread_setspecific(tcache_tsd, tcache); + TCACHE_SET(tcache); return (tcache); } @@ -308,9 +311,9 @@ tcache_destroy(tcache_t *tcache) if (arena_salloc(tcache) <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; - size_t pageind = (((uintptr_t)tcache - (uintptr_t)chunk) >> - PAGE_SHIFT); - arena_chunk_map_t *mapelm = &chunk->map[pageind]; + size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> + PAGE_SHIFT; + arena_chunk_map_t *mapelm = &chunk->map[pageind-map_bias]; arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); @@ -328,11 +331,24 @@ tcache_thread_cleanup(void *arg) { tcache_t *tcache = (tcache_t *)arg; - assert(tcache == tcache_tls); - if (tcache != NULL) { + if (tcache == (void *)(uintptr_t)1) { + /* + * The previous time this destructor was called, we set the key + * to 1 so that other destructors wouldn't cause re-creation of + * the tcache. This time, do nothing, so that the destructor + * will not be called again. + */ + } else if (tcache == (void *)(uintptr_t)2) { + /* + * Another destructor called an allocator function after this + * destructor was called. Reset tcache to 1 in order to + * receive another callback. + */ + TCACHE_SET((uintptr_t)1); + } else if (tcache != NULL) { assert(tcache != (void *)(uintptr_t)1); tcache_destroy(tcache); - tcache_tls = (void *)(uintptr_t)1; + TCACHE_SET((uintptr_t)1); } } @@ -368,16 +384,16 @@ tcache_boot(void) if (opt_tcache) { /* - * If necessary, clamp opt_lg_tcache_maxclass, now that + * If necessary, clamp opt_lg_tcache_max, now that * small_maxclass and arena_maxclass are known. */ - if (opt_lg_tcache_maxclass < 0 || (1U << - opt_lg_tcache_maxclass) < small_maxclass) + if (opt_lg_tcache_max < 0 || (1U << + opt_lg_tcache_max) < small_maxclass) tcache_maxclass = small_maxclass; - else if ((1U << opt_lg_tcache_maxclass) > arena_maxclass) + else if ((1U << opt_lg_tcache_max) > arena_maxclass) tcache_maxclass = arena_maxclass; else - tcache_maxclass = (1U << opt_lg_tcache_maxclass); + tcache_maxclass = (1U << opt_lg_tcache_max); nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); diff --git a/jemalloc/src/zone.c b/jemalloc/src/zone.c new file mode 100644 index 00000000..2c1b2318 --- /dev/null +++ b/jemalloc/src/zone.c @@ -0,0 +1,354 @@ +#include "jemalloc/internal/jemalloc_internal.h" +#ifndef JEMALLOC_ZONE +# error "This source file is for zones on Darwin (OS X)." +#endif + +/******************************************************************************/ +/* Data. */ + +static malloc_zone_t zone, szone; +static struct malloc_introspection_t zone_introspect, ozone_introspect; + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. 
*/ + +static size_t zone_size(malloc_zone_t *zone, void *ptr); +static void *zone_malloc(malloc_zone_t *zone, size_t size); +static void *zone_calloc(malloc_zone_t *zone, size_t num, size_t size); +static void *zone_valloc(malloc_zone_t *zone, size_t size); +static void zone_free(malloc_zone_t *zone, void *ptr); +static void *zone_realloc(malloc_zone_t *zone, void *ptr, size_t size); +#if (JEMALLOC_ZONE_VERSION >= 6) +static void *zone_memalign(malloc_zone_t *zone, size_t alignment, + size_t size); +static void zone_free_definite_size(malloc_zone_t *zone, void *ptr, + size_t size); +#endif +static void *zone_destroy(malloc_zone_t *zone); +static size_t zone_good_size(malloc_zone_t *zone, size_t size); +static void zone_force_lock(malloc_zone_t *zone); +static void zone_force_unlock(malloc_zone_t *zone); +static size_t ozone_size(malloc_zone_t *zone, void *ptr); +static void ozone_free(malloc_zone_t *zone, void *ptr); +static void *ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size); +static unsigned ozone_batch_malloc(malloc_zone_t *zone, size_t size, + void **results, unsigned num_requested); +static void ozone_batch_free(malloc_zone_t *zone, void **to_be_freed, + unsigned num); +#if (JEMALLOC_ZONE_VERSION >= 6) +static void ozone_free_definite_size(malloc_zone_t *zone, void *ptr, + size_t size); +#endif +static void ozone_force_lock(malloc_zone_t *zone); +static void ozone_force_unlock(malloc_zone_t *zone); + +/******************************************************************************/ +/* + * Functions. + */ + +static size_t +zone_size(malloc_zone_t *zone, void *ptr) +{ + + /* + * There appear to be places within Darwin (such as setenv(3)) that + * cause calls to this function with pointers that *no* zone owns. If + * we knew that all pointers were owned by *some* zone, we could split + * our zone into two parts, and use one as the default allocator and + * the other as the default deallocator/reallocator. Since that will + * not work in practice, we must check all pointers to assure that they + * reside within a mapped chunk before determining size. + */ + return (ivsalloc(ptr)); +} + +static void * +zone_malloc(malloc_zone_t *zone, size_t size) +{ + + return (JEMALLOC_P(malloc)(size)); +} + +static void * +zone_calloc(malloc_zone_t *zone, size_t num, size_t size) +{ + + return (JEMALLOC_P(calloc)(num, size)); +} + +static void * +zone_valloc(malloc_zone_t *zone, size_t size) +{ + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size); + + return (ret); +} + +static void +zone_free(malloc_zone_t *zone, void *ptr) +{ + + JEMALLOC_P(free)(ptr); +} + +static void * +zone_realloc(malloc_zone_t *zone, void *ptr, size_t size) +{ + + return (JEMALLOC_P(realloc)(ptr, size)); +} + +#if (JEMALLOC_ZONE_VERSION >= 6) +static void * +zone_memalign(malloc_zone_t *zone, size_t alignment, size_t size) +{ + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + JEMALLOC_P(posix_memalign)(&ret, alignment, size); + + return (ret); +} + +static void +zone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size) +{ + + assert(ivsalloc(ptr) == size); + JEMALLOC_P(free)(ptr); +} +#endif + +static void * +zone_destroy(malloc_zone_t *zone) +{ + + /* This function should never be called. 
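+	 * The jemalloc zone is registered once at startup and never torn
+	 * down, so there is no legitimate caller; the assertion below catches
+	 * any unexpected call.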
*/ + assert(false); + return (NULL); +} + +static size_t +zone_good_size(malloc_zone_t *zone, size_t size) +{ + size_t ret; + void *p; + + /* + * Actually create an object of the appropriate size, then find out + * how large it could have been without moving up to the next size + * class. + */ + p = JEMALLOC_P(malloc)(size); + if (p != NULL) { + ret = isalloc(p); + JEMALLOC_P(free)(p); + } else + ret = size; + + return (ret); +} + +static void +zone_force_lock(malloc_zone_t *zone) +{ + + if (isthreaded) + jemalloc_prefork(); +} + +static void +zone_force_unlock(malloc_zone_t *zone) +{ + + if (isthreaded) + jemalloc_postfork(); +} + +malloc_zone_t * +create_zone(void) +{ + + zone.size = (void *)zone_size; + zone.malloc = (void *)zone_malloc; + zone.calloc = (void *)zone_calloc; + zone.valloc = (void *)zone_valloc; + zone.free = (void *)zone_free; + zone.realloc = (void *)zone_realloc; + zone.destroy = (void *)zone_destroy; + zone.zone_name = "jemalloc_zone"; + zone.batch_malloc = NULL; + zone.batch_free = NULL; + zone.introspect = &zone_introspect; + zone.version = JEMALLOC_ZONE_VERSION; +#if (JEMALLOC_ZONE_VERSION >= 6) + zone.memalign = zone_memalign; + zone.free_definite_size = zone_free_definite_size; +#endif + + zone_introspect.enumerator = NULL; + zone_introspect.good_size = (void *)zone_good_size; + zone_introspect.check = NULL; + zone_introspect.print = NULL; + zone_introspect.log = NULL; + zone_introspect.force_lock = (void *)zone_force_lock; + zone_introspect.force_unlock = (void *)zone_force_unlock; + zone_introspect.statistics = NULL; +#if (JEMALLOC_ZONE_VERSION >= 6) + zone_introspect.zone_locked = NULL; +#endif + + return (&zone); +} + +static size_t +ozone_size(malloc_zone_t *zone, void *ptr) +{ + size_t ret; + + ret = ivsalloc(ptr); + if (ret == 0) + ret = szone.size(zone, ptr); + + return (ret); +} + +static void +ozone_free(malloc_zone_t *zone, void *ptr) +{ + + if (ivsalloc(ptr) != 0) + JEMALLOC_P(free)(ptr); + else { + size_t size = szone.size(zone, ptr); + if (size != 0) + (szone.free)(zone, ptr); + } +} + +static void * +ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size) +{ + size_t oldsize; + + if (ptr == NULL) + return (JEMALLOC_P(malloc)(size)); + + oldsize = ivsalloc(ptr); + if (oldsize != 0) + return (JEMALLOC_P(realloc)(ptr, size)); + else { + oldsize = szone.size(zone, ptr); + if (oldsize == 0) + return (JEMALLOC_P(malloc)(size)); + else { + void *ret = JEMALLOC_P(malloc)(size); + if (ret != NULL) { + memcpy(ret, ptr, (oldsize < size) ? oldsize : + size); + (szone.free)(zone, ptr); + } + return (ret); + } + } +} + +static unsigned +ozone_batch_malloc(malloc_zone_t *zone, size_t size, void **results, + unsigned num_requested) +{ + + /* Don't bother implementing this interface, since it isn't required. */ + return (0); +} + +static void +ozone_batch_free(malloc_zone_t *zone, void **to_be_freed, unsigned num) +{ + unsigned i; + + for (i = 0; i < num; i++) + ozone_free(zone, to_be_freed[i]); +} + +#if (JEMALLOC_ZONE_VERSION >= 6) +static void +ozone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size) +{ + + if (ivsalloc(ptr) != 0) { + assert(ivsalloc(ptr) == size); + JEMALLOC_P(free)(ptr); + } else { + assert(size == szone.size(zone, ptr)); + szone.free_definite_size(zone, ptr, size); + } +} +#endif + +static void +ozone_force_lock(malloc_zone_t *zone) +{ + + /* jemalloc locking is taken care of by the normal jemalloc zone. 
*/ + szone.introspect->force_lock(zone); +} + +static void +ozone_force_unlock(malloc_zone_t *zone) +{ + + /* jemalloc locking is taken care of by the normal jemalloc zone. */ + szone.introspect->force_unlock(zone); +} + +/* + * Overlay the default scalable zone (szone) such that existing allocations are + * drained, and further allocations come from jemalloc. This is necessary + * because Core Foundation directly accesses and uses the szone before the + * jemalloc library is even loaded. + */ +void +szone2ozone(malloc_zone_t *zone) +{ + + /* + * Stash a copy of the original szone so that we can call its + * functions as needed. Note that the internally, the szone stores its + * bookkeeping data structures immediately following the malloc_zone_t + * header, so when calling szone functions, we need to pass a pointer + * to the original zone structure. + */ + memcpy(&szone, zone, sizeof(malloc_zone_t)); + + zone->size = (void *)ozone_size; + zone->malloc = (void *)zone_malloc; + zone->calloc = (void *)zone_calloc; + zone->valloc = (void *)zone_valloc; + zone->free = (void *)ozone_free; + zone->realloc = (void *)ozone_realloc; + zone->destroy = (void *)zone_destroy; + zone->zone_name = "jemalloc_ozone"; + zone->batch_malloc = ozone_batch_malloc; + zone->batch_free = ozone_batch_free; + zone->introspect = &ozone_introspect; + zone->version = JEMALLOC_ZONE_VERSION; +#if (JEMALLOC_ZONE_VERSION >= 6) + zone->memalign = zone_memalign; + zone->free_definite_size = ozone_free_definite_size; +#endif + + ozone_introspect.enumerator = NULL; + ozone_introspect.good_size = (void *)zone_good_size; + ozone_introspect.check = NULL; + ozone_introspect.print = NULL; + ozone_introspect.log = NULL; + ozone_introspect.force_lock = (void *)ozone_force_lock; + ozone_introspect.force_unlock = (void *)ozone_force_unlock; + ozone_introspect.statistics = NULL; +#if (JEMALLOC_ZONE_VERSION >= 6) + ozone_introspect.zone_locked = NULL; +#endif +} diff --git a/jemalloc/test/allocated.c b/jemalloc/test/allocated.c new file mode 100644 index 00000000..64a17351 --- /dev/null +++ b/jemalloc/test/allocated.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc_test.h" + +void * +thread_start(void *arg) +{ + int err; + void *p; + uint64_t a0, a1, d0, d1; + size_t sz, usize; + + sz = sizeof(a0); + if ((err = JEMALLOC_P(mallctl)("thread.allocated", &a0, &sz, NULL, + 0))) { + if (err == ENOENT) { +#ifdef JEMALLOC_STATS + assert(false); +#endif + goto RETURN; + } + fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, + strerror(err)); + exit(1); + } + + sz = sizeof(d0); + if ((err = JEMALLOC_P(mallctl)("thread.deallocated", &d0, &sz, NULL, + 0))) { + if (err == ENOENT) { +#ifdef JEMALLOC_STATS + assert(false); +#endif + goto RETURN; + } + fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, + strerror(err)); + exit(1); + } + + p = JEMALLOC_P(malloc)(1); + if (p == NULL) { + fprintf(stderr, "%s(): Error in malloc()\n", __func__); + exit(1); + } + + sz = sizeof(a1); + JEMALLOC_P(mallctl)("thread.allocated", &a1, &sz, NULL, 0); + + usize = JEMALLOC_P(malloc_usable_size)(p); + assert(a0 + usize <= a1); + + JEMALLOC_P(free)(p); + + sz = sizeof(d1); + JEMALLOC_P(mallctl)("thread.deallocated", &d1, &sz, NULL, 0); + + assert(d0 + usize <= d1); + +RETURN: + return (NULL); +} + +int +main(void) +{ + int ret = 0; + pthread_t thread; + + fprintf(stderr, "Test begin\n"); + + thread_start(NULL); + + if (pthread_create(&thread, NULL, 
thread_start, NULL) + != 0) { + fprintf(stderr, "%s(): Error in pthread_create()\n", __func__); + ret = 1; + goto RETURN; + } + pthread_join(thread, (void *)&ret); + + thread_start(NULL); + + if (pthread_create(&thread, NULL, thread_start, NULL) + != 0) { + fprintf(stderr, "%s(): Error in pthread_create()\n", __func__); + ret = 1; + goto RETURN; + } + pthread_join(thread, (void *)&ret); + + thread_start(NULL); + +RETURN: + fprintf(stderr, "Test end\n"); + return (ret); +} diff --git a/jemalloc/test/allocated.exp b/jemalloc/test/allocated.exp new file mode 100644 index 00000000..369a88dd --- /dev/null +++ b/jemalloc/test/allocated.exp @@ -0,0 +1,2 @@ +Test begin +Test end diff --git a/jemalloc/test/allocm.c b/jemalloc/test/allocm.c new file mode 100644 index 00000000..59d0002e --- /dev/null +++ b/jemalloc/test/allocm.c @@ -0,0 +1,133 @@ +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc_test.h" + +#define CHUNK 0x400000 +/* #define MAXALIGN ((size_t)0x80000000000LLU) */ +#define MAXALIGN ((size_t)0x2000000LLU) +#define NITER 4 + +int +main(void) +{ + int r; + void *p; + size_t sz, alignment, total, tsz; + unsigned i; + void *ps[NITER]; + + fprintf(stderr, "Test begin\n"); + + sz = 0; + r = JEMALLOC_P(allocm)(&p, &sz, 42, 0); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + if (sz < 42) + fprintf(stderr, "Real size smaller than expected\n"); + if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected dallocm() error\n"); + + r = JEMALLOC_P(allocm)(&p, NULL, 42, 0); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected dallocm() error\n"); + + r = JEMALLOC_P(allocm)(&p, NULL, 42, ALLOCM_ZERO); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected dallocm() error\n"); + +#if LG_SIZEOF_PTR == 3 + alignment = 0x8000000000000000LLU; + sz = 0x8000000000000000LLU; +#else + alignment = 0x80000000LU; + sz = 0x80000000LU; +#endif + r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment)); + if (r == ALLOCM_SUCCESS) { + fprintf(stderr, + "Expected error for allocm(&p, %zu, 0x%x)\n", + sz, ALLOCM_ALIGN(alignment)); + } + +#if LG_SIZEOF_PTR == 3 + alignment = 0x4000000000000000LLU; + sz = 0x8400000000000001LLU; +#else + alignment = 0x40000000LU; + sz = 0x84000001LU; +#endif + r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment)); + if (r == ALLOCM_SUCCESS) { + fprintf(stderr, + "Expected error for allocm(&p, %zu, 0x%x)\n", + sz, ALLOCM_ALIGN(alignment)); + } + + alignment = 0x10LLU; +#if LG_SIZEOF_PTR == 3 + sz = 0xfffffffffffffff0LLU; +#else + sz = 0xfffffff0LU; +#endif + r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment)); + if (r == ALLOCM_SUCCESS) { + fprintf(stderr, + "Expected error for allocm(&p, %zu, 0x%x)\n", + sz, ALLOCM_ALIGN(alignment)); + } + + for (i = 0; i < NITER; i++) + ps[i] = NULL; + + for (alignment = 8; + alignment <= MAXALIGN; + alignment <<= 1) { + total = 0; + fprintf(stderr, "Alignment: %zu\n", alignment); + for (sz = 1; + sz < 3 * alignment && sz < (1U << 31); + sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) { + for (i = 0; i < NITER; i++) { + r = JEMALLOC_P(allocm)(&ps[i], NULL, sz, + ALLOCM_ALIGN(alignment) | ALLOCM_ZERO); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, + "Error for size %zu (0x%zx): %d\n", + sz, sz, r); + 
exit(1); + } + if ((uintptr_t)p & (alignment-1)) { + fprintf(stderr, + "%p inadequately aligned for" + " alignment: %zu\n", p, alignment); + } + JEMALLOC_P(sallocm)(ps[i], &tsz, 0); + total += tsz; + if (total >= (MAXALIGN << 1)) + break; + } + for (i = 0; i < NITER; i++) { + if (ps[i] != NULL) { + JEMALLOC_P(dallocm)(ps[i], 0); + ps[i] = NULL; + } + } + } + } + + fprintf(stderr, "Test end\n"); + return (0); +} diff --git a/jemalloc/test/allocm.exp b/jemalloc/test/allocm.exp new file mode 100644 index 00000000..b5061c72 --- /dev/null +++ b/jemalloc/test/allocm.exp @@ -0,0 +1,25 @@ +Test begin +Alignment: 8 +Alignment: 16 +Alignment: 32 +Alignment: 64 +Alignment: 128 +Alignment: 256 +Alignment: 512 +Alignment: 1024 +Alignment: 2048 +Alignment: 4096 +Alignment: 8192 +Alignment: 16384 +Alignment: 32768 +Alignment: 65536 +Alignment: 131072 +Alignment: 262144 +Alignment: 524288 +Alignment: 1048576 +Alignment: 2097152 +Alignment: 4194304 +Alignment: 8388608 +Alignment: 16777216 +Alignment: 33554432 +Test end diff --git a/jemalloc/test/jemalloc_test.h.in b/jemalloc/test/jemalloc_test.h.in new file mode 100644 index 00000000..0c48895e --- /dev/null +++ b/jemalloc/test/jemalloc_test.h.in @@ -0,0 +1,6 @@ +/* + * This header should be included by tests, rather than directly including + * jemalloc/jemalloc.h, because --with-install-suffix may cause the header to + * have a different name. + */ +#include "jemalloc/jemalloc@install_suffix@.h" diff --git a/jemalloc/test/posix_memalign.c b/jemalloc/test/posix_memalign.c new file mode 100644 index 00000000..3e306c01 --- /dev/null +++ b/jemalloc/test/posix_memalign.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc_test.h" + +#define CHUNK 0x400000 +/* #define MAXALIGN ((size_t)0x80000000000LLU) */ +#define MAXALIGN ((size_t)0x2000000LLU) +#define NITER 4 + +int +main(void) +{ + size_t alignment, size, total; + unsigned i; + int err; + void *p, *ps[NITER]; + + fprintf(stderr, "Test begin\n"); + + /* Test error conditions. 
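+	 * posix_memalign() requires the alignment to be a power of two and a
+	 * multiple of sizeof(void *), so the sub-pointer-size and
+	 * non-power-of-two alignments below must fail with EINVAL, as must
+	 * the alignment/size combinations that are too large to satisfy.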
+	for (alignment = 0; alignment < sizeof(void *); alignment++) {
+		err = JEMALLOC_P(posix_memalign)(&p, alignment, 1);
+		if (err != EINVAL) {
+			fprintf(stderr,
+			    "Expected error for invalid alignment %zu\n",
+			    alignment);
+		}
+	}
+
+	for (alignment = sizeof(size_t); alignment < MAXALIGN;
+	    alignment <<= 1) {
+		err = JEMALLOC_P(posix_memalign)(&p, alignment + 1, 1);
+		if (err == 0) {
+			fprintf(stderr,
+			    "Expected error for invalid alignment %zu\n",
+			    alignment + 1);
+		}
+	}
+
+#if LG_SIZEOF_PTR == 3
+	alignment = 0x8000000000000000LLU;
+	size = 0x8000000000000000LLU;
+#else
+	alignment = 0x80000000LU;
+	size = 0x80000000LU;
+#endif
+	err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
+	if (err == 0) {
+		fprintf(stderr,
+		    "Expected error for posix_memalign(&p, %zu, %zu)\n",
+		    alignment, size);
+	}
+
+#if LG_SIZEOF_PTR == 3
+	alignment = 0x4000000000000000LLU;
+	size = 0x8400000000000001LLU;
+#else
+	alignment = 0x40000000LU;
+	size = 0x84000001LU;
+#endif
+	err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
+	if (err == 0) {
+		fprintf(stderr,
+		    "Expected error for posix_memalign(&p, %zu, %zu)\n",
+		    alignment, size);
+	}
+
+	alignment = 0x10LLU;
+#if LG_SIZEOF_PTR == 3
+	size = 0xfffffffffffffff0LLU;
+#else
+	size = 0xfffffff0LU;
+#endif
+	err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
+	if (err == 0) {
+		fprintf(stderr,
+		    "Expected error for posix_memalign(&p, %zu, %zu)\n",
+		    alignment, size);
+	}
+
+	for (i = 0; i < NITER; i++)
+		ps[i] = NULL;
+
+	for (alignment = 8;
+	    alignment <= MAXALIGN;
+	    alignment <<= 1) {
+		total = 0;
+		fprintf(stderr, "Alignment: %zu\n", alignment);
+		for (size = 1;
+		    size < 3 * alignment && size < (1U << 31);
+		    size += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
+			for (i = 0; i < NITER; i++) {
+				err = JEMALLOC_P(posix_memalign)(&ps[i],
+				    alignment, size);
+				if (err) {
+					fprintf(stderr,
+					    "Error for size %zu (0x%zx): %s\n",
+					    size, size, strerror(err));
+					exit(1);
+				}
+				total += JEMALLOC_P(malloc_usable_size)(ps[i]);
+				if (total >= (MAXALIGN << 1))
+					break;
+			}
+			for (i = 0; i < NITER; i++) {
+				if (ps[i] != NULL) {
+					JEMALLOC_P(free)(ps[i]);
+					ps[i] = NULL;
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "Test end\n");
+	return (0);
+}
diff --git a/jemalloc/test/posix_memalign.exp b/jemalloc/test/posix_memalign.exp
new file mode 100644
index 00000000..b5061c72
--- /dev/null
+++ b/jemalloc/test/posix_memalign.exp
@@ -0,0 +1,25 @@
+Test begin
+Alignment: 8
+Alignment: 16
+Alignment: 32
+Alignment: 64
+Alignment: 128
+Alignment: 256
+Alignment: 512
+Alignment: 1024
+Alignment: 2048
+Alignment: 4096
+Alignment: 8192
+Alignment: 16384
+Alignment: 32768
+Alignment: 65536
+Alignment: 131072
+Alignment: 262144
+Alignment: 524288
+Alignment: 1048576
+Alignment: 2097152
+Alignment: 4194304
+Alignment: 8388608
+Alignment: 16777216
+Alignment: 33554432
+Test end
diff --git a/jemalloc/test/rallocm.c b/jemalloc/test/rallocm.c
new file mode 100644
index 00000000..a8cadebc
--- /dev/null
+++ b/jemalloc/test/rallocm.c
@@ -0,0 +1,117 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+int
+main(void)
+{
+	void *p, *q;
+	size_t sz, tsz;
+	int r;
+
+	fprintf(stderr, "Test begin\n");
+
+	r = JEMALLOC_P(allocm)(&p, &sz, 42, 0);
+	if (r != ALLOCM_SUCCESS) {
+		fprintf(stderr, "Unexpected allocm() error\n");
+		abort();
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz != sz) {
+		fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 5, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz != sz) {
+		fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_ERR_NOT_MOVED)
+		fprintf(stderr, "Unexpected rallocm() result\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz != sz) {
+		fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+
+	q = p;
+	r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, 0);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q == p)
+		fprintf(stderr, "Expected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	p = q;
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, 0);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q == p)
+		fprintf(stderr, "Expected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	p = q;
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, 0);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	p = q;
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	sz = tsz;
+
+	r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, ALLOCM_NO_MOVE);
+	if (r != ALLOCM_SUCCESS)
+		fprintf(stderr, "Unexpected rallocm() error\n");
+	if (q != p)
+		fprintf(stderr, "Unexpected object move\n");
+	if (tsz == sz) {
+		fprintf(stderr, "Expected size change: %zu --> %zu\n",
+		    sz, tsz);
+	}
+	sz = tsz;
+
+	JEMALLOC_P(dallocm)(p, 0);
+
+	fprintf(stderr, "Test end\n");
+	return (0);
+}
diff --git a/jemalloc/test/rallocm.exp b/jemalloc/test/rallocm.exp
new file mode 100644
index 00000000..369a88dd
--- /dev/null
+++ b/jemalloc/test/rallocm.exp
@@ -0,0 +1,2 @@
+Test begin
+Test end
diff --git a/jemalloc/test/thread_arena.c b/jemalloc/test/thread_arena.c
new file mode 100644
index 00000000..bd884e1f
--- /dev/null
+++ b/jemalloc/test/thread_arena.c
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#define JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+void *
+thread_start(void *arg)
+{
+	unsigned main_arena_ind = *(unsigned *)arg;
+	void *p;
+	unsigned arena_ind;
+	size_t size;
+	int err;
+
+	p = JEMALLOC_P(malloc)(1);
+	if (p == NULL) {
+		fprintf(stderr, "%s(): Error in malloc()\n", __func__);
+		return (void *)1;
+	}
+
+	size = sizeof(arena_ind);
+	if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size,
+	    &main_arena_ind, sizeof(main_arena_ind)))) {
+		fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
+		    strerror(err));
+		return (void *)1;
+	}
+
+	return (NULL);
+}
+
+int
+main(void)
+{
+	int ret = 0;
+	void *p;
+	unsigned arena_ind;
+	size_t size;
+	int err;
+	pthread_t thread;
+
+	fprintf(stderr, "Test begin\n");
+
+	p = JEMALLOC_P(malloc)(1);
+	if (p == NULL) {
+		fprintf(stderr, "%s(): Error in malloc()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+
+	size = sizeof(arena_ind);
+	if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, NULL,
+	    0))) {
+		fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
+		    strerror(err));
+		ret = 1;
+		goto RETURN;
+	}
+
+	if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind)
+	    != 0) {
+		fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+	pthread_join(thread, (void *)&ret);
+
+	if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind)
+	    != 0) {
+		fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
+		ret = 1;
+		goto RETURN;
+	}
+	pthread_join(thread, (void *)&ret);
+
+RETURN:
+	fprintf(stderr, "Test end\n");
+	return (ret);
+}
diff --git a/jemalloc/test/thread_arena.exp b/jemalloc/test/thread_arena.exp
new file mode 100644
index 00000000..369a88dd
--- /dev/null
+++ b/jemalloc/test/thread_arena.exp
@@ -0,0 +1,2 @@
+Test begin
+Test end
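
For readers skimming the patch, the following is a minimal sketch of the experimental {,r,s,d}allocm() calling convention that the tests above exercise. It is not part of the patch: the flag names, return codes, and the JEMALLOC_MANGLE/JEMALLOC_P() and jemalloc_test.h conventions are taken directly from the test sources above, error handling is abbreviated, and the specific sizes and alignment are arbitrary illustrative values.

	#include <stdio.h>
	#include <stdlib.h>

	#define JEMALLOC_MANGLE
	#include "jemalloc_test.h"

	int
	main(void)
	{
		void *p;
		size_t sz, tsz;

		/* Allocate at least 4096 bytes, 64-byte aligned and zero-filled. */
		if (JEMALLOC_P(allocm)(&p, &sz, 4096,
		    ALLOCM_ALIGN(64) | ALLOCM_ZERO) != ALLOCM_SUCCESS)
			abort();

		/* Try to double the allocation without letting the object move. */
		if (JEMALLOC_P(rallocm)(&p, &tsz, sz * 2, 0, ALLOCM_NO_MOVE)
		    == ALLOCM_ERR_NOT_MOVED)
			fprintf(stderr, "could not grow in place past %zu bytes\n", sz);

		/* Query the current usable size, then deallocate. */
		JEMALLOC_P(sallocm)(p, &tsz, 0);
		fprintf(stderr, "usable size: %zu\n", tsz);
		JEMALLOC_P(dallocm)(p, 0);

		return (0);
	}

The stress loops in allocm.c and posix_memalign.c apply the same calls across alignments from 8 bytes up to MAXALIGN (32 MiB), which is what produces the "Alignment: N" lines recorded in the corresponding .exp files.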