Merge branch 'dev'

Jason Evans 2010-10-24 16:51:13 -07:00
commit a39d5b6ef2
50 changed files with 6136 additions and 2555 deletions

.gitignore

@ -6,10 +6,11 @@
/jemalloc/doc/jemalloc.3
/jemalloc/lib/
/jemalloc/Makefile
/jemalloc/src/internal/jemalloc_internal\.h
/jemalloc/src/internal/mtrgraph_defs\.h
/jemalloc/src/internal/mtrplay_defs\.h
/jemalloc/src/jemalloc\.h
/jemalloc/src/jemalloc_defs\.h
/jemalloc/include/jemalloc/internal/jemalloc_internal\.h
/jemalloc/include/jemalloc/jemalloc\.h
/jemalloc/include/jemalloc/jemalloc_defs\.h
/jemalloc/test/jemalloc_test\.h
/jemalloc/src/*.[od]
/jemalloc/test/*.[od]
/jemalloc/test/*.out
/jemalloc/VERSION

jemalloc/COPYING

@ -3,6 +3,7 @@ subject to the following licenses:
--------------------------------------------------------------------------------
Copyright (C) 2002-2010 Jason Evans <jasone@canonware.com>.
All rights reserved.
Copyright (C) 2007-2010 Mozilla Foundation. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

jemalloc/ChangeLog (new file)

@ -0,0 +1,130 @@
Following are change highlights associated with official releases. Important
bug fixes are all mentioned, but internal enhancements are omitted here for
brevity (even though they are more fun to write about). Much more detail can be
found in the git revision history:
http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git
git://canonware.com/jemalloc.git
* 2.0.0
This version focuses on the experimental *allocm() API, and on improved
run-time configuration/introspection. Nonetheless, numerous performance
improvements are also included.
New features:
- Implement the experimental {,r,s,d}allocm() API, which provides a superset
of the functionality available via malloc(), calloc(), posix_memalign(),
realloc(), malloc_usable_size(), and free(). These functions can be used
to allocate/reallocate aligned zeroed memory, ask for optional extra
memory during reallocation, prevent object movement during reallocation,
etc. (see the example following this list).
- Replace JEMALLOC_OPTIONS/JEMALLOC_PROF_PREFIX with MALLOC_CONF, which is
more human-readable, and more flexible. For example:
JEMALLOC_OPTIONS=AJP
is now:
MALLOC_CONF=abort:true,fill:true,stats_print:true
- Port to Apple OS X. Sponsored by Mozilla.
- Make it possible for the application to control thread-->arena mappings
via the "thread.arena" mallctl.
- Add compile-time support for all TLS-related functionality via pthreads
TSD. This is mainly of interest for OS X, which does not support TLS, but
has a TSD implementation with similar performance.
- Override memalign() and valloc() if they are provided by the system.
- Add the "arenas.purge" mallctl, which can be used to synchronously purge
all dirty unused pages.
- Make cumulative heap profiling data optional, so that it is possible to
limit the amount of memory consumed by heap profiling data structures.
- Add per thread allocation counters that can be accessed via the
"thread.allocated" and "thread.deallocated" mallctls.
Incompatible changes:
- Remove JEMALLOC_OPTIONS and malloc_options (see MALLOC_CONF above).
- Increase default backtrace depth from 4 to 128 for heap profiling.
- Disable interval-based profile dumps by default.
Bug fixes:
- Remove bad assertions in fork handler functions. These assertions could
cause aborts for some combinations of configure settings.
- Fix strerror_r() usage to deal with non-standard semantics in GNU libc.
- Fix leak context reporting. This bug tended to cause the number of contexts
to be underreported (though the reported object and byte counts were
correct).
- Fix a realloc() bug for large in-place growing reallocation. This bug could
cause memory corruption, but it was hard to trigger.
- Fix an allocation bug for small allocations that could be triggered if
multiple threads raced to create a new run of backing pages.
- Enhance the heap profiler to trigger samples based on usable size, rather
than request size.
- Fix a heap profiling bug due to sometimes losing track of requested object
size for sampled objects.
* 1.0.3
Bug fixes:
- Fix the libunwind-based implementation of stack backtracing (used for heap
profiling). This bug could cause zero-length backtraces to be reported.
- Add a missing mutex unlock in library initialization code. If multiple
threads raced to initialize malloc, some of them could end up permanently
blocked.
* 1.0.2
Bug fixes:
- Fix junk filling of large objects, which could cause memory corruption.
- Add MAP_NORESERVE support for chunk mapping, because otherwise virtual
memory limits could cause swap file configuration to fail. Contributed by
Jordan DeLong.
* 1.0.1
Bug fixes:
- Fix compilation when --enable-fill is specified.
- Fix threads-related profiling bugs that affected accuracy and caused memory
to be leaked during thread exit.
- Fix dirty page purging race conditions that could cause crashes.
- Fix crash in tcache flushing code during thread destruction.
* 1.0.0
This release focuses on speed and run-time introspection. Numerous
algorithmic improvements make this release substantially faster than its
predecessors.
New features:
- Implement autoconf-based configuration system.
- Add mallctl*(), for the purposes of introspection and run-time
configuration.
- Make it possible for the application to manually flush a thread's cache, via
the "tcache.flush" mallctl.
- Base maximum dirty page count on proportion of active memory.
- Compute various additional run-time statistics, including per size class
statistics for large objects.
- Expose malloc_stats_print(), which can be called repeatedly by the
application.
- Simplify the malloc_message() signature to only take one string argument,
and incorporate an opaque data pointer argument for use by the application
in combination with malloc_stats_print() (see the example following this list).
- Add support for allocation backed by one or more swap files, and allow the
application to disable over-commit if swap files are in use.
- Implement allocation profiling and leak checking.
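A minimal sketch of the revised interface, assuming an unprefixed build; the
callback and its opaque pointer are application-supplied:

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    /* Receives statistics output one string at a time. */
    static void
    stats_write_cb(void *cbopaque, const char *s)
    {
        fputs(s, (FILE *)cbopaque);
    }

    int
    main(void)
    {
        /* Route the report to stderr; NULL opts requests the full report. */
        malloc_stats_print(stats_write_cb, stderr, NULL);
        return (0);
    }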
Removed features:
- Remove the dynamic arena rebalancing code, since thread-specific caching
reduces its utility.
Bug fixes:
- Modify chunk allocation to work when address space layout randomization
(ASLR) is in use.
- Fix thread cleanup bugs related to TLS destruction.
- Handle 0-size allocation requests in posix_memalign().
- Fix a chunk leak. The leaked chunks were never touched, so this impacted
virtual memory usage, but not physical memory usage.
* linux_20080828a, linux_20080827a
These snapshot releases are the simple result of incorporating Linux-specific
support into the FreeBSD malloc sources.
--------------------------------------------------------------------------------
vim:filetype=text:textwidth=80

jemalloc/INSTALL

@ -27,26 +27,42 @@ any of the following arguments (not a definitive list) to 'configure':
it is linked to. This works only on ELF-based systems.
--with-jemalloc-prefix=<prefix>
Prefix all public APIs with <prefix>, so that, for example, malloc()
becomes <prefix>malloc(). This makes it possible to use jemalloc at the
same time as the system allocator.
Prefix all public APIs with <prefix>. For example, if <prefix> is
"prefix_", the API changes like the following occur:
malloc() --> prefix_malloc()
malloc_conf --> prefix_malloc_conf
/etc/malloc.conf --> /etc/prefix_malloc.conf
MALLOC_CONF --> PREFIX_MALLOC_CONF
This makes it possible to use jemalloc at the same time as the system
allocator, or even to use multiple copies of jemalloc simultaneously.
By default, the prefix is "", except on OS X, where it is "je_". On OS X,
jemalloc overlays the default malloc zone, but makes no attempt to actually
replace the "malloc", "calloc", etc. symbols.
--with-install-suffix=<suffix>
Append <suffix> to the base name of all installed files, such that multiple
versions of jemalloc can coexist in the same installation directory. For
example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0.
--enable-cc-silence
Enable code that silences irrelevant compiler warnings. This is helpful when
trying to tell serious warnings from those due to compiler limitations, but
it potentially incurs a performance penalty.
--enable-debug
Enable assertions and validation code. This incurs a substantial
performance hit, but is very useful during application development.
--enable-stats
Enable statistics gathering functionality. Use the 'P' option to print
detailed allocation statistics at exit.
Enable statistics gathering functionality. See the "opt.stats_print"
option documentation for usage details.
--enable-prof
Enable heap profiling and leak detection functionality. Use the 'B', 'E',
'F', 'I', 'L', and 'U' options to control these features.
Enable heap profiling and leak detection functionality. See the "opt.prof"
option documentation for usage details.
--disable-prof-libgcc
Disable the use of libgcc's backtracing functionality. Ordinarily, libgcc's
@ -72,8 +88,8 @@ any of the following arguments (not a definitive list) to 'configure':
--disable-tcache
Disable thread-specific caches for small objects. Objects are cached and
released in bulk, thus reducing the total number of mutex operations. Use
the 'H', 'G', and 'M' options to control thread-specific caching.
released in bulk, thus reducing the total number of mutex operations. See
the "opt.tcache" option for suage details.
--enable-swap
Enable mmap()ed swap file support. When this feature is built in, it is
@ -85,18 +101,18 @@ any of the following arguments (not a definitive list) to 'configure':
mmap(2).
--enable-fill
Enable support for junk/zero filling of memory. Use the 'J' option to
control junk filling, or the 'Z' option to control zero filling.
Enable support for junk/zero filling of memory. See the "opt.junk"/
"opt.zero" option documentation for usage details.
--enable-xmalloc
Enable support for optional immediate termination due to out-of-memory
errors, as is commonly implemented by "xmalloc" wrapper functions for malloc.
Use the 'X' option to control termination behavior.
See the "opt.xmalloc" option documentation for usage details.
--enable-sysv
Enable support for System V semantics, wherein malloc(0) returns NULL
rather than a minimal allocation. Use the 'V' option to control System V
compatibility.
rather than a minimal allocation. See the "opt.sysv" option documentation
for usage details.
--enable-dynamic-page-shift
Under most conditions, the system page size never changes (usually 4KiB or
@ -213,3 +229,14 @@ directory, issue configuration and build commands:
cd obj
../configure --enable-autogen
make
=== Documentation ==============================================================
The manual page that the configure script generates can be manually formatted
prior to installation via any of the following commands:
nroff -man -man-ext -t doc/jemalloc.3
groff -man -man-ext -t -Tps doc/jemalloc.3 | ps2pdf - doc/jemalloc.3.pdf
(cd doc; groff -man -man-ext -t -Thtml jemalloc.3 > jemalloc.3.html)

jemalloc/Makefile.in

@ -28,10 +28,17 @@ LIBS := @LIBS@
RPATH_EXTRA := @RPATH_EXTRA@
ifeq (macho, @abi@)
SO := dylib
WL_SONAME := dylib_install_name
else
SO := so
WL_SONAME := soname
endif
REV := 1
ifeq (macho, @abi@)
TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH=@objroot@lib
else
TEST_LIBRARY_PATH :=
endif
REV := 0
# Lists of files.
BINS := @srcroot@bin/pprof
@ -42,11 +49,18 @@ CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \
@srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \
@srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \
@srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \
@srcroot@src/prof.c @srcroot@src/stats.c @srcroot@src/tcache.c
DSOS := @objroot@lib/libjemalloc@install_suffix@.so.$(REV) \
@objroot@lib/libjemalloc@install_suffix@.so \
@srcroot@src/prof.c @srcroot@src/rtree.c \
@srcroot@src/stats.c @srcroot@src/tcache.c
ifeq (macho, @abi@)
CSRCS += @srcroot@src/zone.c
endif
DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \
@objroot@lib/libjemalloc@install_suffix@.$(SO) \
@objroot@lib/libjemalloc@install_suffix@_pic.a
MAN3 := @objroot@doc/jemalloc@install_suffix@.3
CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \
@srcroot@test/posix_memalign.c \
@srcroot@test/rallocm.c @srcroot@test/thread_arena.c
.PHONY: all dist install check clean distclean relclean
@ -63,18 +77,32 @@ all: $(DSOS)
$(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $<
@$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)"
%.so : %.so.$(REV)
%.$(SO) : %.$(SO).$(REV)
@mkdir -p $(@D)
ln -sf $(<F) $@
@objroot@lib/libjemalloc@install_suffix@.so.$(REV) : $(CSRCS:@srcroot@%.c=@objroot@%.o)
@objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) : $(CSRCS:@srcroot@%.c=@objroot@%.o)
@mkdir -p $(@D)
$(CC) -shared -Wl,-soname,$(@F) -o $@ $+ $(LDFLAGS) $(LIBS)
$(CC) -shared -Wl,-$(WL_SONAME),$(@F) $(RPATH_EXTRA:%=@RPATH@%) -o $@ $+ $(LDFLAGS) $(LIBS)
@objroot@lib/libjemalloc@install_suffix@_pic.a : $(CSRCS:@srcroot@%.c=@objroot@%.o)
@mkdir -p $(@D)
ar crus $@ $+
@objroot@test/%.o: @srcroot@test/%.c
@mkdir -p $(@D)
$(CC) $(CFLAGS) -c $(CPPFLAGS) -I@objroot@test -o $@ $<
@$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) -I@objroot@test $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)"
@objroot@test/%: @objroot@test/%.o \
@objroot@lib/libjemalloc@install_suffix@.$(SO)
@mkdir -p $(@D)
ifneq (@RPATH@, )
$(CC) -o $@ $< @RPATH@@objroot@lib -L@objroot@lib -ljemalloc@install_suffix@
else
$(CC) -o $@ $< -L@objroot@lib -ljemalloc@install_suffix@
endif
install_bin:
install -d $(BINDIR)
@for b in $(BINS); do \
@ -91,8 +119,8 @@ done
install_lib: $(DSOS)
install -d $(LIBDIR)
install -m 755 @objroot@lib/libjemalloc@install_suffix@.so.$(REV) $(LIBDIR)
ln -sf libjemalloc@install_suffix@.so.$(REV) $(LIBDIR)/libjemalloc@install_suffix@.so
install -m 755 @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR)
ln -sf libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR)/libjemalloc@install_suffix@.$(SO)
install -m 755 @objroot@lib/libjemalloc@install_suffix@_pic.a $(LIBDIR)
install_man:
@ -104,19 +132,50 @@ done
install: install_bin install_include install_lib install_man
check:
tests: $(CTESTS:@srcroot@%.c=@objroot@%)
check: tests
@mkdir -p @objroot@test
@$(SHELL) -c 'total=0; \
failures=0; \
echo "========================================="; \
for t in $(CTESTS:@srcroot@%.c=@objroot@%); do \
total=`expr $$total + 1`; \
/bin/echo -n "$${t} ... "; \
$(TEST_LIBRARY_PATH) $${t} @abs_srcroot@ @abs_objroot@ \
> @objroot@$${t}.out 2>&1; \
if test -e "@srcroot@$${t}.exp"; then \
diff -u @srcroot@$${t}.exp \
@objroot@$${t}.out >/dev/null 2>&1; \
fail=$$?; \
if test "$${fail}" -eq "1" ; then \
failures=`expr $${failures} + 1`; \
echo "*** FAIL ***"; \
else \
echo "pass"; \
fi; \
else \
echo "*** FAIL *** (.exp file is missing)"; \
failures=`expr $${failures} + 1`; \
fi; \
done; \
echo "========================================="; \
echo "Failures: $${failures}/$${total}"'
clean:
rm -f $(CSRCS:@srcroot@%.c=@objroot@%.o)
rm -f $(CSRCS:@srcroot@%.c=@objroot@%.d)
rm -f $(CTESTS:@srcroot@%.c=@objroot@%)
rm -f $(CTESTS:@srcroot@%.c=@objroot@%.o)
rm -f $(CTESTS:@srcroot@%.c=@objroot@%.d)
rm -f $(CTESTS:@srcroot@%.c=@objroot@%.out)
rm -f $(DSOS)
distclean: clean
rm -rf @objroot@autom4te.cache
rm -f @objroot@config.log
rm -f @objroot@config.status
rm -f @objroot@cfghdrs.stamp
rm -f @objroot@cfgoutputs.stamp
rm -f @objroot@config.stamp
rm -f @cfghdrs_out@
rm -f @cfgoutputs_out@

jemalloc/README

@ -1,9 +1,9 @@
jemalloc is a general-purpose scalable concurrent malloc(3) implementation.
This distribution is a stand-alone "portable" implementation that currently
targets only Linux. jemalloc is included as the default allocator in the
FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox web
browser on Microsoft Windows-related platforms. Depending on your needs, one
of the other divergent versions may suit your needs better than this
targets Linux and Apple OS X. jemalloc is included as the default allocator in
the FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox
web browser on Microsoft Windows-related platforms. Depending on your needs,
one of the other divergent versions may suit you better than this
distribution.
The COPYING file contains copyright and licensing information.
@ -11,4 +11,6 @@ The COPYING file contains copyright and licensing information.
The INSTALL file contains information on how to configure, build, and install
jemalloc.
The ChangeLog file contains a brief summary of changes for each release.
URL: http://www.canonware.com/jemalloc/

File diff suppressed because it is too large.

jemalloc/configure.ac

@ -150,7 +150,7 @@ JE_COMPILABLE([__attribute__ syntax],
[attribute])
if test "x${attribute}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ])
if test "x$GCC" = "xyes" ; then
if test "x$GCC" = "xyes" -a "${abi}" = "xelf"; then
JE_CFLAGS_APPEND([-fvisibility=internal])
fi
fi
@ -166,17 +166,20 @@ case "${host}" in
*-*-darwin*)
CFLAGS="$CFLAGS -fno-common -no-cpp-precomp"
abi="macho"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE])
RPATH=""
;;
*-*-freebsd*)
CFLAGS="$CFLAGS"
abi="elf"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE])
RPATH="-Wl,-rpath,"
;;
*-*-linux*)
CFLAGS="$CFLAGS"
CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE"
abi="elf"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED])
RPATH="-Wl,-rpath,"
;;
*-*-netbsd*)
@ -191,6 +194,7 @@ case "${host}" in
[CFLAGS="$CFLAGS"; abi="elf"],
[abi="aout"])
AC_MSG_RESULT([$abi])
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE])
RPATH="-Wl,-rpath,"
;;
*-*-solaris2*)
@ -245,12 +249,20 @@ dnl Do not prefix public APIs by default.
AC_ARG_WITH([jemalloc_prefix],
[AS_HELP_STRING([--with-jemalloc-prefix=<prefix>], [Prefix to prepend to all public APIs])],
[JEMALLOC_PREFIX="$with_jemalloc_prefix"],
[JEMALLOC_PREFIX=]
[if test "x$abi" != "xmacho" ; then
JEMALLOC_PREFIX=""
else
JEMALLOC_PREFIX="je_"
fi]
)
if test "x$JEMALLOC_PREFIX" != "x" ; then
AC_DEFINE([JEMALLOC_PREFIX], [ ])
JEMALLOC_CPREFIX=`echo ${JEMALLOC_PREFIX} | tr "a-z" "A-Z"`
AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"])
AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"])
jemalloc_prefix="$JEMALLOC_PREFIX"
jemalloc_cprefix="$JEMALLOC_CPREFIX"
AC_SUBST([jemalloc_prefix])
AC_SUBST([jemalloc_cprefix])
AC_DEFINE_UNQUOTED([JEMALLOC_P(string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix)], [${JEMALLOC_PREFIX}##string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix])
fi
@ -266,14 +278,17 @@ AC_SUBST([install_suffix])
cfgoutputs_in="${srcroot}Makefile.in ${srcroot}doc/jemalloc.3.in"
cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc.h.in"
cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in"
cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/jemalloc_test.h.in"
cfgoutputs_out="Makefile doc/jemalloc${install_suffix}.3"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc${install_suffix}.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h"
cfgoutputs_out="${cfgoutputs_out} test/jemalloc_test.h"
cfgoutputs_tup="Makefile doc/jemalloc${install_suffix}.3:doc/jemalloc.3.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc${install_suffix}.h:include/jemalloc/jemalloc.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h"
cfgoutputs_tup="${cfgoutputs_tup} test/jemalloc_test.h:test/jemalloc_test.h.in"
cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in"
@ -281,6 +296,23 @@ cfghdrs_out="include/jemalloc/jemalloc_defs${install_suffix}.h"
cfghdrs_tup="include/jemalloc/jemalloc_defs${install_suffix}.h:include/jemalloc/jemalloc_defs.h.in"
dnl Do not silence irrelevant compiler warnings by default, since enabling this
dnl option incurs a performance penalty.
AC_ARG_ENABLE([cc-silence],
[AS_HELP_STRING([--enable-cc-silence],
[Silence irrelevant compiler warnings])],
[if test "x$enable_cc_silence" = "xno" ; then
enable_cc_silence="0"
else
enable_cc_silence="1"
fi
],
[enable_cc_silence="0"]
)
if test "x$enable_cc_silence" = "x1" ; then
AC_DEFINE([JEMALLOC_CC_SILENCE])
fi
dnl Do not compile with debugging by default.
AC_ARG_ENABLE([debug],
[AS_HELP_STRING([--enable-debug], [Build debugging code])],
@ -294,8 +326,18 @@ fi
)
if test "x$enable_debug" = "x1" ; then
AC_DEFINE([JEMALLOC_DEBUG], [ ])
AC_DEFINE([JEMALLOC_IVSALLOC], [ ])
fi
AC_SUBST([enable_debug])
if test "x$enable_debug" = "x0" ; then
roff_debug=".\\\" "
roff_no_debug=""
else
roff_debug=""
roff_no_debug=".\\\" "
fi
AC_SUBST([roff_debug])
AC_SUBST([roff_no_debug])
dnl Only optimize if not debugging.
if test "x$enable_debug" = "x0" -a "x$no_CFLAGS" = "xyes" ; then
@ -379,7 +421,47 @@ else
fi,
LUNWIND="-lunwind"
)
dnl Finish prof-related definitions below, once TLS configuration is done.
if test "x$enable_prof" = "x1" ; then
LIBS="$LIBS -lm"
AC_DEFINE([JEMALLOC_PROF], [ ])
if test "x$enable_prof_libunwind" = "x1" ; then
AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
if test "x$LUNWIND" = "x-lunwind" ; then
AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"],
[enable_prof_libunwind="0"])
else
LIBS="$LIBS $LUNWIND"
fi
if test "x${enable_prof_libunwind}" = "x1" ; then
AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ])
fi
fi
fi
AC_SUBST([enable_prof])
if test "x$enable_prof" = "x0" ; then
roff_prof=".\\\" "
roff_no_prof=""
else
roff_prof=""
roff_no_prof=".\\\" "
fi
AC_SUBST([roff_prof])
AC_SUBST([roff_no_prof])
dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics
dnl for backtracing.
if test "x$enable_prof" = "x1" -a "x$enable_prof_libgcc" = "x1" ; then
if test "x$enable_prof_libunwind" = "x0" -a "x$GCC" = "xyes" ; then
enable_prof_libgcc="1"
AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"])
AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"])
if test "x${enable_prof_libgcc}" = "x1" ; then
AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ])
fi
else
enable_prof_libgcc="0"
fi
fi
dnl Enable tiny allocations by default.
AC_ARG_ENABLE([tiny],
@ -417,7 +499,19 @@ fi
],
[enable_tcache="1"]
)
dnl Finish tcache-related definitions below, once TLS configuration is done.
if test "x$enable_tcache" = "x1" ; then
AC_DEFINE([JEMALLOC_TCACHE], [ ])
fi
AC_SUBST([enable_tcache])
if test "x$enable_tcache" = "x0" ; then
roff_tcache=".\\\" "
roff_no_tcache=""
else
roff_tcache=""
roff_no_tcache=".\\\" "
fi
AC_SUBST([roff_tcache])
AC_SUBST([roff_no_tcache])
dnl Do not enable mmap()ped swap files by default.
AC_ARG_ENABLE([swap],
@ -579,7 +673,7 @@ dnl
dnl Set VERSION if source directory has an embedded git repository.
if test -d "${srcroot}../.git" ; then
git describe --long > ${srcroot}VERSION
git describe --long --abbrev=40 > ${srcroot}VERSION
fi
jemalloc_version=`cat ${srcroot}VERSION`
jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
@ -647,69 +741,63 @@ AC_RUN_IFELSE([AC_LANG_PROGRAM(
AC_MSG_RESULT([no])
enable_tls="0")
fi
AC_SUBST([enable_tls])
if test "x${enable_tls}" = "x0" ; then
AC_DEFINE_UNQUOTED([NO_TLS], [ ])
fi
dnl Finish tcache-related definitions, now that TLS configuration is done.
if test "x$enable_tls" = "x0" ; then
enable_tcache="0"
fi
if test "x$enable_tcache" = "x1" ; then
AC_DEFINE([JEMALLOC_TCACHE], [ ])
fi
AC_SUBST([enable_tcache])
if test "x$enable_tcache" = "x0" ; then
roff_tcache=".\\\" "
roff_no_tcache=""
else
roff_tcache=""
roff_no_tcache=".\\\" "
fi
AC_SUBST([roff_tcache])
AC_SUBST([roff_no_tcache])
dnl ============================================================================
dnl Check for allocator-related functions that should be wrapped.
dnl Finish prof-related definitions, now that TLS configuration is done.
if test "x$enable_tls" = "x0" ; then
enable_prof="0"
fi
if test "x$enable_prof" = "x1" ; then
LIBS="$LIBS -lm"
AC_DEFINE([JEMALLOC_PROF], [ ])
if test "x$enable_prof_libunwind" = "x1" ; then
AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
if test "x$LUNWIND" = "x-lunwind" ; then
AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"],
[enable_prof_libunwind="0"])
else
LIBS="$LIBS $LUNWIND"
fi
if test "x${enable_prof_libunwind}" = "x1" ; then
AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ])
fi
fi
fi
AC_SUBST([enable_prof])
if test "x$enable_prof" = "x0" ; then
roff_prof=".\\\" "
roff_no_prof=""
else
roff_prof=""
roff_no_prof=".\\\" "
fi
AC_SUBST([roff_prof])
AC_SUBST([roff_no_prof])
AC_CHECK_FUNC([memalign],
[AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN])])
AC_CHECK_FUNC([valloc],
[AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC])])
dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics
dnl for backtracing.
if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \
-a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then
enable_prof_libgcc="1"
AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"])
AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"])
if test "x${enable_prof_libgcc}" = "x1" ; then
AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ])
fi
dnl ============================================================================
dnl Darwin-related configuration.
if test "x${abi}" = "xmacho" ; then
AC_DEFINE([JEMALLOC_IVSALLOC])
AC_DEFINE([JEMALLOC_ZONE])
dnl The szone version jumped from 3 to 6 between the OS X 10.5.x and 10.6
dnl releases. malloc_zone_t and malloc_introspection_t have new fields in
dnl 10.6, which is the only source-level indication of the change.
AC_MSG_CHECKING([malloc zone version])
AC_TRY_COMPILE([#include <stdlib.h>
#include <malloc/malloc.h>], [
static malloc_zone_t zone;
static struct malloc_introspection_t zone_introspect;
zone.size = NULL;
zone.malloc = NULL;
zone.calloc = NULL;
zone.valloc = NULL;
zone.free = NULL;
zone.realloc = NULL;
zone.destroy = NULL;
zone.zone_name = "jemalloc_zone";
zone.batch_malloc = NULL;
zone.batch_free = NULL;
zone.introspect = &zone_introspect;
zone.version = 6;
zone.memalign = NULL;
zone.free_definite_size = NULL;
zone_introspect.enumerator = NULL;
zone_introspect.good_size = NULL;
zone_introspect.check = NULL;
zone_introspect.print = NULL;
zone_introspect.log = NULL;
zone_introspect.force_lock = NULL;
zone_introspect.force_unlock = NULL;
zone_introspect.statistics = NULL;
zone_introspect.zone_locked = NULL;
], [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [6])
AC_MSG_RESULT([6])],
[AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [3])
AC_MSG_RESULT([3])])
fi
dnl ============================================================================
@ -755,9 +843,11 @@ AC_MSG_RESULT([])
AC_MSG_RESULT([JEMALLOC_PREFIX : ${JEMALLOC_PREFIX}])
AC_MSG_RESULT([install_suffix : ${install_suffix}])
AC_MSG_RESULT([autogen : ${enable_autogen}])
AC_MSG_RESULT([cc-silence : ${enable_cc_silence}])
AC_MSG_RESULT([debug : ${enable_debug}])
AC_MSG_RESULT([stats : ${enable_stats}])
AC_MSG_RESULT([prof : ${enable_prof}])
AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}])
AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}])
AC_MSG_RESULT([tiny : ${enable_tiny}])
AC_MSG_RESULT([tcache : ${enable_tcache}])
@ -768,4 +858,5 @@ AC_MSG_RESULT([swap : ${enable_swap}])
AC_MSG_RESULT([dss : ${enable_dss}])
AC_MSG_RESULT([dynamic_page_shift : ${enable_dynamic_page_shift}])
AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}])
AC_MSG_RESULT([tls : ${enable_tls}])
AC_MSG_RESULT([===============================================================================])

File diff suppressed because it is too large.

jemalloc/include/jemalloc/internal/arena.h

@ -121,17 +121,17 @@ struct arena_chunk_map_s {
*
* p : run page offset
* s : run size
* c : size class (used only if prof_promote is true)
* c : (binind+1) for size class (used only if prof_promote is true)
* x : don't care
* - : 0
* + : 1
* [DZLA] : bit set
* [dzla] : bit unset
* [DULA] : bit set
* [dula] : bit unset
*
* Unallocated (clean):
* ssssssss ssssssss ssss---- ----dz--
* xxxxxxxx xxxxxxxx xxxx---- -----Zxx
* ssssssss ssssssss ssss---- ----dZ--
* ssssssss ssssssss ssss---- ----du--
* xxxxxxxx xxxxxxxx xxxx---- -----Uxx
* ssssssss ssssssss ssss---- ----dU--
*
* Unallocated (dirty):
* ssssssss ssssssss ssss---- ----D---
@ -144,7 +144,7 @@ struct arena_chunk_map_s {
* pppppppp pppppppp pppp---- ----d--a
*
* Large:
* ssssssss ssssssss ssss++++ ++++D-la
* ssssssss ssssssss ssss---- ----D-la
* xxxxxxxx xxxxxxxx xxxx---- ----xxxx
* -------- -------- -------- ----D-la
*
@ -152,7 +152,7 @@ struct arena_chunk_map_s {
* ssssssss ssssssss sssscccc ccccD-la
*
* Large (not sampled, size == PAGE_SIZE):
* ssssssss ssssssss ssss++++ ++++D-la
* ssssssss ssssssss ssss---- ----D-la
*/
size_t bits;
#ifdef JEMALLOC_PROF
@ -161,7 +161,7 @@ struct arena_chunk_map_s {
#endif
#define CHUNK_MAP_FLAGS_MASK ((size_t)0xfU)
#define CHUNK_MAP_DIRTY ((size_t)0x8U)
#define CHUNK_MAP_ZEROED ((size_t)0x4U)
#define CHUNK_MAP_UNZEROED ((size_t)0x4U)
#define CHUNK_MAP_LARGE ((size_t)0x2U)
#define CHUNK_MAP_ALLOCATED ((size_t)0x1U)
#define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED
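Tests of these bits reduce to simple mask checks; a minimal sketch of
classifying a page from its map bits, using a hypothetical helper built only
on the masks above:

    static inline const char *
    arena_mapbits_class(size_t bits)
    {
        if ((bits & CHUNK_MAP_ALLOCATED) == 0) {
            return ((bits & CHUNK_MAP_DIRTY) != 0 ?
                "unallocated (dirty)" : "unallocated (clean)");
        }
        return ((bits & CHUNK_MAP_LARGE) != 0 ? "large" : "small");
    }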
@ -187,7 +187,12 @@ struct arena_chunk_s {
/* Number of dirty pages. */
size_t ndirty;
/* Map of pages within chunk that keeps track of free/large/small. */
/*
* Map of pages within chunk that keeps track of free/large/small. The
* first map_bias entries are omitted, since the chunk header does not
* need to be tracked in the map. This omission saves a header page
* for common chunk sizes (e.g. 4 MiB).
*/
arena_chunk_map_t map[1]; /* Dynamically sized. */
};
typedef rb_tree(arena_chunk_t) arena_chunk_tree_t;
@ -416,8 +421,12 @@ extern size_t sspace_min;
extern size_t sspace_max;
#define small_maxclass sspace_max
#define nlclasses (chunk_npages - arena_chunk_header_npages)
#define nlclasses (chunk_npages - map_bias)
void arena_purge_all(arena_t *arena);
#ifdef JEMALLOC_PROF
void arena_prof_accum(arena_t *arena, uint64_t accumbytes);
#endif
#ifdef JEMALLOC_TCACHE
void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin,
size_t binind
@ -426,20 +435,15 @@ void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin,
# endif
);
#endif
#ifdef JEMALLOC_PROF
void arena_prof_accum(arena_t *arena, uint64_t accumbytes);
#endif
void *arena_malloc_small(arena_t *arena, size_t size, bool zero);
void *arena_malloc_large(arena_t *arena, size_t size, bool zero);
void *arena_malloc(size_t size, bool zero);
void *arena_palloc(arena_t *arena, size_t alignment, size_t size,
size_t alloc_size);
void *arena_palloc(arena_t *arena, size_t size, size_t alloc_size,
size_t alignment, bool zero);
size_t arena_salloc(const void *ptr);
#ifdef JEMALLOC_PROF
void arena_prof_promoted(const void *ptr, size_t size);
size_t arena_salloc_demote(const void *ptr);
prof_ctx_t *arena_prof_ctx_get(const void *ptr);
void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
#endif
void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
arena_chunk_map_t *mapelm);
@ -449,7 +453,10 @@ void arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty,
arena_stats_t *astats, malloc_bin_stats_t *bstats,
malloc_large_stats_t *lstats);
#endif
void *arena_ralloc(void *ptr, size_t size, size_t oldsize);
void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero);
void *arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero);
bool arena_new(arena_t *arena, unsigned ind);
bool arena_boot(void);
@ -458,10 +465,149 @@ bool arena_boot(void);
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin,
const void *ptr, size_t size);
# ifdef JEMALLOC_PROF
prof_ctx_t *arena_prof_ctx_get(const void *ptr);
void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
# endif
void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
JEMALLOC_INLINE unsigned
arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
size_t size)
{
unsigned shift, diff, regind;
assert(run->magic == ARENA_RUN_MAGIC);
/*
* Avoid doing division with a variable divisor if possible. Using
* actual division here can reduce allocator throughput by over 20%!
*/
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
/* Rescale (factor powers of 2 out of the numerator and denominator). */
shift = ffs(size) - 1;
diff >>= shift;
size >>= shift;
if (size == 1) {
/* The divisor was a power of 2. */
regind = diff;
} else {
/*
* To divide by a number D that is not a power of two we
* multiply by (2^21 / D) and then right shift by 21 positions.
*
* X / D
*
* becomes
*
* (X * size_invs[D - 3]) >> SIZE_INV_SHIFT
*
* We can omit the first three elements, because we never
* divide by 0, and 1 and 2 are both powers of two, which are
* handled above.
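* For example, with 24-byte regions, a 96-byte offset rescales to
* diff == 12 and size == 3, so regind == (12 * SIZE_INV(3)) >> 21
* == (12 * 699051) >> 21 == 4, which matches 96 / 24.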
*/
#define SIZE_INV_SHIFT 21
#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)
static const unsigned size_invs[] = {
SIZE_INV(3),
SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
};
if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2))
regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT;
else
regind = diff / size;
#undef SIZE_INV
#undef SIZE_INV_SHIFT
}
assert(diff == regind * size);
assert(regind < bin->nregs);
return (regind);
}
#ifdef JEMALLOC_PROF
JEMALLOC_INLINE prof_ctx_t *
arena_prof_ctx_get(const void *ptr)
{
prof_ctx_t *ret;
arena_chunk_t *chunk;
size_t pageind, mapbits;
assert(ptr != NULL);
assert(CHUNK_ADDR2BASE(ptr) != ptr);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
mapbits = chunk->map[pageind-map_bias].bits;
assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
if ((mapbits & CHUNK_MAP_LARGE) == 0) {
if (prof_promote)
ret = (prof_ctx_t *)(uintptr_t)1U;
else {
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
PAGE_SHIFT));
arena_bin_t *bin = run->bin;
unsigned regind;
assert(run->magic == ARENA_RUN_MAGIC);
regind = arena_run_regind(run, bin, ptr, bin->reg_size);
ret = *(prof_ctx_t **)((uintptr_t)run +
bin->ctx0_offset + (regind *
sizeof(prof_ctx_t *)));
}
} else
ret = chunk->map[pageind-map_bias].prof_ctx;
return (ret);
}
JEMALLOC_INLINE void
arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
size_t pageind, mapbits;
assert(ptr != NULL);
assert(CHUNK_ADDR2BASE(ptr) != ptr);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
mapbits = chunk->map[pageind-map_bias].bits;
assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
if ((mapbits & CHUNK_MAP_LARGE) == 0) {
if (prof_promote == false) {
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
PAGE_SHIFT));
arena_bin_t *bin = run->bin;
unsigned regind;
assert(run->magic == ARENA_RUN_MAGIC);
regind = arena_run_regind(run, bin, ptr, bin->reg_size);
*((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset
+ (regind * sizeof(prof_ctx_t *)))) = ctx;
} else
assert((uintptr_t)ctx == (uintptr_t)1U);
} else
chunk->map[pageind-map_bias].prof_ctx = ctx;
}
#endif
JEMALLOC_INLINE void
arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
{
@ -474,8 +620,8 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
assert(ptr != NULL);
assert(CHUNK_ADDR2BASE(ptr) != ptr);
pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
mapelm = &chunk->map[pageind];
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
mapelm = &chunk->map[pageind-map_bias];
assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0);
if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) {
/* Small allocation. */

jemalloc/include/jemalloc/internal/chunk.h

@ -39,13 +39,17 @@ extern malloc_mutex_t chunks_mtx;
extern chunk_stats_t stats_chunks;
#endif
#ifdef JEMALLOC_IVSALLOC
extern rtree_t *chunks_rtree;
#endif
extern size_t chunksize;
extern size_t chunksize_mask; /* (chunksize - 1). */
extern size_t chunk_npages;
extern size_t arena_chunk_header_npages;
extern size_t map_bias; /* Number of arena chunk header pages. */
extern size_t arena_maxclass; /* Max size class for arenas. */
void *chunk_alloc(size_t size, bool *zero);
void *chunk_alloc(size_t size, bool base, bool *zero);
void chunk_dealloc(void *chunk, size_t size);
bool chunk_boot(void);

jemalloc/include/jemalloc/internal/chunk_mmap.h

@ -13,6 +13,8 @@ void *chunk_alloc_mmap(size_t size);
void *chunk_alloc_mmap_noreserve(size_t size);
void chunk_dealloc_mmap(void *chunk, size_t size);
bool chunk_mmap_boot(void);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

jemalloc/include/jemalloc/internal/ckh.h

@ -45,7 +45,7 @@ struct ckh_s {
#endif
/* Used for pseudo-random number generation. */
#define CKH_A 12345
#define CKH_A 1103515241
#define CKH_C 12347
uint32_t prn_state;

jemalloc/include/jemalloc/internal/ctl.h

@ -82,9 +82,9 @@ bool ctl_boot(void);
#define xmallctl(name, oldp, oldlenp, newp, newlen) do { \
if (JEMALLOC_P(mallctl)(name, oldp, oldlenp, newp, newlen) \
!= 0) { \
malloc_write("<jemalloc>: Invalid xmallctl(\""); \
malloc_write("<jemalloc>: Failure in xmallctl(\""); \
malloc_write(name); \
malloc_write("\", ...) call\n"); \
malloc_write("\", ...)\n"); \
abort(); \
} \
} while (0)
@ -92,9 +92,9 @@ bool ctl_boot(void);
#define xmallctlnametomib(name, mibp, miblenp) do { \
if (JEMALLOC_P(mallctlnametomib)(name, mibp, miblenp) != 0) { \
malloc_write( \
"<jemalloc>: Invalid xmallctlnametomib(\""); \
"<jemalloc>: Failure in xmallctlnametomib(\""); \
malloc_write(name); \
malloc_write("\", ...) call\n"); \
malloc_write("\", ...)\n"); \
abort(); \
} \
} while (0)
@ -103,7 +103,7 @@ bool ctl_boot(void);
if (JEMALLOC_P(mallctlbymib)(mib, miblen, oldp, oldlenp, newp, \
newlen) != 0) { \
malloc_write( \
"<jemalloc>: Invalid xmallctlbymib() call\n"); \
"<jemalloc>: Failure in xmallctlbymib()\n"); \
abort(); \
} \
} while (0)
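Call sites treat failure of these wrappers as fatal; a minimal sketch of
reading the "thread.allocated" counter through xmallctl() (hypothetical
helper, internal to the library):

    static uint64_t
    thread_allocated_get(void)
    {
        uint64_t allocated;
        size_t sz = sizeof(uint64_t);

        /* Aborts, via the macro above, if the mallctl fails. */
        xmallctl("thread.allocated", &allocated, &sz, NULL, 0);
        return (allocated);
    }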

jemalloc/include/jemalloc/internal/huge.h

@ -20,8 +20,11 @@ extern size_t huge_allocated;
extern malloc_mutex_t huge_mtx;
void *huge_malloc(size_t size, bool zero);
void *huge_palloc(size_t alignment, size_t size);
void *huge_ralloc(void *ptr, size_t size, size_t oldsize);
void *huge_palloc(size_t size, size_t alignment, bool zero);
void *huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
size_t extra);
void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero);
void huge_dalloc(void *ptr);
size_t huge_salloc(const void *ptr);
#ifdef JEMALLOC_PROF

jemalloc/include/jemalloc/internal/jemalloc_internal.h.in

@ -17,16 +17,29 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#ifndef offsetof
# define offsetof(type, member) ((size_t)&(((type *)NULL)->member))
#endif
#include <inttypes.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <math.h>
#define JEMALLOC_MANGLE
#include "../jemalloc@install_suffix@.h"
#ifdef JEMALLOC_ZONE
#include <mach/mach_error.h>
#include <mach/mach_init.h>
#include <mach/vm_map.h>
#include <malloc/malloc.h>
#endif
#ifdef JEMALLOC_LAZY_LOCK
#include <dlfcn.h>
#endif
@ -49,7 +62,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
malloc_write("<jemalloc>: "); \
malloc_write(__FILE__); \
malloc_write(":"); \
malloc_write(umax2s(__LINE__, 10, line_buf)); \
malloc_write(u2s(__LINE__, 10, line_buf)); \
malloc_write(": Failed assertion: "); \
malloc_write("\""); \
malloc_write(#e); \
@ -77,6 +90,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
/******************************************************************************/
#define JEMALLOC_H_TYPES
#define ALLOCM_LG_ALIGN_MASK ((int)0x3f)
#define ZU(z) ((size_t)z)
#ifndef __DECONST
@ -92,8 +107,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
# define JEMALLOC_INLINE static inline
#endif
/* Size of stack-allocated buffer passed to strerror_r(). */
#define STRERROR_BUF 64
/* Size of stack-allocated buffer passed to buferror(). */
#define BUFERROR_BUF 64
/* Minimum alignment of allocations is 2^LG_QUANTUM bytes. */
#ifdef __i386__
@ -159,6 +174,16 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#define STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT))
#define STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1))
#ifdef PAGE_SHIFT
# undef PAGE_SHIFT
#endif
#ifdef PAGE_SIZE
# undef PAGE_SIZE
#endif
#ifdef PAGE_MASK
# undef PAGE_MASK
#endif
#ifdef DYNAMIC_PAGE_SHIFT
# define PAGE_SHIFT lg_pagesize
# define PAGE_SIZE pagesize
@ -184,8 +209,12 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/chunk.h"
#include "jemalloc/internal/huge.h"
#include "jemalloc/internal/rtree.h"
#include "jemalloc/internal/tcache.h"
#include "jemalloc/internal/hash.h"
#ifdef JEMALLOC_ZONE
#include "jemalloc/internal/zone.h"
#endif
#include "jemalloc/internal/prof.h"
#undef JEMALLOC_H_TYPES
@ -203,8 +232,12 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/chunk.h"
#include "jemalloc/internal/huge.h"
#include "jemalloc/internal/rtree.h"
#include "jemalloc/internal/tcache.h"
#include "jemalloc/internal/hash.h"
#ifdef JEMALLOC_ZONE
#include "jemalloc/internal/zone.h"
#endif
#include "jemalloc/internal/prof.h"
#undef JEMALLOC_H_STRUCTS
@ -224,6 +257,7 @@ extern bool opt_xmalloc;
#ifdef JEMALLOC_FILL
extern bool opt_zero;
#endif
extern size_t opt_narenas;
#ifdef DYNAMIC_PAGE_SHIFT
extern size_t pagesize;
@ -240,8 +274,19 @@ extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */
* Map of pthread_self() --> arenas[???], used for selecting an arena to use
* for allocations.
*/
extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec"));
extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
# define ARENA_GET() arenas_tls
# define ARENA_SET(v) do { \
arenas_tls = (v); \
} while (0)
#else
extern pthread_key_t arenas_tsd;
# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd))
# define ARENA_SET(v) do { \
pthread_setspecific(arenas_tsd, (void *)(v)); \
} while (0)
#endif
/*
* Arenas that are used to service external requests. Not all elements of the
* arenas array are necessarily used; arenas are created lazily as needed.
@ -249,11 +294,56 @@ extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec"));
extern arena_t **arenas;
extern unsigned narenas;
arena_t *arenas_extend(unsigned ind);
#ifndef NO_TLS
arena_t *choose_arena_hard(void);
#ifdef JEMALLOC_STATS
typedef struct {
uint64_t allocated;
uint64_t deallocated;
} thread_allocated_t;
# ifndef NO_TLS
extern __thread thread_allocated_t thread_allocated_tls;
# define ALLOCATED_GET() thread_allocated_tls.allocated
# define DEALLOCATED_GET() thread_allocated_tls.deallocated
# define ALLOCATED_ADD(a, d) do { \
thread_allocated_tls.allocated += a; \
thread_allocated_tls.deallocated += d; \
} while (0)
# else
extern pthread_key_t thread_allocated_tsd;
# define ALLOCATED_GET() \
(uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \
? ((thread_allocated_t *) \
pthread_getspecific(thread_allocated_tsd))->allocated : 0)
# define DEALLOCATED_GET() \
(uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \
? ((thread_allocated_t \
*)pthread_getspecific(thread_allocated_tsd))->deallocated : \
0)
# define ALLOCATED_ADD(a, d) do { \
thread_allocated_t *thread_allocated = (thread_allocated_t *) \
pthread_getspecific(thread_allocated_tsd); \
if (thread_allocated != NULL) { \
thread_allocated->allocated += (a); \
thread_allocated->deallocated += (d); \
} else { \
thread_allocated = (thread_allocated_t *) \
imalloc(sizeof(thread_allocated_t)); \
if (thread_allocated != NULL) { \
pthread_setspecific(thread_allocated_tsd, \
thread_allocated); \
thread_allocated->allocated = (a); \
thread_allocated->deallocated = (d); \
} \
} \
} while (0)
# endif
#endif
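These macros are meant to be driven from the allocation and deallocation
paths; a minimal sketch of a call site, where usize is the usable size of the
object involved (hypothetical placement):

    #ifdef JEMALLOC_STATS
        /* After a successful allocation of usize bytes: */
        ALLOCATED_ADD(usize, 0);
        /* After deallocating an object of usable size usize: */
        ALLOCATED_ADD(0, usize);
    #endif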
arena_t *arenas_extend(unsigned ind);
arena_t *choose_arena_hard(void);
int buferror(int errnum, char *buf, size_t buflen);
void jemalloc_prefork(void);
void jemalloc_postfork(void);
#include "jemalloc/internal/prn.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/stats.h"
@ -265,8 +355,12 @@ arena_t *choose_arena_hard(void);
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/chunk.h"
#include "jemalloc/internal/huge.h"
#include "jemalloc/internal/rtree.h"
#include "jemalloc/internal/tcache.h"
#include "jemalloc/internal/hash.h"
#ifdef JEMALLOC_ZONE
#include "jemalloc/internal/zone.h"
#endif
#include "jemalloc/internal/prof.h"
#undef JEMALLOC_H_EXTERNS
@ -285,11 +379,143 @@ arena_t *choose_arena_hard(void);
#include "jemalloc/internal/huge.h"
#ifndef JEMALLOC_ENABLE_INLINE
size_t pow2_ceil(size_t x);
size_t s2u(size_t size);
size_t sa2u(size_t size, size_t alignment, size_t *run_size_p);
void malloc_write(const char *s);
arena_t *choose_arena(void);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
/* Compute the smallest power of 2 that is >= x. */
JEMALLOC_INLINE size_t
pow2_ceil(size_t x)
{
x--;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
#if (LG_SIZEOF_PTR == 3)
x |= x >> 32;
#endif
x++;
return (x);
}
/*
* Compute usable size that would result from allocating an object with the
* specified size.
*/
JEMALLOC_INLINE size_t
s2u(size_t size)
{
if (size <= small_maxclass)
return arenas[0]->bins[small_size2bin[size]].reg_size;
if (size <= arena_maxclass)
return PAGE_CEILING(size);
return CHUNK_CEILING(size);
}
/*
* Compute usable size that would result from allocating an object with the
* specified size and alignment.
*/
JEMALLOC_INLINE size_t
sa2u(size_t size, size_t alignment, size_t *run_size_p)
{
size_t usize;
/*
* Round size up to the nearest multiple of alignment.
*
* This done, we can take advantage of the fact that for each small
* size class, every object is aligned at the smallest power of two
* that is non-zero in the base two representation of the size. For
* example:
*
* Size | Base 2 | Minimum alignment
* -----+----------+------------------
* 96 | 1100000 | 32
* 144 | 10100000 | 32
* 192 | 11000000 | 64
*
* Depending on runtime settings, it is possible that arena_malloc()
* will further round up to a power of two, but that never causes
* correctness issues.
*/
usize = (size + (alignment - 1)) & (-alignment);
/*
* (usize < size) protects against the combination of maximal
* alignment and size greater than maximal alignment.
*/
if (usize < size) {
/* size_t overflow. */
return (0);
}
if (usize <= arena_maxclass && alignment <= PAGE_SIZE) {
if (usize <= small_maxclass) {
return
(arenas[0]->bins[small_size2bin[usize]].reg_size);
}
return (PAGE_CEILING(usize));
} else {
size_t run_size;
/*
* We can't achieve subpage alignment, so round up alignment
* permanently; it makes later calculations simpler.
*/
alignment = PAGE_CEILING(alignment);
usize = PAGE_CEILING(size);
/*
* (usize < size) protects against very large sizes within
* PAGE_SIZE of SIZE_T_MAX.
*
* (usize + alignment < usize) protects against the
* combination of maximal alignment and usize large enough
* to cause overflow. This is similar to the first overflow
* check above, but it needs to be repeated due to the new
* usize value, which may now be *equal* to maximal
* alignment, whereas before we only detected overflow if the
* original size was *greater* than maximal alignment.
*/
if (usize < size || usize + alignment < usize) {
/* size_t overflow. */
return (0);
}
/*
* Calculate the size of the over-size run that arena_palloc()
* would need to allocate in order to guarantee the alignment.
*/
if (usize >= alignment)
run_size = usize + alignment - PAGE_SIZE;
else {
/*
* It is possible that (alignment << 1) will cause
* overflow, but it doesn't matter because we also
* subtract PAGE_SIZE, which in the case of overflow
* leaves us with a very large run_size. That causes
* the first conditional below to fail, which means
* that the bogus run_size value never gets used for
* anything important.
*/
run_size = (alignment << 1) - PAGE_SIZE;
}
if (run_size_p != NULL)
*run_size_p = run_size;
if (run_size <= arena_maxclass)
return (PAGE_CEILING(usize));
return (CHUNK_CEILING(usize));
}
}
/*
* Wrapper around malloc_message() that avoids the need for
* JEMALLOC_P(malloc_message)(...) throughout the code.
@ -310,78 +536,35 @@ choose_arena(void)
{
arena_t *ret;
/*
* We can only use TLS if this is a PIC library, since for the static
* library version, libc's malloc is used by TLS allocation, which
* introduces a bootstrapping issue.
*/
#ifndef NO_TLS
ret = arenas_map;
ret = ARENA_GET();
if (ret == NULL) {
ret = choose_arena_hard();
assert(ret != NULL);
}
#else
if (isthreaded && narenas > 1) {
unsigned long ind;
/*
* Hash pthread_self() to one of the arenas. There is a prime
* number of arenas, so this has a reasonable chance of
* working. Even so, the hashing can be easily thwarted by
* inconvenient pthread_self() values. Without specific
* knowledge of how pthread_self() calculates values, we can't
* easily do much better than this.
*/
ind = (unsigned long) pthread_self() % narenas;
/*
* Optimistically assume that arenas[ind] has been initialized.
* At worst, we find out that some other thread has already
* done so, after acquiring the lock in preparation. Note that
* this lazy locking also has the effect of lazily forcing
* cache coherency; without the lock acquisition, there's no
* guarantee that modification of arenas[ind] by another thread
* would be seen on this CPU for an arbitrary amount of time.
*
* In general, this approach to modifying a synchronized value
* isn't a good idea, but in this case we only ever modify the
* value once, so things work out well.
*/
ret = arenas[ind];
if (ret == NULL) {
/*
* Avoid races with another thread that may have already
* initialized arenas[ind].
*/
malloc_mutex_lock(&arenas_lock);
if (arenas[ind] == NULL)
ret = arenas_extend((unsigned)ind);
else
ret = arenas[ind];
malloc_mutex_unlock(&arenas_lock);
}
} else
ret = arenas[0];
#endif
assert(ret != NULL);
return (ret);
}
#endif
#include "jemalloc/internal/rtree.h"
#include "jemalloc/internal/tcache.h"
#include "jemalloc/internal/arena.h"
#include "jemalloc/internal/hash.h"
#include "jemalloc/internal/prof.h"
#ifdef JEMALLOC_ZONE
#include "jemalloc/internal/zone.h"
#endif
#ifndef JEMALLOC_ENABLE_INLINE
void *imalloc(size_t size);
void *icalloc(size_t size);
void *ipalloc(size_t alignment, size_t size);
void *ipalloc(size_t size, size_t alignment, bool zero);
size_t isalloc(const void *ptr);
void *iralloc(void *ptr, size_t size);
# ifdef JEMALLOC_IVSALLOC
size_t ivsalloc(const void *ptr);
# endif
void idalloc(void *ptr);
void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment,
bool zero, bool no_move);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@ -408,95 +591,28 @@ icalloc(size_t size)
}
JEMALLOC_INLINE void *
ipalloc(size_t alignment, size_t size)
ipalloc(size_t size, size_t alignment, bool zero)
{
void *ret;
size_t ceil_size;
size_t usize;
size_t run_size
# ifdef JEMALLOC_CC_SILENCE
= 0
# endif
;
/*
* Round size up to the nearest multiple of alignment.
*
* This done, we can take advantage of the fact that for each small
* size class, every object is aligned at the smallest power of two
* that is non-zero in the base two representation of the size. For
* example:
*
* Size | Base 2 | Minimum alignment
* -----+----------+------------------
* 96 | 1100000 | 32
* 144 | 10100000 | 32
* 192 | 11000000 | 64
*
* Depending on runtime settings, it is possible that arena_malloc()
* will further round up to a power of two, but that never causes
* correctness issues.
*/
ceil_size = (size + (alignment - 1)) & (-alignment);
/*
* (ceil_size < size) protects against the combination of maximal
* alignment and size greater than maximal alignment.
*/
if (ceil_size < size) {
/* size_t overflow. */
usize = sa2u(size, alignment, &run_size);
if (usize == 0)
return (NULL);
}
if (ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE
&& ceil_size <= arena_maxclass))
ret = arena_malloc(ceil_size, false);
else {
size_t run_size;
/*
* We can't achieve subpage alignment, so round up alignment
* permanently; it makes later calculations simpler.
*/
alignment = PAGE_CEILING(alignment);
ceil_size = PAGE_CEILING(size);
/*
* (ceil_size < size) protects against very large sizes within
* PAGE_SIZE of SIZE_T_MAX.
*
* (ceil_size + alignment < ceil_size) protects against the
* combination of maximal alignment and ceil_size large enough
* to cause overflow. This is similar to the first overflow
* check above, but it needs to be repeated due to the new
* ceil_size value, which may now be *equal* to maximal
* alignment, whereas before we only detected overflow if the
* original size was *greater* than maximal alignment.
*/
if (ceil_size < size || ceil_size + alignment < ceil_size) {
/* size_t overflow. */
return (NULL);
}
/*
* Calculate the size of the over-size run that arena_palloc()
* would need to allocate in order to guarantee the alignment.
*/
if (ceil_size >= alignment)
run_size = ceil_size + alignment - PAGE_SIZE;
else {
/*
* It is possible that (alignment << 1) will cause
* overflow, but it doesn't matter because we also
* subtract PAGE_SIZE, which in the case of overflow
* leaves us with a very large run_size. That causes
* the first conditional below to fail, which means
* that the bogus run_size value never gets used for
* anything important.
*/
run_size = (alignment << 1) - PAGE_SIZE;
}
if (run_size <= arena_maxclass) {
ret = arena_palloc(choose_arena(), alignment, ceil_size,
run_size);
if (usize <= arena_maxclass && alignment <= PAGE_SIZE)
ret = arena_malloc(usize, zero);
else if (run_size <= arena_maxclass) {
ret = arena_palloc(choose_arena(), usize, run_size, alignment,
zero);
} else if (alignment <= chunksize)
ret = huge_malloc(ceil_size, false);
ret = huge_malloc(usize, zero);
else
ret = huge_palloc(alignment, ceil_size);
}
ret = huge_palloc(usize, alignment, zero);
assert(((uintptr_t)ret & (alignment - 1)) == 0);
return (ret);
@ -526,21 +642,18 @@ isalloc(const void *ptr)
return (ret);
}
JEMALLOC_INLINE void *
iralloc(void *ptr, size_t size)
#ifdef JEMALLOC_IVSALLOC
JEMALLOC_INLINE size_t
ivsalloc(const void *ptr)
{
size_t oldsize;
assert(ptr != NULL);
assert(size != 0);
/* Return 0 if ptr is not within a chunk managed by jemalloc. */
if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == NULL)
return (0);
oldsize = isalloc(ptr);
if (size <= arena_maxclass)
return (arena_ralloc(ptr, size, oldsize));
else
return (huge_ralloc(ptr, size, oldsize));
return (isalloc(ptr));
}
#endif
JEMALLOC_INLINE void
idalloc(void *ptr)
@ -555,7 +668,70 @@ idalloc(void *ptr)
else
huge_dalloc(ptr);
}
JEMALLOC_INLINE void *
iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
bool no_move)
{
void *ret;
size_t oldsize;
assert(ptr != NULL);
assert(size != 0);
oldsize = isalloc(ptr);
if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
!= 0) {
size_t copysize;
/*
* Existing object alignment is inadequate; allocate new space
* and copy.
*/
if (no_move)
return (NULL);
ret = ipalloc(size + extra, alignment, zero);
if (ret == NULL) {
if (extra == 0)
return (NULL);
/* Try again, without extra this time. */
ret = ipalloc(size, alignment, zero);
if (ret == NULL)
return (NULL);
}
/*
* Copy at most size bytes (not size+extra), since the caller
* has no expectation that the extra bytes will be reliably
* preserved.
*/
copysize = (size < oldsize) ? size : oldsize;
memcpy(ret, ptr, copysize);
idalloc(ptr);
return (ret);
}
if (no_move) {
if (size <= arena_maxclass) {
return (arena_ralloc_no_move(ptr, oldsize, size,
extra, zero));
} else {
return (huge_ralloc_no_move(ptr, oldsize, size,
extra));
}
} else {
if (size + extra <= arena_maxclass) {
return (arena_ralloc(ptr, oldsize, size, extra,
alignment, zero));
} else {
return (huge_ralloc(ptr, oldsize, size, extra,
alignment, zero));
}
}
}
#endif
#include "jemalloc/internal/prof.h"
#undef JEMALLOC_H_INLINES
/******************************************************************************/

jemalloc/include/jemalloc/internal/mutex.h

@ -3,6 +3,12 @@
typedef pthread_mutex_t malloc_mutex_t;
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
# define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
#else
# define MALLOC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#endif
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
@ -18,6 +24,7 @@ extern bool isthreaded;
#endif
bool malloc_mutex_init(malloc_mutex_t *mutex);
void malloc_mutex_destroy(malloc_mutex_t *mutex);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/


@ -6,20 +6,25 @@ typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;
/* Option defaults. */
#define PROF_PREFIX_DEFAULT "jeprof"
#define LG_PROF_BT_MAX_DEFAULT 7
#define LG_PROF_SAMPLE_DEFAULT 0
#define LG_PROF_INTERVAL_DEFAULT -1
#define LG_PROF_TCMAX_DEFAULT -1
/*
* Hard limit on stack backtrace depth. Note that the version of
* prof_backtrace() that is based on __builtin_return_address() necessarily has
* a hard-coded number of backtrace frame handlers.
*/
#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
# define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
#else
# define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
#endif
#define PROF_BT_MAX (1U << LG_PROF_BT_MAX)
/* Initial hash table size. */
@ -51,11 +56,11 @@ struct prof_cnt_s {
/*
* Profiling counters. An allocation/deallocation pair can operate on
* different prof_thr_cnt_t objects that are linked into the same
* prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
* negative. In principle it is possible for the *bytes counters to
* overflow/underflow, but a general solution would require something
* like 128-bit counters; this implementation doesn't bother to solve
* that problem.
*/
int64_t curobjs;
int64_t curbytes;
@ -64,15 +69,18 @@ struct prof_cnt_s {
};
struct prof_thr_cnt_s {
/* Linkage into prof_ctx_t's cnts_ql. */
ql_elm(prof_thr_cnt_t) cnts_link;
/* Linkage into thread's LRU. */
ql_elm(prof_thr_cnt_t) lru_link;
/*
* Associated context. If a thread frees an object that it did not
* allocate, it is possible that the context is not cached in the
* thread's hash table, in which case it must be able to look up the
* context, insert a new prof_thr_cnt_t into the thread's hash table,
* and link it into the prof_ctx_t's cnts_ql.
*/
prof_ctx_t *ctx;
@ -101,11 +109,11 @@ struct prof_ctx_s {
/* Associated backtrace. */
prof_bt_t *bt;
/* Protects cnt_merged and cnts_ql. */
malloc_mutex_t lock;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* When threads exit, they merge their stats into cnt_merged. */
prof_cnt_t cnt_merged;
@ -117,6 +125,31 @@ struct prof_ctx_s {
ql_head(prof_thr_cnt_t) cnts_ql;
};
struct prof_tdata_s {
/*
* Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
* cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no
* others will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t
* counter data into the associated prof_ctx_t objects, and unlink/free
* the prof_thr_cnt_t objects.
*/
ckh_t bt2cnt;
/* LRU for contents of bt2cnt. */
ql_head(prof_thr_cnt_t) lru_ql;
/* Backtrace vector, used for calls to prof_backtrace(). */
void **vec;
/* Sampling state. */
uint64_t prn_state;
uint64_t threshold;
uint64_t accum;
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
@ -132,8 +165,11 @@ extern bool opt_prof_active;
extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump; /* High-water memory dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread backtrace cache) */
extern char opt_prof_prefix[PATH_MAX + 1];
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
@ -150,25 +186,362 @@ extern uint64_t prof_interval;
*/
extern bool prof_promote;
/* (1U << opt_lg_prof_bt_max). */
extern unsigned prof_bt_max;
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
#ifndef NO_TLS
extern __thread prof_tdata_t *prof_tdata_tls
JEMALLOC_ATTR(tls_model("initial-exec"));
# define PROF_TCACHE_GET() prof_tdata_tls
# define PROF_TCACHE_SET(v) do { \
prof_tdata_tls = (v); \
pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
} while (0)
#else
# define PROF_TCACHE_GET() \
((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
# define PROF_TCACHE_SET(v) do { \
pthread_setspecific(prof_tdata_tsd, (void *)(v)); \
} while (0)
#endif
/*
* Same contents as prof_tdata_tls, but initialized such that the TSD destructor is
* called when a thread exits, so that prof_tdata_tls contents can be merged,
* unlinked, and deallocated.
*/
extern pthread_key_t prof_tdata_tsd;
void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
void prof_idump(void);
bool prof_mdump(const char *filename);
void prof_gdump(void);
prof_tdata_t *prof_tdata_init(void);
void prof_boot0(void);
void prof_boot1(void);
bool prof_boot2(void);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool prof_sample_accum_update(size_t size);
void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
size_t old_size, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
uint64_t r;
double u;
/*
* Compute prof_sample_threshold as a geometrically distributed random
* variable with mean (2^opt_lg_prof_sample).
*/
prn64(r, 53, prof_tdata->prn_state,
(uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
u = (double)r * (1.0/9007199254740992.0L);
prof_tdata->threshold = (uint64_t)(log(u) /
log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+ (uint64_t)1U;
}
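
The computation above is inverse-CDF sampling of a geometric distribution: for u uniform in (0, 1) and p = 1/2^lg_sample, floor(log(u)/log(1 - p)) + 1 is geometrically distributed with mean 1/p, so samples land on average 2^lg_sample allocated bytes apart. A self-contained restatement (illustrative; geometric_threshold and its uniform-input contract are assumptions, not jemalloc API):

#include <math.h>
#include <stdint.h>

/*
 * Standalone sketch of the threshold computation.  u must lie in
 * (0, 1); p is the per-byte sampling probability 1/2^lg_sample.
 */
static uint64_t
geometric_threshold(double u, unsigned lg_sample)
{
	double p = 1.0 / (double)((uint64_t)1U << lg_sample);

	return ((uint64_t)(log(u) / log(1.0 - p)) + 1);
}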
JEMALLOC_INLINE prof_thr_cnt_t *
prof_alloc_prep(size_t size)
{
#ifdef JEMALLOC_ENABLE_INLINE
/* This function does not have its own stack frame, because it is inlined. */
# define NIGNORE 1
#else
# define NIGNORE 2
#endif
prof_thr_cnt_t *ret;
prof_tdata_t *prof_tdata;
prof_bt_t bt;
assert(size == s2u(size));
prof_tdata = PROF_TCACHE_GET();
if (prof_tdata == NULL) {
prof_tdata = prof_tdata_init();
if (prof_tdata == NULL)
return (NULL);
}
if (opt_prof_active == false) {
/* Sampling is currently inactive, so avoid sampling. */
ret = (prof_thr_cnt_t *)(uintptr_t)1U;
} else if (opt_lg_prof_sample == 0) {
/*
* Don't bother with sampling logic, since sampling interval is
* 1.
*/
bt_init(&bt, prof_tdata->vec);
prof_backtrace(&bt, NIGNORE, prof_bt_max);
ret = prof_lookup(&bt);
} else {
if (prof_tdata->threshold == 0) {
/*
* Initialize. Seed the prng differently for each
* thread.
*/
prof_tdata->prn_state = (uint64_t)(uintptr_t)&size;
prof_sample_threshold_update(prof_tdata);
}
/*
* Determine whether to capture a backtrace based on whether
* size is enough for prof_accum to reach
* prof_tdata->threshold. However, delay updating these
* variables until prof_{m,re}alloc(), because we don't know
* for sure that the allocation will succeed.
*
* Use subtraction rather than addition to avoid potential
* integer overflow.
*/
if (size >= prof_tdata->threshold - prof_tdata->accum) {
bt_init(&bt, prof_tdata->vec);
prof_backtrace(&bt, NIGNORE, prof_bt_max);
ret = prof_lookup(&bt);
} else
ret = (prof_thr_cnt_t *)(uintptr_t)1U;
}
return (ret);
#undef NIGNORE
}
JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
prof_ctx_t *ret;
arena_chunk_t *chunk;
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
assert(chunk->arena->magic == ARENA_MAGIC);
ret = arena_prof_ctx_get(ptr);
} else
ret = huge_prof_ctx_get(ptr);
return (ret);
}
JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
assert(chunk->arena->magic == ARENA_MAGIC);
arena_prof_ctx_set(ptr, ctx);
} else
huge_prof_ctx_set(ptr, ctx);
}
JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
prof_tdata_t *prof_tdata;
/* Sampling logic is unnecessary if the interval is 1. */
assert(opt_lg_prof_sample != 0);
prof_tdata = PROF_TCACHE_GET();
assert(prof_tdata != NULL);
/* Take care to avoid integer overflow. */
if (size >= prof_tdata->threshold - prof_tdata->accum) {
prof_tdata->accum -= (prof_tdata->threshold - size);
/* Compute new prof_sample_threshold. */
prof_sample_threshold_update(prof_tdata);
while (prof_tdata->accum >= prof_tdata->threshold) {
prof_tdata->accum -= prof_tdata->threshold;
prof_sample_threshold_update(prof_tdata);
}
return (false);
} else {
prof_tdata->accum += size;
return (true);
}
}
JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{
assert(ptr != NULL);
assert(size == isalloc(ptr));
if (opt_lg_prof_sample != 0) {
if (prof_sample_accum_update(size)) {
/*
* Don't sample. For malloc()-like allocation, it is
* always possible to tell in advance how large an
* object's usable size will be, so there should never
* be a difference between the size passed to
* prof_alloc_prep() and prof_malloc().
*/
assert((uintptr_t)cnt == (uintptr_t)1U);
}
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
prof_ctx_set(ptr, cnt->ctx);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
} else
prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}
JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
size_t old_size, prof_ctx_t *old_ctx)
{
prof_thr_cnt_t *told_cnt;
assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
if (ptr != NULL) {
assert(size == isalloc(ptr));
if (opt_lg_prof_sample != 0) {
if (prof_sample_accum_update(size)) {
/*
* Don't sample. The size passed to
* prof_alloc_prep() was larger than what
* actually got allocated, so a backtrace was
* captured for this allocation, even though
* its actual size was insufficient to cross
* the sample threshold.
*/
cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
}
}
if ((uintptr_t)old_ctx > (uintptr_t)1U) {
told_cnt = prof_lookup(old_ctx->bt);
if (told_cnt == NULL) {
/*
* It's too late to propagate OOM for this realloc(),
* so operate directly on old_cnt->ctx->cnt_merged.
*/
malloc_mutex_lock(&old_ctx->lock);
old_ctx->cnt_merged.curobjs--;
old_ctx->cnt_merged.curbytes -= old_size;
malloc_mutex_unlock(&old_ctx->lock);
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
} else
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U) {
prof_ctx_set(ptr, cnt->ctx);
cnt->epoch++;
} else
prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U) {
told_cnt->cnts.curobjs--;
told_cnt->cnts.curbytes -= old_size;
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
cnt->cnts.curbytes += size;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += size;
}
}
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U)
cnt->epoch++;
/*********/
mb_write(); /* Not strictly necessary. */
}
JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
prof_ctx_t *ctx = prof_ctx_get(ptr);
if ((uintptr_t)ctx > (uintptr_t)1) {
assert(size == isalloc(ptr));
prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
if (tcnt != NULL) {
tcnt->epoch++;
/*********/
mb_write();
/*********/
tcnt->cnts.curobjs--;
tcnt->cnts.curbytes -= size;
/*********/
mb_write();
/*********/
tcnt->epoch++;
/*********/
mb_write();
/*********/
} else {
/*
* OOM during free() cannot be propagated, so operate
* directly on cnt->ctx->cnt_merged.
*/
malloc_mutex_lock(&ctx->lock);
ctx->cnt_merged.curobjs--;
ctx->cnt_merged.curbytes -= size;
malloc_mutex_unlock(&ctx->lock);
}
}
}
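
The epoch increments bracketed by mb_write() in prof_malloc(), prof_realloc(), and prof_free() form a miniature seqlock: an update leaves the epoch odd while in flight and even once complete. A hedged sketch of the matching reader side (generic code, not jemalloc's dump path; volatile stands in for real read barriers):

#include <stdint.h>

typedef struct {
	volatile uint64_t epoch;
	volatile int64_t curobjs;
	volatile int64_t curbytes;
} snap_t;

/* Retry until the epoch is stable and even, ruling out a torn read. */
static void
snap_read(const snap_t *s, int64_t *objs, int64_t *bytes)
{
	uint64_t e0, e1;

	do {
		e0 = s->epoch;
		*objs = s->curobjs;
		*bytes = s->curbytes;
		e1 = s->epoch;
	} while (e0 != e1 || (e0 & 1) != 0);
}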
#endif
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
#endif /* JEMALLOC_PROF */


@ -0,0 +1,161 @@
/*
* This radix tree implementation is tailored to the singular purpose of
* tracking which chunks are currently owned by jemalloc. This functionality
* is mandatory for OS X, where jemalloc must be able to respond to object
* ownership queries.
*
*******************************************************************************
*/
#ifdef JEMALLOC_H_TYPES
typedef struct rtree_s rtree_t;
/*
* Size of each radix tree node (must be a power of 2). This impacts tree
* depth.
*/
#if (LG_SIZEOF_PTR == 2)
# define RTREE_NODESIZE (1U << 14)
#else
# define RTREE_NODESIZE CACHELINE
#endif
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
struct rtree_s {
malloc_mutex_t mutex;
void **root;
unsigned height;
unsigned level2bits[1]; /* Dynamically sized. */
};
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
rtree_t *rtree_new(unsigned bits);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
#ifdef JEMALLOC_DEBUG
void *rtree_get_locked(rtree_t *rtree, uintptr_t key);
#endif
void *rtree_get(rtree_t *rtree, uintptr_t key);
bool rtree_set(rtree_t *rtree, uintptr_t key, void *val);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_))
#define RTREE_GET_GENERATE(f) \
/* The least significant bits of the key are ignored. */ \
JEMALLOC_INLINE void * \
f(rtree_t *rtree, uintptr_t key) \
{ \
void *ret; \
uintptr_t subkey; \
unsigned i, lshift, height, bits; \
void **node, **child; \
\
RTREE_LOCK(&rtree->mutex); \
for (i = lshift = 0, height = rtree->height, node = rtree->root;\
i < height - 1; \
i++, lshift += bits, node = child) { \
bits = rtree->level2bits[i]; \
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \
3)) - bits); \
child = (void**)node[subkey]; \
if (child == NULL) { \
RTREE_UNLOCK(&rtree->mutex); \
return (NULL); \
} \
} \
\
/* \
* node is a leaf, so it contains values rather than node \
* pointers. \
*/ \
bits = rtree->level2bits[i]; \
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \
bits); \
ret = node[subkey]; \
RTREE_UNLOCK(&rtree->mutex); \
\
RTREE_GET_VALIDATE \
return (ret); \
}
#ifdef JEMALLOC_DEBUG
# define RTREE_LOCK(l) malloc_mutex_lock(l)
# define RTREE_UNLOCK(l) malloc_mutex_unlock(l)
# define RTREE_GET_VALIDATE
RTREE_GET_GENERATE(rtree_get_locked)
# undef RTREE_LOCK
# undef RTREE_UNLOCK
# undef RTREE_GET_VALIDATE
#endif
#define RTREE_LOCK(l)
#define RTREE_UNLOCK(l)
#ifdef JEMALLOC_DEBUG
/*
* Suppose that it were possible for a jemalloc-allocated chunk to be
* munmap()ped, followed by a different allocator in another thread re-using
* overlapping virtual memory, all without invalidating the cached rtree
* value. The result would be a false positive (the rtree would claim that
* jemalloc owns memory that it had actually discarded). This scenario
* seems impossible, but the following assertion is a prudent sanity check.
*/
# define RTREE_GET_VALIDATE \
assert(rtree_get_locked(rtree, key) == ret);
#else
# define RTREE_GET_VALIDATE
#endif
RTREE_GET_GENERATE(rtree_get)
#undef RTREE_LOCK
#undef RTREE_UNLOCK
#undef RTREE_GET_VALIDATE
JEMALLOC_INLINE bool
rtree_set(rtree_t *rtree, uintptr_t key, void *val)
{
uintptr_t subkey;
unsigned i, lshift, height, bits;
void **node, **child;
malloc_mutex_lock(&rtree->mutex);
for (i = lshift = 0, height = rtree->height, node = rtree->root;
i < height - 1;
i++, lshift += bits, node = child) {
bits = rtree->level2bits[i];
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) -
bits);
child = (void**)node[subkey];
if (child == NULL) {
child = (void**)base_alloc(sizeof(void *) <<
rtree->level2bits[i+1]);
if (child == NULL) {
malloc_mutex_unlock(&rtree->mutex);
return (true);
}
memset(child, 0, sizeof(void *) <<
rtree->level2bits[i+1]);
node[subkey] = child;
}
}
/* node is a leaf, so it contains values rather than node pointers. */
bits = rtree->level2bits[i];
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits);
node[subkey] = val;
malloc_mutex_unlock(&rtree->mutex);
return (false);
}
#endif
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
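
To make the bit-slicing in rtree_get()/rtree_set() concrete, here is a worked example of the subkey extraction (illustrative; the 9/11/11/11 level layout is an assumption for demonstration, not what rtree_new() would necessarily pick, and a 64-bit build is assumed): with 64-bit pointers and 4 MiB chunks, the tree resolves 64 - 22 = 42 key bits, consumed from the most significant end.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uintptr_t key = (uintptr_t)0x00007f2a5ec00000ULL; /* Chunk base. */
	unsigned level2bits[] = {9, 11, 11, 11};	  /* Sums to 42. */
	unsigned i, lshift = 0;

	for (i = 0; i < 4; i++) {
		unsigned bits = level2bits[i];
		uintptr_t subkey = (key << lshift) >> (64 - bits);

		printf("level %u: subkey %#llx\n", i,
		    (unsigned long long)subkey);
		lshift += bits;
	}
	return (0);
}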


@ -154,7 +154,7 @@ struct chunk_stats_s {
extern bool opt_stats_print;
char *u2s(uint64_t x, unsigned base, char *s);
#ifdef JEMALLOC_STATS
void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque,
const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4));


@ -17,7 +17,7 @@ typedef struct tcache_s tcache_t;
/* Number of cache slots for large size classes. */
#define TCACHE_NSLOTS_LARGE 20
/* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. */
#define LG_TCACHE_MAXCLASS_DEFAULT 15
/*
@ -61,12 +61,25 @@ struct tcache_s {
#ifdef JEMALLOC_H_EXTERNS
extern bool opt_tcache;
extern ssize_t opt_lg_tcache_max;
extern ssize_t opt_lg_tcache_gc_sweep;
/* Map of thread-specific caches. */
#ifndef NO_TLS
extern __thread tcache_t *tcache_tls
JEMALLOC_ATTR(tls_model("initial-exec"));
# define TCACHE_GET() tcache_tls
# define TCACHE_SET(v) do { \
tcache_tls = (tcache_t *)(v); \
pthread_setspecific(tcache_tsd, (void *)(v)); \
} while (0)
#else
# define TCACHE_GET() ((tcache_t *)pthread_getspecific(tcache_tsd))
# define TCACHE_SET(v) do { \
pthread_setspecific(tcache_tsd, (void *)(v)); \
} while (0)
#endif
extern pthread_key_t tcache_tsd;
/*
* Number of tcache bins. There are nbins small-object bins, plus 0 or more
@ -122,15 +135,24 @@ tcache_get(void)
if ((isthreaded & opt_tcache) == false)
return (NULL);
tcache = TCACHE_GET();
if ((uintptr_t)tcache <= (uintptr_t)2) {
if (tcache == NULL) {
tcache = tcache_create(choose_arena());
if (tcache == NULL)
return (NULL);
} else {
if (tcache == (void *)(uintptr_t)1) {
/*
* Make a note that an allocator function was
* called after tcache_thread_cleanup() was
* called.
*/
TCACHE_SET((uintptr_t)2);
}
return (NULL);
}
}
return (tcache);
}
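
The (uintptr_t)1/(uintptr_t)2 sentinels above deserve a note: 1 means tcache_thread_cleanup() already ran, and 2 records that an allocator call was seen afterward, so the destructor must not recreate the cache. The same pattern in generic form (a sketch, not jemalloc code):

#include <pthread.h>
#include <stdint.h>

static pthread_key_t state_tsd;

/*
 * Sketch: NULL means not yet created; 1 means the TSD destructor ran;
 * 2 records that a lookup happened after destruction.  Real state
 * pointers are always greater than 2.
 */
static void *
state_get(void)
{
	void *v = pthread_getspecific(state_tsd);

	if ((uintptr_t)v <= (uintptr_t)2) {
		if (v == (void *)(uintptr_t)1)
			pthread_setspecific(state_tsd, (void *)(uintptr_t)2);
		return (NULL);
	}
	return (v);
}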
@ -258,9 +280,9 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
} else {
#ifdef JEMALLOC_PROF
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret);
size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >>
PAGE_SHIFT);
chunk->map[pageind-map_bias].bits &= ~CHUNK_MAP_CLASS_MASK;
#endif
if (zero == false) {
#ifdef JEMALLOC_FILL
@ -299,8 +321,8 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
mapelm = &chunk->map[pageind-map_bias];
run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
(mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT));
assert(run->magic == ARENA_RUN_MAGIC);
@ -339,7 +361,6 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
arena_chunk_t *chunk;
size_t pageind, binind;
tcache_bin_t *tbin;
assert((size & PAGE_MASK) == 0);
assert(arena_salloc(ptr) > small_maxclass);
@ -347,8 +368,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
arena = chunk->arena;
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT;
binind = nbins + (size >> PAGE_SHIFT) - 1;
#ifdef JEMALLOC_FILL


@ -0,0 +1,23 @@
#ifndef JEMALLOC_ZONE
# error "This source file is for zones on Darwin (OS X)."
#endif
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
malloc_zone_t *create_zone(void);
void szone2ozone(malloc_zone_t *zone);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/


@ -4,6 +4,9 @@
extern "C" {
#endif
#include <limits.h>
#include <strings.h>
#define JEMALLOC_VERSION "@jemalloc_version@"
#define JEMALLOC_VERSION_MAJOR @jemalloc_version_major@
#define JEMALLOC_VERSION_MINOR @jemalloc_version_minor@
@ -16,7 +19,20 @@ extern "C" {
# define JEMALLOC_P(s) s
#endif
#define ALLOCM_LG_ALIGN ((int)0x3f)
#if LG_SIZEOF_PTR == 2
#define ALLOCM_ALIGN(a) (ffs(a)-1)
#else
#define ALLOCM_ALIGN(a) ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
#endif
#define ALLOCM_ZERO ((int)0x40)
#define ALLOCM_NO_MOVE ((int)0x80)
#define ALLOCM_SUCCESS 0
#define ALLOCM_ERR_OOM 1
#define ALLOCM_ERR_NOT_MOVED 2
extern const char *JEMALLOC_P(malloc_conf);
extern void (*JEMALLOC_P(malloc_message))(void *, const char *);
void *JEMALLOC_P(malloc)(size_t size) JEMALLOC_ATTR(malloc);
@ -36,6 +52,14 @@ int JEMALLOC_P(mallctlnametomib)(const char *name, size_t *mibp,
int JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen);
int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags)
JEMALLOC_ATTR(nonnull(1));
int JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size,
size_t extra, int flags) JEMALLOC_ATTR(nonnull(1));
int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags)
JEMALLOC_ATTR(nonnull(1));
int JEMALLOC_P(dallocm)(void *ptr, int flags) JEMALLOC_ATTR(nonnull(1));
#ifdef __cplusplus
};
#endif
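
A usage sketch of the experimental API declared above, assuming a build without JEMALLOC_PREFIX so the JEMALLOC_P() mangling is a no-op; error handling is abbreviated:

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	void *p;
	size_t rsize;

	/* 100 bytes, 4096-byte aligned, zeroed. */
	if (allocm(&p, &rsize, 100, ALLOCM_ALIGN(4096) | ALLOCM_ZERO)
	    != ALLOCM_SUCCESS)
		return (1);
	/* Try to grow to 200 (+56 optional) bytes without moving. */
	if (rallocm(&p, &rsize, 200, 56, ALLOCM_NO_MOVE)
	    == ALLOCM_ERR_NOT_MOVED)
		printf("object could not be expanded in place\n");
	dallocm(p, 0);
	return (0);
}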


@ -13,6 +13,7 @@
* the API prefixing.
*/
#undef JEMALLOC_PREFIX
#undef JEMALLOC_CPREFIX
#if (defined(JEMALLOC_PREFIX) && defined(JEMALLOC_MANGLE))
#undef JEMALLOC_P
#endif
@ -31,6 +32,9 @@
# define JEMALLOC_ATTR(s)
#endif
/* JEMALLOC_CC_SILENCE enables code that silences unuseful compiler warnings. */
#undef JEMALLOC_CC_SILENCE
/*
* JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
* inline functions.
@ -92,6 +96,38 @@
/* TLS is used to map arenas and magazine caches to threads. */
#undef NO_TLS
/*
* JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
* within jemalloc-owned chunks before dereferencing them.
*/
#undef JEMALLOC_IVSALLOC
/*
* Define overrides for non-standard allocator-related functions if they
* are present on the system.
*/
#undef JEMALLOC_OVERRIDE_MEMALIGN
#undef JEMALLOC_OVERRIDE_VALLOC
/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
#undef JEMALLOC_ZONE
#undef JEMALLOC_ZONE_VERSION
/*
* Methods for purging unused pages differ between operating systems.
*
* madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages,
* such that new pages will be demand-zeroed if
* the address region is later touched.
* madvise(..., MADV_FREE) : On FreeBSD and Darwin, this marks pages as being
* unused, such that they will be discarded rather
* than swapped out.
*/
#undef JEMALLOC_PURGE_MADVISE_DONTNEED
#undef JEMALLOC_PURGE_MADVISE_FREE
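
The two purge styles differ observably: MADV_DONTNEED drops the pages at once (resident set size falls immediately), while MADV_FREE only marks them reclaimable under pressure. A minimal sketch exercising whichever the platform provides (illustrative; MAP_ANON is assumed available):

#include <sys/mman.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t sz = (size_t)sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	if (p == MAP_FAILED)
		return (1);
	memset(p, 0xa5, sz);		/* Dirty the page. */
#ifdef MADV_FREE
	madvise(p, sz, MADV_FREE);	/* Lazy reclaim (FreeBSD, Darwin). */
#else
	madvise(p, sz, MADV_DONTNEED);	/* Immediate discard (Linux). */
#endif
	munmap(p, sz);
	return (0);
}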
/* sizeof(void *) == 2^LG_SIZEOF_PTR. */
#undef LG_SIZEOF_PTR

File diff suppressed because it is too large.


@ -32,7 +32,7 @@ base_pages_alloc(size_t minsize)
assert(minsize != 0);
csize = CHUNK_CEILING(minsize);
zero = false;
base_pages = chunk_alloc(csize, true, &zero);
if (base_pages == NULL)
return (true);
base_next_addr = base_pages;


@ -14,11 +14,15 @@ malloc_mutex_t chunks_mtx;
chunk_stats_t stats_chunks;
#endif
#ifdef JEMALLOC_IVSALLOC
rtree_t *chunks_rtree;
#endif
/* Various chunk-related settings. */
size_t chunksize;
size_t chunksize_mask; /* (chunksize - 1). */
size_t chunk_npages;
size_t arena_chunk_header_npages;
size_t map_bias;
size_t arena_maxclass; /* Max size class for arenas. */
/******************************************************************************/
@ -30,7 +34,7 @@ size_t arena_maxclass; /* Max size class for arenas. */
* advantage of them if they are returned.
*/
void *
chunk_alloc(size_t size, bool base, bool *zero)
{
void *ret;
@ -63,10 +67,18 @@ chunk_alloc(size_t size, bool *zero)
/* All strategies for allocation failed. */
ret = NULL;
RETURN:
#ifdef JEMALLOC_IVSALLOC
if (base == false && ret != NULL) {
if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) {
chunk_dealloc(ret, size);
return (NULL);
}
}
#endif
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
if (ret != NULL) {
# ifdef JEMALLOC_PROF
bool gdump;
# endif
malloc_mutex_lock(&chunks_mtx);
# ifdef JEMALLOC_STATS
@ -76,17 +88,17 @@ RETURN:
if (stats_chunks.curchunks > stats_chunks.highchunks) {
stats_chunks.highchunks = stats_chunks.curchunks;
# ifdef JEMALLOC_PROF
gdump = true;
# endif
}
# ifdef JEMALLOC_PROF
else
gdump = false;
# endif
malloc_mutex_unlock(&chunks_mtx);
# ifdef JEMALLOC_PROF
if (opt_prof && opt_prof_gdump && gdump)
prof_gdump();
# endif
}
#endif
@ -104,6 +116,9 @@ chunk_dealloc(void *chunk, size_t size)
assert(size != 0);
assert((size & chunksize_mask) == 0);
#ifdef JEMALLOC_IVSALLOC
rtree_set(chunks_rtree, (uintptr_t)chunk, NULL);
#endif
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
malloc_mutex_lock(&chunks_mtx);
stats_chunks.curchunks -= (size / chunksize);
@ -126,21 +141,27 @@ chunk_boot(void)
{
/* Set variables according to the value of opt_lg_chunk. */
chunksize = (ZU(1) << opt_lg_chunk);
assert(chunksize >= PAGE_SIZE);
chunksize_mask = chunksize - 1;
chunk_npages = (chunksize >> PAGE_SHIFT);
#ifdef JEMALLOC_IVSALLOC
chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk);
if (chunks_rtree == NULL)
return (true);
#endif
#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
if (malloc_mutex_init(&chunks_mtx))
return (true);
memset(&stats_chunks, 0, sizeof(chunk_stats_t));
#endif
#ifdef JEMALLOC_SWAP
if (chunk_swap_boot())
return (true);
#endif
if (chunk_mmap_boot())
return (true);
#ifdef JEMALLOC_DSS
if (chunk_dss_boot())
return (true);


@ -6,26 +6,30 @@
/*
* Used by chunk_alloc_mmap() to decide whether to attempt the fast path and
* potentially avoid some system calls.
*/
#ifndef NO_TLS
static __thread bool mmap_unaligned_tls
JEMALLOC_ATTR(tls_model("initial-exec"));
#define MMAP_UNALIGNED_GET() mmap_unaligned_tls
#define MMAP_UNALIGNED_SET(v) do { \
mmap_unaligned_tls = (v); \
} while (0)
#else
static pthread_key_t mmap_unaligned_tsd;
#define MMAP_UNALIGNED_GET() ((bool)pthread_getspecific(mmap_unaligned_tsd))
#define MMAP_UNALIGNED_SET(v) do { \
pthread_setspecific(mmap_unaligned_tsd, (void *)(v)); \
} while (0)
#endif
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
static void *pages_map(void *addr, size_t size, bool noreserve);
static void pages_unmap(void *addr, size_t size);
static void *chunk_alloc_mmap_slow(size_t size, bool unaligned,
bool noreserve);
static void *chunk_alloc_mmap_internal(size_t size, bool noreserve);
/******************************************************************************/
@ -54,9 +58,9 @@ pages_map(void *addr, size_t size, bool noreserve)
* We succeeded in mapping memory, but not in the right place.
*/
if (munmap(ret, size) == -1) {
char buf[BUFERROR_BUF];
buferror(errno, buf, sizeof(buf));
malloc_write("<jemalloc>: Error in munmap(): ");
malloc_write(buf);
malloc_write("\n");
@ -76,9 +80,9 @@ pages_unmap(void *addr, size_t size)
{
if (munmap(addr, size) == -1) {
char buf[BUFERROR_BUF];
buferror(errno, buf, sizeof(buf));
malloc_write("<jemalloc>: Error in munmap(): ");
malloc_write(buf);
malloc_write("\n");
@ -128,7 +132,7 @@ chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve)
* method.
*/
if (unaligned == false)
MMAP_UNALIGNED_SET(false);
return (ret);
}
@ -166,7 +170,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve)
* fast method next time.
*/
if (MMAP_UNALIGNED_GET() == false) {
size_t offset;
ret = pages_map(NULL, size, noreserve);
@ -175,7 +179,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve)
offset = CHUNK_ADDR2OFFSET(ret);
if (offset != 0) {
MMAP_UNALIGNED_SET(true);
/* Try to extend chunk boundary. */
if (pages_map((void *)((uintptr_t)ret + size),
chunksize - offset, noreserve) == NULL) {
@ -184,7 +188,8 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve)
* the reliable-but-expensive method.
*/
pages_unmap(ret, size);
ret = chunk_alloc_mmap_slow(size, true,
noreserve);
} else {
/* Clean up unneeded leading space. */
pages_unmap(ret, chunksize - offset);
@ -216,3 +221,17 @@ chunk_dealloc_mmap(void *chunk, size_t size)
pages_unmap(chunk, size);
}
bool
chunk_mmap_boot(void)
{
#ifdef NO_TLS
if (pthread_key_create(&mmap_unaligned_tsd, NULL) != 0) {
malloc_write("<jemalloc>: Error in pthread_key_create()\n");
return (true);
}
#endif
return (false);
}
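
For context, the slow path that chunk_alloc_mmap_slow() implements is the classic over-allocate-and-trim idiom: map size + alignment bytes, then unmap the leading and trailing slop so the surviving region is aligned. A reduced sketch (illustrative; map_aligned is not a jemalloc function, alignment must be a power of two, and the noreserve plumbing is omitted):

#include <stdint.h>
#include <sys/mman.h>

static void *
map_aligned(size_t size, size_t alignment)
{
	char *p = mmap(NULL, size + alignment, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	uintptr_t offset;

	if (p == MAP_FAILED)
		return (NULL);
	offset = (uintptr_t)p & (alignment - 1);
	if (offset != 0) {
		munmap(p, alignment - offset);		/* Leading slop. */
		p += alignment - offset;
		munmap(p + size, offset);		/* Trailing slop. */
	} else
		munmap(p + size, alignment);		/* All slop trails. */
	return (p);
}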


@ -294,9 +294,10 @@ chunk_swap_enable(const int *fds, unsigned nfds, bool prezeroed)
void *addr = mmap((void *)((uintptr_t)vaddr + voff), sizes[i],
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fds[i], 0);
if (addr == MAP_FAILED) {
char buf[BUFERROR_BUF];
buferror(errno, buf, sizeof(buf));
malloc_write(
"<jemalloc>: Error in mmap(..., MAP_FIXED, ...): ");
malloc_write(buf);
@ -304,7 +305,7 @@ chunk_swap_enable(const int *fds, unsigned nfds, bool prezeroed)
if (opt_abort)
abort();
if (munmap(vaddr, voff) == -1) {
buferror(errno, buf, sizeof(buf));
malloc_write("<jemalloc>: Error in munmap(): ");
malloc_write(buf);
malloc_write("\n");


@ -263,13 +263,12 @@ ckh_grow(ckh_t *ckh)
lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS;
while (true) {
lg_curcells++;
tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells,
ZU(1) << LG_CACHELINE, true);
if (tab == NULL) {
ret = true;
goto RETURN;
}
/* Swap in new table. */
ttab = ckh->tab;
ckh->tab = tab;
@ -305,8 +304,8 @@ ckh_shrink(ckh_t *ckh)
*/
lg_prevbuckets = ckh->lg_curbuckets;
lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells,
ZU(1) << LG_CACHELINE, true);
if (tab == NULL) {
/*
* An OOM error isn't worth propagating, since it doesn't
@ -314,7 +313,6 @@ ckh_shrink(ckh_t *ckh)
*/
return;
}
/* Swap in new table. */
ttab = ckh->tab;
ckh->tab = tab;
@ -377,13 +375,12 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
ckh->hash = hash;
ckh->keycomp = keycomp;
ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells,
(ZU(1) << LG_CACHELINE), true);
if (ckh->tab == NULL) {
ret = true;
goto RETURN;
}
#ifdef JEMALLOC_DEBUG
ckh->magic = CKH_MAGIG;
@ -570,12 +567,21 @@ ckh_pointer_hash(const void *key, unsigned minbits, size_t *hash1,
{
size_t ret1, ret2;
uint64_t h;
union {
const void *v;
uint64_t i;
} u;
assert(minbits <= 32 || (SIZEOF_PTR == 8 && minbits <= 64));
assert(hash1 != NULL);
assert(hash2 != NULL);
assert(sizeof(u.v) == sizeof(u.i));
#if (LG_SIZEOF_PTR != LG_SIZEOF_INT)
u.i = 0;
#endif
u.v = key;
h = hash(&u.i, sizeof(u.i), 0xd983396e68886082LLU);
if (minbits <= 32) {
/*
* Avoid doing multiple hashes, since a single hash provides
@ -586,7 +592,7 @@ ckh_pointer_hash(const void *key, unsigned minbits, size_t *hash1,
} else {
assert(SIZEOF_PTR == 8);
ret1 = h;
ret2 = hash(&u.i, sizeof(u.i), 0x5e2be9aff8709a5dLLU);
}
*hash1 = ret1;


@ -41,6 +41,11 @@ CTL_PROTO(epoch)
#ifdef JEMALLOC_TCACHE
CTL_PROTO(tcache_flush)
#endif
CTL_PROTO(thread_arena)
#ifdef JEMALLOC_STATS
CTL_PROTO(thread_allocated)
CTL_PROTO(thread_deallocated)
#endif
CTL_PROTO(config_debug)
CTL_PROTO(config_dss)
CTL_PROTO(config_dynamic_page_shift)
@ -57,8 +62,15 @@ CTL_PROTO(config_tiny)
CTL_PROTO(config_tls)
CTL_PROTO(config_xmalloc)
CTL_PROTO(opt_abort)
CTL_PROTO(opt_lg_qspace_max)
CTL_PROTO(opt_lg_cspace_max)
CTL_PROTO(opt_lg_chunk)
CTL_PROTO(opt_narenas)
CTL_PROTO(opt_lg_dirty_mult)
CTL_PROTO(opt_stats_print)
#ifdef JEMALLOC_FILL
CTL_PROTO(opt_junk)
CTL_PROTO(opt_zero)
#endif
#ifdef JEMALLOC_SYSV
CTL_PROTO(opt_sysv)
@ -66,27 +78,22 @@ CTL_PROTO(opt_sysv)
#ifdef JEMALLOC_XMALLOC
CTL_PROTO(opt_xmalloc)
#endif
#ifdef JEMALLOC_TCACHE
CTL_PROTO(opt_tcache)
CTL_PROTO(opt_lg_tcache_gc_sweep)
#endif
#ifdef JEMALLOC_PROF
CTL_PROTO(opt_prof)
CTL_PROTO(opt_prof_prefix)
CTL_PROTO(opt_prof_active)
CTL_PROTO(opt_lg_prof_bt_max)
CTL_PROTO(opt_lg_prof_sample)
CTL_PROTO(opt_lg_prof_interval)
CTL_PROTO(opt_prof_gdump)
CTL_PROTO(opt_prof_leak)
CTL_PROTO(opt_prof_accum)
CTL_PROTO(opt_lg_prof_tcmax)
#endif
#ifdef JEMALLOC_SWAP
CTL_PROTO(opt_overcommit)
#endif
@ -125,6 +132,7 @@ CTL_PROTO(arenas_nbins)
CTL_PROTO(arenas_nhbins)
#endif
CTL_PROTO(arenas_nlruns)
CTL_PROTO(arenas_purge)
#ifdef JEMALLOC_PROF
CTL_PROTO(prof_active)
CTL_PROTO(prof_dump)
@ -210,6 +218,15 @@ static const ctl_node_t tcache_node[] = {
};
#endif
static const ctl_node_t thread_node[] = {
{NAME("arena"), CTL(thread_arena)}
#ifdef JEMALLOC_STATS
,
{NAME("allocated"), CTL(thread_allocated)},
{NAME("deallocated"), CTL(thread_deallocated)}
#endif
};
static const ctl_node_t config_node[] = {
{NAME("debug"), CTL(config_debug)},
{NAME("dss"), CTL(config_dss)},
@ -230,36 +247,43 @@ static const ctl_node_t config_node[] = {
static const ctl_node_t opt_node[] = {
{NAME("abort"), CTL(opt_abort)},
{NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)},
{NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)},
{NAME("lg_chunk"), CTL(opt_lg_chunk)},
{NAME("narenas"), CTL(opt_narenas)},
{NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)},
{NAME("stats_print"), CTL(opt_stats_print)}
#ifdef JEMALLOC_FILL
,
{NAME("junk"), CTL(opt_junk)},
{NAME("zero"), CTL(opt_zero)}
#endif
#ifdef JEMALLOC_SYSV
{NAME("sysv"), CTL(opt_sysv)},
,
{NAME("sysv"), CTL(opt_sysv)}
#endif
#ifdef JEMALLOC_XMALLOC
{NAME("xmalloc"), CTL(opt_xmalloc)},
#endif
#ifdef JEMALLOC_ZERO
{NAME("zero"), CTL(opt_zero)},
,
{NAME("xmalloc"), CTL(opt_xmalloc)}
#endif
#ifdef JEMALLOC_TCACHE
,
{NAME("tcache"), CTL(opt_tcache)},
{NAME("lg_tcache_gc_sweep"), CTL(opt_lg_tcache_gc_sweep)},
{NAME("lg_tcache_gc_sweep"), CTL(opt_lg_tcache_gc_sweep)}
#endif
#ifdef JEMALLOC_PROF
,
{NAME("prof"), CTL(opt_prof)},
{NAME("prof_prefix"), CTL(opt_prof_prefix)},
{NAME("prof_active"), CTL(opt_prof_active)},
{NAME("lg_prof_bt_max"), CTL(opt_lg_prof_bt_max)},
{NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)},
{NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)},
{NAME("prof_udump"), CTL(opt_prof_udump)},
{NAME("prof_gdump"), CTL(opt_prof_gdump)},
{NAME("prof_leak"), CTL(opt_prof_leak)},
{NAME("prof_accum"), CTL(opt_prof_accum)},
{NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)}
#endif
{NAME("stats_print"), CTL(opt_stats_print)},
{NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)},
{NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)},
{NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)},
{NAME("lg_chunk"), CTL(opt_lg_chunk)}
#ifdef JEMALLOC_SWAP
,
{NAME("overcommit"), CTL(opt_overcommit)}
@ -321,7 +345,8 @@ static const ctl_node_t arenas_node[] = {
#endif
{NAME("bin"), CHILD(arenas_bin)},
{NAME("nlruns"), CTL(arenas_nlruns)},
{NAME("lrun"), CHILD(arenas_lrun)}
{NAME("lrun"), CHILD(arenas_lrun)},
{NAME("purge"), CTL(arenas_purge)}
};
#ifdef JEMALLOC_PROF
@ -448,6 +473,7 @@ static const ctl_node_t root_node[] = {
#ifdef JEMALLOC_TCACHE
{NAME("tcache"), CHILD(tcache)},
#endif
{NAME("thread"), CHILD(thread)},
{NAME("config"), CHILD(config)},
{NAME("opt"), CHILD(opt)},
{NAME("arenas"), CHILD(arenas)},
@ -1028,13 +1054,13 @@ tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
VOID();
tcache = TCACHE_GET();
if (tcache == NULL) {
ret = 0;
goto RETURN;
}
tcache_destroy(tcache);
TCACHE_SET(NULL);
ret = 0;
RETURN:
@ -1042,6 +1068,49 @@ RETURN:
}
#endif
static int
thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int ret;
unsigned newind, oldind;
newind = oldind = choose_arena()->ind;
WRITE(oldind, unsigned);
READ(newind, unsigned);
if (newind != oldind) {
arena_t *arena;
if (newind >= narenas) {
/* New arena index is out of range. */
ret = EFAULT;
goto RETURN;
}
/* Initialize arena if necessary. */
malloc_mutex_lock(&arenas_lock);
if ((arena = arenas[newind]) == NULL)
arena = arenas_extend(newind);
malloc_mutex_unlock(&arenas_lock);
if (arena == NULL) {
ret = EAGAIN;
goto RETURN;
}
/* Set new arena association. */
ARENA_SET(arena);
}
ret = 0;
RETURN:
return (ret);
}
#ifdef JEMALLOC_STATS
CTL_RO_GEN(thread_allocated, ALLOCATED_GET(), uint64_t);
CTL_RO_GEN(thread_deallocated, DEALLOCATED_GET(), uint64_t);
#endif
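
Usage sketch for the new thread.* names (again assuming an unprefixed build): thread.arena is read/write, while the counters are read-only.

#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	unsigned arena = 0;
	uint64_t nalloc;
	size_t sz = sizeof(arena);

	mallctl("thread.arena", &arena, &sz, NULL, 0);	 /* Query. */
	mallctl("thread.arena", NULL, NULL, &arena, sz); /* Rebind. */

	sz = sizeof(nalloc);
	mallctl("thread.allocated", &nalloc, &sz, NULL, 0);
	printf("arena %u, %llu bytes allocated by this thread\n",
	    arena, (unsigned long long)nalloc);
	return (0);
}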
/******************************************************************************/
#ifdef JEMALLOC_DEBUG
@ -1137,8 +1206,15 @@ CTL_RO_FALSE_GEN(config_xmalloc)
/******************************************************************************/
CTL_RO_GEN(opt_abort, opt_abort, bool)
CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t)
CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t)
CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
CTL_RO_GEN(opt_narenas, opt_narenas, size_t)
CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
CTL_RO_GEN(opt_stats_print, opt_stats_print, bool)
#ifdef JEMALLOC_FILL
CTL_RO_GEN(opt_junk, opt_junk, bool)
CTL_RO_GEN(opt_zero, opt_zero, bool)
#endif
#ifdef JEMALLOC_SYSV
CTL_RO_GEN(opt_sysv, opt_sysv, bool)
@ -1146,27 +1222,22 @@ CTL_RO_GEN(opt_sysv, opt_sysv, bool)
#ifdef JEMALLOC_XMALLOC
CTL_RO_GEN(opt_xmalloc, opt_xmalloc, bool)
#endif
#ifdef JEMALLOC_TCACHE
CTL_RO_GEN(opt_tcache, opt_tcache, bool)
CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t)
#endif
#ifdef JEMALLOC_PROF
CTL_RO_GEN(opt_prof, opt_prof, bool)
CTL_RO_GEN(opt_prof_prefix, opt_prof_prefix, const char *)
CTL_RO_GEN(opt_prof_active, opt_prof_active, bool)
CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t)
CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t)
CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
CTL_RO_GEN(opt_prof_gdump, opt_prof_gdump, bool)
CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool)
CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool)
CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t)
#endif
#ifdef JEMALLOC_SWAP
CTL_RO_GEN(opt_overcommit, opt_overcommit, bool)
#endif
@ -1249,6 +1320,44 @@ CTL_RO_GEN(arenas_nhbins, nhbins, unsigned)
#endif
CTL_RO_GEN(arenas_nlruns, nlclasses, size_t)
static int
arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int ret;
unsigned arena;
WRITEONLY();
arena = UINT_MAX;
WRITE(arena, unsigned);
if (newp != NULL && arena >= narenas) {
ret = EFAULT;
goto RETURN;
} else {
arena_t *tarenas[narenas];
malloc_mutex_lock(&arenas_lock);
memcpy(tarenas, arenas, sizeof(arena_t *) * narenas);
malloc_mutex_unlock(&arenas_lock);
if (arena == UINT_MAX) {
unsigned i;
for (i = 0; i < narenas; i++) {
if (tarenas[i] != NULL)
arena_purge_all(tarenas[i]);
}
} else {
assert(arena < narenas);
if (tarenas[arena] != NULL)
arena_purge_all(tarenas[arena]);
}
}
ret = 0;
RETURN:
return (ret);
}
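
Matching usage sketch (unprefixed build assumed): writing UINT_MAX purges every initialized arena, while a valid index purges only that one.

#include <limits.h>
#include <jemalloc/jemalloc.h>

static void
purge_all_arenas(void)
{
	unsigned arena = UINT_MAX;	/* UINT_MAX selects all arenas. */

	mallctl("arenas.purge", NULL, NULL, &arena, sizeof(arena));
}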
/******************************************************************************/
#ifdef JEMALLOC_PROF


@ -37,7 +37,7 @@ huge_malloc(size_t size, bool zero)
if (node == NULL)
return (NULL);
ret = chunk_alloc(csize, false, &zero);
if (ret == NULL) {
base_node_dealloc(node);
return (NULL);
@ -69,12 +69,11 @@ huge_malloc(size_t size, bool zero)
/* Only handles large allocations that require more than chunk alignment. */
void *
huge_palloc(size_t size, size_t alignment, bool zero)
{
void *ret;
size_t alloc_size, chunk_size, offset;
extent_node_t *node;
/*
* This allocation requires alignment that is even larger than chunk
@ -98,8 +97,7 @@ huge_palloc(size_t alignment, size_t size)
if (node == NULL)
return (NULL);
ret = chunk_alloc(alloc_size, false, &zero);
if (ret == NULL) {
base_node_dealloc(node);
return (NULL);
@ -142,45 +140,80 @@ huge_palloc(size_t alignment, size_t size)
malloc_mutex_unlock(&huge_mtx);
#ifdef JEMALLOC_FILL
if (zero == false) {
if (opt_junk)
memset(ret, 0xa5, chunk_size);
else if (opt_zero)
memset(ret, 0, chunk_size);
}
#endif
return (ret);
}
void *
huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra)
{
/*
* Avoid moving the allocation if the size class can be left the same.
*/
if (oldsize > arena_maxclass
&& CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size)
&& CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) {
assert(CHUNK_CEILING(oldsize) == oldsize);
#ifdef JEMALLOC_FILL
if (opt_junk && size < oldsize) {
memset((void *)((uintptr_t)ptr + size), 0x5a,
oldsize - size);
}
#endif
return (ptr);
}
/* Reallocation would require a move. */
return (NULL);
}
void *
huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero)
{
void *ret;
size_t copysize;
/* Try to avoid moving the allocation. */
ret = huge_ralloc_no_move(ptr, oldsize, size, extra);
if (ret != NULL)
return (ret);
/*
* size and oldsize are different enough that we need to use a
* different size class. In that case, fall back to allocating new
* space and copying.
*/
if (alignment != 0)
ret = huge_palloc(size + extra, alignment, zero);
else
ret = huge_malloc(size + extra, zero);
if (ret == NULL) {
if (extra == 0)
return (NULL);
/* Try again, this time without extra. */
if (alignment != 0)
ret = huge_palloc(size, alignment, zero);
else
ret = huge_malloc(size, zero);
if (ret == NULL)
return (NULL);
}
/*
* Copy at most size bytes (not size+extra), since the caller has no
* expectation that the extra bytes will be reliably preserved.
*/
copysize = (size < oldsize) ? size : oldsize;
memcpy(ret, ptr, copysize);
idalloc(ptr);

File diff suppressed because it is too large.


@ -59,7 +59,11 @@ malloc_mutex_init(malloc_mutex_t *mutex)
if (pthread_mutexattr_init(&attr) != 0)
return (true);
#ifdef PTHREAD_MUTEX_ADAPTIVE_NP
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
#else
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
#endif
if (pthread_mutex_init(mutex, &attr) != 0) {
pthread_mutexattr_destroy(&attr);
return (true);
@ -68,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex)
return (false);
}
void
malloc_mutex_destroy(malloc_mutex_t *mutex)
{
if (pthread_mutex_destroy(mutex) != 0) {
malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
abort();
}
}

File diff suppressed because it is too large.

jemalloc/src/rtree.c (new file, 43 lines)

@ -0,0 +1,43 @@
#define RTREE_C_
#include "jemalloc/internal/jemalloc_internal.h"
rtree_t *
rtree_new(unsigned bits)
{
rtree_t *ret;
unsigned bits_per_level, height, i;
bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
height = bits / bits_per_level;
if (height * bits_per_level != bits)
height++;
assert(height * bits_per_level >= bits);
ret = (rtree_t*)base_alloc(offsetof(rtree_t, level2bits) +
(sizeof(unsigned) * height));
if (ret == NULL)
return (NULL);
memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) *
height));
malloc_mutex_init(&ret->mutex);
ret->height = height;
if (bits_per_level * height > bits)
ret->level2bits[0] = bits % bits_per_level;
else
ret->level2bits[0] = bits_per_level;
for (i = 1; i < height; i++)
ret->level2bits[i] = bits_per_level;
ret->root = (void**)base_alloc(sizeof(void *) << ret->level2bits[0]);
if (ret->root == NULL) {
/*
* We leak the rtree here, since there's no generic base
* deallocation.
*/
return (NULL);
}
memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]);
return (ret);
}
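
A worked sizing example for the logic above (illustrative arithmetic; assumes 64-bit pointers, 4 MiB chunks, and RTREE_NODESIZE == CACHELINE == 64): each node holds 64/8 = 8 pointers, so bits_per_level == ffs(pow2_ceil(8)) - 1 == 3; resolving 64 - 22 = 42 key bits then takes a 14-level tree, and since 42 divides evenly, level2bits[0] gets the full 3 bits rather than a remainder.

#include <stdio.h>

int
main(void)
{
	unsigned bits = 42, bits_per_level = 3;
	unsigned height = bits / bits_per_level;

	if (height * bits_per_level != bits)
		height++;
	printf("height = %u, level2bits[0] = %u\n", height,
	    (bits % bits_per_level != 0) ? bits % bits_per_level :
	    bits_per_level);
	return (0);
}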


@ -57,12 +57,12 @@ static void stats_arena_print(void (*write_cb)(void *, const char *),
/*
* We don't want to depend on vsnprintf() for production builds, since that can
* cause unnecessary bloat for static binaries. u2s() provides minimal integer
* printing functionality, so that malloc_printf() use can be limited to
* JEMALLOC_STATS code.
*/
char *
u2s(uint64_t x, unsigned base, char *s)
{
unsigned i;
@ -72,8 +72,8 @@ umax2s(uintmax_t x, unsigned base, char *s)
case 10:
do {
i--;
s[i] = "0123456789"[x % 10];
x /= 10;
s[i] = "0123456789"[x % (uint64_t)10];
x /= (uint64_t)10;
} while (x > 0);
break;
case 16:
@ -86,8 +86,9 @@ umax2s(uintmax_t x, unsigned base, char *s)
default:
do {
i--;
s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x % base];
x /= base;
s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x %
(uint64_t)base];
x /= (uint64_t)base;
} while (x > 0);
}
@ -374,6 +375,7 @@ void
stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
const char *opts)
{
int err;
uint64_t epoch;
size_t u64sz;
char s[UMAX2S_BUFSIZE];
@ -383,10 +385,27 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
bool bins = true;
bool large = true;
/*
* Refresh stats, in case mallctl() was called by the application.
*
* Check for OOM here, since refreshing the ctl cache can trigger
* allocation. In practice, none of the subsequent mallctl()-related
* calls in this function will cause OOM if this one succeeds.
*/
epoch = 1;
u64sz = sizeof(uint64_t);
xmallctl("epoch", &epoch, &u64sz, &epoch, sizeof(uint64_t));
err = JEMALLOC_P(mallctl)("epoch", &epoch, &u64sz, &epoch,
sizeof(uint64_t));
if (err != 0) {
if (err == EAGAIN) {
malloc_write("<jemalloc>: Memory allocation failure in "
"mallctl(\"epoch\", ...)\n");
return;
}
malloc_write("<jemalloc>: Failure in mallctl(\"epoch\", "
"...)\n");
abort();
}
if (write_cb == NULL) {
/*
@ -430,10 +449,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
bool bv;
unsigned uv;
ssize_t ssv;
size_t sv, bsz, ssz, sssz, cpsz;
bsz = sizeof(bool);
ssz = sizeof(size_t);
sssz = sizeof(ssize_t);
cpsz = sizeof(const char *);
CTL_GET("version", &cpv, const char *);
write_cb(cbopaque, "Version: ");
@ -444,113 +465,140 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
write_cb(cbopaque, bv ? "enabled" : "disabled");
write_cb(cbopaque, "\n");
write_cb(cbopaque, "Boolean JEMALLOC_OPTIONS: ");
if ((err = JEMALLOC_P(mallctl)("opt.abort", &bv, &bsz, NULL, 0))
== 0)
write_cb(cbopaque, bv ? "A" : "a");
if ((err = JEMALLOC_P(mallctl)("prof.active", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "E" : "e");
if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
== 0)
write_cb(cbopaque, bv ? "F" : "f");
if ((err = JEMALLOC_P(mallctl)("opt.tcache", &bv, &bsz, NULL,
0)) == 0)
write_cb(cbopaque, bv ? "H" : "h");
if ((err = JEMALLOC_P(mallctl)("opt.junk", &bv, &bsz, NULL, 0))
== 0)
write_cb(cbopaque, bv ? "J" : "j");
if ((err = JEMALLOC_P(mallctl)("opt.prof_leak", &bv, &bsz, NULL,
0)) == 0)
write_cb(cbopaque, bv ? "L" : "l");
if ((err = JEMALLOC_P(mallctl)("opt.overcommit", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "O" : "o");
if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "P" : "p");
if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz,
NULL, 0)) == 0)
write_cb(cbopaque, bv ? "U" : "u");
if ((err = JEMALLOC_P(mallctl)("opt.sysv", &bv, &bsz, NULL, 0))
== 0)
write_cb(cbopaque, bv ? "V" : "v");
if ((err = JEMALLOC_P(mallctl)("opt.xmalloc", &bv, &bsz, NULL,
0)) == 0)
write_cb(cbopaque, bv ? "X" : "x");
if ((err = JEMALLOC_P(mallctl)("opt.zero", &bv, &bsz, NULL, 0))
== 0)
write_cb(cbopaque, bv ? "Z" : "z");
write_cb(cbopaque, "\n");
#define OPT_WRITE_BOOL(n) \
if ((err = JEMALLOC_P(mallctl)("opt."#n, &bv, &bsz, \
NULL, 0)) == 0) { \
write_cb(cbopaque, " opt."#n": "); \
write_cb(cbopaque, bv ? "true" : "false"); \
write_cb(cbopaque, "\n"); \
}
#define OPT_WRITE_SIZE_T(n) \
if ((err = JEMALLOC_P(mallctl)("opt."#n, &sv, &ssz, \
NULL, 0)) == 0) { \
write_cb(cbopaque, " opt."#n": "); \
write_cb(cbopaque, u2s(sv, 10, s)); \
write_cb(cbopaque, "\n"); \
}
#define OPT_WRITE_SSIZE_T(n) \
if ((err = JEMALLOC_P(mallctl)("opt."#n, &ssv, &sssz, \
NULL, 0)) == 0) { \
if (ssv >= 0) { \
write_cb(cbopaque, " opt."#n": "); \
write_cb(cbopaque, u2s(ssv, 10, s)); \
} else { \
write_cb(cbopaque, " opt."#n": -"); \
write_cb(cbopaque, u2s(-ssv, 10, s)); \
} \
write_cb(cbopaque, "\n"); \
}
#define OPT_WRITE_CHAR_P(n) \
if ((err = JEMALLOC_P(mallctl)("opt."#n, &cpv, &cpsz, \
NULL, 0)) == 0) { \
write_cb(cbopaque, " opt."#n": \""); \
write_cb(cbopaque, cpv); \
write_cb(cbopaque, "\"\n"); \
}
write_cb(cbopaque, "Run-time option settings:\n");
OPT_WRITE_BOOL(abort)
OPT_WRITE_SIZE_T(lg_qspace_max)
OPT_WRITE_SIZE_T(lg_cspace_max)
OPT_WRITE_SIZE_T(lg_chunk)
OPT_WRITE_SIZE_T(narenas)
OPT_WRITE_SSIZE_T(lg_dirty_mult)
OPT_WRITE_BOOL(stats_print)
OPT_WRITE_BOOL(junk)
OPT_WRITE_BOOL(zero)
OPT_WRITE_BOOL(sysv)
OPT_WRITE_BOOL(xmalloc)
OPT_WRITE_BOOL(tcache)
OPT_WRITE_SSIZE_T(lg_tcache_gc_sweep)
OPT_WRITE_SSIZE_T(lg_tcache_max)
OPT_WRITE_BOOL(prof)
OPT_WRITE_CHAR_P(prof_prefix)
OPT_WRITE_SIZE_T(lg_prof_bt_max)
OPT_WRITE_BOOL(prof_active)
OPT_WRITE_SSIZE_T(lg_prof_sample)
OPT_WRITE_BOOL(prof_accum)
OPT_WRITE_SSIZE_T(lg_prof_tcmax)
OPT_WRITE_SSIZE_T(lg_prof_interval)
OPT_WRITE_BOOL(prof_gdump)
OPT_WRITE_BOOL(prof_leak)
OPT_WRITE_BOOL(overcommit)
#undef OPT_WRITE_BOOL
#undef OPT_WRITE_SIZE_T
#undef OPT_WRITE_SSIZE_T
#undef OPT_WRITE_CHAR_P
write_cb(cbopaque, "CPUs: ");
write_cb(cbopaque, u2s(ncpus, 10, s));
write_cb(cbopaque, "\n");
CTL_GET("arenas.narenas", &uv, unsigned);
write_cb(cbopaque, "Max arenas: ");
write_cb(cbopaque, u2s(uv, 10, s));
write_cb(cbopaque, "\n");
write_cb(cbopaque, "Pointer size: ");
write_cb(cbopaque, u2s(sizeof(void *), 10, s));
write_cb(cbopaque, "\n");
CTL_GET("arenas.quantum", &sv, size_t);
write_cb(cbopaque, "Quantum size: ");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "\n");
CTL_GET("arenas.cacheline", &sv, size_t);
write_cb(cbopaque, "Cacheline size (assumed): ");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "\n");
CTL_GET("arenas.subpage", &sv, size_t);
write_cb(cbopaque, "Subpage spacing: ");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "\n");
if ((err = JEMALLOC_P(mallctl)("arenas.tspace_min", &sv, &ssz,
NULL, 0)) == 0) {
write_cb(cbopaque, "Tiny 2^n-spaced sizes: [");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "..");
CTL_GET("arenas.tspace_max", &sv, size_t);
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "]\n");
}
CTL_GET("arenas.qspace_min", &sv, size_t);
write_cb(cbopaque, "Quantum-spaced sizes: [");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "..");
CTL_GET("arenas.qspace_max", &sv, size_t);
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "]\n");
CTL_GET("arenas.cspace_min", &sv, size_t);
write_cb(cbopaque, "Cacheline-spaced sizes: [");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "..");
CTL_GET("arenas.cspace_max", &sv, size_t);
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "]\n");
CTL_GET("arenas.sspace_min", &sv, size_t);
write_cb(cbopaque, "Subpage-spaced sizes: [");
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "..");
CTL_GET("arenas.sspace_max", &sv, size_t);
write_cb(cbopaque, umax2s(sv, 10, s));
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "]\n");
CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t);
if (ssv >= 0) {
write_cb(cbopaque,
"Min active:dirty page ratio per arena: ");
write_cb(cbopaque, umax2s((1U << ssv), 10, s));
write_cb(cbopaque, u2s((1U << ssv), 10, s));
write_cb(cbopaque, ":1\n");
} else {
write_cb(cbopaque,
@ -560,7 +608,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
&ssz, NULL, 0)) == 0) {
write_cb(cbopaque,
"Maximum thread-cached size class: ");
-write_cb(cbopaque, umax2s(sv, 10, s));
+write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, "\n");
}
if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv,
@@ -570,39 +618,51 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
CTL_GET("opt.tcache", &tcache_enabled, bool);
write_cb(cbopaque, "Thread cache GC sweep interval: ");
write_cb(cbopaque, tcache_enabled && ssv >= 0 ?
-umax2s(tcache_gc_sweep, 10, s) : "N/A");
+u2s(tcache_gc_sweep, 10, s) : "N/A");
write_cb(cbopaque, "\n");
}
if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0))
== 0 && bv) {
CTL_GET("opt.lg_prof_bt_max", &sv, size_t);
write_cb(cbopaque, "Maximum profile backtrace depth: ");
write_cb(cbopaque, umax2s((1U << sv), 10, s));
write_cb(cbopaque, u2s((1U << sv), 10, s));
write_cb(cbopaque, "\n");
CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t);
write_cb(cbopaque,
"Maximum per thread backtrace cache: ");
if (ssv >= 0) {
write_cb(cbopaque, u2s((1U << ssv), 10, s));
write_cb(cbopaque, " (2^");
write_cb(cbopaque, u2s(ssv, 10, s));
write_cb(cbopaque, ")\n");
} else
write_cb(cbopaque, "N/A\n");
CTL_GET("opt.lg_prof_sample", &sv, size_t);
write_cb(cbopaque, "Average profile sample interval: ");
write_cb(cbopaque, umax2s((1U << sv), 10, s));
write_cb(cbopaque, u2s((((uint64_t)1U) << sv), 10, s));
write_cb(cbopaque, " (2^");
write_cb(cbopaque, umax2s(sv, 10, s));
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, ")\n");
CTL_GET("opt.lg_prof_interval", &ssv, ssize_t);
write_cb(cbopaque, "Average profile dump interval: ");
if (ssv >= 0) {
write_cb(cbopaque, umax2s((1U << ssv), 10, s));
write_cb(cbopaque, u2s((((uint64_t)1U) << ssv),
10, s));
write_cb(cbopaque, " (2^");
write_cb(cbopaque, umax2s(ssv, 10, s));
write_cb(cbopaque, u2s(ssv, 10, s));
write_cb(cbopaque, ")\n");
} else
write_cb(cbopaque, "N/A\n");
}
CTL_GET("arenas.chunksize", &sv, size_t);
write_cb(cbopaque, "Chunk size: ");
write_cb(cbopaque, umax2s(sv, 10, s));
write_cb(cbopaque, u2s(sv, 10, s));
CTL_GET("opt.lg_chunk", &sv, size_t);
write_cb(cbopaque, " (2^");
write_cb(cbopaque, umax2s(sv, 10, s));
write_cb(cbopaque, u2s(sv, 10, s));
write_cb(cbopaque, ")\n");
}


@@ -5,17 +5,19 @@
/* Data. */
bool opt_tcache = true;
-ssize_t opt_lg_tcache_maxclass = LG_TCACHE_MAXCLASS_DEFAULT;
+ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT;
/* Map of thread-specific caches. */
#ifndef NO_TLS
__thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec"));
#endif
/*
* Same contents as tcache, but initialized such that the TSD destructor is
* called when a thread exits, so that the cache can be cleaned up.
*/
-static pthread_key_t tcache_tsd;
+pthread_key_t tcache_tsd;
size_t nhbins;
size_t tcache_maxclass;
@@ -93,10 +95,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
flush = *(void **)ptr;
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk->arena == arena) {
-size_t pageind = (((uintptr_t)ptr -
-(uintptr_t)chunk) >> PAGE_SHIFT);
+size_t pageind = ((uintptr_t)ptr -
+(uintptr_t)chunk) >> PAGE_SHIFT;
arena_chunk_map_t *mapelm =
&chunk->map[pageind];
&chunk->map[pageind-map_bias];
arena_dalloc_bin(arena, chunk, ptr, mapelm);
} else {
/*
@@ -202,12 +204,14 @@ tcache_create(arena_t *arena)
size_t size;
unsigned i;
-size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nhbins - 1));
+size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins);
/*
* Round up to the nearest multiple of the cacheline size, in order to
* avoid the possibility of false cacheline sharing.
*
- * That this works relies on the same logic as in ipalloc().
+ * That this works relies on the same logic as in ipalloc(), but we
+ * cannot directly call ipalloc() here due to tcache bootstrapping
+ * issues.
*/
size = (size + CACHELINE_MASK) & (-CACHELINE);
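/*
 * Illustrative arithmetic, assuming CACHELINE == 64 (CACHELINE_MASK == 63):
 * a size of 100 becomes (100 + 63) & ~63 == 128, the next multiple of the
 * cacheline size. Note that -CACHELINE equals ~CACHELINE_MASK when
 * CACHELINE is a power of two, so the mask clears the low bits.
 */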
@@ -239,8 +243,7 @@ tcache_create(arena_t *arena)
for (; i < nhbins; i++)
tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE;
-tcache_tls = tcache;
-pthread_setspecific(tcache_tsd, tcache);
+TCACHE_SET(tcache);
return (tcache);
}
@@ -308,9 +311,9 @@ tcache_destroy(tcache_t *tcache)
if (arena_salloc(tcache) <= small_maxclass) {
arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
arena_t *arena = chunk->arena;
-size_t pageind = (((uintptr_t)tcache - (uintptr_t)chunk) >>
-PAGE_SHIFT);
-arena_chunk_map_t *mapelm = &chunk->map[pageind];
+size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
+PAGE_SHIFT;
+arena_chunk_map_t *mapelm = &chunk->map[pageind-map_bias];
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) <<
PAGE_SHIFT));
@@ -328,11 +331,24 @@ tcache_thread_cleanup(void *arg)
{
tcache_t *tcache = (tcache_t *)arg;
assert(tcache == tcache_tls);
-if (tcache != NULL) {
+if (tcache == (void *)(uintptr_t)1) {
+/*
+* The previous time this destructor was called, we set the key
+* to 1 so that other destructors wouldn't cause re-creation of
+* the tcache. This time, do nothing, so that the destructor
+* will not be called again.
+*/
+} else if (tcache == (void *)(uintptr_t)2) {
+/*
+* Another destructor called an allocator function after this
+* destructor was called. Reset tcache to 1 in order to
+* receive another callback.
+*/
+TCACHE_SET((uintptr_t)1);
+} else if (tcache != NULL) {
+assert(tcache != (void *)(uintptr_t)1);
tcache_destroy(tcache);
-tcache_tls = (void *)(uintptr_t)1;
+TCACHE_SET((uintptr_t)1);
}
}
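The 0/1/2 sentinel protocol above is a general pattern for pthreads TSD
cleanup when destructors for other keys may touch the allocator after this
one has already run. A minimal standalone sketch of the re-arming half of
the pattern, with hypothetical cache_key/cache_boot names and the state-2
case omitted:

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

static pthread_key_t cache_key;

static void
cache_cleanup(void *arg)
{

	if (arg == (void *)(uintptr_t)1) {
		/* Second pass: leave the slot NULL so we are not called again. */
	} else if (arg != NULL) {
		free(arg); /* Tear down the per-thread state. */
		/* Re-arm with the sentinel; pthreads invokes us once more. */
		pthread_setspecific(cache_key, (void *)(uintptr_t)1);
	}
}

static void
cache_boot(void)
{

	pthread_key_create(&cache_key, cache_cleanup);
}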
@@ -368,16 +384,16 @@ tcache_boot(void)
if (opt_tcache) {
/*
- * If necessary, clamp opt_lg_tcache_maxclass, now that
+ * If necessary, clamp opt_lg_tcache_max, now that
* small_maxclass and arena_maxclass are known.
*/
-if (opt_lg_tcache_maxclass < 0 || (1U <<
-opt_lg_tcache_maxclass) < small_maxclass)
+if (opt_lg_tcache_max < 0 || (1U <<
+opt_lg_tcache_max) < small_maxclass)
tcache_maxclass = small_maxclass;
-else if ((1U << opt_lg_tcache_maxclass) > arena_maxclass)
+else if ((1U << opt_lg_tcache_max) > arena_maxclass)
tcache_maxclass = arena_maxclass;
else
-tcache_maxclass = (1U << opt_lg_tcache_maxclass);
+tcache_maxclass = (1U << opt_lg_tcache_max);
nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT);

jemalloc/src/zone.c (new file, 354 lines)

@@ -0,0 +1,354 @@
#include "jemalloc/internal/jemalloc_internal.h"
#ifndef JEMALLOC_ZONE
# error "This source file is for zones on Darwin (OS X)."
#endif
/******************************************************************************/
/* Data. */
static malloc_zone_t zone, szone;
static struct malloc_introspection_t zone_introspect, ozone_introspect;
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
static size_t zone_size(malloc_zone_t *zone, void *ptr);
static void *zone_malloc(malloc_zone_t *zone, size_t size);
static void *zone_calloc(malloc_zone_t *zone, size_t num, size_t size);
static void *zone_valloc(malloc_zone_t *zone, size_t size);
static void zone_free(malloc_zone_t *zone, void *ptr);
static void *zone_realloc(malloc_zone_t *zone, void *ptr, size_t size);
#if (JEMALLOC_ZONE_VERSION >= 6)
static void *zone_memalign(malloc_zone_t *zone, size_t alignment,
size_t size);
static void zone_free_definite_size(malloc_zone_t *zone, void *ptr,
size_t size);
#endif
static void *zone_destroy(malloc_zone_t *zone);
static size_t zone_good_size(malloc_zone_t *zone, size_t size);
static void zone_force_lock(malloc_zone_t *zone);
static void zone_force_unlock(malloc_zone_t *zone);
static size_t ozone_size(malloc_zone_t *zone, void *ptr);
static void ozone_free(malloc_zone_t *zone, void *ptr);
static void *ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size);
static unsigned ozone_batch_malloc(malloc_zone_t *zone, size_t size,
void **results, unsigned num_requested);
static void ozone_batch_free(malloc_zone_t *zone, void **to_be_freed,
unsigned num);
#if (JEMALLOC_ZONE_VERSION >= 6)
static void ozone_free_definite_size(malloc_zone_t *zone, void *ptr,
size_t size);
#endif
static void ozone_force_lock(malloc_zone_t *zone);
static void ozone_force_unlock(malloc_zone_t *zone);
/******************************************************************************/
/*
* Functions.
*/
static size_t
zone_size(malloc_zone_t *zone, void *ptr)
{
/*
* There appear to be places within Darwin (such as setenv(3)) that
* cause calls to this function with pointers that *no* zone owns. If
* we knew that all pointers were owned by *some* zone, we could split
* our zone into two parts, and use one as the default allocator and
* the other as the default deallocator/reallocator. Since that will
 * not work in practice, we must check all pointers to ensure that they
* reside within a mapped chunk before determining size.
*/
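/*
 * A zero return signals non-ownership: Darwin's zone dispatch then falls
 * through to whichever registered zone does own ptr.
 */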
return (ivsalloc(ptr));
}
static void *
zone_malloc(malloc_zone_t *zone, size_t size)
{
return (JEMALLOC_P(malloc)(size));
}
static void *
zone_calloc(malloc_zone_t *zone, size_t num, size_t size)
{
return (JEMALLOC_P(calloc)(num, size));
}
static void *
zone_valloc(malloc_zone_t *zone, size_t size)
{
void *ret = NULL; /* Assignment avoids useless compiler warning. */
JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size);
return (ret);
}
static void
zone_free(malloc_zone_t *zone, void *ptr)
{
JEMALLOC_P(free)(ptr);
}
static void *
zone_realloc(malloc_zone_t *zone, void *ptr, size_t size)
{
return (JEMALLOC_P(realloc)(ptr, size));
}
#if (JEMALLOC_ZONE_VERSION >= 6)
static void *
zone_memalign(malloc_zone_t *zone, size_t alignment, size_t size)
{
void *ret = NULL; /* Assignment avoids useless compiler warning. */
JEMALLOC_P(posix_memalign)(&ret, alignment, size);
return (ret);
}
static void
zone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size)
{
assert(ivsalloc(ptr) == size);
JEMALLOC_P(free)(ptr);
}
#endif
static void *
zone_destroy(malloc_zone_t *zone)
{
/* This function should never be called. */
assert(false);
return (NULL);
}
static size_t
zone_good_size(malloc_zone_t *zone, size_t size)
{
size_t ret;
void *p;
/*
* Actually create an object of the appropriate size, then find out
* how large it could have been without moving up to the next size
* class.
*/
p = JEMALLOC_P(malloc)(size);
if (p != NULL) {
ret = isalloc(p);
JEMALLOC_P(free)(p);
} else
ret = size;
return (ret);
}
static void
zone_force_lock(malloc_zone_t *zone)
{
if (isthreaded)
jemalloc_prefork();
}
static void
zone_force_unlock(malloc_zone_t *zone)
{
if (isthreaded)
jemalloc_postfork();
}
malloc_zone_t *
create_zone(void)
{
zone.size = (void *)zone_size;
zone.malloc = (void *)zone_malloc;
zone.calloc = (void *)zone_calloc;
zone.valloc = (void *)zone_valloc;
zone.free = (void *)zone_free;
zone.realloc = (void *)zone_realloc;
zone.destroy = (void *)zone_destroy;
zone.zone_name = "jemalloc_zone";
zone.batch_malloc = NULL;
zone.batch_free = NULL;
zone.introspect = &zone_introspect;
zone.version = JEMALLOC_ZONE_VERSION;
#if (JEMALLOC_ZONE_VERSION >= 6)
zone.memalign = zone_memalign;
zone.free_definite_size = zone_free_definite_size;
#endif
zone_introspect.enumerator = NULL;
zone_introspect.good_size = (void *)zone_good_size;
zone_introspect.check = NULL;
zone_introspect.print = NULL;
zone_introspect.log = NULL;
zone_introspect.force_lock = (void *)zone_force_lock;
zone_introspect.force_unlock = (void *)zone_force_unlock;
zone_introspect.statistics = NULL;
#if (JEMALLOC_ZONE_VERSION >= 6)
zone_introspect.zone_locked = NULL;
#endif
return (&zone);
}
static size_t
ozone_size(malloc_zone_t *zone, void *ptr)
{
size_t ret;
ret = ivsalloc(ptr);
if (ret == 0)
ret = szone.size(zone, ptr);
return (ret);
}
static void
ozone_free(malloc_zone_t *zone, void *ptr)
{
if (ivsalloc(ptr) != 0)
JEMALLOC_P(free)(ptr);
else {
size_t size = szone.size(zone, ptr);
if (size != 0)
(szone.free)(zone, ptr);
}
}
static void *
ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size)
{
size_t oldsize;
if (ptr == NULL)
return (JEMALLOC_P(malloc)(size));
oldsize = ivsalloc(ptr);
if (oldsize != 0)
return (JEMALLOC_P(realloc)(ptr, size));
else {
oldsize = szone.size(zone, ptr);
if (oldsize == 0)
return (JEMALLOC_P(malloc)(size));
else {
void *ret = JEMALLOC_P(malloc)(size);
if (ret != NULL) {
memcpy(ret, ptr, (oldsize < size) ? oldsize :
size);
(szone.free)(zone, ptr);
}
return (ret);
}
}
}
static unsigned
ozone_batch_malloc(malloc_zone_t *zone, size_t size, void **results,
unsigned num_requested)
{
/* Don't bother implementing this interface, since it isn't required. */
return (0);
}
static void
ozone_batch_free(malloc_zone_t *zone, void **to_be_freed, unsigned num)
{
unsigned i;
for (i = 0; i < num; i++)
ozone_free(zone, to_be_freed[i]);
}
#if (JEMALLOC_ZONE_VERSION >= 6)
static void
ozone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size)
{
if (ivsalloc(ptr) != 0) {
assert(ivsalloc(ptr) == size);
JEMALLOC_P(free)(ptr);
} else {
assert(size == szone.size(zone, ptr));
szone.free_definite_size(zone, ptr, size);
}
}
#endif
static void
ozone_force_lock(malloc_zone_t *zone)
{
/* jemalloc locking is taken care of by the normal jemalloc zone. */
szone.introspect->force_lock(zone);
}
static void
ozone_force_unlock(malloc_zone_t *zone)
{
/* jemalloc locking is taken care of by the normal jemalloc zone. */
szone.introspect->force_unlock(zone);
}
/*
* Overlay the default scalable zone (szone) such that existing allocations are
* drained, and further allocations come from jemalloc. This is necessary
* because Core Foundation directly accesses and uses the szone before the
* jemalloc library is even loaded.
*/
void
szone2ozone(malloc_zone_t *zone)
{
/*
* Stash a copy of the original szone so that we can call its
 * functions as needed. Note that internally, the szone stores its
* bookkeeping data structures immediately following the malloc_zone_t
* header, so when calling szone functions, we need to pass a pointer
* to the original zone structure.
*/
memcpy(&szone, zone, sizeof(malloc_zone_t));
zone->size = (void *)ozone_size;
zone->malloc = (void *)zone_malloc;
zone->calloc = (void *)zone_calloc;
zone->valloc = (void *)zone_valloc;
zone->free = (void *)ozone_free;
zone->realloc = (void *)ozone_realloc;
zone->destroy = (void *)zone_destroy;
zone->zone_name = "jemalloc_ozone";
zone->batch_malloc = ozone_batch_malloc;
zone->batch_free = ozone_batch_free;
zone->introspect = &ozone_introspect;
zone->version = JEMALLOC_ZONE_VERSION;
#if (JEMALLOC_ZONE_VERSION >= 6)
zone->memalign = zone_memalign;
zone->free_definite_size = ozone_free_definite_size;
#endif
ozone_introspect.enumerator = NULL;
ozone_introspect.good_size = (void *)zone_good_size;
ozone_introspect.check = NULL;
ozone_introspect.print = NULL;
ozone_introspect.log = NULL;
ozone_introspect.force_lock = (void *)ozone_force_lock;
ozone_introspect.force_unlock = (void *)ozone_force_unlock;
ozone_introspect.statistics = NULL;
#if (JEMALLOC_ZONE_VERSION >= 6)
ozone_introspect.zone_locked = NULL;
#endif
}
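For orientation, the two entry points above (create_zone() and
szone2ozone()) are presumably wired together during library initialization
roughly as follows; this is a hedged sketch, and example_darwin_init is a
hypothetical name rather than the library's actual hook:

#include <malloc/malloc.h>

static void
example_darwin_init(void)
{

	/* Expose jemalloc as a registered malloc zone. */
	malloc_zone_register(create_zone());
	/*
	 * Overlay the default scalable zone so that allocations made
	 * before jemalloc loaded drain through it safely.
	 */
	szone2ozone(malloc_default_zone());
}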

jemalloc/test/allocated.c (new file, 105 lines)

@@ -0,0 +1,105 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <pthread.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#define JEMALLOC_MANGLE
#include "jemalloc_test.h"
void *
thread_start(void *arg)
{
int err;
void *p;
uint64_t a0, a1, d0, d1;
size_t sz, usize;
sz = sizeof(a0);
if ((err = JEMALLOC_P(mallctl)("thread.allocated", &a0, &sz, NULL,
0))) {
if (err == ENOENT) {
#ifdef JEMALLOC_STATS
assert(false);
#endif
goto RETURN;
}
fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
strerror(err));
exit(1);
}
sz = sizeof(d0);
if ((err = JEMALLOC_P(mallctl)("thread.deallocated", &d0, &sz, NULL,
0))) {
if (err == ENOENT) {
#ifdef JEMALLOC_STATS
assert(false);
#endif
goto RETURN;
}
fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
strerror(err));
exit(1);
}
p = JEMALLOC_P(malloc)(1);
if (p == NULL) {
fprintf(stderr, "%s(): Error in malloc()\n", __func__);
exit(1);
}
sz = sizeof(a1);
JEMALLOC_P(mallctl)("thread.allocated", &a1, &sz, NULL, 0);
usize = JEMALLOC_P(malloc_usable_size)(p);
assert(a0 + usize <= a1);
JEMALLOC_P(free)(p);
sz = sizeof(d1);
JEMALLOC_P(mallctl)("thread.deallocated", &d1, &sz, NULL, 0);
assert(d0 + usize <= d1);
RETURN:
return (NULL);
}
int
main(void)
{
int ret = 0;
pthread_t thread;
fprintf(stderr, "Test begin\n");
thread_start(NULL);
if (pthread_create(&thread, NULL, thread_start, NULL)
!= 0) {
fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
ret = 1;
goto RETURN;
}
pthread_join(thread, (void *)&ret);
thread_start(NULL);
if (pthread_create(&thread, NULL, thread_start, NULL)
!= 0) {
fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
ret = 1;
goto RETURN;
}
pthread_join(thread, (void *)&ret);
thread_start(NULL);
RETURN:
fprintf(stderr, "Test end\n");
return (ret);
}


@@ -0,0 +1,2 @@
Test begin
Test end

jemalloc/test/allocm.c (new file, 133 lines)

@@ -0,0 +1,133 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define JEMALLOC_MANGLE
#include "jemalloc_test.h"
#define CHUNK 0x400000
/* #define MAXALIGN ((size_t)0x80000000000LLU) */
#define MAXALIGN ((size_t)0x2000000LLU)
#define NITER 4
int
main(void)
{
int r;
void *p;
size_t sz, alignment, total, tsz;
unsigned i;
void *ps[NITER];
fprintf(stderr, "Test begin\n");
sz = 0;
r = JEMALLOC_P(allocm)(&p, &sz, 42, 0);
if (r != ALLOCM_SUCCESS) {
fprintf(stderr, "Unexpected allocm() error\n");
abort();
}
if (sz < 42)
fprintf(stderr, "Real size smaller than expected\n");
if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected dallocm() error\n");
r = JEMALLOC_P(allocm)(&p, NULL, 42, 0);
if (r != ALLOCM_SUCCESS) {
fprintf(stderr, "Unexpected allocm() error\n");
abort();
}
if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected dallocm() error\n");
r = JEMALLOC_P(allocm)(&p, NULL, 42, ALLOCM_ZERO);
if (r != ALLOCM_SUCCESS) {
fprintf(stderr, "Unexpected allocm() error\n");
abort();
}
if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected dallocm() error\n");
#if LG_SIZEOF_PTR == 3
alignment = 0x8000000000000000LLU;
sz = 0x8000000000000000LLU;
#else
alignment = 0x80000000LU;
sz = 0x80000000LU;
#endif
r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment));
if (r == ALLOCM_SUCCESS) {
fprintf(stderr,
"Expected error for allocm(&p, %zu, 0x%x)\n",
sz, ALLOCM_ALIGN(alignment));
}
#if LG_SIZEOF_PTR == 3
alignment = 0x4000000000000000LLU;
sz = 0x8400000000000001LLU;
#else
alignment = 0x40000000LU;
sz = 0x84000001LU;
#endif
r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment));
if (r == ALLOCM_SUCCESS) {
fprintf(stderr,
"Expected error for allocm(&p, %zu, 0x%x)\n",
sz, ALLOCM_ALIGN(alignment));
}
alignment = 0x10LLU;
#if LG_SIZEOF_PTR == 3
sz = 0xfffffffffffffff0LLU;
#else
sz = 0xfffffff0LU;
#endif
r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment));
if (r == ALLOCM_SUCCESS) {
fprintf(stderr,
"Expected error for allocm(&p, %zu, 0x%x)\n",
sz, ALLOCM_ALIGN(alignment));
}
for (i = 0; i < NITER; i++)
ps[i] = NULL;
for (alignment = 8;
alignment <= MAXALIGN;
alignment <<= 1) {
total = 0;
fprintf(stderr, "Alignment: %zu\n", alignment);
for (sz = 1;
sz < 3 * alignment && sz < (1U << 31);
sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
for (i = 0; i < NITER; i++) {
r = JEMALLOC_P(allocm)(&ps[i], NULL, sz,
ALLOCM_ALIGN(alignment) | ALLOCM_ZERO);
if (r != ALLOCM_SUCCESS) {
fprintf(stderr,
"Error for size %zu (0x%zx): %d\n",
sz, sz, r);
exit(1);
}
if ((uintptr_t)p & (alignment-1)) {
fprintf(stderr,
"%p inadequately aligned for"
" alignment: %zu\n", p, alignment);
}
JEMALLOC_P(sallocm)(ps[i], &tsz, 0);
total += tsz;
if (total >= (MAXALIGN << 1))
break;
}
for (i = 0; i < NITER; i++) {
if (ps[i] != NULL) {
JEMALLOC_P(dallocm)(ps[i], 0);
ps[i] = NULL;
}
}
}
}
fprintf(stderr, "Test end\n");
return (0);
}

jemalloc/test/allocm.exp (new file, 25 lines)

@@ -0,0 +1,25 @@
Test begin
Alignment: 8
Alignment: 16
Alignment: 32
Alignment: 64
Alignment: 128
Alignment: 256
Alignment: 512
Alignment: 1024
Alignment: 2048
Alignment: 4096
Alignment: 8192
Alignment: 16384
Alignment: 32768
Alignment: 65536
Alignment: 131072
Alignment: 262144
Alignment: 524288
Alignment: 1048576
Alignment: 2097152
Alignment: 4194304
Alignment: 8388608
Alignment: 16777216
Alignment: 33554432
Test end


@@ -0,0 +1,6 @@
/*
* This header should be included by tests, rather than directly including
* jemalloc/jemalloc.h, because --with-install-suffix may cause the header to
* have a different name.
*/
#include "jemalloc/jemalloc@install_suffix@.h"


@@ -0,0 +1,121 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#define JEMALLOC_MANGLE
#include "jemalloc_test.h"
#define CHUNK 0x400000
/* #define MAXALIGN ((size_t)0x80000000000LLU) */
#define MAXALIGN ((size_t)0x2000000LLU)
#define NITER 4
int
main(void)
{
size_t alignment, size, total;
unsigned i;
int err;
void *p, *ps[NITER];
fprintf(stderr, "Test begin\n");
/* Test error conditions. */
for (alignment = 0; alignment < sizeof(void *); alignment++) {
err = JEMALLOC_P(posix_memalign)(&p, alignment, 1);
if (err != EINVAL) {
fprintf(stderr,
"Expected error for invalid alignment %zu\n",
alignment);
}
}
for (alignment = sizeof(size_t); alignment < MAXALIGN;
alignment <<= 1) {
err = JEMALLOC_P(posix_memalign)(&p, alignment + 1, 1);
if (err == 0) {
fprintf(stderr,
"Expected error for invalid alignment %zu\n",
alignment + 1);
}
}
#if LG_SIZEOF_PTR == 3
alignment = 0x8000000000000000LLU;
size = 0x8000000000000000LLU;
#else
alignment = 0x80000000LU;
size = 0x80000000LU;
#endif
err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
if (err == 0) {
fprintf(stderr,
"Expected error for posix_memalign(&p, %zu, %zu)\n",
alignment, size);
}
#if LG_SIZEOF_PTR == 3
alignment = 0x4000000000000000LLU;
size = 0x8400000000000001LLU;
#else
alignment = 0x40000000LU;
size = 0x84000001LU;
#endif
err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
if (err == 0) {
fprintf(stderr,
"Expected error for posix_memalign(&p, %zu, %zu)\n",
alignment, size);
}
alignment = 0x10LLU;
#if LG_SIZEOF_PTR == 3
size = 0xfffffffffffffff0LLU;
#else
size = 0xfffffff0LU;
#endif
err = JEMALLOC_P(posix_memalign)(&p, alignment, size);
if (err == 0) {
fprintf(stderr,
"Expected error for posix_memalign(&p, %zu, %zu)\n",
alignment, size);
}
for (i = 0; i < NITER; i++)
ps[i] = NULL;
for (alignment = 8;
alignment <= MAXALIGN;
alignment <<= 1) {
total = 0;
fprintf(stderr, "Alignment: %zu\n", alignment);
for (size = 1;
size < 3 * alignment && size < (1U << 31);
size += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
for (i = 0; i < NITER; i++) {
err = JEMALLOC_P(posix_memalign)(&ps[i],
alignment, size);
if (err) {
fprintf(stderr,
"Error for size %zu (0x%zx): %s\n",
size, size, strerror(err));
exit(1);
}
total += JEMALLOC_P(malloc_usable_size)(ps[i]);
if (total >= (MAXALIGN << 1))
break;
}
for (i = 0; i < NITER; i++) {
if (ps[i] != NULL) {
JEMALLOC_P(free)(ps[i]);
ps[i] = NULL;
}
}
}
}
fprintf(stderr, "Test end\n");
return (0);
}


@@ -0,0 +1,25 @@
Test begin
Alignment: 8
Alignment: 16
Alignment: 32
Alignment: 64
Alignment: 128
Alignment: 256
Alignment: 512
Alignment: 1024
Alignment: 2048
Alignment: 4096
Alignment: 8192
Alignment: 16384
Alignment: 32768
Alignment: 65536
Alignment: 131072
Alignment: 262144
Alignment: 524288
Alignment: 1048576
Alignment: 2097152
Alignment: 4194304
Alignment: 8388608
Alignment: 16777216
Alignment: 33554432
Test end

jemalloc/test/rallocm.c (new file, 117 lines)

@@ -0,0 +1,117 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define JEMALLOC_MANGLE
#include "jemalloc_test.h"
int
main(void)
{
void *p, *q;
size_t sz, tsz;
int r;
fprintf(stderr, "Test begin\n");
r = JEMALLOC_P(allocm)(&p, &sz, 42, 0);
if (r != ALLOCM_SUCCESS) {
fprintf(stderr, "Unexpected allocm() error\n");
abort();
}
q = p;
r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 0, ALLOCM_NO_MOVE);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (q != p)
fprintf(stderr, "Unexpected object move\n");
if (tsz != sz) {
fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
sz, tsz);
}
q = p;
r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 5, ALLOCM_NO_MOVE);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (q != p)
fprintf(stderr, "Unexpected object move\n");
if (tsz != sz) {
fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
sz, tsz);
}
q = p;
r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE);
if (r != ALLOCM_ERR_NOT_MOVED)
fprintf(stderr, "Unexpected rallocm() result\n");
if (q != p)
fprintf(stderr, "Unexpected object move\n");
if (tsz != sz) {
fprintf(stderr, "Unexpected size change: %zu --> %zu\n",
sz, tsz);
}
q = p;
r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, 0);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (q == p)
fprintf(stderr, "Expected object move\n");
if (tsz == sz) {
fprintf(stderr, "Expected size change: %zu --> %zu\n",
sz, tsz);
}
p = q;
sz = tsz;
r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, 0);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (q == p)
fprintf(stderr, "Expected object move\n");
if (tsz == sz) {
fprintf(stderr, "Expected size change: %zu --> %zu\n",
sz, tsz);
}
p = q;
sz = tsz;
r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, 0);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (tsz == sz) {
fprintf(stderr, "Expected size change: %zu --> %zu\n",
sz, tsz);
}
p = q;
sz = tsz;
r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, ALLOCM_NO_MOVE);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (q != p)
fprintf(stderr, "Unexpected object move\n");
if (tsz == sz) {
fprintf(stderr, "Expected size change: %zu --> %zu\n",
sz, tsz);
}
sz = tsz;
r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, ALLOCM_NO_MOVE);
if (r != ALLOCM_SUCCESS)
fprintf(stderr, "Unexpected rallocm() error\n");
if (q != p)
fprintf(stderr, "Unexpected object move\n");
if (tsz == sz) {
fprintf(stderr, "Expected size change: %zu --> %zu\n",
sz, tsz);
}
sz = tsz;
JEMALLOC_P(dallocm)(p, 0);
fprintf(stderr, "Test end\n");
return (0);
}


@@ -0,0 +1,2 @@
Test begin
Test end


@@ -0,0 +1,82 @@
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#define JEMALLOC_MANGLE
#include "jemalloc_test.h"
void *
thread_start(void *arg)
{
unsigned main_arena_ind = *(unsigned *)arg;
void *p;
unsigned arena_ind;
size_t size;
int err;
p = JEMALLOC_P(malloc)(1);
if (p == NULL) {
fprintf(stderr, "%s(): Error in malloc()\n", __func__);
return (void *)1;
}
size = sizeof(arena_ind);
if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size,
&main_arena_ind, sizeof(main_arena_ind)))) {
fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
strerror(err));
return (void *)1;
}
return (NULL);
}
int
main(void)
{
int ret = 0;
void *p;
unsigned arena_ind;
size_t size;
int err;
pthread_t thread;
fprintf(stderr, "Test begin\n");
p = JEMALLOC_P(malloc)(1);
if (p == NULL) {
fprintf(stderr, "%s(): Error in malloc()\n", __func__);
ret = 1;
goto RETURN;
}
size = sizeof(arena_ind);
if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, NULL,
0))) {
fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__,
strerror(err));
ret = 1;
goto RETURN;
}
if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind)
!= 0) {
fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
ret = 1;
goto RETURN;
}
pthread_join(thread, (void *)&ret);
if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind)
!= 0) {
fprintf(stderr, "%s(): Error in pthread_create()\n", __func__);
ret = 1;
goto RETURN;
}
pthread_join(thread, (void *)&ret);
RETURN:
fprintf(stderr, "Test end\n");
return (ret);
}


@@ -0,0 +1,2 @@
Test begin
Test end