From b267d0f86aff15a0edb2929f09060c118ed98ec4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 13 Aug 2010 15:42:29 -0700 Subject: [PATCH 01/48] Add the thread.arena mallctl. Make it possible for each thread to manage which arena it is associated with. Implement the 'tests' and 'check' build targets. --- .gitignore | 2 + jemalloc/Makefile.in | 50 +++++++++++++++++++++++- jemalloc/configure.ac | 5 +++ jemalloc/doc/jemalloc.3.in | 13 +++++++ jemalloc/src/ctl.c | 52 +++++++++++++++++++++++++ jemalloc/test/thread_arena.c | 69 ++++++++++++++++++++++++++++++++++ jemalloc/test/thread_arena.exp | 2 + 7 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 jemalloc/test/thread_arena.c create mode 100644 jemalloc/test/thread_arena.exp diff --git a/.gitignore b/.gitignore index d9e49f8f..8e95c7a4 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ /jemalloc/src/jemalloc\.h /jemalloc/src/jemalloc_defs\.h /jemalloc/src/*.[od] +/jemalloc/test/*.[od] +/jemalloc/test/*.out /jemalloc/VERSION diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index ac9b7822..a7acc969 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -47,6 +47,10 @@ DSOS := @objroot@lib/libjemalloc@install_suffix@.so.$(REV) \ @objroot@lib/libjemalloc@install_suffix@.so \ @objroot@lib/libjemalloc@install_suffix@_pic.a MAN3 := @objroot@doc/jemalloc@install_suffix@.3 +CTESTS := +ifeq (1, @enable_tls@) +CTESTS += @srcroot@test/thread_arena.c +endif .PHONY: all dist install check clean distclean relclean @@ -69,12 +73,22 @@ all: $(DSOS) @objroot@lib/libjemalloc@install_suffix@.so.$(REV) : $(CSRCS:@srcroot@%.c=@objroot@%.o) @mkdir -p $(@D) - $(CC) -shared -Wl,-soname,$(@F) -o $@ $+ $(LDFLAGS) $(LIBS) + $(CC) -shared -Wl,-soname,$(@F) $(RPATH_EXTRA:%=@RPATH@%) -o $@ $+ $(LDFLAGS) $(LIBS) @objroot@lib/libjemalloc@install_suffix@_pic.a : $(CSRCS:@srcroot@%.c=@objroot@%.o) @mkdir -p $(@D) ar crus $@ $+ +@objroot@test/%.o: @srcroot@test/%.c + @mkdir -p $(@D) + $(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $< + @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" + +@objroot@test/%: @objroot@test/%.o \ + @objroot@lib/libjemalloc@install_suffix@.so + @mkdir -p $(@D) + $(CC) -o $@ $< @RPATH@@objroot@lib -L@objroot@lib -ljemalloc + install_bin: install -d $(BINDIR) @for b in $(BINS); do \ @@ -104,11 +118,43 @@ done install: install_bin install_include install_lib install_man -check: +tests: $(CTESTS:@srcroot@%.c=@objroot@%) + +check: tests + @mkdir -p @objroot@test + @$(SHELL) -c 'total=0; \ + failures=0; \ + echo "========================================="; \ + for t in $(CTESTS:@srcroot@%.c=@objroot@%); do \ + total=`expr $$total + 1`; \ + /bin/echo -n "$${t} ... 
"; \ + $${t} @abs_srcroot@ @abs_objroot@ \ + > @objroot@$${t}.out 2>&1; \ + if test -e "@srcroot@$${t}.exp"; then \ + diff -u @srcroot@$${t}.exp \ + @objroot@$${t}.out >/dev/null 2>&1; \ + fail=$$?; \ + if test "$${fail}" -eq "1" ; then \ + failures=`expr $${failures} + 1`; \ + echo "*** FAIL ***"; \ + else \ + echo "pass"; \ + fi; \ + else \ + echo "*** FAIL *** (.exp file is missing)"; \ + failures=`expr $${failures} + 1`; \ + fi; \ + done; \ + echo "========================================="; \ + echo "Failures: $${failures}/$${total}"' clean: rm -f $(CSRCS:@srcroot@%.c=@objroot@%.o) rm -f $(CSRCS:@srcroot@%.c=@objroot@%.d) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%.o) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%.d) + rm -f $(CTESTS:@srcroot@%.c=@objroot@%.out) rm -f $(DSOS) distclean: clean diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index ce6e6799..bf165960 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -647,9 +647,14 @@ AC_RUN_IFELSE([AC_LANG_PROGRAM( AC_MSG_RESULT([no]) enable_tls="0") fi +AC_SUBST([enable_tls]) if test "x${enable_tls}" = "x0" ; then AC_DEFINE_UNQUOTED([NO_TLS], [ ]) + roff_tls=".\\\" " +else + roff_tls="" fi +AC_SUBST([roff_tls]) dnl Finish tcache-related definitions, now that TLS configuration is done. if test "x$enable_tls" = "x0" ; then diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index cf5cb5e0..d2d5b77f 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -669,6 +669,19 @@ This is useful for detecting whether another thread caused a refresh. @roff_tcache@find manual flushing useful. .Ed .\"----------------------------------------------------------------------------- +@roff_tls@.It Sy "thread.arena (unsigned) rw" +@roff_tls@.Bd -ragged -offset indent -compact +@roff_tls@Get or set the arena associated with the calling thread. +@roff_tls@The arena index must be less than the maximum number of arenas (see +@roff_tls@the +@roff_tls@.Dq arenas.narenas +@roff_tls@mallctl). +@roff_tls@If the specified arena was not initialized beforehand (see the +@roff_tls@.Dq arenas.initialized +@roff_tls@mallctl), it will be automatically initialized as a side effect of +@roff_tls@calling this interface. +@roff_tls@.Ed +.\"----------------------------------------------------------------------------- .It Sy "config.debug (bool) r-" .Bd -ragged -offset indent -compact --enable-debug was specified during build configuration. 
diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index ffb732d5..128883f8 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -41,6 +41,9 @@ CTL_PROTO(epoch) #ifdef JEMALLOC_TCACHE CTL_PROTO(tcache_flush) #endif +#ifndef NO_TLS +CTL_PROTO(thread_arena) +#endif CTL_PROTO(config_debug) CTL_PROTO(config_dss) CTL_PROTO(config_dynamic_page_shift) @@ -210,6 +213,12 @@ static const ctl_node_t tcache_node[] = { }; #endif +#ifndef NO_TLS +static const ctl_node_t thread_node[] = { + {NAME("arena"), CTL(thread_arena)} +}; +#endif + static const ctl_node_t config_node[] = { {NAME("debug"), CTL(config_debug)}, {NAME("dss"), CTL(config_dss)}, @@ -447,6 +456,9 @@ static const ctl_node_t root_node[] = { {NAME("epoch"), CTL(epoch)}, #ifdef JEMALLOC_TCACHE {NAME("tcache"), CHILD(tcache)}, +#endif +#ifndef NO_TLS + {NAME("thread"), CHILD(thread)}, #endif {NAME("config"), CHILD(config)}, {NAME("opt"), CHILD(opt)}, @@ -1042,6 +1054,46 @@ RETURN: } #endif +#ifndef NO_TLS +static int +thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + unsigned newind, oldind; + + newind = oldind = choose_arena()->ind; + WRITE(oldind, unsigned); + READ(newind, unsigned); + if (newind != oldind) { + arena_t *arena; + + if (newind >= narenas) { + /* New arena index is out of range. */ + ret = EFAULT; + goto RETURN; + } + + /* Initialize arena if necessary. */ + malloc_mutex_lock(&arenas_lock); + if ((arena = arenas[newind]) == NULL) + arena = arenas_extend(newind); + malloc_mutex_unlock(&arenas_lock); + if (arena == NULL) { + ret = EAGAIN; + goto RETURN; + } + + /* Set new arena association. */ + arenas_map = arena; + } + + ret = 0; +RETURN: + return (ret); +} +#endif + /******************************************************************************/ #ifdef JEMALLOC_DEBUG diff --git a/jemalloc/test/thread_arena.c b/jemalloc/test/thread_arena.c new file mode 100644 index 00000000..99e9669a --- /dev/null +++ b/jemalloc/test/thread_arena.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +#include "jemalloc/jemalloc.h" + +void * +thread_start(void *arg) +{ + unsigned main_arena_ind = *(unsigned *)arg; + unsigned arena_ind; + size_t size; + int err; + + malloc(1); + + size = sizeof(arena_ind); + if ((err = mallctl("thread.arena", &arena_ind, &size, + &main_arena_ind, sizeof(main_arena_ind)))) { + fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, + strerror(err)); + return (void *)1; + } + + return (NULL); +} + +int +main(void) +{ + int ret = 0; + unsigned arena_ind; + size_t size; + int err; + pthread_t thread; + + fprintf(stderr, "Test begin\n"); + + malloc(1); + + size = sizeof(arena_ind); + if ((err = mallctl("thread.arena", &arena_ind, &size, NULL, 0))) { + fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, + strerror(err)); + ret = 1; + goto RETURN; + } + + if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind) + != 0) { + fprintf(stderr, "%s(): Error in pthread_create()\n", __func__); + ret = 1; + goto RETURN; + } + pthread_join(thread, (void *)&ret); + + if (pthread_create(&thread, NULL, thread_start, (void *)&arena_ind) + != 0) { + fprintf(stderr, "%s(): Error in pthread_create()\n", __func__); + ret = 1; + goto RETURN; + } + pthread_join(thread, (void *)&ret); + +RETURN: + fprintf(stderr, "Test end\n"); + return (ret); +} diff --git a/jemalloc/test/thread_arena.exp b/jemalloc/test/thread_arena.exp new file mode 100644 index 00000000..369a88dd --- /dev/null +++ b/jemalloc/test/thread_arena.exp @@ -0,0 
+1,2 @@ +Test begin +Test end From 2dbecf1f6267fae7a161b9c39cfd4d04ce168a29 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 5 Sep 2010 10:35:13 -0700 Subject: [PATCH 02/48] Port to Mac OS X. Add Mac OS X support, based in large part on the OS X support in Mozilla's version of jemalloc. --- jemalloc/COPYING | 1 + jemalloc/INSTALL | 4 + jemalloc/Makefile.in | 45 ++- jemalloc/configure.ac | 170 +++++---- jemalloc/doc/jemalloc.3.in | 31 +- jemalloc/include/jemalloc/internal/chunk.h | 6 +- .../include/jemalloc/internal/chunk_mmap.h | 2 + .../jemalloc/internal/jemalloc_internal.h.in | 137 ++++--- jemalloc/include/jemalloc/internal/mutex.h | 6 + jemalloc/include/jemalloc/internal/rtree.h | 161 ++++++++ jemalloc/include/jemalloc/internal/tcache.h | 28 +- jemalloc/include/jemalloc/internal/zone.h | 23 ++ jemalloc/include/jemalloc/jemalloc_defs.h.in | 28 ++ jemalloc/src/arena.c | 52 ++- jemalloc/src/base.c | 2 +- jemalloc/src/chunk.c | 27 +- jemalloc/src/chunk_mmap.c | 46 ++- jemalloc/src/ctl.c | 14 +- jemalloc/src/huge.c | 4 +- jemalloc/src/jemalloc.c | 90 ++--- jemalloc/src/mutex.c | 4 + jemalloc/src/prof.c | 123 ++++-- jemalloc/src/rtree.c | 42 +++ jemalloc/src/tcache.c | 26 +- jemalloc/src/zone.c | 354 ++++++++++++++++++ jemalloc/test/thread_arena.c | 10 +- 26 files changed, 1146 insertions(+), 290 deletions(-) create mode 100644 jemalloc/include/jemalloc/internal/rtree.h create mode 100644 jemalloc/include/jemalloc/internal/zone.h create mode 100644 jemalloc/src/rtree.c create mode 100644 jemalloc/src/zone.c diff --git a/jemalloc/COPYING b/jemalloc/COPYING index 1baaf50d..10ade120 100644 --- a/jemalloc/COPYING +++ b/jemalloc/COPYING @@ -3,6 +3,7 @@ subject to the following licenses: -------------------------------------------------------------------------------- Copyright (C) 2002-2010 Jason Evans . All rights reserved. +Copyright (C) 2007-2010 Mozilla Foundation. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index eec3b37e..e9a87989 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -31,6 +31,10 @@ any of the following arguments (not a definitive list) to 'configure': becomes malloc(). This makes it possible to use jemalloc at the same time as the system allocator. + By default, the prefix is "", except on OS X, where it is "je_". On OS X, + jemalloc overlays the default malloc zone, but makes no attempt to actually + replace the "malloc", "calloc", etc. symbols. + --with-install-suffix= Append to the base name of all installed files, such that multiple versions of jemalloc can coexist in the same installation directory. For diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index a7acc969..aa3bf6bd 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -28,10 +28,17 @@ LIBS := @LIBS@ RPATH_EXTRA := @RPATH_EXTRA@ ifeq (macho, @abi@) SO := dylib +WL_SONAME := dylib_install_name else SO := so +WL_SONAME := soname endif REV := 0 +ifeq (macho, @abi@) +TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH=@objroot@lib +else +TEST_LIBRARY_PATH := +endif # Lists of files. 
BINS := @srcroot@bin/pprof @@ -42,15 +49,16 @@ CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \ @srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \ @srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \ @srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \ - @srcroot@src/prof.c @srcroot@src/stats.c @srcroot@src/tcache.c -DSOS := @objroot@lib/libjemalloc@install_suffix@.so.$(REV) \ - @objroot@lib/libjemalloc@install_suffix@.so \ + @srcroot@src/prof.c @srcroot@src/rtree.c \ + @srcroot@src/stats.c @srcroot@src/tcache.c +ifeq (macho, @abi@) +CSRCS += @srcroot@src/zone.c +endif +DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \ + @objroot@lib/libjemalloc@install_suffix@.$(SO) \ @objroot@lib/libjemalloc@install_suffix@_pic.a MAN3 := @objroot@doc/jemalloc@install_suffix@.3 -CTESTS := -ifeq (1, @enable_tls@) -CTESTS += @srcroot@test/thread_arena.c -endif +CTESTS := @srcroot@test/thread_arena.c .PHONY: all dist install check clean distclean relclean @@ -67,13 +75,13 @@ all: $(DSOS) $(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $< @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" -%.so : %.so.$(REV) +%.$(SO) : %.$(SO).$(REV) @mkdir -p $(@D) ln -sf $( $(@:%.o=%.d)" @objroot@test/%: @objroot@test/%.o \ - @objroot@lib/libjemalloc@install_suffix@.so + @objroot@lib/libjemalloc@install_suffix@.$(SO) @mkdir -p $(@D) - $(CC) -o $@ $< @RPATH@@objroot@lib -L@objroot@lib -ljemalloc +ifneq (@RPATH@, ) + $(CC) -o $@ $< @RPATH@@objroot@lib -L@objroot@lib -ljemalloc@install_suffix@ +else + $(CC) -o $@ $< -L@objroot@lib -ljemalloc@install_suffix@ +endif install_bin: install -d $(BINDIR) @@ -105,8 +117,8 @@ done install_lib: $(DSOS) install -d $(LIBDIR) - install -m 755 @objroot@lib/libjemalloc@install_suffix@.so.$(REV) $(LIBDIR) - ln -sf libjemalloc@install_suffix@.so.$(REV) $(LIBDIR)/libjemalloc@install_suffix@.so + install -m 755 @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR) + ln -sf libjemalloc@install_suffix@.$(SO).$(REV) $(LIBDIR)/libjemalloc@install_suffix@.$(SO) install -m 755 @objroot@lib/libjemalloc@install_suffix@_pic.a $(LIBDIR) install_man: @@ -128,7 +140,7 @@ check: tests for t in $(CTESTS:@srcroot@%.c=@objroot@%); do \ total=`expr $$total + 1`; \ /bin/echo -n "$${t} ... 
"; \ - $${t} @abs_srcroot@ @abs_objroot@ \ + $(TEST_LIBRARY_PATH) $${t} @abs_srcroot@ @abs_objroot@ \ > @objroot@$${t}.out 2>&1; \ if test -e "@srcroot@$${t}.exp"; then \ diff -u @srcroot@$${t}.exp \ @@ -161,8 +173,7 @@ distclean: clean rm -rf @objroot@autom4te.cache rm -f @objroot@config.log rm -f @objroot@config.status - rm -f @objroot@cfghdrs.stamp - rm -f @objroot@cfgoutputs.stamp + rm -f @objroot@config.stamp rm -f @cfghdrs_out@ rm -f @cfgoutputs_out@ diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index bf165960..21f502c6 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -150,7 +150,7 @@ JE_COMPILABLE([__attribute__ syntax], [attribute]) if test "x${attribute}" = "xyes" ; then AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ]) - if test "x$GCC" = "xyes" ; then + if test "x$GCC" = "xyes" -a "${abi}" = "xelf"; then JE_CFLAGS_APPEND([-fvisibility=internal]) fi fi @@ -166,17 +166,20 @@ case "${host}" in *-*-darwin*) CFLAGS="$CFLAGS -fno-common -no-cpp-precomp" abi="macho" + AC_DEFINE([JEMALLOC_PURGE_MSYNC_KILLPAGES]) RPATH="" ;; *-*-freebsd*) CFLAGS="$CFLAGS" abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE]) RPATH="-Wl,-rpath," ;; *-*-linux*) CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE" abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED]) RPATH="-Wl,-rpath," ;; *-*-netbsd*) @@ -191,6 +194,7 @@ case "${host}" in [CFLAGS="$CFLAGS"; abi="elf"], [abi="aout"]) AC_MSG_RESULT([$abi]) + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE]) RPATH="-Wl,-rpath," ;; *-*-solaris2*) @@ -245,7 +249,11 @@ dnl Do not prefix public APIs by default. AC_ARG_WITH([jemalloc_prefix], [AS_HELP_STRING([--with-jemalloc-prefix=], [Prefix to prepend to all public APIs])], [JEMALLOC_PREFIX="$with_jemalloc_prefix"], - [JEMALLOC_PREFIX=] + [if test "x$abi" != "xmacho" ; then + JEMALLOC_PREFIX="" +else + JEMALLOC_PREFIX="je_" +fi] ) if test "x$JEMALLOC_PREFIX" != "x" ; then AC_DEFINE([JEMALLOC_PREFIX], [ ]) @@ -294,6 +302,7 @@ fi ) if test "x$enable_debug" = "x1" ; then AC_DEFINE([JEMALLOC_DEBUG], [ ]) + AC_DEFINE([JEMALLOC_IVSALLOC], [ ]) fi AC_SUBST([enable_debug]) @@ -379,7 +388,44 @@ else fi, LUNWIND="-lunwind" ) -dnl Finish prof-related definitions below, once TLS configuration is done. +if test "x$enable_prof" = "x1" ; then + LIBS="$LIBS -lm" + AC_DEFINE([JEMALLOC_PROF], [ ]) + if test "x$enable_prof_libunwind" = "x1" ; then + AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) + if test "x$LUNWIND" = "x-lunwind" ; then + AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], + [enable_prof_libunwind="0"]) + else + LIBS="$LIBS $LUNWIND" + fi + if test "x${enable_prof_libunwind}" = "x1" ; then + AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) + fi + fi +fi +AC_SUBST([enable_prof]) +if test "x$enable_prof" = "x0" ; then + roff_prof=".\\\" " + roff_no_prof="" +else + roff_prof="" + roff_no_prof=".\\\" " +fi +AC_SUBST([roff_prof]) +AC_SUBST([roff_no_prof]) + +dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics +dnl for backtracing. +if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \ + -a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then + enable_prof_libgcc="1" + AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) + AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) + if test "x${enable_prof_libgcc}" = "x1" ; then + AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) + fi +fi dnl Enable tiny allocations by default. 
AC_ARG_ENABLE([tiny], @@ -417,7 +463,19 @@ fi ], [enable_tcache="1"] ) -dnl Finish tcache-related definitions below, once TLS configuration is done. +if test "x$enable_tcache" = "x1" ; then + AC_DEFINE([JEMALLOC_TCACHE], [ ]) +fi +AC_SUBST([enable_tcache]) +if test "x$enable_tcache" = "x0" ; then + roff_tcache=".\\\" " + roff_no_tcache="" +else + roff_tcache="" + roff_no_tcache=".\\\" " +fi +AC_SUBST([roff_tcache]) +AC_SUBST([roff_no_tcache]) dnl Do not enable mmap()ped swap files by default. AC_ARG_ENABLE([swap], @@ -650,71 +708,52 @@ fi AC_SUBST([enable_tls]) if test "x${enable_tls}" = "x0" ; then AC_DEFINE_UNQUOTED([NO_TLS], [ ]) - roff_tls=".\\\" " -else - roff_tls="" fi -AC_SUBST([roff_tls]) -dnl Finish tcache-related definitions, now that TLS configuration is done. -if test "x$enable_tls" = "x0" ; then - enable_tcache="0" -fi -if test "x$enable_tcache" = "x1" ; then - AC_DEFINE([JEMALLOC_TCACHE], [ ]) -fi -AC_SUBST([enable_tcache]) -if test "x$enable_tcache" = "x0" ; then - roff_tcache=".\\\" " - roff_no_tcache="" -else - roff_tcache="" - roff_no_tcache=".\\\" " -fi -AC_SUBST([roff_tcache]) -AC_SUBST([roff_no_tcache]) +dnl ============================================================================ +dnl Darwin-related configuration. -dnl Finish prof-related definitions, now that TLS configuration is done. -if test "x$enable_tls" = "x0" ; then - enable_prof="0" -fi -if test "x$enable_prof" = "x1" ; then - LIBS="$LIBS -lm" - AC_DEFINE([JEMALLOC_PROF], [ ]) - if test "x$enable_prof_libunwind" = "x1" ; then - AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) - if test "x$LUNWIND" = "x-lunwind" ; then - AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], - [enable_prof_libunwind="0"]) - else - LIBS="$LIBS $LUNWIND" - fi - if test "x${enable_prof_libunwind}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) - fi - fi -fi -AC_SUBST([enable_prof]) -if test "x$enable_prof" = "x0" ; then - roff_prof=".\\\" " - roff_no_prof="" -else - roff_prof="" - roff_no_prof=".\\\" " -fi -AC_SUBST([roff_prof]) -AC_SUBST([roff_no_prof]) +if test "x${abi}" = "xmacho" ; then + AC_DEFINE([JEMALLOC_IVSALLOC]) + AC_DEFINE([JEMALLOC_ZONE]) -dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics -dnl for backtracing. -if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \ - -a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then - enable_prof_libgcc="1" - AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) - AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) - if test "x${enable_prof_libgcc}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) - fi + dnl The szone version jumped from 3 to 6 between the OS X 10.5.x and 10.6 + dnl releases. malloc_zone_t and malloc_introspection_t have new fields in + dnl 10.6, which is the only source-level indication of the change. 
+ AC_MSG_CHECKING([malloc zone version]) + AC_TRY_COMPILE([#include +#include ], [ + static malloc_zone_t zone; + static struct malloc_introspection_t zone_introspect; + + zone.size = NULL; + zone.malloc = NULL; + zone.calloc = NULL; + zone.valloc = NULL; + zone.free = NULL; + zone.realloc = NULL; + zone.destroy = NULL; + zone.zone_name = "jemalloc_zone"; + zone.batch_malloc = NULL; + zone.batch_free = NULL; + zone.introspect = &zone_introspect; + zone.version = 6; + zone.memalign = NULL; + zone.free_definite_size = NULL; + + zone_introspect.enumerator = NULL; + zone_introspect.good_size = NULL; + zone_introspect.check = NULL; + zone_introspect.print = NULL; + zone_introspect.log = NULL; + zone_introspect.force_lock = NULL; + zone_introspect.force_unlock = NULL; + zone_introspect.statistics = NULL; + zone_introspect.zone_locked = NULL; +], [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [6]) + AC_MSG_RESULT([6])], + [AC_DEFINE_UNQUOTED([JEMALLOC_ZONE_VERSION], [3]) + AC_MSG_RESULT([3])]) fi dnl ============================================================================ @@ -773,4 +812,5 @@ AC_MSG_RESULT([swap : ${enable_swap}]) AC_MSG_RESULT([dss : ${enable_dss}]) AC_MSG_RESULT([dynamic_page_shift : ${enable_dynamic_page_shift}]) AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) +AC_MSG_RESULT([tls : ${enable_tls}]) AC_MSG_RESULT([===============================================================================]) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index d2d5b77f..dfc4d763 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -464,7 +464,7 @@ is a single CPU. @roff_swap@This option is enabled by default. .It P The -.Fn malloc_stats_print +.Fn @jemalloc_prefix@malloc_stats_print function is called at program exit via an .Xr atexit 3 function. @@ -626,7 +626,7 @@ round your allocation requests up to the nearest multiple of the cacheline size. .Sh MALLCTL NAMESPACE The following names are defined in the namespace accessible via the -.Fn mallctl* +.Fn @jemalloc_prefix@mallctl* functions. Value types are specified in parentheses, and their readable/writable statuses are encoded as rw, r-, -w, or --. @@ -648,7 +648,7 @@ Return the jemalloc version string. .It Sy "epoch (uint64_t) rw" .Bd -ragged -offset indent -compact If a value is passed in, refresh the data from which the -.Fn mallctl* +.Fn @jemalloc_prefix@mallctl* functions report values, and increment the epoch. Return the current epoch. This is useful for detecting whether another thread caused a refresh. @@ -669,18 +669,17 @@ This is useful for detecting whether another thread caused a refresh. @roff_tcache@find manual flushing useful. .Ed .\"----------------------------------------------------------------------------- -@roff_tls@.It Sy "thread.arena (unsigned) rw" -@roff_tls@.Bd -ragged -offset indent -compact -@roff_tls@Get or set the arena associated with the calling thread. -@roff_tls@The arena index must be less than the maximum number of arenas (see -@roff_tls@the -@roff_tls@.Dq arenas.narenas -@roff_tls@mallctl). -@roff_tls@If the specified arena was not initialized beforehand (see the -@roff_tls@.Dq arenas.initialized -@roff_tls@mallctl), it will be automatically initialized as a side effect of -@roff_tls@calling this interface. -@roff_tls@.Ed +.It Sy "thread.arena (unsigned) rw" +.Bd -ragged -offset indent -compact +Get or set the arena associated with the calling thread. +The arena index must be less than the maximum number of arenas (see the +.Dq arenas.narenas +mallctl). 
+If the specified arena was not initialized beforehand (see the +.Dq arenas.initialized +mallctl), it will be automatically initialized as a side effect of calling this +interface. +.Ed .\"----------------------------------------------------------------------------- .It Sy "config.debug (bool) r-" .Bd -ragged -offset indent -compact @@ -1442,7 +1441,7 @@ Attempt to read or write void value, or attempt to write read-only value. A memory allocation failure occurred. .It Bq Er EFAULT An interface with side effects failed in some way not directly related to -.Fn mallctl* +.Fn @jemalloc_prefix@mallctl* read/write processing. .El .Sh ENVIRONMENT diff --git a/jemalloc/include/jemalloc/internal/chunk.h b/jemalloc/include/jemalloc/internal/chunk.h index 1f6abf78..d7955298 100644 --- a/jemalloc/include/jemalloc/internal/chunk.h +++ b/jemalloc/include/jemalloc/internal/chunk.h @@ -39,13 +39,17 @@ extern malloc_mutex_t chunks_mtx; extern chunk_stats_t stats_chunks; #endif +#ifdef JEMALLOC_IVSALLOC +extern rtree_t *chunks_rtree; +#endif + extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; extern size_t arena_chunk_header_npages; extern size_t arena_maxclass; /* Max size class for arenas. */ -void *chunk_alloc(size_t size, bool *zero); +void *chunk_alloc(size_t size, bool base, bool *zero); void chunk_dealloc(void *chunk, size_t size); bool chunk_boot(void); diff --git a/jemalloc/include/jemalloc/internal/chunk_mmap.h b/jemalloc/include/jemalloc/internal/chunk_mmap.h index dc52448c..07b50a4d 100644 --- a/jemalloc/include/jemalloc/internal/chunk_mmap.h +++ b/jemalloc/include/jemalloc/internal/chunk_mmap.h @@ -13,6 +13,8 @@ void *chunk_alloc_mmap(size_t size); void *chunk_alloc_mmap_noreserve(size_t size); void chunk_dealloc_mmap(void *chunk, size_t size); +bool chunk_mmap_boot(void); + #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 2c3f32f1..a8d27fa7 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -27,6 +27,13 @@ #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" +#ifdef JEMALLOC_ZONE +#include +#include +#include +#include +#endif + #ifdef JEMALLOC_LAZY_LOCK #include #endif @@ -159,6 +166,16 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT)) #define STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1)) +#ifdef PAGE_SHIFT +# undef PAGE_SHIFT +#endif +#ifdef PAGE_SIZE +# undef PAGE_SIZE +#endif +#ifdef PAGE_MASK +# undef PAGE_MASK +#endif + #ifdef DYNAMIC_PAGE_SHIFT # define PAGE_SHIFT lg_pagesize # define PAGE_SIZE pagesize @@ -184,9 +201,13 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/prof.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #undef JEMALLOC_H_TYPES /******************************************************************************/ @@ -203,9 +224,13 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include 
"jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/prof.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #undef JEMALLOC_H_STRUCTS /******************************************************************************/ @@ -240,8 +265,19 @@ extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */ * Map of pthread_self() --> arenas[???], used for selecting an arena to use * for allocations. */ -extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); +extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define ARENA_GET() arenas_tls +# define ARENA_SET(v) do { \ + arenas_tls = (v); \ +} while (0) +#else +extern pthread_key_t arenas_tsd; +# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) +# define ARENA_SET(v) do { \ + pthread_setspecific(arenas_tsd, (void *)(v)); \ +} while (0) #endif + /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. @@ -250,9 +286,9 @@ extern arena_t **arenas; extern unsigned narenas; arena_t *arenas_extend(unsigned ind); -#ifndef NO_TLS arena_t *choose_arena_hard(void); -#endif +void jemalloc_prefork(void); +void jemalloc_postfork(void); #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" @@ -265,9 +301,13 @@ arena_t *choose_arena_hard(void); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/prof.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #undef JEMALLOC_H_EXTERNS /******************************************************************************/ @@ -285,11 +325,30 @@ arena_t *choose_arena_hard(void); #include "jemalloc/internal/huge.h" #ifndef JEMALLOC_ENABLE_INLINE +size_t pow2_ceil(size_t x); void malloc_write(const char *s); arena_t *choose_arena(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +/* Compute the smallest power of 2 that is >= x. */ +JEMALLOC_INLINE size_t +pow2_ceil(size_t x) +{ + + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; +#if (LG_SIZEOF_PTR == 3) + x |= x >> 32; +#endif + x++; + return (x); +} + /* * Wrapper around malloc_message() that avoids the need for * JEMALLOC_P(malloc_message)(...) throughout the code. @@ -310,76 +369,33 @@ choose_arena(void) { arena_t *ret; - /* - * We can only use TLS if this is a PIC library, since for the static - * library version, libc's malloc is used by TLS allocation, which - * introduces a bootstrapping issue. - */ -#ifndef NO_TLS - ret = arenas_map; + ret = ARENA_GET(); if (ret == NULL) { ret = choose_arena_hard(); assert(ret != NULL); } -#else - if (isthreaded && narenas > 1) { - unsigned long ind; - /* - * Hash pthread_self() to one of the arenas. There is a prime - * number of arenas, so this has a reasonable chance of - * working. Even so, the hashing can be easily thwarted by - * inconvenient pthread_self() values. Without specific - * knowledge of how pthread_self() calculates values, we can't - * easily do much better than this. 
- */ - ind = (unsigned long) pthread_self() % narenas; - - /* - * Optimistially assume that arenas[ind] has been initialized. - * At worst, we find out that some other thread has already - * done so, after acquiring the lock in preparation. Note that - * this lazy locking also has the effect of lazily forcing - * cache coherency; without the lock acquisition, there's no - * guarantee that modification of arenas[ind] by another thread - * would be seen on this CPU for an arbitrary amount of time. - * - * In general, this approach to modifying a synchronized value - * isn't a good idea, but in this case we only ever modify the - * value once, so things work out well. - */ - ret = arenas[ind]; - if (ret == NULL) { - /* - * Avoid races with another thread that may have already - * initialized arenas[ind]. - */ - malloc_mutex_lock(&arenas_lock); - if (arenas[ind] == NULL) - ret = arenas_extend((unsigned)ind); - else - ret = arenas[ind]; - malloc_mutex_unlock(&arenas_lock); - } - } else - ret = arenas[0]; -#endif - - assert(ret != NULL); return (ret); } #endif +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/prof.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #ifndef JEMALLOC_ENABLE_INLINE void *imalloc(size_t size); void *icalloc(size_t size); void *ipalloc(size_t alignment, size_t size); size_t isalloc(const void *ptr); +# ifdef JEMALLOC_IVSALLOC +size_t ivsalloc(const void *ptr); +# endif void *iralloc(void *ptr, size_t size); void idalloc(void *ptr); #endif @@ -526,6 +542,19 @@ isalloc(const void *ptr) return (ret); } +#ifdef JEMALLOC_IVSALLOC +JEMALLOC_INLINE size_t +ivsalloc(const void *ptr) +{ + + /* Return 0 if ptr is not within a chunk managed by jemalloc. */ + if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == NULL) + return (0); + + return (isalloc(ptr)); +} +#endif + JEMALLOC_INLINE void * iralloc(void *ptr, size_t size) { diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index 108bfa8a..81134153 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -3,6 +3,12 @@ typedef pthread_mutex_t malloc_mutex_t; +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +# define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +#else +# define MALLOC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +#endif + #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h new file mode 100644 index 00000000..9d58ebac --- /dev/null +++ b/jemalloc/include/jemalloc/internal/rtree.h @@ -0,0 +1,161 @@ +/* + * This radix tree implementation is tailored to the singular purpose of + * tracking which chunks are currently owned by jemalloc. This functionality + * is mandatory for OS X, where jemalloc must be able to respond to object + * ownership queries. + * + ******************************************************************************* + */ +#ifdef JEMALLOC_H_TYPES + +typedef struct rtree_s rtree_t; + +/* + * Size of each radix tree node (must be a power of 2). This impacts tree + * depth. 
+ */ +#if (LG_SIZEOF_PTR == 2) +# define RTREE_NODESIZE (1U << 14) +#else +# define RTREE_NODESIZE CACHELINE +#endif + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +struct rtree_s { + malloc_mutex_t mutex; + void **root; + unsigned height; + unsigned level2bits[1]; /* Dynamically sized. */ +}; + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +rtree_t *rtree_new(unsigned bits); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +#ifndef JEMALLOC_DEBUG +void *rtree_get_locked(rtree_t *rtree, uintptr_t key); +#endif +void *rtree_get(rtree_t *rtree, uintptr_t key); +bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_)) +#define RTREE_GET_GENERATE(f) \ +/* The least significant bits of the key are ignored. */ \ +JEMALLOC_INLINE void * \ +f(rtree_t *rtree, uintptr_t key) \ +{ \ + void *ret; \ + uintptr_t subkey; \ + unsigned i, lshift, height, bits; \ + void **node, **child; \ + \ + RTREE_LOCK(&rtree->mutex); \ + for (i = lshift = 0, height = rtree->height, node = rtree->root;\ + i < height - 1; \ + i++, lshift += bits, node = child) { \ + bits = rtree->level2bits[i]; \ + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \ + 3)) - bits); \ + child = (void**)node[subkey]; \ + if (child == NULL) { \ + RTREE_UNLOCK(&rtree->mutex); \ + return (NULL); \ + } \ + } \ + \ + /* \ + * node is a leaf, so it contains values rather than node \ + * pointers. \ + */ \ + bits = rtree->level2bits[i]; \ + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \ + bits); \ + ret = node[subkey]; \ + RTREE_UNLOCK(&rtree->mutex); \ + \ + RTREE_GET_VALIDATE \ + return (ret); \ +} + +#ifdef JEMALLOC_DEBUG +# define RTREE_LOCK(l) malloc_mutex_lock(l) +# define RTREE_UNLOCK(l) malloc_mutex_unlock(l) +# define RTREE_GET_VALIDATE +RTREE_GET_GENERATE(rtree_get_locked) +# undef RTREE_LOCK +# undef RTREE_UNLOCK +# undef RTREE_GET_VALIDATE +#endif + +#define RTREE_LOCK(l) +#define RTREE_UNLOCK(l) +#ifdef JEMALLOC_DEBUG + /* + * Suppose that it were possible for a jemalloc-allocated chunk to be + * munmap()ped, followed by a different allocator in another thread re-using + * overlapping virtual memory, all without invalidating the cached rtree + * value. The result would be a false positive (the rtree would claim that + * jemalloc owns memory that it had actually discarded). This scenario + * seems impossible, but the following assertion is a prudent sanity check. 
+ */ +# define RTREE_GET_VALIDATE \ + assert(rtree_get_locked(rtree, key) == ret); +#else +# define RTREE_GET_VALIDATE +#endif +RTREE_GET_GENERATE(rtree_get) +#undef RTREE_LOCK +#undef RTREE_UNLOCK +#undef RTREE_GET_VALIDATE + +JEMALLOC_INLINE bool +rtree_set(rtree_t *rtree, uintptr_t key, void *val) +{ + uintptr_t subkey; + unsigned i, lshift, height, bits; + void **node, **child; + + malloc_mutex_lock(&rtree->mutex); + for (i = lshift = 0, height = rtree->height, node = rtree->root; + i < height - 1; + i++, lshift += bits, node = child) { + bits = rtree->level2bits[i]; + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - + bits); + child = (void**)node[subkey]; + if (child == NULL) { + child = (void**)base_alloc(sizeof(void *) << + rtree->level2bits[i+1]); + if (child == NULL) { + malloc_mutex_unlock(&rtree->mutex); + return (true); + } + memset(child, 0, sizeof(void *) << + rtree->level2bits[i+1]); + node[subkey] = child; + } + } + + /* node is a leaf, so it contains values rather than node pointers. */ + bits = rtree->level2bits[i]; + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits); + node[subkey] = val; + malloc_mutex_unlock(&rtree->mutex); + + return (false); +} +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index a8be436d..df302fba 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -65,8 +65,21 @@ extern ssize_t opt_lg_tcache_maxclass; extern ssize_t opt_lg_tcache_gc_sweep; /* Map of thread-specific caches. */ +#ifndef NO_TLS extern __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define TCACHE_GET() tcache_tls +# define TCACHE_SET(v) do { \ + tcache_tls = (v); \ + pthread_setspecific(tcache_tsd, (void *)(v)); \ +} while (0) +#else +extern pthread_key_t tcache_tsd; +# define TCACHE_GET() ((tcache_t *)pthread_getspecific(tcache_tsd)) +# define TCACHE_SET(v) do { \ + pthread_setspecific(tcache_tsd, (void *)(v)); \ +} while (0) +#endif /* * Number of tcache bins. There are nbins small-object bins, plus 0 or more @@ -122,14 +135,23 @@ tcache_get(void) if ((isthreaded & opt_tcache) == false) return (NULL); - tcache = tcache_tls; - if ((uintptr_t)tcache <= (uintptr_t)1) { + tcache = TCACHE_GET(); + if ((uintptr_t)tcache <= (uintptr_t)2) { if (tcache == NULL) { tcache = tcache_create(choose_arena()); if (tcache == NULL) return (NULL); - } else + } else { + if (tcache == (void *)(uintptr_t)1) { + /* + * Make a note that an allocator function was + * called after the tcache_thread_cleanup() was + * called. + */ + TCACHE_SET((uintptr_t)2); + } return (NULL); + } } return (tcache); diff --git a/jemalloc/include/jemalloc/internal/zone.h b/jemalloc/include/jemalloc/internal/zone.h new file mode 100644 index 00000000..859b529d --- /dev/null +++ b/jemalloc/include/jemalloc/internal/zone.h @@ -0,0 +1,23 @@ +#ifndef JEMALLOC_ZONE +# error "This source file is for zones on Darwin (OS X)." 
+#endif +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +malloc_zone_t *create_zone(void); +void szone2ozone(malloc_zone_t *zone); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 8b98d670..eed33a64 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -92,6 +92,34 @@ /* TLS is used to map arenas and magazine caches to threads. */ #undef NO_TLS +/* + * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside + * within jemalloc-owned chunks before dereferencing them. + */ +#undef JEMALLOC_IVSALLOC + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#undef JEMALLOC_ZONE +#undef JEMALLOC_ZONE_VERSION + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched. + * madvise(..., MADV_FREE) : On FreeBSD, this marks pages as being unused, + * such that they will be discarded rather than + * swapped out. + * msync(..., MS_KILLPAGES) : On Darwin, this behaves similarly to + * madvise(..., MADV_FREE) on FreeBSD. + */ +#undef JEMALLOC_PURGE_MADVISE_DONTNEED +#undef JEMALLOC_PURGE_MADVISE_FREE +#undef JEMALLOC_PURGE_MSYNC_KILLPAGES + /* sizeof(void *) == 2^LG_SIZEOF_PTR. */ #undef LG_SIZEOF_PTR diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index ee859fcb..db3d4010 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -181,9 +181,6 @@ static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t size, size_t oldsize); static bool arena_ralloc_large(void *ptr, size_t size, size_t oldsize); -#ifdef JEMALLOC_TINY -static size_t pow2_ceil(size_t x); -#endif static bool small_size2bin_init(void); #ifdef JEMALLOC_DEBUG static void small_size2bin_validate(void); @@ -426,7 +423,7 @@ arena_chunk_alloc(arena_t *arena) zero = false; malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc(chunksize, &zero); + chunk = (arena_chunk_t *)chunk_alloc(chunksize, false, &zero); malloc_mutex_lock(&arena->lock); if (chunk == NULL) return (NULL); @@ -606,10 +603,18 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) ql_new(&mapelms); flag_zeroed = -#ifdef JEMALLOC_SWAP +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED + /* + * madvise(..., MADV_DONTNEED) results in zero-filled pages for anonymous + * mappings, but not for file-backed mappings. + */ +# ifdef JEMALLOC_SWAP swap_enabled ? 
0 : -#endif +# endif CHUNK_MAP_ZEROED; +#else + 0; +#endif /* * If chunk is the spare, temporarily re-allocate it, 1) so that its @@ -649,9 +654,6 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) /* * Update internal elements in the page map, so * that CHUNK_MAP_ZEROED is properly set. - * madvise(..., MADV_DONTNEED) results in - * zero-filled pages for anonymous mappings, - * but not for file-backed mappings. */ mapelm->bits = (npages << PAGE_SHIFT) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | @@ -715,8 +717,20 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) assert(ndirty >= npages); ndirty -= npages; #endif + +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), (npages << PAGE_SHIFT), MADV_DONTNEED); +#elif defined(JEMALLOC_PURGE_MADVISE_FREE) + madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), + (npages << PAGE_SHIFT), MADV_FREE); +#elif defined(JEMALLOC_PURGE_MSYNC_KILLPAGES) + msync((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), + (npages << PAGE_SHIFT), MS_KILLPAGES); +#else +# error "No method defined for purging unused dirty pages." +#endif + #ifdef JEMALLOC_STATS nmadvise++; #endif @@ -2239,26 +2253,6 @@ arena_new(arena_t *arena, unsigned ind) return (false); } -#ifdef JEMALLOC_TINY -/* Compute the smallest power of 2 that is >= x. */ -static size_t -pow2_ceil(size_t x) -{ - - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if (SIZEOF_PTR == 8) - x |= x >> 32; -#endif - x++; - return (x); -} -#endif - #ifdef JEMALLOC_DEBUG static void small_size2bin_validate(void) diff --git a/jemalloc/src/base.c b/jemalloc/src/base.c index 605197ea..cc85e849 100644 --- a/jemalloc/src/base.c +++ b/jemalloc/src/base.c @@ -32,7 +32,7 @@ base_pages_alloc(size_t minsize) assert(minsize != 0); csize = CHUNK_CEILING(minsize); zero = false; - base_pages = chunk_alloc(csize, &zero); + base_pages = chunk_alloc(csize, true, &zero); if (base_pages == NULL) return (true); base_next_addr = base_pages; diff --git a/jemalloc/src/chunk.c b/jemalloc/src/chunk.c index e6e3bcd1..5cb99615 100644 --- a/jemalloc/src/chunk.c +++ b/jemalloc/src/chunk.c @@ -14,6 +14,10 @@ malloc_mutex_t chunks_mtx; chunk_stats_t stats_chunks; #endif +#ifdef JEMALLOC_IVSALLOC +rtree_t *chunks_rtree; +#endif + /* Various chunk-related settings. */ size_t chunksize; size_t chunksize_mask; /* (chunksize - 1). */ @@ -30,7 +34,7 @@ size_t arena_maxclass; /* Max size class for arenas. */ * advantage of them if they are returned. */ void * -chunk_alloc(size_t size, bool *zero) +chunk_alloc(size_t size, bool base, bool *zero) { void *ret; @@ -63,6 +67,14 @@ chunk_alloc(size_t size, bool *zero) /* All strategies for allocation failed. */ ret = NULL; RETURN: +#ifdef JEMALLOC_IVSALLOC + if (base == false && ret != NULL) { + if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) { + chunk_dealloc(ret, size); + return (NULL); + } + } +#endif #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) if (ret != NULL) { # ifdef JEMALLOC_PROF @@ -104,6 +116,9 @@ chunk_dealloc(void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); +#ifdef JEMALLOC_IVSALLOC + rtree_set(chunks_rtree, (uintptr_t)chunk, NULL); +#endif #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) malloc_mutex_lock(&chunks_mtx); stats_chunks.curchunks -= (size / chunksize); @@ -126,21 +141,27 @@ chunk_boot(void) { /* Set variables according to the value of opt_lg_chunk. 
*/ - chunksize = (1LU << opt_lg_chunk); + chunksize = (ZU(1) << opt_lg_chunk); assert(chunksize >= PAGE_SIZE); chunksize_mask = chunksize - 1; chunk_npages = (chunksize >> PAGE_SHIFT); +#ifdef JEMALLOC_IVSALLOC + chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk); + if (chunks_rtree == NULL) + return (true); +#endif #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) if (malloc_mutex_init(&chunks_mtx)) return (true); memset(&stats_chunks, 0, sizeof(chunk_stats_t)); #endif - #ifdef JEMALLOC_SWAP if (chunk_swap_boot()) return (true); #endif + if (chunk_mmap_boot()) + return (true); #ifdef JEMALLOC_DSS if (chunk_dss_boot()) return (true); diff --git a/jemalloc/src/chunk_mmap.c b/jemalloc/src/chunk_mmap.c index d9f9e86d..a3d09e9a 100644 --- a/jemalloc/src/chunk_mmap.c +++ b/jemalloc/src/chunk_mmap.c @@ -6,19 +6,22 @@ /* * Used by chunk_alloc_mmap() to decide whether to attempt the fast path and - * potentially avoid some system calls. We can get away without TLS here, - * since the state of mmap_unaligned only affects performance, rather than - * correct function. + * potentially avoid some system calls. */ -static #ifndef NO_TLS - __thread +static __thread bool mmap_unaligned_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +#define MMAP_UNALIGNED_GET() mmap_unaligned_tls +#define MMAP_UNALIGNED_SET(v) do { \ + mmap_unaligned_tls = (v); \ +} while (0) +#else +static pthread_key_t mmap_unaligned_tsd; +#define MMAP_UNALIGNED_GET() ((bool)pthread_getspecific(mmap_unaligned_tsd)) +#define MMAP_UNALIGNED_SET(v) do { \ + pthread_setspecific(mmap_unaligned_tsd, (void *)(v)); \ +} while (0) #endif - bool mmap_unaligned -#ifndef NO_TLS - JEMALLOC_ATTR(tls_model("initial-exec")) -#endif - ; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -128,7 +131,7 @@ chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve) * method. */ if (unaligned == false) - mmap_unaligned = false; + MMAP_UNALIGNED_SET(false); return (ret); } @@ -166,7 +169,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) * fast method next time. */ - if (mmap_unaligned == false) { + if (MMAP_UNALIGNED_GET() == false) { size_t offset; ret = pages_map(NULL, size, noreserve); @@ -175,7 +178,7 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) offset = CHUNK_ADDR2OFFSET(ret); if (offset != 0) { - mmap_unaligned = true; + MMAP_UNALIGNED_SET(true); /* Try to extend chunk boundary. */ if (pages_map((void *)((uintptr_t)ret + size), chunksize - offset, noreserve) == NULL) { @@ -184,7 +187,8 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve) * the reliable-but-expensive method. */ pages_unmap(ret, size); - ret = chunk_alloc_mmap_slow(size, true, noreserve); + ret = chunk_alloc_mmap_slow(size, true, + noreserve); } else { /* Clean up unneeded leading space. 
*/ pages_unmap(ret, chunksize - offset); @@ -216,3 +220,17 @@ chunk_dealloc_mmap(void *chunk, size_t size) pages_unmap(chunk, size); } + +bool +chunk_mmap_boot(void) +{ + +#ifdef NO_TLS + if (pthread_key_create(&mmap_unaligned_tsd, NULL) != 0) { + malloc_write(": Error in pthread_key_create()\n"); + return (true); + } +#endif + + return (false); +} diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 128883f8..64913067 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -41,9 +41,7 @@ CTL_PROTO(epoch) #ifdef JEMALLOC_TCACHE CTL_PROTO(tcache_flush) #endif -#ifndef NO_TLS CTL_PROTO(thread_arena) -#endif CTL_PROTO(config_debug) CTL_PROTO(config_dss) CTL_PROTO(config_dynamic_page_shift) @@ -213,11 +211,9 @@ static const ctl_node_t tcache_node[] = { }; #endif -#ifndef NO_TLS static const ctl_node_t thread_node[] = { {NAME("arena"), CTL(thread_arena)} }; -#endif static const ctl_node_t config_node[] = { {NAME("debug"), CTL(config_debug)}, @@ -457,9 +453,7 @@ static const ctl_node_t root_node[] = { #ifdef JEMALLOC_TCACHE {NAME("tcache"), CHILD(tcache)}, #endif -#ifndef NO_TLS {NAME("thread"), CHILD(thread)}, -#endif {NAME("config"), CHILD(config)}, {NAME("opt"), CHILD(opt)}, {NAME("arenas"), CHILD(arenas)}, @@ -1040,13 +1034,13 @@ tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, VOID(); - tcache = tcache_tls; + tcache = TCACHE_GET(); if (tcache == NULL) { ret = 0; goto RETURN; } tcache_destroy(tcache); - tcache_tls = NULL; + TCACHE_SET(NULL); ret = 0; RETURN: @@ -1054,7 +1048,6 @@ RETURN: } #endif -#ifndef NO_TLS static int thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) @@ -1085,14 +1078,13 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, } /* Set new arena association. */ - arenas_map = arena; + ARENA_SET(arena); } ret = 0; RETURN: return (ret); } -#endif /******************************************************************************/ diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c index 49962ea0..be35d16f 100644 --- a/jemalloc/src/huge.c +++ b/jemalloc/src/huge.c @@ -37,7 +37,7 @@ huge_malloc(size_t size, bool zero) if (node == NULL) return (NULL); - ret = chunk_alloc(csize, &zero); + ret = chunk_alloc(csize, false, &zero); if (ret == NULL) { base_node_dealloc(node); return (NULL); @@ -99,7 +99,7 @@ huge_palloc(size_t alignment, size_t size) return (NULL); zero = false; - ret = chunk_alloc(alloc_size, &zero); + ret = chunk_alloc(alloc_size, false, &zero); if (ret == NULL) { base_node_dealloc(node); return (NULL); diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index b36590dd..ebce3ca0 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -89,12 +89,12 @@ malloc_mutex_t arenas_lock; arena_t **arenas; unsigned narenas; -#ifndef NO_TLS static unsigned next_arena; -#endif #ifndef NO_TLS -__thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); +__thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); +#else +pthread_key_t arenas_tsd; #endif /* Set to true once the allocator has been initialized. */ @@ -104,7 +104,7 @@ static bool malloc_initialized = false; static pthread_t malloc_initializer = (unsigned long)0; /* Used to avoid initialization races. 
*/ -static malloc_mutex_t init_lock = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP; +static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; #ifdef DYNAMIC_PAGE_SHIFT size_t pagesize; @@ -146,8 +146,6 @@ static void wrtmessage(void *cbopaque, const char *s); static void stats_print_atexit(void); static unsigned malloc_ncpus(void); static bool malloc_init_hard(void); -static void jemalloc_prefork(void); -static void jemalloc_postfork(void); /******************************************************************************/ /* malloc_message() setup. */ @@ -200,7 +198,6 @@ arenas_extend(unsigned ind) return (arenas[0]); } -#ifndef NO_TLS /* * Choose an arena based on a per-thread value (slow-path code only, called * only by choose_arena()). @@ -219,11 +216,10 @@ choose_arena_hard(void) } else ret = arenas[0]; - arenas_map = ret; + ARENA_SET(ret); return (ret); } -#endif static void stats_print_atexit(void) @@ -697,14 +693,12 @@ MALLOC_OUT: return (true); } -#ifndef NO_TLS /* * Assign the initial arena to the initial thread, in order to avoid * spurious creation of an extra arena if the application switches to * threaded mode. */ - arenas_map = arenas[0]; -#endif + ARENA_SET(arenas[0]); malloc_mutex_init(&arenas_lock); @@ -748,35 +742,13 @@ MALLOC_OUT: narenas = 1; } -#ifdef NO_TLS - if (narenas > 1) { - static const unsigned primes[] = {1, 3, 5, 7, 11, 13, 17, 19, - 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, - 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, - 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, - 223, 227, 229, 233, 239, 241, 251, 257, 263}; - unsigned nprimes, parenas; - - /* - * Pick a prime number of hash arenas that is more than narenas - * so that direct hashing of pthread_self() pointers tends to - * spread allocations evenly among the arenas. - */ - assert((narenas & 1) == 0); /* narenas must be even. */ - nprimes = (sizeof(primes) >> LG_SIZEOF_INT); - parenas = primes[nprimes - 1]; /* In case not enough primes. */ - for (i = 1; i < nprimes; i++) { - if (primes[i] > narenas) { - parenas = primes[i]; - break; - } - } - narenas = parenas; - } -#endif - -#ifndef NO_TLS next_arena = (narenas > 0) ? 1 : 0; + +#ifdef NO_TLS + if (pthread_key_create(&arenas_tsd, NULL) != 0) { + malloc_mutex_unlock(&init_lock); + return (true); + } #endif /* Allocate and initialize arenas. */ @@ -793,11 +765,35 @@ MALLOC_OUT: /* Copy the pointer to the one arena that was already initialized. */ arenas[0] = init_arenas[0]; +#ifdef JEMALLOC_ZONE + /* Register the custom zone. */ + malloc_zone_register(create_zone()); + + /* + * Convert the default szone to an "overlay zone" that is capable of + * deallocating szone-allocated objects, but allocating new objects + * from jemalloc. + */ + szone2ozone(malloc_default_zone()); +#endif + malloc_initialized = true; malloc_mutex_unlock(&init_lock); return (false); } + +#ifdef JEMALLOC_ZONE +JEMALLOC_ATTR(constructor) +void +jemalloc_darwin_init(void) +{ + + if (malloc_init_hard()) + abort(); +} +#endif + /* * End initialization functions. */ @@ -1219,8 +1215,12 @@ JEMALLOC_P(malloc_usable_size)(const void *ptr) { size_t ret; +#ifdef JEMALLOC_IVSALLOC + ret = ivsalloc(ptr); +#else assert(ptr != NULL); ret = isalloc(ptr); +#endif return (ret); } @@ -1298,11 +1298,13 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, * is threaded here. */ -static void +void jemalloc_prefork(void) { unsigned i; + assert(isthreaded); + /* Acquire all mutexes in a safe order. 
*/ malloc_mutex_lock(&arenas_lock); @@ -1324,11 +1326,13 @@ jemalloc_prefork(void) #endif } -static void +void jemalloc_postfork(void) { unsigned i; + assert(isthreaded); + /* Release all mutexes, now that fork() has completed. */ #ifdef JEMALLOC_SWAP @@ -1349,3 +1353,5 @@ jemalloc_postfork(void) } malloc_mutex_unlock(&arenas_lock); } + +/******************************************************************************/ diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c index 3b6081a4..337312bd 100644 --- a/jemalloc/src/mutex.c +++ b/jemalloc/src/mutex.c @@ -59,7 +59,11 @@ malloc_mutex_init(malloc_mutex_t *mutex) if (pthread_mutexattr_init(&attr) != 0) return (true); +#ifdef PTHREAD_MUTEX_ADAPTIVE_NP pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); +#else + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT); +#endif if (pthread_mutex_init(mutex, &attr) != 0) { pthread_mutexattr_destroy(&attr); return (true); diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 6d6910ed..e70b1325 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -45,7 +45,19 @@ static malloc_mutex_t bt2ctx_mtx; * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t * objects. */ +#ifndef NO_TLS static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define BT2CNT_GET() bt2cnt_tls +# define BT2CNT_SET(v) do { \ + bt2cnt_tls = (v); \ + pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ +} while (0) +#else +# define BT2CNT_GET() ((ckh_t *)pthread_getspecific(bt2cnt_tsd)) +# define BT2CNT_SET(v) do { \ + pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ +} while (0) +#endif /* * Same contents as b2cnt_tls, but initialized such that the TSD destructor is @@ -57,12 +69,45 @@ static pthread_key_t bt2cnt_tsd; /* (1U << opt_lg_prof_bt_max). */ static unsigned prof_bt_max; -static __thread uint64_t prof_sample_prn_state - JEMALLOC_ATTR(tls_model("initial-exec")); -static __thread uint64_t prof_sample_threshold - JEMALLOC_ATTR(tls_model("initial-exec")); -static __thread uint64_t prof_sample_accum +typedef struct prof_sample_state_s prof_sample_state_t; +struct prof_sample_state_s { + uint64_t prn_state; + uint64_t threshold; + uint64_t accum; +}; + +#ifndef NO_TLS +static __thread prof_sample_state_t prof_sample_state_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_SAMPLE_STATE_GET(r) do { \ + r = &prof_sample_state_tls; \ +} while (0) +#else +static pthread_key_t prof_sample_state_tsd; +/* Used only if an OOM error occurs in PROF_SAMPLE_STATE_GET(). */ +prof_sample_state_t prof_sample_state_oom; +# define PROF_SAMPLE_STATE_GET(r) do { \ + r = (prof_sample_state_t *)pthread_getspecific( \ + prof_sample_state_tsd); \ + if (r == NULL) { \ + r = ipalloc(CACHELINE, sizeof(prof_sample_state_t)); \ + if (r == NULL) { \ + malloc_write(": Error in heap " \ + "profiler: out of memory; subsequent heap " \ + "profiles may be inaccurate\n"); \ + if (opt_abort) \ + abort(); \ + /* Failure is not an option... 
*/ \ + r = &prof_sample_state_oom; \ + } \ + pthread_setspecific(prof_sample_state_tsd, (void *)r); \ + } \ +} while (0) +# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) +# define ARENA_SET(v) do { \ + pthread_setspecific(arenas_tsd, (void *)(v)); \ +} while (0) +#endif static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; @@ -116,6 +161,9 @@ static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); static void bt2cnt_thread_cleanup(void *arg); +#ifdef NO_TLS +static void prof_sample_state_thread_cleanup(void *arg); +#endif /******************************************************************************/ @@ -436,7 +484,7 @@ static prof_thr_cnt_t * prof_lookup(prof_bt_t *bt) { prof_thr_cnt_t *ret; - ckh_t *bt2cnt = bt2cnt_tls; + ckh_t *bt2cnt = BT2CNT_GET(); if (bt2cnt == NULL) { /* Initialize an empty cache for this thread. */ @@ -448,8 +496,8 @@ prof_lookup(prof_bt_t *bt) idalloc(bt2cnt); return (NULL); } - bt2cnt_tls = bt2cnt; - pthread_setspecific(bt2cnt_tsd, bt2cnt); + + BT2CNT_SET(bt2cnt); } if (ckh_search(bt2cnt, bt, NULL, (void **)&ret)) { @@ -519,15 +567,17 @@ prof_sample_threshold_update(void) { uint64_t r; double u; + prof_sample_state_t *prof_sample_state; /* * Compute prof_sample_threshold as a geometrically distributed random * variable with mean (2^opt_lg_prof_sample). */ - prn64(r, 53, prof_sample_prn_state, (uint64_t)1125899906842625LLU, - 1058392653243283975); + PROF_SAMPLE_STATE_GET(prof_sample_state); + prn64(r, 53, prof_sample_state->prn_state, + (uint64_t)1125899906842625LLU, 1058392653243283975); u = (double)r * (1.0/9007199254740992.0L); - prof_sample_threshold = (uint64_t)(log(u) / + prof_sample_state->threshold = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + (uint64_t)1U; } @@ -551,26 +601,31 @@ prof_alloc_prep(size_t size) prof_backtrace(&bt, 2, prof_bt_max); ret = prof_lookup(&bt); } else { - if (prof_sample_threshold == 0) { + prof_sample_state_t *prof_sample_state; + + PROF_SAMPLE_STATE_GET(prof_sample_state); + if (prof_sample_state->threshold == 0) { /* * Initialize. Seed the prng differently for each * thread. */ - prof_sample_prn_state = (uint64_t)(uintptr_t)&size; + prof_sample_state->prn_state = + (uint64_t)(uintptr_t)&size; prof_sample_threshold_update(); } /* * Determine whether to capture a backtrace based on whether * size is enough for prof_accum to reach - * prof_sample_threshold. However, delay updating these + * prof_sample_state->threshold. However, delay updating these * variables until prof_{m,re}alloc(), because we don't know * for sure that the allocation will succeed. * * Use subtraction rather than addition to avoid potential * integer overflow. */ - if (size >= prof_sample_threshold - prof_sample_accum) { + if (size >= prof_sample_state->threshold - + prof_sample_state->accum) { bt_init(&bt, vec); prof_backtrace(&bt, 2, prof_bt_max); ret = prof_lookup(&bt); @@ -621,21 +676,26 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) static inline void prof_sample_accum_update(size_t size) { + prof_sample_state_t *prof_sample_state; /* Sampling logic is unnecessary if the interval is 1. */ assert(opt_lg_prof_sample != 0); /* Take care to avoid integer overflow. 
*/ - if (size >= prof_sample_threshold - prof_sample_accum) { - prof_sample_accum -= (prof_sample_threshold - size); + PROF_SAMPLE_STATE_GET(prof_sample_state); + if (size >= prof_sample_state->threshold - prof_sample_state->accum) { + prof_sample_state->accum -= (prof_sample_state->threshold - + size); /* Compute new prof_sample_threshold. */ prof_sample_threshold_update(); - while (prof_sample_accum >= prof_sample_threshold) { - prof_sample_accum -= prof_sample_threshold; + while (prof_sample_state->accum >= + prof_sample_state->threshold) { + prof_sample_state->accum -= + prof_sample_state->threshold; prof_sample_threshold_update(); } } else - prof_sample_accum += size; + prof_sample_state->accum += size; } void @@ -1244,7 +1304,7 @@ bt2cnt_thread_cleanup(void *arg) { ckh_t *bt2cnt; - bt2cnt = bt2cnt_tls; + bt2cnt = BT2CNT_GET(); if (bt2cnt != NULL) { ql_head(prof_thr_cnt_t) cnts_ql; size_t tabind; @@ -1278,7 +1338,7 @@ bt2cnt_thread_cleanup(void *arg) */ ckh_delete(bt2cnt); idalloc(bt2cnt); - bt2cnt_tls = NULL; + BT2CNT_SET(NULL); /* Delete cnt's. */ while ((cnt = ql_last(&cnts_ql, link)) != NULL) { @@ -1288,6 +1348,17 @@ bt2cnt_thread_cleanup(void *arg) } } +#ifdef NO_TLS +static void +prof_sample_state_thread_cleanup(void *arg) +{ + prof_sample_state_t *prof_sample_state = (prof_sample_state_t *)arg; + + if (prof_sample_state != &prof_sample_state_oom) + idalloc(prof_sample_state); +} +#endif + void prof_boot0(void) { @@ -1332,6 +1403,14 @@ prof_boot1(void) ": Error in pthread_key_create()\n"); abort(); } +#ifdef NO_TLS + if (pthread_key_create(&prof_sample_state_tsd, + prof_sample_state_thread_cleanup) != 0) { + malloc_write( + ": Error in pthread_key_create()\n"); + abort(); + } +#endif prof_bt_max = (1U << opt_lg_prof_bt_max); if (malloc_mutex_init(&prof_dump_seq_mtx)) diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c new file mode 100644 index 00000000..a583751d --- /dev/null +++ b/jemalloc/src/rtree.c @@ -0,0 +1,42 @@ +#define RTREE_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +rtree_t * +rtree_new(unsigned bits) +{ + rtree_t *ret; + unsigned bits_per_level, height, i; + + bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1; + height = bits / bits_per_level; + if (height * bits_per_level != bits) + height++; + assert(height * bits_per_level >= bits); + + ret = (rtree_t*)base_alloc(sizeof(rtree_t) + (sizeof(unsigned) * + (height - 1))); + if (ret == NULL) + return (NULL); + memset(ret, 0, sizeof(rtree_t) + (sizeof(unsigned) * (height - 1))); + + malloc_mutex_init(&ret->mutex); + ret->height = height; + if (bits_per_level * height > bits) + ret->level2bits[0] = bits % bits_per_level; + else + ret->level2bits[0] = bits_per_level; + for (i = 1; i < height; i++) + ret->level2bits[i] = bits_per_level; + + ret->root = (void**)base_alloc(sizeof(void *) << ret->level2bits[0]); + if (ret->root == NULL) { + /* + * We leak the rtree here, since there's no generic base + * deallocation. + */ + return (NULL); + } + memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]); + + return (ret); +} diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index ace24cea..86343835 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -9,13 +9,15 @@ ssize_t opt_lg_tcache_maxclass = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; /* Map of thread-specific caches. 
*/ +#ifndef NO_TLS __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); +#endif /* * Same contents as tcache, but initialized such that the TSD destructor is * called when a thread exits, so that the cache can be cleaned up. */ -static pthread_key_t tcache_tsd; +pthread_key_t tcache_tsd; size_t nhbins; size_t tcache_maxclass; @@ -239,8 +241,7 @@ tcache_create(arena_t *arena) for (; i < nhbins; i++) tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; - tcache_tls = tcache; - pthread_setspecific(tcache_tsd, tcache); + TCACHE_SET(tcache); return (tcache); } @@ -328,11 +329,24 @@ tcache_thread_cleanup(void *arg) { tcache_t *tcache = (tcache_t *)arg; - assert(tcache == tcache_tls); - if (tcache != NULL) { + if (tcache == (void *)(uintptr_t)1) { + /* + * The previous time this destructor was called, we set the key + * to 1 so that other destructors wouldn't cause re-creation of + * the tcache. This time, do nothing, so that the destructor + * will not be called again. + */ + } else if (tcache == (void *)(uintptr_t)2) { + /* + * Another destructor called an allocator function after this + * destructor was called. Reset tcache to 1 in order to + * receive another callback. + */ + TCACHE_SET((uintptr_t)1); + } else if (tcache != NULL) { assert(tcache != (void *)(uintptr_t)1); tcache_destroy(tcache); - tcache_tls = (void *)(uintptr_t)1; + TCACHE_SET((uintptr_t)1); } } diff --git a/jemalloc/src/zone.c b/jemalloc/src/zone.c new file mode 100644 index 00000000..2c1b2318 --- /dev/null +++ b/jemalloc/src/zone.c @@ -0,0 +1,354 @@ +#include "jemalloc/internal/jemalloc_internal.h" +#ifndef JEMALLOC_ZONE +# error "This source file is for zones on Darwin (OS X)." +#endif + +/******************************************************************************/ +/* Data. */ + +static malloc_zone_t zone, szone; +static struct malloc_introspection_t zone_introspect, ozone_introspect; + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. 
*/ + +static size_t zone_size(malloc_zone_t *zone, void *ptr); +static void *zone_malloc(malloc_zone_t *zone, size_t size); +static void *zone_calloc(malloc_zone_t *zone, size_t num, size_t size); +static void *zone_valloc(malloc_zone_t *zone, size_t size); +static void zone_free(malloc_zone_t *zone, void *ptr); +static void *zone_realloc(malloc_zone_t *zone, void *ptr, size_t size); +#if (JEMALLOC_ZONE_VERSION >= 6) +static void *zone_memalign(malloc_zone_t *zone, size_t alignment, + size_t size); +static void zone_free_definite_size(malloc_zone_t *zone, void *ptr, + size_t size); +#endif +static void *zone_destroy(malloc_zone_t *zone); +static size_t zone_good_size(malloc_zone_t *zone, size_t size); +static void zone_force_lock(malloc_zone_t *zone); +static void zone_force_unlock(malloc_zone_t *zone); +static size_t ozone_size(malloc_zone_t *zone, void *ptr); +static void ozone_free(malloc_zone_t *zone, void *ptr); +static void *ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size); +static unsigned ozone_batch_malloc(malloc_zone_t *zone, size_t size, + void **results, unsigned num_requested); +static void ozone_batch_free(malloc_zone_t *zone, void **to_be_freed, + unsigned num); +#if (JEMALLOC_ZONE_VERSION >= 6) +static void ozone_free_definite_size(malloc_zone_t *zone, void *ptr, + size_t size); +#endif +static void ozone_force_lock(malloc_zone_t *zone); +static void ozone_force_unlock(malloc_zone_t *zone); + +/******************************************************************************/ +/* + * Functions. + */ + +static size_t +zone_size(malloc_zone_t *zone, void *ptr) +{ + + /* + * There appear to be places within Darwin (such as setenv(3)) that + * cause calls to this function with pointers that *no* zone owns. If + * we knew that all pointers were owned by *some* zone, we could split + * our zone into two parts, and use one as the default allocator and + * the other as the default deallocator/reallocator. Since that will + * not work in practice, we must check all pointers to assure that they + * reside within a mapped chunk before determining size. + */ + return (ivsalloc(ptr)); +} + +static void * +zone_malloc(malloc_zone_t *zone, size_t size) +{ + + return (JEMALLOC_P(malloc)(size)); +} + +static void * +zone_calloc(malloc_zone_t *zone, size_t num, size_t size) +{ + + return (JEMALLOC_P(calloc)(num, size)); +} + +static void * +zone_valloc(malloc_zone_t *zone, size_t size) +{ + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size); + + return (ret); +} + +static void +zone_free(malloc_zone_t *zone, void *ptr) +{ + + JEMALLOC_P(free)(ptr); +} + +static void * +zone_realloc(malloc_zone_t *zone, void *ptr, size_t size) +{ + + return (JEMALLOC_P(realloc)(ptr, size)); +} + +#if (JEMALLOC_ZONE_VERSION >= 6) +static void * +zone_memalign(malloc_zone_t *zone, size_t alignment, size_t size) +{ + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + JEMALLOC_P(posix_memalign)(&ret, alignment, size); + + return (ret); +} + +static void +zone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size) +{ + + assert(ivsalloc(ptr) == size); + JEMALLOC_P(free)(ptr); +} +#endif + +static void * +zone_destroy(malloc_zone_t *zone) +{ + + /* This function should never be called. 
*/ + assert(false); + return (NULL); +} + +static size_t +zone_good_size(malloc_zone_t *zone, size_t size) +{ + size_t ret; + void *p; + + /* + * Actually create an object of the appropriate size, then find out + * how large it could have been without moving up to the next size + * class. + */ + p = JEMALLOC_P(malloc)(size); + if (p != NULL) { + ret = isalloc(p); + JEMALLOC_P(free)(p); + } else + ret = size; + + return (ret); +} + +static void +zone_force_lock(malloc_zone_t *zone) +{ + + if (isthreaded) + jemalloc_prefork(); +} + +static void +zone_force_unlock(malloc_zone_t *zone) +{ + + if (isthreaded) + jemalloc_postfork(); +} + +malloc_zone_t * +create_zone(void) +{ + + zone.size = (void *)zone_size; + zone.malloc = (void *)zone_malloc; + zone.calloc = (void *)zone_calloc; + zone.valloc = (void *)zone_valloc; + zone.free = (void *)zone_free; + zone.realloc = (void *)zone_realloc; + zone.destroy = (void *)zone_destroy; + zone.zone_name = "jemalloc_zone"; + zone.batch_malloc = NULL; + zone.batch_free = NULL; + zone.introspect = &zone_introspect; + zone.version = JEMALLOC_ZONE_VERSION; +#if (JEMALLOC_ZONE_VERSION >= 6) + zone.memalign = zone_memalign; + zone.free_definite_size = zone_free_definite_size; +#endif + + zone_introspect.enumerator = NULL; + zone_introspect.good_size = (void *)zone_good_size; + zone_introspect.check = NULL; + zone_introspect.print = NULL; + zone_introspect.log = NULL; + zone_introspect.force_lock = (void *)zone_force_lock; + zone_introspect.force_unlock = (void *)zone_force_unlock; + zone_introspect.statistics = NULL; +#if (JEMALLOC_ZONE_VERSION >= 6) + zone_introspect.zone_locked = NULL; +#endif + + return (&zone); +} + +static size_t +ozone_size(malloc_zone_t *zone, void *ptr) +{ + size_t ret; + + ret = ivsalloc(ptr); + if (ret == 0) + ret = szone.size(zone, ptr); + + return (ret); +} + +static void +ozone_free(malloc_zone_t *zone, void *ptr) +{ + + if (ivsalloc(ptr) != 0) + JEMALLOC_P(free)(ptr); + else { + size_t size = szone.size(zone, ptr); + if (size != 0) + (szone.free)(zone, ptr); + } +} + +static void * +ozone_realloc(malloc_zone_t *zone, void *ptr, size_t size) +{ + size_t oldsize; + + if (ptr == NULL) + return (JEMALLOC_P(malloc)(size)); + + oldsize = ivsalloc(ptr); + if (oldsize != 0) + return (JEMALLOC_P(realloc)(ptr, size)); + else { + oldsize = szone.size(zone, ptr); + if (oldsize == 0) + return (JEMALLOC_P(malloc)(size)); + else { + void *ret = JEMALLOC_P(malloc)(size); + if (ret != NULL) { + memcpy(ret, ptr, (oldsize < size) ? oldsize : + size); + (szone.free)(zone, ptr); + } + return (ret); + } + } +} + +static unsigned +ozone_batch_malloc(malloc_zone_t *zone, size_t size, void **results, + unsigned num_requested) +{ + + /* Don't bother implementing this interface, since it isn't required. */ + return (0); +} + +static void +ozone_batch_free(malloc_zone_t *zone, void **to_be_freed, unsigned num) +{ + unsigned i; + + for (i = 0; i < num; i++) + ozone_free(zone, to_be_freed[i]); +} + +#if (JEMALLOC_ZONE_VERSION >= 6) +static void +ozone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size) +{ + + if (ivsalloc(ptr) != 0) { + assert(ivsalloc(ptr) == size); + JEMALLOC_P(free)(ptr); + } else { + assert(size == szone.size(zone, ptr)); + szone.free_definite_size(zone, ptr, size); + } +} +#endif + +static void +ozone_force_lock(malloc_zone_t *zone) +{ + + /* jemalloc locking is taken care of by the normal jemalloc zone. 
*/ + szone.introspect->force_lock(zone); +} + +static void +ozone_force_unlock(malloc_zone_t *zone) +{ + + /* jemalloc locking is taken care of by the normal jemalloc zone. */ + szone.introspect->force_unlock(zone); +} + +/* + * Overlay the default scalable zone (szone) such that existing allocations are + * drained, and further allocations come from jemalloc. This is necessary + * because Core Foundation directly accesses and uses the szone before the + * jemalloc library is even loaded. + */ +void +szone2ozone(malloc_zone_t *zone) +{ + + /* + * Stash a copy of the original szone so that we can call its + * functions as needed. Note that the internally, the szone stores its + * bookkeeping data structures immediately following the malloc_zone_t + * header, so when calling szone functions, we need to pass a pointer + * to the original zone structure. + */ + memcpy(&szone, zone, sizeof(malloc_zone_t)); + + zone->size = (void *)ozone_size; + zone->malloc = (void *)zone_malloc; + zone->calloc = (void *)zone_calloc; + zone->valloc = (void *)zone_valloc; + zone->free = (void *)ozone_free; + zone->realloc = (void *)ozone_realloc; + zone->destroy = (void *)zone_destroy; + zone->zone_name = "jemalloc_ozone"; + zone->batch_malloc = ozone_batch_malloc; + zone->batch_free = ozone_batch_free; + zone->introspect = &ozone_introspect; + zone->version = JEMALLOC_ZONE_VERSION; +#if (JEMALLOC_ZONE_VERSION >= 6) + zone->memalign = zone_memalign; + zone->free_definite_size = ozone_free_definite_size; +#endif + + ozone_introspect.enumerator = NULL; + ozone_introspect.good_size = (void *)zone_good_size; + ozone_introspect.check = NULL; + ozone_introspect.print = NULL; + ozone_introspect.log = NULL; + ozone_introspect.force_lock = (void *)ozone_force_lock; + ozone_introspect.force_unlock = (void *)ozone_force_unlock; + ozone_introspect.statistics = NULL; +#if (JEMALLOC_ZONE_VERSION >= 6) + ozone_introspect.zone_locked = NULL; +#endif +} diff --git a/jemalloc/test/thread_arena.c b/jemalloc/test/thread_arena.c index 99e9669a..d52435fa 100644 --- a/jemalloc/test/thread_arena.c +++ b/jemalloc/test/thread_arena.c @@ -3,6 +3,7 @@ #include #include +#define JEMALLOC_MANGLE #include "jemalloc/jemalloc.h" void * @@ -13,10 +14,10 @@ thread_start(void *arg) size_t size; int err; - malloc(1); + JEMALLOC_P(malloc)(1); size = sizeof(arena_ind); - if ((err = mallctl("thread.arena", &arena_ind, &size, + if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, &main_arena_ind, sizeof(main_arena_ind)))) { fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, strerror(err)); @@ -37,10 +38,11 @@ main(void) fprintf(stderr, "Test begin\n"); - malloc(1); + JEMALLOC_P(malloc)(1); size = sizeof(arena_ind); - if ((err = mallctl("thread.arena", &arena_ind, &size, NULL, 0))) { + if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, NULL, + 0))) { fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, strerror(err)); ret = 1; From 58a6f5c9bef785ecff33fff7e56343ea6c482d56 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 11 Sep 2010 20:59:16 -0700 Subject: [PATCH 03/48] Add posix_memalign test. 
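Exercise both the error and success paths of posix_memalign(): alignments smaller than sizeof(void *) or not a power of two must fail with EINVAL, overflow-prone alignment/size combinations must fail rather than wrap, and a sweep of power-of-two alignments up to MAXALIGN must succeed with usable sizes at least as large as requested. A minimal sketch of the error-path pattern the test relies on follows; the JEMALLOC_P() mangling and header come from this tree, while the standalone program and its diagnostics are illustrative only, not part of the test suite:

    #include <errno.h>
    #include <stdio.h>

    #define JEMALLOC_MANGLE
    #include "jemalloc/jemalloc.h"

    int
    main(void)
    {
        void *p;
        int err;

        /* Alignment below sizeof(void *) must be rejected with EINVAL. */
        err = JEMALLOC_P(posix_memalign)(&p, 2, 1);
        if (err != EINVAL)
            fprintf(stderr, "Expected EINVAL, got %d\n", err);

        /* Non-power-of-two alignment must also be rejected. */
        err = JEMALLOC_P(posix_memalign)(&p, sizeof(void *) + 1, 1);
        if (err != EINVAL)
            fprintf(stderr, "Expected EINVAL, got %d\n", err);

        return 0;
    }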
--- jemalloc/Makefile.in | 2 +- jemalloc/test/posix_memalign.c | 123 +++++++++++++++++++++++++++++++ jemalloc/test/posix_memalign.exp | 30 ++++++++ 3 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 jemalloc/test/posix_memalign.c create mode 100644 jemalloc/test/posix_memalign.exp diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index aa3bf6bd..0795d3aa 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -58,7 +58,7 @@ DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \ @objroot@lib/libjemalloc@install_suffix@.$(SO) \ @objroot@lib/libjemalloc@install_suffix@_pic.a MAN3 := @objroot@doc/jemalloc@install_suffix@.3 -CTESTS := @srcroot@test/thread_arena.c +CTESTS := @srcroot@test/posix_memalign.c @srcroot@test/thread_arena.c .PHONY: all dist install check clean distclean relclean diff --git a/jemalloc/test/posix_memalign.c b/jemalloc/test/posix_memalign.c new file mode 100644 index 00000000..0f2c879c --- /dev/null +++ b/jemalloc/test/posix_memalign.c @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc/jemalloc.h" + +#define CHUNK 0x100000 +/* #define MAXALIGN ((size_t)0x80000000000LLU) */ +#define MAXALIGN ((size_t)0x40000000LLU) +#define NITER 4 + +int +main(void) +{ + size_t alignment, size, total; + unsigned i; + int err; + void *p, *ps[NITER]; + + fprintf(stderr, "Test begin\n"); + + /* Test error conditions. */ + for (alignment = 0; alignment < sizeof(void *); alignment++) { + err = JEMALLOC_P(posix_memalign)(&p, alignment, 1); + if (err != EINVAL) { + fprintf(stderr, + "Expected error for invalid alignment %zu\n", + alignment); + } + } + + for (alignment = sizeof(size_t); alignment < MAXALIGN; + alignment <<= 1) { + err = JEMALLOC_P(posix_memalign)(&p, alignment + 1, 1); + if (err == 0) { + fprintf(stderr, + "Expected error for invalid alignment %zu\n", + alignment + 1); + } + } + +#if LG_SIZEOF_PTR == 3 + alignment = 0x8000000000000000LLU; + size = 0x8000000000000000LLU; +#else + alignment = 0x8000 0000LU; + size = 0x8000 0000LU; +#endif + err = JEMALLOC_P(posix_memalign)(&p, alignment, size); + if (err == 0) { + fprintf(stderr, + "Expected error for posix_memalign(&p, %zu, %zu)\n", + alignment, size); + } + +#if LG_SIZEOF_PTR == 3 + alignment = 0x4000000000000000LLU; + size = 0x8400000000000001LLU; +#else + alignment = 0x40000000LU; + size = 0x84000001LU; +#endif + err = JEMALLOC_P(posix_memalign)(&p, alignment, size); + if (err == 0) { + fprintf(stderr, + "Expected error for posix_memalign(&p, %zu, %zu)\n", + alignment, size); + } + + alignment = 0x10LLU; +#if LG_SIZEOF_PTR == 3 + size = 0xfffffffffffffff0LLU; +#else + size = 0xfffffff0LU; +#endif + err = JEMALLOC_P(posix_memalign)(&p, alignment, size); + if (err == 0) { + fprintf(stderr, + "Expected error for posix_memalign(&p, %zu, %zu)\n", + alignment, size); + } + + for (i = 0; i < NITER; i++) + ps[i] = NULL; + + for (alignment = sizeof(void *); + alignment <= MAXALIGN; + alignment <<= 1) { + total = 0; + fprintf(stderr, "Alignment: %zu\n", alignment); + for (size = 1; + size < 3 * alignment && size < (1U << 31); + size += (alignment >> 2) - 1) { + for (i = 0; i < NITER; i++) { + err = JEMALLOC_P(posix_memalign)(&ps[i], + alignment, size); + if (err) { + fprintf(stderr, + "Error for size 0x%x %zu : %s\n", + (unsigned)size, size, + strerror(err)); + exit(1); + } + total += JEMALLOC_P(malloc_usable_size)(ps[i]); + if (total >= (MAXALIGN << 1)) + break; + } + for (i = 0; i < NITER; i++) { + if (ps[i] != NULL) { + 
JEMALLOC_P(free)(ps[i]); + ps[i] = NULL; + } + } + } + } + + fprintf(stderr, "Test end\n"); + + return 0; +} diff --git a/jemalloc/test/posix_memalign.exp b/jemalloc/test/posix_memalign.exp new file mode 100644 index 00000000..f815b2fe --- /dev/null +++ b/jemalloc/test/posix_memalign.exp @@ -0,0 +1,30 @@ +Test begin +Alignment: 8 +Alignment: 16 +Alignment: 32 +Alignment: 64 +Alignment: 128 +Alignment: 256 +Alignment: 512 +Alignment: 1024 +Alignment: 2048 +Alignment: 4096 +Alignment: 8192 +Alignment: 16384 +Alignment: 32768 +Alignment: 65536 +Alignment: 131072 +Alignment: 262144 +Alignment: 524288 +Alignment: 1048576 +Alignment: 2097152 +Alignment: 4194304 +Alignment: 8388608 +Alignment: 16777216 +Alignment: 33554432 +Alignment: 67108864 +Alignment: 134217728 +Alignment: 268435456 +Alignment: 536870912 +Alignment: 1073741824 +Test end From 7e11b389aac64ed59a286b91887bcf68c2a597c4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 11 Sep 2010 22:47:39 -0700 Subject: [PATCH 04/48] Move size class table to man page. Move the table of size classes from jemalloc.c to the manual page. When manually formatting the manual page, it is now necessary to use: nroff -man -t jemalloc.3 --- jemalloc/INSTALL | 7 ++++ jemalloc/doc/jemalloc.3.in | 62 +++++++++++++++++++++++++++- jemalloc/src/jemalloc.c | 82 -------------------------------------- 3 files changed, 67 insertions(+), 84 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index e9a87989..a8957436 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -217,3 +217,10 @@ directory, issue configuration and build commands: cd obj ../configure --enable-autogen make + +=== Documentation ============================================================== + +The manual page that the configure script generates can be manually formatted +prior to installation via the following command: + + nroff -man -t doc/jemalloc.3 diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index dfc4d763..02340e62 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -564,8 +564,8 @@ However, it may make sense to reduce the number of arenas if an application does not make much use of the allocation functions. .Pp @roff_tcache@In addition to multiple arenas, this allocator supports -@roff_tcache@thread-specific caching for small objects, in order to make it -@roff_tcache@possible to completely avoid synchronization for most small +@roff_tcache@thread-specific caching for small and large objects, in order to +@roff_tcache@make it possible to completely avoid synchronization for most small @roff_tcache@allocation requests. @roff_tcache@Such caching allows very fast allocation in the common case, but it @roff_tcache@increases memory usage and fragmentation, since a bounded number of @@ -619,6 +619,64 @@ option), are rounded up to the nearest run size. Allocation requests that are too large to fit in an arena-managed chunk are rounded up to the nearest multiple of the chunk size. .Pp +Assuming 4 MiB chunks, 4 KiB pages, and a 16 byte quantum on a 64-bit system, +the size classes in each category are as follows: +.TS +expand allbox tab(;); +LLR +LLR +^LR +^^R +^^R +^^R +^^R +^LR +^^R +^^R +^^R +^^R +^LR +^^R +^^R +^^R +^^R +LsR +^^R +^^R +^^R +^^R +LsR +^^R +^^R +^^R. +Category;Subcategory;Size +Small;Tiny;8 +;Quantum-spaced;16 +;;32 +;;48 +;;... +;;128 +;Cacheline-spaced;192 +;;256 +;;320 +;;... +;;512 +;Sub-page;760 +;;1024 +;;1280 +;;... +;;3840 +Large;4 KiB +;;8 KiB +;;12 KiB +;;... +;;4084 KiB +Huge;4 MiB +;;8 MiB +;;12 MiB +;;... 
+.TE +.Pp Allocations are packed tightly together, which can be an issue for multi-threaded applications. If you need to assure that allocations do not suffer from cacheline sharing, diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index ebce3ca0..bf4ccc05 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1,85 +1,3 @@ -/*- - * This allocator implementation is designed to provide scalable performance - * for multi-threaded programs on multi-processor systems. The following - * features are included for this purpose: - * - * + Multiple arenas are used if there are multiple CPUs, which reduces lock - * contention and cache sloshing. - * - * + Thread-specific caching is used if there are multiple threads, which - * reduces the amount of locking. - * - * + Cache line sharing between arenas is avoided for internal data - * structures. - * - * + Memory is managed in chunks and runs (chunks can be split into runs), - * rather than as individual pages. This provides a constant-time - * mechanism for associating allocations with particular arenas. - * - * Allocation requests are rounded up to the nearest size class, and no record - * of the original request size is maintained. Allocations are broken into - * categories according to size class. Assuming 1 MiB chunks, 4 KiB pages and - * a 16 byte quantum on a 32-bit system, the size classes in each category are - * as follows: - * - * |========================================| - * | Category | Subcategory | Size | - * |========================================| - * | Small | Tiny | 2 | - * | | | 4 | - * | | | 8 | - * | |------------------+----------| - * | | Quantum-spaced | 16 | - * | | | 32 | - * | | | 48 | - * | | | ... | - * | | | 96 | - * | | | 112 | - * | | | 128 | - * | |------------------+----------| - * | | Cacheline-spaced | 192 | - * | | | 256 | - * | | | 320 | - * | | | 384 | - * | | | 448 | - * | | | 512 | - * | |------------------+----------| - * | | Sub-page | 760 | - * | | | 1024 | - * | | | 1280 | - * | | | ... | - * | | | 3328 | - * | | | 3584 | - * | | | 3840 | - * |========================================| - * | Large | 4 KiB | - * | | 8 KiB | - * | | 12 KiB | - * | | ... | - * | | 1012 KiB | - * | | 1016 KiB | - * | | 1020 KiB | - * |========================================| - * | Huge | 1 MiB | - * | | 2 MiB | - * | | 3 MiB | - * | | ... | - * |========================================| - * - * Different mechanisms are used accoding to category: - * - * Small: Each size class is segregated into its own set of runs. Each run - * maintains a bitmap of which regions are free/allocated. - * - * Large : Each allocation is backed by a dedicated run. Metadata are stored - * in the associated arena chunk header maps. - * - * Huge : Each allocation is backed by a dedicated contiguous set of chunks. - * Metadata are stored in a separate red-black tree. - * - ******************************************************************************* - */ - #define JEMALLOC_C_ #include "jemalloc/internal/jemalloc_internal.h" From 8d7a94b275a7c26ff4207b943585661790b2e216 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 11 Sep 2010 23:38:12 -0700 Subject: [PATCH 05/48] Fix porting regressions. Fix new build failures and test failures on Linux that were introduced by the port to OS X. 
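The regressions are concentrated in the NO_TLS fallback paths and the new test: tcache_tsd is now declared for both the TLS and non-TLS builds, TCACHE_SET() casts its argument so that sentinel values such as (uintptr_t)1 can be stored in the __thread variable, and the posix_memalign test had malformed hex constants, an over-large MAXALIGN, and a size-iteration step that assumed 8-byte pointers. For reference, a minimal sketch of the accessor pattern these macros implement is shown below; it mirrors the macros in include/jemalloc/internal/tcache.h and is illustrative rather than a drop-in replacement:

    #include <pthread.h>

    typedef struct tcache_s tcache_t;

    #ifndef NO_TLS
    /* Fast path: read the __thread variable directly, but keep the
     * pthread TSD slot in sync so the thread-exit destructor still runs. */
    extern __thread tcache_t *tcache_tls;
    #  define TCACHE_GET()  tcache_tls
    #  define TCACHE_SET(v) do {                                        \
        tcache_tls = (tcache_t *)(v);                                   \
        pthread_setspecific(tcache_tsd, (void *)(v));                   \
    } while (0)
    #else
    /* Portable fallback: all accesses go through pthread TSD. */
    #  define TCACHE_GET()  ((tcache_t *)pthread_getspecific(tcache_tsd))
    #  define TCACHE_SET(v) do {                                        \
        pthread_setspecific(tcache_tsd, (void *)(v));                   \
    } while (0)
    #endif
    /* Declared unconditionally so both configurations compile and link. */
    extern pthread_key_t tcache_tsd;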
--- jemalloc/include/jemalloc/internal/tcache.h | 4 ++-- jemalloc/test/posix_memalign.c | 15 +++++++-------- jemalloc/test/posix_memalign.exp | 5 ----- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index df302fba..e5061114 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -70,16 +70,16 @@ extern __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); # define TCACHE_GET() tcache_tls # define TCACHE_SET(v) do { \ - tcache_tls = (v); \ + tcache_tls = (tcache_t *)(v); \ pthread_setspecific(tcache_tsd, (void *)(v)); \ } while (0) #else -extern pthread_key_t tcache_tsd; # define TCACHE_GET() ((tcache_t *)pthread_getspecific(tcache_tsd)) # define TCACHE_SET(v) do { \ pthread_setspecific(tcache_tsd, (void *)(v)); \ } while (0) #endif +extern pthread_key_t tcache_tsd; /* * Number of tcache bins. There are nbins small-object bins, plus 0 or more diff --git a/jemalloc/test/posix_memalign.c b/jemalloc/test/posix_memalign.c index 0f2c879c..cd3cadc9 100644 --- a/jemalloc/test/posix_memalign.c +++ b/jemalloc/test/posix_memalign.c @@ -9,7 +9,7 @@ #define CHUNK 0x100000 /* #define MAXALIGN ((size_t)0x80000000000LLU) */ -#define MAXALIGN ((size_t)0x40000000LLU) +#define MAXALIGN ((size_t)0x2000000LLU) #define NITER 4 int @@ -46,8 +46,8 @@ main(void) alignment = 0x8000000000000000LLU; size = 0x8000000000000000LLU; #else - alignment = 0x8000 0000LU; - size = 0x8000 0000LU; + alignment = 0x80000000LU; + size = 0x80000000LU; #endif err = JEMALLOC_P(posix_memalign)(&p, alignment, size); if (err == 0) { @@ -86,22 +86,21 @@ main(void) for (i = 0; i < NITER; i++) ps[i] = NULL; - for (alignment = sizeof(void *); + for (alignment = 8; alignment <= MAXALIGN; alignment <<= 1) { total = 0; fprintf(stderr, "Alignment: %zu\n", alignment); for (size = 1; size < 3 * alignment && size < (1U << 31); - size += (alignment >> 2) - 1) { + size += (alignment >> (LG_SIZEOF_PTR-1)) - 1) { for (i = 0; i < NITER; i++) { err = JEMALLOC_P(posix_memalign)(&ps[i], alignment, size); if (err) { fprintf(stderr, - "Error for size 0x%x %zu : %s\n", - (unsigned)size, size, - strerror(err)); + "Error for size %zu (0x%zx): %s\n", + size, size, strerror(err)); exit(1); } total += JEMALLOC_P(malloc_usable_size)(ps[i]); diff --git a/jemalloc/test/posix_memalign.exp b/jemalloc/test/posix_memalign.exp index f815b2fe..b5061c72 100644 --- a/jemalloc/test/posix_memalign.exp +++ b/jemalloc/test/posix_memalign.exp @@ -22,9 +22,4 @@ Alignment: 4194304 Alignment: 8388608 Alignment: 16777216 Alignment: 33554432 -Alignment: 67108864 -Alignment: 134217728 -Alignment: 268435456 -Alignment: 536870912 -Alignment: 1073741824 Test end From 4cc6a60a4f3ee481bdde233d6fa72e256cb9477a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 11 Sep 2010 23:40:24 -0700 Subject: [PATCH 06/48] Update modification date in man page. 
--- jemalloc/doc/jemalloc.3.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 02340e62..138234ae 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,7 +38,7 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd April 2, 2010 +.Dd September 11, 2010 .Dt JEMALLOC 3 .Os .Sh NAME From 8e3c3c61b5bb676a705450708e7e79698cdc9e0c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 17 Sep 2010 15:46:18 -0700 Subject: [PATCH 07/48] Add {,r,s,d}allocm(). Add allocm(), rallocm(), sallocm(), and dallocm(), which are a functional superset of malloc(), calloc(), posix_memalign(), malloc_usable_size(), and free(). --- jemalloc/Makefile.in | 3 +- jemalloc/doc/jemalloc.3.in | 169 ++++++++++++++- jemalloc/include/jemalloc/internal/arena.h | 9 +- jemalloc/include/jemalloc/internal/huge.h | 7 +- .../jemalloc/internal/jemalloc_internal.h.in | 74 +++++-- jemalloc/include/jemalloc/jemalloc.h.in | 21 ++ jemalloc/src/arena.c | 146 ++++++++----- jemalloc/src/ckh.c | 15 +- jemalloc/src/huge.c | 85 +++++--- jemalloc/src/jemalloc.c | 199 +++++++++++++++++- jemalloc/src/prof.c | 2 +- jemalloc/src/tcache.c | 4 +- jemalloc/test/allocm.c | 133 ++++++++++++ jemalloc/test/allocm.exp | 25 +++ jemalloc/test/posix_memalign.c | 5 +- jemalloc/test/rallocm.c | 117 ++++++++++ jemalloc/test/rallocm.exp | 2 + 17 files changed, 892 insertions(+), 124 deletions(-) create mode 100644 jemalloc/test/allocm.c create mode 100644 jemalloc/test/allocm.exp create mode 100644 jemalloc/test/rallocm.c create mode 100644 jemalloc/test/rallocm.exp diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 0795d3aa..f2a64539 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -58,7 +58,8 @@ DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \ @objroot@lib/libjemalloc@install_suffix@.$(SO) \ @objroot@lib/libjemalloc@install_suffix@_pic.a MAN3 := @objroot@doc/jemalloc@install_suffix@.3 -CTESTS := @srcroot@test/posix_memalign.c @srcroot@test/thread_arena.c +CTESTS := @srcroot@test/allocm.c @srcroot@test/posix_memalign.c \ + @srcroot@test/rallocm.c @srcroot@test/thread_arena.c .PHONY: all dist install check clean distclean relclean diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 138234ae..9dc4b250 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,7 +38,7 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd September 11, 2010 +.Dd September 17, 2010 .Dt JEMALLOC 3 .Os .Sh NAME @@ -51,13 +51,18 @@ .Nm @jemalloc_prefix@malloc_stats_print , .Nm @jemalloc_prefix@mallctl , .Nm @jemalloc_prefix@mallctlnametomib , -.Nm @jemalloc_prefix@mallctlbymib +.Nm @jemalloc_prefix@mallctlbymib , +.Nm @jemalloc_prefix@allocm , +.Nm @jemalloc_prefix@rallocm , +.Nm @jemalloc_prefix@sallocm , +.Nm @jemalloc_prefix@dallocm .Nd general purpose memory allocation functions .Sh LIBRARY .Sy libjemalloc@install_suffix@ .Sh SYNOPSIS .In stdlib.h .In jemalloc/jemalloc@install_suffix@.h +.Ss Standard API .Ft void * .Fn @jemalloc_prefix@malloc "size_t size" .Ft void * @@ -68,6 +73,7 @@ .Fn @jemalloc_prefix@realloc "void *ptr" "size_t size" .Ft void .Fn @jemalloc_prefix@free "void *ptr" +.Ss Non-standard API .Ft size_t .Fn @jemalloc_prefix@malloc_usable_size "const void *ptr" .Ft void @@ -82,7 +88,17 @@ .Va @jemalloc_prefix@malloc_options ; 
.Ft void .Fn \*(lp*@jemalloc_prefix@malloc_message\*(rp "void *cbopaque" "const char *s" +.Ss Experimental API +.Ft int +.Fn @jemalloc_prefix@allocm "void **ptr" "size_t *rsize" "size_t size" "int flags" +.Ft int +.Fn @jemalloc_prefix@rallocm "void **ptr" "size_t *rsize" "size_t size" "size_t extra" "int flags" +.Ft int +.Fn @jemalloc_prefix@sallocm "const void *ptr" "size_t *rsize" "int flags" +.Ft int +.Fn @jemalloc_prefix@dallocm "void *ptr" "int flags" .Sh DESCRIPTION +.Ss Standard API The .Fn @jemalloc_prefix@malloc function allocates @@ -158,7 +174,7 @@ If is .Dv NULL , no action occurs. -.Pp +.Ss Non-standard API The .Fn @jemalloc_prefix@malloc_usable_size function returns the usable size of the allocation pointed to by @@ -289,6 +305,102 @@ for (i = 0; i < nbins; i++) { /* Do something with bin_size... */ } .Ed +.Ss Experimental API +The experimental API is subject to change or removal without regard for +backward compatibility. +.Pp +The +.Fn @jemalloc_prefix@allocm , +.Fn @jemalloc_prefix@rallocm , +.Fn @jemalloc_prefix@sallocm , +and +.Fn @jemalloc_prefix@dallocm +functions all have a +.Fa flags +argument that can be used to specify options. +The functions only check the options that are contextually relevant. +Use bitwise or (|) operations to specify one or more of the following: +.Bl -tag -width ".Dv ALLOCM_LG_ALIGN(la)" +.It ALLOCM_LG_ALIGN(la) +Align the memory allocation to start at an address that is a multiple of +(1 << +.Fa la ) . +This macro does not validate that +.Fa la +is within the valid range. +.It ALLOCM_ALIGN(a) +Align the memory allocation to start at an address that is a multiple of +.Fa a , +where +.Fa a +is a power of two. +This macro does not validate that +.Fa a +is a power of 2. +.It ALLOCM_ZERO +Initialize newly allocated memory to contain zero bytes. +In the growing reallocation case, the real size prior to reallocation defines +the boundary between untouched bytes and those that are initialized to contain +zero bytes. +If this option is absent, newly allocated memory is uninitialized. +.It ALLOCM_NO_MOVE +For reallocation, fail rather than moving the object. +This constraint can apply to both growth and shrinkage. +.El +.Pp +The +.Fn @jemalloc_prefix@allocm +function allocates at least +.Fa size +bytes of memory, sets +.Fa *ptr +to the base address of the allocation, and sets +.Fa *rsize +to the real size of the allocation if +.Fa rsize +is not +.Dv NULL . +.Pp +The +.Fn @jemalloc_prefix@rallocm +function resizes the allocation at +.Fa *ptr +to be at least +.Fa size +bytes, sets +.Fa *ptr +to the base address of the allocation if it moved, and sets +.Fa *rsize +to the real size of the allocation if +.Fa rsize +is not +.Dv NULL . +If +.Fa extra +is non-zero, an attempt is made to resize the allocation to be at least +.Fa ( size ++ +.Fa extra ) +bytes, though an inability to allocate the extra byte(s) will not by itself +result in failure. +Behavior is undefined if +.Fa ( size ++ +.Fa extra +> +.Dv SIZE_T_MAX ) . +.Pp +The +.Fn @jemalloc_prefix@sallocm +function sets +.Fa *rsize +to the real size of the allocation. +.Pp +The +.Fn @jemalloc_prefix@dallocm +function causes the memory referenced by +.Fa ptr +to be made available for future allocations. .Sh TUNING Once, when the first call is made to one of these memory allocation routines, various flags will be set or reset, which affects the @@ -646,11 +758,10 @@ LsR ^^R ^^R LsR -^^R -^^R ^^R. 
Category;Subcategory;Size -Small;Tiny;8 +@roff_tiny@Small;Tiny;8 +@roff_no_tiny@Small;Tiny;[disabled] ;Quantum-spaced;16 ;;32 ;;48 @@ -681,7 +792,7 @@ Allocations are packed tightly together, which can be an issue for multi-threaded applications. If you need to assure that allocations do not suffer from cacheline sharing, round your allocation requests up to the nearest multiple of the cacheline -size. +size, or specify cacheline alignment when allocating. .Sh MALLCTL NAMESPACE The following names are defined in the namespace accessible via the .Fn @jemalloc_prefix@mallctl* @@ -1412,6 +1523,7 @@ is likely to result in a crash or deadlock. All messages are prefixed by .Dq : . .Sh RETURN VALUES +.Ss Standard API The .Fn @jemalloc_prefix@malloc and @@ -1460,7 +1572,7 @@ when an error occurs. The .Fn @jemalloc_prefix@free function returns no value. -.Pp +.Ss Non-standard API The .Fn @jemalloc_prefix@malloc_usable_size function returns the usable size of the allocation pointed to by @@ -1502,6 +1614,47 @@ An interface with side effects failed in some way not directly related to .Fn @jemalloc_prefix@mallctl* read/write processing. .El +.Ss Experimental API +The +.Fn @jemalloc_prefix@allocm , +.Fn @jemalloc_prefix@rallocm , +.Fn @jemalloc_prefix@sallocm , +and +.Fn @jemalloc_prefix@dallocm +functions return +.Dv ALLOCM_SUCCESS +on success; otherwise they return an error value. +The +.Fn @jemalloc_prefix@allocm +and +.Fn @jemalloc_prefix@rallocm +functions will fail if: +.Bl -tag -width ".Dv ALLOCM_ERR_OOM" +.It ALLOCM_ERR_OOM +Out of memory. +Insufficient contiguous memory was available to service the allocation request. +The +.Fn @jemalloc_prefix@allocm +function additionally sets +.Fa *ptr +to +.Dv NULL , +whereas the +.Fn @jemalloc_prefix@rallocm +function leaves +.Fa *ptr +unmodified. +.El +.Pp +The +.Fn @jemalloc_prefix@rallocm +function will also fail if: +.Bl -tag -width ".Dv ALLOCM_ERR_NOT_MOVED" +.It ALLOCM_ERR_NOT_MOVED +.Dv ALLOCM_NO_MOVE +was specified, but the reallocation request could not be serviced without +moving the object. 
+.El .Sh ENVIRONMENT The following environment variables affect the execution of the allocation functions: diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index c1955f19..73ebb00a 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -432,8 +432,8 @@ void arena_prof_accum(arena_t *arena, uint64_t accumbytes); void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_malloc(size_t size, bool zero); -void *arena_palloc(arena_t *arena, size_t alignment, size_t size, - size_t alloc_size); +void *arena_palloc(arena_t *arena, size_t size, size_t alloc_size, + size_t alignment, bool zero); size_t arena_salloc(const void *ptr); #ifdef JEMALLOC_PROF void arena_prof_promoted(const void *ptr, size_t size); @@ -449,7 +449,10 @@ void arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats); #endif -void *arena_ralloc(void *ptr, size_t size, size_t oldsize); +void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero); +void *arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); bool arena_new(arena_t *arena, unsigned ind); bool arena_boot(void); diff --git a/jemalloc/include/jemalloc/internal/huge.h b/jemalloc/include/jemalloc/internal/huge.h index 0c0582f2..bf231274 100644 --- a/jemalloc/include/jemalloc/internal/huge.h +++ b/jemalloc/include/jemalloc/internal/huge.h @@ -20,8 +20,11 @@ extern size_t huge_allocated; extern malloc_mutex_t huge_mtx; void *huge_malloc(size_t size, bool zero); -void *huge_palloc(size_t alignment, size_t size); -void *huge_ralloc(void *ptr, size_t size, size_t oldsize); +void *huge_palloc(size_t size, size_t alignment, bool zero); +void *huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, + size_t extra); +void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); #ifdef JEMALLOC_PROF diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index a8d27fa7..04bc56fa 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -84,6 +84,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); /******************************************************************************/ #define JEMALLOC_H_TYPES +#define ALLOCM_LG_ALIGN_MASK ((int)0x3f) + #define ZU(z) ((size_t)z) #ifndef __DECONST @@ -391,12 +393,13 @@ choose_arena(void) #ifndef JEMALLOC_ENABLE_INLINE void *imalloc(size_t size); void *icalloc(size_t size); -void *ipalloc(size_t alignment, size_t size); +void *ipalloc(size_t size, size_t alignment, bool zero); size_t isalloc(const void *ptr); # ifdef JEMALLOC_IVSALLOC size_t ivsalloc(const void *ptr); # endif -void *iralloc(void *ptr, size_t size); +void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, + bool zero, bool no_move); void idalloc(void *ptr); #endif @@ -424,7 +427,7 @@ icalloc(size_t size) } JEMALLOC_INLINE void * -ipalloc(size_t alignment, size_t size) +ipalloc(size_t size, size_t alignment, bool zero) { void *ret; size_t ceil_size; @@ -459,7 +462,7 @@ ipalloc(size_t alignment, size_t size) if 
(ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE && ceil_size <= arena_maxclass)) - ret = arena_malloc(ceil_size, false); + ret = arena_malloc(ceil_size, zero); else { size_t run_size; @@ -506,12 +509,12 @@ ipalloc(size_t alignment, size_t size) } if (run_size <= arena_maxclass) { - ret = arena_palloc(choose_arena(), alignment, ceil_size, - run_size); + ret = arena_palloc(choose_arena(), ceil_size, run_size, + alignment, zero); } else if (alignment <= chunksize) - ret = huge_malloc(ceil_size, false); + ret = huge_malloc(ceil_size, zero); else - ret = huge_palloc(alignment, ceil_size); + ret = huge_palloc(ceil_size, alignment, zero); } assert(((uintptr_t)ret & (alignment - 1)) == 0); @@ -556,8 +559,10 @@ ivsalloc(const void *ptr) #endif JEMALLOC_INLINE void * -iralloc(void *ptr, size_t size) +iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, + bool no_move) { + void *ret; size_t oldsize; assert(ptr != NULL); @@ -565,10 +570,53 @@ iralloc(void *ptr, size_t size) oldsize = isalloc(ptr); - if (size <= arena_maxclass) - return (arena_ralloc(ptr, size, oldsize)); - else - return (huge_ralloc(ptr, size, oldsize)); + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + size_t copysize; + + /* + * Existing object alignment is inadquate; allocate new space + * and copy. + */ + if (no_move) + return (NULL); + ret = ipalloc(size + extra, alignment, zero); + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, without extra this time. */ + ret = ipalloc(size, alignment, zero); + if (ret == NULL) + return (NULL); + } + /* + * Copy at most size bytes (not size+extra), since the caller + * has no expectation that the extra bytes will be reliably + * preserved. + */ + copysize = (size < oldsize) ? size : oldsize; + memcpy(ret, ptr, copysize); + idalloc(ptr); + return (ret); + } + + if (no_move) { + if (size <= arena_maxclass) { + return (arena_ralloc_no_move(ptr, oldsize, size, + extra, zero)); + } else { + return (huge_ralloc_no_move(ptr, oldsize, size, + extra)); + } + } else { + if (size + extra <= arena_maxclass) { + return (arena_ralloc(ptr, oldsize, size, extra, + alignment, zero)); + } else { + return (huge_ralloc(ptr, oldsize, size, extra, + alignment, zero)); + } + } } JEMALLOC_INLINE void diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in index 8ef81836..c28fb6d3 100644 --- a/jemalloc/include/jemalloc/jemalloc.h.in +++ b/jemalloc/include/jemalloc/jemalloc.h.in @@ -4,6 +4,8 @@ extern "C" { #endif +#include + #define JEMALLOC_VERSION "@jemalloc_version@" #define JEMALLOC_VERSION_MAJOR @jemalloc_version_major@ #define JEMALLOC_VERSION_MINOR @jemalloc_version_minor@ @@ -16,6 +18,19 @@ extern "C" { # define JEMALLOC_P(s) s #endif +#define ALLOCM_LG_ALIGN ((int)0x3f) +#if LG_SIZEOF_PTR == 2 +#define ALLOCM_ALIGN(a) (ffs(a)-1) +#else +#define ALLOCM_ALIGN(a) ((a < (size_t)MAX_INT) ? 
ffs(a)-1 : ffs(a>>32)+31) +#endif +#define ALLOCM_ZERO ((int)0x40) +#define ALLOCM_NO_MOVE ((int)0x80) + +#define ALLOCM_SUCCESS 0 +#define ALLOCM_ERR_OOM 1 +#define ALLOCM_ERR_NOT_MOVED 2 + extern const char *JEMALLOC_P(malloc_options); extern void (*JEMALLOC_P(malloc_message))(void *, const char *); @@ -36,6 +51,12 @@ int JEMALLOC_P(mallctlnametomib)(const char *name, size_t *mibp, int JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags); +int JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, + size_t extra, int flags); +int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags); +int JEMALLOC_P(dallocm)(void *ptr, int flags); + #ifdef __cplusplus }; #endif diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index db3d4010..0c2f4a35 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -177,10 +177,11 @@ static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t size, size_t oldsize); + void *ptr, size_t oldsize, size_t size); static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t size, size_t oldsize); -static bool arena_ralloc_large(void *ptr, size_t size, size_t oldsize); + void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); +static bool arena_ralloc_large(void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero); static bool small_size2bin_init(void); #ifdef JEMALLOC_DEBUG static void small_size2bin_validate(void); @@ -1438,7 +1439,8 @@ arena_malloc(size_t size, bool zero) /* Only handles large allocations that require more than page alignment. 
*/ void * -arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) +arena_palloc(arena_t *arena, size_t size, size_t alloc_size, size_t alignment, + bool zero) { void *ret; size_t offset; @@ -1448,7 +1450,7 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) assert((alignment & PAGE_MASK) == 0); malloc_mutex_lock(&arena->lock); - ret = (void *)arena_run_alloc(arena, alloc_size, true, false); + ret = (void *)arena_run_alloc(arena, alloc_size, true, zero); if (ret == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); @@ -1496,10 +1498,12 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) malloc_mutex_unlock(&arena->lock); #ifdef JEMALLOC_FILL - if (opt_junk) - memset(ret, 0xa5, size); - else if (opt_zero) - memset(ret, 0, size); + if (zero == false) { + if (opt_junk) + memset(ret, 0xa5, size); + else if (opt_zero) + memset(ret, 0, size); + } #endif return (ret); } @@ -1944,7 +1948,7 @@ arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t size, size_t oldsize) + size_t oldsize, size_t size) { assert(size < oldsize); @@ -1979,27 +1983,29 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t size, size_t oldsize) + size_t oldsize, size_t size, size_t extra, bool zero) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; size_t npages = oldsize >> PAGE_SHIFT; + size_t followsize; assert(oldsize == (chunk->map[pageind].bits & ~PAGE_MASK)); /* Try to extend the run. */ - assert(size > oldsize); + assert(size + extra > oldsize); malloc_mutex_lock(&arena->lock); if (pageind + npages < chunk_npages && (chunk->map[pageind+npages].bits - & CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[pageind+npages].bits & - ~PAGE_MASK) >= size - oldsize) { + & CHUNK_MAP_ALLOCATED) == 0 && (followsize = + chunk->map[pageind+npages].bits & ~PAGE_MASK) >= size - oldsize) { /* * The next run is available and sufficiently large. Split the * following run, then merge the first part with the existing * allocation. */ + size_t splitsize = (oldsize + followsize <= size + extra) + ? followsize : size + extra - oldsize; arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk + - ((pageind+npages) << PAGE_SHIFT)), size - oldsize, true, - false); + ((pageind+npages) << PAGE_SHIFT)), splitsize, true, zero); chunk->map[pageind].bits = size | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; @@ -2037,11 +2043,12 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, * always fail if growing an object, and the following run is already in use. */ static bool -arena_ralloc_large(void *ptr, size_t size, size_t oldsize) +arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, + bool zero) { size_t psize; - psize = PAGE_CEILING(size); + psize = PAGE_CEILING(size + extra); if (psize == oldsize) { /* Same size class. 
*/ #ifdef JEMALLOC_FILL @@ -2067,14 +2074,15 @@ arena_ralloc_large(void *ptr, size_t size, size_t oldsize) oldsize - size); } #endif - arena_ralloc_large_shrink(arena, chunk, ptr, psize, - oldsize); + arena_ralloc_large_shrink(arena, chunk, ptr, oldsize, + psize); return (false); } else { bool ret = arena_ralloc_large_grow(arena, chunk, ptr, - psize, oldsize); + oldsize, PAGE_CEILING(size), + psize - PAGE_CEILING(size), zero); #ifdef JEMALLOC_FILL - if (ret == false && opt_zero) { + if (ret == false && zero == false && opt_zero) { memset((void *)((uintptr_t)ptr + oldsize), 0, size - oldsize); } @@ -2085,49 +2093,89 @@ arena_ralloc_large(void *ptr, size_t size, size_t oldsize) } void * -arena_ralloc(void *ptr, size_t size, size_t oldsize) +arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, + bool zero) { - void *ret; - size_t copysize; - /* Try to avoid moving the allocation. */ + /* + * Avoid moving the allocation if the size class can be left the same. + */ if (oldsize <= arena_maxclass) { if (oldsize <= small_maxclass) { - if (size <= small_maxclass && small_size2bin[size] == - small_size2bin[oldsize]) - goto IN_PLACE; + assert(choose_arena()->bins[small_size2bin[ + oldsize]].reg_size == oldsize); + if ((size + extra <= small_maxclass && + small_size2bin[size + extra] == + small_size2bin[oldsize]) || (size <= oldsize && + size + extra >= oldsize)) { +#ifdef JEMALLOC_FILL + if (opt_junk && size < oldsize) { + memset((void *)((uintptr_t)ptr + size), + 0x5a, oldsize - size); + } +#endif + return (ptr); + } } else { assert(size <= arena_maxclass); - if (size > small_maxclass) { - if (arena_ralloc_large(ptr, size, oldsize) == - false) + if (size + extra > small_maxclass) { + if (arena_ralloc_large(ptr, oldsize, size, + extra, zero) == false) return (ptr); } } } - /* - * If we get here, then size and oldsize are different enough that we - * need to move the object. In that case, fall back to allocating new - * space and copying. - */ - ret = arena_malloc(size, false); - if (ret == NULL) - return (NULL); + /* Reallocation would require a move. */ + return (NULL); +} - /* Junk/zero-filling were already done by arena_malloc(). */ +void * +arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero) +{ + void *ret; + size_t copysize; + + /* Try to avoid moving the allocation. */ + ret = arena_ralloc_no_move(ptr, oldsize, size, extra, zero); + if (ret != NULL) + return (ret); + + + /* + * size and oldsize are different enough that we need to move the + * object. In that case, fall back to allocating new space and + * copying. + */ + if (alignment != 0) + ret = ipalloc(size + extra, alignment, zero); + else + ret = arena_malloc(size + extra, zero); + + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, this time without extra. */ + if (alignment != 0) + ret = ipalloc(size, alignment, zero); + else + ret = arena_malloc(size, zero); + + if (ret == NULL) + return (NULL); + } + + /* Junk/zero-filling were already done by ipalloc()/arena_malloc(). */ + + /* + * Copy at most size bytes (not size+extra), since the caller has no + * expectation that the extra bytes will be reliably preserved. + */ copysize = (size < oldsize) ? 
size : oldsize; memcpy(ret, ptr, copysize); idalloc(ptr); return (ret); -IN_PLACE: -#ifdef JEMALLOC_FILL - if (opt_junk && size < oldsize) - memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize - size); - else if (opt_zero && size > oldsize) - memset((void *)((uintptr_t)ptr + oldsize), 0, size - oldsize); -#endif - return (ptr); } bool diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index a0c4162a..95c09678 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -263,13 +263,12 @@ ckh_grow(ckh_t *ckh) lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS; while (true) { lg_curcells++; - tab = (ckhc_t *) ipalloc((ZU(1) << LG_CACHELINE), - sizeof(ckhc_t) << lg_curcells); + tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, + ZU(1) << LG_CACHELINE, true); if (tab == NULL) { ret = true; goto RETURN; } - memset(tab, 0, sizeof(ckhc_t) << lg_curcells); /* Swap in new table. */ ttab = ckh->tab; ckh->tab = tab; @@ -305,8 +304,8 @@ ckh_shrink(ckh_t *ckh) */ lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; - tab = (ckhc_t *)ipalloc((ZU(1) << LG_CACHELINE), - sizeof(ckhc_t) << lg_curcells); + tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, + ZU(1) << LG_CACHELINE, true); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -314,7 +313,6 @@ ckh_shrink(ckh_t *ckh) */ return; } - memset(tab, 0, sizeof(ckhc_t) << lg_curcells); /* Swap in new table. */ ttab = ckh->tab; ckh->tab = tab; @@ -377,13 +375,12 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) ckh->hash = hash; ckh->keycomp = keycomp; - ckh->tab = (ckhc_t *)ipalloc((ZU(1) << LG_CACHELINE), - sizeof(ckhc_t) << lg_mincells); + ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells, + (ZU(1) << LG_CACHELINE), true); if (ckh->tab == NULL) { ret = true; goto RETURN; } - memset(ckh->tab, 0, sizeof(ckhc_t) << lg_mincells); #ifdef JEMALLOC_DEBUG ckh->magic = CKH_MAGIG; diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c index be35d16f..a0351975 100644 --- a/jemalloc/src/huge.c +++ b/jemalloc/src/huge.c @@ -69,12 +69,11 @@ huge_malloc(size_t size, bool zero) /* Only handles large allocations that require more than chunk alignment. */ void * -huge_palloc(size_t alignment, size_t size) +huge_palloc(size_t size, size_t alignment, bool zero) { void *ret; size_t alloc_size, chunk_size, offset; extent_node_t *node; - bool zero; /* * This allocation requires alignment that is even larger than chunk @@ -98,7 +97,6 @@ huge_palloc(size_t alignment, size_t size) if (node == NULL) return (NULL); - zero = false; ret = chunk_alloc(alloc_size, false, &zero); if (ret == NULL) { base_node_dealloc(node); @@ -142,45 +140,80 @@ huge_palloc(size_t alignment, size_t size) malloc_mutex_unlock(&huge_mtx); #ifdef JEMALLOC_FILL - if (opt_junk) - memset(ret, 0xa5, chunk_size); - else if (opt_zero) - memset(ret, 0, chunk_size); + if (zero == false) { + if (opt_junk) + memset(ret, 0xa5, chunk_size); + else if (opt_zero) + memset(ret, 0, chunk_size); + } #endif return (ret); } void * -huge_ralloc(void *ptr, size_t size, size_t oldsize) +huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) { - void *ret; - size_t copysize; - /* Avoid moving the allocation if the size class would not change. */ - if (oldsize > arena_maxclass && - CHUNK_CEILING(size) == CHUNK_CEILING(oldsize)) { + /* + * Avoid moving the allocation if the size class can be left the same. 
+ */ + if (oldsize > arena_maxclass + && CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) + && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { + assert(CHUNK_CEILING(oldsize) == oldsize); #ifdef JEMALLOC_FILL if (opt_junk && size < oldsize) { - memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize - - size); - } else if (opt_zero && size > oldsize) { - memset((void *)((uintptr_t)ptr + oldsize), 0, size - - oldsize); + memset((void *)((uintptr_t)ptr + size), 0x5a, + oldsize - size); } #endif return (ptr); } - /* - * If we get here, then size and oldsize are different enough that we - * need to use a different size class. In that case, fall back to - * allocating new space and copying. - */ - ret = huge_malloc(size, false); - if (ret == NULL) - return (NULL); + /* Reallocation would require a move. */ + return (NULL); +} +void * +huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero) +{ + void *ret; + size_t copysize; + + /* Try to avoid moving the allocation. */ + ret = huge_ralloc_no_move(ptr, oldsize, size, extra); + if (ret != NULL) + return (ret); + + /* + * size and oldsize are different enough that we need to use a + * different size class. In that case, fall back to allocating new + * space and copying. + */ + if (alignment != 0) + ret = huge_palloc(size + extra, alignment, zero); + else + ret = huge_malloc(size + extra, zero); + + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, this time without extra. */ + if (alignment != 0) + ret = huge_palloc(size, alignment, zero); + else + ret = huge_malloc(size, zero); + + if (ret == NULL) + return (NULL); + } + + /* + * Copy at most size bytes (not size+extra), since the caller has no + * expectation that the extra bytes will be reliably preserved. + */ copysize = (size < oldsize) ? 
size : oldsize; memcpy(ret, ptr, copysize); idalloc(ptr); diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index bf4ccc05..be3d7da1 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -854,18 +854,20 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) } else { if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= small_maxclass) { - result = ipalloc(alignment, - small_maxclass+1); + result = ipalloc(small_maxclass+1, + alignment, false); if (result != NULL) { arena_prof_promoted(result, size); } - } else - result = ipalloc(alignment, size); + } else { + result = ipalloc(size, alignment, + false); + } } } else #endif - result = ipalloc(alignment, size); + result = ipalloc(size, alignment, false); } if (result == NULL) { @@ -1023,14 +1025,15 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) } if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= small_maxclass) { - ret = iralloc(ptr, small_maxclass+1); + ret = iralloc(ptr, small_maxclass+1, 0, 0, + false, false); if (ret != NULL) arena_prof_promoted(ret, size); } else - ret = iralloc(ptr, size); + ret = iralloc(ptr, size, 0, 0, false, false); } else #endif - ret = iralloc(ptr, size); + ret = iralloc(ptr, size, 0, 0, false, false); #ifdef JEMALLOC_PROF OOM: @@ -1133,6 +1136,8 @@ JEMALLOC_P(malloc_usable_size)(const void *ptr) { size_t ret; + assert(malloc_initialized || malloc_initializer == pthread_self()); + #ifdef JEMALLOC_IVSALLOC ret = ivsalloc(ptr); #else @@ -1204,6 +1209,184 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); } +JEMALLOC_INLINE void * +iallocm(size_t size, size_t alignment, bool zero) +{ + + if (alignment != 0) + return (ipalloc(size, alignment, zero)); + else if (zero) + return (icalloc(size)); + else + return (imalloc(size)); +} + +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) +{ + void *p; + size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + & (SIZE_T_MAX-1)); + bool zero = flags & ALLOCM_ZERO; +#ifdef JEMALLOC_PROF + prof_thr_cnt_t *cnt; +#endif + + assert(ptr != NULL); + assert(size != 0); + + if (malloc_init()) + goto OOM; + +#ifdef JEMALLOC_PROF + if (opt_prof) { + if ((cnt = prof_alloc_prep(size)) == NULL) + goto OOM; + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= + small_maxclass) { + p = iallocm(small_maxclass+1, alignment, zero); + if (p == NULL) + goto OOM; + arena_prof_promoted(p, size); + } else { + p = iallocm(size, alignment, zero); + if (p == NULL) + goto OOM; + } + } else +#endif + { + p = iallocm(size, alignment, zero); + if (p == NULL) + goto OOM; + } + + *ptr = p; + if (rsize != NULL) + *rsize = isalloc(p); + return (ALLOCM_SUCCESS); +OOM: +#ifdef JEMALLOC_XMALLOC + if (opt_xmalloc) { + malloc_write(": Error in allocm(): " + "out of memory\n"); + abort(); + } +#endif + *ptr = NULL; + return (ALLOCM_ERR_OOM); +} + +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, + int flags) +{ + void *p, *q; + size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + & (SIZE_T_MAX-1)); + bool zero = flags & ALLOCM_ZERO; + bool no_move = flags & ALLOCM_NO_MOVE; +#ifdef JEMALLOC_PROF + size_t old_size; + prof_thr_cnt_t *cnt; + prof_ctx_t *old_ctx; +#endif + + assert(ptr != NULL); + assert(*ptr != NULL); + assert(size != 0); + assert(SIZE_T_MAX - size >= extra); + assert(malloc_initialized || 
malloc_initializer == pthread_self()); + + p = *ptr; +#ifdef JEMALLOC_PROF + if (opt_prof) { + old_size = isalloc(p); + old_ctx = prof_ctx_get(p); + if ((cnt = prof_alloc_prep(size)) == NULL) + goto OOM; + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= + small_maxclass) { + q = iralloc(p, small_maxclass+1, (small_maxclass+1 >= + size+extra) ? 0 : size+extra - (small_maxclass+1), + alignment, zero, no_move); + if (q == NULL) + goto ERR; + arena_prof_promoted(q, size); + } else { + q = iralloc(p, size, extra, alignment, zero, no_move); + if (q == NULL) + goto ERR; + } + prof_realloc(q, cnt, p, old_size, old_ctx); + } else +#endif + { + q = iralloc(p, size, extra, alignment, zero, no_move); + if (q == NULL) + goto ERR; + } + + *ptr = q; + if (rsize != NULL) + *rsize = isalloc(q); + + return (ALLOCM_SUCCESS); +ERR: + if (no_move) + return (ALLOCM_ERR_NOT_MOVED); +#ifdef JEMALLOC_PROF +OOM: +#endif +#ifdef JEMALLOC_XMALLOC + if (opt_xmalloc) { + malloc_write(": Error in rallocm(): " + "out of memory\n"); + abort(); + } +#endif + return (ALLOCM_ERR_OOM); +} + +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) +{ + size_t sz; + + assert(malloc_initialized || malloc_initializer == pthread_self()); + +#ifdef JEMALLOC_IVSALLOC + sz = ivsalloc(ptr); +#else + assert(ptr != NULL); + sz = isalloc(ptr); +#endif + assert(rsize != NULL); + *rsize = sz; + + return (ALLOCM_SUCCESS); +} + +JEMALLOC_ATTR(visibility("default")) +int +JEMALLOC_P(dallocm)(void *ptr, int flags) +{ + + assert(ptr != NULL); + assert(malloc_initialized || malloc_initializer == pthread_self()); + +#ifdef JEMALLOC_PROF + if (opt_prof) + prof_free(ptr); +#endif + idalloc(ptr); + + return (ALLOCM_SUCCESS); +} + /* * End non-standard functions. */ diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index e70b1325..7d596df2 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -90,7 +90,7 @@ prof_sample_state_t prof_sample_state_oom; r = (prof_sample_state_t *)pthread_getspecific( \ prof_sample_state_tsd); \ if (r == NULL) { \ - r = ipalloc(CACHELINE, sizeof(prof_sample_state_t)); \ + r = ipalloc(sizeof(prof_sample_state_t), CACHELINE); \ if (r == NULL) { \ malloc_write(": Error in heap " \ "profiler: out of memory; subsequent heap " \ diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 86343835..550b9c42 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -209,7 +209,9 @@ tcache_create(arena_t *arena) * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. * - * That this works relies on the same logic as in ipalloc(). + * That this works relies on the same logic as in ipalloc(), but we + * cannot directly call ipalloc() here due to tcache bootstrapping + * issues. 
*/ size = (size + CACHELINE_MASK) & (-CACHELINE); diff --git a/jemalloc/test/allocm.c b/jemalloc/test/allocm.c new file mode 100644 index 00000000..4367cb89 --- /dev/null +++ b/jemalloc/test/allocm.c @@ -0,0 +1,133 @@ +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc/jemalloc.h" + +#define CHUNK 0x400000 +/* #define MAXALIGN ((size_t)0x80000000000LLU) */ +#define MAXALIGN ((size_t)0x2000000LLU) +#define NITER 4 + +int +main(void) +{ + int r; + void *p; + size_t sz, alignment, total, tsz; + unsigned i; + void *ps[NITER]; + + fprintf(stderr, "Test begin\n"); + + sz = 0; + r = JEMALLOC_P(allocm)(&p, &sz, 42, 0); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + if (sz < 42) + fprintf(stderr, "Real size smaller than expected\n"); + if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected dallocm() error\n"); + + r = JEMALLOC_P(allocm)(&p, NULL, 42, 0); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected dallocm() error\n"); + + r = JEMALLOC_P(allocm)(&p, NULL, 42, ALLOCM_ZERO); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + if (JEMALLOC_P(dallocm)(p, 0) != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected dallocm() error\n"); + +#if LG_SIZEOF_PTR == 3 + alignment = 0x8000000000000000LLU; + sz = 0x8000000000000000LLU; +#else + alignment = 0x80000000LU; + sz = 0x80000000LU; +#endif + r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment)); + if (r == ALLOCM_SUCCESS) { + fprintf(stderr, + "Expected error for allocm(&p, %zu, 0x%x)\n", + sz, ALLOCM_ALIGN(alignment)); + } + +#if LG_SIZEOF_PTR == 3 + alignment = 0x4000000000000000LLU; + sz = 0x8400000000000001LLU; +#else + alignment = 0x40000000LU; + sz = 0x84000001LU; +#endif + r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment)); + if (r == ALLOCM_SUCCESS) { + fprintf(stderr, + "Expected error for allocm(&p, %zu, 0x%x)\n", + sz, ALLOCM_ALIGN(alignment)); + } + + alignment = 0x10LLU; +#if LG_SIZEOF_PTR == 3 + sz = 0xfffffffffffffff0LLU; +#else + sz = 0xfffffff0LU; +#endif + r = JEMALLOC_P(allocm)(&p, NULL, sz, ALLOCM_ALIGN(alignment)); + if (r == ALLOCM_SUCCESS) { + fprintf(stderr, + "Expected error for allocm(&p, %zu, 0x%x)\n", + sz, ALLOCM_ALIGN(alignment)); + } + + for (i = 0; i < NITER; i++) + ps[i] = NULL; + + for (alignment = 8; + alignment <= MAXALIGN; + alignment <<= 1) { + total = 0; + fprintf(stderr, "Alignment: %zu\n", alignment); + for (sz = 1; + sz < 3 * alignment && sz < (1U << 31); + sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) { + for (i = 0; i < NITER; i++) { + r = JEMALLOC_P(allocm)(&ps[i], NULL, sz, + ALLOCM_ALIGN(alignment) | ALLOCM_ZERO); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, + "Error for size %zu (0x%zx): %d\n", + sz, sz, r); + exit(1); + } + if ((uintptr_t)p & (alignment-1)) { + fprintf(stderr, + "%p inadequately aligned for" + " alignment: %zu\n", p, alignment); + } + JEMALLOC_P(sallocm)(ps[i], &tsz, 0); + total += tsz; + if (total >= (MAXALIGN << 1)) + break; + } + for (i = 0; i < NITER; i++) { + if (ps[i] != NULL) { + JEMALLOC_P(dallocm)(ps[i], 0); + ps[i] = NULL; + } + } + } + } + + fprintf(stderr, "Test end\n"); + return (0); +} diff --git a/jemalloc/test/allocm.exp b/jemalloc/test/allocm.exp new file mode 100644 index 00000000..b5061c72 --- /dev/null +++ b/jemalloc/test/allocm.exp @@ -0,0 +1,25 @@ +Test begin +Alignment: 8 +Alignment: 16 
+Alignment: 32 +Alignment: 64 +Alignment: 128 +Alignment: 256 +Alignment: 512 +Alignment: 1024 +Alignment: 2048 +Alignment: 4096 +Alignment: 8192 +Alignment: 16384 +Alignment: 32768 +Alignment: 65536 +Alignment: 131072 +Alignment: 262144 +Alignment: 524288 +Alignment: 1048576 +Alignment: 2097152 +Alignment: 4194304 +Alignment: 8388608 +Alignment: 16777216 +Alignment: 33554432 +Test end diff --git a/jemalloc/test/posix_memalign.c b/jemalloc/test/posix_memalign.c index cd3cadc9..c5651d7c 100644 --- a/jemalloc/test/posix_memalign.c +++ b/jemalloc/test/posix_memalign.c @@ -7,7 +7,7 @@ #define JEMALLOC_MANGLE #include "jemalloc/jemalloc.h" -#define CHUNK 0x100000 +#define CHUNK 0x400000 /* #define MAXALIGN ((size_t)0x80000000000LLU) */ #define MAXALIGN ((size_t)0x2000000LLU) #define NITER 4 @@ -117,6 +117,5 @@ main(void) } fprintf(stderr, "Test end\n"); - - return 0; + return (0); } diff --git a/jemalloc/test/rallocm.c b/jemalloc/test/rallocm.c new file mode 100644 index 00000000..b52bdb20 --- /dev/null +++ b/jemalloc/test/rallocm.c @@ -0,0 +1,117 @@ +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc/jemalloc.h" + +int +main(void) +{ + void *p, *q; + size_t sz, tsz; + int r; + + fprintf(stderr, "Test begin\n"); + + r = allocm(&p, &sz, 42, 0); + if (r != ALLOCM_SUCCESS) { + fprintf(stderr, "Unexpected allocm() error\n"); + abort(); + } + + q = p; + r = rallocm(&q, &tsz, sz, 0, ALLOCM_NO_MOVE); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (q != p) + fprintf(stderr, "Unexpected object move\n"); + if (tsz != sz) { + fprintf(stderr, "Unexpected size change: %zu --> %zu\n", + sz, tsz); + } + + q = p; + r = rallocm(&q, &tsz, sz, 5, ALLOCM_NO_MOVE); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (q != p) + fprintf(stderr, "Unexpected object move\n"); + if (tsz != sz) { + fprintf(stderr, "Unexpected size change: %zu --> %zu\n", + sz, tsz); + } + + q = p; + r = rallocm(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE); + if (r != ALLOCM_ERR_NOT_MOVED) + fprintf(stderr, "Unexpected rallocm() result\n"); + if (q != p) + fprintf(stderr, "Unexpected object move\n"); + if (tsz != sz) { + fprintf(stderr, "Unexpected size change: %zu --> %zu\n", + sz, tsz); + } + + q = p; + r = rallocm(&q, &tsz, sz + 5, 0, 0); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (q == p) + fprintf(stderr, "Expected object move\n"); + if (tsz == sz) { + fprintf(stderr, "Expected size change: %zu --> %zu\n", + sz, tsz); + } + p = q; + sz = tsz; + + r = rallocm(&q, &tsz, 8192, 0, 0); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (q == p) + fprintf(stderr, "Expected object move\n"); + if (tsz == sz) { + fprintf(stderr, "Expected size change: %zu --> %zu\n", + sz, tsz); + } + p = q; + sz = tsz; + + r = rallocm(&q, &tsz, 16384, 0, 0); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (tsz == sz) { + fprintf(stderr, "Expected size change: %zu --> %zu\n", + sz, tsz); + } + p = q; + sz = tsz; + + r = rallocm(&q, &tsz, 8192, 0, ALLOCM_NO_MOVE); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (q != p) + fprintf(stderr, "Unexpected object move\n"); + if (tsz == sz) { + fprintf(stderr, "Expected size change: %zu --> %zu\n", + sz, tsz); + } + sz = tsz; + + r = rallocm(&q, &tsz, 16384, 0, ALLOCM_NO_MOVE); + if (r != ALLOCM_SUCCESS) + fprintf(stderr, "Unexpected rallocm() error\n"); + if (q != p) + fprintf(stderr, "Unexpected 
object move\n"); + if (tsz == sz) { + fprintf(stderr, "Expected size change: %zu --> %zu\n", + sz, tsz); + } + sz = tsz; + + dallocm(p, 0); + + fprintf(stderr, "Test end\n"); + return (0); +} diff --git a/jemalloc/test/rallocm.exp b/jemalloc/test/rallocm.exp new file mode 100644 index 00000000..369a88dd --- /dev/null +++ b/jemalloc/test/rallocm.exp @@ -0,0 +1,2 @@ +Test begin +Test end From a094babe33ace75a8f0daed2cfd765ae0d05f57e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 17 Sep 2010 17:35:42 -0700 Subject: [PATCH 08/48] Add gcc attributes for *allocm() prototypes. --- jemalloc/include/jemalloc/jemalloc.h.in | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in index c28fb6d3..aad4c3db 100644 --- a/jemalloc/include/jemalloc/jemalloc.h.in +++ b/jemalloc/include/jemalloc/jemalloc.h.in @@ -51,11 +51,13 @@ int JEMALLOC_P(mallctlnametomib)(const char *name, size_t *mibp, int JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); -int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags); +int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) + JEMALLOC_ATTR(nonnull(1)); int JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, - size_t extra, int flags); -int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags); -int JEMALLOC_P(dallocm)(void *ptr, int flags); + size_t extra, int flags) JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) + JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(dallocm)(void *ptr, int flags) JEMALLOC_ATTR(nonnull(1)); #ifdef __cplusplus }; From 79d660d35d30f60ccd41474cff20f00f215f767a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 17 Sep 2010 17:38:24 -0700 Subject: [PATCH 09/48] Store full git GID in VERSION. --- jemalloc/configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 21f502c6..b0d470a0 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -637,7 +637,7 @@ dnl dnl Set VERSION if source directory has an embedded git repository. if test -d "${srcroot}../.git" ; then - git describe --long > ${srcroot}VERSION + git describe --long --abbrev=40 > ${srcroot}VERSION fi jemalloc_version=`cat ${srcroot}VERSION` jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` From 28177d466fbbfd10927b4d05655c2ec6711faa1b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 20 Sep 2010 11:24:24 -0700 Subject: [PATCH 10/48] Remove bad assertions in malloc_{pre,post}fork(). Remove assertions that malloc_{pre,post}fork() are only called if threading is enabled. This was true of these functions in the context of FreeBSD's libc, but now the functions are called unconditionally as a result of registering them with pthread_atfork(). --- jemalloc/src/jemalloc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index be3d7da1..40c3a637 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1394,9 +1394,7 @@ JEMALLOC_P(dallocm)(void *ptr, int flags) /* * The following functions are used by threading libraries for protection of - * malloc during fork(). These functions are only called if the program is - * running in threaded mode, so there is no need to check whether the program - * is threaded here. 
+ * malloc during fork(). */ void @@ -1404,8 +1402,6 @@ jemalloc_prefork(void) { unsigned i; - assert(isthreaded); - /* Acquire all mutexes in a safe order. */ malloc_mutex_lock(&arenas_lock); @@ -1432,8 +1428,6 @@ jemalloc_postfork(void) { unsigned i; - assert(isthreaded); - /* Release all mutexes, now that fork() has completed. */ #ifdef JEMALLOC_SWAP From a09f55c87d799e6e0e64972838ca6768027e9174 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 20 Sep 2010 16:05:41 -0700 Subject: [PATCH 11/48] Wrap strerror_r(). Create the buferror() function, which wraps strerror_r(). This is necessary because glibc provides a non-standard strerror_r(). --- .../jemalloc/internal/jemalloc_internal.h.in | 5 +++-- jemalloc/src/chunk_mmap.c | 11 ++++++----- jemalloc/src/chunk_swap.c | 7 ++++--- jemalloc/src/jemalloc.c | 19 +++++++++++++++++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 04bc56fa..514ef5c2 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -101,8 +101,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); # define JEMALLOC_INLINE static inline #endif -/* Size of stack-allocated buffer passed to strerror_r(). */ -#define STRERROR_BUF 64 +/* Size of stack-allocated buffer passed to buferror(). */ +#define BUFERROR_BUF 64 /* Minimum alignment of allocations is 2^LG_QUANTUM bytes. */ #ifdef __i386__ @@ -289,6 +289,7 @@ extern unsigned narenas; arena_t *arenas_extend(unsigned ind); arena_t *choose_arena_hard(void); +int buferror(int errnum, char *buf, size_t buflen); void jemalloc_prefork(void); void jemalloc_postfork(void); diff --git a/jemalloc/src/chunk_mmap.c b/jemalloc/src/chunk_mmap.c index a3d09e9a..bc367559 100644 --- a/jemalloc/src/chunk_mmap.c +++ b/jemalloc/src/chunk_mmap.c @@ -28,7 +28,8 @@ static pthread_key_t mmap_unaligned_tsd; static void *pages_map(void *addr, size_t size, bool noreserve); static void pages_unmap(void *addr, size_t size); -static void *chunk_alloc_mmap_slow(size_t size, bool unaligned, bool noreserve); +static void *chunk_alloc_mmap_slow(size_t size, bool unaligned, + bool noreserve); static void *chunk_alloc_mmap_internal(size_t size, bool noreserve); /******************************************************************************/ @@ -57,9 +58,9 @@ pages_map(void *addr, size_t size, bool noreserve) * We succeeded in mapping memory, but not in the right place. 
*/ if (munmap(ret, size) == -1) { - char buf[STRERROR_BUF]; + char buf[BUFERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); + buferror(errno, buf, sizeof(buf)); malloc_write(": Error in munmap(): "); malloc_write(buf); malloc_write("\n"); @@ -79,9 +80,9 @@ pages_unmap(void *addr, size_t size) { if (munmap(addr, size) == -1) { - char buf[STRERROR_BUF]; + char buf[BUFERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); + buferror(errno, buf, sizeof(buf)); malloc_write(": Error in munmap(): "); malloc_write(buf); malloc_write("\n"); diff --git a/jemalloc/src/chunk_swap.c b/jemalloc/src/chunk_swap.c index ed9e414d..ee038ba9 100644 --- a/jemalloc/src/chunk_swap.c +++ b/jemalloc/src/chunk_swap.c @@ -294,9 +294,10 @@ chunk_swap_enable(const int *fds, unsigned nfds, bool prezeroed) void *addr = mmap((void *)((uintptr_t)vaddr + voff), sizes[i], PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fds[i], 0); if (addr == MAP_FAILED) { - char buf[STRERROR_BUF]; + char buf[BUFERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); + + buferror(errno, buf, sizeof(buf)); malloc_write( ": Error in mmap(..., MAP_FIXED, ...): "); malloc_write(buf); @@ -304,7 +305,7 @@ chunk_swap_enable(const int *fds, unsigned nfds, bool prezeroed) if (opt_abort) abort(); if (munmap(vaddr, voff) == -1) { - strerror_r(errno, buf, sizeof(buf)); + buferror(errno, buf, sizeof(buf)); malloc_write(": Error in munmap(): "); malloc_write(buf); malloc_write("\n"); diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 40c3a637..0b60ce60 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -139,6 +139,25 @@ choose_arena_hard(void) return (ret); } +/* + * glibc provides a non-standard strerror_r() when _GNU_SOURCE is defined, so + * provide a wrapper. + */ +int +buferror(int errnum, char *buf, size_t buflen) +{ +#ifdef _GNU_SOURCE + char *b = strerror_r(errno, buf, buflen); + if (b != buf) { + strncpy(buf, b, buflen); + buf[buflen-1] = '\0'; + } + return (0); +#else + return (strerror_r(errno, buf, buflen)); +#endif +} + static void stats_print_atexit(void) { From 6a0d2918ceede07a36a50389c1a8a95755b0a7f6 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 20 Sep 2010 16:44:23 -0700 Subject: [PATCH 12/48] Add memalign() and valloc() overrides. If memalign() and/or valloc() are present on the system, override them in order to avoid mixed allocator usage. --- jemalloc/configure.ac | 8 ++++ jemalloc/include/jemalloc/jemalloc_defs.h.in | 7 ++++ jemalloc/src/jemalloc.c | 43 ++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index b0d470a0..127b6951 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -710,6 +710,14 @@ if test "x${enable_tls}" = "x0" ; then AC_DEFINE_UNQUOTED([NO_TLS], [ ]) fi +dnl ============================================================================ +dnl Check for allocator-related functions that should be wrapped. + +AC_CHECK_FUNC([memalign], + [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN])]) +AC_CHECK_FUNC([valloc], + [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC])]) + dnl ============================================================================ dnl Darwin-related configuration. 
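For callers, the practical effect of these checks is that the legacy alignment interfaces resolve to jemalloc instead of the system allocator, so the resulting pointers can be passed to free() without mixing allocators. A minimal caller sketch, assuming glibc-style headers (the overrides are compiled only when memalign()/valloc() exist on the system and no JEMALLOC_PREFIX is in use):

    #include <stdlib.h>  /* free(); valloc() on many systems (platform assumption). */
    #include <malloc.h>  /* memalign() on glibc (platform assumption). */

    int
    main(void)
    {
        void *a = memalign(64, 1024);  /* 64-byte-aligned allocation. */
        void *b = valloc(4096);        /* Page-aligned allocation. */

        free(a);  /* Same allocator throughout, so free() is safe. */
        free(b);
        return (0);
    }
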
diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index eed33a64..6bdcbaa4 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -98,6 +98,13 @@ */ #undef JEMALLOC_IVSALLOC +/* + * Define overrides for non-standard allocator-related functions if they + * are present on the system. + */ +#undef JEMALLOC_OVERRIDE_MEMALIGN +#undef JEMALLOC_OVERRIDE_VALLOC + /* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 0b60ce60..3f115d89 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1145,6 +1145,45 @@ JEMALLOC_P(free)(void *ptr) * End malloc(3)-compatible functions. */ /******************************************************************************/ +/* + * Begin non-standard override functions. + * + * These overrides are omitted if the JEMALLOC_PREFIX is defined, since the + * entire point is to avoid accidental mixed allocator usage. + */ +#ifndef JEMALLOC_PREFIX + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_ATTR(malloc) +JEMALLOC_ATTR(visibility("default")) +void * +JEMALLOC_P(memalign)(size_t alignment, size_t size) +{ + void *ret; + + posix_memalign(&ret, alignment, size); + return (ret); +} +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_ATTR(malloc) +JEMALLOC_ATTR(visibility("default")) +void * +JEMALLOC_P(valloc)(size_t size) +{ + void *ret; + + posix_memalign(&ret, PAGE_SIZE, size); + return (ret); +} +#endif + +#endif /* JEMALLOC_PREFIX */ +/* + * End non-standard override functions. + */ +/******************************************************************************/ /* * Begin non-standard functions. */ @@ -1240,6 +1279,7 @@ iallocm(size_t size, size_t alignment, bool zero) return (imalloc(size)); } +JEMALLOC_ATTR(nonnull(1)) JEMALLOC_ATTR(visibility("default")) int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) @@ -1297,6 +1337,7 @@ OOM: return (ALLOCM_ERR_OOM); } +JEMALLOC_ATTR(nonnull(1)) JEMALLOC_ATTR(visibility("default")) int JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, @@ -1369,6 +1410,7 @@ OOM: return (ALLOCM_ERR_OOM); } +JEMALLOC_ATTR(nonnull(1)) JEMALLOC_ATTR(visibility("default")) int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) @@ -1389,6 +1431,7 @@ JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) return (ALLOCM_SUCCESS); } +JEMALLOC_ATTR(nonnull(1)) JEMALLOC_ATTR(visibility("default")) int JEMALLOC_P(dallocm)(void *ptr, int flags) From 355b438c854227bbf8185cb7a3ce247d271a842e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 20 Sep 2010 19:20:48 -0700 Subject: [PATCH 13/48] Fix compiler warnings. Add --enable-cc-silence, which can be used to silence harmless warnings. Fix an aliasing bug in ckh_pointer_hash(). 
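The ckh_pointer_hash() change copies the key's bits into a fixed-width integer through a union before hashing, so the hash input has a well-defined type and width rather than being read through a cast. A standalone sketch of the idiom (illustration only, not the jemalloc code itself):

    #include <stdint.h>

    static uint64_t
    ptr_bits(const void *key)
    {
        union {
            const void *v;
            uint64_t i;
        } u;

        u.i = 0;          /* Leaves defined upper bytes where pointers are narrower. */
        u.v = key;
        return (u.i);     /* These bytes are what gets fed to the hash function. */
    }

On LP64 targets the hashed bytes are identical to what hashing the pointer variable directly produced, so the computed hash is unchanged there.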
--- jemalloc/INSTALL | 5 ++ jemalloc/configure.ac | 18 +++++ .../jemalloc/internal/jemalloc_internal.h.in | 30 ++++----- jemalloc/include/jemalloc/jemalloc_defs.h.in | 3 + jemalloc/src/ckh.c | 13 +++- jemalloc/src/jemalloc.c | 66 +++++++++++++++---- jemalloc/src/prof.c | 6 +- jemalloc/test/thread_arena.c | 15 ++++- 8 files changed, 124 insertions(+), 32 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index a8957436..1bf51588 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -40,6 +40,11 @@ any of the following arguments (not a definitive list) to 'configure': versions of jemalloc can coexist in the same installation directory. For example, libjemalloc.so.0 becomes libjemalloc.so.0. +--enable-cc-silence + Enable code that silences unuseful compiler warnings. This is helpful when + trying to tell serious warnings from those due to compiler limitations, but + it potentially incurs a performance penalty. + --enable-debug Enable assertions and validation code. This incurs a substantial performance hit, but is very useful during application development. diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 127b6951..7d0561aa 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -289,6 +289,23 @@ cfghdrs_out="include/jemalloc/jemalloc_defs${install_suffix}.h" cfghdrs_tup="include/jemalloc/jemalloc_defs${install_suffix}.h:include/jemalloc/jemalloc_defs.h.in" +dnl Do not silence irrelevant compiler warnings by default, since enabling this +dnl option incurs a performance penalty. +AC_ARG_ENABLE([cc-silence], + [AS_HELP_STRING([--enable-cc-silence], + [Silence irrelevant compiler warnings])], +[if test "x$enable_cc_silence" = "xno" ; then + enable_cc_silence="0" +else + enable_cc_silence="1" +fi +], +[enable_cc_silence="0"] +) +if test "x$enable_cc_silence" = "x1" ; then + AC_DEFINE([JEMALLOC_CC_SILENCE]) +fi + dnl Do not compile with debugging by default. 
AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Build debugging code])], @@ -807,6 +824,7 @@ AC_MSG_RESULT([]) AC_MSG_RESULT([JEMALLOC_PREFIX : ${JEMALLOC_PREFIX}]) AC_MSG_RESULT([install_suffix : ${install_suffix}]) AC_MSG_RESULT([autogen : ${enable_autogen}]) +AC_MSG_RESULT([cc-silence : ${enable_cc_silence}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) AC_MSG_RESULT([prof : ${enable_prof}]) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 514ef5c2..2320e3e5 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -399,9 +399,9 @@ size_t isalloc(const void *ptr); # ifdef JEMALLOC_IVSALLOC size_t ivsalloc(const void *ptr); # endif +void idalloc(void *ptr); void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, bool no_move); -void idalloc(void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) @@ -559,6 +559,20 @@ ivsalloc(const void *ptr) } #endif +JEMALLOC_INLINE void +idalloc(void *ptr) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) + arena_dalloc(chunk->arena, chunk, ptr); + else + huge_dalloc(ptr); +} + JEMALLOC_INLINE void * iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, bool no_move) @@ -619,20 +633,6 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, } } } - -JEMALLOC_INLINE void -idalloc(void *ptr) -{ - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) - arena_dalloc(chunk->arena, chunk, ptr); - else - huge_dalloc(ptr); -} #endif #undef JEMALLOC_H_INLINES diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 6bdcbaa4..fe35170c 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -31,6 +31,9 @@ # define JEMALLOC_ATTR(s) #endif +/* JEMALLOC_CC_SILENCE enables code that silences unuseful compiler warnings. */ +#undef JEMALLOC_CC_SILENCE + /* * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables * inline functions. 
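The new JEMALLOC_CC_SILENCE define is consumed by later hunks as a guard around otherwise-dead initializers. A generic sketch of the idiom, assuming a local that gcc cannot prove is initialized on every path that reads it (this function is illustrative, not part of the patch):

    #include <stdio.h>

    static void
    example(int flag)
    {
        int x
    #ifdef JEMALLOC_CC_SILENCE
            = 0
    #endif
            ;

        if (flag)
            x = 42;
        if (flag)
            printf("%d\n", x);  /* Some gcc versions warn: may be used uninitialized. */
    }

Default builds omit the store entirely, so the silencing cost is only paid when --enable-cc-silence is given.
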
diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index 95c09678..682a8db6 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -567,12 +567,21 @@ ckh_pointer_hash(const void *key, unsigned minbits, size_t *hash1, { size_t ret1, ret2; uint64_t h; + union { + const void *v; + uint64_t i; + } u; assert(minbits <= 32 || (SIZEOF_PTR == 8 && minbits <= 64)); assert(hash1 != NULL); assert(hash2 != NULL); - h = hash(&key, sizeof(void *), 0xd983396e68886082LLU); + assert(sizeof(u.v) == sizeof(u.i)); +#if (LG_SIZEOF_PTR != LG_SIZEOF_INT) + u.i = 0; +#endif + u.v = key; + h = hash(&u.i, sizeof(u.i), 0xd983396e68886082LLU); if (minbits <= 32) { /* * Avoid doing multiple hashes, since a single hash provides @@ -583,7 +592,7 @@ ckh_pointer_hash(const void *key, unsigned minbits, size_t *hash1, } else { assert(SIZEOF_PTR == 8); ret1 = h; - ret2 = hash(&key, sizeof(void *), 0x5e2be9aff8709a5dLLU); + ret2 = hash(&u.i, sizeof(u.i), 0x5e2be9aff8709a5dLLU); } *hash1 = ret1; diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 3f115d89..f252f591 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -76,8 +76,14 @@ static void wrtmessage(void *cbopaque, const char *s) { - - write(STDERR_FILENO, s, strlen(s)); +#ifdef JEMALLOC_CC_SILENCE + int result = +#endif + write(STDERR_FILENO, s, strlen(s)); +#ifdef JEMALLOC_CC_SILENCE + if (result < 0) + result = errno; +#endif } void (*JEMALLOC_P(malloc_message))(void *, const char *s) @@ -746,7 +752,11 @@ JEMALLOC_P(malloc)(size_t size) { void *ret; #ifdef JEMALLOC_PROF - prof_thr_cnt_t *cnt; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (malloc_init()) { @@ -821,7 +831,11 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) int ret; void *result; #ifdef JEMALLOC_PROF - prof_thr_cnt_t *cnt; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (malloc_init()) @@ -920,7 +934,11 @@ JEMALLOC_P(calloc)(size_t num, size_t size) void *ret; size_t num_size; #ifdef JEMALLOC_PROF - prof_thr_cnt_t *cnt; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (malloc_init()) { @@ -995,9 +1013,21 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) { void *ret; #ifdef JEMALLOC_PROF - size_t old_size; - prof_thr_cnt_t *cnt; - prof_ctx_t *old_ctx; + size_t old_size +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; + prof_thr_cnt_t *cnt +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; + prof_ctx_t *old_ctx +# ifdef JEMALLOC_CC_SILENCE + = NULL +# endif + ; #endif if (size == 0) { @@ -1160,8 +1190,14 @@ void * JEMALLOC_P(memalign)(size_t alignment, size_t size) { void *ret; - - posix_memalign(&ret, alignment, size); +#ifdef JEMALLOC_CC_SILENCE + int result = +#endif + JEMALLOC_P(posix_memalign)(&ret, alignment, size); +#ifdef JEMALLOC_CC_SILENCE + if (result != 0) + return (NULL); +#endif return (ret); } #endif @@ -1173,8 +1209,14 @@ void * JEMALLOC_P(valloc)(size_t size) { void *ret; - - posix_memalign(&ret, PAGE_SIZE, size); +#ifdef JEMALLOC_CC_SILENCE + int result = +#endif + JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size); +#ifdef JEMALLOC_CC_SILENCE + if (result != 0) + return (NULL); +#endif return (ret); } #endif diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 7d596df2..a414bb9f 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -737,7 +737,11 @@ void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) { - size_t size; + size_t size +#ifdef 
JEMALLOC_CC_SILENCE + = 0 +#endif + ; prof_thr_cnt_t *told_cnt; assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); diff --git a/jemalloc/test/thread_arena.c b/jemalloc/test/thread_arena.c index d52435fa..5b1058b5 100644 --- a/jemalloc/test/thread_arena.c +++ b/jemalloc/test/thread_arena.c @@ -10,11 +10,16 @@ void * thread_start(void *arg) { unsigned main_arena_ind = *(unsigned *)arg; + void *p; unsigned arena_ind; size_t size; int err; - JEMALLOC_P(malloc)(1); + p = JEMALLOC_P(malloc)(1); + if (p == NULL) { + fprintf(stderr, "%s(): Error in malloc()\n", __func__); + return (void *)1; + } size = sizeof(arena_ind); if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, @@ -31,6 +36,7 @@ int main(void) { int ret = 0; + void *p; unsigned arena_ind; size_t size; int err; @@ -38,7 +44,12 @@ main(void) fprintf(stderr, "Test begin\n"); - JEMALLOC_P(malloc)(1); + p = JEMALLOC_P(malloc)(1); + if (p == NULL) { + fprintf(stderr, "%s(): Error in malloc()\n", __func__); + ret = 1; + goto RETURN; + } size = sizeof(arena_ind); if ((err = JEMALLOC_P(mallctl)("thread.arena", &arena_ind, &size, NULL, From 075e77cad47cc45491df62f555e665a4401608fd Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 20 Sep 2010 19:53:25 -0700 Subject: [PATCH 14/48] Fix compiler warnings and errors. Use INT_MAX instead of MAX_INT in ALLOCM_ALIGN(), and #include in order to get its definition. Modify prof code related to hash tables to avoid aliasing warnings from gcc 4.1.2 (gcc 4.4.0 and 4.4.3 do not warn). --- jemalloc/include/jemalloc/jemalloc.h.in | 3 +- jemalloc/src/prof.c | 116 ++++++++++++++---------- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in index aad4c3db..e3983078 100644 --- a/jemalloc/include/jemalloc/jemalloc.h.in +++ b/jemalloc/include/jemalloc/jemalloc.h.in @@ -4,6 +4,7 @@ extern "C" { #endif +#include #include #define JEMALLOC_VERSION "@jemalloc_version@" @@ -22,7 +23,7 @@ extern "C" { #if LG_SIZEOF_PTR == 2 #define ALLOCM_ALIGN(a) (ffs(a)-1) #else -#define ALLOCM_ALIGN(a) ((a < (size_t)MAX_INT) ? ffs(a)-1 : ffs(a>>32)+31) +#define ALLOCM_ALIGN(a) ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31) #endif #define ALLOCM_ZERO ((int)0x40) #define ALLOCM_NO_MOVE ((int)0x80) diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index a414bb9f..8d13451c 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -483,7 +483,10 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) static prof_thr_cnt_t * prof_lookup(prof_bt_t *bt) { - prof_thr_cnt_t *ret; + union { + prof_thr_cnt_t *p; + void *v; + } ret; ckh_t *bt2cnt = BT2CNT_GET(); if (bt2cnt == NULL) { @@ -500,66 +503,72 @@ prof_lookup(prof_bt_t *bt) BT2CNT_SET(bt2cnt); } - if (ckh_search(bt2cnt, bt, NULL, (void **)&ret)) { - prof_bt_t *btkey; - prof_ctx_t *ctx; + if (ckh_search(bt2cnt, bt, NULL, &ret.v)) { + union { + prof_bt_t *p; + void *v; + } btkey; + union { + prof_ctx_t *p; + void *v; + } ctx; /* * This thread's cache lacks bt. Look for it in the global * cache. */ prof_enter(); - if (ckh_search(&bt2ctx, bt, (void **)&btkey, (void **)&ctx)) { + if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { /* bt has never been seen before. Insert it. 
*/ - ctx = (prof_ctx_t *)imalloc(sizeof(prof_ctx_t)); - if (ctx == NULL) { + ctx.v = imalloc(sizeof(prof_ctx_t)); + if (ctx.v == NULL) { prof_leave(); return (NULL); } - btkey = bt_dup(bt); - if (btkey == NULL) { + btkey.p = bt_dup(bt); + if (btkey.v == NULL) { prof_leave(); - idalloc(ctx); + idalloc(ctx.v); return (NULL); } - ctx->bt = btkey; - if (malloc_mutex_init(&ctx->lock)) { + ctx.p->bt = btkey.p; + if (malloc_mutex_init(&ctx.p->lock)) { prof_leave(); - idalloc(btkey); - idalloc(ctx); + idalloc(btkey.v); + idalloc(ctx.v); return (NULL); } - memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); - ql_new(&ctx->cnts_ql); - if (ckh_insert(&bt2ctx, btkey, ctx)) { + memset(&ctx.p->cnt_merged, 0, sizeof(prof_cnt_t)); + ql_new(&ctx.p->cnts_ql); + if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { /* OOM. */ prof_leave(); - idalloc(btkey); - idalloc(ctx); + idalloc(btkey.v); + idalloc(ctx.v); return (NULL); } } prof_leave(); /* Link a prof_thd_cnt_t into ctx for this thread. */ - ret = (prof_thr_cnt_t *)imalloc(sizeof(prof_thr_cnt_t)); - if (ret == NULL) + ret.v = imalloc(sizeof(prof_thr_cnt_t)); + if (ret.p == NULL) return (NULL); - ql_elm_new(ret, link); - ret->ctx = ctx; - ret->epoch = 0; - memset(&ret->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(bt2cnt, btkey, ret)) { - idalloc(ret); + ql_elm_new(ret.p, link); + ret.p->ctx = ctx.p; + ret.p->epoch = 0; + memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); + if (ckh_insert(bt2cnt, btkey.v, ret.v)) { + idalloc(ret.v); return (NULL); } - malloc_mutex_lock(&ctx->lock); - ql_tail_insert(&ctx->cnts_ql, ret, link); - malloc_mutex_unlock(&ctx->lock); + malloc_mutex_lock(&ctx.p->lock); + ql_tail_insert(&ctx.p->cnts_ql, ret.p, link); + malloc_mutex_unlock(&ctx.p->lock); } - return (ret); + return (ret.p); } static inline void @@ -1022,8 +1031,14 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) { prof_cnt_t cnt_all; size_t tabind; - prof_bt_t *bt; - prof_ctx_t *ctx; + union { + prof_bt_t *p; + void *v; + } bt; + union { + prof_ctx_t *p; + void *v; + } ctx; char buf[UMAX2S_BUFSIZE]; size_t leak_nctx; @@ -1043,9 +1058,9 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) /* Merge per thread profile stats, and sum them in cnt_all. */ memset(&cnt_all, 0, sizeof(prof_cnt_t)); leak_nctx = 0; - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, (void **)&ctx) + for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) { - prof_ctx_merge(ctx, &cnt_all, &leak_nctx); + prof_ctx_merge(ctx.p, &cnt_all, &leak_nctx); } /* Dump profile header. */ @@ -1071,9 +1086,9 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) } /* Dump per ctx profile stats. */ - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, (void **)&bt, (void **)&ctx) + for (tabind = 0; ckh_iter(&bt2ctx, &tabind, &bt.v, &ctx.v) == false;) { - if (prof_dump_ctx(ctx, bt, propagate_err)) + if (prof_dump_ctx(ctx.p, bt.p, propagate_err)) goto ERROR; } @@ -1312,28 +1327,31 @@ bt2cnt_thread_cleanup(void *arg) if (bt2cnt != NULL) { ql_head(prof_thr_cnt_t) cnts_ql; size_t tabind; - prof_thr_cnt_t *cnt; + union { + prof_thr_cnt_t *p; + void *v; + } cnt; /* Iteratively merge cnt's into the global stats. */ ql_new(&cnts_ql); tabind = 0; - while (ckh_iter(bt2cnt, &tabind, NULL, (void **)&cnt) == + while (ckh_iter(bt2cnt, &tabind, NULL, &cnt.v) == false) { - prof_ctx_t *ctx = cnt->ctx; + prof_ctx_t *ctx = cnt.p->ctx; /* Merge stats and detach from ctx. 
*/ malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs += cnt->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - ql_remove(&ctx->cnts_ql, cnt, link); + ctx->cnt_merged.curobjs += cnt.p->cnts.curobjs; + ctx->cnt_merged.curbytes += cnt.p->cnts.curbytes; + ctx->cnt_merged.accumobjs += cnt.p->cnts.accumobjs; + ctx->cnt_merged.accumbytes += cnt.p->cnts.accumbytes; + ql_remove(&ctx->cnts_ql, cnt.p, link); malloc_mutex_unlock(&ctx->lock); /* * Stash cnt for deletion after finishing with * ckh_iter(). */ - ql_tail_insert(&cnts_ql, cnt, link); + ql_tail_insert(&cnts_ql, cnt.p, link); } /* @@ -1345,9 +1363,9 @@ bt2cnt_thread_cleanup(void *arg) BT2CNT_SET(NULL); /* Delete cnt's. */ - while ((cnt = ql_last(&cnts_ql, link)) != NULL) { - ql_remove(&cnts_ql, cnt, link); - idalloc(cnt); + while ((cnt.p = ql_last(&cnts_ql, link)) != NULL) { + ql_remove(&cnts_ql, cnt.p, link); + idalloc(cnt.v); } } } From 6005f0710cf07d60659d91b20b7ff5592d310027 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 30 Sep 2010 16:55:08 -0700 Subject: [PATCH 15/48] Add the "arenas.purge" mallctl. --- jemalloc/doc/jemalloc.3.in | 8 ++++- jemalloc/include/jemalloc/internal/arena.h | 7 ++-- jemalloc/src/arena.c | 24 +++++++++---- jemalloc/src/ctl.c | 41 +++++++++++++++++++++- 4 files changed, 68 insertions(+), 12 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 9dc4b250..f32c4bff 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,7 +38,7 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd September 17, 2010 +.Dd September 30, 2010 .Dt JEMALLOC 3 .Os .Sh NAME @@ -1185,6 +1185,12 @@ Total number of large size classes. Maximum size supported by this large size class. .Ed .\"----------------------------------------------------------------------------- +.It Sy "arenas.purge (unsigned) -w" +.Bd -ragged -offset indent -compact +Purge unused dirty pages for the specified arena, or for all arenas if none is +specified. +.Ed +.\"----------------------------------------------------------------------------- @roff_prof@.It Sy "prof.active (bool) rw" @roff_prof@.Bd -ragged -offset indent -compact @roff_prof@Control whether sampling is currently active. 
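A usage sketch for the new mallctl, matching the semantics of arenas_purge_ctl() below (the helper names are illustrative and error handling is omitted):

    #define JEMALLOC_MANGLE
    #include "jemalloc/jemalloc.h"

    /* Purge unused dirty pages belonging to a single arena. */
    static int
    purge_arena(unsigned ind)
    {
        return (JEMALLOC_P(mallctl)("arenas.purge", NULL, NULL, &ind,
            sizeof(ind)));
    }

    /* Purge every initialized arena by supplying no new value. */
    static int
    purge_all(void)
    {
        return (JEMALLOC_P(mallctl)("arenas.purge", NULL, NULL, NULL, 0));
    }

Writing an index that is not less than arenas.narenas fails with EFAULT rather than being silently ignored.
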
diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 73ebb00a..38085869 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -418,6 +418,10 @@ extern size_t sspace_max; #define nlclasses (chunk_npages - arena_chunk_header_npages) +void arena_purge_all(arena_t *arena); +#ifdef JEMALLOC_PROF +void arena_prof_accum(arena_t *arena, uint64_t accumbytes); +#endif #ifdef JEMALLOC_TCACHE void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind @@ -426,9 +430,6 @@ void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, # endif ); #endif -#ifdef JEMALLOC_PROF -void arena_prof_accum(arena_t *arena, uint64_t accumbytes); -#endif void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_malloc(size_t size, bool zero); diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 0c2f4a35..81518b63 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -165,7 +165,7 @@ static arena_chunk_t *arena_chunk_alloc(arena_t *arena); static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk); static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero); -static void arena_purge(arena_t *arena); +static void arena_purge(arena_t *arena, bool all); static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty); static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize); @@ -585,7 +585,7 @@ arena_maybe_purge(arena_t *arena) (arena->ndirty - arena->npurgatory) > chunk_npages && (arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty - arena->npurgatory)) - arena_purge(arena); + arena_purge(arena, false); } static inline void @@ -758,7 +758,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) } static void -arena_purge(arena_t *arena) +arena_purge(arena_t *arena, bool all) { arena_chunk_t *chunk; size_t npurgatory; @@ -772,8 +772,8 @@ arena_purge(arena_t *arena) assert(ndirty == arena->ndirty); #endif assert(arena->ndirty > arena->npurgatory); - assert(arena->ndirty > chunk_npages); - assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty); + assert(arena->ndirty > chunk_npages || all); + assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty || all); #ifdef JEMALLOC_STATS arena->stats.npurge++; @@ -784,8 +784,9 @@ arena_purge(arena_t *arena) * purge, and add the result to arena->npurgatory. This will keep * multiple threads from racing to reduce ndirty below the threshold. 
*/ - npurgatory = (arena->ndirty - arena->npurgatory) - (arena->nactive >> - opt_lg_dirty_mult); + npurgatory = arena->ndirty - arena->npurgatory; + if (all == false) + npurgatory -= arena->nactive >> opt_lg_dirty_mult; arena->npurgatory += npurgatory; while (npurgatory > 0) { @@ -841,6 +842,15 @@ arena_purge(arena_t *arena) } } +void +arena_purge_all(arena_t *arena) +{ + + malloc_mutex_lock(&arena->lock); + arena_purge(arena, true); + malloc_mutex_unlock(&arena->lock); +} + static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) { diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 64913067..e904fdff 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -126,6 +126,7 @@ CTL_PROTO(arenas_nbins) CTL_PROTO(arenas_nhbins) #endif CTL_PROTO(arenas_nlruns) +CTL_PROTO(arenas_purge) #ifdef JEMALLOC_PROF CTL_PROTO(prof_active) CTL_PROTO(prof_dump) @@ -326,7 +327,8 @@ static const ctl_node_t arenas_node[] = { #endif {NAME("bin"), CHILD(arenas_bin)}, {NAME("nlruns"), CTL(arenas_nlruns)}, - {NAME("lrun"), CHILD(arenas_lrun)} + {NAME("lrun"), CHILD(arenas_lrun)}, + {NAME("purge"), CTL(arenas_purge)} }; #ifdef JEMALLOC_PROF @@ -1293,6 +1295,43 @@ CTL_RO_GEN(arenas_nhbins, nhbins, unsigned) #endif CTL_RO_GEN(arenas_nlruns, nlclasses, size_t) +static int +arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + unsigned arena; + + WRITEONLY(); + arena = UINT_MAX; + WRITE(arena, unsigned); + if (newp != NULL && arena >= narenas) { + ret = EFAULT; + goto RETURN; + } else { + arena_t *tarenas[narenas]; + + malloc_mutex_lock(&arenas_lock); + memcpy(tarenas, arenas, sizeof(arena_t *) * narenas); + malloc_mutex_unlock(&arenas_lock); + + if (arena == UINT_MAX) { + for (unsigned i = 0; i < narenas; i++) { + if (tarenas[i] != NULL) + arena_purge_all(tarenas[i]); + } + } else { + assert(arena < narenas); + if (tarenas[arena] != NULL) + arena_purge_all(tarenas[arena]); + } + } + + ret = 0; +RETURN: + return (ret); +} + /******************************************************************************/ #ifdef JEMALLOC_PROF From 37dab02e524c14cffa4bb938242e4cf2695a4a7d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 30 Sep 2010 17:10:17 -0700 Subject: [PATCH 16/48] Disable interval-based profile dumps by default. It is common to have to specify something like JEMALLOC_OPTIONS=F31i, because interval-based dumps are often unuseful or too expensive. Therefore, disable interval-based dumps by default. To get the previous default behavior it is now necessary to specify 31I as part of the options. --- jemalloc/doc/jemalloc.3.in | 10 +++++++--- jemalloc/include/jemalloc/internal/prof.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index f32c4bff..2ac093a6 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -526,9 +526,13 @@ will disable dirty page purging. @roff_prof@is controlled by the @roff_prof@JEMALLOC_PROF_PREFIX @roff_prof@environment variable. -@roff_prof@The default average interval is 1 GiB; -@roff_prof@.Ev JEMALLOC_OPTIONS=31i -@roff_prof@will disable interval-triggered profile dumping. +@roff_prof@By default, interval-triggered profile dumping is disabled. +@roff_prof@This is internally encoded as (1 << -1), and each +@roff_prof@.Dq I +@roff_prof@that is specified increments the shift amount. +@roff_prof@Therefore, e.g. +@roff_prof@.Ev JEMALLOC_OPTIONS=31I +@roff_prof@specifies a dump interval of 1 GiB. 
@roff_fill@.It J @roff_fill@Each byte of new memory allocated by @roff_fill@.Fn @jemalloc_prefix@malloc diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index fb55fb90..2c195f39 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -11,7 +11,7 @@ typedef struct prof_s prof_t; /* Option defaults. */ #define LG_PROF_BT_MAX_DEFAULT 2 #define LG_PROF_SAMPLE_DEFAULT 0 -#define LG_PROF_INTERVAL_DEFAULT 30 +#define LG_PROF_INTERVAL_DEFAULT -1 /* * Hard limit on stack backtrace depth. Note that the version of From 7393f44ff025ca67716fc53b68003fd65122fd97 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 1 Oct 2010 17:35:43 -0700 Subject: [PATCH 17/48] Omit chunk header in arena chunk map. Omit the first map_bias elements of the map in arena_chunk_t. This avoids barely spilling over into an extra chunk header page for common chunk sizes. --- jemalloc/include/jemalloc/internal/arena.h | 13 +- jemalloc/include/jemalloc/internal/chunk.h | 2 +- .../jemalloc/internal/jemalloc_internal.h.in | 4 + jemalloc/include/jemalloc/internal/tcache.h | 11 +- jemalloc/src/arena.c | 309 ++++++++++-------- jemalloc/src/chunk.c | 2 +- jemalloc/src/tcache.c | 12 +- 7 files changed, 193 insertions(+), 160 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 38085869..c3b7a58f 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -187,7 +187,12 @@ struct arena_chunk_s { /* Number of dirty pages. */ size_t ndirty; - /* Map of pages within chunk that keeps track of free/large/small. */ + /* + * Map of pages within chunk that keeps track of free/large/small. The + * first map_bias entries are omitted, since the chunk header does not + * need to be tracked in the map. This omission saves a header page + * for common chunk sizes (e.g. 4 MiB). + */ arena_chunk_map_t map[1]; /* Dynamically sized. */ }; typedef rb_tree(arena_chunk_t) arena_chunk_tree_t; @@ -416,7 +421,7 @@ extern size_t sspace_min; extern size_t sspace_max; #define small_maxclass sspace_max -#define nlclasses (chunk_npages - arena_chunk_header_npages) +#define nlclasses (chunk_npages - map_bias) void arena_purge_all(arena_t *arena); #ifdef JEMALLOC_PROF @@ -478,8 +483,8 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0); if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) { /* Small allocation. */ diff --git a/jemalloc/include/jemalloc/internal/chunk.h b/jemalloc/include/jemalloc/internal/chunk.h index d7955298..a60f0ad7 100644 --- a/jemalloc/include/jemalloc/internal/chunk.h +++ b/jemalloc/include/jemalloc/internal/chunk.h @@ -46,7 +46,7 @@ extern rtree_t *chunks_rtree; extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; -extern size_t arena_chunk_header_npages; +extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t arena_maxclass; /* Max size class for arenas. 
*/ void *chunk_alloc(size_t size, bool base, bool *zero); diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 2320e3e5..6ad7b064 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -17,6 +17,10 @@ #include #include #include +#include +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif #include #include #include diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index e5061114..8c2aad6e 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -282,7 +282,8 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); size_t pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk) >> PAGE_SHIFT); - chunk->map[pageind].bits |= CHUNK_MAP_CLASS_MASK; + chunk->map[pageind-map_bias].bits |= + CHUNK_MAP_CLASS_MASK; #endif if (zero == false) { #ifdef JEMALLOC_FILL @@ -321,8 +322,8 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); @@ -369,8 +370,8 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; binind = nbins + (size >> PAGE_SHIFT) - 1; #ifdef JEMALLOC_FILL diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 81518b63..955d70cd 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -298,39 +298,40 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, old_ndirty = chunk->ndirty; run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - flag_dirty = chunk->map[run_ind].bits & CHUNK_MAP_DIRTY; + flag_dirty = chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY; runs_avail = (flag_dirty != 0) ? &arena->runs_avail_dirty : &arena->runs_avail_clean; - total_pages = (chunk->map[run_ind].bits & ~PAGE_MASK) >> + total_pages = (chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) >> PAGE_SHIFT; - assert((chunk->map[run_ind+total_pages-1].bits & CHUNK_MAP_DIRTY) == - flag_dirty); + assert((chunk->map[run_ind+total_pages-1-map_bias].bits & + CHUNK_MAP_DIRTY) == flag_dirty); need_pages = (size >> PAGE_SHIFT); assert(need_pages > 0); assert(need_pages <= total_pages); rem_pages = total_pages - need_pages; - arena_avail_tree_remove(runs_avail, &chunk->map[run_ind]); + arena_avail_tree_remove(runs_avail, &chunk->map[run_ind-map_bias]); arena->nactive += need_pages; /* Keep track of trailing unused pages for later use. 
*/ if (rem_pages > 0) { if (flag_dirty != 0) { - chunk->map[run_ind+need_pages].bits = (rem_pages << - PAGE_SHIFT) | CHUNK_MAP_DIRTY; - chunk->map[run_ind+total_pages-1].bits = (rem_pages << - PAGE_SHIFT) | CHUNK_MAP_DIRTY; + chunk->map[run_ind+need_pages-map_bias].bits = + (rem_pages << PAGE_SHIFT) | CHUNK_MAP_DIRTY; + chunk->map[run_ind+total_pages-1-map_bias].bits = + (rem_pages << PAGE_SHIFT) | CHUNK_MAP_DIRTY; } else { - chunk->map[run_ind+need_pages].bits = (rem_pages << - PAGE_SHIFT) | (chunk->map[run_ind+need_pages].bits & + chunk->map[run_ind+need_pages-map_bias].bits = + (rem_pages << PAGE_SHIFT) | + (chunk->map[run_ind+need_pages-map_bias].bits & CHUNK_MAP_ZEROED); - chunk->map[run_ind+total_pages-1].bits = (rem_pages << - PAGE_SHIFT) | - (chunk->map[run_ind+total_pages-1].bits & + chunk->map[run_ind+total_pages-1-map_bias].bits = + (rem_pages << PAGE_SHIFT) | + (chunk->map[run_ind+total_pages-1-map_bias].bits & CHUNK_MAP_ZEROED); } arena_avail_tree_insert(runs_avail, - &chunk->map[run_ind+need_pages]); + &chunk->map[run_ind+need_pages-map_bias]); } /* Update dirty page accounting. */ @@ -351,8 +352,8 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, * zeroed (i.e. never before touched). */ for (i = 0; i < need_pages; i++) { - if ((chunk->map[run_ind + i].bits & - CHUNK_MAP_ZEROED) == 0) { + if ((chunk->map[run_ind+i-map_bias].bits + & CHUNK_MAP_ZEROED) == 0) { memset((void *)((uintptr_t) chunk + ((run_ind + i) << PAGE_SHIFT)), 0, @@ -374,9 +375,9 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, * Set the last element first, in case the run only contains one * page (i.e. both statements set the same element). */ - chunk->map[run_ind+need_pages-1].bits = CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED | flag_dirty; - chunk->map[run_ind].bits = size | CHUNK_MAP_LARGE | + chunk->map[run_ind+need_pages-1-map_bias].bits = + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind-map_bias].bits = size | CHUNK_MAP_LARGE | #ifdef JEMALLOC_PROF CHUNK_MAP_CLASS_MASK | #endif @@ -388,13 +389,14 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, * arena_dalloc_bin_run() has the ability to conditionally trim * clean pages. */ - chunk->map[run_ind].bits = CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind-map_bias].bits = CHUNK_MAP_ALLOCATED | + flag_dirty; for (i = 1; i < need_pages - 1; i++) { - chunk->map[run_ind + i].bits = (i << PAGE_SHIFT) + chunk->map[run_ind+i-map_bias].bits = (i << PAGE_SHIFT) | CHUNK_MAP_ALLOCATED; } - chunk->map[run_ind + need_pages - 1].bits = ((need_pages - 1) << - PAGE_SHIFT) | CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind + need_pages-1-map_bias].bits = ((need_pages + - 1) << PAGE_SHIFT) | CHUNK_MAP_ALLOCATED | flag_dirty; } } @@ -411,13 +413,11 @@ arena_chunk_alloc(arena_t *arena) arena->spare = NULL; /* Insert the run into the appropriate runs_avail_* tree. */ - if ((chunk->map[arena_chunk_header_npages].bits & - CHUNK_MAP_DIRTY) == 0) + if ((chunk->map[0].bits & CHUNK_MAP_DIRTY) == 0) runs_avail = &arena->runs_avail_clean; else runs_avail = &arena->runs_avail_dirty; - arena_avail_tree_insert(runs_avail, - &chunk->map[arena_chunk_header_npages]); + arena_avail_tree_insert(runs_avail, &chunk->map[0]); } else { bool zero; size_t zeroed; @@ -448,16 +448,14 @@ arena_chunk_alloc(arena_t *arena) * chunk. */ zeroed = zero ? 
CHUNK_MAP_ZEROED : 0; - for (i = 0; i < arena_chunk_header_npages; i++) - chunk->map[i].bits = 0; - chunk->map[i].bits = arena_maxclass | zeroed; - for (i++; i < chunk_npages-1; i++) - chunk->map[i].bits = zeroed; - chunk->map[chunk_npages-1].bits = arena_maxclass | zeroed; + chunk->map[0].bits = arena_maxclass | zeroed; + for (i = map_bias+1; i < chunk_npages-1; i++) + chunk->map[i-map_bias].bits = zeroed; + chunk->map[i-map_bias].bits = arena_maxclass | zeroed; /* Insert the run into the runs_avail_clean tree. */ arena_avail_tree_insert(&arena->runs_avail_clean, - &chunk->map[arena_chunk_header_npages]); + &chunk->map[0]); } return (chunk); @@ -472,13 +470,11 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) * Remove run from the appropriate runs_avail_* tree, so that the arena * does not use it. */ - if ((chunk->map[arena_chunk_header_npages].bits & - CHUNK_MAP_DIRTY) == 0) + if ((chunk->map[0].bits & CHUNK_MAP_DIRTY) == 0) runs_avail = &arena->runs_avail_clean; else runs_avail = &arena->runs_avail_dirty; - arena_avail_tree_remove(runs_avail, - &chunk->map[arena_chunk_header_npages]); + arena_avail_tree_remove(runs_avail, &chunk->map[0]); if (arena->spare != NULL) { arena_chunk_t *spare = arena->spare; @@ -514,8 +510,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -525,8 +522,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -539,8 +537,8 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) */ chunk = arena_chunk_alloc(arena); if (chunk != NULL) { - run = (arena_run_t *)((uintptr_t)chunk + - (arena_chunk_header_npages << PAGE_SHIFT)); + run = (arena_run_t *)((uintptr_t)chunk + (map_bias << + PAGE_SHIFT)); arena_run_split(arena, run, size, large, zero); return (run); } @@ -553,8 +551,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -564,8 +563,9 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, bool zero) mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)run_chunk->map) - / sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / 
sizeof(arena_chunk_map_t)) + + map_bias; run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << PAGE_SHIFT)); @@ -633,14 +633,13 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) * run. */ if (chunk == arena->spare) { - assert((chunk->map[arena_chunk_header_npages].bits & - CHUNK_MAP_DIRTY) != 0); + assert((chunk->map[0].bits & CHUNK_MAP_DIRTY) != 0); arena_chunk_alloc(arena); } /* Temporarily allocate all free dirty runs within chunk. */ - for (pageind = arena_chunk_header_npages; pageind < chunk_npages;) { - mapelm = &chunk->map[pageind]; + for (pageind = map_bias; pageind < chunk_npages;) { + mapelm = &chunk->map[pageind-map_bias]; if ((mapelm->bits & CHUNK_MAP_ALLOCATED) == 0) { size_t npages; @@ -660,14 +659,15 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | flag_zeroed; for (i = 1; i < npages - 1; i++) { - chunk->map[pageind + i].bits = + chunk->map[pageind+i-map_bias].bits = flag_zeroed; } if (npages > 1) { - chunk->map[pageind + npages - 1].bits = - (npages << PAGE_SHIFT) | - CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | - flag_zeroed; + chunk->map[ + pageind+npages-1-map_bias].bits = + (npages << PAGE_SHIFT) | + CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED | flag_zeroed; } arena->nactive += npages; @@ -709,8 +709,8 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) nmadvise = 0; #endif ql_foreach(mapelm, &mapelms, u.ql_link) { - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t)) + map_bias; size_t npages = mapelm->bits >> PAGE_SHIFT; assert(pageind + npages <= chunk_npages); @@ -747,8 +747,8 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) /* Deallocate runs. */ for (mapelm = ql_first(&mapelms); mapelm != NULL; mapelm = ql_first(&mapelms)) { - size_t pageind = ((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t); + size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t)) + map_bias; arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << PAGE_SHIFT)); @@ -861,10 +861,10 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - assert(run_ind >= arena_chunk_header_npages); + assert(run_ind >= map_bias); assert(run_ind < chunk_npages); - if ((chunk->map[run_ind].bits & CHUNK_MAP_LARGE) != 0) - size = chunk->map[run_ind].bits & ~PAGE_MASK; + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_LARGE) != 0) + size = chunk->map[run_ind-map_bias].bits & ~PAGE_MASK; else size = run->bin->run_size; run_pages = (size >> PAGE_SHIFT); @@ -874,7 +874,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) * The run is dirty if the caller claims to have dirtied it, as well as * if it was already dirty before being allocated. */ - if ((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) != 0) + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) != 0) dirty = true; flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0; runs_avail = dirty ? &arena->runs_avail_dirty : @@ -882,24 +882,26 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) /* Mark pages as unallocated in the chunk map. 
*/ if (dirty) { - chunk->map[run_ind].bits = size | flag_dirty; - chunk->map[run_ind+run_pages-1].bits = size | flag_dirty; + chunk->map[run_ind-map_bias].bits = size | flag_dirty; + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + flag_dirty; chunk->ndirty += run_pages; arena->ndirty += run_pages; } else { - chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits & + chunk->map[run_ind-map_bias].bits = size | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_ZEROED); + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_ZEROED); - chunk->map[run_ind+run_pages-1].bits = size | - (chunk->map[run_ind+run_pages-1].bits & CHUNK_MAP_ZEROED); } /* Try to coalesce forward. */ if (run_ind + run_pages < chunk_npages && - (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_ALLOCATED) == 0 && - (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_DIRTY) == - flag_dirty) { - size_t nrun_size = chunk->map[run_ind+run_pages].bits & + (chunk->map[run_ind+run_pages-map_bias].bits & CHUNK_MAP_ALLOCATED) + == 0 && (chunk->map[run_ind+run_pages-map_bias].bits & + CHUNK_MAP_DIRTY) == flag_dirty) { + size_t nrun_size = chunk->map[run_ind+run_pages-map_bias].bits & ~PAGE_MASK; /* @@ -907,25 +909,26 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) * inserted later. */ arena_avail_tree_remove(runs_avail, - &chunk->map[run_ind+run_pages]); + &chunk->map[run_ind+run_pages-map_bias]); size += nrun_size; run_pages = size >> PAGE_SHIFT; - assert((chunk->map[run_ind+run_pages-1].bits & ~PAGE_MASK) - == nrun_size); - chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits & - CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind+run_pages-1].bits = size | - (chunk->map[run_ind+run_pages-1].bits & + assert((chunk->map[run_ind+run_pages-1-map_bias].bits & + ~PAGE_MASK) == nrun_size); + chunk->map[run_ind-map_bias].bits = size | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_FLAGS_MASK); } /* Try to coalesce backward. */ - if (run_ind > arena_chunk_header_npages && (chunk->map[run_ind-1].bits & - CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[run_ind-1].bits & + if (run_ind > map_bias && (chunk->map[run_ind-1-map_bias].bits & + CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[run_ind-1-map_bias].bits & CHUNK_MAP_DIRTY) == flag_dirty) { - size_t prun_size = chunk->map[run_ind-1].bits & ~PAGE_MASK; + size_t prun_size = chunk->map[run_ind-1-map_bias].bits & + ~PAGE_MASK; run_ind -= prun_size >> PAGE_SHIFT; @@ -933,21 +936,23 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) * Remove predecessor from runs_avail; the coalesced run is * inserted later. 
*/ - arena_avail_tree_remove(runs_avail, &chunk->map[run_ind]); + arena_avail_tree_remove(runs_avail, + &chunk->map[run_ind-map_bias]); size += prun_size; run_pages = size >> PAGE_SHIFT; - assert((chunk->map[run_ind].bits & ~PAGE_MASK) == prun_size); - chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits & - CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind+run_pages-1].bits = size | - (chunk->map[run_ind+run_pages-1].bits & + assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) == + prun_size); + chunk->map[run_ind-map_bias].bits = size | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind+run_pages-1-map_bias].bits = size | + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_FLAGS_MASK); } /* Insert into runs_avail, now that coalescing is complete. */ - arena_avail_tree_insert(runs_avail, &chunk->map[run_ind]); + arena_avail_tree_insert(runs_avail, &chunk->map[run_ind-map_bias]); if (dirty) { /* @@ -966,8 +971,8 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) * manipulation checks whether the first run is unallocated and extends * to the end of the chunk. */ - if ((chunk->map[arena_chunk_header_npages].bits & (~PAGE_MASK | - CHUNK_MAP_ALLOCATED)) == arena_maxclass) + if ((chunk->map[0].bits & (~PAGE_MASK | CHUNK_MAP_ALLOCATED)) == + arena_maxclass) arena_chunk_dealloc(arena, chunk); /* @@ -987,7 +992,7 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, { size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT; size_t head_npages = (oldsize - newsize) >> PAGE_SHIFT; - size_t flags = chunk->map[pageind].bits & CHUNK_MAP_FLAGS_MASK; + size_t flags = chunk->map[pageind-map_bias].bits & CHUNK_MAP_FLAGS_MASK; assert(oldsize > newsize); @@ -995,10 +1000,10 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * Update the chunk map so that arena_run_dalloc() can treat the * leading run as separately allocated. */ - assert(chunk->map[pageind].bits & CHUNK_MAP_LARGE); - assert(chunk->map[pageind].bits & CHUNK_MAP_ALLOCATED); - chunk->map[pageind].bits = (oldsize - newsize) | flags; - chunk->map[pageind+head_npages].bits = newsize | flags; + assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE); + assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED); + chunk->map[pageind-map_bias].bits = (oldsize - newsize) | flags; + chunk->map[pageind+head_npages-map_bias].bits = newsize | flags; arena_run_dalloc(arena, run, false); } @@ -1009,7 +1014,7 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, { size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT; size_t npages = newsize >> PAGE_SHIFT; - size_t flags = chunk->map[pageind].bits & CHUNK_MAP_FLAGS_MASK; + size_t flags = chunk->map[pageind-map_bias].bits & CHUNK_MAP_FLAGS_MASK; assert(oldsize > newsize); @@ -1017,11 +1022,11 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * Update the chunk map so that arena_run_dalloc() can treat the * trailing run as separately allocated. 
*/ - assert(chunk->map[pageind].bits & CHUNK_MAP_LARGE); - assert(chunk->map[pageind].bits & CHUNK_MAP_ALLOCATED); - chunk->map[pageind].bits = newsize | flags; - chunk->map[pageind+npages-1].bits = newsize | flags; - chunk->map[pageind+npages].bits = (oldsize - newsize) | flags; + assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE); + assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED); + chunk->map[pageind-map_bias].bits = newsize | flags; + chunk->map[pageind+npages-1-map_bias].bits = newsize | flags; + chunk->map[pageind+npages-map_bias].bits = (oldsize - newsize) | flags; arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize), dirty); @@ -1043,8 +1048,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) arena_run_tree_remove(&bin->runs, mapelm); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t)); + pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t))) + map_bias; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); @@ -1098,8 +1103,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) arena_run_tree_remove(&bin->runs, mapelm); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t)); + pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) / + sizeof(arena_chunk_map_t))) + map_bias; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); @@ -1530,8 +1535,8 @@ arena_salloc(const void *ptr) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) { arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + @@ -1563,10 +1568,10 @@ arena_prof_promoted(const void *ptr, size_t size) assert(isalloc(ptr) == PAGE_SIZE); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; binind = small_size2bin[size]; assert(binind < nbins); - chunk->map[pageind].bits = (chunk->map[pageind].bits & + chunk->map[pageind-map_bias].bits = (chunk->map[pageind-map_bias].bits & ~CHUNK_MAP_CLASS_MASK) | (binind << CHUNK_MAP_CLASS_SHIFT); } @@ -1581,8 +1586,8 @@ arena_salloc_demote(const void *ptr) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) { arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + @@ -1683,8 +1688,8 @@ arena_prof_ctx_get(const void *ptr) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if 
((mapbits & CHUNK_MAP_LARGE) == 0) { if (prof_promote) @@ -1703,7 +1708,7 @@ arena_prof_ctx_get(const void *ptr) sizeof(prof_ctx_t *))); } } else - ret = chunk->map[pageind].prof_ctx; + ret = chunk->map[pageind-map_bias].prof_ctx; return (ret); } @@ -1718,8 +1723,8 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapbits = chunk->map[pageind].bits; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) { if (prof_promote == false) { @@ -1737,7 +1742,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) } else assert((uintptr_t)ctx == (uintptr_t)1U); } else - chunk->map[pageind].prof_ctx = ctx; + chunk->map[pageind-map_bias].prof_ctx = ctx; } #endif @@ -1753,7 +1758,8 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, else if (bin->nregs != 1) { size_t run_pageind = (((uintptr_t)run - (uintptr_t)chunk)) >> PAGE_SHIFT; - arena_chunk_map_t *run_mapelm = &chunk->map[run_pageind]; + arena_chunk_map_t *run_mapelm = + &chunk->map[run_pageind-map_bias]; /* * This block's conditional is necessary because if the run * only contains one region, then it never gets inserted into @@ -1775,16 +1781,17 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * trim the clean pages before deallocating the dirty portion of the * run. */ - if ((chunk->map[run_ind].bits & CHUNK_MAP_DIRTY) == 0 && past - run_ind - < npages) { + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) == 0 && past + - run_ind < npages) { /* * Trim clean pages. Convert to large run beforehand. Set the * last map element first, in case this is a one-page run. */ - chunk->map[run_ind+npages-1].bits = CHUNK_MAP_LARGE | - (chunk->map[run_ind].bits & CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind].bits = bin->run_size | CHUNK_MAP_LARGE | - (chunk->map[run_ind].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind+npages-1-map_bias].bits = CHUNK_MAP_LARGE | + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); + chunk->map[run_ind-map_bias].bits = bin->run_size | + CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits & + CHUNK_MAP_FLAGS_MASK); arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT), ((npages - (past - run_ind)) << PAGE_SHIFT), false); npages = past - run_ind; @@ -1812,7 +1819,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t size; #endif - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); @@ -1846,7 +1853,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, (((uintptr_t)bin->runcur - (uintptr_t)runcur_chunk)) >> PAGE_SHIFT; arena_chunk_map_t *runcur_mapelm = - &runcur_chunk->map[runcur_pageind]; + &runcur_chunk->map[runcur_pageind-map_bias]; /* Insert runcur. 
*/ arena_run_tree_insert(&bin->runs, @@ -1857,7 +1864,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t run_pageind = (((uintptr_t)run - (uintptr_t)chunk)) >> PAGE_SHIFT; arena_chunk_map_t *run_mapelm = - &chunk->map[run_pageind]; + &chunk->map[run_pageind-map_bias]; assert(arena_run_tree_search(&bin->runs, run_mapelm) == NULL); @@ -1936,7 +1943,7 @@ arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) #if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - size_t size = chunk->map[pageind].bits & ~PAGE_MASK; + size_t size = chunk->map[pageind-map_bias].bits & ~PAGE_MASK; #endif #ifdef JEMALLOC_FILL @@ -1999,14 +2006,16 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t npages = oldsize >> PAGE_SHIFT; size_t followsize; - assert(oldsize == (chunk->map[pageind].bits & ~PAGE_MASK)); + assert(oldsize == (chunk->map[pageind-map_bias].bits & ~PAGE_MASK)); /* Try to extend the run. */ assert(size + extra > oldsize); malloc_mutex_lock(&arena->lock); - if (pageind + npages < chunk_npages && (chunk->map[pageind+npages].bits + if (pageind + npages < chunk_npages && + (chunk->map[pageind+npages-map_bias].bits & CHUNK_MAP_ALLOCATED) == 0 && (followsize = - chunk->map[pageind+npages].bits & ~PAGE_MASK) >= size - oldsize) { + chunk->map[pageind+npages-map_bias].bits & ~PAGE_MASK) >= size - + oldsize) { /* * The next run is available and sufficiently large. Split the * following run, then merge the first part with the existing @@ -2017,9 +2026,9 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk + ((pageind+npages) << PAGE_SHIFT)), splitsize, true, zero); - chunk->map[pageind].bits = size | CHUNK_MAP_LARGE | + chunk->map[pageind-map_bias].bits = size | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; - chunk->map[pageind+npages].bits = CHUNK_MAP_LARGE | + chunk->map[pageind+npages-map_bias].bits = CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; #ifdef JEMALLOC_STATS @@ -2433,6 +2442,7 @@ bool arena_boot(void) { size_t header_size; + unsigned i; /* Set variables according to the value of opt_lg_[qc]space_max. */ qspace_max = (1U << opt_lg_qspace_max); @@ -2491,13 +2501,26 @@ arena_boot(void) /* * Compute the header size such that it is large enough to contain the - * page map. + * page map. The page map is biased to omit entries for the header + * itself, so some iteration is necessary to compute the map bias. + * + * 1) Compute safe header_size and map_bias values that include enough + * space for an unbiased page map. + * 2) Refine map_bias based on (1) to omit the header pages in the page + * map. The resulting map_bias may be one too small. + * 3) Refine map_bias based on (2). The result will be >= the result + * from (2), and will always be correct. 
*/ - header_size = sizeof(arena_chunk_t) + - (sizeof(arena_chunk_map_t) * (chunk_npages - 1)); - arena_chunk_header_npages = (header_size >> PAGE_SHIFT) + - ((header_size & PAGE_MASK) != 0); - arena_maxclass = chunksize - (arena_chunk_header_npages << PAGE_SHIFT); + map_bias = 0; + for (i = 0; i < 3; i++) { + header_size = offsetof(arena_chunk_t, map) + + (sizeof(arena_chunk_map_t) * (chunk_npages-map_bias)); + map_bias = (header_size >> PAGE_SHIFT) + ((header_size & + PAGE_MASK) != 0); + } + assert(map_bias > 0); + + arena_maxclass = chunksize - (map_bias << PAGE_SHIFT); return (false); } diff --git a/jemalloc/src/chunk.c b/jemalloc/src/chunk.c index 5cb99615..0be24fbd 100644 --- a/jemalloc/src/chunk.c +++ b/jemalloc/src/chunk.c @@ -22,7 +22,7 @@ rtree_t *chunks_rtree; size_t chunksize; size_t chunksize_mask; /* (chunksize - 1). */ size_t chunk_npages; -size_t arena_chunk_header_npages; +size_t map_bias; size_t arena_maxclass; /* Max size class for arenas. */ /******************************************************************************/ diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 550b9c42..ef69b1ae 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -95,10 +95,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { - size_t pageind = (((uintptr_t)ptr - - (uintptr_t)chunk) >> PAGE_SHIFT); + size_t pageind = ((uintptr_t)ptr - + (uintptr_t)chunk) >> PAGE_SHIFT; arena_chunk_map_t *mapelm = - &chunk->map[pageind]; + &chunk->map[pageind-map_bias]; arena_dalloc_bin(arena, chunk, ptr, mapelm); } else { /* @@ -311,9 +311,9 @@ tcache_destroy(tcache_t *tcache) if (arena_salloc(tcache) <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; - size_t pageind = (((uintptr_t)tcache - (uintptr_t)chunk) >> - PAGE_SHIFT); - arena_chunk_map_t *mapelm = &chunk->map[pageind]; + size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> + PAGE_SHIFT; + arena_chunk_map_t *mapelm = &chunk->map[pageind-map_bias]; arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); From 3377ffa1f4f8e67bce1e36624285e5baf5f9ecef Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 1 Oct 2010 17:53:37 -0700 Subject: [PATCH 18/48] Change CHUNK_MAP_ZEROED to CHUNK_MAP_UNZEROED. Invert the chunk map bit that tracks whether a page is zeroed, so that for zeroed arena chunks, the interior of the page map does not need to be initialized (as it consists entirely of zero bytes). 
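For illustration, the following standalone C sketch shows why the inverted bit sense lets a zeroed chunk skip initialization of the interior map entries. FLAG_UNZEROED, NPAGES, and MAP_BIAS are simplified stand-ins for CHUNK_MAP_UNZEROED, chunk_npages, and map_bias; the sizes are arbitrary, and this is a sketch of the idea rather than the jemalloc code itself.

#include <assert.h>
#include <stdlib.h>

/* Simplified stand-ins for CHUNK_MAP_UNZEROED, chunk_npages, and map_bias. */
#define FLAG_UNZEROED ((size_t)0x4U)
#define NPAGES        ((size_t)1024)  /* pages per chunk (illustrative) */
#define MAP_BIAS      ((size_t)2)     /* header pages omitted from the map */

typedef struct { size_t bits; } map_entry_t;

/*
 * With the old CHUNK_MAP_ZEROED sense, a freshly zeroed chunk still required
 * storing a nonzero flag in every interior map entry.  With the inverted
 * sense, an all-zero map already reads as "zeroed", so only the first and
 * last entries of the run need to be written when the chunk is zeroed.
 */
static void
map_init(map_entry_t *map, int chunk_is_zeroed, size_t maxclass)
{
    size_t unzeroed = chunk_is_zeroed ? 0 : FLAG_UNZEROED;
    size_t i;

    map[0].bits = maxclass | unzeroed;
    if (chunk_is_zeroed == 0) {
        for (i = MAP_BIAS + 1; i < NPAGES - 1; i++)
            map[i - MAP_BIAS].bits = unzeroed;
    }
    map[NPAGES - 1 - MAP_BIAS].bits = maxclass | unzeroed;
}

int
main(void)
{
    /* calloc() mirrors the zero-filled map pages of a zeroed chunk. */
    map_entry_t *map = calloc(NPAGES - MAP_BIAS, sizeof(*map));

    if (map == NULL)
        return (1);
    map_init(map, 1, (size_t)4 << 20);
    /* Interior entries were never touched, yet already read as zeroed. */
    assert((map[1].bits & FLAG_UNZEROED) == 0);
    free(map);
    return (0);
}

Because freshly mapped chunk memory starts out zero-filled, choosing the bit sense so that the zero value means "zeroed" turns the common-case interior initialization into a no-op.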
--- jemalloc/include/jemalloc/internal/arena.h | 12 +++--- jemalloc/src/arena.c | 46 ++++++++++++---------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index c3b7a58f..893f0706 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -125,13 +125,13 @@ struct arena_chunk_map_s { * x : don't care * - : 0 * + : 1 - * [DZLA] : bit set - * [dzla] : bit unset + * [DULA] : bit set + * [dula] : bit unset * * Unallocated (clean): - * ssssssss ssssssss ssss---- ----dz-- - * xxxxxxxx xxxxxxxx xxxx---- -----Zxx - * ssssssss ssssssss ssss---- ----dZ-- + * ssssssss ssssssss ssss---- ----du-- + * xxxxxxxx xxxxxxxx xxxx---- -----Uxx + * ssssssss ssssssss ssss---- ----dU-- * * Unallocated (dirty): * ssssssss ssssssss ssss---- ----D--- @@ -161,7 +161,7 @@ struct arena_chunk_map_s { #endif #define CHUNK_MAP_FLAGS_MASK ((size_t)0xfU) #define CHUNK_MAP_DIRTY ((size_t)0x8U) -#define CHUNK_MAP_ZEROED ((size_t)0x4U) +#define CHUNK_MAP_UNZEROED ((size_t)0x4U) #define CHUNK_MAP_LARGE ((size_t)0x2U) #define CHUNK_MAP_ALLOCATED ((size_t)0x1U) #define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 955d70cd..45e5fd19 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -324,11 +324,11 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, chunk->map[run_ind+need_pages-map_bias].bits = (rem_pages << PAGE_SHIFT) | (chunk->map[run_ind+need_pages-map_bias].bits & - CHUNK_MAP_ZEROED); + CHUNK_MAP_UNZEROED); chunk->map[run_ind+total_pages-1-map_bias].bits = (rem_pages << PAGE_SHIFT) | (chunk->map[run_ind+total_pages-1-map_bias].bits & - CHUNK_MAP_ZEROED); + CHUNK_MAP_UNZEROED); } arena_avail_tree_insert(runs_avail, &chunk->map[run_ind+need_pages-map_bias]); @@ -353,7 +353,7 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, */ for (i = 0; i < need_pages; i++) { if ((chunk->map[run_ind+i-map_bias].bits - & CHUNK_MAP_ZEROED) == 0) { + & CHUNK_MAP_UNZEROED) != 0) { memset((void *)((uintptr_t) chunk + ((run_ind + i) << PAGE_SHIFT)), 0, @@ -420,7 +420,7 @@ arena_chunk_alloc(arena_t *arena) arena_avail_tree_insert(runs_avail, &chunk->map[0]); } else { bool zero; - size_t zeroed; + size_t unzeroed; zero = false; malloc_mutex_unlock(&arena->lock); @@ -447,11 +447,17 @@ arena_chunk_alloc(arena_t *arena) * Mark the pages as zeroed iff chunk_alloc() returned a zeroed * chunk. */ - zeroed = zero ? CHUNK_MAP_ZEROED : 0; - chunk->map[0].bits = arena_maxclass | zeroed; - for (i = map_bias+1; i < chunk_npages-1; i++) - chunk->map[i-map_bias].bits = zeroed; - chunk->map[i-map_bias].bits = arena_maxclass | zeroed; + unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED; + chunk->map[0].bits = arena_maxclass | unzeroed; + /* + * There is no need to initialize the internal page map entries + * unless the chunk is not zeroed. + */ + if (zero == false) { + for (i = map_bias+1; i < chunk_npages-1; i++) + chunk->map[i-map_bias].bits = unzeroed; + } + chunk->map[chunk_npages-1].bits = arena_maxclass | unzeroed; /* Insert the run into the runs_avail_clean tree. 
*/ arena_avail_tree_insert(&arena->runs_avail_clean, @@ -593,7 +599,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) { ql_head(arena_chunk_map_t) mapelms; arena_chunk_map_t *mapelm; - size_t pageind, flag_zeroed; + size_t pageind, flag_unzeroed; #ifdef JEMALLOC_DEBUG size_t ndirty; #endif @@ -603,18 +609,18 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) ql_new(&mapelms); - flag_zeroed = + flag_unzeroed = #ifdef JEMALLOC_PURGE_MADVISE_DONTNEED /* * madvise(..., MADV_DONTNEED) results in zero-filled pages for anonymous * mappings, but not for file-backed mappings. */ # ifdef JEMALLOC_SWAP - swap_enabled ? 0 : + swap_enabled ? CHUNK_MAP_UNZEROED : # endif - CHUNK_MAP_ZEROED; -#else 0; +#else + CHUNK_MAP_UNZEROED; #endif /* @@ -653,21 +659,21 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) /* * Update internal elements in the page map, so - * that CHUNK_MAP_ZEROED is properly set. + * that CHUNK_MAP_UNZEROED is properly set. */ mapelm->bits = (npages << PAGE_SHIFT) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | - flag_zeroed; + flag_unzeroed; for (i = 1; i < npages - 1; i++) { chunk->map[pageind+i-map_bias].bits = - flag_zeroed; + flag_unzeroed; } if (npages > 1) { chunk->map[ pageind+npages-1-map_bias].bits = (npages << PAGE_SHIFT) | CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED | flag_zeroed; + CHUNK_MAP_ALLOCATED | flag_unzeroed; } arena->nactive += npages; @@ -890,10 +896,10 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) arena->ndirty += run_pages; } else { chunk->map[run_ind-map_bias].bits = size | - (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_ZEROED); + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED); chunk->map[run_ind+run_pages-1-map_bias].bits = size | (chunk->map[run_ind+run_pages-1-map_bias].bits & - CHUNK_MAP_ZEROED); + CHUNK_MAP_UNZEROED); } /* Try to coalesce forward. */ From c2fc8c8b3afbd15ec3e8ed4ca38667ec0a01ade8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 1 Oct 2010 18:02:43 -0700 Subject: [PATCH 19/48] Use offsetof() when sizing dynamic structures. Base dynamic structure size on offsetof(), rather than subtracting the size of the dynamic structure member. Results could differ on systems with strict data structure alignment requirements. --- jemalloc/src/jemalloc.c | 4 ++-- jemalloc/src/rtree.c | 7 ++++--- jemalloc/src/tcache.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index f252f591..a79752eb 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -101,8 +101,8 @@ arenas_extend(unsigned ind) arena_t *ret; /* Allocate enough space for trailing bins. 
*/ - ret = (arena_t *)base_alloc(sizeof(arena_t) - + (sizeof(arena_bin_t) * (nbins - 1))); + ret = (arena_t *)base_alloc(offsetof(arena_t, bins) + + (sizeof(arena_bin_t) * nbins)); if (ret != NULL && arena_new(ret, ind) == false) { arenas[ind] = ret; return (ret); diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c index a583751d..7753743c 100644 --- a/jemalloc/src/rtree.c +++ b/jemalloc/src/rtree.c @@ -13,11 +13,12 @@ rtree_new(unsigned bits) height++; assert(height * bits_per_level >= bits); - ret = (rtree_t*)base_alloc(sizeof(rtree_t) + (sizeof(unsigned) * - (height - 1))); + ret = (rtree_t*)base_alloc(offsetof(rtree_t, level2bits) + + (sizeof(unsigned) * height)); if (ret == NULL) return (NULL); - memset(ret, 0, sizeof(rtree_t) + (sizeof(unsigned) * (height - 1))); + memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) * + height)); malloc_mutex_init(&ret->mutex); ret->height = height; diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index ef69b1ae..3fb8f2bd 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -204,7 +204,7 @@ tcache_create(arena_t *arena) size_t size; unsigned i; - size = sizeof(tcache_t) + (sizeof(tcache_bin_t) * (nhbins - 1)); + size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. From d65cdfe23310253f065bea02ba8e0016dc9b6aee Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 11:31:36 -0700 Subject: [PATCH 20/48] Update pprof from google-perftools 1.6. Import updated pprof from google-perftools 1.6, with a patch applied to fix a division by zero error (see http://code.google.com/p/google-perftools/issues/detail?id=235). --- jemalloc/bin/pprof | 786 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 607 insertions(+), 179 deletions(-) diff --git a/jemalloc/bin/pprof b/jemalloc/bin/pprof index 57c0600e..1655f07c 100755 --- a/jemalloc/bin/pprof +++ b/jemalloc/bin/pprof @@ -92,9 +92,7 @@ my $GV = "gv"; my $KCACHEGRIND = "kcachegrind"; my $PS2PDF = "ps2pdf"; # These are used for dynamic profiles -my $WGET = "wget"; -my $WGET_FLAGS = "--no-http-keep-alive"; # only supported by some wgets -my $CURL = "curl"; +my $URL_FETCHER = "curl -s"; # These are the web pages that servers need to support for dynamic profiles my $HEAP_PAGE = "/pprof/heap"; @@ -108,6 +106,12 @@ my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . + "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . + "$FILTEREDPROFILE_PAGE)"; + # default binary name my $UNKNOWN_BINARY = "(unknown)"; @@ -176,12 +180,14 @@ Output type: --text Generate text report --callgrind Generate callgrind format to stdout --gv Generate Postscript and display + --web Generate SVG and display --list= Generate source listing of matching routines --disasm= Generate disassembly of matching routines --symbols Print demangled symbol names found at given addresses --dot Generate DOT file to stdout --ps Generate Postcript to stdout --pdf Generate PDF to stdout + --svg Generate SVG to stdout --gif Generate GIF to stdout --raw Generate symbolized pprof data (useful with remote fetch) @@ -209,7 +215,7 @@ Call-graph Options: (i.e. 
direct leak generators) more visible Miscellaneous: - --tools= Prefix for object tool pathnames + --tools=[,...] \$PATH for object tool pathnames --test Run unit tests --help This message --version Version information @@ -224,6 +230,8 @@ pprof /bin/ls ls.prof Enters "interactive" mode pprof --text /bin/ls ls.prof Outputs one line per procedure +pprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser pprof --gv /bin/ls ls.prof Displays annotated call-graph via 'gv' pprof --gv --focus=Mutex /bin/ls ls.prof @@ -234,6 +242,9 @@ pprof --list=getdir /bin/ls ls.prof (Per-line) annotated source listing for getdir() pprof --disasm=getdir /bin/ls ls.prof (Per-PC) annotated disassembly for getdir() + +pprof http://localhost:1234/ + Enters "interactive" mode pprof --text localhost:1234 Outputs one line per procedure for localhost:1234 pprof --raw localhost:1234 > ./local.raw @@ -293,10 +304,12 @@ sub Init() { $main::opt_disasm = ""; $main::opt_symbols = 0; $main::opt_gv = 0; + $main::opt_web = 0; $main::opt_dot = 0; $main::opt_ps = 0; $main::opt_pdf = 0; $main::opt_gif = 0; + $main::opt_svg = 0; $main::opt_raw = 0; $main::opt_nodecount = 80; @@ -331,6 +344,9 @@ sub Init() { # Are we using $SYMBOL_PAGE? $main::use_symbol_page = 0; + # Files returned by TempName. + %main::tempnames = (); + # Type of profile we are dealing with # Supported types: # cpu @@ -356,9 +372,11 @@ sub Init() { "disasm=s" => \$main::opt_disasm, "symbols!" => \$main::opt_symbols, "gv!" => \$main::opt_gv, + "web!" => \$main::opt_web, "dot!" => \$main::opt_dot, "ps!" => \$main::opt_ps, "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, "gif!" => \$main::opt_gif, "raw!" => \$main::opt_raw, "interactive!" => \$main::opt_interactive, @@ -434,9 +452,11 @@ sub Init() { ($main::opt_disasm eq '' ? 0 : 1) + ($main::opt_symbols == 0 ? 0 : 1) + $main::opt_gv + + $main::opt_web + $main::opt_dot + $main::opt_ps + $main::opt_pdf + + $main::opt_svg + $main::opt_gif + $main::opt_raw + $main::opt_interactive + @@ -511,20 +531,6 @@ sub Init() { ConfigureObjTools($main::prog) } - # Check what flags our commandline utilities support - if (open(TFILE, "$WGET $WGET_FLAGS -V 2>&1 |")) { - my @lines = ; - if (grep(/unrecognized/, @lines) > 0) { - # grep found 'unrecognized' token from WGET, clear WGET flags - $WGET_FLAGS = ""; - } - close(TFILE); - } - # TODO(csilvers): check all the other binaries and objtools to see - # if they are installed and what flags they support, and store that - # in a data structure here, rather than scattering these tests about. - # Then, ideally, rewrite code to use wget OR curl OR GET or ... - # Break the opt_list_prefix into the prefix_list array @prefix_list = split (',', $main::opt_lib_prefix); @@ -588,6 +594,10 @@ sub Main() { } elsif ($main::use_symbol_page) { $symbols = FetchSymbols($pcs); } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. 
$symbols = ExtractSymbols($libs, $pcs); } @@ -635,9 +645,24 @@ sub Main() { } else { if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { - RunGV(PsTempName($main::next_tmpfile), ""); + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. + delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } } } else { + cleanup(); exit(1); } } @@ -683,6 +708,34 @@ sub RunGV { } } +sub RunWeb { + my $fname = shift; + print STDERR "Loading web page file:///$fname\n"; + + if (`uname` =~ /Darwin/) { + # OS X: open will use standard preference for SVG files. + system("/usr/bin/open", $fname); + return; + } + + # Some kind of Unix; try generic symlinks, then specific browsers. + # (Stop once we find one.) + # Works best if the browser is already running. + my @alt = ( + "/etc/alternatives/gnome-www-browser", + "/etc/alternatives/x-www-browser", + "google-chrome", + "firefox", + ); + foreach my $b (@alt) { + if (system($b, $fname) == 0) { + return; + } + } + + print STDERR "Could not load web browser.\n"; +} + sub RunKcachegrind { my $fname = shift; my $bg = shift; # "" or " &" if we should run in background @@ -739,10 +792,10 @@ sub InteractiveCommand { print STDERR "\n"; return 0; } - if (m/^ *quit/) { + if (m/^\s*quit/) { return 0; } - if (m/^ *help/) { + if (m/^\s*help/) { InteractiveHelpMessage(); return 1; } @@ -754,7 +807,7 @@ sub InteractiveCommand { $main::opt_gv = 0; $main::opt_cum = 0; - if (m/^ *(text|top)(\d*) *(.*)/) { + if (m/^\s*(text|top)(\d*)\s*(.*)/) { $main::opt_text = 1; my $line_limit = ($2 ne "") ? 
int($2) : 10; @@ -773,14 +826,14 @@ sub InteractiveCommand { PrintText($symbols, $flat, $cumulative, $total, $line_limit); return 1; } - if (m/^ *callgrind *([^ \n]*)/) { + if (m/^\s*callgrind\s*([^ \n]*)/) { $main::opt_callgrind = 1; # Get derived profiles my $calls = ExtractCalls($symbols, $orig_profile); my $filename = $1; if ( $1 eq '' ) { - $filename = CallgrindTempName($main::next_tmpfile); + $filename = TempName($main::next_tmpfile, "callgrind"); } PrintCallgrind($calls, $filename); if ( $1 eq '' ) { @@ -790,7 +843,7 @@ sub InteractiveCommand { return 1; } - if (m/^ *list *(.+)/) { + if (m/^\s*list\s*(.+)/) { $main::opt_list = 1; my $routine; @@ -807,7 +860,7 @@ sub InteractiveCommand { PrintListing($libs, $flat, $cumulative, $routine); return 1; } - if (m/^ *disasm *(.+)/) { + if (m/^\s*disasm\s*(.+)/) { $main::opt_disasm = 1; my $routine; @@ -825,12 +878,18 @@ sub InteractiveCommand { PrintDisassembly($libs, $flat, $cumulative, $routine, $total); return 1; } - if (m/^ *gv *(.*)/) { - $main::opt_gv = 1; + if (m/^\s*(gv|web)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } my $focus; my $ignore; - ($focus, $ignore) = ParseInteractiveArgs($1); + ($focus, $ignore) = ParseInteractiveArgs($2); # Process current profile to account for various settings my $profile = ProcessProfile($orig_profile, $symbols, $focus, $ignore); @@ -841,11 +900,19 @@ sub InteractiveCommand { my $cumulative = CumulativeProfile($reduced); if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { - RunGV(PsTempName($main::next_tmpfile), " &"); + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } $main::next_tmpfile++; } return 1; } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; return 1; } @@ -894,6 +961,14 @@ Commands: the "focus" regular expression matches a routine name on the stack trace. + web + web [focus] [-ignore1] [-ignore2] + Like GV, but displays profile in your web browser instead of using + Ghostview. Works best if your web browser is already running. + To change the browser that gets used: + On Linux, set the /etc/alternatives/gnome-www-browser symlink. + On OS X, change the Finder association for SVG files. + list [routine_regexp] [-ignore1] [-ignore2] Show source listing of routines whose names match "routine_regexp" @@ -950,14 +1025,12 @@ sub ParseInteractiveArgs { ##### Output code ##### -sub PsTempName { +sub TempName { my $fnum = shift; - return "$main::tmpfile_ps" . "." . "$fnum" . ".ps"; -} - -sub CallgrindTempName { - my $fnum = shift; - return "$main::tmpfile_ps" . "." . "$fnum" . ".callgrind"; + my $ext = shift; + my $file = "$main::tmpfile_ps.$fnum.$ext"; + $main::tempnames{$file} = 1; + return $file; } # Print profile data in packed binary format (64-bit) to standard out @@ -1599,7 +1672,6 @@ sub PrintDot { } if ($last < 0) { print STDERR "No nodes to print\n"; - cleanup(); return 0; } @@ -1612,11 +1684,14 @@ sub PrintDot { # Open DOT output file my $output; if ($main::opt_gv) { - $output = "| $DOT -Tps2 >" . PsTempName($main::next_tmpfile); + $output = "| $DOT -Tps2 >" . 
TempName($main::next_tmpfile, "ps"); } elsif ($main::opt_ps) { $output = "| $DOT -Tps2"; } elsif ($main::opt_pdf) { $output = "| $DOT -Tps2 | $PS2PDF - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. + $output = "| $DOT -Tsvg >" . TempName($main::next_tmpfile, "svg"); } elsif ($main::opt_gif) { $output = "| $DOT -Tgif"; } else { @@ -1727,7 +1802,10 @@ sub PrintDot { my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); if ($fraction > 1) { $fraction = 1; } my $w = $fraction * 2; - #if ($w < 1) { $w = 1; } + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. + $w = 1; + } # Dot sometimes segfaults if given edge weights that are too large, so # we cap the weights at a large value @@ -1751,11 +1829,312 @@ sub PrintDot { } print DOT ("}\n"); - close(DOT); + + if ($main::opt_web || $main::opt_svg) { + # Rewrite SVG to be more usable inside web browser. + RewriteSvg(TempName($main::next_tmpfile, "svg")); + } + return 1; } +sub RewriteSvg { + my $svgfile = shift; + + open(SVG, $svgfile) || die "open temp svg: $!"; + my @svg = ; + close(SVG); + unlink $svgfile; + my $svg = join('', @svg); + + # Dot's SVG output is + # + # + # + # ... + # + # + # + # Change it to + # + # + # $svg_javascript + # + # + # ... + # + # + # + + # Fix width, height; drop viewBox. + $svg =~ s/(?s) above first + my $svg_javascript = SvgJavascript(); + my $viewport = "\n"; + $svg =~ s/ above . + $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; + $svg =~ s/$svgfile") || die "open $svgfile: $!"; + print SVG $svg; + close(SVG); + } +} + +sub SvgJavascript { + return <<'EOF'; + +EOF +} + # Translate a stack of addresses into a stack of symbols sub TranslateStack { my $symbols = shift; @@ -2310,28 +2689,11 @@ sub AddEntries { AddEntry($profile, (join "\n", @k), $count); } -sub IsSymbolizedProfileFile { - my $file_name = shift; - - if (!(-e $file_name) || !(-r $file_name)) { - return 0; - } - - $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $symbol_marker = $&; - # Check if the file contains a symbol-section marker. - open(TFILE, "<$file_name"); - my @lines = ; - my $result = grep(/^--- *$symbol_marker/, @lines); - close(TFILE); - return $result > 0; -} - ##### Code to profile a server dynamically ##### sub CheckSymbolPage { my $url = SymbolPageURL(); - open(SYMBOL, "$WGET $WGET_FLAGS -qO- '$url' |"); + open(SYMBOL, "$URL_FETCHER '$url' |"); my $line = ; $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines close(SYMBOL); @@ -2350,33 +2712,45 @@ sub CheckSymbolPage { sub IsProfileURL { my $profile_name = shift; - my ($host, $port, $path) = ParseProfileURL($profile_name); - return defined($host) and defined($port) and defined($path); + if (-f $profile_name) { + printf STDERR "Using local file $profile_name.\n"; + return 0; + } + return 1; } sub ParseProfileURL { my $profile_name = shift; - if (defined($profile_name) && - $profile_name =~ m,^(http://|)([^/:]+):(\d+)(|\@\d+)(|/|.*($PROFILE_PAGE|$PMUPROFILE_PAGE|$HEAP_PAGE|$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|$FILTEREDPROFILE_PAGE))$,o) { - # $6 is $PROFILE_PAGE/$HEAP_PAGE/etc. $5 is *everything* after - # the hostname, as long as that everything is the empty string, - # a slash, or something ending in $PROFILE_PAGE/$HEAP_PAGE/etc. - # So "$6 || $5" is $PROFILE_PAGE/etc if there, or else it's "/" or "". 
- return ($2, $3, $6 || $5); + + if (!defined($profile_name) || $profile_name eq "") { + return (); } - return (); + + # Split profile URL - matches all non-empty strings, so no test. + $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,; + + my $proto = $1 || "http://"; + my $hostport = $2; + my $prefix = $3; + my $profile = $4 || "/"; + + my $host = $hostport; + $host =~ s/:.*//; + + my $baseurl = "$proto$hostport$prefix"; + return ($host, $baseurl, $profile); } # We fetch symbols from the first profile argument. sub SymbolPageURL { - my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]); - return "http://$host:$port$SYMBOL_PAGE"; + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + return "$baseURL$SYMBOL_PAGE"; } sub FetchProgramName() { - my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]); - my $url = "http://$host:$port$PROGRAM_NAME_PAGE"; - my $command_line = "$WGET $WGET_FLAGS -qO- '$url'"; + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + my $url = "$baseURL$PROGRAM_NAME_PAGE"; + my $command_line = "$URL_FETCHER '$url'"; open(CMDLINE, "$command_line |") or error($command_line); my $cmdline = ; $cmdline =~ s/\r//g; # turn windows-looking lines into unix-looking lines @@ -2393,7 +2767,7 @@ sub FetchProgramName() { # curl. Redirection happens on borg hosts. sub ResolveRedirectionForCurl { my $url = shift; - my $command_line = "$CURL -s --head '$url'"; + my $command_line = "$URL_FETCHER --head '$url'"; open(CMDLINE, "$command_line |") or error($command_line); while () { s/\r//g; # turn windows-looking lines into unix-looking lines @@ -2405,6 +2779,20 @@ sub ResolveRedirectionForCurl { return $url; } +# Add a timeout flat to URL_FETCHER +sub AddFetchTimeout { + my $fetcher = shift; + my $timeout = shift; + if (defined($timeout)) { + if ($fetcher =~ m/\bcurl -s/) { + $fetcher .= sprintf(" --max-time %d", $timeout); + } elsif ($fetcher =~ m/\brpcget\b/) { + $fetcher .= sprintf(" --deadline=%d", $timeout); + } + } + return $fetcher; +} + # Reads a symbol map from the file handle name given as $1, returning # the resulting symbol map. Also processes variables relating to symbols. # Currently, the only variable processed is 'binary=' which updates @@ -2460,10 +2848,14 @@ sub FetchSymbols { close(POSTFILE); my $url = SymbolPageURL(); - # Here we use curl for sending data via POST since old - # wget doesn't have --post-file option. - $url = ResolveRedirectionForCurl($url); - my $command_line = "$CURL -sd '\@$main::tmpfile_sym' '$url'"; + + my $command_line; + if ($URL_FETCHER =~ m/\bcurl -s/) { + $url = ResolveRedirectionForCurl($url); + $command_line = "$URL_FETCHER -d '\@$main::tmpfile_sym' '$url'"; + } else { + $command_line = "$URL_FETCHER --post '$url' < '$main::tmpfile_sym'"; + } # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. 
my $cppfilt = $obj_tool_map{"c++filt"}; open(SYMBOL, "$command_line | $cppfilt |") or error($command_line); @@ -2508,10 +2900,10 @@ sub BaseName { sub MakeProfileBaseName { my ($binary_name, $profile_name) = @_; - my ($host, $port, $path) = ParseProfileURL($profile_name); + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); my $binary_shortname = BaseName($binary_name); - return sprintf("%s.%s.%s-port%s", - $binary_shortname, $main::op_time, $host, $port); + return sprintf("%s.%s.%s", + $binary_shortname, $main::op_time, $host); } sub FetchDynamicProfile { @@ -2523,7 +2915,7 @@ sub FetchDynamicProfile { if (!IsProfileURL($profile_name)) { return $profile_name; } else { - my ($host, $port, $path) = ParseProfileURL($profile_name); + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); if ($path eq "" || $path eq "/") { # Missing type specifier defaults to cpu-profile $path = $PROFILE_PAGE; @@ -2531,35 +2923,26 @@ sub FetchDynamicProfile { my $profile_file = MakeProfileBaseName($binary_name, $profile_name); - my $url; - my $wget_timeout; - if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)) { - if ($path =~ m/$PROFILE_PAGE/) { - $url = sprintf("http://$host:$port$path?seconds=%d", - $main::opt_seconds); + my $url = "$baseURL$path"; + my $fetch_timeout = undef; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { + if ($path =~ m/[?]/) { + $url .= "&"; } else { - if ($profile_name =~ m/[?]/) { - $profile_name .= "&" - } else { - $profile_name .= "?" - } - $url = sprintf("http://$profile_name" . "seconds=%d", - $main::opt_seconds); + $url .= "?"; } - $wget_timeout = sprintf("--timeout=%d", - int($main::opt_seconds * 1.01 + 60)); + $url .= sprintf("seconds=%d", $main::opt_seconds); + $fetch_timeout = $main::opt_seconds * 1.01 + 60; } else { # For non-CPU profiles, we add a type-extension to # the target profile file name. my $suffix = $path; $suffix =~ s,/,.,g; - $profile_file .= "$suffix"; - $url = "http://$host:$port$path"; - $wget_timeout = ""; + $profile_file .= $suffix; } my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof"); - if (!(-d $profile_dir)) { + if (! -d $profile_dir) { mkdir($profile_dir) || die("Unable to create profile directory $profile_dir: $!\n"); } @@ -2570,14 +2953,15 @@ sub FetchDynamicProfile { return $real_profile; } - my $cmd = "$WGET $WGET_FLAGS $wget_timeout -q -O $tmp_profile '$url'"; - if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)){ + my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout); + my $cmd = "$fetcher '$url' > '$tmp_profile'"; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){ print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; if ($encourage_patience) { print STDERR "Be patient...\n"; } } else { - print STDERR "Fetching $path profile from $host:$port to\n ${real_profile}\n"; + print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; } (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); @@ -2624,6 +3008,7 @@ sub FetchDynamicProfilesRecurse { } else { $position = 1 | ($position << 1); TryCollectProfile($maxlevel, $level, $position); + cleanup(); exit(0); } } @@ -2662,6 +3047,7 @@ BEGIN { stride => 512 * 1024, # must be a multiple of bitsize/8 slots => [], unpack_code => "", # N for big-endian, V for little + perl_is_64bit => 1, # matters if profile is 64-bit }; bless $self, $class; # Let unittests adjust the stride @@ -2685,17 +3071,15 @@ BEGIN { } @$slots = unpack($self->{unpack_code} . 
"*", $str); } else { - # If we're a 64-bit profile, make sure we're a 64-bit-capable + # If we're a 64-bit profile, check if we're a 64-bit-capable # perl. Otherwise, each slot will be represented as a float # instead of an int64, losing precision and making all the - # 64-bit addresses right. We *could* try to handle this with - # software emulation of 64-bit ints, but that's added complexity - # for no clear benefit (yet). We use 'Q' to test for 64-bit-ness; - # perl docs say it's only available on 64-bit perl systems. + # 64-bit addresses wrong. We won't complain yet, but will + # later if we ever see a value that doesn't fit in 32 bits. my $has_q = 0; eval { $has_q = pack("Q", "1") ? 1 : 1; }; if (!$has_q) { - ::error("$fname: need a 64-bit perl to process this 64-bit profile.\n"); + $self->{perl_is_64bit} = 0; } read($self->{file}, $str, 8); if (substr($str, 4, 4) eq chr(0)x4) { @@ -2731,11 +3115,17 @@ BEGIN { # TODO(csilvers): if this is a 32-bit perl, the math below # could end up in a too-large int, which perl will promote # to a double, losing necessary precision. Deal with that. - if ($self->{unpack_code} eq 'V') { # little-endian - push(@b64_values, $b32_values[$i] + $b32_values[$i+1] * (2**32)); - } else { - push(@b64_values, $b32_values[$i] * (2**32) + $b32_values[$i+1]); - } + # Right now, we just die. + my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); + if ($self->{unpack_code} eq 'N') { # big-endian + ($lo, $hi) = ($hi, $lo); + } + my $value = $lo + $hi * (2**32); + if (!$self->{perl_is_64bit} && # check value is exactly represented + (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { + ::error("Need a 64-bit perl to process this 64-bit profile.\n"); + } + push(@b64_values, $value); } @$slots = @b64_values; } @@ -2764,6 +3154,44 @@ BEGIN { } } +# Return the next line from the profile file, assuming it's a text +# line (which in this case means, doesn't start with a NUL byte). If +# it's not a text line, return "". At EOF, return undef, like perl does. +# Input file should be in binmode. +sub ReadProfileLine { + local *PROFILE = shift; + my $firstchar = ""; + my $line = ""; + read(PROFILE, $firstchar, 1); + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar eq "\0") { + return ""; + } + $line = ; + if (defined($line)) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + } + return $line; +} + +sub IsSymbolizedProfileFile { + my $file_name = shift; + if (!(-e $file_name) || !(-r $file_name)) { + return 0; + } + # Check if the file contains a symbol-section marker. + open(TFILE, "<$file_name"); + binmode TFILE; + my $firstline = ReadProfileLine(*TFILE); + close(TFILE); + if (!$firstline) { + return 0; + } + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + return $firstline =~ /^--- *$symbol_marker/; +} + # Parse profile generated by common/profiler.cc and return a reference # to a map: # $result->{version} Version number of profile file @@ -2798,28 +3226,17 @@ sub ReadProfile { # whole firstline, since it may be gigabytes(!) of data. 
open(PROFILE, "<$fname") || error("$fname: $!\n"); binmode PROFILE; # New perls do UTF-8 processing - my $firstchar = ""; - my $header = ""; - read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar ne "\0") { - $header = ; - $header =~ s/\r//g; # turn windows-looking lines into unix-looking lines + my $header = ReadProfileLine(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); } my $symbols; if ($header =~ m/^--- *$symbol_marker/o) { - # read the symbol section of the symbolized profile file + # Read the symbol section of the symbolized profile file. $symbols = ReadSymbols(*PROFILE{IO}); - - # read the next line to get the header for the remaining profile - $header = ""; - read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar ne "\0") { - $header = ; - $header =~ s/\r//g; - } + # Read the next line to get the header for the remaining profile. + $header = ReadProfileLine(*PROFILE) || ""; } my $result; @@ -3114,18 +3531,18 @@ sub ReadHeapProfile { # The sampling frequency is the rate of a Poisson process. # This means that the probability of sampling an allocation of # size X with sampling rate Y is 1 - exp(-X/Y) - my $ratio; - my $scale_factor; - if ($n1 != 0) { - $ratio = (($s1*1.0)/$n1)/($sample_adjustment); - $scale_factor = 1/(1 - exp(-$ratio)); - $n1 *= $scale_factor; - $s1 *= $scale_factor; - } - $ratio = (($s2*1.0)/$n2)/($sample_adjustment); - $scale_factor = 1/(1 - exp(-$ratio)); + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); $n2 *= $scale_factor; $s2 *= $scale_factor; + } } else { # Remote-heap version 1 my $ratio; @@ -3676,35 +4093,34 @@ sub ExtractSymbols { my $symbols = {}; - # Map each PC value to the containing library + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (binary will have the lowest start addr). my @pcs = (sort { $a cmp $b } keys(%{$pcset})); - foreach my $lib (reverse sort {$a->[1] cmp $b->[1]} @{$libs}) { + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { my $libname = $lib->[0]; my $start = $lib->[1]; my $finish = $lib->[2]; my $offset = $lib->[3]; # Get list of pcs that belong in this library. 
- my $pc = pop(@pcs); - my @pcs2 = (); my $contained = []; - while (defined $pc && $pc gt $finish) { - unshift(@pcs2, $pc); - $pc = pop(@pcs); + my ($start_pc_index, $finish_pc_index); + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; } - while (defined $pc && $pc ge $start) { - push(@{$contained}, $pc); - $pc = pop(@pcs); + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; } - if (defined $pc) { - push(@pcs, $pc); - } - @pcs = (@pcs, @pcs2); + @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); # Map to symbols MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); - if (scalar(@pcs) == 0) { - last; - } } return $symbols; @@ -3732,8 +4148,7 @@ sub MapToSymbols { # If "addr2line" isn't installed on the system at all, just use # nm to get what info we can (function names, but not line numbers). - if ($main::opt_lines == 0 || system("$addr2line --help >/dev/null 2>&1") - != 0) { + if (system("$addr2line --help >/dev/null 2>&1") != 0) { MapSymbolsWithNM($image, $offset, $pclist, $symbols); return; } @@ -3919,6 +4334,8 @@ sub ConfigureObjTools { if ($file_type =~ /Mach-O/) { # OS X uses otool to examine Mach-O files, rather than objdump. $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump } # Go fill in %obj_tool_map with the pathnames to use: @@ -3935,18 +4352,27 @@ sub ConfigureTool { my $tool = shift; my $path; - if ($main::opt_tools ne "") { - # Use a prefix specified by the --tools option... - $path = $main::opt_tools . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by --tools $main::opt_tools\n"); + # --tools (or $PPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } } - } elsif (exists $ENV{"PPROF_TOOLS"} && - $ENV{"PPROF_TOOLS"} ne "") { - #... or specified with the PPROF_TOOLS environment variable... - $path = $ENV{"PPROF_TOOLS"} . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by PPROF_TOOLS=$ENV{PPROF_TOOLS}\n"); + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$PPROF_TOOLS) '$tools'\n"); } } else { # ... otherwise use the version that exists in the same directory as @@ -3965,9 +4391,8 @@ sub ConfigureTool { sub cleanup { unlink($main::tmpfile_sym); - for (my $i = 0; $i < $main::next_tmpfile; $i++) { - unlink(PsTempName($i)); - } + unlink(keys %main::tempnames); + # We leave any collected profiles in $HOME/pprof in case the user wants # to look at them later. We print a message informing them of this. 
if ((scalar(@main::profile_files) > 0) && @@ -4010,7 +4435,7 @@ sub GetProcedureBoundariesViaNm { my $routine = ""; while () { s/\r//g; # turn windows-looking lines into unix-looking lines - if (m/^([0-9a-f]+) (.) (..*)/) { + if (m/^\s*([0-9a-f]+) (.) (..*)/) { my $start_val = $1; my $type = $2; my $this_routine = $3; @@ -4072,7 +4497,6 @@ sub GetProcedureBoundariesViaNm { $symbol_table->{$routine} = [HexExtend($last_start), HexExtend($last_start)]; } - return $symbol_table; } @@ -4120,7 +4544,11 @@ sub GetProcedureBoundaries { my @nm_commands = ("$nm -n $flatten_flag $demangle_flag" . " $image 2>/dev/null $cppfilt_flag", "$nm -D -n $flatten_flag $demangle_flag" . - " $image 2>/dev/null $cppfilt_flag"); + " $image 2>/dev/null $cppfilt_flag", + # 6nm is for Go binaries + "6nm $image 2>/dev/null | sort", + ); + # If the executable is an MS Windows PDB-format executable, we'll # have set up obj_tool_map("nm_pdb"). In this case, we actually # want to use both unix nm and windows-specific nm_pdb, since From 3c26a7d68e90eefe2368945615d903aa0a1061b7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 12:04:41 -0700 Subject: [PATCH 21/48] Remove malloc_swap_enable(). Remove malloc_swap_enable(), which was obsoleted by the "swap.fds" mallctl. The prototype for malloc_swap_enable() was removed from jemalloc/jemalloc.h, but the function itself was accidentally left in place. --- jemalloc/src/jemalloc.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index a79752eb..98c4485c 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1248,23 +1248,6 @@ JEMALLOC_P(malloc_usable_size)(const void *ptr) return (ret); } -#ifdef JEMALLOC_SWAP -JEMALLOC_ATTR(visibility("default")) -int -JEMALLOC_P(malloc_swap_enable)(const int *fds, unsigned nfds, int prezeroed) -{ - - /* - * Make sure malloc is initialized, because we need page size, chunk - * size, etc. - */ - if (malloc_init()) - return (-1); - - return (chunk_swap_enable(fds, nfds, (prezeroed != 0)) ? -1 : 0); -} -#endif - JEMALLOC_ATTR(visibility("default")) void JEMALLOC_P(malloc_stats_print)(void (*write_cb)(void *, const char *), From 4d5c09905e744deabbfbed4871b113344cf84973 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 21:35:27 -0700 Subject: [PATCH 22/48] Print prof-libgcc configure setting. --- jemalloc/configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 7d0561aa..26e2b012 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -828,6 +828,7 @@ AC_MSG_RESULT([cc-silence : ${enable_cc_silence}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) AC_MSG_RESULT([prof : ${enable_prof}]) +AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}]) AC_MSG_RESULT([tiny : ${enable_tiny}]) AC_MSG_RESULT([tcache : ${enable_tcache}]) From a881cd2c61c1ced56f87fcb9d7ef6e92b81e6c58 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 15:18:50 -0700 Subject: [PATCH 23/48] Make cumulative heap profile data optional. Add the R option to control whether cumulative heap profile data are maintained. Add the T option to control the size of per thread backtrace caches, primarily because when the R option is specified, backtraces that no longer have allocations associated with them are discarded as soon as no thread caches refer to them. 
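
As an illustrative sketch (not part of the patch itself): the two new options are also exposed as read-only mallctl entries, "opt.prof_accum" and "opt.lg_prof_tcmax" (see the ctl.c and stats.c hunks below). The program below queries them at run time, reusing the JEMALLOC_P()/JEMALLOC_MANGLE conventions that appear elsewhere in this series; both lookups simply fail if the library was built without --enable-prof.

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

#define JEMALLOC_MANGLE
#include "jemalloc/jemalloc.h"

int
main(void)
{
	bool accum;
	ssize_t lg_tcmax;
	size_t bsz = sizeof(accum);
	size_t ssz = sizeof(lg_tcmax);

	/* Read-only options added by this change; absent without --enable-prof. */
	if (JEMALLOC_P(mallctl)("opt.prof_accum", &accum, &bsz, NULL, 0) == 0)
		printf("opt.prof_accum: %s\n", accum ? "true" : "false");
	if (JEMALLOC_P(mallctl)("opt.lg_prof_tcmax", &lg_tcmax, &ssz, NULL,
	    0) == 0)
		printf("opt.lg_prof_tcmax: %zd\n", lg_tcmax);

	return (0);
}
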
--- jemalloc/doc/jemalloc.3.in | 55 ++++ jemalloc/include/jemalloc/internal/mutex.h | 1 + jemalloc/include/jemalloc/internal/prof.h | 63 +++-- jemalloc/src/ctl.c | 6 + jemalloc/src/jemalloc.c | 15 ++ jemalloc/src/mutex.c | 10 + jemalloc/src/prof.c | 288 +++++++++++++-------- jemalloc/src/stats.c | 14 + 8 files changed, 328 insertions(+), 124 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 2ac093a6..23e6ca00 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -484,6 +484,12 @@ will disable dirty page purging. @roff_prof@.Dq S @roff_prof@option for probabilistic sampling control. @roff_prof@See the +@roff_prof@.Dq R +@roff_prof@option for control of cumulative sample reporting. +@roff_prof@See the +@roff_prof@.Dq T +@roff_prof@option for control of per thread backtrace caching. +@roff_prof@See the @roff_prof@.Dq I @roff_prof@option for information on interval-triggered profile dumping, and the @roff_prof@.Dq U @@ -595,6 +601,18 @@ Double/halve the size of the maximum size class that is a multiple of the quantum (8 or 16 bytes, depending on architecture). Above this size, cacheline spacing is used for size classes. The default value is 128 bytes. +@roff_prof@.It R +@roff_prof@Enable/disable reporting of cumulative object/byte counts in profile +@roff_prof@dumps. +@roff_prof@If this option is enabled, every unique backtrace must be stored for +@roff_prof@the duration of execution. +@roff_prof@Depending on the application, this can impose a large memory +@roff_prof@overhead, and the cumulative counts are not always of interest. +@roff_prof@See the +@roff_prof@.Dq T +@roff_prof@option for control of per thread backtrace caching, which has +@roff_prof@important interactions. +@roff_prof@This option is enabled by default. @roff_prof@.It S @roff_prof@Double/halve the average interval between allocation samples, as @roff_prof@measured in bytes of allocation activity. @@ -602,6 +620,22 @@ The default value is 128 bytes. @roff_prof@also decreases the computational overhead. @roff_prof@The default sample interval is one (i.e. all allocations are @roff_prof@sampled). +@roff_prof@.It T +@roff_prof@Double/halve the maximum per thread backtrace cache used for heap +@roff_prof@profiling. +@roff_prof@A backtrace can only be discarded if the +@roff_prof@.Dq R +@roff_prof@option is disabled, and no thread caches currently refer to the +@roff_prof@backtrace. +@roff_prof@Therefore, a backtrace cache limit should be imposed if the +@roff_prof@intention is to limit how much memory is used by backtraces. +@roff_prof@By default, no limit is imposed. +@roff_prof@This is internally encoded as (1 << -1), and each +@roff_prof@.Dq T +@roff_prof@that is specified increments the shift amount. +@roff_prof@Therefore, e.g. +@roff_prof@.Ev JEMALLOC_OPTIONS=11T +@roff_prof@specifies a backtrace cache limit of 1024 backtraces. @roff_prof@.It U @roff_prof@Trigger a memory profile dump every time the total virtual memory @roff_prof@exceeds the previous maximum. @@ -992,6 +1026,27 @@ option. @roff_prof@option. @roff_prof@.Ed .\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_accum (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@See the +@roff_prof@.Dq R +@roff_prof@option. 
+@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@See the +@roff_prof@.Dq T +@roff_prof@option. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@See the +@roff_prof@.Dq S +@roff_prof@option. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- @roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" @roff_prof@.Bd -ragged -offset indent -compact @roff_prof@See the diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index 81134153..dcca01ed 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -24,6 +24,7 @@ extern bool isthreaded; #endif bool malloc_mutex_init(malloc_mutex_t *mutex); +void malloc_mutex_destroy(malloc_mutex_t *mutex); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 2c195f39..a8f67bbb 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -6,12 +6,13 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; -typedef struct prof_s prof_t; +typedef struct prof_tcache_s prof_tcache_t; /* Option defaults. */ #define LG_PROF_BT_MAX_DEFAULT 2 #define LG_PROF_SAMPLE_DEFAULT 0 #define LG_PROF_INTERVAL_DEFAULT -1 +#define LG_PROF_TCMAX_DEFAULT -1 /* * Hard limit on stack backtrace depth. Note that the version of @@ -41,9 +42,9 @@ struct prof_bt_s { #ifdef JEMALLOC_PROF_LIBGCC /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ typedef struct { - prof_bt_t *bt; - unsigned nignore; - unsigned max; + prof_bt_t *bt; + unsigned nignore; + unsigned max; } prof_unwind_data_t; #endif @@ -51,11 +52,11 @@ struct prof_cnt_s { /* * Profiling counters. An allocation/deallocation pair can operate on * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t sets_ql, so it is possible for the cur* counters to go + * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require some form - * of 128-bit counter solution; this implementation doesn't bother to - * solve that problem. + * overflow/underflow, but a general solution would require something + * like 128-bit counters; this implementation doesn't bother to solve + * that problem. */ int64_t curobjs; int64_t curbytes; @@ -64,15 +65,18 @@ struct prof_cnt_s { }; struct prof_thr_cnt_s { - /* Linkage into prof_ctx_t's sets_ql. */ - ql_elm(prof_thr_cnt_t) link; + /* Linkage into prof_ctx_t's cnts_ql. */ + ql_elm(prof_thr_cnt_t) cnts_link; + + /* Linkage into thread's LRU. */ + ql_elm(prof_thr_cnt_t) lru_link; /* * Associated context. 
If a thread frees an object that it did not * allocate, it is possible that the context is not cached in the * thread's hash table, in which case it must be able to look up the * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's sets_ql. + * and link it into the prof_ctx_t's cnts_ql. */ prof_ctx_t *ctx; @@ -101,11 +105,11 @@ struct prof_ctx_s { /* Associated backtrace. */ prof_bt_t *bt; - /* Protects cnt_merged and sets_ql. */ + /* Protects cnt_merged and cnts_ql. */ malloc_mutex_t lock; - /* Temporary storage for aggregation during dump. */ - prof_cnt_t cnt_dump; + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* When threads exit, they merge their stats into cnt_merged. */ prof_cnt_t cnt_merged; @@ -117,6 +121,24 @@ struct prof_ctx_s { ql_head(prof_thr_cnt_t) cnts_ql; }; +/* + * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread + * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t + * objects. Other threads may read the prof_thr_cnt_t contents, but no others + * will ever write them. + * + * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data + * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t + * objects. + */ +struct prof_tcache_s { + /* (prof_bt_t *)-->(prof_thr_cnt_t *). */ + ckh_t bt2cnt; + + /* LRU for contents of bt2cnt. */ + ql_head(prof_thr_cnt_t) lru_ql; +}; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -129,11 +151,13 @@ extern bool opt_prof; * to notice state changes. */ extern bool opt_prof_active; -extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ -extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ +extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ -extern bool opt_prof_udump; /* High-water memory dumping. */ -extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_udump; /* High-water memory dumping. */ +extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_accum; /* Report cumulative bytes. */ +extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread bactrace cache) */ /* * Profile dump interval, measured in bytes allocated. 
Each arena triggers a @@ -150,9 +174,6 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -bool prof_init(prof_t *prof, bool master); -void prof_destroy(prof_t *prof); - prof_thr_cnt_t *prof_alloc_prep(size_t size); prof_ctx_t *prof_ctx_get(const void *ptr); void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index e904fdff..76422599 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -82,6 +82,8 @@ CTL_PROTO(opt_lg_prof_sample) CTL_PROTO(opt_lg_prof_interval) CTL_PROTO(opt_prof_udump) CTL_PROTO(opt_prof_leak) +CTL_PROTO(opt_prof_accum) +CTL_PROTO(opt_lg_prof_tcmax) #endif CTL_PROTO(opt_stats_print) CTL_PROTO(opt_lg_qspace_max) @@ -260,6 +262,8 @@ static const ctl_node_t opt_node[] = { {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, {NAME("prof_udump"), CTL(opt_prof_udump)}, {NAME("prof_leak"), CTL(opt_prof_leak)}, + {NAME("prof_accum"), CTL(opt_prof_accum)}, + {NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)}, #endif {NAME("stats_print"), CTL(opt_stats_print)}, {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, @@ -1207,6 +1211,8 @@ CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t) CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool) CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) +CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool) +CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t) #endif CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 98c4485c..6f9ec762 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -512,6 +512,12 @@ MALLOC_OUT: opt_lg_qspace_max++; break; #ifdef JEMALLOC_PROF + case 'r': + opt_prof_accum = false; + break; + case 'R': + opt_prof_accum = true; + break; case 's': if (opt_lg_prof_sample > 0) opt_lg_prof_sample--; @@ -521,6 +527,15 @@ MALLOC_OUT: (sizeof(uint64_t) << 3)) opt_lg_prof_sample++; break; + case 't': + if (opt_lg_prof_tcmax >= 0) + opt_lg_prof_tcmax--; + break; + case 'T': + if (opt_lg_prof_tcmax + 1 < + (sizeof(size_t) << 3)) + opt_lg_prof_tcmax++; + break; case 'u': opt_prof_udump = false; break; diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c index 337312bd..3ecb18a3 100644 --- a/jemalloc/src/mutex.c +++ b/jemalloc/src/mutex.c @@ -72,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex) return (false); } + +void +malloc_mutex_destroy(malloc_mutex_t *mutex) +{ + + if (pthread_mutex_destroy(mutex) != 0) { + malloc_write(": Error in pthread_mutex_destroy()\n"); + abort(); + } +} diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 8d13451c..7ffda230 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -24,47 +24,41 @@ size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; bool opt_prof_udump = false; bool opt_prof_leak = false; +bool opt_prof_accum = true; +ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; uint64_t prof_interval; bool prof_promote; /* * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data - * structure that knows about all backtraces ever captured. + * structure that knows about all backtraces currently captured. */ static ckh_t bt2ctx; static malloc_mutex_t bt2ctx_mtx; -/* - * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread - * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. 
Other threads may read the prof_thr_cnt_t contents, but no others - * will ever write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data - * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t - * objects. - */ +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ #ifndef NO_TLS -static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec")); -# define BT2CNT_GET() bt2cnt_tls -# define BT2CNT_SET(v) do { \ - bt2cnt_tls = (v); \ - pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ +static __thread prof_tcache_t *prof_tcache_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_TCACHE_GET() prof_tcache_tls +# define PROF_TCACHE_SET(v) do { \ + prof_tcache_tls = (v); \ + pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) #else -# define BT2CNT_GET() ((ckh_t *)pthread_getspecific(bt2cnt_tsd)) -# define BT2CNT_SET(v) do { \ - pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ +# define PROF_TCACHE_GET() ((ckh_t *)pthread_getspecific(prof_tcache_tsd)) +# define PROF_TCACHE_SET(v) do { \ + pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) #endif /* * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that bt2cnt_tls contents can be merged, + * called when a thread exits, so that prof_tcache_tls contents can be merged, * unlinked, and deallocated. */ -static pthread_key_t bt2cnt_tsd; +static pthread_key_t prof_tcache_tsd; /* (1U << opt_lg_prof_bt_max). */ static unsigned prof_bt_max; @@ -137,6 +131,7 @@ static bool enq_udump; static prof_bt_t *bt_dup(prof_bt_t *bt); static void bt_init(prof_bt_t *bt, void **vec); +static void bt_destroy(prof_bt_t *bt); #ifdef JEMALLOC_PROF_LIBGCC static _Unwind_Reason_Code prof_unwind_init_callback( struct _Unwind_Context *context, void *arg); @@ -148,8 +143,10 @@ static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); static bool prof_flush(bool propagate_err); static bool prof_write(const char *s, bool propagate_err); -static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, +static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx); +static void prof_ctx_destroy(prof_ctx_t *ctx); +static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt); static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err); static bool prof_dump_maps(bool propagate_err); @@ -160,7 +157,7 @@ static void prof_fdump(void); static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); -static void bt2cnt_thread_cleanup(void *arg); +static void prof_tcache_cleanup(void *arg); #ifdef NO_TLS static void prof_sample_state_thread_cleanup(void *arg); #endif @@ -175,6 +172,13 @@ bt_init(prof_bt_t *bt, void **vec) bt->len = 0; } +static void +bt_destroy(prof_bt_t *bt) +{ + + idalloc(bt); +} + static prof_bt_t * bt_dup(prof_bt_t *bt) { @@ -487,23 +491,25 @@ prof_lookup(prof_bt_t *bt) prof_thr_cnt_t *p; void *v; } ret; - ckh_t *bt2cnt = BT2CNT_GET(); + prof_tcache_t *prof_tcache = PROF_TCACHE_GET(); - if (bt2cnt == NULL) { + if (prof_tcache == NULL) { /* Initialize an empty cache for this thread. 
*/ - bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t)); - if (bt2cnt == NULL) + prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t)); + if (prof_tcache == NULL) return (NULL); - if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash, - prof_bt_keycomp)) { - idalloc(bt2cnt); + + if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp)) { + idalloc(prof_tcache); return (NULL); } + ql_new(&prof_tcache->lru_ql); - BT2CNT_SET(bt2cnt); + PROF_TCACHE_SET(prof_tcache); } - if (ckh_search(bt2cnt, bt, NULL, &ret.v)) { + if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) { union { prof_bt_t *p; void *v; @@ -519,7 +525,6 @@ prof_lookup(prof_bt_t *bt) */ prof_enter(); if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { - /* bt has never been seen before. Insert it. */ ctx.v = imalloc(sizeof(prof_ctx_t)); if (ctx.v == NULL) { @@ -544,28 +549,60 @@ prof_lookup(prof_bt_t *bt) if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { /* OOM. */ prof_leave(); + malloc_mutex_destroy(&ctx.p->lock); idalloc(btkey.v); idalloc(ctx.v); return (NULL); } } + /* + * Acquire ctx's lock before releasing bt2ctx_mtx, in order to + * avoid a race condition with prof_ctx_destroy(). + */ + malloc_mutex_lock(&ctx.p->lock); prof_leave(); /* Link a prof_thd_cnt_t into ctx for this thread. */ - ret.v = imalloc(sizeof(prof_thr_cnt_t)); - if (ret.p == NULL) - return (NULL); - ql_elm_new(ret.p, link); + if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt) + == (ZU(1) << opt_lg_prof_tcmax)) { + assert(ckh_count(&prof_tcache->bt2cnt) > 0); + /* + * Flush the least least recently used cnt in order to + * keep bt2cnt from becoming too large. + */ + ret.p = ql_last(&prof_tcache->lru_ql, lru_link); + assert(ret.v != NULL); + ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL, + NULL); + ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); + prof_ctx_merge(ret.p->ctx, ret.p); + /* ret can now be re-used. */ + } else { + assert(opt_lg_prof_tcmax < 0 || + ckh_count(&prof_tcache->bt2cnt) < (ZU(1) << + opt_lg_prof_tcmax)); + /* Allocate and partially initialize a new cnt. */ + ret.v = imalloc(sizeof(prof_thr_cnt_t)); + if (ret.p == NULL) + return (NULL); + ql_elm_new(ret.p, cnts_link); + ql_elm_new(ret.p, lru_link); + } + /* Finish initializing ret. */ ret.p->ctx = ctx.p; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(bt2cnt, btkey.v, ret.v)) { + if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) { idalloc(ret.v); return (NULL); } - malloc_mutex_lock(&ctx.p->lock); - ql_tail_insert(&ctx.p->cnts_ql, ret.p, link); + ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); + ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link); malloc_mutex_unlock(&ctx.p->lock); + } else { + /* Move ret to the front of the LRU. 
*/ + ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); } return (ret.p); @@ -729,8 +766,10 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt) /*********/ cnt->cnts.curobjs++; cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } /*********/ mb_write(); /*********/ @@ -796,8 +835,10 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, if ((uintptr_t)cnt > (uintptr_t)1U) { cnt->cnts.curobjs++; cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } } /*********/ mb_write(); @@ -896,15 +937,15 @@ prof_write(const char *s, bool propagate_err) } static void -prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) +prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) { prof_thr_cnt_t *thr_cnt; prof_cnt_t tcnt; malloc_mutex_lock(&ctx->lock); - memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t)); - ql_foreach(thr_cnt, &ctx->cnts_ql, link) { + memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); + ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) { volatile unsigned *epoch = &thr_cnt->epoch; while (true) { @@ -921,39 +962,101 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) break; } - ctx->cnt_dump.curobjs += tcnt.curobjs; - ctx->cnt_dump.curbytes += tcnt.curbytes; - ctx->cnt_dump.accumobjs += tcnt.accumobjs; - ctx->cnt_dump.accumbytes += tcnt.accumbytes; + ctx->cnt_summed.curobjs += tcnt.curobjs; + ctx->cnt_summed.curbytes += tcnt.curbytes; + if (opt_prof_accum) { + ctx->cnt_summed.accumobjs += tcnt.accumobjs; + ctx->cnt_summed.accumbytes += tcnt.accumbytes; + } if (tcnt.curobjs != 0) (*leak_nctx)++; } - /* Merge into cnt_all. */ - cnt_all->curobjs += ctx->cnt_dump.curobjs; - cnt_all->curbytes += ctx->cnt_dump.curbytes; - cnt_all->accumobjs += ctx->cnt_dump.accumobjs; - cnt_all->accumbytes += ctx->cnt_dump.accumbytes; + /* Add to cnt_all. */ + cnt_all->curobjs += ctx->cnt_summed.curobjs; + cnt_all->curbytes += ctx->cnt_summed.curbytes; + if (opt_prof_accum) { + cnt_all->accumobjs += ctx->cnt_summed.accumobjs; + cnt_all->accumbytes += ctx->cnt_summed.accumbytes; + } malloc_mutex_unlock(&ctx->lock); } +static void +prof_ctx_destroy(prof_ctx_t *ctx) +{ + + /* + * Check that ctx is still unused by any thread cache before destroying + * it. prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to + * avoid a race condition with this function. + */ + prof_enter(); + malloc_mutex_lock(&ctx->lock); + if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) { + assert(ctx->cnt_merged.curbytes == 0); + assert(ctx->cnt_merged.accumobjs == 0); + assert(ctx->cnt_merged.accumbytes == 0); + /* Remove ctx from bt2ctx. */ + ckh_remove(&bt2ctx, ctx->bt, NULL, NULL); + prof_leave(); + /* Destroy ctx. */ + malloc_mutex_unlock(&ctx->lock); + bt_destroy(ctx->bt); + malloc_mutex_destroy(&ctx->lock); + idalloc(ctx); + } else { + malloc_mutex_unlock(&ctx->lock); + prof_leave(); + } +} + +static void +prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +{ + bool destroy; + + /* Merge cnt stats and detach from ctx. 
*/ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs += cnt->cnts.curobjs; + ctx->cnt_merged.curbytes += cnt->cnts.curbytes; + ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; + ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; + ql_remove(&ctx->cnts_ql, cnt, cnts_link); + if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL && + ctx->cnt_merged.curobjs == 0) + destroy = true; + else + destroy = false; + malloc_mutex_unlock(&ctx->lock); + if (destroy) + prof_ctx_destroy(ctx); +} + static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err) { char buf[UMAX2S_BUFSIZE]; unsigned i; - if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err) + if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) { + assert(ctx->cnt_summed.curbytes == 0); + assert(ctx->cnt_summed.accumobjs == 0); + assert(ctx->cnt_summed.accumbytes == 0); + return (false); + } + + if (prof_write(umax2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf), + || prof_write(umax2s(ctx->cnt_summed.curbytes, 10, buf), propagate_err) || prof_write(" [", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf), + || prof_write(umax2s(ctx->cnt_summed.accumobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf), + || prof_write(umax2s(ctx->cnt_summed.accumbytes, 10, buf), propagate_err) || prof_write("] @", propagate_err)) return (true); @@ -1060,7 +1163,7 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) leak_nctx = 0; for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) { - prof_ctx_merge(ctx.p, &cnt_all, &leak_nctx); + prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx); } /* Dump profile header. */ @@ -1319,54 +1422,33 @@ prof_bt_keycomp(const void *k1, const void *k2) } static void -bt2cnt_thread_cleanup(void *arg) +prof_tcache_cleanup(void *arg) { - ckh_t *bt2cnt; + prof_tcache_t *prof_tcache; - bt2cnt = BT2CNT_GET(); - if (bt2cnt != NULL) { - ql_head(prof_thr_cnt_t) cnts_ql; - size_t tabind; - union { - prof_thr_cnt_t *p; - void *v; - } cnt; - - /* Iteratively merge cnt's into the global stats. */ - ql_new(&cnts_ql); - tabind = 0; - while (ckh_iter(bt2cnt, &tabind, NULL, &cnt.v) == - false) { - prof_ctx_t *ctx = cnt.p->ctx; - /* Merge stats and detach from ctx. */ - malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs += cnt.p->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt.p->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt.p->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt.p->cnts.accumbytes; - ql_remove(&ctx->cnts_ql, cnt.p, link); - malloc_mutex_unlock(&ctx->lock); - - /* - * Stash cnt for deletion after finishing with - * ckh_iter(). - */ - ql_tail_insert(&cnts_ql, cnt.p, link); - } + prof_tcache = PROF_TCACHE_GET(); + if (prof_tcache != NULL) { + prof_thr_cnt_t *cnt; /* - * Delete the hash table now that cnts_ql has a list of all - * cnt's. + * Delete the hash table. All of its contents can still be + * iterated over via the LRU. */ - ckh_delete(bt2cnt); - idalloc(bt2cnt); - BT2CNT_SET(NULL); + ckh_delete(&prof_tcache->bt2cnt); - /* Delete cnt's. */ - while ((cnt.p = ql_last(&cnts_ql, link)) != NULL) { - ql_remove(&cnts_ql, cnt.p, link); - idalloc(cnt.v); + /* + * Iteratively merge cnt's into the global stats and delete + * them. 
+ */ + while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) != + NULL) { + prof_ctx_merge(cnt->ctx, cnt); + ql_remove(&prof_tcache->lru_ql, cnt, lru_link); + idalloc(cnt); } + + idalloc(prof_tcache); + PROF_TCACHE_SET(NULL); } } @@ -1419,7 +1501,7 @@ prof_boot1(void) return (true); if (malloc_mutex_init(&bt2ctx_mtx)) return (true); - if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup) + if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup) != 0) { malloc_write( ": Error in pthread_key_create()\n"); diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 9dc75293..9b3271b2 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -469,6 +469,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz, NULL, 0)) == 0) write_cb(cbopaque, bv ? "P" : "p"); + if ((err = JEMALLOC_P(mallctl)("opt.prof_accum", &bv, &bsz, + NULL, 0)) == 0) + write_cb(cbopaque, bv ? "R" : "r"); if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz, NULL, 0)) == 0) write_cb(cbopaque, bv ? "U" : "u"); @@ -580,6 +583,17 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, umax2s((1U << sv), 10, s)); write_cb(cbopaque, "\n"); + CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t); + write_cb(cbopaque, + "Maximum per thread backtrace cache: "); + if (ssv >= 0) { + write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, " (2^"); + write_cb(cbopaque, umax2s(ssv, 10, s)); + write_cb(cbopaque, ")\n"); + } else + write_cb(cbopaque, "N/A\n"); + CTL_GET("opt.lg_prof_sample", &sv, size_t); write_cb(cbopaque, "Average profile sample interval: "); write_cb(cbopaque, umax2s((1U << sv), 10, s)); From 588a32cd843ed6724b1b4d5317d027d39ae054af Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 22:38:14 -0700 Subject: [PATCH 24/48] Increase default backtrace depth from 4 to 128. Increase the default backtrace depth, because shallow backtraces tend to result in confusing pprof output graphs. --- jemalloc/doc/jemalloc.3.in | 2 +- jemalloc/include/jemalloc/internal/prof.h | 11 +++-- jemalloc/src/prof.c | 56 +++++++++++++++++++++-- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 23e6ca00..c8d2f29c 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -436,7 +436,7 @@ in these cases. @roff_prof@.It B @roff_prof@Double/halve the maximum backtrace depth when profiling memory @roff_prof@allocation activity. -@roff_prof@The default is 4. +@roff_prof@The default is 128. .It C Double/halve the size of the maximum size class that is a multiple of the cacheline size (64). diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index a8f67bbb..1aa85bb5 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -9,7 +9,7 @@ typedef struct prof_ctx_s prof_ctx_t; typedef struct prof_tcache_s prof_tcache_t; /* Option defaults. */ -#define LG_PROF_BT_MAX_DEFAULT 2 +#define LG_PROF_BT_MAX_DEFAULT 7 #define LG_PROF_SAMPLE_DEFAULT 0 #define LG_PROF_INTERVAL_DEFAULT -1 #define LG_PROF_TCMAX_DEFAULT -1 @@ -17,10 +17,13 @@ typedef struct prof_tcache_s prof_tcache_t; /* * Hard limit on stack backtrace depth. 
Note that the version of * prof_backtrace() that is based on __builtin_return_address() necessarily has - * a hard-coded number of backtrace frame handlers, so increasing - * LG_PROF_BT_MAX requires changing prof_backtrace(). + * a hard-coded number of backtrace frame handlers. */ -#define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ +#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND)) +# define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1) +#else +# define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ +#endif #define PROF_BT_MAX (1U << LG_PROF_BT_MAX) /* Initial hash table size. */ diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 7ffda230..5b66f97b 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -52,7 +52,6 @@ static __thread prof_tcache_t *prof_tcache_tls pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) #endif - /* * Same contents as b2cnt_tls, but initialized such that the TSD destructor is * called when a thread exits, so that prof_tcache_tls contents can be merged, @@ -60,6 +59,29 @@ static __thread prof_tcache_t *prof_tcache_tls */ static pthread_key_t prof_tcache_tsd; +/* Thread-specific backtrace vector, used for calls to prof_backtrace(). */ +#ifndef NO_TLS +static __thread void **vec_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define VEC_GET() vec_tls +# define VEC_SET(v) do { \ + vec_tls = (v); \ + pthread_setspecific(vec_tsd, (void *)(v)); \ +} while (0) +#else +# define VEC_GET() ((ckh_t *)pthread_getspecific(vec_tsd)) +# define VEC_SET(v) do { \ + pthread_setspecific(vec_tsd, (void *)(v)); \ +} while (0) +#endif +/* + * Same contents as vec_tls, but initialized such that the TSD destructor is + * called when a thread exits, so that vec_tls contents can be merged, + * unlinked, and deallocated. + */ +static pthread_key_t vec_tsd; + + /* (1U << opt_lg_prof_bt_max). */ static unsigned prof_bt_max; @@ -158,6 +180,7 @@ static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); static void prof_tcache_cleanup(void *arg); +static void vec_cleanup(void *arg); #ifdef NO_TLS static void prof_sample_state_thread_cleanup(void *arg); #endif @@ -632,9 +655,17 @@ prof_thr_cnt_t * prof_alloc_prep(size_t size) { prof_thr_cnt_t *ret; - void *vec[prof_bt_max]; + void **vec; prof_bt_t bt; + vec = VEC_GET(); + if (vec == NULL) { + vec = imalloc(sizeof(void *) * prof_bt_max); + if (vec == NULL) + return (NULL); + VEC_SET(vec); + } + if (opt_prof_active == false) { /* Sampling is currently inactive, so avoid sampling. */ ret = (prof_thr_cnt_t *)(uintptr_t)1U; @@ -1161,10 +1192,8 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) /* Merge per thread profile stats, and sum them in cnt_all. */ memset(&cnt_all, 0, sizeof(prof_cnt_t)); leak_nctx = 0; - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) - == false;) { + for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx); - } /* Dump profile header. 
*/ if (prof_write("heap profile: ", propagate_err) @@ -1452,6 +1481,18 @@ prof_tcache_cleanup(void *arg) } } +static void +vec_cleanup(void *arg) +{ + void **vec; + + vec = VEC_GET(); + if (vec != NULL) { + idalloc(vec); + VEC_SET(NULL); + } +} + #ifdef NO_TLS static void prof_sample_state_thread_cleanup(void *arg) @@ -1507,6 +1548,11 @@ prof_boot1(void) ": Error in pthread_key_create()\n"); abort(); } + if (pthread_key_create(&vec_tsd, vec_cleanup) != 0) { + malloc_write( + ": Error in pthread_key_create()\n"); + abort(); + } #ifdef NO_TLS if (pthread_key_create(&prof_sample_state_tsd, prof_sample_state_thread_cleanup) != 0) { From 9ce3bfd92d36d52d68ec6dd97e521a4c1fa2714f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 22:39:59 -0700 Subject: [PATCH 25/48] Fix leak context count reporting. Fix a bug in leak context count reporting that tended to cause the number of contexts to be underreported. The reported number of leaked objects and bytes were not affected by this bug. --- jemalloc/src/prof.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 5b66f97b..783a83d2 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -999,11 +999,11 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) ctx->cnt_summed.accumobjs += tcnt.accumobjs; ctx->cnt_summed.accumbytes += tcnt.accumbytes; } - - if (tcnt.curobjs != 0) - (*leak_nctx)++; } + if (ctx->cnt_summed.curobjs != 0) + (*leak_nctx)++; + /* Add to cnt_all. */ cnt_all->curobjs += ctx->cnt_summed.curobjs; cnt_all->curbytes += ctx->cnt_summed.curbytes; From c6e950665c0566934b8b058ea11fc44cdcfee3ce Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 3 Oct 2010 00:10:15 -0700 Subject: [PATCH 26/48] Increase PRN 'a' and 'c' constants. Increase PRN 'a' and 'c' constants, so that high bits tend to cascade more. --- jemalloc/include/jemalloc/internal/ckh.h | 2 +- jemalloc/src/prof.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/ckh.h b/jemalloc/include/jemalloc/internal/ckh.h index c39ea5c7..d4e391b6 100644 --- a/jemalloc/include/jemalloc/internal/ckh.h +++ b/jemalloc/include/jemalloc/internal/ckh.h @@ -45,7 +45,7 @@ struct ckh_s { #endif /* Used for pseudo-random number generation. */ -#define CKH_A 12345 +#define CKH_A 1103515241 #define CKH_C 12347 uint32_t prn_state; diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 783a83d2..e715da9a 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -644,7 +644,7 @@ prof_sample_threshold_update(void) */ PROF_SAMPLE_STATE_GET(prof_sample_state); prn64(r, 53, prof_sample_state->prn_state, - (uint64_t)1125899906842625LLU, 1058392653243283975); + (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); u = (double)r * (1.0/9007199254740992.0L); prof_sample_state->threshold = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) From 1506a1b90341cf27415c01b271b56f82f682fd07 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 7 Oct 2010 08:52:32 -0700 Subject: [PATCH 27/48] Move variable declaration out of for loop header. Move a loop variable declaration out of for(usigned i = 0; ...) in order to avoid the need for C99 compilation. 
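
As a standalone illustration (hypothetical names, not taken from the tree): the portability issue is simply that declaring the index inside the for-loop header requires C99, whereas hoisting the declaration to block scope compiles under C89 as well.

#include <stdio.h>

static void
print_squares(unsigned n)
{
	unsigned i;	/* C89-compatible: declare outside the loop header. */

	for (i = 0; i < n; i++)	/* rather than: for (unsigned i = 0; ...) */
		printf("%u\n", i * i);
}

int
main(void)
{

	print_squares(4);
	return (0);
}
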
--- jemalloc/src/ctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 76422599..edbbb209 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -1322,7 +1322,8 @@ arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, malloc_mutex_unlock(&arenas_lock); if (arena == UINT_MAX) { - for (unsigned i = 0; i < narenas; i++) { + unsigned i; + for (i = 0; i < narenas; i++) { if (tarenas[i] != NULL) arena_purge_all(tarenas[i]); } From 9f3b0a74fdaac6fa5eb8b5a875b8901bc56b2f5e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 7 Oct 2010 09:53:26 -0700 Subject: [PATCH 28/48] Fix tests build when --with-install-suffix is set. Add test/jemalloc_test.h.in, which is processed to include jemalloc/jemalloc@install_suffix@.h, so that test programs can include it without worrying about the install suffix. --- .gitignore | 9 ++++----- jemalloc/Makefile.in | 4 ++-- jemalloc/configure.ac | 3 +++ jemalloc/test/allocm.c | 2 +- jemalloc/test/jemalloc_test.h.in | 6 ++++++ jemalloc/test/posix_memalign.c | 2 +- jemalloc/test/rallocm.c | 2 +- jemalloc/test/thread_arena.c | 2 +- 8 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 jemalloc/test/jemalloc_test.h.in diff --git a/.gitignore b/.gitignore index 8e95c7a4..b4681866 100644 --- a/.gitignore +++ b/.gitignore @@ -6,11 +6,10 @@ /jemalloc/doc/jemalloc.3 /jemalloc/lib/ /jemalloc/Makefile -/jemalloc/src/internal/jemalloc_internal\.h -/jemalloc/src/internal/mtrgraph_defs\.h -/jemalloc/src/internal/mtrplay_defs\.h -/jemalloc/src/jemalloc\.h -/jemalloc/src/jemalloc_defs\.h +/jemalloc/include/jemalloc/internal/jemalloc_internal\.h +/jemalloc/include/jemalloc/jemalloc\.h +/jemalloc/include/jemalloc/jemalloc_defs\.h +/jemalloc/test/jemalloc_test\.h /jemalloc/src/*.[od] /jemalloc/test/*.[od] /jemalloc/test/*.out diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index f2a64539..7863c1b7 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -90,8 +90,8 @@ all: $(DSOS) @objroot@test/%.o: @srcroot@test/%.c @mkdir -p $(@D) - $(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $< - @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" + $(CC) $(CFLAGS) -c $(CPPFLAGS) -I@objroot@test -o $@ $< + @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) -I@objroot@test $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" @objroot@test/%: @objroot@test/%.o \ @objroot@lib/libjemalloc@install_suffix@.$(SO) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 26e2b012..687cafcb 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -274,14 +274,17 @@ AC_SUBST([install_suffix]) cfgoutputs_in="${srcroot}Makefile.in ${srcroot}doc/jemalloc.3.in" cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc.h.in" cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in" +cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/jemalloc_test.h.in" cfgoutputs_out="Makefile doc/jemalloc${install_suffix}.3" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc${install_suffix}.h" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h" +cfgoutputs_out="${cfgoutputs_out} test/jemalloc_test.h" cfgoutputs_tup="Makefile doc/jemalloc${install_suffix}.3:doc/jemalloc.3.in" cfgoutputs_tup="${cfgoutputs_tup} 
include/jemalloc/jemalloc${install_suffix}.h:include/jemalloc/jemalloc.h.in" cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h" +cfgoutputs_tup="${cfgoutputs_tup} test/jemalloc_test.h:test/jemalloc_test.h.in" cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in" diff --git a/jemalloc/test/allocm.c b/jemalloc/test/allocm.c index 4367cb89..59d0002e 100644 --- a/jemalloc/test/allocm.c +++ b/jemalloc/test/allocm.c @@ -3,7 +3,7 @@ #include #define JEMALLOC_MANGLE -#include "jemalloc/jemalloc.h" +#include "jemalloc_test.h" #define CHUNK 0x400000 /* #define MAXALIGN ((size_t)0x80000000000LLU) */ diff --git a/jemalloc/test/jemalloc_test.h.in b/jemalloc/test/jemalloc_test.h.in new file mode 100644 index 00000000..0c48895e --- /dev/null +++ b/jemalloc/test/jemalloc_test.h.in @@ -0,0 +1,6 @@ +/* + * This header should be included by tests, rather than directly including + * jemalloc/jemalloc.h, because --with-install-suffix may cause the header to + * have a different name. + */ +#include "jemalloc/jemalloc@install_suffix@.h" diff --git a/jemalloc/test/posix_memalign.c b/jemalloc/test/posix_memalign.c index c5651d7c..3e306c01 100644 --- a/jemalloc/test/posix_memalign.c +++ b/jemalloc/test/posix_memalign.c @@ -5,7 +5,7 @@ #include #define JEMALLOC_MANGLE -#include "jemalloc/jemalloc.h" +#include "jemalloc_test.h" #define CHUNK 0x400000 /* #define MAXALIGN ((size_t)0x80000000000LLU) */ diff --git a/jemalloc/test/rallocm.c b/jemalloc/test/rallocm.c index b52bdb20..7e8a271c 100644 --- a/jemalloc/test/rallocm.c +++ b/jemalloc/test/rallocm.c @@ -3,7 +3,7 @@ #include #define JEMALLOC_MANGLE -#include "jemalloc/jemalloc.h" +#include "jemalloc_test.h" int main(void) diff --git a/jemalloc/test/thread_arena.c b/jemalloc/test/thread_arena.c index 5b1058b5..bd884e1f 100644 --- a/jemalloc/test/thread_arena.c +++ b/jemalloc/test/thread_arena.c @@ -4,7 +4,7 @@ #include #define JEMALLOC_MANGLE -#include "jemalloc/jemalloc.h" +#include "jemalloc_test.h" void * thread_start(void *arg) From ac6f3c2bb510754a52642b2a3440d009164916da Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 7 Oct 2010 11:59:12 -0700 Subject: [PATCH 29/48] Re-organize prof-libgcc configuration. Re-organize code for --enable-prof-libgcc so that configure doesn't report both libgcc and libunwind support as being configured in. This change has no impact on how jemalloc is actually configured/built. --- jemalloc/configure.ac | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 687cafcb..b27955de 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -437,13 +437,16 @@ AC_SUBST([roff_no_prof]) dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics dnl for backtracing. 
-if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \ - -a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then - enable_prof_libgcc="1" - AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) - AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) - if test "x${enable_prof_libgcc}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) +if test "x$enable_prof" = "x1" -a "x$enable_prof_libgcc" = "x1" ; then + if test "x$enable_prof_libunwind" = "x0" -a "x$GCC" = "xyes" ; then + enable_prof_libgcc="1" + AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) + AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) + if test "x${enable_prof_libgcc}" = "x1" ; then + AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) + fi + else + enable_prof_libgcc="0" fi fi From 004ed142a66529ecf4a55e8f4fa42ff2e535f586 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 14 Oct 2010 00:28:31 -0700 Subject: [PATCH 30/48] Fix a regression in CHUNK_MAP_UNZEROED change. Fix a regression added by revision: 3377ffa1f4f8e67bce1e36624285e5baf5f9ecef Change CHUNK_MAP_ZEROED to CHUNK_MAP_UNZEROED. A modified chunk->map dereference was missing the subtraction of map_bias, which caused incorrect chunk map initialization, as well as potential corruption of the first non-header page of memory within each chunk. --- jemalloc/src/arena.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 45e5fd19..7c99d436 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -395,7 +395,7 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, chunk->map[run_ind+i-map_bias].bits = (i << PAGE_SHIFT) | CHUNK_MAP_ALLOCATED; } - chunk->map[run_ind + need_pages-1-map_bias].bits = ((need_pages + chunk->map[run_ind+need_pages-1-map_bias].bits = ((need_pages - 1) << PAGE_SHIFT) | CHUNK_MAP_ALLOCATED | flag_dirty; } } @@ -457,7 +457,8 @@ arena_chunk_alloc(arena_t *arena) for (i = map_bias+1; i < chunk_npages-1; i++) chunk->map[i-map_bias].bits = unzeroed; } - chunk->map[chunk_npages-1].bits = arena_maxclass | unzeroed; + chunk->map[chunk_npages-1-map_bias].bits = arena_maxclass | + unzeroed; /* Insert the run into the runs_avail_clean tree. */ arena_avail_tree_insert(&arena->runs_avail_clean, From 397e5111b5efd49f61f73c1bad0375c7885a6128 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 16 Oct 2010 16:10:40 -0700 Subject: [PATCH 31/48] Preserve CHUNK_MAP_UNZEROED for small runs. Preserve CHUNK_MAP_UNZEROED when allocating small runs, because it is possible that untouched pages will be returned to the tree of clean runs, where the CHUNK_MAP_UNZEROED flag matters. Prior to the conversion from CHUNK_MAP_ZEROED, this was already a bug, but in the worst case extra zeroing occurred. After the conversion, this bug made it possible to incorrectly treat pages as pre-zeroed. --- jemalloc/src/arena.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 7c99d436..8cc35f5f 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -389,14 +389,18 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, * arena_dalloc_bin_run() has the ability to conditionally trim * clean pages. 
*/ - chunk->map[run_ind-map_bias].bits = CHUNK_MAP_ALLOCATED | - flag_dirty; + chunk->map[run_ind-map_bias].bits = + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) | + CHUNK_MAP_ALLOCATED | flag_dirty; for (i = 1; i < need_pages - 1; i++) { chunk->map[run_ind+i-map_bias].bits = (i << PAGE_SHIFT) - | CHUNK_MAP_ALLOCATED; + | (chunk->map[run_ind+i-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED; } chunk->map[run_ind+need_pages-1-map_bias].bits = ((need_pages - - 1) << PAGE_SHIFT) | CHUNK_MAP_ALLOCATED | flag_dirty; + - 1) << PAGE_SHIFT) | + (chunk->map[run_ind+need_pages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED | flag_dirty; } } From 940a2e02b27b264cc92e8ecbf186a711ce05ad04 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 17 Oct 2010 17:51:37 -0700 Subject: [PATCH 32/48] Fix numerous arena bugs. In arena_ralloc_large_grow(), update the map element for the end of the newly grown run, rather than the interior map element that was the beginning of the appended run. This is a long-standing bug, and it had the potential to cause massive corruption, but triggering it required roughly the following sequence of events: 1) Large in-place growing realloc(), with left-over space in the run that followed the large object. 2) Allocation of the remainder run left over from (1). 3) Deallocation of the remainder run *before* deallocation of the large run, with unfortunate interior map state left over from previous run allocation/deallocation activity, such that one or more pages of allocated memory would be treated as part of the remainder run during run coalescing. In summary, this was a bad bug, but it was difficult to trigger. In arena_bin_malloc_hard(), if another thread wins the race to allocate a bin run, dispose of the spare run via arena_bin_lower_run() rather than arena_run_dalloc(), since the run has already been prepared for use as a bin run. This bug has existed since March 14, 2010: e00572b384c81bd2aba57fac32f7077a34388915 mmap()/munmap() without arena->lock or bin->lock. Fix bugs in arena_dalloc_bin_run(), arena_trim_head(), arena_trim_tail(), and arena_ralloc_large_grow() that could cause the CHUNK_MAP_UNZEROED map bit to become corrupted. These are all long-standing bugs, but the chances of them actually causing problems was much lower before the CHUNK_MAP_ZEROED --> CHUNK_MAP_UNZEROED conversion. Fix a large run statistics regression in arena_ralloc_large_grow() that was introduced on September 17, 2010: 8e3c3c61b5bb676a705450708e7e79698cdc9e0c Add {,r,s,d}allocm(). Add debug code to validate that supposedly pre-zeroed memory really is. 
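[Editor's note] The "debug code to validate that supposedly pre-zeroed memory really is" boils down to walking a page word by word and asserting that every word is zero whenever the chunk map claims the page needs no zeroing. A minimal sketch of that check follows; it is illustrative only: PAGE_SIZE is assumed to be 4096 and the helper name is hypothetical. The patch itself open-codes the loop in arena_run_split(), and a later patch in this series factors it into arena_chunk_validate_zeroed().

    #include <assert.h>
    #include <stddef.h>

    /* Assumed page size for this sketch; jemalloc derives it at build time. */
    #define PAGE_SIZE ((size_t)4096)

    /* Assert that every word of a supposedly pre-zeroed page is zero. */
    static void
    validate_zeroed(const void *page)
    {
        const size_t *p = (const size_t *)page;
        size_t i;

        for (i = 0; i < PAGE_SIZE / sizeof(size_t); i++)
            assert(p[i] == 0);
    }

In the patch this loop runs only when the corresponding map element does not have CHUNK_MAP_UNZEROED set, so a failed assertion would indicate that a page was dirtied without the flag being recorded.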
--- jemalloc/include/jemalloc/internal/tcache.h | 6 +- jemalloc/src/arena.c | 244 ++++++++++++++------ 2 files changed, 171 insertions(+), 79 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index 8c2aad6e..bcd6d44c 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -280,8 +280,8 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) } else { #ifdef JEMALLOC_PROF arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); - size_t pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk) - >> PAGE_SHIFT); + size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> + PAGE_SHIFT); chunk->map[pageind-map_bias].bits |= CHUNK_MAP_CLASS_MASK; #endif @@ -362,7 +362,6 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) arena_chunk_t *chunk; size_t pageind, binind; tcache_bin_t *tbin; - arena_chunk_map_t *mapelm; assert((size & PAGE_MASK) == 0); assert(arena_salloc(ptr) > small_maxclass); @@ -371,7 +370,6 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - mapelm = &chunk->map[pageind-map_bias]; binind = nbins + (size >> PAGE_SHIFT) - 1; #ifdef JEMALLOC_FILL diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 8cc35f5f..36957d97 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -176,6 +176,8 @@ static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); +static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, + arena_run_t *run, arena_bin_t *bin); static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t oldsize, size_t size); static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, @@ -359,6 +361,18 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, PAGE_SHIFT)), 0, PAGE_SIZE); } +#ifdef JEMALLOC_DEBUG + else { + size_t *p = (size_t *) + ((uintptr_t)chunk + + ((run_ind + i) << + PAGE_SHIFT)); + for (size_t j = 0; j < + PAGE_SIZE / sizeof(size_t); + j++) + assert(p[j] == 0); + } +#endif } } else { /* @@ -385,9 +399,9 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, } else { assert(zero == false); /* - * Propagate the dirty flag to the allocated small run, so that - * arena_dalloc_bin_run() has the ability to conditionally trim - * clean pages. + * Propagate the dirty and unzeroed flags to the allocated + * small run, so that arena_dalloc_bin_run() has the ability to + * conditionally trim clean pages. 
*/ chunk->map[run_ind-map_bias].bits = (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) | @@ -461,6 +475,12 @@ arena_chunk_alloc(arena_t *arena) for (i = map_bias+1; i < chunk_npages-1; i++) chunk->map[i-map_bias].bits = unzeroed; } +#ifdef JEMALLOC_DEBUG + else { + for (i = map_bias+1; i < chunk_npages-1; i++) + assert(chunk->map[i-map_bias].bits == unzeroed); + } +#endif chunk->map[chunk_npages-1-map_bias].bits = arena_maxclass | unzeroed; @@ -662,13 +682,13 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) arena_avail_tree_remove( &arena->runs_avail_dirty, mapelm); + mapelm->bits = (npages << PAGE_SHIFT) | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | + flag_unzeroed; /* * Update internal elements in the page map, so * that CHUNK_MAP_UNZEROED is properly set. */ - mapelm->bits = (npages << PAGE_SHIFT) | - CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | - flag_unzeroed; for (i = 1; i < npages - 1; i++) { chunk->map[pageind+i-map_bias].bits = flag_unzeroed; @@ -893,9 +913,9 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) /* Mark pages as unallocated in the chunk map. */ if (dirty) { - chunk->map[run_ind-map_bias].bits = size | flag_dirty; + chunk->map[run_ind-map_bias].bits = size | CHUNK_MAP_DIRTY; chunk->map[run_ind+run_pages-1-map_bias].bits = size | - flag_dirty; + CHUNK_MAP_DIRTY; chunk->ndirty += run_pages; arena->ndirty += run_pages; @@ -1003,18 +1023,40 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, { size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT; size_t head_npages = (oldsize - newsize) >> PAGE_SHIFT; - size_t flags = chunk->map[pageind-map_bias].bits & CHUNK_MAP_FLAGS_MASK; + size_t flag_dirty = chunk->map[pageind-map_bias].bits & CHUNK_MAP_DIRTY; assert(oldsize > newsize); /* * Update the chunk map so that arena_run_dalloc() can treat the - * leading run as separately allocated. + * leading run as separately allocated. Set the last element of each + * run first, in case of single-page runs. 
*/ - assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE); - assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED); - chunk->map[pageind-map_bias].bits = (oldsize - newsize) | flags; - chunk->map[pageind+head_npages-map_bias].bits = newsize | flags; + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0); + chunk->map[pageind+head_npages-1-map_bias].bits = flag_dirty | + (chunk->map[pageind+head_npages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind-map_bias].bits = (oldsize - newsize) + | flag_dirty | (chunk->map[pageind-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + +#ifdef JEMALLOC_DEBUG + { + size_t tail_npages = newsize >> PAGE_SHIFT; + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & ~PAGE_MASK) == 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & CHUNK_MAP_DIRTY) == flag_dirty); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias] + .bits & CHUNK_MAP_ALLOCATED) != 0); + } +#endif + chunk->map[pageind+head_npages-map_bias].bits = newsize | flag_dirty | + (chunk->map[pageind+head_npages-map_bias].bits & + CHUNK_MAP_FLAGS_MASK) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; arena_run_dalloc(arena, run, false); } @@ -1024,20 +1066,40 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize, bool dirty) { size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT; - size_t npages = newsize >> PAGE_SHIFT; - size_t flags = chunk->map[pageind-map_bias].bits & CHUNK_MAP_FLAGS_MASK; + size_t head_npages = newsize >> PAGE_SHIFT; + size_t tail_npages = (oldsize - newsize) >> PAGE_SHIFT; + size_t flag_dirty = chunk->map[pageind-map_bias].bits & + CHUNK_MAP_DIRTY; assert(oldsize > newsize); /* * Update the chunk map so that arena_run_dalloc() can treat the - * trailing run as separately allocated. + * trailing run as separately allocated. Set the last element of each + * run first, in case of single-page runs. 
*/ - assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE); - assert(chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED); - chunk->map[pageind-map_bias].bits = newsize | flags; - chunk->map[pageind+npages-1-map_bias].bits = newsize | flags; - chunk->map[pageind+npages-map_bias].bits = (oldsize - newsize) | flags; + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0); + chunk->map[pageind+head_npages-1-map_bias].bits = flag_dirty | + (chunk->map[pageind+head_npages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind-map_bias].bits = newsize | flag_dirty | + (chunk->map[pageind-map_bias].bits & CHUNK_MAP_UNZEROED) | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + ~PAGE_MASK) == 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + CHUNK_MAP_LARGE) != 0); + assert((chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + CHUNK_MAP_ALLOCATED) != 0); + chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits = + flag_dirty | + (chunk->map[pageind+head_npages+tail_npages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind+head_npages-map_bias].bits = (oldsize - newsize) | + flag_dirty | (chunk->map[pageind+head_npages-map_bias].bits & + CHUNK_MAP_UNZEROED) | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize), dirty); @@ -1102,7 +1164,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) /* * arena_run_alloc() failed, but another thread may have made - * sufficient memory available while this one dopped bin->lock above, + * sufficient memory available while this one dropped bin->lock above, * so search one more time. */ mapelm = arena_run_tree_first(&bin->runs); @@ -1146,11 +1208,18 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) assert(bin->runcur->nfree > 0); ret = arena_run_reg_alloc(bin->runcur, bin); if (run != NULL) { - malloc_mutex_unlock(&bin->lock); - malloc_mutex_lock(&arena->lock); - arena_run_dalloc(arena, run, false); - malloc_mutex_unlock(&arena->lock); - malloc_mutex_lock(&bin->lock); + arena_chunk_t *chunk; + + /* + * arena_run_alloc() may have allocated run, or it may + * have pulled it from the bin's run tree. Therefore + * it is unsafe to make any assumptions about how run + * has previously been used, and arena_bin_lower_run() + * must be called, as if a region were just deallocated + * from the run. + */ + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + arena_bin_lower_run(arena, chunk, run, bin); } return (ret); } @@ -1784,7 +1853,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, npages = bin->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); past = (size_t)(((uintptr_t)run->next - (uintptr_t)1U - - (uintptr_t)chunk) >> PAGE_SHIFT) + 1; + (uintptr_t)chunk) >> PAGE_SHIFT) + ZU(1); malloc_mutex_lock(&arena->lock); /* @@ -1799,13 +1868,14 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * last map element first, in case this is a one-page run. 
*/ chunk->map[run_ind+npages-1-map_bias].bits = CHUNK_MAP_LARGE | - (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); + (chunk->map[run_ind+npages-1-map_bias].bits & + CHUNK_MAP_FLAGS_MASK); chunk->map[run_ind-map_bias].bits = bin->run_size | CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT), ((npages - (past - run_ind)) << PAGE_SHIFT), false); - npages = past - run_ind; + /* npages = past - run_ind; */ } #ifdef JEMALLOC_DEBUG run->magic = 0; @@ -1819,32 +1889,10 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, #endif } -void -arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, - arena_chunk_map_t *mapelm) +static void +arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, + arena_bin_t *bin) { - size_t pageind; - arena_run_t *run; - arena_bin_t *bin; -#if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) - size_t size; -#endif - - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); - bin = run->bin; -#if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) - size = bin->reg_size; -#endif - -#ifdef JEMALLOC_FILL - if (opt_junk) - memset(ptr, 0x5a, size); -#endif - - arena_run_reg_dalloc(run, ptr); if (run->nfree == bin->nregs) arena_dalloc_bin_run(arena, chunk, run, bin); @@ -1882,6 +1930,35 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_run_tree_insert(&bin->runs, run_mapelm); } } +} + +void +arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, + arena_chunk_map_t *mapelm) +{ + size_t pageind; + arena_run_t *run; + arena_bin_t *bin; +#if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) + size_t size; +#endif + + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - + (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); + assert(run->magic == ARENA_RUN_MAGIC); + bin = run->bin; +#if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) + size = bin->reg_size; +#endif + +#ifdef JEMALLOC_FILL + if (opt_junk) + memset(ptr, 0x5a, size); +#endif + + arena_run_reg_dalloc(run, ptr); + arena_bin_lower_run(arena, chunk, run, bin); #ifdef JEMALLOC_STATS bin->stats.allocated -= size; @@ -2032,33 +2109,50 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, * following run, then merge the first part with the existing * allocation. */ + size_t flag_dirty; size_t splitsize = (oldsize + followsize <= size + extra) ? followsize : size + extra - oldsize; arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk + ((pageind+npages) << PAGE_SHIFT)), splitsize, true, zero); - chunk->map[pageind-map_bias].bits = size | CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED; - chunk->map[pageind+npages-map_bias].bits = CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED; + size += splitsize; + npages = (size >> PAGE_SHIFT); + + /* + * Mark the extended run as dirty if either portion of the run + * was dirty before allocation. This is rather pedantic, + * because there's not actually any sequence of events that + * could cause the resulting run to be passed to + * arena_run_dalloc() with the dirty argument set to false + * (which is when dirty flag consistency would really matter). 
+ */ + flag_dirty = (chunk->map[pageind-map_bias].bits & + CHUNK_MAP_DIRTY) | + (chunk->map[pageind+npages-1-map_bias].bits & + CHUNK_MAP_DIRTY); + chunk->map[pageind-map_bias].bits = size | flag_dirty + | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; + chunk->map[pageind+npages-1-map_bias].bits = flag_dirty | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; #ifdef JEMALLOC_STATS - arena->stats.ndalloc_large++; - arena->stats.allocated_large -= oldsize; - arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++; - arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--; + arena->stats.ndalloc_large++; + arena->stats.allocated_large -= oldsize; + arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].ndalloc++; + arena->stats.lstats[(oldsize >> PAGE_SHIFT) - 1].curruns--; - arena->stats.nmalloc_large++; - arena->stats.nrequests_large++; - arena->stats.allocated_large += size; - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; - if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) { - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns = - arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns; - } + arena->stats.nmalloc_large++; + arena->stats.nrequests_large++; + arena->stats.allocated_large += size; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nmalloc++; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].nrequests++; + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns++; + if (arena->stats.lstats[(size >> PAGE_SHIFT) - 1].curruns > + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns) { + arena->stats.lstats[(size >> PAGE_SHIFT) - 1].highruns = + arena->stats.lstats[(size >> PAGE_SHIFT) - + 1].curruns; + } #endif malloc_mutex_unlock(&arena->lock); return (false); From 12ca91402bc5d5c5a1cca495957463bb8e71fdcf Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 17 Oct 2010 19:56:09 -0700 Subject: [PATCH 33/48] Add assertions to run coalescing. Assert that the chunk map bits at the ends of the runs that participate in coalescing are self-consistent. --- jemalloc/src/arena.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 36957d97..6a84737e 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -934,19 +934,24 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) CHUNK_MAP_DIRTY) == flag_dirty) { size_t nrun_size = chunk->map[run_ind+run_pages-map_bias].bits & ~PAGE_MASK; + size_t nrun_pages = nrun_size >> PAGE_SHIFT; /* * Remove successor from runs_avail; the coalesced run is * inserted later. 
*/ + assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits + & ~PAGE_MASK) == nrun_size); + assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits + & CHUNK_MAP_ALLOCATED) == 0); + assert((chunk->map[run_ind+run_pages+nrun_pages-1-map_bias].bits + & CHUNK_MAP_DIRTY) == flag_dirty); arena_avail_tree_remove(runs_avail, &chunk->map[run_ind+run_pages-map_bias]); size += nrun_size; - run_pages = size >> PAGE_SHIFT; + run_pages += nrun_pages; - assert((chunk->map[run_ind+run_pages-1-map_bias].bits & - ~PAGE_MASK) == nrun_size); chunk->map[run_ind-map_bias].bits = size | (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); chunk->map[run_ind+run_pages-1-map_bias].bits = size | @@ -960,21 +965,26 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) CHUNK_MAP_DIRTY) == flag_dirty) { size_t prun_size = chunk->map[run_ind-1-map_bias].bits & ~PAGE_MASK; + size_t prun_pages = prun_size >> PAGE_SHIFT; - run_ind -= prun_size >> PAGE_SHIFT; + run_ind -= prun_pages; /* * Remove predecessor from runs_avail; the coalesced run is * inserted later. */ + assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) + == prun_size); + assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_ALLOCATED) + == 0); + assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) + == flag_dirty); arena_avail_tree_remove(runs_avail, &chunk->map[run_ind-map_bias]); size += prun_size; - run_pages = size >> PAGE_SHIFT; + run_pages += prun_pages; - assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) == - prun_size); chunk->map[run_ind-map_bias].bits = size | (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); chunk->map[run_ind+run_pages-1-map_bias].bits = size | From 8de6a0282333ca742fb2870b1e859968d3c1b11f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 17 Oct 2010 20:57:30 -0700 Subject: [PATCH 34/48] Fix arena bugs. Move part of arena_bin_lower_run() into the callers, since the conditions under which it should be called differ slightly between callers. Fix arena_chunk_purge() to omit run size in the last map entry for each run it temporarily allocates. --- jemalloc/src/arena.c | 99 +++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 6a84737e..3c31e988 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -435,6 +435,12 @@ arena_chunk_alloc(arena_t *arena) runs_avail = &arena->runs_avail_clean; else runs_avail = &arena->runs_avail_dirty; + assert((chunk->map[0].bits & ~PAGE_MASK) == arena_maxclass); + assert((chunk->map[chunk_npages-1-map_bias].bits & ~PAGE_MASK) + == arena_maxclass); + assert((chunk->map[0].bits & CHUNK_MAP_DIRTY) == + (chunk->map[chunk_npages-1-map_bias].bits & + CHUNK_MAP_DIRTY)); arena_avail_tree_insert(runs_avail, &chunk->map[0]); } else { bool zero; @@ -683,8 +689,8 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) &arena->runs_avail_dirty, mapelm); mapelm->bits = (npages << PAGE_SHIFT) | - CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | - flag_unzeroed; + flag_unzeroed | CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED; /* * Update internal elements in the page map, so * that CHUNK_MAP_UNZEROED is properly set. 
@@ -696,9 +702,8 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) if (npages > 1) { chunk->map[ pageind+npages-1-map_bias].bits = - (npages << PAGE_SHIFT) | - CHUNK_MAP_LARGE | - CHUNK_MAP_ALLOCATED | flag_unzeroed; + flag_unzeroed | CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED; } arena->nactive += npages; @@ -894,9 +899,16 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) >> PAGE_SHIFT); assert(run_ind >= map_bias); assert(run_ind < chunk_npages); - if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_LARGE) != 0) + if ((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_LARGE) != 0) { size = chunk->map[run_ind-map_bias].bits & ~PAGE_MASK; - else + assert(size == PAGE_SIZE || + (chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & + ~PAGE_MASK) == 0); + assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & + CHUNK_MAP_LARGE) != 0); + assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & + CHUNK_MAP_ALLOCATED) != 0); + } else size = run->bin->run_size; run_pages = (size >> PAGE_SHIFT); arena->nactive -= run_pages; @@ -993,6 +1005,10 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) } /* Insert into runs_avail, now that coalescing is complete. */ + assert((chunk->map[run_ind-map_bias].bits & ~PAGE_MASK) == + (chunk->map[run_ind+run_pages-1-map_bias].bits & ~PAGE_MASK)); + assert((chunk->map[run_ind-map_bias].bits & CHUNK_MAP_DIRTY) == + (chunk->map[run_ind+run_pages-1-map_bias].bits & CHUNK_MAP_DIRTY)); arena_avail_tree_insert(runs_avail, &chunk->map[run_ind-map_bias]); if (dirty) { @@ -1229,7 +1245,10 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) * from the run. */ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - arena_bin_lower_run(arena, chunk, run, bin); + if (run->nfree == bin->nregs) + arena_dalloc_bin_run(arena, chunk, run, bin); + else + arena_bin_lower_run(arena, chunk, run, bin); } return (ret); } @@ -1904,41 +1923,34 @@ arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { - if (run->nfree == bin->nregs) - arena_dalloc_bin_run(arena, chunk, run, bin); - else if (run->nfree == 1 && run != bin->runcur) { - /* - * Make sure that bin->runcur always refers to the lowest - * non-full run, if one exists. - */ - if (bin->runcur == NULL) - bin->runcur = run; - else if ((uintptr_t)run < (uintptr_t)bin->runcur) { - /* Switch runcur. */ - if (bin->runcur->nfree > 0) { - arena_chunk_t *runcur_chunk = - CHUNK_ADDR2BASE(bin->runcur); - size_t runcur_pageind = - (((uintptr_t)bin->runcur - - (uintptr_t)runcur_chunk)) >> PAGE_SHIFT; - arena_chunk_map_t *runcur_mapelm = - &runcur_chunk->map[runcur_pageind-map_bias]; + /* + * Make sure that bin->runcur always refers to the lowest non-full run, + * if one exists. + */ + if (bin->runcur == NULL) + bin->runcur = run; + else if ((uintptr_t)run < (uintptr_t)bin->runcur) { + /* Switch runcur. */ + if (bin->runcur->nfree > 0) { + arena_chunk_t *runcur_chunk = + CHUNK_ADDR2BASE(bin->runcur); + size_t runcur_pageind = (((uintptr_t)bin->runcur - + (uintptr_t)runcur_chunk)) >> PAGE_SHIFT; + arena_chunk_map_t *runcur_mapelm = + &runcur_chunk->map[runcur_pageind-map_bias]; - /* Insert runcur. 
*/ - arena_run_tree_insert(&bin->runs, - runcur_mapelm); - } - bin->runcur = run; - } else { - size_t run_pageind = (((uintptr_t)run - - (uintptr_t)chunk)) >> PAGE_SHIFT; - arena_chunk_map_t *run_mapelm = - &chunk->map[run_pageind-map_bias]; - - assert(arena_run_tree_search(&bin->runs, run_mapelm) == - NULL); - arena_run_tree_insert(&bin->runs, run_mapelm); + /* Insert runcur. */ + arena_run_tree_insert(&bin->runs, runcur_mapelm); } + bin->runcur = run; + } else { + size_t run_pageind = (((uintptr_t)run - + (uintptr_t)chunk)) >> PAGE_SHIFT; + arena_chunk_map_t *run_mapelm = + &chunk->map[run_pageind-map_bias]; + + assert(arena_run_tree_search(&bin->runs, run_mapelm) == NULL); + arena_run_tree_insert(&bin->runs, run_mapelm); } } @@ -1968,7 +1980,10 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, #endif arena_run_reg_dalloc(run, ptr); - arena_bin_lower_run(arena, chunk, run, bin); + if (run->nfree == bin->nregs) + arena_dalloc_bin_run(arena, chunk, run, bin); + else if (run->nfree == 1 && run != bin->runcur) + arena_bin_lower_run(arena, chunk, run, bin); #ifdef JEMALLOC_STATS bin->stats.allocated -= size; From 088e6a0a37e197c8db16e3d4c518a5eb762e7657 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 18 Oct 2010 00:04:44 -0700 Subject: [PATCH 35/48] Fix arena bugs. Split arena_dissociate_bin_run() out of arena_dalloc_bin_run(), so that arena_bin_malloc_hard() can avoid dissociation when recovering from losing a race. This fixes a bug introduced by a recent attempted fix. Fix a regression in arena_ralloc_large_grow() that was introduced by recent fixes. --- jemalloc/src/arena.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 3c31e988..be26160a 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -174,6 +174,8 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); +static void arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, + arena_bin_t *bin); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, @@ -1856,10 +1858,9 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) #endif static void -arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, +arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { - size_t npages, run_ind, past; /* Dissociate run from bin. 
*/ if (run == bin->runcur) @@ -1876,6 +1877,17 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, */ arena_run_tree_remove(&bin->runs, run_mapelm); } +} + +static void +arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, + arena_bin_t *bin) +{ + size_t npages, run_ind, past; + + assert(run != bin->runcur); + assert(arena_run_tree_search(&bin->runs, &chunk->map[ + (((uintptr_t)run-(uintptr_t)chunk)>>PAGE_SHIFT)-map_bias]) == NULL); malloc_mutex_unlock(&bin->lock); /******************************/ @@ -1980,9 +1992,10 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, #endif arena_run_reg_dalloc(run, ptr); - if (run->nfree == bin->nregs) + if (run->nfree == bin->nregs) { + arena_dissociate_bin_run(chunk, run, bin); arena_dalloc_bin_run(arena, chunk, run, bin); - else if (run->nfree == 1 && run != bin->runcur) + } else if (run->nfree == 1 && run != bin->runcur) arena_bin_lower_run(arena, chunk, run, bin); #ifdef JEMALLOC_STATS @@ -2140,8 +2153,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk + ((pageind+npages) << PAGE_SHIFT)), splitsize, true, zero); - size += splitsize; - npages = (size >> PAGE_SHIFT); + size = oldsize + splitsize; + npages = size >> PAGE_SHIFT; /* * Mark the extended run as dirty if either portion of the run From 21fb95bba6ea922e0523f269c0d9a32640047a29 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 18 Oct 2010 17:45:40 -0700 Subject: [PATCH 36/48] Fix a bug in arena_dalloc_bin_run(). Fix the newsize argument to arena_run_trim_tail() that arena_dalloc_bin_run() passes. Previously, oldsize-newsize (i.e. the complement) was passed, which could erroneously cause dirty pages to be returned to the clean available runs tree. Prior to the CHUNK_MAP_ZEROED --> CHUNK_MAP_UNZEROED conversion, this bug merely caused dirty pages to be unaccounted for (and therefore never get purged), but with CHUNK_MAP_UNZEROED, this could cause dirty pages to be treated as zeroed (i.e. memory corruption). --- jemalloc/src/arena.c | 66 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index be26160a..52f3d661 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -283,12 +283,33 @@ arena_run_reg_dalloc(arena_run_t *run, void *ptr) assert(((uintptr_t)ptr - ((uintptr_t)run + (uintptr_t)run->bin->reg0_offset)) % (uintptr_t)run->bin->reg_size == 0); + /* + * Freeing a pointer lower than region zero can cause assertion + * failure. + */ + assert((uintptr_t)ptr >= (uintptr_t)run + + (uintptr_t)run->bin->reg0_offset); + /* + * Freeing a pointer in the run's wilderness can cause assertion + * failure. 
+ */ + assert((uintptr_t)ptr < (uintptr_t)run->next); *(void **)ptr = run->avail; run->avail = ptr; run->nfree++; } +#ifdef JEMALLOC_DEBUG +static inline void +arena_chunk_validate_zeroed(arena_chunk_t *chunk, size_t run_ind) +{ + size_t *p = (size_t *)((uintptr_t)chunk + (run_ind << PAGE_SHIFT)); + for (size_t i = 0; i < PAGE_SIZE / sizeof(size_t); i++) + assert(p[i] == 0); +} +#endif + static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, bool zero) @@ -359,20 +380,14 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, if ((chunk->map[run_ind+i-map_bias].bits & CHUNK_MAP_UNZEROED) != 0) { memset((void *)((uintptr_t) - chunk + ((run_ind + i) << + chunk + ((run_ind+i) << PAGE_SHIFT)), 0, PAGE_SIZE); } #ifdef JEMALLOC_DEBUG else { - size_t *p = (size_t *) - ((uintptr_t)chunk + - ((run_ind + i) << - PAGE_SHIFT)); - for (size_t j = 0; j < - PAGE_SIZE / sizeof(size_t); - j++) - assert(p[j] == 0); + arena_chunk_validate_zeroed( + chunk, run_ind+i); } #endif } @@ -408,15 +423,40 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, chunk->map[run_ind-map_bias].bits = (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED | flag_dirty; +#ifdef JEMALLOC_DEBUG + /* + * The first page will always be dirtied during small run + * initialization, so a validation failure here would not + * actually cause an observable failure. + */ + if (flag_dirty == 0 && + (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_UNZEROED) + == 0) + arena_chunk_validate_zeroed(chunk, run_ind); +#endif for (i = 1; i < need_pages - 1; i++) { chunk->map[run_ind+i-map_bias].bits = (i << PAGE_SHIFT) | (chunk->map[run_ind+i-map_bias].bits & CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED; +#ifdef JEMALLOC_DEBUG + if (flag_dirty == 0 && + (chunk->map[run_ind+i-map_bias].bits & + CHUNK_MAP_UNZEROED) == 0) + arena_chunk_validate_zeroed(chunk, run_ind+i); +#endif } chunk->map[run_ind+need_pages-1-map_bias].bits = ((need_pages - 1) << PAGE_SHIFT) | (chunk->map[run_ind+need_pages-1-map_bias].bits & CHUNK_MAP_UNZEROED) | CHUNK_MAP_ALLOCATED | flag_dirty; +#ifdef JEMALLOC_DEBUG + if (flag_dirty == 0 && + (chunk->map[run_ind+need_pages-1-map_bias].bits & + CHUNK_MAP_UNZEROED) == 0) { + arena_chunk_validate_zeroed(chunk, + run_ind+need_pages-1); + } +#endif } } @@ -1170,7 +1210,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) /* Initialize run internals. 
*/ run->bin = bin; run->avail = NULL; - run->next = (void *)(((uintptr_t)run) + + run->next = (void *)((uintptr_t)run + (uintptr_t)bin->reg0_offset); run->nfree = bin->nregs; #ifdef JEMALLOC_DEBUG @@ -1893,8 +1933,8 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, /******************************/ npages = bin->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - past = (size_t)(((uintptr_t)run->next - (uintptr_t)1U - - (uintptr_t)chunk) >> PAGE_SHIFT) + ZU(1); + past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) + >> PAGE_SHIFT); malloc_mutex_lock(&arena->lock); /* @@ -1915,7 +1955,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT), - ((npages - (past - run_ind)) << PAGE_SHIFT), false); + ((past - run_ind) << PAGE_SHIFT), false); /* npages = past - run_ind; */ } #ifdef JEMALLOC_DEBUG From 93443689a4367cc6fe3de1c9e918adc13d8f9100 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 20 Oct 2010 17:39:18 -0700 Subject: [PATCH 37/48] Add per thread allocation counters, and enhance heap sampling. Add the "thread.allocated" and "thread.deallocated" mallctls, which can be used to query the total number of bytes ever allocated/deallocated by the calling thread. Add s2u() and sa2u(), which can be used to compute the usable size that will result from an allocation request of a particular size/alignment. Re-factor ipalloc() to use sa2u(). Enhance the heap profiler to trigger samples based on usable size, rather than request size. This has a subtle, but important, impact on the accuracy of heap sampling. For example, previous to this change, 16- and 17-byte objects were sampled at nearly the same rate, but 17-byte objects actually consume 32 bytes each. Therefore it was possible for the sample to be somewhat skewed compared to actual memory usage of the allocated objects. --- jemalloc/Makefile.in | 3 +- jemalloc/doc/jemalloc.3.in | 15 + .../jemalloc/internal/jemalloc_internal.h.in | 258 ++++++++++++------ jemalloc/include/jemalloc/internal/prof.h | 6 +- jemalloc/src/arena.c | 3 +- jemalloc/src/ctl.c | 14 + jemalloc/src/jemalloc.c | 250 ++++++++++++++--- jemalloc/src/prof.c | 62 +++-- jemalloc/test/allocated.c | 105 +++++++ jemalloc/test/allocated.exp | 2 + 10 files changed, 563 insertions(+), 155 deletions(-) create mode 100644 jemalloc/test/allocated.c create mode 100644 jemalloc/test/allocated.exp diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 7863c1b7..ca807fda 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -58,7 +58,8 @@ DSOS := @objroot@lib/libjemalloc@install_suffix@.$(SO).$(REV) \ @objroot@lib/libjemalloc@install_suffix@.$(SO) \ @objroot@lib/libjemalloc@install_suffix@_pic.a MAN3 := @objroot@doc/jemalloc@install_suffix@.3 -CTESTS := @srcroot@test/allocm.c @srcroot@test/posix_memalign.c \ +CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \ + @srcroot@test/posix_memalign.c \ @srcroot@test/rallocm.c @srcroot@test/thread_arena.c .PHONY: all dist install check clean distclean relclean diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index c8d2f29c..5202a2bf 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -888,6 +888,21 @@ mallctl), it will be automatically initialized as a side effect of calling this interface. 
.Ed .\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "thread.allocated (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Get the total number of bytes ever allocated by the calling thread. +@roff_stats@This counter has the potential to wrap around; it is up to the +@roff_stats@application to appropriately interpret the counter in such cases. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "thread.deallocated (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Get the total number of bytes ever deallocated by the calling +@roff_stats@thread. +@roff_stats@This counter has the potential to wrap around; it is up to the +@roff_stats@application to appropriately interpret the counter in such cases. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- .It Sy "config.debug (bool) r-" .Bd -ragged -offset indent -compact --enable-debug was specified during build configuration. diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 6ad7b064..eb609624 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -291,6 +291,50 @@ extern pthread_key_t arenas_tsd; extern arena_t **arenas; extern unsigned narenas; +#ifdef JEMALLOC_STATS +typedef struct { + uint64_t allocated; + uint64_t deallocated; +} thread_allocated_t; +# ifndef NO_TLS +extern __thread thread_allocated_t thread_allocated_tls; +# define ALLOCATED_GET() thread_allocated_tls.allocated +# define DEALLOCATED_GET() thread_allocated_tls.deallocated +# define ALLOCATED_ADD(a, d) do { \ + thread_allocated_tls.allocated += a; \ + thread_allocated_tls.deallocated += d; \ +} while (0) +# else +extern pthread_key_t thread_allocated_tsd; +# define ALLOCATED_GET() \ + (uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \ + ? ((thread_allocated_t *) \ + pthread_getspecific(thread_allocated_tsd))->allocated : 0) +# define DEALLOCATED_GET() \ + (uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \ + ? ((thread_allocated_t \ + *)pthread_getspecific(thread_allocated_tsd))->deallocated : \ + 0) +# define ALLOCATED_ADD(a, d) do { \ + thread_allocated_t *thread_allocated = (thread_allocated_t *) \ + pthread_getspecific(thread_allocated_tsd); \ + if (thread_allocated != NULL) { \ + thread_allocated->allocated += (a); \ + thread_allocated->deallocated += (d); \ + } else { \ + thread_allocated = (thread_allocated_t *) \ + imalloc(sizeof(thread_allocated_t)); \ + if (thread_allocated != NULL) { \ + pthread_setspecific(thread_allocated_tsd, \ + thread_allocated); \ + thread_allocated->allocated = (a); \ + thread_allocated->deallocated = (d); \ + } \ + } \ +} while (0) +# endif +#endif + arena_t *arenas_extend(unsigned ind); arena_t *choose_arena_hard(void); int buferror(int errnum, char *buf, size_t buflen); @@ -333,6 +377,8 @@ void jemalloc_postfork(void); #ifndef JEMALLOC_ENABLE_INLINE size_t pow2_ceil(size_t x); +size_t s2u(size_t size); +size_t sa2u(size_t size, size_t alignment, size_t *run_size_p); void malloc_write(const char *s); arena_t *choose_arena(void); #endif @@ -356,6 +402,117 @@ pow2_ceil(size_t x) return (x); } +/* + * Compute usable size that would result from allocating an object with the + * specified size. 
+ */ +JEMALLOC_INLINE size_t +s2u(size_t size) +{ + + if (size <= small_maxclass) + return arenas[0]->bins[small_size2bin[size]].reg_size; + if (size <= arena_maxclass) + return PAGE_CEILING(size); + return CHUNK_CEILING(size); +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size and alignment. + */ +JEMALLOC_INLINE size_t +sa2u(size_t size, size_t alignment, size_t *run_size_p) +{ + size_t usize; + + /* + * Round size up to the nearest multiple of alignment. + * + * This done, we can take advantage of the fact that for each small + * size class, every object is aligned at the smallest power of two + * that is non-zero in the base two representation of the size. For + * example: + * + * Size | Base 2 | Minimum alignment + * -----+----------+------------------ + * 96 | 1100000 | 32 + * 144 | 10100000 | 32 + * 192 | 11000000 | 64 + * + * Depending on runtime settings, it is possible that arena_malloc() + * will further round up to a power of two, but that never causes + * correctness issues. + */ + usize = (size + (alignment - 1)) & (-alignment); + /* + * (usize < size) protects against the combination of maximal + * alignment and size greater than maximal alignment. + */ + if (usize < size) { + /* size_t overflow. */ + return (0); + } + + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { + if (usize <= small_maxclass) { + return + (arenas[0]->bins[small_size2bin[usize]].reg_size); + } + return (PAGE_CEILING(usize)); + } else { + size_t run_size; + + /* + * We can't achieve subpage alignment, so round up alignment + * permanently; it makes later calculations simpler. + */ + alignment = PAGE_CEILING(alignment); + usize = PAGE_CEILING(size); + /* + * (usize < size) protects against very large sizes within + * PAGE_SIZE of SIZE_T_MAX. + * + * (usize + alignment < usize) protects against the + * combination of maximal alignment and usize large enough + * to cause overflow. This is similar to the first overflow + * check above, but it needs to be repeated due to the new + * usize value, which may now be *equal* to maximal + * alignment, whereas before we only detected overflow if the + * original size was *greater* than maximal alignment. + */ + if (usize < size || usize + alignment < usize) { + /* size_t overflow. */ + return (0); + } + + /* + * Calculate the size of the over-size run that arena_palloc() + * would need to allocate in order to guarantee the alignment. + */ + if (usize >= alignment) + run_size = usize + alignment - PAGE_SIZE; + else { + /* + * It is possible that (alignment << 1) will cause + * overflow, but it doesn't matter because we also + * subtract PAGE_SIZE, which in the case of overflow + * leaves us with a very large run_size. That causes + * the first conditional below to fail, which means + * that the bogus run_size value never gets used for + * anything important. + */ + run_size = (alignment << 1) - PAGE_SIZE; + } + if (run_size_p != NULL) + *run_size_p = run_size; + + if (run_size <= arena_maxclass) + return (PAGE_CEILING(usize)); + return (CHUNK_CEILING(usize)); + } +} + /* * Wrapper around malloc_message() that avoids the need for * JEMALLOC_P(malloc_message)(...) throughout the code. @@ -435,92 +592,25 @@ JEMALLOC_INLINE void * ipalloc(size_t size, size_t alignment, bool zero) { void *ret; - size_t ceil_size; + size_t usize; + size_t run_size +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; - /* - * Round size up to the nearest multiple of alignment. 
- * - * This done, we can take advantage of the fact that for each small - * size class, every object is aligned at the smallest power of two - * that is non-zero in the base two representation of the size. For - * example: - * - * Size | Base 2 | Minimum alignment - * -----+----------+------------------ - * 96 | 1100000 | 32 - * 144 | 10100000 | 32 - * 192 | 11000000 | 64 - * - * Depending on runtime settings, it is possible that arena_malloc() - * will further round up to a power of two, but that never causes - * correctness issues. - */ - ceil_size = (size + (alignment - 1)) & (-alignment); - /* - * (ceil_size < size) protects against the combination of maximal - * alignment and size greater than maximal alignment. - */ - if (ceil_size < size) { - /* size_t overflow. */ + usize = sa2u(size, alignment, &run_size); + if (usize == 0) return (NULL); - } - - if (ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE - && ceil_size <= arena_maxclass)) - ret = arena_malloc(ceil_size, zero); - else { - size_t run_size; - - /* - * We can't achieve subpage alignment, so round up alignment - * permanently; it makes later calculations simpler. - */ - alignment = PAGE_CEILING(alignment); - ceil_size = PAGE_CEILING(size); - /* - * (ceil_size < size) protects against very large sizes within - * PAGE_SIZE of SIZE_T_MAX. - * - * (ceil_size + alignment < ceil_size) protects against the - * combination of maximal alignment and ceil_size large enough - * to cause overflow. This is similar to the first overflow - * check above, but it needs to be repeated due to the new - * ceil_size value, which may now be *equal* to maximal - * alignment, whereas before we only detected overflow if the - * original size was *greater* than maximal alignment. - */ - if (ceil_size < size || ceil_size + alignment < ceil_size) { - /* size_t overflow. */ - return (NULL); - } - - /* - * Calculate the size of the over-size run that arena_palloc() - * would need to allocate in order to guarantee the alignment. - */ - if (ceil_size >= alignment) - run_size = ceil_size + alignment - PAGE_SIZE; - else { - /* - * It is possible that (alignment << 1) will cause - * overflow, but it doesn't matter because we also - * subtract PAGE_SIZE, which in the case of overflow - * leaves us with a very large run_size. That causes - * the first conditional below to fail, which means - * that the bogus run_size value never gets used for - * anything important. 
- */ - run_size = (alignment << 1) - PAGE_SIZE; - } - - if (run_size <= arena_maxclass) { - ret = arena_palloc(choose_arena(), ceil_size, run_size, - alignment, zero); - } else if (alignment <= chunksize) - ret = huge_malloc(ceil_size, zero); - else - ret = huge_palloc(ceil_size, alignment, zero); - } + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) + ret = arena_malloc(usize, zero); + else if (run_size <= arena_maxclass) { + ret = arena_palloc(choose_arena(), usize, run_size, alignment, + zero); + } else if (alignment <= chunksize) + ret = huge_malloc(usize, zero); + else + ret = huge_palloc(usize, alignment, zero); assert(((uintptr_t)ret & (alignment - 1)) == 0); return (ret); diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 1aa85bb5..0d139da7 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -179,9 +179,9 @@ extern bool prof_promote; prof_thr_cnt_t *prof_alloc_prep(size_t size); prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, - size_t old_size, prof_ctx_t *old_ctx); +void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); +void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx); void prof_free(const void *ptr); void prof_idump(void); bool prof_mdump(const char *filename); diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 52f3d661..a54a0905 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -1613,7 +1613,8 @@ arena_palloc(arena_t *arena, size_t size, size_t alloc_size, size_t alignment, arena_chunk_t *chunk; assert((size & PAGE_MASK) == 0); - assert((alignment & PAGE_MASK) == 0); + + alignment = PAGE_CEILING(alignment); malloc_mutex_lock(&arena->lock); ret = (void *)arena_run_alloc(arena, alloc_size, true, zero); diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index edbbb209..dbc5cd42 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -42,6 +42,10 @@ CTL_PROTO(epoch) CTL_PROTO(tcache_flush) #endif CTL_PROTO(thread_arena) +#ifdef JEMALLOC_STATS +CTL_PROTO(thread_allocated) +CTL_PROTO(thread_deallocated) +#endif CTL_PROTO(config_debug) CTL_PROTO(config_dss) CTL_PROTO(config_dynamic_page_shift) @@ -216,6 +220,11 @@ static const ctl_node_t tcache_node[] = { static const ctl_node_t thread_node[] = { {NAME("arena"), CTL(thread_arena)} +#ifdef JEMALLOC_STATS + , + {NAME("allocated"), CTL(thread_allocated)}, + {NAME("deallocated"), CTL(thread_deallocated)} +#endif }; static const ctl_node_t config_node[] = { @@ -1092,6 +1101,11 @@ RETURN: return (ret); } +#ifdef JEMALLOC_STATS +CTL_RO_GEN(thread_allocated, ALLOCATED_GET(), uint64_t); +CTL_RO_GEN(thread_deallocated, DEALLOCATED_GET(), uint64_t); +#endif + /******************************************************************************/ #ifdef JEMALLOC_DEBUG diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 6f9ec762..f3cba15a 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -15,14 +15,22 @@ __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); pthread_key_t arenas_tsd; #endif +#ifdef JEMALLOC_STATS +# ifndef NO_TLS +__thread thread_allocated_t thread_allocated_tls; +# else +pthread_key_t thread_allocated_tsd; +# endif +#endif + /* Set to true once the allocator has been initialized. 
*/ -static bool malloc_initialized = false; +static bool malloc_initialized = false; /* Used to let the initializing thread recursively allocate. */ -static pthread_t malloc_initializer = (unsigned long)0; +static pthread_t malloc_initializer = (unsigned long)0; /* Used to avoid initialization races. */ -static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; +static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; #ifdef DYNAMIC_PAGE_SHIFT size_t pagesize; @@ -63,6 +71,9 @@ static int opt_narenas_lshift = 0; static void wrtmessage(void *cbopaque, const char *s); static void stats_print_atexit(void); static unsigned malloc_ncpus(void); +#if (defined(JEMALLOC_STATS) && defined(NO_TLS)) +static void thread_allocated_cleanup(void *arg); +#endif static bool malloc_init_hard(void); /******************************************************************************/ @@ -222,6 +233,17 @@ malloc_ncpus(void) return (ret); } +#if (defined(JEMALLOC_STATS) && defined(NO_TLS)) +static void +thread_allocated_cleanup(void *arg) +{ + uint64_t *allocated = (uint64_t *)arg; + + if (allocated != NULL) + idalloc(allocated); +} +#endif + /* * FreeBSD's pthreads implementation calls malloc(3), so the malloc * implementation has to take pains to avoid infinite recursion during @@ -633,6 +655,15 @@ MALLOC_OUT: return (true); } +#if (defined(JEMALLOC_STATS) && defined(NO_TLS)) + /* Initialize allocation counters before any allocations can occur. */ + if (pthread_key_create(&thread_allocated_tsd, thread_allocated_cleanup) + != 0) { + malloc_mutex_unlock(&init_lock); + return (true); + } +#endif + /* * Create enough scaffolding to allow recursive allocation in * malloc_ncpus(). @@ -766,6 +797,13 @@ void * JEMALLOC_P(malloc)(size_t size) { void *ret; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; +#endif #ifdef JEMALLOC_PROF prof_thr_cnt_t *cnt # ifdef JEMALLOC_CC_SILENCE @@ -801,20 +839,26 @@ JEMALLOC_P(malloc)(size_t size) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) { + usize = s2u(size); + if ((cnt = prof_alloc_prep(usize)) == NULL) { ret = NULL; goto OOM; } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { ret = imalloc(small_maxclass+1); if (ret != NULL) - arena_prof_promoted(ret, size); + arena_prof_promoted(ret, usize); } else ret = imalloc(size); } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(size); #endif ret = imalloc(size); + } OOM: if (ret == NULL) { @@ -833,7 +877,13 @@ RETURN: #endif #ifdef JEMALLOC_PROF if (opt_prof && ret != NULL) - prof_malloc(ret, cnt); + prof_malloc(ret, usize, cnt); +#endif +#ifdef JEMALLOC_STATS + if (ret != NULL) { + assert(usize == isalloc(ret)); + ALLOCATED_ADD(usize, 0); + } #endif return (ret); } @@ -845,6 +895,13 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) { int ret; void *result; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; +#endif #ifdef JEMALLOC_PROF prof_thr_cnt_t *cnt # ifdef JEMALLOC_CC_SILENCE @@ -896,17 +953,18 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) { + usize = sa2u(size, alignment, NULL); + if ((cnt = prof_alloc_prep(usize)) == NULL) { result = NULL; ret = EINVAL; } else { if (prof_promote && (uintptr_t)cnt != - 
(uintptr_t)1U && size <= small_maxclass) { + (uintptr_t)1U && usize <= small_maxclass) { result = ipalloc(small_maxclass+1, alignment, false); if (result != NULL) { arena_prof_promoted(result, - size); + usize); } } else { result = ipalloc(size, alignment, @@ -914,8 +972,13 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) } } } else +#endif + { +#ifdef JEMALLOC_STATS + usize = sa2u(size, alignment, NULL); #endif result = ipalloc(size, alignment, false); + } } if (result == NULL) { @@ -934,9 +997,15 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) ret = 0; RETURN: +#ifdef JEMALLOC_STATS + if (result != NULL) { + assert(usize == isalloc(result)); + ALLOCATED_ADD(usize, 0); + } +#endif #ifdef JEMALLOC_PROF if (opt_prof && result != NULL) - prof_malloc(result, cnt); + prof_malloc(result, usize, cnt); #endif return (ret); } @@ -948,6 +1017,13 @@ JEMALLOC_P(calloc)(size_t num, size_t size) { void *ret; size_t num_size; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; +#endif #ifdef JEMALLOC_PROF prof_thr_cnt_t *cnt # ifdef JEMALLOC_CC_SILENCE @@ -988,20 +1064,26 @@ JEMALLOC_P(calloc)(size_t num, size_t size) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(num_size)) == NULL) { + usize = s2u(num_size); + if ((cnt = prof_alloc_prep(usize)) == NULL) { ret = NULL; goto RETURN; } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && num_size + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { ret = icalloc(small_maxclass+1); if (ret != NULL) - arena_prof_promoted(ret, num_size); + arena_prof_promoted(ret, usize); } else ret = icalloc(num_size); } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(num_size); #endif ret = icalloc(num_size); + } RETURN: if (ret == NULL) { @@ -1017,7 +1099,13 @@ RETURN: #ifdef JEMALLOC_PROF if (opt_prof && ret != NULL) - prof_malloc(ret, cnt); + prof_malloc(ret, usize, cnt); +#endif +#ifdef JEMALLOC_STATS + if (ret != NULL) { + assert(usize == isalloc(ret)); + ALLOCATED_ADD(usize, 0); + } #endif return (ret); } @@ -1027,12 +1115,15 @@ void * JEMALLOC_P(realloc)(void *ptr, size_t size) { void *ret; -#ifdef JEMALLOC_PROF - size_t old_size +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize # ifdef JEMALLOC_CC_SILENCE = 0 # endif ; + size_t old_size = 0; +#endif +#ifdef JEMALLOC_PROF prof_thr_cnt_t *cnt # ifdef JEMALLOC_CC_SILENCE = NULL @@ -1053,9 +1144,11 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) #ifdef JEMALLOC_SYSV else { if (ptr != NULL) { +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + old_size = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF if (opt_prof) { - old_size = isalloc(ptr); old_ctx = prof_ctx_get(ptr); cnt = NULL; } @@ -1064,7 +1157,6 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) } #ifdef JEMALLOC_PROF else if (opt_prof) { - old_size = 0; old_ctx = NULL; cnt = NULL; } @@ -1079,25 +1171,33 @@ JEMALLOC_P(realloc)(void *ptr, size_t size) assert(malloc_initialized || malloc_initializer == pthread_self()); +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + old_size = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF if (opt_prof) { - old_size = isalloc(ptr); + usize = s2u(size); old_ctx = prof_ctx_get(ptr); - if ((cnt = prof_alloc_prep(size)) == NULL) { + if ((cnt = prof_alloc_prep(usize)) == NULL) { ret = NULL; goto OOM; } if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && - size <= small_maxclass) { + usize <= small_maxclass) { ret = 
iralloc(ptr, small_maxclass+1, 0, 0, false, false); if (ret != NULL) - arena_prof_promoted(ret, size); + arena_prof_promoted(ret, usize); } else ret = iralloc(ptr, size, 0, 0, false, false); } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(size); #endif ret = iralloc(ptr, size, 0, 0, false, false); + } #ifdef JEMALLOC_PROF OOM: @@ -1114,10 +1214,8 @@ OOM: } } else { #ifdef JEMALLOC_PROF - if (opt_prof) { - old_size = 0; + if (opt_prof) old_ctx = NULL; - } #endif if (malloc_init()) { #ifdef JEMALLOC_PROF @@ -1128,23 +1226,29 @@ OOM: } else { #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) + usize = s2u(size); + if ((cnt = prof_alloc_prep(usize)) == NULL) ret = NULL; else { if (prof_promote && (uintptr_t)cnt != - (uintptr_t)1U && size <= + (uintptr_t)1U && usize <= small_maxclass) { ret = imalloc(small_maxclass+1); if (ret != NULL) { arena_prof_promoted(ret, - size); + usize); } } else ret = imalloc(size); } } else +#endif + { +#ifdef JEMALLOC_STATS + usize = s2u(size); #endif ret = imalloc(size); + } } if (ret == NULL) { @@ -1164,7 +1268,13 @@ RETURN: #endif #ifdef JEMALLOC_PROF if (opt_prof) - prof_realloc(ret, cnt, ptr, old_size, old_ctx); + prof_realloc(ret, usize, cnt, ptr, old_size, old_ctx); +#endif +#ifdef JEMALLOC_STATS + if (ret != NULL) { + assert(usize == isalloc(ret)); + ALLOCATED_ADD(usize, old_size); + } #endif return (ret); } @@ -1181,6 +1291,9 @@ JEMALLOC_P(free)(void *ptr) #ifdef JEMALLOC_PROF if (opt_prof) prof_free(ptr); +#endif +#ifdef JEMALLOC_STATS + ALLOCATED_ADD(0, isalloc(ptr)); #endif idalloc(ptr); } @@ -1325,6 +1438,7 @@ int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) { void *p; + size_t usize; size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) & (SIZE_T_MAX-1)); bool zero = flags & ALLOCM_ZERO; @@ -1340,30 +1454,48 @@ JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) #ifdef JEMALLOC_PROF if (opt_prof) { - if ((cnt = prof_alloc_prep(size)) == NULL) + usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, + NULL); + if ((cnt = prof_alloc_prep(usize)) == NULL) goto OOM; - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { p = iallocm(small_maxclass+1, alignment, zero); if (p == NULL) goto OOM; - arena_prof_promoted(p, size); + arena_prof_promoted(p, usize); } else { p = iallocm(size, alignment, zero); if (p == NULL) goto OOM; } + + if (rsize != NULL) + *rsize = usize; } else #endif { p = iallocm(size, alignment, zero); if (p == NULL) goto OOM; +#ifndef JEMALLOC_STATS + if (rsize != NULL) +#endif + { + usize = (alignment == 0) ? 
s2u(size) : sa2u(size, + alignment, NULL); +#ifdef JEMALLOC_STATS + if (rsize != NULL) +#endif + *rsize = usize; + } } *ptr = p; - if (rsize != NULL) - *rsize = isalloc(p); +#ifdef JEMALLOC_STATS + assert(usize == isalloc(p)); + ALLOCATED_ADD(usize, 0); +#endif return (ALLOCM_SUCCESS); OOM: #ifdef JEMALLOC_XMALLOC @@ -1384,12 +1516,15 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) { void *p, *q; + size_t usize; +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t old_size; +#endif size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) & (SIZE_T_MAX-1)); bool zero = flags & ALLOCM_ZERO; bool no_move = flags & ALLOCM_NO_MOVE; #ifdef JEMALLOC_PROF - size_t old_size; prof_thr_cnt_t *cnt; prof_ctx_t *old_ctx; #endif @@ -1403,36 +1538,60 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, p = *ptr; #ifdef JEMALLOC_PROF if (opt_prof) { + /* + * usize isn't knowable before iralloc() returns when extra is + * non-zero. Therefore, compute its maximum possible value and + * use that in prof_alloc_prep() to decide whether to capture a + * backtrace. prof_realloc() will use the actual usize to + * decide whether to sample. + */ + size_t max_usize = (alignment == 0) ? s2u(size+extra) : + sa2u(size+extra, alignment, NULL); old_size = isalloc(p); old_ctx = prof_ctx_get(p); - if ((cnt = prof_alloc_prep(size)) == NULL) + if ((cnt = prof_alloc_prep(max_usize)) == NULL) goto OOM; - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <= - small_maxclass) { + if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && max_usize + <= small_maxclass) { q = iralloc(p, small_maxclass+1, (small_maxclass+1 >= size+extra) ? 0 : size+extra - (small_maxclass+1), alignment, zero, no_move); if (q == NULL) goto ERR; - arena_prof_promoted(q, size); + usize = isalloc(q); + arena_prof_promoted(q, usize); } else { q = iralloc(p, size, extra, alignment, zero, no_move); if (q == NULL) goto ERR; + usize = isalloc(q); } - prof_realloc(q, cnt, p, old_size, old_ctx); + prof_realloc(q, usize, cnt, p, old_size, old_ctx); } else #endif { +#ifdef JEMALLOC_STATS + old_size = isalloc(p); +#endif q = iralloc(p, size, extra, alignment, zero, no_move); if (q == NULL) goto ERR; +#ifndef JEMALLOC_STATS + if (rsize != NULL) +#endif + { + usize = isalloc(q); +#ifdef JEMALLOC_STATS + if (rsize != NULL) +#endif + *rsize = usize; + } } *ptr = q; - if (rsize != NULL) - *rsize = isalloc(q); - +#ifdef JEMALLOC_STATS + ALLOCATED_ADD(usize, old_size); +#endif return (ALLOCM_SUCCESS); ERR: if (no_move) @@ -1483,6 +1642,9 @@ JEMALLOC_P(dallocm)(void *ptr, int flags) #ifdef JEMALLOC_PROF if (opt_prof) prof_free(ptr); +#endif +#ifdef JEMALLOC_STATS + ALLOCATED_ADD(0, isalloc(ptr)); #endif idalloc(ptr); diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index e715da9a..583a6e91 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -47,7 +47,8 @@ static __thread prof_tcache_t *prof_tcache_tls pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) #else -# define PROF_TCACHE_GET() ((ckh_t *)pthread_getspecific(prof_tcache_tsd)) +# define PROF_TCACHE_GET() \ + ((prof_tcache_t *)pthread_getspecific(prof_tcache_tsd)) # define PROF_TCACHE_SET(v) do { \ pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) @@ -69,7 +70,7 @@ static __thread void **vec_tls pthread_setspecific(vec_tsd, (void *)(v)); \ } while (0) #else -# define VEC_GET() ((ckh_t *)pthread_getspecific(vec_tsd)) +# define VEC_GET() ((void **)pthread_getspecific(vec_tsd)) # define 
VEC_SET(v) do { \ pthread_setspecific(vec_tsd, (void *)(v)); \ } while (0) @@ -106,7 +107,8 @@ prof_sample_state_t prof_sample_state_oom; r = (prof_sample_state_t *)pthread_getspecific( \ prof_sample_state_tsd); \ if (r == NULL) { \ - r = ipalloc(sizeof(prof_sample_state_t), CACHELINE); \ + r = ipalloc(sizeof(prof_sample_state_t), CACHELINE, \ + false); \ if (r == NULL) { \ malloc_write(": Error in heap " \ "profiler: out of memory; subsequent heap " \ @@ -658,6 +660,8 @@ prof_alloc_prep(size_t size) void **vec; prof_bt_t bt; + assert(size == s2u(size)); + vec = VEC_GET(); if (vec == NULL) { vec = imalloc(sizeof(void *) * prof_bt_max); @@ -750,7 +754,7 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) huge_prof_ctx_set(ptr, ctx); } -static inline void +static inline bool prof_sample_accum_update(size_t size) { prof_sample_state_t *prof_sample_state; @@ -771,22 +775,33 @@ prof_sample_accum_update(size_t size) prof_sample_state->threshold; prof_sample_threshold_update(); } - } else + return (false); + } else { prof_sample_state->accum += size; + return (true); + } } void -prof_malloc(const void *ptr, prof_thr_cnt_t *cnt) +prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) { - size_t size; assert(ptr != NULL); + assert(size == s2u(size)); if (opt_lg_prof_sample != 0) { - size = isalloc(ptr); - prof_sample_accum_update(size); - } else if ((uintptr_t)cnt > (uintptr_t)1U) - size = isalloc(ptr); + if (prof_sample_accum_update(size)) { + /* + * Don't sample. For malloc()-like allocation, it is + * always possible to tell in advance how large an + * object's usable size will be, so there should never + * be a difference between the size passed to + * prof_alloc_prep() and prof_malloc(). + */ + assert(false); + return; + } + } if ((uintptr_t)cnt > (uintptr_t)1U) { prof_ctx_set(ptr, cnt->ctx); @@ -813,24 +828,27 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt) } void -prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, - size_t old_size, prof_ctx_t *old_ctx) +prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) { - size_t size -#ifdef JEMALLOC_CC_SILENCE - = 0 -#endif - ; prof_thr_cnt_t *told_cnt; assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); if (ptr != NULL) { if (opt_lg_prof_sample != 0) { - size = isalloc(ptr); - prof_sample_accum_update(size); - } else if ((uintptr_t)cnt > (uintptr_t)1U) - size = isalloc(ptr); + if (prof_sample_accum_update(size)) { + /* + * Don't sample. The size passed to + * prof_alloc_prep() was larger than what + * actually got allocated., so a backtrace was + * captured for this allocation, even though + * its actual size was insufficient to cross + * the sample threshold. 
+ */ + return; + } + } } if ((uintptr_t)old_ctx > (uintptr_t)1U) { diff --git a/jemalloc/test/allocated.c b/jemalloc/test/allocated.c new file mode 100644 index 00000000..64a17351 --- /dev/null +++ b/jemalloc/test/allocated.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define JEMALLOC_MANGLE +#include "jemalloc_test.h" + +void * +thread_start(void *arg) +{ + int err; + void *p; + uint64_t a0, a1, d0, d1; + size_t sz, usize; + + sz = sizeof(a0); + if ((err = JEMALLOC_P(mallctl)("thread.allocated", &a0, &sz, NULL, + 0))) { + if (err == ENOENT) { +#ifdef JEMALLOC_STATS + assert(false); +#endif + goto RETURN; + } + fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, + strerror(err)); + exit(1); + } + + sz = sizeof(d0); + if ((err = JEMALLOC_P(mallctl)("thread.deallocated", &d0, &sz, NULL, + 0))) { + if (err == ENOENT) { +#ifdef JEMALLOC_STATS + assert(false); +#endif + goto RETURN; + } + fprintf(stderr, "%s(): Error in mallctl(): %s\n", __func__, + strerror(err)); + exit(1); + } + + p = JEMALLOC_P(malloc)(1); + if (p == NULL) { + fprintf(stderr, "%s(): Error in malloc()\n", __func__); + exit(1); + } + + sz = sizeof(a1); + JEMALLOC_P(mallctl)("thread.allocated", &a1, &sz, NULL, 0); + + usize = JEMALLOC_P(malloc_usable_size)(p); + assert(a0 + usize <= a1); + + JEMALLOC_P(free)(p); + + sz = sizeof(d1); + JEMALLOC_P(mallctl)("thread.deallocated", &d1, &sz, NULL, 0); + + assert(d0 + usize <= d1); + +RETURN: + return (NULL); +} + +int +main(void) +{ + int ret = 0; + pthread_t thread; + + fprintf(stderr, "Test begin\n"); + + thread_start(NULL); + + if (pthread_create(&thread, NULL, thread_start, NULL) + != 0) { + fprintf(stderr, "%s(): Error in pthread_create()\n", __func__); + ret = 1; + goto RETURN; + } + pthread_join(thread, (void *)&ret); + + thread_start(NULL); + + if (pthread_create(&thread, NULL, thread_start, NULL) + != 0) { + fprintf(stderr, "%s(): Error in pthread_create()\n", __func__); + ret = 1; + goto RETURN; + } + pthread_join(thread, (void *)&ret); + + thread_start(NULL); + +RETURN: + fprintf(stderr, "Test end\n"); + return (ret); +} diff --git a/jemalloc/test/allocated.exp b/jemalloc/test/allocated.exp new file mode 100644 index 00000000..369a88dd --- /dev/null +++ b/jemalloc/test/allocated.exp @@ -0,0 +1,2 @@ +Test begin +Test end From 4d6a134e13e1e9705faff6f5dc6ca90d416aa0a4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 20 Oct 2010 19:05:59 -0700 Subject: [PATCH 38/48] Inline the fast path for heap sampling. Inline the heap sampling code that is executed for every allocation event (regardless of whether a sample is taken). Combine all prof TLS data into a single data structure, in order to reduce the TLS lookup volume. 
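To make the intent concrete, here is a minimal standalone sketch of the
consolidated fast path (illustrative only: sample_tdata_t, sample_tdata, and
sample_should_capture() are hypothetical names, not the jemalloc internals,
and the threshold is assumed to have been seeded elsewhere, as
prof_sample_threshold_update() does in the real code).  All per-thread
sampling state sits in one TLS structure, so the per-allocation decision
costs a single TLS access plus a subtraction-based threshold check:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stddef.h>

    /* All per-thread profiler state, analogous to prof_tdata_t. */
    typedef struct {
            uint64_t prn_state;  /* PRNG state used to draw sample spacing. */
            uint64_t threshold;  /* Bytes until the next sample is taken. */
            uint64_t accum;      /* Bytes allocated since the last sample. */
    } sample_tdata_t;

    /* One TLS slot instead of several; read once per allocation event. */
    static __thread sample_tdata_t sample_tdata;

    /* Return true if an allocation of size bytes should be sampled. */
    static inline bool
    sample_should_capture(size_t size)
    {
            sample_tdata_t *tdata = &sample_tdata;

            /*
             * Compare via subtraction rather than testing
             * accum + size >= threshold, to avoid integer overflow.
             */
            if (size >= tdata->threshold - tdata->accum)
                    return (true);
            tdata->accum += size;
            return (false);
    }

Collapsing the separate TSD slots into one structure also means a single
destructor can merge, unlink, and deallocate everything at thread exit,
which is what prof_tdata_cleanup() does below.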
--- .../jemalloc/internal/jemalloc_internal.h.in | 10 +- jemalloc/include/jemalloc/internal/prof.h | 388 +++++++++++- jemalloc/src/prof.c | 557 +++--------------- 3 files changed, 449 insertions(+), 506 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index eb609624..99746bbd 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -27,6 +27,7 @@ #include #include #include +#include #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" @@ -210,10 +211,10 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif +#include "jemalloc/internal/prof.h" #undef JEMALLOC_H_TYPES /******************************************************************************/ @@ -233,10 +234,10 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif +#include "jemalloc/internal/prof.h" #undef JEMALLOC_H_STRUCTS /******************************************************************************/ @@ -355,10 +356,10 @@ void jemalloc_postfork(void); #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif +#include "jemalloc/internal/prof.h" #undef JEMALLOC_H_EXTERNS /******************************************************************************/ @@ -547,7 +548,6 @@ choose_arena(void) #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" #ifdef JEMALLOC_ZONE #include "jemalloc/internal/zone.h" #endif @@ -729,5 +729,7 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, } #endif +#include "jemalloc/internal/prof.h" + #undef JEMALLOC_H_INLINES /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 0d139da7..a6bd797f 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -6,7 +6,7 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; -typedef struct prof_tcache_s prof_tcache_t; +typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ #define LG_PROF_BT_MAX_DEFAULT 7 @@ -38,8 +38,8 @@ typedef struct prof_tcache_s prof_tcache_t; struct prof_bt_s { /* Backtrace, stored as len program counters. */ - void **vec; - unsigned len; + void **vec; + unsigned len; }; #ifdef JEMALLOC_PROF_LIBGCC @@ -124,22 +124,29 @@ struct prof_ctx_s { ql_head(prof_thr_cnt_t) cnts_ql; }; -/* - * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread - * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. Other threads may read the prof_thr_cnt_t contents, but no others - * will ever write them. 
- * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data - * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t - * objects. - */ -struct prof_tcache_s { - /* (prof_bt_t *)-->(prof_thr_cnt_t *). */ +struct prof_tdata_s { + /* + * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a + * cache of backtraces, with associated thread-specific prof_thr_cnt_t + * objects. Other threads may read the prof_thr_cnt_t contents, but no + * others will ever write them. + * + * Upon thread exit, the thread must merge all the prof_thr_cnt_t + * counter data into the associated prof_ctx_t objects, and unlink/free + * the prof_thr_cnt_t objects. + */ ckh_t bt2cnt; /* LRU for contents of bt2cnt. */ ql_head(prof_thr_cnt_t) lru_ql; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void **vec; + + /* Sampling state. */ + uint64_t prn_state; + uint64_t threshold; + uint64_t accum; }; #endif /* JEMALLOC_H_STRUCTS */ @@ -177,15 +184,39 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -prof_thr_cnt_t *prof_alloc_prep(size_t size); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, - const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx); -void prof_free(const void *ptr); +/* (1U << opt_lg_prof_bt_max). */ +extern unsigned prof_bt_max; + +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +#ifndef NO_TLS +extern __thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_TCACHE_GET() prof_tdata_tls +# define PROF_TCACHE_SET(v) do { \ + prof_tdata_tls = (v); \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#else +# define PROF_TCACHE_GET() \ + ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd)) +# define PROF_TCACHE_SET(v) do { \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#endif +/* + * Same contents as b2cnt_tls, but initialized such that the TSD destructor is + * called when a thread exits, so that prof_tdata_tls contents can be merged, + * unlinked, and deallocated. 
+ */ +extern pthread_key_t prof_tdata_tsd; + +void bt_init(prof_bt_t *bt, void **vec); +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); +prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); void prof_idump(void); bool prof_mdump(const char *filename); void prof_udump(void); +prof_tdata_t *prof_tdata_init(void); void prof_boot0(void); bool prof_boot1(void); @@ -193,6 +224,321 @@ bool prof_boot1(void); /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +prof_thr_cnt_t *prof_alloc_prep(size_t size); +prof_ctx_t *prof_ctx_get(const void *ptr); +void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +bool prof_sample_accum_update(size_t size); +void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); +void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx); +void prof_free(const void *ptr); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +JEMALLOC_INLINE void +prof_sample_threshold_update(prof_tdata_t *prof_tdata) +{ + uint64_t r; + double u; + + /* + * Compute prof_sample_threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + */ + prn64(r, 53, prof_tdata->prn_state, + (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); + u = (double)r * (1.0/9007199254740992.0L); + prof_tdata->threshold = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +} + +JEMALLOC_INLINE prof_thr_cnt_t * +prof_alloc_prep(size_t size) +{ +#ifdef JEMALLOC_ENABLE_INLINE + /* This function does not have its own stack frame, because it is inlined. */ +# define NIGNORE 1 +#else +# define NIGNORE 2 +#endif + prof_thr_cnt_t *ret; + prof_tdata_t *prof_tdata; + prof_bt_t bt; + + assert(size == s2u(size)); + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) + return (NULL); + } + + if (opt_prof_active == false) { + /* Sampling is currently inactive, so avoid sampling. */ + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } else if (opt_lg_prof_sample == 0) { + /* + * Don't bother with sampling logic, since sampling interval is + * 1. + */ + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else { + if (prof_tdata->threshold == 0) { + /* + * Initialize. Seed the prng differently for each + * thread. + */ + prof_tdata->prn_state = (uint64_t)(uintptr_t)&size; + prof_sample_threshold_update(prof_tdata); + } + + /* + * Determine whether to capture a backtrace based on whether + * size is enough for prof_accum to reach + * prof_tdata->threshold. However, delay updating these + * variables until prof_{m,re}alloc(), because we don't know + * for sure that the allocation will succeed. + * + * Use subtraction rather than addition to avoid potential + * integer overflow. + */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } + + return (ret); +#undef NIGNORE +} + +JEMALLOC_INLINE prof_ctx_t * +prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. 
*/ + assert(chunk->arena->magic == ARENA_MAGIC); + + ret = arena_prof_ctx_get(ptr); + } else + ret = huge_prof_ctx_get(ptr); + + return (ret); +} + +JEMALLOC_INLINE void +prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. */ + assert(chunk->arena->magic == ARENA_MAGIC); + + arena_prof_ctx_set(ptr, ctx); + } else + huge_prof_ctx_set(ptr, ctx); +} + +JEMALLOC_INLINE bool +prof_sample_accum_update(size_t size) +{ + prof_tdata_t *prof_tdata; + + /* Sampling logic is unnecessary if the interval is 1. */ + assert(opt_lg_prof_sample != 0); + + prof_tdata = PROF_TCACHE_GET(); + assert(prof_tdata != NULL); + + /* Take care to avoid integer overflow. */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + prof_tdata->accum -= (prof_tdata->threshold - size); + /* Compute new prof_sample_threshold. */ + prof_sample_threshold_update(prof_tdata); + while (prof_tdata->accum >= prof_tdata->threshold) { + prof_tdata->accum -= prof_tdata->threshold; + prof_sample_threshold_update(prof_tdata); + } + return (false); + } else { + prof_tdata->accum += size; + return (true); + } +} + +JEMALLOC_INLINE void +prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) +{ + + assert(ptr != NULL); + assert(size == s2u(size)); + + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. For malloc()-like allocation, it is + * always possible to tell in advance how large an + * object's usable size will be, so there should never + * be a difference between the size passed to + * prof_alloc_prep() and prof_malloc(). + */ + assert(false); + return; + } + } + + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); +} + +JEMALLOC_INLINE void +prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) +{ + prof_thr_cnt_t *told_cnt; + + assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + + if (ptr != NULL) { + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. The size passed to + * prof_alloc_prep() was larger than what + * actually got allocated., so a backtrace was + * captured for this allocation, even though + * its actual size was insufficient to cross + * the sample threshold. + */ + return; + } + } + } + + if ((uintptr_t)old_ctx > (uintptr_t)1U) { + told_cnt = prof_lookup(old_ctx->bt); + if (told_cnt == NULL) { + /* + * It's too late to propagate OOM for this realloc(), + * so operate directly on old_cnt->ctx->cnt_merged. 
+ */ + malloc_mutex_lock(&old_ctx->lock); + old_ctx->cnt_merged.curobjs--; + old_ctx->cnt_merged.curbytes -= old_size; + malloc_mutex_unlock(&old_ctx->lock); + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + } + } else + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + cnt->epoch++; + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) { + told_cnt->cnts.curobjs--; + told_cnt->cnts.curbytes -= old_size; + } + if ((uintptr_t)cnt > (uintptr_t)1U) { + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + } + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) + cnt->epoch++; + /*********/ + mb_write(); /* Not strictly necessary. */ +} + +JEMALLOC_INLINE void +prof_free(const void *ptr) +{ + prof_ctx_t *ctx = prof_ctx_get(ptr); + + if ((uintptr_t)ctx > (uintptr_t)1) { + size_t size = isalloc(ptr); + prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); + + if (tcnt != NULL) { + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + tcnt->cnts.curobjs--; + tcnt->cnts.curbytes -= size; + /*********/ + mb_write(); + /*********/ + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else { + /* + * OOM during free() cannot be propagated, so operate + * directly on cnt->ctx->cnt_merged. + */ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs--; + ctx->cnt_merged.curbytes -= size; + malloc_mutex_unlock(&ctx->lock); + } + } +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ #endif /* JEMALLOC_PROF */ diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 583a6e91..4a49e901 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -12,8 +12,6 @@ #include #endif -#include - /******************************************************************************/ /* Data. */ @@ -30,6 +28,14 @@ ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; uint64_t prof_interval; bool prof_promote; +unsigned prof_bt_max; + +#ifndef NO_TLS +__thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +#endif +pthread_key_t prof_tdata_tsd; + /* * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data * structure that knows about all backtraces currently captured. @@ -37,96 +43,6 @@ bool prof_promote; static ckh_t bt2ctx; static malloc_mutex_t bt2ctx_mtx; -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ -#ifndef NO_TLS -static __thread prof_tcache_t *prof_tcache_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define PROF_TCACHE_GET() prof_tcache_tls -# define PROF_TCACHE_SET(v) do { \ - prof_tcache_tls = (v); \ - pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ -} while (0) -#else -# define PROF_TCACHE_GET() \ - ((prof_tcache_t *)pthread_getspecific(prof_tcache_tsd)) -# define PROF_TCACHE_SET(v) do { \ - pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ -} while (0) -#endif -/* - * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that prof_tcache_tls contents can be merged, - * unlinked, and deallocated. 
- */ -static pthread_key_t prof_tcache_tsd; - -/* Thread-specific backtrace vector, used for calls to prof_backtrace(). */ -#ifndef NO_TLS -static __thread void **vec_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define VEC_GET() vec_tls -# define VEC_SET(v) do { \ - vec_tls = (v); \ - pthread_setspecific(vec_tsd, (void *)(v)); \ -} while (0) -#else -# define VEC_GET() ((void **)pthread_getspecific(vec_tsd)) -# define VEC_SET(v) do { \ - pthread_setspecific(vec_tsd, (void *)(v)); \ -} while (0) -#endif -/* - * Same contents as vec_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that vec_tls contents can be merged, - * unlinked, and deallocated. - */ -static pthread_key_t vec_tsd; - - -/* (1U << opt_lg_prof_bt_max). */ -static unsigned prof_bt_max; - -typedef struct prof_sample_state_s prof_sample_state_t; -struct prof_sample_state_s { - uint64_t prn_state; - uint64_t threshold; - uint64_t accum; -}; - -#ifndef NO_TLS -static __thread prof_sample_state_t prof_sample_state_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define PROF_SAMPLE_STATE_GET(r) do { \ - r = &prof_sample_state_tls; \ -} while (0) -#else -static pthread_key_t prof_sample_state_tsd; -/* Used only if an OOM error occurs in PROF_SAMPLE_STATE_GET(). */ -prof_sample_state_t prof_sample_state_oom; -# define PROF_SAMPLE_STATE_GET(r) do { \ - r = (prof_sample_state_t *)pthread_getspecific( \ - prof_sample_state_tsd); \ - if (r == NULL) { \ - r = ipalloc(sizeof(prof_sample_state_t), CACHELINE, \ - false); \ - if (r == NULL) { \ - malloc_write(": Error in heap " \ - "profiler: out of memory; subsequent heap " \ - "profiles may be inaccurate\n"); \ - if (opt_abort) \ - abort(); \ - /* Failure is not an option... */ \ - r = &prof_sample_state_oom; \ - } \ - pthread_setspecific(prof_sample_state_tsd, (void *)r); \ - } \ -} while (0) -# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) -# define ARENA_SET(v) do { \ - pthread_setspecific(arenas_tsd, (void *)(v)); \ -} while (0) -#endif - static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; static uint64_t prof_dump_iseq; @@ -154,7 +70,6 @@ static bool enq_udump; /* Function prototypes for non-inline static functions. 
*/ static prof_bt_t *bt_dup(prof_bt_t *bt); -static void bt_init(prof_bt_t *bt, void **vec); static void bt_destroy(prof_bt_t *bt); #ifdef JEMALLOC_PROF_LIBGCC static _Unwind_Reason_Code prof_unwind_init_callback( @@ -162,9 +77,6 @@ static _Unwind_Reason_Code prof_unwind_init_callback( static _Unwind_Reason_Code prof_unwind_callback( struct _Unwind_Context *context, void *arg); #endif -static void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); -static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); -static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); static bool prof_flush(bool propagate_err); static bool prof_write(const char *s, bool propagate_err); static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, @@ -181,15 +93,11 @@ static void prof_fdump(void); static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); -static void prof_tcache_cleanup(void *arg); -static void vec_cleanup(void *arg); -#ifdef NO_TLS -static void prof_sample_state_thread_cleanup(void *arg); -#endif +static void prof_tdata_cleanup(void *arg); /******************************************************************************/ -static void +void bt_init(prof_bt_t *bt, void **vec) { @@ -285,7 +193,7 @@ prof_unwind_callback(struct _Unwind_Context *context, void *arg) return (_URC_NO_REASON); } -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { prof_unwind_data_t data = {bt, nignore, max}; @@ -293,7 +201,7 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) _Unwind_Backtrace(prof_unwind_callback, &data); } #elif defined(JEMALLOC_PROF_LIBUNWIND) -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { unw_context_t uc; @@ -328,7 +236,7 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) } } #else -static void +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { #define NIGNORE 3 @@ -509,32 +417,23 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) } #endif -static prof_thr_cnt_t * +prof_thr_cnt_t * prof_lookup(prof_bt_t *bt) { union { prof_thr_cnt_t *p; void *v; } ret; - prof_tcache_t *prof_tcache = PROF_TCACHE_GET(); + prof_tdata_t *prof_tdata; - if (prof_tcache == NULL) { - /* Initialize an empty cache for this thread. */ - prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t)); - if (prof_tcache == NULL) + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) return (NULL); - - if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS, - prof_bt_hash, prof_bt_keycomp)) { - idalloc(prof_tcache); - return (NULL); - } - ql_new(&prof_tcache->lru_ql); - - PROF_TCACHE_SET(prof_tcache); } - if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) { + if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { union { prof_bt_t *p; void *v; @@ -588,23 +487,23 @@ prof_lookup(prof_bt_t *bt) prof_leave(); /* Link a prof_thd_cnt_t into ctx for this thread. */ - if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt) + if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tdata->bt2cnt) == (ZU(1) << opt_lg_prof_tcmax)) { - assert(ckh_count(&prof_tcache->bt2cnt) > 0); + assert(ckh_count(&prof_tdata->bt2cnt) > 0); /* * Flush the least least recently used cnt in order to * keep bt2cnt from becoming too large. 
*/ - ret.p = ql_last(&prof_tcache->lru_ql, lru_link); + ret.p = ql_last(&prof_tdata->lru_ql, lru_link); assert(ret.v != NULL); - ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL, + ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, NULL, NULL); - ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); + ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); prof_ctx_merge(ret.p->ctx, ret.p); /* ret can now be re-used. */ } else { assert(opt_lg_prof_tcmax < 0 || - ckh_count(&prof_tcache->bt2cnt) < (ZU(1) << + ckh_count(&prof_tdata->bt2cnt) < (ZU(1) << opt_lg_prof_tcmax)); /* Allocate and partially initialize a new cnt. */ ret.v = imalloc(sizeof(prof_thr_cnt_t)); @@ -617,325 +516,22 @@ prof_lookup(prof_bt_t *bt) ret.p->ctx = ctx.p; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) { + if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) { idalloc(ret.v); return (NULL); } - ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link); malloc_mutex_unlock(&ctx.p->lock); } else { /* Move ret to the front of the LRU. */ - ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); - ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); + ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); } return (ret.p); } -static inline void -prof_sample_threshold_update(void) -{ - uint64_t r; - double u; - prof_sample_state_t *prof_sample_state; - - /* - * Compute prof_sample_threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). - */ - PROF_SAMPLE_STATE_GET(prof_sample_state); - prn64(r, 53, prof_sample_state->prn_state, - (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); - u = (double)r * (1.0/9007199254740992.0L); - prof_sample_state->threshold = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) - + (uint64_t)1U; -} - -prof_thr_cnt_t * -prof_alloc_prep(size_t size) -{ - prof_thr_cnt_t *ret; - void **vec; - prof_bt_t bt; - - assert(size == s2u(size)); - - vec = VEC_GET(); - if (vec == NULL) { - vec = imalloc(sizeof(void *) * prof_bt_max); - if (vec == NULL) - return (NULL); - VEC_SET(vec); - } - - if (opt_prof_active == false) { - /* Sampling is currently inactive, so avoid sampling. */ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; - } else if (opt_lg_prof_sample == 0) { - /* - * Don't bother with sampling logic, since sampling interval is - * 1. - */ - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); - } else { - prof_sample_state_t *prof_sample_state; - - PROF_SAMPLE_STATE_GET(prof_sample_state); - if (prof_sample_state->threshold == 0) { - /* - * Initialize. Seed the prng differently for each - * thread. - */ - prof_sample_state->prn_state = - (uint64_t)(uintptr_t)&size; - prof_sample_threshold_update(); - } - - /* - * Determine whether to capture a backtrace based on whether - * size is enough for prof_accum to reach - * prof_sample_state->threshold. However, delay updating these - * variables until prof_{m,re}alloc(), because we don't know - * for sure that the allocation will succeed. - * - * Use subtraction rather than addition to avoid potential - * integer overflow. 
- */ - if (size >= prof_sample_state->threshold - - prof_sample_state->accum) { - bt_init(&bt, vec); - prof_backtrace(&bt, 2, prof_bt_max); - ret = prof_lookup(&bt); - } else - ret = (prof_thr_cnt_t *)(uintptr_t)1U; - } - - return (ret); -} - -prof_ctx_t * -prof_ctx_get(const void *ptr) -{ - prof_ctx_t *ret; - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); - - ret = arena_prof_ctx_get(ptr); - } else - ret = huge_prof_ctx_get(ptr); - - return (ret); -} - -static void -prof_ctx_set(const void *ptr, prof_ctx_t *ctx) -{ - arena_chunk_t *chunk; - - assert(ptr != NULL); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); - - arena_prof_ctx_set(ptr, ctx); - } else - huge_prof_ctx_set(ptr, ctx); -} - -static inline bool -prof_sample_accum_update(size_t size) -{ - prof_sample_state_t *prof_sample_state; - - /* Sampling logic is unnecessary if the interval is 1. */ - assert(opt_lg_prof_sample != 0); - - /* Take care to avoid integer overflow. */ - PROF_SAMPLE_STATE_GET(prof_sample_state); - if (size >= prof_sample_state->threshold - prof_sample_state->accum) { - prof_sample_state->accum -= (prof_sample_state->threshold - - size); - /* Compute new prof_sample_threshold. */ - prof_sample_threshold_update(); - while (prof_sample_state->accum >= - prof_sample_state->threshold) { - prof_sample_state->accum -= - prof_sample_state->threshold; - prof_sample_threshold_update(); - } - return (false); - } else { - prof_sample_state->accum += size; - return (true); - } -} - -void -prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) -{ - - assert(ptr != NULL); - assert(size == s2u(size)); - - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(size)) { - /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the size passed to - * prof_alloc_prep() and prof_malloc(). - */ - assert(false); - return; - } - } - - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); -} - -void -prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, - const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) -{ - prof_thr_cnt_t *told_cnt; - - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); - - if (ptr != NULL) { - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(size)) { - /* - * Don't sample. The size passed to - * prof_alloc_prep() was larger than what - * actually got allocated., so a backtrace was - * captured for this allocation, even though - * its actual size was insufficient to cross - * the sample threshold. - */ - return; - } - } - } - - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(old_ctx->bt); - if (told_cnt == NULL) { - /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(&old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_size; - malloc_mutex_unlock(&old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - cnt->epoch++; - } else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_size; - } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += size; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; - } - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. */ -} - -void -prof_free(const void *ptr) -{ - prof_ctx_t *ctx = prof_ctx_get(ptr); - - if ((uintptr_t)ctx > (uintptr_t)1) { - size_t size = isalloc(ptr); - prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. - */ - malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(&ctx->lock); - } - } -} - static bool prof_flush(bool propagate_err) { @@ -1468,60 +1064,72 @@ prof_bt_keycomp(const void *k1, const void *k2) return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } -static void -prof_tcache_cleanup(void *arg) +prof_tdata_t * +prof_tdata_init(void) { - prof_tcache_t *prof_tcache; + prof_tdata_t *prof_tdata; - prof_tcache = PROF_TCACHE_GET(); - if (prof_tcache != NULL) { + /* Initialize an empty cache for this thread. */ + prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + if (prof_tdata == NULL) + return (NULL); + + if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp)) { + idalloc(prof_tdata); + return (NULL); + } + ql_new(&prof_tdata->lru_ql); + + prof_tdata->vec = imalloc(sizeof(void *) * prof_bt_max); + if (prof_tdata->vec == NULL) { + + ckh_delete(&prof_tdata->bt2cnt); + idalloc(prof_tdata); + return (NULL); + } + + prof_tdata->prn_state = 0; + prof_tdata->threshold = 0; + prof_tdata->accum = 0; + + PROF_TCACHE_SET(prof_tdata); + + return (prof_tdata); +} + +static void +prof_tdata_cleanup(void *arg) +{ + prof_tdata_t *prof_tdata; + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata != NULL) { prof_thr_cnt_t *cnt; /* * Delete the hash table. All of its contents can still be * iterated over via the LRU. */ - ckh_delete(&prof_tcache->bt2cnt); + ckh_delete(&prof_tdata->bt2cnt); /* * Iteratively merge cnt's into the global stats and delete * them. 
*/ - while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) != - NULL) { + while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) { prof_ctx_merge(cnt->ctx, cnt); - ql_remove(&prof_tcache->lru_ql, cnt, lru_link); + ql_remove(&prof_tdata->lru_ql, cnt, lru_link); idalloc(cnt); } - idalloc(prof_tcache); + idalloc(prof_tdata->vec); + + idalloc(prof_tdata); PROF_TCACHE_SET(NULL); } } -static void -vec_cleanup(void *arg) -{ - void **vec; - - vec = VEC_GET(); - if (vec != NULL) { - idalloc(vec); - VEC_SET(NULL); - } -} - -#ifdef NO_TLS -static void -prof_sample_state_thread_cleanup(void *arg) -{ - prof_sample_state_t *prof_sample_state = (prof_sample_state_t *)arg; - - if (prof_sample_state != &prof_sample_state_oom) - idalloc(prof_sample_state); -} -#endif - void prof_boot0(void) { @@ -1560,25 +1168,12 @@ prof_boot1(void) return (true); if (malloc_mutex_init(&bt2ctx_mtx)) return (true); - if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup) + if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup) != 0) { malloc_write( ": Error in pthread_key_create()\n"); abort(); } - if (pthread_key_create(&vec_tsd, vec_cleanup) != 0) { - malloc_write( - ": Error in pthread_key_create()\n"); - abort(); - } -#ifdef NO_TLS - if (pthread_key_create(&prof_sample_state_tsd, - prof_sample_state_thread_cleanup) != 0) { - malloc_write( - ": Error in pthread_key_create()\n"); - abort(); - } -#endif prof_bt_max = (1U << opt_lg_prof_bt_max); if (malloc_mutex_init(&prof_dump_seq_mtx)) From 81b4e6eb6f06cac048e3743787a70676f1534269 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 20 Oct 2010 20:52:00 -0700 Subject: [PATCH 39/48] Fix a heap profiling regression. Call prof_ctx_set() in all paths through prof_{m,re}alloc(). Inline arena_prof_ctx_get(). --- jemalloc/include/jemalloc/internal/arena.h | 107 ++++++++++++++++++++- jemalloc/include/jemalloc/internal/prof.h | 5 +- jemalloc/src/arena.c | 99 ------------------- 3 files changed, 110 insertions(+), 101 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 893f0706..240d96b3 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -444,7 +444,6 @@ size_t arena_salloc(const void *ptr); #ifdef JEMALLOC_PROF void arena_prof_promoted(const void *ptr, size_t size); size_t arena_salloc_demote(const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); #endif void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, @@ -467,10 +466,116 @@ bool arena_boot(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin, + const void *ptr, size_t size); +# ifdef JEMALLOC_PROF +prof_ctx_t *arena_prof_ctx_get(const void *ptr); +# endif void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) +JEMALLOC_INLINE unsigned +arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, + size_t size) +{ + unsigned shift, diff, regind; + + assert(run->magic == ARENA_RUN_MAGIC); + + /* + * Avoid doing division with a variable divisor if possible. Using + * actual division here can reduce allocator throughput by over 20%! + */ + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); + + /* Rescale (factor powers of 2 out of the numerator and denominator). 
*/ + shift = ffs(size) - 1; + diff >>= shift; + size >>= shift; + + if (size == 1) { + /* The divisor was a power of 2. */ + regind = diff; + } else { + /* + * To divide by a number D that is not a power of two we + * multiply by (2^21 / D) and then right shift by 21 positions. + * + * X / D + * + * becomes + * + * (X * size_invs[D - 3]) >> SIZE_INV_SHIFT + * + * We can omit the first three elements, because we never + * divide by 0, and 1 and 2 are both powers of two, which are + * handled above. + */ +#define SIZE_INV_SHIFT 21 +#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) + static const unsigned size_invs[] = { + SIZE_INV(3), + SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), + SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11), + SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15), + SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19), + SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23), + SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27), + SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) + }; + + if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2)) + regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT; + else + regind = diff / size; +#undef SIZE_INV +#undef SIZE_INV_SHIFT + } + assert(diff == regind * size); + assert(regind < bin->nregs); + + return (regind); +} + +#ifdef JEMALLOC_PROF +JEMALLOC_INLINE prof_ctx_t * +arena_prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (prof_promote) + ret = (prof_ctx_t *)(uintptr_t)1U; + else { + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << + PAGE_SHIFT)); + arena_bin_t *bin = run->bin; + unsigned regind; + + assert(run->magic == ARENA_RUN_MAGIC); + regind = arena_run_regind(run, bin, ptr, bin->reg_size); + ret = *(prof_ctx_t **)((uintptr_t)run + + bin->ctx0_offset + (regind * + sizeof(prof_ctx_t *))); + } + } else + ret = chunk->map[pageind-map_bias].prof_ctx; + + return (ret); +} +#endif + JEMALLOC_INLINE void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) { diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index a6bd797f..51d04879 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -389,7 +389,7 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) { assert(ptr != NULL); - assert(size == s2u(size)); + assert(size == isalloc(ptr)); if (opt_lg_prof_sample != 0) { if (prof_sample_accum_update(size)) { @@ -401,6 +401,7 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) * prof_alloc_prep() and prof_malloc(). */ assert(false); + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); return; } } @@ -438,6 +439,7 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); if (ptr != NULL) { + assert(size == isalloc(ptr)); if (opt_lg_prof_sample != 0) { if (prof_sample_accum_update(size)) { /* @@ -448,6 +450,7 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, * its actual size was insufficient to cross * the sample threshold. 
*/ + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); return; } } diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index a54a0905..2a8ea62b 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -1766,105 +1766,6 @@ arena_salloc_demote(const void *ptr) return (ret); } -static inline unsigned -arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, - size_t size) -{ - unsigned shift, diff, regind; - - assert(run->magic == ARENA_RUN_MAGIC); - - /* - * Avoid doing division with a variable divisor if possible. Using - * actual division here can reduce allocator throughput by over 20%! - */ - diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); - - /* Rescale (factor powers of 2 out of the numerator and denominator). */ - shift = ffs(size) - 1; - diff >>= shift; - size >>= shift; - - if (size == 1) { - /* The divisor was a power of 2. */ - regind = diff; - } else { - /* - * To divide by a number D that is not a power of two we - * multiply by (2^21 / D) and then right shift by 21 positions. - * - * X / D - * - * becomes - * - * (X * size_invs[D - 3]) >> SIZE_INV_SHIFT - * - * We can omit the first three elements, because we never - * divide by 0, and 1 and 2 are both powers of two, which are - * handled above. - */ -#define SIZE_INV_SHIFT 21 -#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) - static const unsigned size_invs[] = { - SIZE_INV(3), - SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), - SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11), - SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15), - SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19), - SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23), - SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27), - SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) - }; - - if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2)) - regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT; - else - regind = diff / size; -#undef SIZE_INV -#undef SIZE_INV_SHIFT - } - assert(diff == regind * size); - assert(regind < bin->nregs); - - return (regind); -} - -prof_ctx_t * -arena_prof_ctx_get(const void *ptr) -{ - prof_ctx_t *ret; - arena_chunk_t *chunk; - size_t pageind, mapbits; - - assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - mapbits = chunk->map[pageind-map_bias].bits; - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - if (prof_promote) - ret = (prof_ctx_t *)(uintptr_t)1U; - else { - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << - PAGE_SHIFT)); - arena_bin_t *bin = run->bin; - unsigned regind; - - assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); - ret = *(prof_ctx_t **)((uintptr_t)run + - bin->ctx0_offset + (regind * - sizeof(prof_ctx_t *))); - } - } else - ret = chunk->map[pageind-map_bias].prof_ctx; - - return (ret); -} - void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) { From e4f7846f1fd279a039ffa2a41707348187219de4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 22 Oct 2010 10:45:59 -0700 Subject: [PATCH 40/48] Fix heap profiling bugs. Fix a regression due to the recent heap profiling accuracy improvements: prof_{m,re}alloc() must set the object's profiling context regardless of whether it is sampled. 
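As a rough illustration of this first fix (stub types and hypothetical names
ctx_t, cnt_t, ctx_set(), record_malloc(); the real functions are
prof_ctx_set() and prof_{m,re}alloc()): the not-sampled path must not return
before the context store.  It writes the sentinel value 1 instead, so every
object's context slot is initialized and prof_free() can still tell sampled
and unsampled objects apart.

    #include <stdint.h>

    typedef struct ctx_s ctx_t;            /* stand-in for prof_ctx_t */
    typedef struct { ctx_t *ctx; } cnt_t;  /* stand-in for prof_thr_cnt_t */

    static ctx_t *object_ctx;              /* models the per-object slot */

    static void
    ctx_set(ctx_t *ctx)
    {
            object_ctx = ctx;
    }

    /*
     * cnt is either a real counter (sampled) or the dummy value 1U
     * (not sampled); both branches write the object's context slot.
     */
    static void
    record_malloc(cnt_t *cnt)
    {
            if ((uintptr_t)cnt > (uintptr_t)1U)
                    ctx_set(cnt->ctx);               /* sampled */
            else
                    ctx_set((ctx_t *)(uintptr_t)1U); /* not sampled: sentinel */
    }

The diff below makes the same move in the inline prof_malloc() and
prof_realloc(): instead of returning early on the unsampled path, they fall
through to the code that stores the sentinel context.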
Fix management of the CHUNK_MAP_CLASS chunk map bits, such that all large object (re-)allocation paths correctly initialize the bits. Prior to this fix, in-place realloc() cleared the bits, resulting in incorrect reported object size from arena_salloc_demote(). After this fix the non-demoted bit pattern is all zeros (instead of all ones), which makes it easier to assure that the bits are properly set. --- jemalloc/include/jemalloc/internal/arena.h | 40 ++++++++++++++++-- jemalloc/include/jemalloc/internal/prof.h | 19 ++++----- jemalloc/include/jemalloc/internal/tcache.h | 3 +- jemalloc/src/arena.c | 45 +++------------------ jemalloc/src/jemalloc.c | 37 +++++++++++++---- jemalloc/src/prof.c | 24 +++-------- 6 files changed, 85 insertions(+), 83 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 240d96b3..9556c2c6 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -121,7 +121,7 @@ struct arena_chunk_map_s { * * p : run page offset * s : run size - * c : size class (used only if prof_promote is true) + * c : (binind+1) for size class (used only if prof_promote is true) * x : don't care * - : 0 * + : 1 @@ -144,7 +144,7 @@ struct arena_chunk_map_s { * pppppppp pppppppp pppp---- ----d--a * * Large: - * ssssssss ssssssss ssss++++ ++++D-la + * ssssssss ssssssss ssss---- ----D-la * xxxxxxxx xxxxxxxx xxxx---- ----xxxx * -------- -------- -------- ----D-la * @@ -152,7 +152,7 @@ struct arena_chunk_map_s { * ssssssss ssssssss sssscccc ccccD-la * * Large (not sampled, size == PAGE_SIZE): - * ssssssss ssssssss ssss++++ ++++D-la + * ssssssss ssssssss ssss---- ----D-la */ size_t bits; #ifdef JEMALLOC_PROF @@ -444,7 +444,6 @@ size_t arena_salloc(const void *ptr); #ifdef JEMALLOC_PROF void arena_prof_promoted(const void *ptr, size_t size); size_t arena_salloc_demote(const void *ptr); -void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); #endif void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_chunk_map_t *mapelm); @@ -470,6 +469,7 @@ unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, size_t size); # ifdef JEMALLOC_PROF prof_ctx_t *arena_prof_ctx_get(const void *ptr); +void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); # endif void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif @@ -574,6 +574,38 @@ arena_prof_ctx_get(const void *ptr) return (ret); } + +JEMALLOC_INLINE void +arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (prof_promote == false) { + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << + PAGE_SHIFT)); + arena_bin_t *bin = run->bin; + unsigned regind; + + assert(run->magic == ARENA_RUN_MAGIC); + regind = arena_run_regind(run, bin, ptr, bin->reg_size); + + *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset + + (regind * sizeof(prof_ctx_t *)))) = ctx; + } else + assert((uintptr_t)ctx == (uintptr_t)1U); + } else + chunk->map[pageind-map_bias].prof_ctx = ctx; +} #endif JEMALLOC_INLINE void diff --git a/jemalloc/include/jemalloc/internal/prof.h 
b/jemalloc/include/jemalloc/internal/prof.h index 51d04879..3e85bdab 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -232,8 +232,8 @@ void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); bool prof_sample_accum_update(size_t size); void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, - const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx); -void prof_free(const void *ptr); + size_t old_size, prof_ctx_t *old_ctx); +void prof_free(const void *ptr, size_t size); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) @@ -400,9 +400,7 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) * be a difference between the size passed to * prof_alloc_prep() and prof_malloc(). */ - assert(false); - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - return; + assert((uintptr_t)cnt == (uintptr_t)1U); } } @@ -432,7 +430,7 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) JEMALLOC_INLINE void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, - const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx) + size_t old_size, prof_ctx_t *old_ctx) { prof_thr_cnt_t *told_cnt; @@ -445,13 +443,12 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, /* * Don't sample. The size passed to * prof_alloc_prep() was larger than what - * actually got allocated., so a backtrace was + * actually got allocated, so a backtrace was * captured for this allocation, even though * its actual size was insufficient to cross * the sample threshold. */ - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - return; + cnt = (prof_thr_cnt_t *)(uintptr_t)1U; } } } @@ -506,12 +503,12 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, } JEMALLOC_INLINE void -prof_free(const void *ptr) +prof_free(const void *ptr, size_t size) { prof_ctx_t *ctx = prof_ctx_get(ptr); if ((uintptr_t)ctx > (uintptr_t)1) { - size_t size = isalloc(ptr); + assert(size == isalloc(ptr)); prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); if (tcnt != NULL) { diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index bcd6d44c..168a3069 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -282,8 +282,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> PAGE_SHIFT); - chunk->map[pageind-map_bias].bits |= - CHUNK_MAP_CLASS_MASK; + chunk->map[pageind-map_bias].bits &= ~CHUNK_MAP_CLASS_MASK; #endif if (zero == false) { #ifdef JEMALLOC_FILL diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 2a8ea62b..d811f658 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -408,11 +408,8 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, */ chunk->map[run_ind+need_pages-1-map_bias].bits = CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED | flag_dirty; - chunk->map[run_ind-map_bias].bits = size | CHUNK_MAP_LARGE | -#ifdef JEMALLOC_PROF - CHUNK_MAP_CLASS_MASK | -#endif - CHUNK_MAP_ALLOCATED | flag_dirty; + chunk->map[run_ind-map_bias].bits = size | flag_dirty | + CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED; } else { assert(zero == false); /* @@ -1724,7 +1721,7 @@ arena_prof_promoted(const void *ptr, size_t size) binind = small_size2bin[size]; assert(binind < nbins); chunk->map[pageind-map_bias].bits = 
(chunk->map[pageind-map_bias].bits & - ~CHUNK_MAP_CLASS_MASK) | (binind << CHUNK_MAP_CLASS_SHIFT); + ~CHUNK_MAP_CLASS_MASK) | ((binind+1) << CHUNK_MAP_CLASS_SHIFT); } size_t @@ -1754,9 +1751,9 @@ arena_salloc_demote(const void *ptr) assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = mapbits & ~PAGE_MASK; if (prof_promote && ret == PAGE_SIZE && (mapbits & - CHUNK_MAP_CLASS_MASK) != CHUNK_MAP_CLASS_MASK) { + CHUNK_MAP_CLASS_MASK) != 0) { size_t binind = ((mapbits & CHUNK_MAP_CLASS_MASK) >> - CHUNK_MAP_CLASS_SHIFT); + CHUNK_MAP_CLASS_SHIFT) - 1; assert(binind < nbins); ret = chunk->arena->bins[binind].reg_size; } @@ -1765,38 +1762,6 @@ arena_salloc_demote(const void *ptr) return (ret); } - -void -arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) -{ - arena_chunk_t *chunk; - size_t pageind, mapbits; - - assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - mapbits = chunk->map[pageind-map_bias].bits; - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - if (prof_promote == false) { - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << - PAGE_SHIFT)); - arena_bin_t *bin = run->bin; - unsigned regind; - - assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); - - *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset - + (regind * sizeof(prof_ctx_t *)))) = ctx; - } else - assert((uintptr_t)ctx == (uintptr_t)1U); - } else - chunk->map[pageind-map_bias].prof_ctx = ctx; -} #endif static void diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index f3cba15a..dedf011a 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1268,7 +1268,7 @@ RETURN: #endif #ifdef JEMALLOC_PROF if (opt_prof) - prof_realloc(ret, usize, cnt, ptr, old_size, old_ctx); + prof_realloc(ret, usize, cnt, old_size, old_ctx); #endif #ifdef JEMALLOC_STATS if (ret != NULL) { @@ -1285,15 +1285,26 @@ JEMALLOC_P(free)(void *ptr) { if (ptr != NULL) { +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize; +#endif + assert(malloc_initialized || malloc_initializer == pthread_self()); +#ifdef JEMALLOC_STATS + usize = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF - if (opt_prof) - prof_free(ptr); + if (opt_prof) { +# ifndef JEMALLOC_STATS + usize = isalloc(ptr); +# endif + prof_free(ptr, usize); + } #endif #ifdef JEMALLOC_STATS - ALLOCATED_ADD(0, isalloc(ptr)); + ALLOCATED_ADD(0, usize); #endif idalloc(ptr); } @@ -1566,7 +1577,7 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, goto ERR; usize = isalloc(q); } - prof_realloc(q, usize, cnt, p, old_size, old_ctx); + prof_realloc(q, usize, cnt, old_size, old_ctx); } else #endif { @@ -1635,16 +1646,26 @@ JEMALLOC_ATTR(visibility("default")) int JEMALLOC_P(dallocm)(void *ptr, int flags) { +#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) + size_t usize; +#endif assert(ptr != NULL); assert(malloc_initialized || malloc_initializer == pthread_self()); +#ifdef JEMALLOC_STATS + usize = isalloc(ptr); +#endif #ifdef JEMALLOC_PROF - if (opt_prof) - prof_free(ptr); + if (opt_prof) { +# ifndef JEMALLOC_STATS + usize = isalloc(ptr); +# endif + prof_free(ptr, usize); + } #endif #ifdef JEMALLOC_STATS - ALLOCATED_ADD(0, isalloc(ptr)); + ALLOCATED_ADD(0, usize); #endif idalloc(ptr); diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 4a49e901..fb0e7659 100644 --- 
a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -239,38 +239,26 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { -#define NIGNORE 3 #define BT_FRAME(i) \ - if ((i) < NIGNORE + max) { \ + if ((i) < nignore + max) { \ void *p; \ if (__builtin_frame_address(i) == 0) \ return; \ p = __builtin_return_address(i); \ if (p == NULL) \ return; \ - if (i >= NIGNORE) { \ - bt->vec[(i) - NIGNORE] = p; \ - bt->len = (i) - NIGNORE + 1; \ + if (i >= nignore) { \ + bt->vec[(i) - nignore] = p; \ + bt->len = (i) - nignore + 1; \ } \ } else \ return; assert(max <= (1U << opt_lg_prof_bt_max)); - /* - * Ignore the first three frames, since they are: - * - * 0: prof_backtrace() - * 1: prof_alloc_prep() - * 2: malloc(), calloc(), etc. - */ -#if 1 - assert(nignore + 1 == NIGNORE); -#else BT_FRAME(0) BT_FRAME(1) BT_FRAME(2) -#endif BT_FRAME(3) BT_FRAME(4) BT_FRAME(5) @@ -491,8 +479,8 @@ prof_lookup(prof_bt_t *bt) == (ZU(1) << opt_lg_prof_tcmax)) { assert(ckh_count(&prof_tdata->bt2cnt) > 0); /* - * Flush the least least recently used cnt in order to - * keep bt2cnt from becoming too large. + * Flush the least recently used cnt in order to keep + * bt2cnt from becoming too large. */ ret.p = ql_last(&prof_tdata->lru_ql, lru_link); assert(ret.v != NULL); From e73397062ac3ab28a9d377591b63ed19fd154cca Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 23 Oct 2010 18:37:06 -0700 Subject: [PATCH 41/48] Replace JEMALLOC_OPTIONS with MALLOC_CONF. Replace the single-character run-time flags with key/value pairs, which can be set via the malloc_conf global, /etc/malloc.conf, and the MALLOC_CONF environment variable. Replace the JEMALLOC_PROF_PREFIX environment variable with the "opt.prof_prefix" option. Replace umax2s() with u2s(). --- jemalloc/INSTALL | 14 +- jemalloc/configure.ac | 15 +- jemalloc/doc/jemalloc.3.in | 911 ++++++++---------- jemalloc/include/jemalloc/internal/ctl.h | 10 +- .../jemalloc/internal/jemalloc_internal.h.in | 4 +- jemalloc/include/jemalloc/internal/prof.h | 9 +- jemalloc/include/jemalloc/internal/stats.h | 2 +- jemalloc/include/jemalloc/internal/tcache.h | 4 +- jemalloc/include/jemalloc/jemalloc.h.in | 2 +- jemalloc/include/jemalloc/jemalloc_defs.h.in | 1 + jemalloc/src/arena.c | 6 +- jemalloc/src/chunk.c | 10 +- jemalloc/src/ctl.c | 67 +- jemalloc/src/jemalloc.c | 664 ++++++------- jemalloc/src/prof.c | 133 +-- jemalloc/src/stats.c | 200 ++-- jemalloc/src/tcache.c | 12 +- jemalloc/test/rallocm.c | 20 +- 18 files changed, 1058 insertions(+), 1026 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index 1bf51588..c5697c6d 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -27,9 +27,17 @@ any of the following arguments (not a definitive list) to 'configure': it is linked to. This works only on ELF-based systems. --with-jemalloc-prefix= - Prefix all public APIs with , so that, for example, malloc() - becomes malloc(). This makes it possible to use jemalloc at the - same time as the system allocator. + Prefix all public APIs with . For example, if is + "prefix_", the API changes like the following occur: + + malloc() --> prefix_malloc() + malloc_conf --> prefix_malloc_conf + /etc/malloc.conf --> /etc/prefix_malloc.conf + MALLOC_CONF --> PREFIX_MALLOC_CONF + + This makes it possible to use jemalloc at the same time as the + system allocator, or even to use multiple copies of jemalloc + simultaneously. By default, the prefix is "", except on OS X, where it is "je_". 
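To make the prefix mapping above concrete, here is a hypothetical usage sketch assuming a build configured with --with-jemalloc-prefix=je_; the prototypes are written out by hand purely for illustration, whereas a real program would include the installed jemalloc header.

    #include <stddef.h>
    #include <stdlib.h>

    void	*je_malloc(size_t size);	/* prefixed jemalloc API */
    void	je_free(void *ptr);

    int
    main(void)
    {
    	void *a = malloc(64);		/* system allocator */
    	void *b = je_malloc(64);	/* prefixed jemalloc */

    	je_free(b);
    	free(a);
    	return (0);
    }

With this prefix, the environment variable consulted at startup would become JE_MALLOC_CONF, following the mapping listed above.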
On OS X, jemalloc overlays the default malloc zone, but makes no attempt to actually diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index b27955de..b613cb1b 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -256,9 +256,13 @@ else fi] ) if test "x$JEMALLOC_PREFIX" != "x" ; then - AC_DEFINE([JEMALLOC_PREFIX], [ ]) + JEMALLOC_CPREFIX=`echo ${JEMALLOC_PREFIX} | tr "a-z" "A-Z"` + AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"]) + AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"]) jemalloc_prefix="$JEMALLOC_PREFIX" + jemalloc_cprefix="$JEMALLOC_CPREFIX" AC_SUBST([jemalloc_prefix]) + AC_SUBST([jemalloc_cprefix]) AC_DEFINE_UNQUOTED([JEMALLOC_P(string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix)], [${JEMALLOC_PREFIX}##string_that_no_one_should_want_to_use_as_a_jemalloc_API_prefix]) fi @@ -325,6 +329,15 @@ if test "x$enable_debug" = "x1" ; then AC_DEFINE([JEMALLOC_IVSALLOC], [ ]) fi AC_SUBST([enable_debug]) +if test "x$enable_debug" = "x0" ; then + roff_debug=".\\\" " + roff_no_debug="" +else + roff_debug="" + roff_no_debug=".\\\" " +fi +AC_SUBST([roff_debug]) +AC_SUBST([roff_no_debug]) dnl Only optimize if not debugging. if test "x$enable_debug" = "x0" -a "x$no_CFLAGS" = "xyes" ; then diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 5202a2bf..1557ecbd 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,7 +38,7 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd September 30, 2010 +.Dd October 23, 2010 .Dt JEMALLOC 3 .Os .Sh NAME @@ -85,7 +85,7 @@ .Ft int .Fn @jemalloc_prefix@mallctlbymib "const size_t *mib" "size_t miblen" "void *oldp" "size_t *oldlenp" "void *newp" "size_t newlen" .Ft const char * -.Va @jemalloc_prefix@malloc_options ; +.Va @jemalloc_prefix@malloc_conf ; .Ft void .Fn \*(lp*@jemalloc_prefix@malloc_message\*(rp "void *cbopaque" "const char *s" .Ss Experimental API @@ -381,8 +381,8 @@ is non-zero, an attempt is made to resize the allocation to be at least .Fa ( size + .Fa extra ) -bytes, though an inability to allocate the extra byte(s) will not by itself -result in failure. +bytes, though inability to allocate the extra byte(s) will not by itself result +in failure. Behavior is undefined if .Fa ( size + @@ -402,292 +402,33 @@ function causes the memory referenced by .Fa ptr to be made available for future allocations. .Sh TUNING -Once, when the first call is made to one of these memory allocation -routines, various flags will be set or reset, which affects the -workings of this allocator implementation. +Once, when the first call is made to one of the memory allocation routines, the +allocator initializes its internals based in part on various options that can +be specified at compile- or run-time. .Pp -The +The string pointed to by the global variable +.Va @jemalloc_prefix@malloc_conf , +the .Dq name of the file referenced by the symbolic link named -.Pa /etc/jemalloc.conf , -the value of the environment variable -.Ev JEMALLOC_OPTIONS , -and the string pointed to by the global variable -.Va @jemalloc_prefix@malloc_options -will be interpreted, in that order, from left to right as flags. +.Pa /etc/@jemalloc_prefix@malloc.conf , +and the value of the environment variable +.Ev @jemalloc_cprefix@MALLOC_CONF , +will be interpreted, in that order, from left to right as options. .Pp -Each flag is a single letter, optionally prefixed by a non-negative base 10 -integer repetition count. 
+An options string is a comma-separated list of option:value pairs. +There is one key corresponding to each +.Dq opt.* +mallctl. For example, -.Dq 3N -is equivalent to -.Dq NNN . -Some flags control parameter magnitudes, where uppercase increases the -magnitude, and lowercase decreases the magnitude. -Other flags control boolean parameters, where uppercase indicates that a -behavior is set, or on, and lowercase means that a behavior is not set, or off. -.Bl -tag -width indent -.It A -All warnings (except for the warning about unknown -flags being set) become fatal. -The process will call -.Xr abort 3 -in these cases. -@roff_prof@.It B -@roff_prof@Double/halve the maximum backtrace depth when profiling memory -@roff_prof@allocation activity. -@roff_prof@The default is 128. -.It C -Double/halve the size of the maximum size class that is a multiple of the -cacheline size (64). -Above this size, subpage spacing (256 bytes) is used for size classes. -The default value is 512 bytes. -.It D -Halve/double the per-arena minimum ratio of active to dirty pages. -Some dirty unused pages may be allowed to accumulate, within the limit set by -the ratio (or one chunk worth of dirty pages, whichever is greater), before -informing the kernel about some of those pages via -.Xr madvise 2 . -This provides the kernel with sufficient information to recycle dirty pages if -physical memory becomes scarce and the pages remain unused. -The default minimum ratio is 32:1; -.Ev JEMALLOC_OPTIONS=6D -will disable dirty page purging. -@roff_prof@.It E -@roff_prof@Activate/deactivate profiling. -@roff_prof@This is a secondary control mechanism that makes it possible to -@roff_prof@start the application with profiling enabled (see the -@roff_prof@.Dq F -@roff_prof@option) but inactive, then toggle profiling at any time during -@roff_prof@program execution with the -@roff_prof@.Dq prof.active -@roff_prof@mallctl. -@roff_prof@This option is enabled by default. -@roff_prof@.It F -@roff_prof@Profile memory allocation activity, and use an -@roff_prof@.Xr atexit 3 -@roff_prof@function to dump final memory usage to a file named according to -@roff_prof@the pattern -@roff_prof@.Pa ...f.heap , -@roff_prof@where -@roff_prof@.Pa -@roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. -@roff_prof@See the -@roff_prof@.Dq B -@roff_prof@option for backtrace depth control. -@roff_prof@See the -@roff_prof@.Dq E -@roff_prof@option for on-the-fly activation/deactivation. -@roff_prof@See the -@roff_prof@.Dq S -@roff_prof@option for probabilistic sampling control. -@roff_prof@See the -@roff_prof@.Dq R -@roff_prof@option for control of cumulative sample reporting. -@roff_prof@See the -@roff_prof@.Dq T -@roff_prof@option for control of per thread backtrace caching. -@roff_prof@See the -@roff_prof@.Dq I -@roff_prof@option for information on interval-triggered profile dumping, and the -@roff_prof@.Dq U -@roff_prof@option for information on high-water-triggered profile dumping. -@roff_prof@Profile output is compatible with the included pprof Perl script, -@roff_prof@which originates from the google-perftools package -@roff_prof@(http://code.google.com/p/google-perftools/). -@roff_tcache@.It G -@roff_tcache@Double/halve the approximate interval (counted in terms of -@roff_tcache@thread-specific cache allocation/deallocation events) between full -@roff_tcache@thread-specific cache garbage collection sweeps. 
-@roff_tcache@Garbage collection is actually performed incrementally, one size -@roff_tcache@class at a time, in order to avoid large collection pauses. -@roff_tcache@The default sweep interval is 8192; -@roff_tcache@.Ev JEMALLOC_OPTIONS=14g -@roff_tcache@will disable garbage collection. -@roff_tcache@.It H -@roff_tcache@Enable/disable thread-specific caching. -@roff_tcache@When there are multiple threads, each thread uses a -@roff_tcache@thread-specific cache for objects up to a certain size. -@roff_tcache@Thread-specific caching allows many allocations to be satisfied -@roff_tcache@without performing any thread synchronization, at the cost of -@roff_tcache@increased memory use. -@roff_tcache@See the -@roff_tcache@.Dq G -@roff_tcache@and -@roff_tcache@.Dq M -@roff_tcache@options for related tuning information. -@roff_tcache@This option is enabled by default. -@roff_prof@.It I -@roff_prof@Double/halve the average interval between memory profile dumps, as -@roff_prof@measured in bytes of allocation activity. -@roff_prof@The actual interval between dumps may be sporadic because -@roff_prof@decentralized allocation counters are used to avoid synchronization -@roff_prof@bottlenecks. -@roff_prof@Profiles are dumped to files named according to the pattern -@roff_prof@.Pa ...i.heap , -@roff_prof@where -@roff_prof@.Pa -@roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. -@roff_prof@By default, interval-triggered profile dumping is disabled. -@roff_prof@This is internally encoded as (1 << -1), and each -@roff_prof@.Dq I -@roff_prof@that is specified increments the shift amount. -@roff_prof@Therefore, e.g. -@roff_prof@.Ev JEMALLOC_OPTIONS=31I -@roff_prof@specifies a dump interval of 1 GiB. -@roff_fill@.It J -@roff_fill@Each byte of new memory allocated by -@roff_fill@.Fn @jemalloc_prefix@malloc -@roff_fill@or -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@will be initialized to 0xa5. -@roff_fill@All memory returned by -@roff_fill@.Fn @jemalloc_prefix@free -@roff_fill@or -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@will be initialized to 0x5a. -@roff_fill@This is intended for debugging and will impact performance -@roff_fill@negatively. -.It K -Double/halve the virtual memory chunk size. -The default chunk size is 4 MiB. -@roff_prof@.It L -@roff_prof@Use an -@roff_prof@.Xr atexit 3 -@roff_prof@function to report memory leaks. -@roff_prof@See the -@roff_prof@.Dq B -@roff_prof@option for backtrace depth control. -@roff_prof@See the -@roff_prof@.Dq F option for information on analyzing heap profile output. -@roff_prof@This option is disabled by default. -@roff_tcache@.It M -@roff_tcache@Double/halve the maximum size class to cache. -@roff_tcache@At a minimum, all small size classes are cached, and at a maximum -@roff_tcache@all large size classes are cached. -@roff_tcache@The default maximum is 32 KiB. -.It N -Double/halve the number of arenas. -The default number of arenas is four times the number of CPUs, or one if there -is a single CPU. -@roff_swap@.It O -@roff_swap@Over-commit memory as a side effect of using anonymous -@roff_swap@.Xr mmap 2 -@roff_swap@@roff_dss@ and -@roff_swap@@roff_dss@.Xr sbrk 2 -@roff_swap@for virtual memory allocation. -@roff_swap@In order for overcommit to be disabled, the -@roff_swap@.Dq swap.fds -@roff_swap@mallctl must have been successfully written to. -@roff_swap@This option is enabled by default. 
-.It P -The -.Fn @jemalloc_prefix@malloc_stats_print -function is called at program exit via an -.Xr atexit 3 -function. -@roff_stats@This has the potential to cause deadlock for a multi-threaded -@roff_stats@process that exits while one or more threads are executing in the -@roff_stats@memory allocation functions. -@roff_stats@Therefore, this option should only be used with care; it is -@roff_stats@primarily intended as a performance tuning aid during application -@roff_stats@development. -.It Q -Double/halve the size of the maximum size class that is a multiple of the -quantum (8 or 16 bytes, depending on architecture). -Above this size, cacheline spacing is used for size classes. -The default value is 128 bytes. -@roff_prof@.It R -@roff_prof@Enable/disable reporting of cumulative object/byte counts in profile -@roff_prof@dumps. -@roff_prof@If this option is enabled, every unique backtrace must be stored for -@roff_prof@the duration of execution. -@roff_prof@Depending on the application, this can impose a large memory -@roff_prof@overhead, and the cumulative counts are not always of interest. -@roff_prof@See the -@roff_prof@.Dq T -@roff_prof@option for control of per thread backtrace caching, which has -@roff_prof@important interactions. -@roff_prof@This option is enabled by default. -@roff_prof@.It S -@roff_prof@Double/halve the average interval between allocation samples, as -@roff_prof@measured in bytes of allocation activity. -@roff_prof@Increasing the sampling interval decreases profile fidelity, but -@roff_prof@also decreases the computational overhead. -@roff_prof@The default sample interval is one (i.e. all allocations are -@roff_prof@sampled). -@roff_prof@.It T -@roff_prof@Double/halve the maximum per thread backtrace cache used for heap -@roff_prof@profiling. -@roff_prof@A backtrace can only be discarded if the -@roff_prof@.Dq R -@roff_prof@option is disabled, and no thread caches currently refer to the -@roff_prof@backtrace. -@roff_prof@Therefore, a backtrace cache limit should be imposed if the -@roff_prof@intention is to limit how much memory is used by backtraces. -@roff_prof@By default, no limit is imposed. -@roff_prof@This is internally encoded as (1 << -1), and each -@roff_prof@.Dq T -@roff_prof@that is specified increments the shift amount. -@roff_prof@Therefore, e.g. -@roff_prof@.Ev JEMALLOC_OPTIONS=11T -@roff_prof@specifies a backtrace cache limit of 1024 backtraces. -@roff_prof@.It U -@roff_prof@Trigger a memory profile dump every time the total virtual memory -@roff_prof@exceeds the previous maximum. -@roff_prof@Profiles are dumped to files named according to the pattern -@roff_prof@.Pa ...u.heap , -@roff_prof@where -@roff_prof@.Pa -@roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. -@roff_prof@This option is disabled by default. -@roff_sysv@.It V -@roff_sysv@Attempting to allocate zero bytes will return a -@roff_sysv@.Dv NULL -@roff_sysv@pointer instead of a valid pointer. -@roff_sysv@(The default behavior is to make a minimal allocation and return a -@roff_sysv@pointer to it.) -@roff_sysv@This option is provided for System V compatibility. -@roff_sysv@@roff_xmalloc@This option is incompatible with the -@roff_sysv@@roff_xmalloc@.Dq X -@roff_sysv@@roff_xmalloc@option. 
-@roff_xmalloc@.It X -@roff_xmalloc@Rather than return failure for any allocation function, display a -@roff_xmalloc@diagnostic message on -@roff_xmalloc@.Dv STDERR_FILENO -@roff_xmalloc@and cause the program to drop core (using -@roff_xmalloc@.Xr abort 3 ) . -@roff_xmalloc@This option should be set at compile time by including the -@roff_xmalloc@following in the source code: -@roff_xmalloc@.Bd -literal -offset indent -@roff_xmalloc@@jemalloc_prefix@malloc_options = "X"; -@roff_xmalloc@.Ed -@roff_fill@.It Z -@roff_fill@Each byte of new memory allocated by -@roff_fill@.Fn @jemalloc_prefix@malloc -@roff_fill@or -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@will be initialized to 0. -@roff_fill@Note that this initialization only happens once for each byte, so -@roff_fill@.Fn @jemalloc_prefix@realloc -@roff_fill@calls do not zero memory that was previously allocated. -@roff_fill@This is intended for debugging and will impact performance -@roff_fill@negatively. -.El -.Pp -@roff_fill@The -@roff_fill@.Dq J -@roff_fill@and -@roff_fill@.Dq Z -@roff_fill@options are intended for testing and debugging. -@roff_fill@An application which changes its behavior when these options are used -@roff_fill@is flawed. +.Dq abort:true,narenas:1 +sets the +.Dq opt.abort +and +.Dq opt.narenas +options. +Some options have boolean values (true/false), others have integer values (base +8, 10, or 16, depending on prefix), and yet others have raw string values. .Sh IMPLEMENTATION NOTES @roff_dss@Traditionally, allocators have used @roff_dss@.Xr sbrk 2 @@ -715,7 +456,7 @@ does not make much use of the allocation functions. .Pp @roff_tcache@In addition to multiple arenas, this allocator supports @roff_tcache@thread-specific caching for small and large objects, in order to -@roff_tcache@make it possible to completely avoid synchronization for most small +@roff_tcache@make it possible to completely avoid synchronization for most @roff_tcache@allocation requests. @roff_tcache@Such caching allows very fast allocation in the common case, but it @roff_tcache@increases memory usage and fragmentation, since a bounded number of @@ -744,31 +485,37 @@ The combination of chunk alignment and chunk page maps makes it possible to determine all metadata regarding small and large allocations in constant time. .Pp Small objects are managed in groups by page runs. -Each run maintains a bitmap that tracks which regions are in use. +Each run maintains a frontier and free list to track which regions are in use. @roff_tiny@Allocation requests that are no more than half the quantum (8 or 16, @roff_tiny@depending on architecture) are rounded up to the nearest power of @roff_tiny@two. Allocation requests that are @roff_tiny@more than half the quantum, but no more than the minimum cacheline-multiple size class (see the -.Dq Q +.Dq opt.lg_qspace_max option) are rounded up to the nearest multiple of the @roff_tiny@quantum. @roff_no_tiny@quantum (8 or 16, depending on architecture). Allocation requests that are more than the minimum cacheline-multiple size class, but no more than the minimum subpage-multiple size class (see the -.Dq C +.Dq opt.lg_cspace_max option) are rounded up to the nearest multiple of the cacheline size (64). Allocation requests that are more than the minimum subpage-multiple size class, but no more than the maximum subpage-multiple size class are rounded up to the nearest multiple of the subpage size (256). 
Allocation requests that are more than the maximum subpage-multiple size class, but small enough to fit in an arena-managed chunk (see the -.Dq K +.Dq opt.lg_chunk option), are rounded up to the nearest run size. Allocation requests that are too large to fit in an arena-managed chunk are rounded up to the nearest multiple of the chunk size. .Pp +Allocations are packed tightly together, which can be an issue for +multi-threaded applications. +If you need to assure that allocations do not suffer from cacheline sharing, +round your allocation requests up to the nearest multiple of the cacheline +size, or specify cacheline alignment when allocating. +.Pp Assuming 4 MiB chunks, 4 KiB pages, and a 16 byte quantum on a 64-bit system, the size classes in each category are as follows: .TS @@ -825,12 +572,6 @@ Huge;4 MiB ;;12 MiB ;;... .TE -.Pp -Allocations are packed tightly together, which can be an issue for -multi-threaded applications. -If you need to assure that allocations do not suffer from cacheline sharing, -round your allocation requests up to the nearest multiple of the cacheline -size, or specify cacheline alignment when allocating. .Sh MALLCTL NAMESPACE The following names are defined in the namespace accessible via the .Fn @jemalloc_prefix@mallctl* @@ -845,6 +586,10 @@ introspection. @roff_stats@ equal to @roff_stats@.Dq arenas.narenas @roff_stats@can be used to access the summation of statistics from all arenas. +.Pp +Take special note of the +.Dq epoch +mallctl, which controls refreshing of cached dynamic statistics. .Bl -ohang .\"----------------------------------------------------------------------------- .It Sy "version (const char *) r-" @@ -861,48 +606,6 @@ Return the current epoch. This is useful for detecting whether another thread caused a refresh. .Ed .\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "tcache.flush (void) --" -@roff_tcache@.Bd -ragged -offset indent -compact -@roff_tcache@Flush calling thread's tcache. -@roff_tcache@This interface releases all cached objects and internal data -@roff_tcache@structures associated with the calling thread's thread-specific -@roff_tcache@cache. -@roff_tcache@Ordinarily, this interface need not be called, since automatic -@roff_tcache@periodic incremental garbage collection occurs, and the thread -@roff_tcache@cache is automatically discarded when a thread exits. -@roff_tcache@However, garbage collection is triggered by allocation activity, -@roff_tcache@so it is possible for a thread that stops allocating/deallocating -@roff_tcache@to retain its cache indefinitely, in which case the developer may -@roff_tcache@find manual flushing useful. -.Ed -.\"----------------------------------------------------------------------------- -.It Sy "thread.arena (unsigned) rw" -.Bd -ragged -offset indent -compact -Get or set the arena associated with the calling thread. -The arena index must be less than the maximum number of arenas (see the -.Dq arenas.narenas -mallctl). -If the specified arena was not initialized beforehand (see the -.Dq arenas.initialized -mallctl), it will be automatically initialized as a side effect of calling this -interface. -.Ed -.\"----------------------------------------------------------------------------- -@roff_stats@.It Sy "thread.allocated (uint64_t) r-" -@roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Get the total number of bytes ever allocated by the calling thread. 
-@roff_stats@This counter has the potential to wrap around; it is up to the -@roff_stats@application to appropriately interpret the counter in such cases. -@roff_stats@.Ed -.\"----------------------------------------------------------------------------- -@roff_stats@.It Sy "thread.deallocated (uint64_t) r-" -@roff_stats@.Bd -ragged -offset indent -compact -@roff_stats@Get the total number of bytes ever deallocated by the calling -@roff_stats@thread. -@roff_stats@This counter has the potential to wrap around; it is up to the -@roff_stats@application to appropriately interpret the counter in such cases. -@roff_stats@.Ed -.\"----------------------------------------------------------------------------- .It Sy "config.debug (bool) r-" .Bd -ragged -offset indent -compact --enable-debug was specified during build configuration. @@ -980,150 +683,384 @@ interface. .\"----------------------------------------------------------------------------- .It Sy "opt.abort (bool) r-" .Bd -ragged -offset indent -compact -See the -.Dq A -option. +Abort-on-warning enabled/disabled. +If true, most warnings are fatal. +The process will call +.Xr abort 3 +in these cases. +This option is +@roff_debug@enabled +@roff_no_debug@disabled +by default. .Ed .\"----------------------------------------------------------------------------- -@roff_fill@.It Sy "opt.junk (bool) r-" -@roff_fill@.Bd -ragged -offset indent -compact -@roff_fill@See the -@roff_fill@.Dq J -@roff_fill@option. -@roff_fill@.Ed -.\"----------------------------------------------------------------------------- -@roff_fill@.It Sy "opt.zero (bool) r-" -@roff_fill@.Bd -ragged -offset indent -compact -@roff_fill@See the -@roff_fill@.Dq Z -@roff_fill@option. -@roff_fill@.Ed -.\"----------------------------------------------------------------------------- -@roff_xmalloc@.It Sy "opt.xmalloc (bool) r-" -@roff_xmalloc@.Bd -ragged -offset indent -compact -@roff_xmalloc@See the -@roff_xmalloc@.Dq X -@roff_xmalloc@option. -@roff_xmalloc@.Ed -.\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "opt.tcache (bool) r-" -@roff_tcache@.Bd -ragged -offset indent -compact -@roff_tcache@See the -@roff_tcache@.Dq H -@roff_tcache@option. -@roff_tcache@.Ed -.\"----------------------------------------------------------------------------- -@roff_tcache@.It Sy "opt.lg_tcache_gc_sweep (ssize_t) r-" -@roff_tcache@.Bd -ragged -offset indent -compact -@roff_tcache@See the -@roff_tcache@.Dq G -@roff_tcache@option. -@roff_tcache@.Ed -.\"----------------------------------------------------------------------------- -.It Sy "opt.stats_print (bool) r-" -.Bd -ragged -offset indent -compact -See the -.Dq P -option. -.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq F -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.lg_prof_bt_max (size_t) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq B -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof_accum (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq R -@roff_prof@option. 
-@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq T -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq S -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq I -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof_udump (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq U -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- -@roff_prof@.It Sy "opt.prof_leak (bool) r-" -@roff_prof@.Bd -ragged -offset indent -compact -@roff_prof@See the -@roff_prof@.Dq L -@roff_prof@option. -@roff_prof@.Ed -.\"----------------------------------------------------------------------------- .It Sy "opt.lg_qspace_max (size_t) r-" .Bd -ragged -offset indent -compact -See the -.Dq Q -option. +Size (log base 2) of the maximum size class that is a multiple of the quantum +(8 or 16 bytes, depending on architecture). +Above this size, cacheline spacing is used for size classes. +The default value is 128 bytes (2^7). .Ed .\"----------------------------------------------------------------------------- .It Sy "opt.lg_cspace_max (size_t) r-" .Bd -ragged -offset indent -compact -See the -.Dq C -option. -.Ed -.\"----------------------------------------------------------------------------- -.It Sy "opt.lg_dirty_mult (ssize_t) r-" -.Bd -ragged -offset indent -compact -See the -.Dq D -option. +Size (log base 2) of the maximum size class that is a multiple of the cacheline +size (64). +Above this size, subpage spacing (256 bytes) is used for size classes. +The default value is 512 bytes (2^9). .Ed .\"----------------------------------------------------------------------------- .It Sy "opt.lg_chunk (size_t) r-" .Bd -ragged -offset indent -compact -See the -.Dq K -option. +Virtual memory chunk size (log base 2). +The default chunk size is 4 MiB (2^22). .Ed .\"----------------------------------------------------------------------------- +.It Sy "opt.narenas (size_t) r-" +.Bd -ragged -offset indent -compact +Maximum number of arenas to use. +The default maximum number of arenas is four times the number of CPUs, or one +if there is a single CPU. +.Ed +.\"----------------------------------------------------------------------------- +.It Sy "opt.lg_dirty_mult (ssize_t) r-" +.Bd -ragged -offset indent -compact +Per-arena minimum ratio (log base 2) of active to dirty pages. +Some dirty unused pages may be allowed to accumulate, within the limit set by +the ratio (or one chunk worth of dirty pages, whichever is greater), before +informing the kernel about some of those pages via +.Xr madvise 2 +or a similar system call. +This provides the kernel with sufficient information to recycle dirty pages if +physical memory becomes scarce and the pages remain unused. +The default minimum ratio is 32:1 (2^5:1); an option value of -1 will disable +dirty page purging. 
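As an aside, a sketch of how an application might pin this option (and others) at build time via the malloc_conf global, assuming an empty --with-jemalloc-prefix; the identical "option:value,option:value" string could instead be supplied through the MALLOC_CONF environment variable or the /etc/malloc.conf symlink.

    #include <stdlib.h>

    /* Disable dirty page purging and use a single arena. */
    const char *malloc_conf = "lg_dirty_mult:-1,narenas:1";

    int
    main(void)
    {
    	void *p = malloc(1);	/* options are parsed on first use */

    	free(p);
    	return (0);
    }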
+.Ed +.\"----------------------------------------------------------------------------- +.It Sy "opt.stats_print (bool) r-" +.Bd -ragged -offset indent -compact +Enable/disable statistics printing at exit. +If enabled, the +.Fn @jemalloc_prefix@malloc_stats_print +function is called at program exit via an +.Xr atexit 3 +function. +@roff_stats@This has the potential to cause deadlock for a multi-threaded +@roff_stats@process that exits while one or more threads are executing in the +@roff_stats@memory allocation functions. +@roff_stats@Therefore, this option should only be used with care; it is +@roff_stats@primarily intended as a performance tuning aid during application +@roff_stats@development. +This option is disabled by default. +.Ed +.\"----------------------------------------------------------------------------- +@roff_fill@.It Sy "opt.junk (bool) r-" +@roff_fill@.Bd -ragged -offset indent -compact +@roff_fill@Junk filling enabled/disabled. +@roff_fill@If enabled, each byte of uninitialized allocated memory will be +@roff_fill@initialized to 0xa5. +@roff_fill@All deallocated memory will be initialized to 0x5a. +@roff_fill@This is intended for debugging and will impact performance +@roff_fill@negatively. +@roff_fill@This option is +@roff_fill@@roff_debug@enabled +@roff_fill@@roff_no_debug@disabled +@roff_fill@by default. +@roff_fill@.Ed +.\"----------------------------------------------------------------------------- +@roff_fill@.It Sy "opt.zero (bool) r-" +@roff_fill@.Bd -ragged -offset indent -compact +@roff_fill@Zero filling enabled/disabled. +@roff_fill@If enabled, each byte of uninitialized allocated memory will be +@roff_fill@initialized to 0. +@roff_fill@Note that this initialization only happens once for each byte, so +@roff_fill@.Fn @jemalloc_prefix@realloc +@roff_fill@calls do not zero memory that was previously allocated. +@roff_fill@This is intended for debugging and will impact performance +@roff_fill@negatively. +@roff_fill@This option is disabled by default. +@roff_fill@.Ed +.\"----------------------------------------------------------------------------- +@roff_sysv@.It Sy "opt.sysv (bool) r-" +@roff_sysv@.Bd -ragged -offset indent -compact +@roff_sysv@If enabled, attempting to allocate zero bytes will return a +@roff_sysv@.Dv NULL +@roff_sysv@pointer instead of a valid pointer. +@roff_sysv@(The default behavior is to make a minimal allocation and return a +@roff_sysv@pointer to it.) +@roff_sysv@This option is provided for System V compatibility. +@roff_sysv@@roff_xmalloc@This option is incompatible with the +@roff_sysv@@roff_xmalloc@.Dq opt.xmalloc +@roff_sysv@@roff_xmalloc@option. +@roff_sysv@This option is disabled by default. +@roff_sysv@.Ed +.\"----------------------------------------------------------------------------- +@roff_xmalloc@.It Sy "opt.xmalloc (bool) r-" +@roff_xmalloc@.Bd -ragged -offset indent -compact +@roff_xmalloc@Abort-on-out-of-memory enabled/disabled. +@roff_xmalloc@If enabled, rather than returning failure for any allocation +@roff_xmalloc@function, display a diagnostic message on +@roff_xmalloc@.Dv STDERR_FILENO +@roff_xmalloc@and cause the program to drop core (using +@roff_xmalloc@.Xr abort 3 ) . 
+@roff_xmalloc@If an application is designed to depend on this behavior, set the +@roff_xmalloc@option at compile time by including the following in the source +@roff_xmalloc@code: +@roff_xmalloc@.Bd -literal -offset indent +@roff_xmalloc@@jemalloc_prefix@malloc_conf = "xmalloc:true"; +@roff_xmalloc@.Ed +@roff_xmalloc@.Pp +@roff_xmalloc@This option is disabled by default. +@roff_xmalloc@.Ed +.\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "opt.tcache (bool) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Thread-specific caching enabled/disabled. +@roff_tcache@When there are multiple threads, each thread uses a +@roff_tcache@thread-specific cache for objects up to a certain size. +@roff_tcache@Thread-specific caching allows many allocations to be satisfied +@roff_tcache@without performing any thread synchronization, at the cost of +@roff_tcache@increased memory use. +@roff_tcache@See the +@roff_tcache@.Dq opt.lg_tcache_gc_sweep +@roff_tcache@and +@roff_tcache@.Dq opt.tcache_max +@roff_tcache@options for related tuning information. +@roff_tcache@This option is enabled by default. +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "opt.lg_tcache_gc_sweep (ssize_t) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Approximate interval (log base 2) between full thread-specific +@roff_tcache@cache garbage collection sweeps, counted in terms of +@roff_tcache@thread-specific cache allocation/deallocation events. +@roff_tcache@Garbage collection is actually performed incrementally, one size +@roff_tcache@class at a time, in order to avoid large collection pauses. +@roff_tcache@The default sweep interval is 8192 (2^13); setting this option to +@roff_tcache@-1 will disable garbage collection. +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "opt.lg_tcache_max (size_t) r-" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Maximum size class (log base 2) to cache in the thread-specific +@roff_tcache@cache. +@roff_tcache@At a minimum, all small size classes are cached, and at a maximum +@roff_tcache@all large size classes are cached. +@roff_tcache@The default maximum is 32 KiB (2^15). +@roff_tcache@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Memory profiling enabled/disabled. +@roff_prof@If enabled, profile memory allocation activity, and use an +@roff_prof@.Xr atexit 3 +@roff_prof@function to dump final memory usage to a file named according to +@roff_prof@the pattern +@roff_prof@.Pa ...f.heap , +@roff_prof@where +@roff_prof@.Pa +@roff_prof@is controlled by the +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_bt_max +@roff_prof@option for backtrace depth control. +@roff_prof@See the +@roff_prof@.Dq opt.prof_active +@roff_prof@option for on-the-fly activation/deactivation. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_sample +@roff_prof@option for probabilistic sampling control. +@roff_prof@See the +@roff_prof@.Dq opt.prof_accum +@roff_prof@option for control of cumulative sample reporting. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_tcmax +@roff_prof@option for control of per thread backtrace caching. 
+@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_interval +@roff_prof@option for information on interval-triggered profile dumping, and the +@roff_prof@.Dq opt.prof_gdump +@roff_prof@option for information on high-water-triggered profile dumping. +@roff_prof@Profile output is compatible with the included pprof Perl script, +@roff_prof@which originates from the google-perftools package +@roff_prof@(http://code.google.com/p/google-perftools/). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_prefix (const char *) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Filename prefix for profile dumps. +@roff_prof@If the prefix is set to the empty string, no automatic dumps will +@roff_prof@occur; this is primarily useful for disabling the automatic final +@roff_prof@heap dump (which also disables leak reporting, if enabled). +@roff_prof@The default prefix is +@roff_prof@.Pa jeprof . +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_bt_max (size_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Maximum backtrace depth (log base 2) when profiling memory +@roff_prof@allocation activity. +@roff_prof@The default is 128 (2^7). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_active (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Profiling activated/deactivated. +@roff_prof@This is a secondary control mechanism that makes it possible to +@roff_prof@start the application with profiling enabled (see the +@roff_prof@.Dq opt.prof +@roff_prof@option) but inactive, then toggle profiling at any time during +@roff_prof@program execution with the +@roff_prof@.Dq prof.active +@roff_prof@mallctl. +@roff_prof@This option is enabled by default. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Average interval (log base 2) between allocation samples, as +@roff_prof@measured in bytes of allocation activity. +@roff_prof@Increasing the sampling interval decreases profile fidelity, but +@roff_prof@also decreases the computational overhead. +@roff_prof@The default sample interval is 1 (2^0) (i.e. all allocations are +@roff_prof@sampled). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_accum (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Reporting of cumulative object/byte counts in profile dumps +@roff_prof@enabled/disabled. +@roff_prof@If this option is enabled, every unique backtrace must be stored for +@roff_prof@the duration of execution. +@roff_prof@Depending on the application, this can impose a large memory +@roff_prof@overhead, and the cumulative counts are not always of interest. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_tcmax +@roff_prof@option for control of per thread backtrace caching, which has +@roff_prof@important interactions. +@roff_prof@This option is enabled by default. 
+@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Maximum per thread backtrace cache (log base 2) used for heap +@roff_prof@profiling. +@roff_prof@A backtrace can only be discarded if the +@roff_prof@.Dq opt.prof_accum +@roff_prof@option is disabled, and no thread caches currently refer to the +@roff_prof@backtrace. +@roff_prof@Therefore, a backtrace cache limit should be imposed if the +@roff_prof@intention is to limit how much memory is used by backtraces. +@roff_prof@By default, no limit is imposed (encoded as -1). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Average interval (log base 2) between memory profile dumps, as +@roff_prof@measured in bytes of allocation activity. +@roff_prof@The actual interval between dumps may be sporadic because +@roff_prof@decentralized allocation counters are used to avoid synchronization +@roff_prof@bottlenecks. +@roff_prof@Profiles are dumped to files named according to the pattern +@roff_prof@.Pa ...i.heap , +@roff_prof@where +@roff_prof@.Pa +@roff_prof@is controlled by the +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. +@roff_prof@By default, interval-triggered profile dumping is disabled (encoded +@roff_prof@as -1). +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_gdump (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Trigger a memory profile dump every time the total virtual memory +@roff_prof@exceeds the previous maximum. +@roff_prof@Profiles are dumped to files named according to the pattern +@roff_prof@.Pa ...u.heap , +@roff_prof@where +@roff_prof@.Pa +@roff_prof@is controlled by the +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. +@roff_prof@This option is disabled by default. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_leak (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@Leak reporting enabled/disabled. +@roff_prof@If enabled, use an +@roff_prof@.Xr atexit 3 +@roff_prof@function to report memory leaks detected by allocation sampling. +@roff_prof@See the +@roff_prof@.Dq opt.lg_prof_bt_max +@roff_prof@option for backtrace depth control. +@roff_prof@See the +@roff_prof@.Dq opt.prof +@roff_prof@option for information on analyzing heap profile output. +@roff_prof@This option is disabled by default. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- .It Sy "opt.overcommit (bool) r-" .Bd -ragged -offset indent -compact -See the -.Dq O -option. +@roff_swap@Over-commit enabled/disabled. +@roff_swap@If enabled, over-commit memory as a side effect of using anonymous +@roff_swap@.Xr mmap 2 +@roff_swap@@roff_dss@ and +@roff_swap@@roff_dss@.Xr sbrk 2 +@roff_swap@for virtual memory allocation. +@roff_swap@In order for overcommit to be disabled, the +@roff_swap@.Dq swap.fds +@roff_swap@mallctl must have been successfully written to. +@roff_swap@This option is enabled by default. 
.Ed .\"----------------------------------------------------------------------------- +@roff_tcache@.It Sy "tcache.flush (void) --" +@roff_tcache@.Bd -ragged -offset indent -compact +@roff_tcache@Flush calling thread's tcache. +@roff_tcache@This interface releases all cached objects and internal data +@roff_tcache@structures associated with the calling thread's thread-specific +@roff_tcache@cache. +@roff_tcache@Ordinarily, this interface need not be called, since automatic +@roff_tcache@periodic incremental garbage collection occurs, and the thread +@roff_tcache@cache is automatically discarded when a thread exits. +@roff_tcache@However, garbage collection is triggered by allocation activity, +@roff_tcache@so it is possible for a thread that stops allocating/deallocating +@roff_tcache@to retain its cache indefinitely, in which case the developer may +@roff_tcache@find manual flushing useful. +.Ed +.\"----------------------------------------------------------------------------- +.It Sy "thread.arena (unsigned) rw" +.Bd -ragged -offset indent -compact +Get or set the arena associated with the calling thread. +The arena index must be less than the maximum number of arenas (see the +.Dq arenas.narenas +mallctl). +If the specified arena was not initialized beforehand (see the +.Dq arenas.initialized +mallctl), it will be automatically initialized as a side effect of calling this +interface. +.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "thread.allocated (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Get the total number of bytes ever allocated by the calling thread. +@roff_stats@This counter has the potential to wrap around; it is up to the +@roff_stats@application to appropriately interpret the counter in such cases. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- +@roff_stats@.It Sy "thread.deallocated (uint64_t) r-" +@roff_stats@.Bd -ragged -offset indent -compact +@roff_stats@Get the total number of bytes ever deallocated by the calling +@roff_stats@thread. +@roff_stats@This counter has the potential to wrap around; it is up to the +@roff_stats@application to appropriately interpret the counter in such cases. +@roff_stats@.Ed +.\"----------------------------------------------------------------------------- .It Sy "arenas.narenas (unsigned) r-" .Bd -ragged -offset indent -compact Maximum number of arenas. -See the -.Dq N -option. .Ed .\"----------------------------------------------------------------------------- .It Sy "arenas.initialized (bool *) r-" @@ -1269,7 +1206,7 @@ specified. @roff_prof@.Bd -ragged -offset indent -compact @roff_prof@Control whether sampling is currently active. @roff_prof@See the -@roff_prof@.Dq E +@roff_prof@.Dq opt.prof_active @roff_prof@option for additional information. @roff_prof@.Ed .\"----------------------------------------------------------------------------- @@ -1281,8 +1218,8 @@ specified. @roff_prof@where @roff_prof@.Pa @roff_prof@is controlled by the -@roff_prof@JEMALLOC_PROF_PREFIX -@roff_prof@environment variable. +@roff_prof@.Dq opt.prof_prefix +@roff_prof@option. @roff_prof@.Ed .\"----------------------------------------------------------------------------- @roff_prof@.It Sy "prof.interval (uint64_t) r-" @@ -1290,7 +1227,7 @@ specified. @roff_prof@Average number of bytes allocated between inverval-based profile @roff_prof@dumps. 
@roff_prof@See the -@roff_prof@.Dq I +@roff_prof@.Dq opt.lg_prof_interval @roff_prof@option for additional information. @roff_prof@.Ed .\"----------------------------------------------------------------------------- @@ -1544,10 +1481,9 @@ has not been called. .\"----------------------------------------------------------------------------- .El .Sh DEBUGGING MALLOC PROBLEMS -The first thing to do is to set the -.Dq A -option. -This option forces a coredump (if possible) at the first sign of trouble, +Start by setting the +.Dq opt.abort +option, which forces a coredump (if possible) at the first sign of trouble, rather than the normal policy of trying to continue if at all possible. .Pp It is probably also a good idea to recompile the program with suitable @@ -1558,19 +1494,19 @@ options and symbols for debugger support. @roff_fill@the next section, it is likely because it depends on the storage @roff_fill@being filled with zero bytes. @roff_fill@Try running it with the -@roff_fill@.Dq Z +@roff_fill@.Dq opt.zero @roff_fill@option set; @roff_fill@if that improves the situation, this diagnosis has been confirmed. @roff_fill@If the program still misbehaves, @roff_fill@the likely problem is accessing memory outside the allocated area. @roff_fill@.Pp @roff_fill@Alternatively, if the symptoms are not easy to reproduce, setting the -@roff_fill@.Dq J +@roff_fill@.Dq opt.junk @roff_fill@option may help provoke the problem. @roff_fill@.Pp -Unfortunately this implementation does not provide much detail about -the problems it detects; the performance impact for storing such information -would be prohibitive. +This implementation does not provide much detail about the problems it detects, +because the performance impact for storing such information would be +prohibitive. There are a number of allocator implementations available on the Internet which focus on detecting and pinpointing problems by trading performance for extra sanity checks and detailed diagnostics. @@ -1580,7 +1516,7 @@ warning condition, a message will be printed to file descriptor .Dv STDERR_FILENO . Errors will result in the process dumping core. If the -.Dq A +.Dq opt.abort option is set, all warnings are treated as errors. .Pp The @@ -1736,33 +1672,24 @@ was specified, but the reallocation request could not be serviced without moving the object. .El .Sh ENVIRONMENT -The following environment variables affect the execution of the allocation +The following environment variable affects the execution of the allocation functions: -@roff_prof@.Bl -tag -width ".Ev JEMALLOC_PROF_PREFIX" -@roff_no_prof@.Bl -tag -width ".Ev JEMALLOC_OPTIONS" -.It Ev JEMALLOC_OPTIONS +.Bl -tag -width ".Ev @jemalloc_cprefix@MALLOC_CONF" +.It Ev @jemalloc_cprefix@MALLOC_CONF If the environment variable -.Ev JEMALLOC_OPTIONS -is set, the characters it contains will be interpreted as flags to the -allocation functions. -@roff_prof@.It Ev JEMALLOC_PROF_PREFIX -@roff_prof@If the environment variable -@roff_prof@.Ev JEMALLOC_PROF_PREFIX -@roff_prof@is set, use it as the filename prefix for profile dumps; otherwise -@roff_prof@use -@roff_prof@.Pa jeprof -@roff_prof@as the prefix. +.Ev @jemalloc_cprefix@MALLOC_CONF +is set, the characters it contains will be interpreted as options. 
.El .Sh EXAMPLES To dump core whenever a problem occurs: .Pp .Bd -literal -offset indent -ln -s 'A' /etc/jemalloc.conf +ln -s 'abort:true' /etc/@jemalloc_prefix@malloc.conf .Ed .Pp -To specify in the source a chunk size that is twice the default: +To specify in the source a chunk size that is 16 MiB: .Bd -literal -offset indent -@jemalloc_prefix@malloc_options = "K"; +@jemalloc_prefix@malloc_conf = "lg_chunk:24"; .Ed .Sh SEE ALSO .Xr madvise 2 , diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h index 7bbf21e0..8776ad13 100644 --- a/jemalloc/include/jemalloc/internal/ctl.h +++ b/jemalloc/include/jemalloc/internal/ctl.h @@ -82,9 +82,9 @@ bool ctl_boot(void); #define xmallctl(name, oldp, oldlenp, newp, newlen) do { \ if (JEMALLOC_P(mallctl)(name, oldp, oldlenp, newp, newlen) \ != 0) { \ - malloc_write(": Invalid xmallctl(\""); \ + malloc_write(": Failure in xmallctl(\""); \ malloc_write(name); \ - malloc_write("\", ...) call\n"); \ + malloc_write("\", ...)\n"); \ abort(); \ } \ } while (0) @@ -92,9 +92,9 @@ bool ctl_boot(void); #define xmallctlnametomib(name, mibp, miblenp) do { \ if (JEMALLOC_P(mallctlnametomib)(name, mibp, miblenp) != 0) { \ malloc_write( \ - ": Invalid xmallctlnametomib(\""); \ + ": Failure in xmallctlnametomib(\""); \ malloc_write(name); \ - malloc_write("\", ...) call\n"); \ + malloc_write("\", ...)\n"); \ abort(); \ } \ } while (0) @@ -103,7 +103,7 @@ bool ctl_boot(void); if (JEMALLOC_P(mallctlbymib)(mib, miblen, oldp, oldlenp, newp, \ newlen) != 0) { \ malloc_write( \ - ": Invalid xmallctlbymib() call\n"); \ + ": Failure in xmallctlbymib()\n"); \ abort(); \ } \ } while (0) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 99746bbd..3d253001 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -61,7 +62,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); malloc_write(": "); \ malloc_write(__FILE__); \ malloc_write(":"); \ - malloc_write(umax2s(__LINE__, 10, line_buf)); \ + malloc_write(u2s(__LINE__, 10, line_buf)); \ malloc_write(": Failed assertion: "); \ malloc_write("\""); \ malloc_write(#e); \ @@ -256,6 +257,7 @@ extern bool opt_xmalloc; #ifdef JEMALLOC_FILL extern bool opt_zero; #endif +extern size_t opt_narenas; #ifdef DYNAMIC_PAGE_SHIFT extern size_t pagesize; diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 3e85bdab..7864000b 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -9,6 +9,7 @@ typedef struct prof_ctx_s prof_ctx_t; typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ +#define PROF_PREFIX_DEFAULT "jeprof" #define LG_PROF_BT_MAX_DEFAULT 7 #define LG_PROF_SAMPLE_DEFAULT 0 #define LG_PROF_INTERVAL_DEFAULT -1 @@ -164,10 +165,11 @@ extern bool opt_prof_active; extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ -extern bool opt_prof_udump; /* High-water memory dumping. */ +extern bool opt_prof_gdump; /* High-water memory dumping. */ extern bool opt_prof_leak; /* Dump leak summary at exit. */ extern bool opt_prof_accum; /* Report cumulative bytes. 
*/ extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread bactrace cache) */ +extern char opt_prof_prefix[PATH_MAX + 1]; /* * Profile dump interval, measured in bytes allocated. Each arena triggers a @@ -215,10 +217,11 @@ void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); void prof_idump(void); bool prof_mdump(const char *filename); -void prof_udump(void); +void prof_gdump(void); prof_tdata_t *prof_tdata_init(void); void prof_boot0(void); -bool prof_boot1(void); +void prof_boot1(void); +bool prof_boot2(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h index cbf035ff..3fc2080a 100644 --- a/jemalloc/include/jemalloc/internal/stats.h +++ b/jemalloc/include/jemalloc/internal/stats.h @@ -154,7 +154,7 @@ struct chunk_stats_s { extern bool opt_stats_print; -char *umax2s(uintmax_t x, unsigned base, char *s); +char *u2s(uint64_t x, unsigned base, char *s); #ifdef JEMALLOC_STATS void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4)); diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index 168a3069..1ad91a9b 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -17,7 +17,7 @@ typedef struct tcache_s tcache_t; /* Number of cache slots for large size classes. */ #define TCACHE_NSLOTS_LARGE 20 -/* (1U << opt_lg_tcache_maxclass) is used to compute tcache_maxclass. */ +/* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. */ #define LG_TCACHE_MAXCLASS_DEFAULT 15 /* @@ -61,7 +61,7 @@ struct tcache_s { #ifdef JEMALLOC_H_EXTERNS extern bool opt_tcache; -extern ssize_t opt_lg_tcache_maxclass; +extern ssize_t opt_lg_tcache_max; extern ssize_t opt_lg_tcache_gc_sweep; /* Map of thread-specific caches. */ diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in index e3983078..4dd3981a 100644 --- a/jemalloc/include/jemalloc/jemalloc.h.in +++ b/jemalloc/include/jemalloc/jemalloc.h.in @@ -32,7 +32,7 @@ extern "C" { #define ALLOCM_ERR_OOM 1 #define ALLOCM_ERR_NOT_MOVED 2 -extern const char *JEMALLOC_P(malloc_options); +extern const char *JEMALLOC_P(malloc_conf); extern void (*JEMALLOC_P(malloc_message))(void *, const char *); void *JEMALLOC_P(malloc)(size_t size) JEMALLOC_ATTR(malloc); diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index fe35170c..54e5d943 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -13,6 +13,7 @@ * the API prefixing. */ #undef JEMALLOC_PREFIX +#undef JEMALLOC_CPREFIX #if (defined(JEMALLOC_PREFIX) && defined(JEMALLOC_MANGLE)) #undef JEMALLOC_P #endif diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index d811f658..00f425fa 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -290,7 +290,7 @@ arena_run_reg_dalloc(arena_run_t *run, void *ptr) assert((uintptr_t)ptr >= (uintptr_t)run + (uintptr_t)run->bin->reg0_offset); /* - * Freeing a pointer in the run's wilderness can cause assertion + * Freeing a pointer past in the run's frontier can cause assertion * failure. 
*/ assert((uintptr_t)ptr < (uintptr_t)run->next); @@ -2532,7 +2532,7 @@ arena_boot(void) if (nbins > 255) { char line_buf[UMAX2S_BUFSIZE]; malloc_write(": Too many small size classes ("); - malloc_write(umax2s(nbins, 10, line_buf)); + malloc_write(u2s(nbins, 10, line_buf)); malloc_write(" > max 255)\n"); abort(); } @@ -2541,7 +2541,7 @@ arena_boot(void) if (nbins > 256) { char line_buf[UMAX2S_BUFSIZE]; malloc_write(": Too many small size classes ("); - malloc_write(umax2s(nbins, 10, line_buf)); + malloc_write(u2s(nbins, 10, line_buf)); malloc_write(" > max 256)\n"); abort(); } diff --git a/jemalloc/src/chunk.c b/jemalloc/src/chunk.c index 0be24fbd..00bf50a0 100644 --- a/jemalloc/src/chunk.c +++ b/jemalloc/src/chunk.c @@ -78,7 +78,7 @@ RETURN: #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) if (ret != NULL) { # ifdef JEMALLOC_PROF - bool udump; + bool gdump; # endif malloc_mutex_lock(&chunks_mtx); # ifdef JEMALLOC_STATS @@ -88,17 +88,17 @@ RETURN: if (stats_chunks.curchunks > stats_chunks.highchunks) { stats_chunks.highchunks = stats_chunks.curchunks; # ifdef JEMALLOC_PROF - udump = true; + gdump = true; # endif } # ifdef JEMALLOC_PROF else - udump = false; + gdump = false; # endif malloc_mutex_unlock(&chunks_mtx); # ifdef JEMALLOC_PROF - if (opt_prof && opt_prof_udump && udump) - prof_udump(); + if (opt_prof && opt_prof_gdump && gdump) + prof_gdump(); # endif } #endif diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index dbc5cd42..c83ee4f1 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -62,8 +62,15 @@ CTL_PROTO(config_tiny) CTL_PROTO(config_tls) CTL_PROTO(config_xmalloc) CTL_PROTO(opt_abort) +CTL_PROTO(opt_lg_qspace_max) +CTL_PROTO(opt_lg_cspace_max) +CTL_PROTO(opt_lg_chunk) +CTL_PROTO(opt_narenas) +CTL_PROTO(opt_lg_dirty_mult) +CTL_PROTO(opt_stats_print) #ifdef JEMALLOC_FILL CTL_PROTO(opt_junk) +CTL_PROTO(opt_zero) #endif #ifdef JEMALLOC_SYSV CTL_PROTO(opt_sysv) @@ -71,29 +78,22 @@ CTL_PROTO(opt_sysv) #ifdef JEMALLOC_XMALLOC CTL_PROTO(opt_xmalloc) #endif -#ifdef JEMALLOC_ZERO -CTL_PROTO(opt_zero) -#endif #ifdef JEMALLOC_TCACHE CTL_PROTO(opt_tcache) CTL_PROTO(opt_lg_tcache_gc_sweep) #endif #ifdef JEMALLOC_PROF CTL_PROTO(opt_prof) +CTL_PROTO(opt_prof_prefix) CTL_PROTO(opt_prof_active) CTL_PROTO(opt_lg_prof_bt_max) CTL_PROTO(opt_lg_prof_sample) CTL_PROTO(opt_lg_prof_interval) -CTL_PROTO(opt_prof_udump) +CTL_PROTO(opt_prof_gdump) CTL_PROTO(opt_prof_leak) CTL_PROTO(opt_prof_accum) CTL_PROTO(opt_lg_prof_tcmax) #endif -CTL_PROTO(opt_stats_print) -CTL_PROTO(opt_lg_qspace_max) -CTL_PROTO(opt_lg_cspace_max) -CTL_PROTO(opt_lg_dirty_mult) -CTL_PROTO(opt_lg_chunk) #ifdef JEMALLOC_SWAP CTL_PROTO(opt_overcommit) #endif @@ -247,38 +247,43 @@ static const ctl_node_t config_node[] = { static const ctl_node_t opt_node[] = { {NAME("abort"), CTL(opt_abort)}, + {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, + {NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)}, + {NAME("lg_chunk"), CTL(opt_lg_chunk)}, + {NAME("narenas"), CTL(opt_narenas)}, + {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, + {NAME("stats_print"), CTL(opt_stats_print)} #ifdef JEMALLOC_FILL + , {NAME("junk"), CTL(opt_junk)}, + {NAME("zero"), CTL(opt_zero)} #endif #ifdef JEMALLOC_SYSV - {NAME("sysv"), CTL(opt_sysv)}, + , + {NAME("sysv"), CTL(opt_sysv)} #endif #ifdef JEMALLOC_XMALLOC - {NAME("xmalloc"), CTL(opt_xmalloc)}, -#endif -#ifdef JEMALLOC_ZERO - {NAME("zero"), CTL(opt_zero)}, + , + {NAME("xmalloc"), CTL(opt_xmalloc)} #endif #ifdef JEMALLOC_TCACHE + , {NAME("tcache"), CTL(opt_tcache)}, - {NAME("lg_tcache_gc_sweep"), 
CTL(opt_lg_tcache_gc_sweep)}, + {NAME("lg_tcache_gc_sweep"), CTL(opt_lg_tcache_gc_sweep)} #endif #ifdef JEMALLOC_PROF + , {NAME("prof"), CTL(opt_prof)}, + {NAME("prof_prefix"), CTL(opt_prof_prefix)}, {NAME("prof_active"), CTL(opt_prof_active)}, {NAME("lg_prof_bt_max"), CTL(opt_lg_prof_bt_max)}, {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, - {NAME("prof_udump"), CTL(opt_prof_udump)}, + {NAME("prof_gdump"), CTL(opt_prof_gdump)}, {NAME("prof_leak"), CTL(opt_prof_leak)}, {NAME("prof_accum"), CTL(opt_prof_accum)}, - {NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)}, + {NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)} #endif - {NAME("stats_print"), CTL(opt_stats_print)}, - {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, - {NAME("lg_cspace_max"), CTL(opt_lg_cspace_max)}, - {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, - {NAME("lg_chunk"), CTL(opt_lg_chunk)} #ifdef JEMALLOC_SWAP , {NAME("overcommit"), CTL(opt_overcommit)} @@ -1201,8 +1206,15 @@ CTL_RO_FALSE_GEN(config_xmalloc) /******************************************************************************/ CTL_RO_GEN(opt_abort, opt_abort, bool) +CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) +CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t) +CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t) +CTL_RO_GEN(opt_narenas, opt_narenas, size_t) +CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) +CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) #ifdef JEMALLOC_FILL CTL_RO_GEN(opt_junk, opt_junk, bool) +CTL_RO_GEN(opt_zero, opt_zero, bool) #endif #ifdef JEMALLOC_SYSV CTL_RO_GEN(opt_sysv, opt_sysv, bool) @@ -1210,29 +1222,22 @@ CTL_RO_GEN(opt_sysv, opt_sysv, bool) #ifdef JEMALLOC_XMALLOC CTL_RO_GEN(opt_xmalloc, opt_xmalloc, bool) #endif -#ifdef JEMALLOC_ZERO -CTL_RO_GEN(opt_zero, opt_zero, bool) -#endif #ifdef JEMALLOC_TCACHE CTL_RO_GEN(opt_tcache, opt_tcache, bool) CTL_RO_GEN(opt_lg_tcache_gc_sweep, opt_lg_tcache_gc_sweep, ssize_t) #endif #ifdef JEMALLOC_PROF CTL_RO_GEN(opt_prof, opt_prof, bool) +CTL_RO_GEN(opt_prof_prefix, opt_prof_prefix, const char *) CTL_RO_GEN(opt_prof_active, opt_prof_active, bool) CTL_RO_GEN(opt_lg_prof_bt_max, opt_lg_prof_bt_max, size_t) CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t) CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) -CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool) +CTL_RO_GEN(opt_prof_gdump, opt_prof_gdump, bool) CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool) CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t) #endif -CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) -CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) -CTL_RO_GEN(opt_lg_cspace_max, opt_lg_cspace_max, size_t) -CTL_RO_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) -CTL_RO_GEN(opt_lg_chunk, opt_lg_chunk, size_t) #ifdef JEMALLOC_SWAP CTL_RO_GEN(opt_overcommit, opt_overcommit, bool) #endif diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index dedf011a..012c4a8f 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -41,8 +41,7 @@ size_t lg_pagesize; unsigned ncpus; /* Runtime configuration options. 
*/ -const char *JEMALLOC_P(malloc_options) - JEMALLOC_ATTR(visibility("default")); +const char *JEMALLOC_P(malloc_conf) JEMALLOC_ATTR(visibility("default")); #ifdef JEMALLOC_DEBUG bool opt_abort = true; # ifdef JEMALLOC_FILL @@ -63,7 +62,7 @@ bool opt_xmalloc = false; #ifdef JEMALLOC_FILL bool opt_zero = false; #endif -static int opt_narenas_lshift = 0; +size_t opt_narenas = 0; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -74,6 +73,11 @@ static unsigned malloc_ncpus(void); #if (defined(JEMALLOC_STATS) && defined(NO_TLS)) static void thread_allocated_cleanup(void *arg); #endif +static bool malloc_conf_next(char const **opts_p, char const **k_p, + size_t *klen_p, char const **v_p, size_t *vlen_p); +static void malloc_conf_error(const char *msg, const char *k, size_t klen, + const char *v, size_t vlen); +static void malloc_conf_init(void); static bool malloc_init_hard(void); /******************************************************************************/ @@ -260,12 +264,323 @@ malloc_init(void) } static bool -malloc_init_hard(void) +malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, + char const **v_p, size_t *vlen_p) +{ + bool accept; + const char *opts = *opts_p; + + *k_p = opts; + + for (accept = false; accept == false;) { + switch (*opts) { + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case '_': + opts++; + break; + case ':': + opts++; + *klen_p = (uintptr_t)opts - 1 - (uintptr_t)*k_p; + *v_p = opts; + accept = true; + break; + case '\0': + if (opts != *opts_p) { + malloc_write(": Conf string " + "ends with key\n"); + } + return (true); + default: + malloc_write(": Malformed conf " + "string\n"); + return (true); + } + } + + for (accept = false; accept == false;) { + switch (*opts) { + case ',': + opts++; + /* + * Look ahead one character here, because the + * next time this function is called, it will + * assume that end of input has been cleanly + * reached if no input remains, but we have + * optimistically already consumed the comma if + * one exists. 
+ */ + if (*opts == '\0') { + malloc_write(": Conf string " + "ends with comma\n"); + } + *vlen_p = (uintptr_t)opts - 1 - (uintptr_t)*v_p; + accept = true; + break; + case '\0': + *vlen_p = (uintptr_t)opts - (uintptr_t)*v_p; + accept = true; + break; + default: + opts++; + break; + } + } + + *opts_p = opts; + return (false); +} + +static void +malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v, + size_t vlen) +{ + char buf[PATH_MAX + 1]; + + malloc_write(": "); + malloc_write(msg); + malloc_write(": "); + memcpy(buf, k, klen); + memcpy(&buf[klen], ":", 1); + memcpy(&buf[klen+1], v, vlen); + buf[klen+1+vlen] = '\0'; + malloc_write(buf); + malloc_write("\n"); +} + +static void +malloc_conf_init(void) { unsigned i; - int linklen; char buf[PATH_MAX + 1]; - const char *opts; + const char *opts, *k, *v; + size_t klen, vlen; + + for (i = 0; i < 3; i++) { + /* Get runtime configuration. */ + switch (i) { + case 0: + if (JEMALLOC_P(malloc_conf) != NULL) { + /* + * Use options that were compiled into the + * program. + */ + opts = JEMALLOC_P(malloc_conf); + } else { + /* No configuration specified. */ + buf[0] = '\0'; + opts = buf; + } + break; + case 1: { + int linklen; + const char *linkname = +#ifdef JEMALLOC_PREFIX + "/etc/"JEMALLOC_PREFIX"malloc.conf" +#else + "/etc/malloc.conf" +#endif + ; + + if ((linklen = readlink(linkname, buf, + sizeof(buf) - 1)) != -1) { + /* + * Use the contents of the "/etc/malloc.conf" + * symbolic link's name. + */ + buf[linklen] = '\0'; + opts = buf; + } else { + /* No configuration specified. */ + buf[0] = '\0'; + opts = buf; + } + break; + } + case 2: { + const char *envname = +#ifdef JEMALLOC_PREFIX + JEMALLOC_CPREFIX"MALLOC_CONF" +#else + "MALLOC_CONF" +#endif + ; + + if ((opts = getenv(envname)) != NULL) { + /* + * Do nothing; opts is already initialized to + * the value of the JEMALLOC_OPTIONS + * environment variable. + */ + } else { + /* No configuration specified. 
*/ + buf[0] = '\0'; + opts = buf; + } + break; + } + default: + /* NOTREACHED */ + assert(false); + buf[0] = '\0'; + opts = buf; + } + + while (*opts != '\0' && malloc_conf_next(&opts, &k, &klen, &v, + &vlen) == false) { +#define CONF_HANDLE_BOOL(n) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + if (strncmp("true", v, vlen) == 0 && \ + vlen == sizeof("true")-1) \ + opt_##n = true; \ + else if (strncmp("false", v, vlen) == \ + 0 && vlen == sizeof("false")-1) \ + opt_##n = false; \ + else { \ + malloc_conf_error( \ + "Invalid conf value", \ + k, klen, v, vlen); \ + } \ + continue; \ + } +#define CONF_HANDLE_SIZE_T(n, min, max) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + unsigned long ul; \ + char *end; \ + \ + errno = 0; \ + ul = strtoul(v, &end, 0); \ + if (errno != 0 || (uintptr_t)end - \ + (uintptr_t)v != vlen) { \ + malloc_conf_error( \ + "Invalid conf value", \ + k, klen, v, vlen); \ + } else if (ul < min || ul > max) { \ + malloc_conf_error( \ + "Out-of-range conf value", \ + k, klen, v, vlen); \ + } else \ + opt_##n = ul; \ + continue; \ + } +#define CONF_HANDLE_SSIZE_T(n, min, max) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + long l; \ + char *end; \ + \ + errno = 0; \ + l = strtol(v, &end, 0); \ + if (errno != 0 || (uintptr_t)end - \ + (uintptr_t)v != vlen) { \ + malloc_conf_error( \ + "Invalid conf value", \ + k, klen, v, vlen); \ + } else if (l < (ssize_t)min || l > \ + (ssize_t)max) { \ + malloc_conf_error( \ + "Out-of-range conf value", \ + k, klen, v, vlen); \ + } else \ + opt_##n = l; \ + continue; \ + } +#define CONF_HANDLE_CHAR_P(n, d) \ + if (sizeof(#n)-1 == klen && strncmp(#n, k, \ + klen) == 0) { \ + size_t cpylen = (vlen <= \ + sizeof(opt_##n)-1) ? vlen : \ + sizeof(opt_##n)-1; \ + strncpy(opt_##n, v, cpylen); \ + opt_##n[cpylen] = '\0'; \ + continue; \ + } + + CONF_HANDLE_BOOL(abort) + CONF_HANDLE_SIZE_T(lg_qspace_max, LG_QUANTUM, + PAGE_SHIFT-1) + CONF_HANDLE_SIZE_T(lg_cspace_max, LG_QUANTUM, + PAGE_SHIFT-1) + /* + * Chunks always require at least one * header page, + * plus one data page. + */ + CONF_HANDLE_SIZE_T(lg_chunk, PAGE_SHIFT+1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_SIZE_T(narenas, 1, SIZE_T_MAX) + CONF_HANDLE_SSIZE_T(lg_dirty_mult, -1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_BOOL(stats_print) +#ifdef JEMALLOC_FILL + CONF_HANDLE_BOOL(junk) + CONF_HANDLE_BOOL(zero) +#endif +#ifdef JEMALLOC_SYSV + CONF_HANDLE_BOOL(sysv) +#endif +#ifdef JEMALLOC_XMALLOC + CONF_HANDLE_BOOL(xmalloc) +#endif +#ifdef JEMALLOC_TCACHE + CONF_HANDLE_BOOL(tcache) + CONF_HANDLE_SSIZE_T(lg_tcache_gc_sweep, -1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_SSIZE_T(lg_tcache_max, -1, + (sizeof(size_t) << 3) - 1) +#endif +#ifdef JEMALLOC_PROF + CONF_HANDLE_BOOL(prof) + CONF_HANDLE_CHAR_P(prof_prefix, "jeprof") + CONF_HANDLE_SIZE_T(lg_prof_bt_max, 0, LG_PROF_BT_MAX) + CONF_HANDLE_BOOL(prof_active) + CONF_HANDLE_SSIZE_T(lg_prof_sample, 0, + (sizeof(uint64_t) << 3) - 1) + CONF_HANDLE_BOOL(prof_accum) + CONF_HANDLE_SSIZE_T(lg_prof_tcmax, -1, + (sizeof(size_t) << 3) - 1) + CONF_HANDLE_SSIZE_T(lg_prof_interval, -1, + (sizeof(uint64_t) << 3) - 1) + CONF_HANDLE_BOOL(prof_gdump) + CONF_HANDLE_BOOL(prof_leak) +#endif +#ifdef JEMALLOC_SWAP + CONF_HANDLE_BOOL(overcommit) +#endif + malloc_conf_error("Invalid conf pair", k, klen, v, + vlen); +#undef CONF_HANDLE_BOOL +#undef CONF_HANDLE_SIZE_T +#undef CONF_HANDLE_SSIZE_T +#undef CONF_HANDLE_CHAR_P + } + + /* Validate configuration of options that are inter-related. 
*/ + if (opt_lg_qspace_max+1 >= opt_lg_cspace_max) { + malloc_write(": Invalid lg_[qc]space_max " + "relationship; restoring defaults\n"); + opt_lg_qspace_max = LG_QSPACE_MAX_DEFAULT; + opt_lg_cspace_max = LG_CSPACE_MAX_DEFAULT; + } + } +} + +static bool +malloc_init_hard(void) +{ arena_t *init_arenas[1]; malloc_mutex_lock(&init_lock); @@ -308,302 +623,9 @@ malloc_init_hard(void) } #endif - for (i = 0; i < 3; i++) { - unsigned j; + prof_boot0(); - /* Get runtime configuration. */ - switch (i) { - case 0: - if ((linklen = readlink("/etc/jemalloc.conf", buf, - sizeof(buf) - 1)) != -1) { - /* - * Use the contents of the "/etc/jemalloc.conf" - * symbolic link's name. - */ - buf[linklen] = '\0'; - opts = buf; - } else { - /* No configuration specified. */ - buf[0] = '\0'; - opts = buf; - } - break; - case 1: - if ((opts = getenv("JEMALLOC_OPTIONS")) != NULL) { - /* - * Do nothing; opts is already initialized to - * the value of the JEMALLOC_OPTIONS - * environment variable. - */ - } else { - /* No configuration specified. */ - buf[0] = '\0'; - opts = buf; - } - break; - case 2: - if (JEMALLOC_P(malloc_options) != NULL) { - /* - * Use options that were compiled into the - * program. - */ - opts = JEMALLOC_P(malloc_options); - } else { - /* No configuration specified. */ - buf[0] = '\0'; - opts = buf; - } - break; - default: - /* NOTREACHED */ - assert(false); - buf[0] = '\0'; - opts = buf; - } - - for (j = 0; opts[j] != '\0'; j++) { - unsigned k, nreps; - bool nseen; - - /* Parse repetition count, if any. */ - for (nreps = 0, nseen = false;; j++, nseen = true) { - switch (opts[j]) { - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': - nreps *= 10; - nreps += opts[j] - '0'; - break; - default: - goto MALLOC_OUT; - } - } -MALLOC_OUT: - if (nseen == false) - nreps = 1; - - for (k = 0; k < nreps; k++) { - switch (opts[j]) { - case 'a': - opt_abort = false; - break; - case 'A': - opt_abort = true; - break; -#ifdef JEMALLOC_PROF - case 'b': - if (opt_lg_prof_bt_max > 0) - opt_lg_prof_bt_max--; - break; - case 'B': - if (opt_lg_prof_bt_max < LG_PROF_BT_MAX) - opt_lg_prof_bt_max++; - break; -#endif - case 'c': - if (opt_lg_cspace_max - 1 > - opt_lg_qspace_max && - opt_lg_cspace_max > - LG_CACHELINE) - opt_lg_cspace_max--; - break; - case 'C': - if (opt_lg_cspace_max < PAGE_SHIFT - - 1) - opt_lg_cspace_max++; - break; - case 'd': - if (opt_lg_dirty_mult + 1 < - (sizeof(size_t) << 3)) - opt_lg_dirty_mult++; - break; - case 'D': - if (opt_lg_dirty_mult >= 0) - opt_lg_dirty_mult--; - break; -#ifdef JEMALLOC_PROF - case 'e': - opt_prof_active = false; - break; - case 'E': - opt_prof_active = true; - break; - case 'f': - opt_prof = false; - break; - case 'F': - opt_prof = true; - break; -#endif -#ifdef JEMALLOC_TCACHE - case 'g': - if (opt_lg_tcache_gc_sweep >= 0) - opt_lg_tcache_gc_sweep--; - break; - case 'G': - if (opt_lg_tcache_gc_sweep + 1 < - (sizeof(size_t) << 3)) - opt_lg_tcache_gc_sweep++; - break; - case 'h': - opt_tcache = false; - break; - case 'H': - opt_tcache = true; - break; -#endif -#ifdef JEMALLOC_PROF - case 'i': - if (opt_lg_prof_interval >= 0) - opt_lg_prof_interval--; - break; - case 'I': - if (opt_lg_prof_interval + 1 < - (sizeof(uint64_t) << 3)) - opt_lg_prof_interval++; - break; -#endif -#ifdef JEMALLOC_FILL - case 'j': - opt_junk = false; - break; - case 'J': - opt_junk = true; - break; -#endif - case 'k': - /* - * Chunks always require at least one - * header page, plus one data page. 
- */ - if ((1U << (opt_lg_chunk - 1)) >= - (2U << PAGE_SHIFT)) - opt_lg_chunk--; - break; - case 'K': - if (opt_lg_chunk + 1 < - (sizeof(size_t) << 3)) - opt_lg_chunk++; - break; -#ifdef JEMALLOC_PROF - case 'l': - opt_prof_leak = false; - break; - case 'L': - opt_prof_leak = true; - break; -#endif -#ifdef JEMALLOC_TCACHE - case 'm': - if (opt_lg_tcache_maxclass >= 0) - opt_lg_tcache_maxclass--; - break; - case 'M': - if (opt_lg_tcache_maxclass + 1 < - (sizeof(size_t) << 3)) - opt_lg_tcache_maxclass++; - break; -#endif - case 'n': - opt_narenas_lshift--; - break; - case 'N': - opt_narenas_lshift++; - break; -#ifdef JEMALLOC_SWAP - case 'o': - opt_overcommit = false; - break; - case 'O': - opt_overcommit = true; - break; -#endif - case 'p': - opt_stats_print = false; - break; - case 'P': - opt_stats_print = true; - break; - case 'q': - if (opt_lg_qspace_max > LG_QUANTUM) - opt_lg_qspace_max--; - break; - case 'Q': - if (opt_lg_qspace_max + 1 < - opt_lg_cspace_max) - opt_lg_qspace_max++; - break; -#ifdef JEMALLOC_PROF - case 'r': - opt_prof_accum = false; - break; - case 'R': - opt_prof_accum = true; - break; - case 's': - if (opt_lg_prof_sample > 0) - opt_lg_prof_sample--; - break; - case 'S': - if (opt_lg_prof_sample + 1 < - (sizeof(uint64_t) << 3)) - opt_lg_prof_sample++; - break; - case 't': - if (opt_lg_prof_tcmax >= 0) - opt_lg_prof_tcmax--; - break; - case 'T': - if (opt_lg_prof_tcmax + 1 < - (sizeof(size_t) << 3)) - opt_lg_prof_tcmax++; - break; - case 'u': - opt_prof_udump = false; - break; - case 'U': - opt_prof_udump = true; - break; -#endif -#ifdef JEMALLOC_SYSV - case 'v': - opt_sysv = false; - break; - case 'V': - opt_sysv = true; - break; -#endif -#ifdef JEMALLOC_XMALLOC - case 'x': - opt_xmalloc = false; - break; - case 'X': - opt_xmalloc = true; - break; -#endif -#ifdef JEMALLOC_FILL - case 'z': - opt_zero = false; - break; - case 'Z': - opt_zero = true; - break; -#endif - default: { - char cbuf[2]; - - cbuf[0] = opts[j]; - cbuf[1] = '\0'; - malloc_write( - ": Unsupported character " - "in malloc options: '"); - malloc_write(cbuf); - malloc_write("'\n"); - } - } - } - } - } + malloc_conf_init(); /* Register fork handlers. */ if (pthread_atfork(jemalloc_prefork, jemalloc_postfork, @@ -638,7 +660,7 @@ MALLOC_OUT: } #ifdef JEMALLOC_PROF - prof_boot0(); + prof_boot1(); #endif if (arena_boot()) { @@ -692,7 +714,7 @@ MALLOC_OUT: malloc_mutex_init(&arenas_lock); #ifdef JEMALLOC_PROF - if (prof_boot1()) { + if (prof_boot2()) { malloc_mutex_unlock(&init_lock); return (true); } @@ -704,31 +726,29 @@ MALLOC_OUT: ncpus = malloc_ncpus(); malloc_mutex_lock(&init_lock); - if (ncpus > 1) { + if (opt_narenas == 0) { /* * For SMP systems, create more than one arena per CPU by * default. */ - opt_narenas_lshift += 2; + if (ncpus > 1) + opt_narenas = ncpus << 2; + else + opt_narenas = 1; } + narenas = opt_narenas; + /* + * Make sure that the arenas array can be allocated. In practice, this + * limit is enough to allow the allocator to function, but the ctl + * machinery will fail to allocate memory at far lower limits. + */ + if (narenas > chunksize / sizeof(arena_t *)) { + char buf[UMAX2S_BUFSIZE]; - /* Determine how many arenas to use. */ - narenas = ncpus; - if (opt_narenas_lshift > 0) { - if ((narenas << opt_narenas_lshift) > narenas) - narenas <<= opt_narenas_lshift; - /* - * Make sure not to exceed the limits of what base_alloc() can - * handle. 
- */ - if (narenas * sizeof(arena_t *) > chunksize) - narenas = chunksize / sizeof(arena_t *); - } else if (opt_narenas_lshift < 0) { - if ((narenas >> -opt_narenas_lshift) < narenas) - narenas >>= -opt_narenas_lshift; - /* Make sure there is at least one arena. */ - if (narenas == 0) - narenas = 1; + narenas = chunksize / sizeof(arena_t *); + malloc_write(": Reducing narenas to limit ("); + malloc_write(u2s(narenas, 10, buf)); + malloc_write(")\n"); } next_arena = (narenas > 0) ? 1 : 0; diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index fb0e7659..84ce1ba0 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -20,10 +20,11 @@ bool opt_prof_active = true; size_t opt_lg_prof_bt_max = LG_PROF_BT_MAX_DEFAULT; size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; -bool opt_prof_udump = false; +bool opt_prof_gdump = false; bool opt_prof_leak = false; bool opt_prof_accum = true; ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; +char opt_prof_prefix[PATH_MAX + 1]; uint64_t prof_interval; bool prof_promote; @@ -64,7 +65,7 @@ static bool prof_booted = false; static malloc_mutex_t enq_mtx; static bool enq; static bool enq_idump; -static bool enq_udump; +static bool enq_gdump; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -150,7 +151,7 @@ prof_enter(void) static inline void prof_leave(void) { - bool idump, udump; + bool idump, gdump; malloc_mutex_unlock(&bt2ctx_mtx); @@ -158,14 +159,14 @@ prof_leave(void) enq = false; idump = enq_idump; enq_idump = false; - udump = enq_udump; - enq_udump = false; + gdump = enq_gdump; + enq_gdump = false; malloc_mutex_unlock(&enq_mtx); if (idump) prof_idump(); - if (udump) - prof_udump(); + if (gdump) + prof_gdump(); } #ifdef JEMALLOC_PROF_LIBGCC @@ -681,22 +682,22 @@ prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err) return (false); } - if (prof_write(umax2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err) + if (prof_write(u2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_summed.curbytes, 10, buf), + || prof_write(u2s(ctx->cnt_summed.curbytes, 10, buf), propagate_err) || prof_write(" [", propagate_err) - || prof_write(umax2s(ctx->cnt_summed.accumobjs, 10, buf), + || prof_write(u2s(ctx->cnt_summed.accumobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_summed.accumbytes, 10, buf), + || prof_write(u2s(ctx->cnt_summed.accumbytes, 10, buf), propagate_err) || prof_write("] @", propagate_err)) return (true); for (i = 0; i < bt->len; i++) { if (prof_write(" 0x", propagate_err) - || prof_write(umax2s((uintptr_t)bt->vec[i], 16, buf), + || prof_write(u2s((uintptr_t)bt->vec[i], 16, buf), propagate_err)) return (true); } @@ -725,7 +726,7 @@ prof_dump_maps(bool propagate_err) memcpy(&mpath[i], s, slen); i += slen; - s = umax2s(getpid(), 10, buf); + s = u2s(getpid(), 10, buf); slen = strlen(s); memcpy(&mpath[i], s, slen); i += slen; @@ -799,13 +800,13 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) /* Dump profile header. 
*/ if (prof_write("heap profile: ", propagate_err) - || prof_write(umax2s(cnt_all.curobjs, 10, buf), propagate_err) + || prof_write(u2s(cnt_all.curobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(cnt_all.curbytes, 10, buf), propagate_err) + || prof_write(u2s(cnt_all.curbytes, 10, buf), propagate_err) || prof_write(" [", propagate_err) - || prof_write(umax2s(cnt_all.accumobjs, 10, buf), propagate_err) + || prof_write(u2s(cnt_all.accumobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(cnt_all.accumbytes, 10, buf), propagate_err)) + || prof_write(u2s(cnt_all.accumbytes, 10, buf), propagate_err)) goto ERROR; if (opt_lg_prof_sample == 0) { @@ -813,7 +814,7 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) goto ERROR; } else { if (prof_write("] @ heap_v2/", propagate_err) - || prof_write(umax2s((uint64_t)1U << opt_lg_prof_sample, 10, + || prof_write(u2s((uint64_t)1U << opt_lg_prof_sample, 10, buf), propagate_err) || prof_write("\n", propagate_err)) goto ERROR; @@ -837,12 +838,12 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) if (leakcheck && cnt_all.curbytes != 0) { malloc_write(": Leak summary: "); - malloc_write(umax2s(cnt_all.curbytes, 10, buf)); + malloc_write(u2s(cnt_all.curbytes, 10, buf)); malloc_write((cnt_all.curbytes != 1) ? " bytes, " : " byte, "); - malloc_write(umax2s(cnt_all.curobjs, 10, buf)); + malloc_write(u2s(cnt_all.curobjs, 10, buf)); malloc_write((cnt_all.curobjs != 1) ? " objects, " : " object, "); - malloc_write(umax2s(leak_nctx, 10, buf)); + malloc_write(u2s(leak_nctx, 10, buf)); malloc_write((leak_nctx != 1) ? " contexts\n" : " context\n"); malloc_write(": Run pprof on \""); malloc_write(filename); @@ -872,31 +873,11 @@ prof_dump_filename(char *filename, char v, int64_t vseq) * Construct a filename of the form: * * ...v.heap\0 - * or - * jeprof...v.heap\0 */ i = 0; - /* - * Use JEMALLOC_PROF_PREFIX if it's set, and if it is short enough to - * avoid overflowing DUMP_FILENAME_BUFSIZE. The result may exceed - * PATH_MAX, but creat(2) will catch that problem. 
- */ - if ((s = getenv("JEMALLOC_PROF_PREFIX")) != NULL - && strlen(s) + (DUMP_FILENAME_BUFSIZE - PATH_MAX) <= PATH_MAX) { - slen = strlen(s); - memcpy(&filename[i], s, slen); - i += slen; - - s = "."; - } else - s = "jeprof."; - slen = strlen(s); - memcpy(&filename[i], s, slen); - i += slen; - - s = umax2s(getpid(), 10, buf); + s = opt_prof_prefix; slen = strlen(s); memcpy(&filename[i], s, slen); i += slen; @@ -906,7 +887,17 @@ prof_dump_filename(char *filename, char v, int64_t vseq) memcpy(&filename[i], s, slen); i += slen; - s = umax2s(prof_dump_seq, 10, buf); + s = u2s(getpid(), 10, buf); + slen = strlen(s); + memcpy(&filename[i], s, slen); + i += slen; + + s = "."; + slen = strlen(s); + memcpy(&filename[i], s, slen); + i += slen; + + s = u2s(prof_dump_seq, 10, buf); prof_dump_seq++; slen = strlen(s); memcpy(&filename[i], s, slen); @@ -921,7 +912,7 @@ prof_dump_filename(char *filename, char v, int64_t vseq) i++; if (vseq != 0xffffffffffffffffLLU) { - s = umax2s(vseq, 10, buf); + s = u2s(vseq, 10, buf); slen = strlen(s); memcpy(&filename[i], s, slen); i += slen; @@ -943,10 +934,12 @@ prof_fdump(void) if (prof_booted == false) return; - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'f', 0xffffffffffffffffLLU); - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(filename, opt_prof_leak, false); + if (opt_prof_prefix[0] != '\0') { + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'f', 0xffffffffffffffffLLU); + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(filename, opt_prof_leak, false); + } } void @@ -964,11 +957,13 @@ prof_idump(void) } malloc_mutex_unlock(&enq_mtx); - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'i', prof_dump_iseq); - prof_dump_iseq++; - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(filename, false, false); + if (opt_prof_prefix[0] != '\0') { + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'i', prof_dump_iseq); + prof_dump_iseq++; + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(filename, false, false); + } } bool @@ -981,6 +976,8 @@ prof_mdump(const char *filename) if (filename == NULL) { /* No filename specified, so automatically generate one. */ + if (opt_prof_prefix[0] == '\0') + return (true); malloc_mutex_lock(&prof_dump_seq_mtx); prof_dump_filename(filename_buf, 'm', prof_dump_mseq); prof_dump_mseq++; @@ -991,7 +988,7 @@ prof_mdump(const char *filename) } void -prof_udump(void) +prof_gdump(void) { char filename[DUMP_FILENAME_BUFSIZE]; @@ -999,17 +996,19 @@ prof_udump(void) return; malloc_mutex_lock(&enq_mtx); if (enq) { - enq_udump = true; + enq_gdump = true; malloc_mutex_unlock(&enq_mtx); return; } malloc_mutex_unlock(&enq_mtx); - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'u', prof_dump_useq); - prof_dump_useq++; - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(filename, false, false); + if (opt_prof_prefix[0] != '\0') { + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'u', prof_dump_useq); + prof_dump_useq++; + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(filename, false, false); + } } static void @@ -1120,6 +1119,14 @@ prof_tdata_cleanup(void *arg) void prof_boot0(void) +{ + + memcpy(opt_prof_prefix, PROF_PREFIX_DEFAULT, + sizeof(PROF_PREFIX_DEFAULT)); +} + +void +prof_boot1(void) { /* @@ -1133,7 +1140,7 @@ prof_boot0(void) * automatically dumped. 
*/ opt_prof = true; - opt_prof_udump = false; + opt_prof_gdump = false; prof_interval = 0; } else if (opt_prof) { if (opt_lg_prof_interval >= 0) { @@ -1147,7 +1154,7 @@ prof_boot0(void) } bool -prof_boot1(void) +prof_boot2(void) { if (opt_prof) { @@ -1171,7 +1178,7 @@ prof_boot1(void) return (true); enq = false; enq_idump = false; - enq_udump = false; + enq_gdump = false; if (atexit(prof_fdump) != 0) { malloc_write(": Error in atexit()\n"); diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 9b3271b2..3dfe0d23 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -57,12 +57,12 @@ static void stats_arena_print(void (*write_cb)(void *, const char *), /* * We don't want to depend on vsnprintf() for production builds, since that can - * cause unnecessary bloat for static binaries. umax2s() provides minimal - * integer printing functionality, so that malloc_printf() use can be limited to + * cause unnecessary bloat for static binaries. u2s() provides minimal integer + * printing functionality, so that malloc_printf() use can be limited to * JEMALLOC_STATS code. */ char * -umax2s(uintmax_t x, unsigned base, char *s) +u2s(uint64_t x, unsigned base, char *s) { unsigned i; @@ -72,8 +72,8 @@ umax2s(uintmax_t x, unsigned base, char *s) case 10: do { i--; - s[i] = "0123456789"[x % 10]; - x /= 10; + s[i] = "0123456789"[x % (uint64_t)10]; + x /= (uint64_t)10; } while (x > 0); break; case 16: @@ -86,8 +86,9 @@ umax2s(uintmax_t x, unsigned base, char *s) default: do { i--; - s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x % base]; - x /= base; + s[i] = "0123456789abcdefghijklmnopqrstuvwxyz"[x % + (uint64_t)base]; + x /= (uint64_t)base; } while (x > 0); } @@ -374,6 +375,7 @@ void stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts) { + int err; uint64_t epoch; size_t u64sz; char s[UMAX2S_BUFSIZE]; @@ -383,10 +385,27 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bool bins = true; bool large = true; - /* Refresh stats, in case mallctl() was called by the application. */ + /* + * Refresh stats, in case mallctl() was called by the application. + * + * Check for OOM here, since refreshing the ctl cache can trigger + * allocation. In practice, none of the subsequent mallctl()-related + * calls in this function will cause OOM if this one succeeds. + * */ epoch = 1; u64sz = sizeof(uint64_t); - xmallctl("epoch", &epoch, &u64sz, &epoch, sizeof(uint64_t)); + err = JEMALLOC_P(mallctl)("epoch", &epoch, &u64sz, &epoch, + sizeof(uint64_t)); + if (err != 0) { + if (err == EAGAIN) { + malloc_write(": Memory allocation failure in " + "mallctl(\"epoch\", ...)\n"); + return; + } + malloc_write(": Failure in mallctl(\"epoch\", " + "...)\n"); + abort(); + } if (write_cb == NULL) { /* @@ -430,10 +449,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bool bv; unsigned uv; ssize_t ssv; - size_t sv, bsz, ssz; + size_t sv, bsz, ssz, sssz, cpsz; bsz = sizeof(bool); ssz = sizeof(size_t); + sssz = sizeof(ssize_t); + cpsz = sizeof(const char *); CTL_GET("version", &cpv, const char *); write_cb(cbopaque, "Version: "); @@ -444,116 +465,140 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, bv ? "enabled" : "disabled"); write_cb(cbopaque, "\n"); - write_cb(cbopaque, "Boolean JEMALLOC_OPTIONS: "); - if ((err = JEMALLOC_P(mallctl)("opt.abort", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? 
"A" : "a"); - if ((err = JEMALLOC_P(mallctl)("prof.active", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "E" : "e"); - if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "F" : "f"); - if ((err = JEMALLOC_P(mallctl)("opt.tcache", &bv, &bsz, NULL, - 0)) == 0) - write_cb(cbopaque, bv ? "H" : "h"); - if ((err = JEMALLOC_P(mallctl)("opt.junk", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "J" : "j"); - if ((err = JEMALLOC_P(mallctl)("opt.prof_leak", &bv, &bsz, NULL, - 0)) == 0) - write_cb(cbopaque, bv ? "L" : "l"); - if ((err = JEMALLOC_P(mallctl)("opt.overcommit", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "O" : "o"); - if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "P" : "p"); - if ((err = JEMALLOC_P(mallctl)("opt.prof_accum", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "R" : "r"); - if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz, - NULL, 0)) == 0) - write_cb(cbopaque, bv ? "U" : "u"); - if ((err = JEMALLOC_P(mallctl)("opt.sysv", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "V" : "v"); - if ((err = JEMALLOC_P(mallctl)("opt.xmalloc", &bv, &bsz, NULL, - 0)) == 0) - write_cb(cbopaque, bv ? "X" : "x"); - if ((err = JEMALLOC_P(mallctl)("opt.zero", &bv, &bsz, NULL, 0)) - == 0) - write_cb(cbopaque, bv ? "Z" : "z"); - write_cb(cbopaque, "\n"); +#define OPT_WRITE_BOOL(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &bv, &bsz, \ + NULL, 0)) == 0) { \ + write_cb(cbopaque, " opt."#n": "); \ + write_cb(cbopaque, bv ? "true" : "false"); \ + write_cb(cbopaque, "\n"); \ + } +#define OPT_WRITE_SIZE_T(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &sv, &ssz, \ + NULL, 0)) == 0) { \ + write_cb(cbopaque, " opt."#n": "); \ + write_cb(cbopaque, u2s(sv, 10, s)); \ + write_cb(cbopaque, "\n"); \ + } +#define OPT_WRITE_SSIZE_T(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &ssv, &sssz, \ + NULL, 0)) == 0) { \ + if (ssv >= 0) { \ + write_cb(cbopaque, " opt."#n": "); \ + write_cb(cbopaque, u2s(ssv, 10, s)); \ + } else { \ + write_cb(cbopaque, " opt."#n": -"); \ + write_cb(cbopaque, u2s(-ssv, 10, s)); \ + } \ + write_cb(cbopaque, "\n"); \ + } +#define OPT_WRITE_CHAR_P(n) \ + if ((err = JEMALLOC_P(mallctl)("opt."#n, &cpv, &cpsz, \ + NULL, 0)) == 0) { \ + write_cb(cbopaque, " opt."#n": \""); \ + write_cb(cbopaque, cpv); \ + write_cb(cbopaque, "\"\n"); \ + } + + write_cb(cbopaque, "Run-time option settings:\n"); + OPT_WRITE_BOOL(abort) + OPT_WRITE_SIZE_T(lg_qspace_max) + OPT_WRITE_SIZE_T(lg_cspace_max) + OPT_WRITE_SIZE_T(lg_chunk) + OPT_WRITE_SIZE_T(narenas) + OPT_WRITE_SSIZE_T(lg_dirty_mult) + OPT_WRITE_BOOL(stats_print) + OPT_WRITE_BOOL(junk) + OPT_WRITE_BOOL(zero) + OPT_WRITE_BOOL(sysv) + OPT_WRITE_BOOL(xmalloc) + OPT_WRITE_BOOL(tcache) + OPT_WRITE_SSIZE_T(lg_tcache_gc_sweep) + OPT_WRITE_SSIZE_T(lg_tcache_max) + OPT_WRITE_BOOL(prof) + OPT_WRITE_CHAR_P(prof_prefix) + OPT_WRITE_SIZE_T(lg_prof_bt_max) + OPT_WRITE_BOOL(prof_active) + OPT_WRITE_SSIZE_T(lg_prof_sample) + OPT_WRITE_BOOL(prof_accum) + OPT_WRITE_SSIZE_T(lg_prof_tcmax) + OPT_WRITE_SSIZE_T(lg_prof_interval) + OPT_WRITE_BOOL(prof_gdump) + OPT_WRITE_BOOL(prof_leak) + OPT_WRITE_BOOL(overcommit) + +#undef OPT_WRITE_BOOL +#undef OPT_WRITE_SIZE_T +#undef OPT_WRITE_SSIZE_T +#undef OPT_WRITE_CHAR_P write_cb(cbopaque, "CPUs: "); - write_cb(cbopaque, umax2s(ncpus, 10, s)); + write_cb(cbopaque, u2s(ncpus, 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.narenas", &uv, unsigned); write_cb(cbopaque, "Max 
arenas: "); - write_cb(cbopaque, umax2s(uv, 10, s)); + write_cb(cbopaque, u2s(uv, 10, s)); write_cb(cbopaque, "\n"); write_cb(cbopaque, "Pointer size: "); - write_cb(cbopaque, umax2s(sizeof(void *), 10, s)); + write_cb(cbopaque, u2s(sizeof(void *), 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.quantum", &sv, size_t); write_cb(cbopaque, "Quantum size: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.cacheline", &sv, size_t); write_cb(cbopaque, "Cacheline size (assumed): "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); CTL_GET("arenas.subpage", &sv, size_t); write_cb(cbopaque, "Subpage spacing: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); if ((err = JEMALLOC_P(mallctl)("arenas.tspace_min", &sv, &ssz, NULL, 0)) == 0) { write_cb(cbopaque, "Tiny 2^n-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.tspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); } CTL_GET("arenas.qspace_min", &sv, size_t); write_cb(cbopaque, "Quantum-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.qspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); CTL_GET("arenas.cspace_min", &sv, size_t); write_cb(cbopaque, "Cacheline-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.cspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); CTL_GET("arenas.sspace_min", &sv, size_t); write_cb(cbopaque, "Subpage-spaced sizes: ["); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ".."); CTL_GET("arenas.sspace_max", &sv, size_t); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "]\n"); CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t); if (ssv >= 0) { write_cb(cbopaque, "Min active:dirty page ratio per arena: "); - write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, u2s((1U << ssv), 10, s)); write_cb(cbopaque, ":1\n"); } else { write_cb(cbopaque, @@ -563,7 +608,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, &ssz, NULL, 0)) == 0) { write_cb(cbopaque, "Maximum thread-cached size class: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, "\n"); } if ((err = JEMALLOC_P(mallctl)("opt.lg_tcache_gc_sweep", &ssv, @@ -573,50 +618,51 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("opt.tcache", &tcache_enabled, bool); write_cb(cbopaque, "Thread cache GC sweep interval: "); write_cb(cbopaque, tcache_enabled && ssv >= 0 ? 
- umax2s(tcache_gc_sweep, 10, s) : "N/A"); + u2s(tcache_gc_sweep, 10, s) : "N/A"); write_cb(cbopaque, "\n"); } if ((err = JEMALLOC_P(mallctl)("opt.prof", &bv, &bsz, NULL, 0)) == 0 && bv) { CTL_GET("opt.lg_prof_bt_max", &sv, size_t); write_cb(cbopaque, "Maximum profile backtrace depth: "); - write_cb(cbopaque, umax2s((1U << sv), 10, s)); + write_cb(cbopaque, u2s((1U << sv), 10, s)); write_cb(cbopaque, "\n"); CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t); write_cb(cbopaque, "Maximum per thread backtrace cache: "); if (ssv >= 0) { - write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, u2s((1U << ssv), 10, s)); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(ssv, 10, s)); + write_cb(cbopaque, u2s(ssv, 10, s)); write_cb(cbopaque, ")\n"); } else write_cb(cbopaque, "N/A\n"); CTL_GET("opt.lg_prof_sample", &sv, size_t); write_cb(cbopaque, "Average profile sample interval: "); - write_cb(cbopaque, umax2s((1U << sv), 10, s)); + write_cb(cbopaque, u2s((((uint64_t)1U) << sv), 10, s)); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ")\n"); CTL_GET("opt.lg_prof_interval", &ssv, ssize_t); write_cb(cbopaque, "Average profile dump interval: "); if (ssv >= 0) { - write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, u2s((((uint64_t)1U) << ssv), + 10, s)); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(ssv, 10, s)); + write_cb(cbopaque, u2s(ssv, 10, s)); write_cb(cbopaque, ")\n"); } else write_cb(cbopaque, "N/A\n"); } CTL_GET("arenas.chunksize", &sv, size_t); write_cb(cbopaque, "Chunk size: "); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); CTL_GET("opt.lg_chunk", &sv, size_t); write_cb(cbopaque, " (2^"); - write_cb(cbopaque, umax2s(sv, 10, s)); + write_cb(cbopaque, u2s(sv, 10, s)); write_cb(cbopaque, ")\n"); } diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 3fb8f2bd..cbbe7a11 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -5,7 +5,7 @@ /* Data. */ bool opt_tcache = true; -ssize_t opt_lg_tcache_maxclass = LG_TCACHE_MAXCLASS_DEFAULT; +ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; /* Map of thread-specific caches. */ @@ -384,16 +384,16 @@ tcache_boot(void) if (opt_tcache) { /* - * If necessary, clamp opt_lg_tcache_maxclass, now that + * If necessary, clamp opt_lg_tcache_max, now that * small_maxclass and arena_maxclass are known. 
*/ - if (opt_lg_tcache_maxclass < 0 || (1U << - opt_lg_tcache_maxclass) < small_maxclass) + if (opt_lg_tcache_max < 0 || (1U << + opt_lg_tcache_max) < small_maxclass) tcache_maxclass = small_maxclass; - else if ((1U << opt_lg_tcache_maxclass) > arena_maxclass) + else if ((1U << opt_lg_tcache_max) > arena_maxclass) tcache_maxclass = arena_maxclass; else - tcache_maxclass = (1U << opt_lg_tcache_maxclass); + tcache_maxclass = (1U << opt_lg_tcache_max); nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); diff --git a/jemalloc/test/rallocm.c b/jemalloc/test/rallocm.c index 7e8a271c..a8cadebc 100644 --- a/jemalloc/test/rallocm.c +++ b/jemalloc/test/rallocm.c @@ -14,14 +14,14 @@ main(void) fprintf(stderr, "Test begin\n"); - r = allocm(&p, &sz, 42, 0); + r = JEMALLOC_P(allocm)(&p, &sz, 42, 0); if (r != ALLOCM_SUCCESS) { fprintf(stderr, "Unexpected allocm() error\n"); abort(); } q = p; - r = rallocm(&q, &tsz, sz, 0, ALLOCM_NO_MOVE); + r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 0, ALLOCM_NO_MOVE); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (q != p) @@ -32,7 +32,7 @@ main(void) } q = p; - r = rallocm(&q, &tsz, sz, 5, ALLOCM_NO_MOVE); + r = JEMALLOC_P(rallocm)(&q, &tsz, sz, 5, ALLOCM_NO_MOVE); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (q != p) @@ -43,7 +43,7 @@ main(void) } q = p; - r = rallocm(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE); + r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE); if (r != ALLOCM_ERR_NOT_MOVED) fprintf(stderr, "Unexpected rallocm() result\n"); if (q != p) @@ -54,7 +54,7 @@ main(void) } q = p; - r = rallocm(&q, &tsz, sz + 5, 0, 0); + r = JEMALLOC_P(rallocm)(&q, &tsz, sz + 5, 0, 0); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (q == p) @@ -66,7 +66,7 @@ main(void) p = q; sz = tsz; - r = rallocm(&q, &tsz, 8192, 0, 0); + r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, 0); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (q == p) @@ -78,7 +78,7 @@ main(void) p = q; sz = tsz; - r = rallocm(&q, &tsz, 16384, 0, 0); + r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, 0); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (tsz == sz) { @@ -88,7 +88,7 @@ main(void) p = q; sz = tsz; - r = rallocm(&q, &tsz, 8192, 0, ALLOCM_NO_MOVE); + r = JEMALLOC_P(rallocm)(&q, &tsz, 8192, 0, ALLOCM_NO_MOVE); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (q != p) @@ -99,7 +99,7 @@ main(void) } sz = tsz; - r = rallocm(&q, &tsz, 16384, 0, ALLOCM_NO_MOVE); + r = JEMALLOC_P(rallocm)(&q, &tsz, 16384, 0, ALLOCM_NO_MOVE); if (r != ALLOCM_SUCCESS) fprintf(stderr, "Unexpected rallocm() error\n"); if (q != p) @@ -110,7 +110,7 @@ main(void) } sz = tsz; - dallocm(p, 0); + JEMALLOC_P(dallocm)(p, 0); fprintf(stderr, "Test end\n"); return (0); From 49d0293c885aa2b946ad2b1e2c0cc481b68f5604 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 23 Oct 2010 23:43:37 -0700 Subject: [PATCH 42/48] Add missing #ifdef JEMALLOC_PROF. Only call prof_boot0() if profiling is enabled. 
--- jemalloc/src/jemalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 012c4a8f..2aebc51d 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -623,7 +623,9 @@ malloc_init_hard(void) } #endif +#ifdef JEMALLOC_PROF prof_boot0(); +#endif malloc_conf_init(); From 8da141f47a5f8548fa07693e295b82f34346a67b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 24 Oct 2010 11:34:50 -0700 Subject: [PATCH 43/48] Re-format size class table. Use a more compact layout for the size class table in the man page. This avoids layout glitches due to approaching the single-page table size limit. --- jemalloc/doc/jemalloc.3.in | 68 ++++++++++---------------------------- 1 file changed, 17 insertions(+), 51 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 1557ecbd..535b2e84 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -516,62 +516,28 @@ If you need to assure that allocations do not suffer from cacheline sharing, round your allocation requests up to the nearest multiple of the cacheline size, or specify cacheline alignment when allocating. .Pp -Assuming 4 MiB chunks, 4 KiB pages, and a 16 byte quantum on a 64-bit system, +Assuming 4 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit system, the size classes in each category are as follows: +.\"----------------------------------------------------------------------------- .TS -expand allbox tab(;); -LLR -LLR -^LR -^^R -^^R -^^R -^^R -^LR -^^R -^^R -^^R -^^R -^LR -^^R -^^R -^^R -^^R -LsR -^^R -^^R -^^R -^^R -LsR -^^R. +allbox tab(;); +LLL +LLL +^LL +^LL +^LL +LsL +LsL. Category;Subcategory;Size -@roff_tiny@Small;Tiny;8 +@roff_tiny@Small;Tiny;[8] @roff_no_tiny@Small;Tiny;[disabled] -;Quantum-spaced;16 -;;32 -;;48 -;;... -;;128 -;Cacheline-spaced;192 -;;256 -;;320 -;;... -;;512 -;Sub-page;760 -;;1024 -;;1280 -;;... -;;3840 -Large;4 KiB -;;8 KiB -;;12 KiB -;;... -;;4084 KiB -Huge;4 MiB -;;8 MiB -;;12 MiB -;;... +;Quantum-spaced;[16, 32, 48, ..., 128] +;Cacheline-spaced;[192, 256, 320, ..., 512] +;Sub-page-spaced;[768, 1024, 1280, ..., 3840] +Large;[4 KiB, 8 KiB, 12 KiB, ..., 4072 KiB] +Huge;[4 MiB, 8 MiB, 12 MiB, ...] .TE +.\"----------------------------------------------------------------------------- .Sh MALLCTL NAMESPACE The following names are defined in the namespace accessible via the .Fn @jemalloc_prefix@mallctl* From 0d38791e7abf6af1edfcbbfaa6a68641938665d5 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 24 Oct 2010 12:51:38 -0700 Subject: [PATCH 44/48] Edit manpage. Make various minor edits to the manpage. --- jemalloc/doc/jemalloc.3.in | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 535b2e84..62866642 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -38,8 +38,8 @@ .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD: head/lib/libc/stdlib/malloc.3 182225 2008-08-27 02:00:53Z jasone $ .\" -.Dd October 23, 2010 -.Dt JEMALLOC 3 +.Dd October 24, 2010 +.Dt jemalloc 3 .Os .Sh NAME .Nm @jemalloc_prefix@malloc , @@ -59,6 +59,12 @@ .Nd general purpose memory allocation functions .Sh LIBRARY .Sy libjemalloc@install_suffix@ +.Pp +This manual describes jemalloc @jemalloc_version@. +More information can be found at the +.UR http://\:www.canonware.com/\:jemalloc/ +jemalloc website +.UE . 
.Sh SYNOPSIS .In stdlib.h .In jemalloc/jemalloc@install_suffix@.h @@ -849,8 +855,10 @@ This option is disabled by default. @roff_prof@.Dq opt.prof_gdump @roff_prof@option for information on high-water-triggered profile dumping. @roff_prof@Profile output is compatible with the included pprof Perl script, -@roff_prof@which originates from the google-perftools package -@roff_prof@(http://code.google.com/p/google-perftools/). +@roff_prof@which originates from the +@roff_prof@.UR http://\:code.google.com/\:p/\:google-perftools/ +@roff_prof@google-perftools package +@roff_prof@.UE . @roff_prof@.Ed .\"----------------------------------------------------------------------------- @roff_prof@.It Sy "opt.prof_prefix (const char *) r-" @@ -1457,8 +1465,8 @@ options and symbols for debugger support. .Pp @roff_fill@If the program starts to give unusual results, coredump or generally @roff_fill@behave differently without emitting any of the messages mentioned in -@roff_fill@the next section, it is likely because it depends on the storage -@roff_fill@being filled with zero bytes. +@roff_fill@the next section, it is likely because the program depends on the +@roff_fill@storage being filled with zero bytes. @roff_fill@Try running it with the @roff_fill@.Dq opt.zero @roff_fill@option set; @@ -1483,7 +1491,7 @@ warning condition, a message will be printed to file descriptor Errors will result in the process dumping core. If the .Dq opt.abort -option is set, all warnings are treated as errors. +option is set, most warnings are treated as errors. .Pp The .Va @jemalloc_prefix@malloc_message @@ -1611,8 +1619,8 @@ The and .Fn @jemalloc_prefix@rallocm functions will fail if: -.Bl -tag -width ".Dv ALLOCM_ERR_OOM" -.It ALLOCM_ERR_OOM +.Bl -tag -width ".Bq Er ALLOCM_ERR_OOM" +.It Bq Er ALLOCM_ERR_OOM Out of memory. Insufficient contiguous memory was available to service the allocation request. The @@ -1631,8 +1639,8 @@ unmodified. The .Fn @jemalloc_prefix@rallocm function will also fail if: -.Bl -tag -width ".Dv ALLOCM_ERR_NOT_MOVED" -.It ALLOCM_ERR_NOT_MOVED +.Bl -tag -width ".Bq Er ALLOCM_ERR_NOT_MOVED" +.It Bq Er ALLOCM_ERR_NOT_MOVED .Dv ALLOCM_NO_MOVE was specified, but the reallocation request could not be serviced without moving the object. From ce93055c49a5cd0f32792bbaeeaa0fb4b788f1ee Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 24 Oct 2010 13:03:07 -0700 Subject: [PATCH 45/48] Use madvise(..., MADV_FREE) on OS X. Use madvise(..., MADV_FREE) rather than msync(..., MS_KILLPAGES) on OS X, since it works for at least OS X 10.5 and 10.6. --- jemalloc/configure.ac | 2 +- jemalloc/include/jemalloc/jemalloc_defs.h.in | 9 +++------ jemalloc/src/arena.c | 3 --- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index b613cb1b..0ed13739 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -166,7 +166,7 @@ case "${host}" in *-*-darwin*) CFLAGS="$CFLAGS -fno-common -no-cpp-precomp" abi="macho" - AC_DEFINE([JEMALLOC_PURGE_MSYNC_KILLPAGES]) + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE]) RPATH="" ;; *-*-freebsd*) diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 54e5d943..b8f3f36b 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -121,15 +121,12 @@ * madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages, * such that new pages will be demand-zeroed if * the address region is later touched. 
- * madvise(..., MADV_FREE) : On FreeBSD, this marks pages as being unused, - * such that they will be discarded rather than - * swapped out. - * msync(..., MS_KILLPAGES) : On Darwin, this behaves similarly to - * madvise(..., MADV_FREE) on FreeBSD. + * madvise(..., MADV_FREE) : On FreeBSD and Darwin, this marks pages as being + * unused, such that they will be discarded rather + * than swapped out. */ #undef JEMALLOC_PURGE_MADVISE_DONTNEED #undef JEMALLOC_PURGE_MADVISE_FREE -#undef JEMALLOC_PURGE_MSYNC_KILLPAGES /* sizeof(void *) == 2^LG_SIZEOF_PTR. */ #undef LG_SIZEOF_PTR diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 00f425fa..3d4f8888 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -800,9 +800,6 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) #elif defined(JEMALLOC_PURGE_MADVISE_FREE) madvise((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), (npages << PAGE_SHIFT), MADV_FREE); -#elif defined(JEMALLOC_PURGE_MSYNC_KILLPAGES) - msync((void *)((uintptr_t)chunk + (pageind << PAGE_SHIFT)), - (npages << PAGE_SHIFT), MS_KILLPAGES); #else # error "No method defined for purging unused dirty pages." #endif From 379f847f446b84b8f43fdda306dd20a60024d331 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 24 Oct 2010 16:18:29 -0700 Subject: [PATCH 46/48] Add ChangeLog. Add ChangeLog, which briefly summarizes releases. Edit README and INSTALL. --- jemalloc/ChangeLog | 130 +++++++++++++++++++++++++++++++++++++++++++++ jemalloc/INSTALL | 27 +++++----- jemalloc/README | 10 ++-- 3 files changed, 149 insertions(+), 18 deletions(-) create mode 100644 jemalloc/ChangeLog diff --git a/jemalloc/ChangeLog b/jemalloc/ChangeLog new file mode 100644 index 00000000..290dea15 --- /dev/null +++ b/jemalloc/ChangeLog @@ -0,0 +1,130 @@ +Following are change highlights associated with official releases. Important +bug fixes are all mentioned, but internal enhancements are omitted here for +brevity (even though they are more fun to write about). Much more detail can be +found in the git revision history: + + http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git + git://canonware.com/jemalloc.git + +* 2.0.0 + + This version focuses on the experimental *allocm() API, and on improved + run-time configuration/introspection. Nonetheless, numerous performance + improvements are also included. + + New features: + - Implement the experimental {,r,s,d}allocm() API, which provides a superset + of the functionality available via malloc(), calloc(), posix_memalign(), + realloc(), malloc_usable_size(), and free(). These functions can be used + to allocate/reallocate aligned zeroed memory, ask for optional extra + memory during reallocation, prevent object movement during reallocation, + etc. + - Replace JEMALLOC_OPTIONS/JEMALLOC_PROF_PREFIX with MALLOC_CONF, which is + more human-readable, and more flexible. For example: + JEMALLOC_OPTIONS=AJP + is now: + MALLOC_CONF=abort:true,fill:true,stats_print:true + - Port to Apple OS X. Sponsored by Mozilla. + - Make it possible for the application to control thread-->arena mappings + via the "thread.arena" mallctl. + - Add compile-time support for all TLS-related functionality via pthreads + TSD. This is mainly of interest for OS X, which does not support TLS, but + has a TSD implementation with similar performance. + - Override memalign() and valloc() if they are provided by the system. + - Add the "arenas.purge" mallctl, which can be used to synchronously purge + all dirty unused pages. 
+  - Make cumulative heap profiling data optional, so that it is possible to
+    limit the amount of memory consumed by heap profiling data structures.
+  - Add per thread allocation counters that can be accessed via the
+    "thread.allocated" and "thread.deallocated" mallctls.
+
+  Incompatible changes:
+  - Remove JEMALLOC_OPTIONS and malloc_options (see MALLOC_CONF above).
+  - Increase default backtrace depth from 4 to 128 for heap profiling.
+  - Disable interval-based profile dumps by default.
+
+  Bug fixes:
+  - Remove bad assertions in fork handler functions. These assertions could
+    cause aborts for some combinations of configure settings.
+  - Fix strerror_r() usage to deal with non-standard semantics in GNU libc.
+  - Fix leak context reporting. This bug tended to cause the number of contexts
+    to be underreported (though the reported number of objects and bytes were
+    correct).
+  - Fix a realloc() bug for large in-place growing reallocation. This bug could
+    cause memory corruption, but it was hard to trigger.
+  - Fix an allocation bug for small allocations that could be triggered if
+    multiple threads raced to create a new run of backing pages.
+  - Enhance the heap profiler to trigger samples based on usable size, rather
+    than request size.
+  - Fix a heap profiling bug due to sometimes losing track of requested object
+    size for sampled objects.
+
+* 1.0.3
+
+  Bug fixes:
+  - Fix the libunwind-based implementation of stack backtracing (used for heap
+    profiling). This bug could cause zero-length backtraces to be reported.
+  - Add a missing mutex unlock in library initialization code. If multiple
+    threads raced to initialize malloc, some of them could end up permanently
+    blocked.
+
+* 1.0.2
+
+  Bug fixes:
+  - Fix junk filling of large objects, which could cause memory corruption.
+  - Add MAP_NORESERVE support for chunk mapping, because otherwise virtual
+    memory limits could cause swap file configuration to fail. Contributed by
+    Jordan DeLong.
+
+* 1.0.1
+
+  Bug fixes:
+  - Fix compilation when --enable-fill is specified.
+  - Fix threads-related profiling bugs that affected accuracy and caused memory
+    to be leaked during thread exit.
+  - Fix dirty page purging race conditions that could cause crashes.
+  - Fix crash in tcache flushing code during thread destruction.
+
+* 1.0.0
+
+  This release focuses on speed and run-time introspection. Numerous
+  algorithmic improvements make this release substantially faster than its
+  predecessors.
+
+  New features:
+  - Implement autoconf-based configuration system.
+  - Add mallctl*(), for the purposes of introspection and run-time
+    configuration.
+  - Make it possible for the application to manually flush a thread's cache, via
+    the "tcache.flush" mallctl.
+  - Base maximum dirty page count on proportion of active memory.
+  - Compute various additional run-time statistics, including per size class
+    statistics for large objects.
+  - Expose malloc_stats_print(), which can be called repeatedly by the
+    application.
+  - Simplify the malloc_message() signature to only take one string argument,
+    and incorporate an opaque data pointer argument for use by the application
+    in combination with malloc_stats_print().
+  - Add support for allocation backed by one or more swap files, and allow the
+    application to disable over-commit if swap files are in use.
+  - Implement allocation profiling and leak checking.
+
+  Removed features:
+  - Remove the dynamic arena rebalancing code, since thread-specific caching
+    reduces its utility.
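[Editorial note, not part of the ChangeLog: the mallctl*() and statistics interfaces mentioned in the feature lists above can be exercised with a short program along these lines. The sketch assumes an unprefixed build whose header installs as <jemalloc/jemalloc.h>; the "thread.arena" and "thread.allocated" names come from the entries above, while their exact types (unsigned and uint64_t here) are assumptions to check against the generated manpage.]

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        unsigned arena;
        uint64_t allocated;
        size_t sz;
        void *p;

        /* Which arena is the calling thread currently associated with? ("thread.arena") */
        sz = sizeof(arena);
        if (mallctl("thread.arena", &arena, &sz, NULL, 0) == 0)
            printf("thread.arena: %u\n", arena);

        p = malloc(4096);

        /* Cumulative bytes allocated by this thread ("thread.allocated"). */
        sz = sizeof(allocated);
        if (mallctl("thread.allocated", &allocated, &sz, NULL, 0) == 0)
            printf("thread.allocated: %llu\n", (unsigned long long)allocated);

        free(p);

        /* Full statistics report; only informative if built with --enable-stats. */
        malloc_stats_print(NULL, NULL, NULL);
        return (0);
    }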
+
+  Bug fixes:
+  - Modify chunk allocation to work when address space layout randomization
+    (ASLR) is in use.
+  - Fix thread cleanup bugs related to TLS destruction.
+  - Handle 0-size allocation requests in posix_memalign().
+  - Fix a chunk leak. The leaked chunks were never touched, so this impacted
+    virtual memory usage, but not physical memory usage.
+
+* linux_20080828a, linux_20080827a
+
+  These snapshot releases are the simple result of incorporating Linux-specific
+  support into the FreeBSD malloc sources.
+
+--------------------------------------------------------------------------------
+vim:filetype=text:textwidth=80
diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL
index c5697c6d..d30f093f 100644
--- a/jemalloc/INSTALL
+++ b/jemalloc/INSTALL
@@ -35,9 +35,8 @@ any of the following arguments (not a definitive list) to 'configure':
       /etc/malloc.conf --> /etc/prefix_malloc.conf
       MALLOC_CONF --> PREFIX_MALLOC_CONF
 
-    This makes it possible to use jemalloc at the same time as the
-    system allocator, or even to use multiple copies of jemalloc
-    simultaneously.
+    This makes it possible to use jemalloc at the same time as the system
+    allocator, or even to use multiple copies of jemalloc simultaneously.
 
     By default, the prefix is "", except on OS X, where it is "je_". On OS X,
     jemalloc overlays the default malloc zone, but makes no attempt to actually
@@ -58,12 +57,12 @@ any of the following arguments (not a definitive list) to 'configure':
     performance hit, but is very useful during application development.
 
 --enable-stats
-    Enable statistics gathering functionality. Use the 'P' option to print
-    detailed allocation statistics at exit.
+    Enable statistics gathering functionality. See the "opt.stats_print"
+    option documentation for usage details.
 
 --enable-prof
-    Enable heap profiling and leak detection functionality. Use the 'B', 'E',
-    'F', 'I', 'L', and 'U' options to control these features.
+    Enable heap profiling and leak detection functionality. See the "opt.prof"
+    option documentation for usage details.
 
 --disable-prof-libgcc
     Disable the use of libgcc's backtracing functionality. Ordinarily, libgcc's
@@ -89,8 +88,8 @@ any of the following arguments (not a definitive list) to 'configure':
 
 --disable-tcache
     Disable thread-specific caches for small objects. Objects are cached and
-    released in bulk, thus reducing the total number of mutex operations. Use
-    the 'H', 'G', and 'M' options to control thread-specific caching.
+    released in bulk, thus reducing the total number of mutex operations. See
+    the "opt.tcache" option for usage details.
 
 --enable-swap
     Enable mmap()ed swap file support. When this feature is built in, it is
@@ -102,18 +101,18 @@ any of the following arguments (not a definitive list) to 'configure':
     mmap(2).
 
 --enable-fill
-    Enable support for junk/zero filling of memory. Use the 'J' option to
-    control junk filling, or the 'Z' option to control zero filling.
+    Enable support for junk/zero filling of memory. See the "opt.junk"/
+    "opt.zero" option documentation for usage details.
 
 --enable-xmalloc
     Enable support for optional immediate termination due to out-of-memory
     errors, as is commonly implemented by "xmalloc" wrapper function for malloc.
-    Use the 'X' option to control termination behavior.
+    See the "opt.xmalloc" option documentation for usage details.
 
 --enable-sysv
     Enable support for System V semantics, wherein malloc(0) returns NULL
-    rather than a minimal allocation. Use the 'V' option to control System V
-    compatibility.
+    rather than a minimal allocation. 
See the "opt.sysv" option documentation + for usage details. --enable-dynamic-page-shift Under most conditions, the system page size never changes (usually 4KiB or diff --git a/jemalloc/README b/jemalloc/README index 2ff36ef6..4d7b552b 100644 --- a/jemalloc/README +++ b/jemalloc/README @@ -1,9 +1,9 @@ jemalloc is a general-purpose scalable concurrent malloc(3) implementation. This distribution is a stand-alone "portable" implementation that currently -targets only Linux. jemalloc is included as the default allocator in the -FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox web -browser on Microsoft Windows-related platforms. Depending on your needs, one -of the other divergent versions may suit your needs better than this +targets Linux and Apple OS X. jemalloc is included as the default allocator in +the FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox +web browser on Microsoft Windows-related platforms. Depending on your needs, +one of the other divergent versions may suit your needs better than this distribution. The COPYING file contains copyright and licensing information. @@ -11,4 +11,6 @@ The COPYING file contains copyright and licensing information. The INSTALL file contains information on how to configure, build, and install jemalloc. +The ChangeLog file contains a brief summary of changes for each release. + URL: http://www.canonware.com/jemalloc/ From 0176e3057d37e65f01cf5afe688b9fd45729fe7b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 24 Oct 2010 16:32:13 -0700 Subject: [PATCH 47/48] Bump library version number. --- jemalloc/Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index ca807fda..46eddf46 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -33,7 +33,7 @@ else SO := so WL_SONAME := soname endif -REV := 0 +REV := 1 ifeq (macho, @abi@) TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH=@objroot@lib else From 3af83344a54f6c6051e532188586d1a07474c068 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 24 Oct 2010 16:48:52 -0700 Subject: [PATCH 48/48] Document groff commands for manpage formatting. Document how to format the manpage for the terminal, pdf, and html. --- jemalloc/INSTALL | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index d30f093f..fafd7883 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -233,6 +233,10 @@ directory, issue configuration and build commands: === Documentation ============================================================== The manual page that the configure script generates can be manually formatted -prior to installation via the following command: +prior to installation via any of the following commands: - nroff -man -t doc/jemalloc.3 + nroff -man -man-ext -t doc/jemalloc.3 + + groff -man -man-ext -t -Tps doc/jemalloc.3 | ps2pdf - doc/jemalloc.3.pdf + + (cd doc; groff -man -man-ext -t -Thtml jemalloc.3 > jemalloc.3.html)
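[Editorial note, not part of any patch: the experimental *allocm() API referenced in the 2.0.0 ChangeLog entry and in the manpage error-handling edits above might be used roughly as follows. The prototypes and the ALLOCM_ZERO/ALLOCM_SUCCESS names are assumptions based on the 2.0.0-era interface; ALLOCM_NO_MOVE and ALLOCM_ERR_NOT_MOVED are taken from the manpage text above. An unprefixed build and the <jemalloc/jemalloc.h> header are assumed.]

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        void *p;
        size_t usable;
        int r;

        /* Allocate 100 zeroed bytes; "usable" receives the actual usable size. */
        if (allocm(&p, &usable, 100, ALLOCM_ZERO) != ALLOCM_SUCCESS) {
            fprintf(stderr, "allocm() failed\n");
            return (1);
        }
        printf("requested 100 bytes, usable size %zu\n", usable);

        /* Try to grow to 8192 bytes without letting the object move. */
        r = rallocm(&p, &usable, 8192, 0, ALLOCM_NO_MOVE);
        if (r == ALLOCM_ERR_NOT_MOVED)
            printf("in-place growth not possible; object left in place\n");
        else if (r == ALLOCM_SUCCESS)
            printf("grew in place to %zu usable bytes\n", usable);

        dallocm(p, 0);
        return (0);
    }

[If the request cannot be satisfied in place, the object is left where it is, which is the ALLOCM_ERR_NOT_MOVED case described in the ERRORS section above.]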