diff --git a/Makefile.in b/Makefile.in
index b211f889..2f3fea1e 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -129,6 +129,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \
$(srcroot)src/pa.c \
$(srcroot)src/pa_extra.c \
$(srcroot)src/pages.c \
+ $(srcroot)src/peak_event.c \
$(srcroot)src/prng.c \
$(srcroot)src/prof.c \
$(srcroot)src/prof_data.c \
diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in
index 1baf1f6a..5ab84568 100644
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
@@ -1621,6 +1621,42 @@ malloc_conf = "xmalloc:true";]]>
should not be modified by the application.
+
+
+ thread.peak.read
+ (uint64_t)
+ r-
+ []
+
+ Get an approximation of the maximum value of the
+ difference between the number of bytes allocated and the number of bytes
+ deallocated by the calling thread since the last call to thread.peak.reset,
+ or since the thread's creation if it has not called thread.peak.reset.
+ No guarantees are made about the quality of the approximation, but
+ jemalloc currently endeavors to maintain accuracy to within one hundred
+ kilobytes.
+
+
+
+
+
+ thread.peak.reset
+ (void)
+ --
+ []
+
+ Resets the counter for net bytes allocated in the calling
+ thread to zero. This affects subsequent calls to thread.peak.read,
+ but not the values returned by thread.allocated
+ or thread.deallocated.
+
+
+
thread.tcache.enabled
diff --git a/include/jemalloc/internal/peak_event.h b/include/jemalloc/internal/peak_event.h
new file mode 100644
index 00000000..b808ce04
--- /dev/null
+++ b/include/jemalloc/internal/peak_event.h
@@ -0,0 +1,24 @@
+#ifndef JEMALLOC_INTERNAL_PEAK_EVENT_H
+#define JEMALLOC_INTERNAL_PEAK_EVENT_H
+
+/*
+ * While peak.h contains the simple helper struct that tracks state, this
+ * contains the allocator tie-ins (and knows about tsd, the event module, etc.).
+ */
+
+/* Update the peak with current tsd state. */
+void peak_event_update(tsd_t *tsd);
+/* Set current state to zero. */
+void peak_event_zero(tsd_t *tsd);
+uint64_t peak_event_max(tsd_t *tsd);
+
+/* Manual hooks. */
+/* The activity-triggered hooks. */
+uint64_t peak_alloc_new_event_wait(tsd_t *tsd);
+uint64_t peak_alloc_postponed_event_wait(tsd_t *tsd);
+void peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed);
+uint64_t peak_dalloc_new_event_wait(tsd_t *tsd);
+uint64_t peak_dalloc_postponed_event_wait(tsd_t *tsd);
+void peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed);
+
+#endif /* JEMALLOC_INTERNAL_PEAK_EVENT_H */
diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h
index 2fcaa88a..bca8a447 100644
--- a/include/jemalloc/internal/thread_event.h
+++ b/include/jemalloc/internal/thread_event.h
@@ -53,10 +53,12 @@ void tsd_te_init(tsd_t *tsd);
* E(event, (condition), is_alloc_event)
*/
#define ITERATE_OVER_ALL_EVENTS \
- E(tcache_gc, (opt_tcache_gc_incr_bytes > 0), true) \
- E(prof_sample, (config_prof && opt_prof), true) \
- E(stats_interval, (opt_stats_interval >= 0), true) \
- E(tcache_gc_dalloc, (opt_tcache_gc_incr_bytes > 0), false)
+ E(tcache_gc, (opt_tcache_gc_incr_bytes > 0), true) \
+ E(prof_sample, (config_prof && opt_prof), true) \
+ E(stats_interval, (opt_stats_interval >= 0), true) \
+ E(tcache_gc_dalloc, (opt_tcache_gc_incr_bytes > 0), false) \
+ E(peak_alloc, config_stats, true) \
+ E(peak_dalloc, config_stats, false)
#define E(event, condition_unused, is_alloc_event_unused) \
C(event##_event_wait)
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 18bdb8fd..9408b2ca 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -5,6 +5,7 @@
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
+#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
@@ -69,6 +70,8 @@ typedef ql_elm(tsd_t) tsd_link_t;
O(prof_sample_last_event, uint64_t, uint64_t) \
O(stats_interval_event_wait, uint64_t, uint64_t) \
O(stats_interval_last_event, uint64_t, uint64_t) \
+ O(peak_alloc_event_wait, uint64_t, uint64_t) \
+ O(peak_dalloc_event_wait, uint64_t, uint64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
O(prng_state, uint64_t, uint64_t) \
O(iarena, arena_t *, arena_t *) \
@@ -77,6 +80,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
O(binshards, tsd_binshards_t, tsd_binshards_t)\
O(tsd_link, tsd_link_t, tsd_link_t) \
O(in_hook, bool, bool) \
+ O(peak, peak_t, peak_t) \
O(tcache_slow, tcache_slow_t, tcache_slow_t) \
O(rtree_ctx, rtree_ctx_t, rtree_ctx_t)
@@ -95,6 +99,8 @@ typedef ql_elm(tsd_t) tsd_link_t;
/* prof_sample_last_event */ 0, \
/* stats_interval_event_wait */ 0, \
/* stats_interval_last_event */ 0, \
+ /* peak_alloc_event_wait */ 0, \
+ /* peak_dalloc_event_wait */ 0, \
/* prof_tdata */ NULL, \
/* prng_state */ 0, \
/* iarena */ NULL, \
@@ -103,6 +109,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
/* tsd_link */ {NULL}, \
/* in_hook */ false, \
+ /* peak */ PEAK_INITIALIZER, \
/* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \
/* rtree_ctx */ RTREE_CTX_ZERO_INITIALIZER,
diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj
index 9f81e21d..d50fa884 100644
--- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj
+++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj
@@ -70,6 +70,7 @@
+
diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters
index 15fe7f08..94db8c0c 100644
--- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters
+++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters
@@ -94,6 +94,9 @@
Source Files
+
+ Source Files
+
Source Files
diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj
index b5fccaed..337dcfe7 100644
--- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj
+++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj
@@ -70,6 +70,7 @@
+
diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters
index 15fe7f08..94db8c0c 100644
--- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters
+++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters
@@ -94,6 +94,9 @@
Source Files
+
+ Source Files
+
Source Files
diff --git a/src/ctl.c b/src/ctl.c
index be8be10f..0bd38feb 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -9,6 +9,7 @@
#include "jemalloc/internal/inspect.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/peak_event.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/util.h"
@@ -61,6 +62,8 @@ CTL_PROTO(background_thread)
CTL_PROTO(max_background_threads)
CTL_PROTO(thread_tcache_enabled)
CTL_PROTO(thread_tcache_flush)
+CTL_PROTO(thread_peak_read)
+CTL_PROTO(thread_peak_reset)
CTL_PROTO(thread_prof_name)
CTL_PROTO(thread_prof_active)
CTL_PROTO(thread_arena)
@@ -294,6 +297,11 @@ static const ctl_named_node_t thread_tcache_node[] = {
{NAME("flush"), CTL(thread_tcache_flush)}
};
+static const ctl_named_node_t thread_peak_node[] = {
+ {NAME("read"), CTL(thread_peak_read)},
+ {NAME("reset"), CTL(thread_peak_reset)},
+};
+
static const ctl_named_node_t thread_prof_node[] = {
{NAME("name"), CTL(thread_prof_name)},
{NAME("active"), CTL(thread_prof_active)}
@@ -306,6 +314,7 @@ static const ctl_named_node_t thread_node[] = {
{NAME("deallocated"), CTL(thread_deallocated)},
{NAME("deallocatedp"), CTL(thread_deallocatedp)},
{NAME("tcache"), CHILD(named, thread_tcache)},
+ {NAME("peak"), CHILD(named, thread_peak)},
{NAME("prof"), CHILD(named, thread_prof)},
{NAME("idle"), CTL(thread_idle)}
};
@@ -1953,6 +1962,38 @@ label_return:
return ret;
}
+static int
+thread_peak_read_ctl(tsd_t *tsd, const size_t *mib,
+ size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+ size_t newlen) {
+ int ret;
+ if (!config_stats) {
+ return ENOENT;
+ }
+ READONLY();
+ peak_event_update(tsd);
+ uint64_t result = peak_event_max(tsd);
+ READ(result, uint64_t);
+ ret = 0;
+label_return:
+ return ret;
+}
+
+static int
+thread_peak_reset_ctl(tsd_t *tsd, const size_t *mib,
+ size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+ size_t newlen) {
+ int ret;
+ if (!config_stats) {
+ return ENOENT;
+ }
+ NEITHER_READ_NOR_WRITE();
+ peak_event_zero(tsd);
+ ret = 0;
+label_return:
+ return ret;
+}
+
static int
thread_prof_name_ctl(tsd_t *tsd, const size_t *mib,
size_t miblen, void *oldp, size_t *oldlenp, void *newp,
diff --git a/src/peak_event.c b/src/peak_event.c
new file mode 100644
index 00000000..ffb061bf
--- /dev/null
+++ b/src/peak_event.c
@@ -0,0 +1,67 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/peak.h"
+#include "jemalloc/internal/peak_event.h"
+
+/*
+ * Update every 100k by default. We're not exposing this as a configuration
+ * option for now; we don't want to bind ourselves too tightly to any particular
+ * performance requirements for small values, or guarantee that we'll even be
+ * able to provide fine-grained accuracy.
+ */
+#define PEAK_EVENT_WAIT (100 * 1024)
+
+/* Update the peak with current tsd state. */
+void
+peak_event_update(tsd_t *tsd) {
+ uint64_t alloc = tsd_thread_allocated_get(tsd);
+ uint64_t dalloc = tsd_thread_deallocated_get(tsd);
+ peak_t *peak = tsd_peakp_get(tsd);
+ peak_update(peak, alloc, dalloc);
+}
+
+/* Set current state to zero. */
+void
+peak_event_zero(tsd_t *tsd) {
+ uint64_t alloc = tsd_thread_allocated_get(tsd);
+ uint64_t dalloc = tsd_thread_deallocated_get(tsd);
+ peak_t *peak = tsd_peakp_get(tsd);
+ peak_set_zero(peak, alloc, dalloc);
+}
+
+uint64_t
+peak_event_max(tsd_t *tsd) {
+ peak_t *peak = tsd_peakp_get(tsd);
+ return peak_max(peak);
+}
+
+uint64_t
+peak_alloc_new_event_wait(tsd_t *tsd) {
+ return PEAK_EVENT_WAIT;
+}
+
+uint64_t
+peak_alloc_postponed_event_wait(tsd_t *tsd) {
+ return TE_MIN_START_WAIT;
+}
+
+void
+peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed) {
+ peak_event_update(tsd);
+}
+
+uint64_t
+peak_dalloc_new_event_wait(tsd_t *tsd) {
+ return PEAK_EVENT_WAIT;
+}
+
+uint64_t
+peak_dalloc_postponed_event_wait(tsd_t *tsd) {
+ return TE_MIN_START_WAIT;
+}
+
+void
+peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) {
+ peak_event_update(tsd);
+}
diff --git a/src/thread_event.c b/src/thread_event.c
index 40c0487e..99a188dd 100644
--- a/src/thread_event.c
+++ b/src/thread_event.c
@@ -60,6 +60,16 @@ stats_interval_fetch_elapsed(tsd_t *tsd) {
return last_event - last_stats_event;
}
+static uint64_t
+peak_alloc_fetch_elapsed(tsd_t *tsd) {
+ return TE_INVALID_ELAPSED;
+}
+
+static uint64_t
+peak_dalloc_fetch_elapsed(tsd_t *tsd) {
+ return TE_INVALID_ELAPSED;
+}
+
/* Per event facilities done. */
static bool
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index cc1d5313..10d809fb 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -955,6 +955,73 @@ TEST_BEGIN(test_thread_idle) {
}
TEST_END
+TEST_BEGIN(test_thread_peak) {
+ test_skip_if(!config_stats);
+
+ /*
+ * We don't commit to any stable amount of accuracy for peak tracking
+ * (in practice, when this test was written, we made sure to be within
+ * 100k). But 10MB is big for more or less any definition of big.
+ */
+ size_t big_size = 10 * 1024 * 1024;
+ size_t small_size = 256;
+
+ void *ptr;
+ int err;
+ size_t sz;
+ uint64_t peak;
+ sz = sizeof(uint64_t);
+
+ err = mallctl("thread.peak.reset", NULL, NULL, NULL, 0);
+ expect_d_eq(err, 0, "");
+ ptr = mallocx(SC_SMALL_MAXCLASS, 0);
+ err = mallctl("thread.peak.read", &peak, &sz, NULL, 0);
+ expect_d_eq(err, 0, "");
+ expect_u64_eq(peak, SC_SMALL_MAXCLASS, "Missed an update");
+ free(ptr);
+ err = mallctl("thread.peak.read", &peak, &sz, NULL, 0);
+ expect_d_eq(err, 0, "");
+ expect_u64_eq(peak, SC_SMALL_MAXCLASS, "Freeing changed peak");
+ ptr = mallocx(big_size, 0);
+ free(ptr);
+ /*
+ * The peak should have hit big_size in the last two lines, even though
+ * the net allocated bytes has since dropped back down to zero. We
+	 * should have noticed the peak change without having done any mallctl
+ * calls while net allocated bytes was high.
+ */
+ err = mallctl("thread.peak.read", &peak, &sz, NULL, 0);
+ expect_d_eq(err, 0, "");
+ expect_u64_ge(peak, big_size, "Missed a peak change.");
+
+ /* Allocate big_size, but using small allocations. */
+ size_t nallocs = big_size / small_size;
+ void **ptrs = calloc(nallocs, sizeof(void *));
+ err = mallctl("thread.peak.reset", NULL, NULL, NULL, 0);
+ expect_d_eq(err, 0, "");
+ err = mallctl("thread.peak.read", &peak, &sz, NULL, 0);
+ expect_d_eq(err, 0, "");
+ expect_u64_eq(0, peak, "Missed a reset.");
+ for (size_t i = 0; i < nallocs; i++) {
+ ptrs[i] = mallocx(small_size, 0);
+ }
+ for (size_t i = 0; i < nallocs; i++) {
+ free(ptrs[i]);
+ }
+ err = mallctl("thread.peak.read", &peak, &sz, NULL, 0);
+ expect_d_eq(err, 0, "");
+ /*
+ * We don't guarantee exactness; make sure we're within 10% of the peak,
+ * though.
+ */
+ expect_u64_ge(peak, nallocx(small_size, 0) * nallocs * 9 / 10,
+ "Missed some peak changes.");
+ expect_u64_le(peak, nallocx(small_size, 0) * nallocs * 11 / 10,
+ "Overcounted peak changes.");
+ free(ptrs);
+}
+TEST_END
+
int
main(void) {
return test(
@@ -987,5 +1054,6 @@ main(void) {
test_stats_arenas,
test_hooks,
test_hooks_exhaustion,
- test_thread_idle);
+ test_thread_idle,
+ test_thread_peak);
}