Introduce a backport of C11 atomics

This introduces a backport of C11 atomics. It has four implementations; ranked in order of preference, they are: - GCC/Clang __atomic builtins - GCC/Clang __sync builtins - MSVC _Interlocked builtins - C11 atomics, from <stdatomic.h> The primary advantages are: - Close adherence to the standard API gives us a defined memory model. - Type safety: atomic objects are now separate types from non-atomic ones, so that it's impossible to mix up atomic and non-atomic updates (which is undefined behavior that compilers are starting to take advantage of). - Efficiency: we can specify ordering for operations, avoiding fences and atomic operations on strongly ordered architectures (example: `atomic_write_u32(ptr, val);` involves a CAS loop, whereas `atomic_store(ptr, val, ATOMIC_RELEASE);` is a plain store). This diff leaves in the current atomics API (implementing it in terms of the backport). This lets us transition uses over piecemeal. Testing: This is by nature hard to test. I've manually tested the first three options on Linux on gcc by futzing with the #defines manually, on FreeBSD with gcc and clang, on MSVC, and on OS X with clang. All of these were x86 machines though, and we don't have any test infrastructure set up for non-x86 platforms.
2017-01-25 09:54:27 -08:00
parent 957b8c5f21
commit d4ac7582f3
15 changed files with 947 additions and 672 deletions
--- a/Makefile.in
+++ b/Makefile.in
@@ -90,7 +90,6 @@ BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/je
 C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h
 C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/arena.c \
 	$(srcroot)src/atomic.c \
 	$(srcroot)src/base.c \
 	$(srcroot)src/bitmap.c \
 	$(srcroot)src/ckh.c \
--- a/configure.ac
+++ b/configure.ac
@@ -550,7 +550,7 @@ case "${host}" in
 	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H])
 	AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ])
 	AC_DEFINE([JEMALLOC_THREADED_INIT], [ ])
-	AC_DEFINE([JEMALLOC_C11ATOMICS])
+	AC_DEFINE([JEMALLOC_C11_ATOMICS])
 	force_tls="0"
 	default_munmap="0"
 	;;
@@ -1730,36 +1730,44 @@ JE_COMPILABLE([C11 atomics], [
    volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
    uint64_t r = atomic_fetch_add(a, x) + x;
    return r == 0;
-], [je_cv_c11atomics])
+], [je_cv_c11_atomics])
-if test "x${je_cv_c11atomics}" = "xyes" ; then
+if test "x${je_cv_c11_atomics}" = "xyes" ; then
-  AC_DEFINE([JEMALLOC_C11ATOMICS])
+  AC_DEFINE([JEMALLOC_C11_ATOMICS])
 fi
 dnl ============================================================================
-dnl Check for atomic(9) operations as provided on FreeBSD.
+dnl Check for GCC-style __atomic atomics.
-JE_COMPILABLE([atomic(9)], [
+JE_COMPILABLE([GCC __atomic atomics], [
-#include <sys/types.h>
-#include <machine/atomic.h>
-#include <inttypes.h>
 ], [
-	{
+    int x = 0;
-		uint32_t x32 = 0;
+    int val = 1;
-		volatile uint32_t *x32p = &x32;
+    int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED);
-		atomic_fetchadd_32(x32p, 1);
+    int after_add = x;
-	}
+    return after_add == 1;
-	{
+], [je_cv_gcc_atomic_atomics])
-		unsigned long xlong = 0;
+if test "x${je_cv_gcc_atomic_atomics}" = "xyes" ; then
-		volatile unsigned long *xlongp = &xlong;
+  AC_DEFINE([JEMALLOC_GCC_ATOMIC_ATOMICS])
-		atomic_fetchadd_long(xlongp, 1);
+fi
-	}
+
-], [je_cv_atomic9])
+dnl ============================================================================
-if test "x${je_cv_atomic9}" = "xyes" ; then
+dnl Check for GCC-style __sync atomics.
-  AC_DEFINE([JEMALLOC_ATOMIC9])
+
 JE_COMPILABLE([GCC __sync atomics], [
 ], [
    int x = 0;
    int before_add = __sync_fetch_and_add(&x, 1);
    int after_add = x;
    return (before_add == 0) && (after_add == 1);
 ], [je_cv_gcc_sync_atomics])
 if test "x${je_cv_gcc_sync_atomics}" = "xyes" ; then
  AC_DEFINE([JEMALLOC_GCC_SYNC_ATOMICS])
 fi
 dnl ============================================================================
 dnl Check for atomic(3) operations as provided on Darwin.
 dnl We need this not for the atomic operations (which are provided above), but
 dnl rather for the OSSpinLock type it exposes.
 JE_COMPILABLE([Darwin OSAtomic*()], [
 #include <libkern/OSAtomic.h>
--- a/include/jemalloc/internal/atomic.h
+++ b/include/jemalloc/internal/atomic.h
@@ -0,0 +1,111 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_H
 #define JEMALLOC_INTERNAL_ATOMIC_H
 /*
 * Qualifier placed in front of every function generated by this header and
 * by the backend headers below.  It is #undef'd again at the end of this
 * file.
 */
 #define ATOMIC_INLINE static inline
 /*
 * Pick exactly one backend.  The first matching branch wins, so the order of
 * the tests below is the order of preference: GCC/Clang __atomic builtins,
 * GCC/Clang __sync builtins, MSVC, and finally C11 <stdatomic.h>.  Building
 * fails outright if none of them is available.
 */
 #if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
 #  include "jemalloc/internal/atomic_gcc_atomic.h"
 #elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
 #  include "jemalloc/internal/atomic_gcc_sync.h"
 #elif defined(_MSC_VER)
 #  include "jemalloc/internal/atomic_msvc.h"
 #elif defined(JEMALLOC_C11_ATOMICS)
 #  include "jemalloc/internal/atomic_c11.h"
 #else
 #  error "Don't have atomics implemented on this platform."
 #endif
 /*
 * This header gives more or less a backport of C11 atomics. The user can write
 * JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate
 * counterparts of the C11 atomic functions for type, as so:
 *   JEMALLOC_GENERATE_ATOMICS(int *, pi, 3);
 * and then write things like:
 *   int *some_ptr;
 *   atomic_pi_t atomic_ptr_to_int;
 *   atomic_store_pi(&atomic_ptr_to_int, some_ptr, ATOMIC_RELAXED);
 *   int *prev_value = atomic_exchange_pi(&atomic_ptr_to_int, NULL,
 *       ATOMIC_ACQ_REL);
 *   assert(some_ptr == prev_value);
 * and expect things to work in the obvious way.
 *
 * Also included (with naming differences to avoid conflicts with the standard
 * library):
 *   atomic_fence(atomic_memory_order_t) (mimics C11's atomic_thread_fence).
 *   ATOMIC_INIT (mimics C11's ATOMIC_VAR_INIT).
 */
 /*
 * Pure convenience, so that we don't have to type "atomic_memory_order_"
 * quite so often.
 *
 * Each shorthand must expand to a bare enumerator with no trailing
 * punctuation: they are passed as function arguments, e.g.
 *   atomic_store_p(&a, v, ATOMIC_RELEASE);
 * and a stray comma in the expansion would inject an empty extra argument
 * and break compilation at every such call site.
 */
 #define ATOMIC_RELAXED atomic_memory_order_relaxed
 #define ATOMIC_ACQUIRE atomic_memory_order_acquire
 #define ATOMIC_RELEASE atomic_memory_order_release
 #define ATOMIC_ACQ_REL atomic_memory_order_acq_rel
 #define ATOMIC_SEQ_CST atomic_memory_order_seq_cst
 /*
 * Legacy API shim.  Existing call sites still use the old
 * atomic_{read,write,cas}_<short_type>() functions on plain objects; the
 * wrappers generated here reinterpret the object as the new atomic type and
 * forward to the new API with sequentially consistent ordering, so that
 * callers can be migrated (and their memory orders reasoned about) one at a
 * time.
 */
 #define JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(type, short_type)	\
 ATOMIC_INLINE type							\
 atomic_read_##short_type(type *p) {					\
 	atomic_##short_type##_t *atomic_p =				\
 	    (atomic_##short_type##_t *)p;				\
 	return atomic_load_##short_type(atomic_p, ATOMIC_SEQ_CST);	\
 }									\
 									\
 ATOMIC_INLINE void							\
 atomic_write_##short_type(type *p, const type val) {			\
 	atomic_##short_type##_t *atomic_p =				\
 	    (atomic_##short_type##_t *)p;				\
 	atomic_store_##short_type(atomic_p, (type)val,			\
 	    ATOMIC_SEQ_CST);						\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_cas_##short_type(type *p, type c, type s) {			\
 	atomic_##short_type##_t *atomic_p =				\
 	    (atomic_##short_type##_t *)p;				\
 	bool swapped = atomic_compare_exchange_strong_##short_type(	\
 	    atomic_p, &c, s, ATOMIC_SEQ_CST, ATOMIC_SEQ_CST);		\
 	/* Legacy contract is inverted: false means the swap happened. */\
 	return !swapped;						\
 }
 /*
 * Integer flavor of the legacy shim: everything generated by
 * JEMALLOC_GENERATE_COMPATABILITY_ATOMICS plus atomic_{add,sub}_<short_type>().
 * The legacy functions return the result of the arithmetic, so the operand is
 * re-applied to the value handed back by the fetch_* call.
 */
 #define JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(type, short_type)	\
 JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(type, short_type)		\
 									\
 ATOMIC_INLINE type							\
 atomic_add_##short_type(type *p, type x) {				\
 	type fetched = atomic_fetch_add_##short_type(			\
 	    (atomic_##short_type##_t *)p, x, ATOMIC_SEQ_CST);		\
 	return fetched + x;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_sub_##short_type(type *p, type x) {				\
 	type fetched = atomic_fetch_sub_##short_type(			\
 	    (atomic_##short_type##_t *)p, x, ATOMIC_SEQ_CST);		\
 	return fetched - x;						\
 }
 /*
 * Instantiate the atomic types used by the rest of the code base.  Every type
 * gets the new API (JEMALLOC_GENERATE_[INT_]ATOMICS) immediately followed by
 * the matching legacy shim (JEMALLOC_GENERATE_COMPATABILITY_[INT_]ATOMICS).
 * The third argument of the new-API generators is lg(sizeof(type)).
 */
 /* Pointers: no arithmetic operations, hence the non-INT generators. */
 JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR)
 JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(void *, p)
 /*
 * There's no actual guarantee that sizeof(bool) == 1, but it's true on the only
 * platform that actually needs to know the size, MSVC.
 */
 JEMALLOC_GENERATE_ATOMICS(bool, b, 0)
 JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(bool, b)
 /* Integer types: these additionally get the fetch_{add,sub,and,or,xor} ops. */
 JEMALLOC_GENERATE_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT)
 JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(unsigned, u)
 JEMALLOC_GENERATE_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)
 JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(size_t, zu)
 JEMALLOC_GENERATE_INT_ATOMICS(uint32_t, u32, 2)
 JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(uint32_t, u32)
 /* 64-bit atomics are only generated when pointers or ints have lg size 3. */
 #  if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
 JEMALLOC_GENERATE_INT_ATOMICS(uint64_t, u64, 3)
 JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(uint64_t, u64)
 #  endif
 #undef ATOMIC_INLINE
 #endif /* JEMALLOC_INTERNAL_ATOMIC_H */
--- a/include/jemalloc/internal/atomic_c11.h
+++ b/include/jemalloc/internal/atomic_c11.h
@@ -0,0 +1,97 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H
 #define JEMALLOC_INTERNAL_ATOMIC_C11_H
 #include <stdatomic.h>
 /*
 * C11 backend: the backport's names are plain aliases for their
 * <stdatomic.h> counterparts, so no translation layer is needed.
 */
 /* Static initializer for an atomic object. */
 #define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__)
 /* Memory-order type and its enumerators. */
 #define atomic_memory_order_t memory_order
 #define atomic_memory_order_relaxed memory_order_relaxed
 #define atomic_memory_order_acquire memory_order_acquire
 #define atomic_memory_order_release memory_order_release
 #define atomic_memory_order_acq_rel memory_order_acq_rel
 #define atomic_memory_order_seq_cst memory_order_seq_cst
 /* Stand-alone fence; takes an atomic_memory_order_t. */
 #define atomic_fence atomic_thread_fence
 /*
 * Generates atomic_<short_type>_t (an _Atomic(type)) together with typed
 * load/store/exchange/compare-exchange wrappers.  Each wrapper forwards to the
 * corresponding <stdatomic.h> *_explicit generic.  lg_size is unused by this
 * backend.
 */
 #define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)		\
 typedef _Atomic(type) atomic_##short_type##_t;				\
 									\
 ATOMIC_INLINE type							\
 atomic_load_##short_type(const atomic_##short_type##_t *a,		\
     atomic_memory_order_t mo) {						\
 	/*								\
 	 * Read strictly, the C standard does not let atomic_load	\
 	 * take a pointer to const; accepting one is handy for our	\
 	 * callers, so the qualifier is cast away right here.		\
 	 */								\
 	return atomic_load_explicit((atomic_##short_type##_t *)a, mo);	\
 }									\
 									\
 ATOMIC_INLINE void							\
 atomic_store_##short_type(atomic_##short_type##_t *a,			\
     type val, atomic_memory_order_t mo) {				\
 	atomic_store_explicit(a, val, mo);				\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	type previous = atomic_exchange_explicit(a, val, mo);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
     type *expected, type desired, atomic_memory_order_t success_mo,	\
     atomic_memory_order_t failure_mo) {					\
 	bool exchanged = atomic_compare_exchange_weak_explicit(a,	\
 	    expected, desired, success_mo, failure_mo);			\
 	return exchanged;						\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
     type *expected, type desired, atomic_memory_order_t success_mo,	\
     atomic_memory_order_t failure_mo) {					\
 	bool exchanged = atomic_compare_exchange_strong_explicit(a,	\
 	    expected, desired, success_mo, failure_mo);			\
 	return exchanged;						\
 }
 /*
 * Integer types support read-modify-write arithmetic and bitwise operations
 * that the other types do not.  This generator emits everything
 * JEMALLOC_GENERATE_ATOMICS does, plus atomic_fetch_{add,sub,and,or,xor}_*;
 * each returns whatever the corresponding atomic_fetch_*_explicit generic
 * returns.  lg_size is unused by this backend.
 */
 #define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
 JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
     type val, atomic_memory_order_t mo) {				\
 	type previous = atomic_fetch_add_explicit(a, val, mo);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
     type val, atomic_memory_order_t mo) {				\
 	type previous = atomic_fetch_sub_explicit(a, val, mo);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
     type val, atomic_memory_order_t mo) {				\
 	type previous = atomic_fetch_and_explicit(a, val, mo);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
     type val, atomic_memory_order_t mo) {				\
 	type previous = atomic_fetch_or_explicit(a, val, mo);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
     type val, atomic_memory_order_t mo) {				\
 	type previous = atomic_fetch_xor_explicit(a, val, mo);		\
 	return previous;						\
 }
 #endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */
--- a/include/jemalloc/internal/atomic_externs.h
+++ b/include/jemalloc/internal/atomic_externs.h
@@ -1,12 +0,0 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_EXTERNS_H
 #define JEMALLOC_INTERNAL_ATOMIC_EXTERNS_H
 #if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
 #define atomic_read_u64(p)	atomic_add_u64(p, 0)
 #endif
 #define atomic_read_u32(p)	atomic_add_u32(p, 0)
 #define atomic_read_p(p)	atomic_add_p(p, NULL)
 #define atomic_read_zu(p)	atomic_add_zu(p, 0)
 #define atomic_read_u(p)	atomic_add_u(p, 0)
 #endif /* JEMALLOC_INTERNAL_ATOMIC_EXTERNS_H */
--- a/include/jemalloc/internal/atomic_gcc_atomic.h
+++ b/include/jemalloc/internal/atomic_gcc_atomic.h
@@ -0,0 +1,125 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
 #define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
 /*
 * Static initializer for an atomic object.  The atomic types of this backend
 * are single-member structs, hence the brace-enclosed form.
 */
 #define ATOMIC_INIT(...) {__VA_ARGS__}
 /*
 * Backend-neutral memory orders.  They are translated to the compiler's
 * __ATOMIC_* constants by atomic_enum_to_builtin() before reaching a builtin.
 */
 typedef enum {
 	atomic_memory_order_relaxed,
 	atomic_memory_order_acquire,
 	atomic_memory_order_release,
 	atomic_memory_order_acq_rel,
 	atomic_memory_order_seq_cst
 } atomic_memory_order_t;
 /*
 * Translate a backend-neutral memory order into the matching GCC/Clang
 * __ATOMIC_* constant.  A value outside the enumeration (which cannot
 * legitimately occur) yields __ATOMIC_SEQ_CST.
 */
 ATOMIC_INLINE int
 atomic_enum_to_builtin(atomic_memory_order_t mo) {
 	/* Result for anything the switch below does not recognize. */
 	int builtin_mo = __ATOMIC_SEQ_CST;

 	switch (mo) {
 	case atomic_memory_order_relaxed:
 		builtin_mo = __ATOMIC_RELAXED;
 		break;
 	case atomic_memory_order_acquire:
 		builtin_mo = __ATOMIC_ACQUIRE;
 		break;
 	case atomic_memory_order_release:
 		builtin_mo = __ATOMIC_RELEASE;
 		break;
 	case atomic_memory_order_acq_rel:
 		builtin_mo = __ATOMIC_ACQ_REL;
 		break;
 	case atomic_memory_order_seq_cst:
 		builtin_mo = __ATOMIC_SEQ_CST;
 		break;
 	}
 	return builtin_mo;
 }
 /* Stand-alone fence with ordering mo, via __atomic_thread_fence(). */
 ATOMIC_INLINE void
 atomic_fence(atomic_memory_order_t mo) {
 	int builtin_mo = atomic_enum_to_builtin(mo);

 	__atomic_thread_fence(builtin_mo);
 }
 /*
 * Generates atomic_<short_type>_t (a struct wrapping a single `type`, so that
 * it is a distinct type from the plain one) together with typed
 * load/store/exchange/compare-exchange wrappers built on the generic
 * __atomic_* builtins.  lg_size is unused by this backend.
 */
 #define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)		\
 typedef struct {							\
 	type repr;							\
 } atomic_##short_type##_t;						\
 									\
 ATOMIC_INLINE type							\
 atomic_load_##short_type(const atomic_##short_type##_t *a,		\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	type loaded;							\
 	__atomic_load(&a->repr, &loaded, builtin_mo);			\
 	return loaded;							\
 }									\
 									\
 ATOMIC_INLINE void							\
 atomic_store_##short_type(atomic_##short_type##_t *a, type val,		\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	__atomic_store(&a->repr, &val, builtin_mo);			\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	type previous;							\
 	__atomic_exchange(&a->repr, &val, &previous, builtin_mo);	\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
     type *expected, type desired, atomic_memory_order_t success_mo,	\
     atomic_memory_order_t failure_mo) {					\
 	int builtin_success = atomic_enum_to_builtin(success_mo);	\
 	int builtin_failure = atomic_enum_to_builtin(failure_mo);	\
 	/* Fourth argument selects the weak variant. */			\
 	return __atomic_compare_exchange(&a->repr, expected, &desired,	\
 	    true, builtin_success, builtin_failure);			\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
     type *expected, type desired, atomic_memory_order_t success_mo,	\
     atomic_memory_order_t failure_mo) {					\
 	int builtin_success = atomic_enum_to_builtin(success_mo);	\
 	int builtin_failure = atomic_enum_to_builtin(failure_mo);	\
 	/* Fourth argument false: strong variant. */			\
 	return __atomic_compare_exchange(&a->repr, expected, &desired,	\
 	    false, builtin_success, builtin_failure);			\
 }
 /*
 * Integer generator: everything JEMALLOC_GENERATE_ATOMICS emits, plus
 * atomic_fetch_{add,sub,and,or,xor}_<short_type>() built on the
 * __atomic_fetch_* builtins.  lg_size is unused by this backend.
 */
 #define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
 JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	return __atomic_fetch_add(&a->repr, val, builtin_mo);		\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	return __atomic_fetch_sub(&a->repr, val, builtin_mo);		\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	return __atomic_fetch_and(&a->repr, val, builtin_mo);		\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	return __atomic_fetch_or(&a->repr, val, builtin_mo);		\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	int builtin_mo = atomic_enum_to_builtin(mo);			\
 	return __atomic_fetch_xor(&a->repr, val, builtin_mo);		\
 }
 #endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */
--- a/include/jemalloc/internal/atomic_gcc_sync.h
+++ b/include/jemalloc/internal/atomic_gcc_sync.h
@@ -0,0 +1,191 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
 #define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
 /*
 * Static initializer for an atomic object.  The atomic types of this backend
 * are single-member structs, hence the brace-enclosed form.
 */
 #define ATOMIC_INIT(...) {__VA_ARGS__}
 /*
 * Memory orders accepted by this backend's functions.  The __sync builtins
 * take no ordering argument, so these values are only ever compared against
 * in the hand-written fence logic of this header.
 */
 typedef enum {
 	atomic_memory_order_relaxed,
 	atomic_memory_order_acquire,
 	atomic_memory_order_release,
 	atomic_memory_order_acq_rel,
 	atomic_memory_order_seq_cst
 } atomic_memory_order_t;
 /*
 * Stand-alone fence with ordering mo.
 *
 * Every path brackets whatever hardware fence it issues with an empty asm
 * statement carrying a "memory" clobber, i.e. a compiler-level barrier that
 * emits no instruction of its own.
 *   - relaxed: compiler barrier only.
 *   - seq_cst: full barrier via __sync_synchronize().
 *   - acquire / release / acq_rel: the cheapest fence known for the target
 *     architecture, falling back to __sync_synchronize() on architectures
 *     not special-cased below.
 */
 ATOMIC_INLINE void
 atomic_fence(atomic_memory_order_t mo) {
 	/* Easy cases first: no barrier, and full barrier. */
 	if (mo == atomic_memory_order_relaxed) {
 		asm volatile("" ::: "memory");
 		return;
 	}
 	if (mo == atomic_memory_order_seq_cst) {
 		asm volatile("" ::: "memory");
 		__sync_synchronize();
 		asm volatile("" ::: "memory");
 		return;
 	}
 	/* From here on mo is one of acquire, release or acq_rel. */
 	asm volatile("" ::: "memory");
 #  if defined(__i386__) || defined(__x86_64__)
 	/* This is implicit on x86. */
 #  elif defined(__ppc__)
 	asm volatile("lwsync");
 #  elif defined(__sparc__) && defined(__arch64__)
 	if (mo == atomic_memory_order_acquire) {
 		asm volatile("membar #LoadLoad | #LoadStore");
 	} else if (mo == atomic_memory_order_release) {
 		asm volatile("membar #LoadStore | #StoreStore");
 	} else {
 		/* acq_rel: union of the acquire and release masks. */
 		asm volatile("membar #LoadLoad | #LoadStore | #StoreStore");
 	}
 #  else
 	/* Unknown architecture: be conservative and use a full barrier. */
 	__sync_synchronize();
 #  endif
 	asm volatile("" ::: "memory");
 }
 /*
 * A correct implementation of seq_cst loads and stores on weakly ordered
 * architectures could do either of the following:
 *   1. store() is weak-fence -> store -> strong fence, load() is load ->
 *      strong-fence.
 *   2. store() is strong-fence -> store, load() is strong-fence -> load ->
 *      weak-fence.
 * The tricky thing is, load() and store() above can be the load or store
 * portions of a gcc __sync builtin, so we have to follow GCC's lead, which
 * means going with strategy 2.
 * On strongly ordered architectures, the natural strategy is to stick a strong
 * fence after seq_cst stores, and have naked loads.  So we want the strong
 * fences in different places on different architectures.
 * atomic_pre_sc_load_fence and atomic_post_sc_store_fence allow us to
 * accomplish this.
 */
 /*
 * Fence issued immediately before the load of a seq_cst atomic load.
 *
 * On x86 and 64-bit SPARC the strong fence is placed after seq_cst stores
 * instead, so only a relaxed fence is requested here; on every other
 * architecture a seq_cst fence precedes the load.
 *
 * Declared with an explicit (void) parameter list so that this is a proper
 * prototype and calls with arguments are rejected by the compiler.
 */
 ATOMIC_INLINE void
 atomic_pre_sc_load_fence(void) {
 #  if defined(__i386__) || defined(__x86_64__) ||			\
     (defined(__sparc__) && defined(__arch64__))
 	atomic_fence(atomic_memory_order_relaxed);
 #  else
 	atomic_fence(atomic_memory_order_seq_cst);
 #  endif
 }
 /*
 * Fence issued immediately after the store of a seq_cst atomic store.
 *
 * Mirror image of atomic_pre_sc_load_fence(): on x86 and 64-bit SPARC this is
 * where the seq_cst fence goes; on every other architecture only a relaxed
 * fence is requested here.
 *
 * Declared with an explicit (void) parameter list so that this is a proper
 * prototype and calls with arguments are rejected by the compiler.
 */
 ATOMIC_INLINE void
 atomic_post_sc_store_fence(void) {
 #  if defined(__i386__) || defined(__x86_64__) ||			\
     (defined(__sparc__) && defined(__arch64__))
 	atomic_fence(atomic_memory_order_seq_cst);
 #  else
 	atomic_fence(atomic_memory_order_relaxed);
 #  endif
 }
 /*
 * Generates atomic_<short_type>_t and its typed accessors for the __sync
 * backend.  Loads and stores are plain volatile accesses surrounded by
 * explicit fences; exchange and compare-exchange are built on the __sync
 * compare-and-swap builtins and do not consult their memory-order arguments.
 * lg_size is unused by this backend.
 */
 #define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)		\
 typedef struct {							\
 	/* Qualifier must follow `type` so that for pointer types	\
 	 * the member itself, not its pointee, is volatile. */		\
 	type volatile repr;						\
 } atomic_##short_type##_t;						\
 									\
 ATOMIC_INLINE type							\
 atomic_load_##short_type(const atomic_##short_type##_t *a,		\
     atomic_memory_order_t mo) {						\
 	type loaded;							\
 	if (mo == atomic_memory_order_seq_cst) {			\
 		atomic_pre_sc_load_fence();				\
 	}								\
 	loaded = a->repr;						\
 	if (mo != atomic_memory_order_relaxed) {			\
 		atomic_fence(atomic_memory_order_acquire);		\
 	}								\
 	return loaded;							\
 }									\
 									\
 ATOMIC_INLINE void							\
 atomic_store_##short_type(atomic_##short_type##_t *a,			\
     type val, atomic_memory_order_t mo) {				\
 	if (mo != atomic_memory_order_relaxed) {			\
 		atomic_fence(atomic_memory_order_release);		\
 	}								\
 	a->repr = val;							\
 	if (mo == atomic_memory_order_seq_cst) {			\
 		atomic_post_sc_store_fence();				\
 	}								\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	/*								\
 	 * gcc 4.2 (still relevant because of FreeBSD) has no atomic	\
 	 * exchange builtin, so retry a compare-and-swap against the	\
 	 * most recently observed value until it succeeds.		\
 	 */								\
 	type observed;							\
 	do {								\
 		observed = a->repr;					\
 	} while (!__sync_bool_compare_and_swap(&a->repr, observed,	\
 	    val));							\
 	return observed;						\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
     type *expected, type desired, atomic_memory_order_t success_mo,	\
     atomic_memory_order_t failure_mo) {					\
 	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
 	    desired);							\
 	if (prev != *expected) {					\
 		/* Report the value actually found to the caller. */	\
 		*expected = prev;					\
 		return false;						\
 	}								\
 	return true;							\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
     type *expected, type desired, atomic_memory_order_t success_mo,	\
     atomic_memory_order_t failure_mo) {					\
 	/* This backend has no cheaper weak form; reuse the strong one. */\
 	return atomic_compare_exchange_strong_##short_type(a, expected,	\
 	    desired, success_mo, failure_mo);				\
 }
 /*
 * Integer generator for the __sync backend: everything
 * JEMALLOC_GENERATE_ATOMICS emits, plus atomic_fetch_{add,sub,and,or,xor}_*.
 * Each forwards to the matching __sync_fetch_and_* builtin; the mo argument
 * is accepted for API compatibility but not consulted.  lg_size is unused by
 * this backend.
 */
 #define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
 JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	type previous = __sync_fetch_and_add(&a->repr, val);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	type previous = __sync_fetch_and_sub(&a->repr, val);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	type previous = __sync_fetch_and_and(&a->repr, val);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	type previous = __sync_fetch_and_or(&a->repr, val);		\
 	return previous;						\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val,	\
     atomic_memory_order_t mo) {						\
 	type previous = __sync_fetch_and_xor(&a->repr, val);		\
 	return previous;						\
 }
 #endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */
--- a/include/jemalloc/internal/atomic_inlines.h
+++ b/include/jemalloc/internal/atomic_inlines.h
@@ -1,525 +0,0 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_INLINES_H
 #define JEMALLOC_INTERNAL_ATOMIC_INLINES_H
 /*
 * All arithmetic functions return the arithmetic result of the atomic
 * operation.  Some atomic operation APIs return the value prior to mutation, in
 * which case the following functions must redundantly compute the result so
 * that it can be returned.  These functions are normally inlined, so the extra
 * operations can be optimized away if the return values aren't used by the
 * callers.
 *
 *   <t> atomic_read_<t>(<t> *p) { return *p; }
 *   <t> atomic_add_<t>(<t> *p, <t> x) { return *p += x; }
 *   <t> atomic_sub_<t>(<t> *p, <t> x) { return *p -= x; }
 *   bool atomic_cas_<t>(<t> *p, <t> c, <t> s)
 *   {
 *     if (*p != c)
 *       return true;
 *     *p = s;
 *     return false;
 *   }
 *   void atomic_write_<t>(<t> *p, <t> x) { *p = x; }
 */
 #ifndef JEMALLOC_ENABLE_INLINE
 #  ifdef JEMALLOC_ATOMIC_U64
 uint64_t	atomic_add_u64(uint64_t *p, uint64_t x);
 uint64_t	atomic_sub_u64(uint64_t *p, uint64_t x);
 bool	atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s);
 void	atomic_write_u64(uint64_t *p, uint64_t x);
 #  endif
 uint32_t	atomic_add_u32(uint32_t *p, uint32_t x);
 uint32_t	atomic_sub_u32(uint32_t *p, uint32_t x);
 bool	atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s);
 void	atomic_write_u32(uint32_t *p, uint32_t x);
 void	*atomic_add_p(void **p, void *x);
 void	*atomic_sub_p(void **p, void *x);
 bool	atomic_cas_p(void **p, void *c, void *s);
 void	atomic_write_p(void **p, const void *x);
 size_t	atomic_add_zu(size_t *p, size_t x);
 size_t	atomic_sub_zu(size_t *p, size_t x);
 bool	atomic_cas_zu(size_t *p, size_t c, size_t s);
 void	atomic_write_zu(size_t *p, size_t x);
 unsigned	atomic_add_u(unsigned *p, unsigned x);
 unsigned	atomic_sub_u(unsigned *p, unsigned x);
 bool	atomic_cas_u(unsigned *p, unsigned c, unsigned s);
 void	atomic_write_u(unsigned *p, unsigned x);
 #endif
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
 /******************************************************************************/
 /* 64-bit operations. */
 #ifdef JEMALLOC_ATOMIC_U64
 #  if (defined(__amd64__) || defined(__x86_64__))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
 	uint64_t t = x;
 	asm volatile (
 	    "lock; xaddq %0, %1;"
 	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );
 	return t + x;
 }
 JEMALLOC_INLINE uint64_t
 atomic_sub_u64(uint64_t *p, uint64_t x) {
 	uint64_t t;
 	x = (uint64_t)(-(int64_t)x);
 	t = x;
 	asm volatile (
 	    "lock; xaddq %0, %1;"
 	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );
 	return t + x;
 }
 JEMALLOC_INLINE bool
 atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
 	uint8_t success;
 	asm volatile (
 	    "lock; cmpxchgq %4, %0;"
 	    "sete %1;"
 	    : "=m" (*p), "=a" (success) /* Outputs. */
 	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
 	    : "memory" /* Clobbers. */
 	    );
 	return !(bool)success;
 }
 JEMALLOC_INLINE void
 atomic_write_u64(uint64_t *p, uint64_t x) {
 	asm volatile (
 	    "xchgq %1, %0;" /* Lock is implied by xchgq. */
 	    : "=m" (*p), "+r" (x) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    : "memory" /* Clobbers. */
 	    );
 }
 #  elif (defined(JEMALLOC_C11ATOMICS))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
 	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
 	return atomic_fetch_add(a, x) + x;
 }
 JEMALLOC_INLINE uint64_t
 atomic_sub_u64(uint64_t *p, uint64_t x) {
 	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
 	return atomic_fetch_sub(a, x) - x;
 }
 JEMALLOC_INLINE bool
 atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
 	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
 	return !atomic_compare_exchange_strong(a, &c, s);
 }
 JEMALLOC_INLINE void
 atomic_write_u64(uint64_t *p, uint64_t x) {
 	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
 	atomic_store(a, x);
 }
 #  elif (defined(JEMALLOC_ATOMIC9))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
 	/*
 	 * atomic_fetchadd_64() doesn't exist, but we only ever use this
 	 * function on LP64 systems, so atomic_fetchadd_long() will do.
 	 */
 	assert(sizeof(uint64_t) == sizeof(unsigned long));
 	return atomic_fetchadd_long(p, (unsigned long)x) + x;
 }
 JEMALLOC_INLINE uint64_t
 atomic_sub_u64(uint64_t *p, uint64_t x) {
 	assert(sizeof(uint64_t) == sizeof(unsigned long));
 	return atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x;
 }
 JEMALLOC_INLINE bool
 atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
 	assert(sizeof(uint64_t) == sizeof(unsigned long));
 	return !atomic_cmpset_long(p, (unsigned long)c, (unsigned long)s);
 }
 JEMALLOC_INLINE void
 atomic_write_u64(uint64_t *p, uint64_t x) {
 	assert(sizeof(uint64_t) == sizeof(unsigned long));
 	atomic_store_rel_long(p, x);
 }
 #  elif (defined(JEMALLOC_OSATOMIC))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
 	return OSAtomicAdd64((int64_t)x, (int64_t *)p);
 }
 JEMALLOC_INLINE uint64_t
 atomic_sub_u64(uint64_t *p, uint64_t x) {
 	return OSAtomicAdd64(-((int64_t)x), (int64_t *)p);
 }
 JEMALLOC_INLINE bool
 atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
 	return !OSAtomicCompareAndSwap64(c, s, (int64_t *)p);
 }
 JEMALLOC_INLINE void
 atomic_write_u64(uint64_t *p, uint64_t x) {
 	uint64_t o;
 	/*The documented OSAtomic*() API does not expose an atomic exchange. */
 	do {
 		o = atomic_read_u64(p);
 	} while (atomic_cas_u64(p, o, x));
 }
 #  elif (defined(_MSC_VER))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
 	return InterlockedExchangeAdd64(p, x) + x;
 }
 JEMALLOC_INLINE uint64_t
 atomic_sub_u64(uint64_t *p, uint64_t x) {
 	return InterlockedExchangeAdd64(p, -((int64_t)x)) - x;
 }
 JEMALLOC_INLINE bool
 atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
 	uint64_t o;
 	o = InterlockedCompareExchange64(p, s, c);
 	return o != c;
 }
 JEMALLOC_INLINE void
 atomic_write_u64(uint64_t *p, uint64_t x) {
 	InterlockedExchange64(p, x);
 }
 #  elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \
    defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
 	return __sync_add_and_fetch(p, x);
 }
 JEMALLOC_INLINE uint64_t
 atomic_sub_u64(uint64_t *p, uint64_t x) {
 	return __sync_sub_and_fetch(p, x);
 }
 JEMALLOC_INLINE bool
 atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
 	return !__sync_bool_compare_and_swap(p, c, s);
 }
 JEMALLOC_INLINE void
 atomic_write_u64(uint64_t *p, uint64_t x) {
 	__sync_lock_test_and_set(p, x);
 }
 #  else
 #    error "Missing implementation for 64-bit atomic operations"
 #  endif
 #endif
 /******************************************************************************/
 /* 32-bit operations. */
 #if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
 JEMALLOC_INLINE uint32_t
 atomic_add_u32(uint32_t *p, uint32_t x) {
 	uint32_t t = x;
 	asm volatile (
 	    "lock; xaddl %0, %1;"
 	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );
 	return t + x;
 }
 JEMALLOC_INLINE uint32_t
 atomic_sub_u32(uint32_t *p, uint32_t x) {
 	uint32_t t;
 	x = (uint32_t)(-(int32_t)x);
 	t = x;
 	asm volatile (
 	    "lock; xaddl %0, %1;"
 	    : "+r" (t), "=m" (*p) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    );
 	return t + x;
 }
 JEMALLOC_INLINE bool
 atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
 	uint8_t success;
 	asm volatile (
 	    "lock; cmpxchgl %4, %0;"
 	    "sete %1;"
 	    : "=m" (*p), "=a" (success) /* Outputs. */
 	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
 	    : "memory"
 	    );
 	return !(bool)success;
 }
 JEMALLOC_INLINE void
 atomic_write_u32(uint32_t *p, uint32_t x) {
 	asm volatile (
 	    "xchgl %1, %0;" /* Lock is implied by xchgl. */
 	    : "=m" (*p), "+r" (x) /* Outputs. */
 	    : "m" (*p) /* Inputs. */
 	    : "memory" /* Clobbers. */
 	    );
 }
 #  elif (defined(JEMALLOC_C11ATOMICS))
 JEMALLOC_INLINE uint32_t
 atomic_add_u32(uint32_t *p, uint32_t x) {
 	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
 	return atomic_fetch_add(a, x) + x;
 }
 JEMALLOC_INLINE uint32_t
 atomic_sub_u32(uint32_t *p, uint32_t x) {
 	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
 	return atomic_fetch_sub(a, x) - x;
 }
 /*
  * C11 <stdatomic.h> implementation of compare-and-swap: if *p equals c, store
  * s.  Returns false when the exchange happened and true when it did not.
  */
 JEMALLOC_INLINE bool
 atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
 	bool swapped = atomic_compare_exchange_strong(
 	    (volatile atomic_uint_least32_t *)p, &c, s);
 	return !swapped;
 }
 /*
  * C11 <stdatomic.h> implementation: atomically store x into *p (viewed as an
  * atomic_uint_least32_t).
  */
 JEMALLOC_INLINE void
 atomic_write_u32(uint32_t *p, uint32_t x) {
 	atomic_store((volatile atomic_uint_least32_t *)p, x);
 }
 #elif (defined(JEMALLOC_ATOMIC9))
 /*
  * atomic(9) implementation: add x to *p with atomic_fetchadd_32() and return
  * the new value (the fetched value plus x).
  */
 JEMALLOC_INLINE uint32_t
 atomic_add_u32(uint32_t *p, uint32_t x) {
 	uint32_t fetched = atomic_fetchadd_32(p, x);
 	return fetched + x;
 }
 /*
  * atomic(9) implementation: subtract x from *p by atomically adding the two's
  * complement of x, and return the new value (the fetched value minus x).
  *
  * The negation is done in unsigned arithmetic, which wraps by definition;
  * negating through int32_t would be a signed overflow (undefined behavior)
  * for x == 0x80000000.
  */
 JEMALLOC_INLINE uint32_t
 atomic_sub_u32(uint32_t *p, uint32_t x) {
 	return atomic_fetchadd_32(p, (uint32_t)(0U - x)) - x;
 }
 /*
  * atomic(9) implementation of compare-and-swap on *p (compare with c, set to
  * s).  Returns false when atomic_cmpset_32() reports a nonzero result and
  * true when it reports zero.
  */
 JEMALLOC_INLINE bool
 atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
 	if (atomic_cmpset_32(p, c, s)) {
 		return false;
 	}
 	return true;
 }
 /*
  * atomic(9) implementation: store x into *p via atomic_store_rel_32()
  * (presumably a store with release semantics, going by the name).
  */
 JEMALLOC_INLINE void
 atomic_write_u32(uint32_t *p, uint32_t x) {
 	atomic_store_rel_32(p, x);
 }
 #elif (defined(JEMALLOC_OSATOMIC))
 /*
  * OSAtomic implementation: add x to *p with OSAtomicAdd32() and return its
  * result converted back to uint32_t.
  */
 JEMALLOC_INLINE uint32_t
 atomic_add_u32(uint32_t *p, uint32_t x) {
 	int32_t updated = OSAtomicAdd32((int32_t)x, (int32_t *)p);
 	return (uint32_t)updated;
 }
 /*
  * OSAtomic implementation: subtract x from *p by handing the two's complement
  * of x to OSAtomicAdd32(), and return its result.
  *
  * The negation is done in unsigned arithmetic before converting to int32_t;
  * negating an int32_t directly would be a signed overflow (undefined
  * behavior) for x == 0x80000000.
  */
 JEMALLOC_INLINE uint32_t
 atomic_sub_u32(uint32_t *p, uint32_t x) {
 	return OSAtomicAdd32((int32_t)(0U - x), (int32_t *)p);
 }
 /*
  * OSAtomic implementation of compare-and-swap on *p (compare with c, set to
  * s).  Returns the logical negation of OSAtomicCompareAndSwap32()'s result.
  */
 JEMALLOC_INLINE bool
 atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
 	bool swapped = OSAtomicCompareAndSwap32(c, s, (int32_t *)p);
 	return !swapped;
 }
 /*
  * OSAtomic implementation: store x into *p.  Implemented as a read/CAS retry
  * loop that repeats until atomic_cas_u32() returns false.
  */
 JEMALLOC_INLINE void
 atomic_write_u32(uint32_t *p, uint32_t x) {
 	/* The documented OSAtomic*() API does not expose an atomic exchange. */
 	for (;;) {
 		uint32_t cur = atomic_read_u32(p);
 		if (!atomic_cas_u32(p, cur, x)) {
 			break;
 		}
 	}
 }
 #elif (defined(_MSC_VER))
 /*
  * MSVC implementation: add x to *p with InterlockedExchangeAdd() and return
  * the new value (the value it hands back plus x).
  */
 JEMALLOC_INLINE uint32_t
 atomic_add_u32(uint32_t *p, uint32_t x) {
 	uint32_t fetched = InterlockedExchangeAdd(p, x);
 	return fetched + x;
 }
 /*
  * MSVC implementation: subtract x from *p by handing the two's complement of
  * x to InterlockedExchangeAdd(), and return the new value (the value it
  * hands back minus x).
  *
  * The negation is done in unsigned arithmetic before converting to int32_t;
  * negating an int32_t directly would be a signed overflow (undefined
  * behavior) for x == 0x80000000.  Writing it as a subtraction also avoids
  * applying unary minus to an unsigned operand.
  */
 JEMALLOC_INLINE uint32_t
 atomic_sub_u32(uint32_t *p, uint32_t x) {
 	return InterlockedExchangeAdd(p, (int32_t)(0U - x)) - x;
 }
 /*
  * MSVC implementation of compare-and-swap on *p (compare with c, set to s).
  * Returns false when the value reported by InterlockedCompareExchange()
  * equals c, and true otherwise.
  */
 JEMALLOC_INLINE bool
 atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
 	uint32_t observed = InterlockedCompareExchange(p, s, c);
 	if (observed == c) {
 		return false;
 	}
 	return true;
 }
 /*
  * MSVC implementation: store x into *p with InterlockedExchange(); the value
  * it returns is discarded.
  */
 JEMALLOC_INLINE void
 atomic_write_u32(uint32_t *p, uint32_t x) {
 	(void)InterlockedExchange(p, x);
 }
 #elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \
 defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
 /*
  * __sync builtin implementation: atomically add x to *p and return the value
  * produced by __sync_add_and_fetch().
  */
 JEMALLOC_INLINE uint32_t
 atomic_add_u32(uint32_t *p, uint32_t x) {
 	uint32_t updated = __sync_add_and_fetch(p, x);
 	return updated;
 }
 /*
  * __sync builtin implementation: atomically subtract x from *p and return the
  * value produced by __sync_sub_and_fetch().
  */
 JEMALLOC_INLINE uint32_t
 atomic_sub_u32(uint32_t *p, uint32_t x) {
 	uint32_t updated = __sync_sub_and_fetch(p, x);
 	return updated;
 }
 /*
  * __sync builtin implementation of compare-and-swap on *p (compare with c,
  * set to s).  Returns the logical negation of the builtin's result.
  */
 JEMALLOC_INLINE bool
 atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
 	bool swapped = __sync_bool_compare_and_swap(p, c, s);
 	return !swapped;
 }
 /*
  * __sync builtin implementation: store x into *p via an atomic exchange.  The
  * value handed back by the builtin is deliberately discarded.
  */
 JEMALLOC_INLINE void
 atomic_write_u32(uint32_t *p, uint32_t x) {
 	(void)__sync_lock_test_and_set(p, x);
 }
 #else
 #  error "Missing implementation for 32-bit atomic operations"
 #endif
 /******************************************************************************/
 /* Pointer operations. */
 /*
  * Pointer-sized add: forwards to the 32- or 64-bit integer add selected by
  * LG_SIZEOF_PTR, reinterpreting *p and x as integers of that width, and
  * returns the result converted back to a pointer.
  */
 JEMALLOC_INLINE void *
 atomic_add_p(void **p, void *x) {
 #if (LG_SIZEOF_PTR == 2)
 	return (void *)atomic_add_u32((uint32_t *)p, (uint32_t)x);
 #elif (LG_SIZEOF_PTR == 3)
 	return (void *)atomic_add_u64((uint64_t *)p, (uint64_t)x);
 #endif
 }
 /*
  * Pointer-sized subtract: adds the two's complement of x through the 32- or
  * 64-bit integer add selected by LG_SIZEOF_PTR and returns the result
  * converted back to a pointer.
  *
  * The negation is done in unsigned arithmetic, which wraps by definition;
  * negating through a signed type would overflow (undefined behavior) when
  * only the top bit of x is set.
  */
 JEMALLOC_INLINE void *
 atomic_sub_p(void **p, void *x) {
 #if (LG_SIZEOF_PTR == 3)
 	return (void *)atomic_add_u64((uint64_t *)p,
 	    (uint64_t)0 - (uint64_t)x);
 #elif (LG_SIZEOF_PTR == 2)
 	return (void *)atomic_add_u32((uint32_t *)p,
 	    (uint32_t)(0U - (uint32_t)x));
 #endif
 }
 /*
  * Pointer-sized compare-and-swap: forwards to the 32- or 64-bit integer CAS
  * selected by LG_SIZEOF_PTR and returns its result unchanged.
  */
 JEMALLOC_INLINE bool
 atomic_cas_p(void **p, void *c, void *s) {
 #if (LG_SIZEOF_PTR == 2)
 	return atomic_cas_u32((uint32_t *)p, (uint32_t)c, (uint32_t)s);
 #elif (LG_SIZEOF_PTR == 3)
 	return atomic_cas_u64((uint64_t *)p, (uint64_t)c, (uint64_t)s);
 #endif
 }
 /*
  * Pointer-sized store: forwards to the 32- or 64-bit integer write selected
  * by LG_SIZEOF_PTR, passing x as an integer of that width.
  */
 JEMALLOC_INLINE void
 atomic_write_p(void **p, const void *x) {
 #if (LG_SIZEOF_PTR == 2)
 	atomic_write_u32((uint32_t *)p, (uint32_t)x);
 #elif (LG_SIZEOF_PTR == 3)
 	atomic_write_u64((uint64_t *)p, (uint64_t)x);
 #endif
 }
 /******************************************************************************/
 /* size_t operations. */
 /*
  * size_t add: forwards to the 32- or 64-bit integer add selected by
  * LG_SIZEOF_PTR and returns its result as a size_t.
  */
 JEMALLOC_INLINE size_t
 atomic_add_zu(size_t *p, size_t x) {
 #if (LG_SIZEOF_PTR == 2)
 	return (size_t)atomic_add_u32((uint32_t *)p, (uint32_t)x);
 #elif (LG_SIZEOF_PTR == 3)
 	return (size_t)atomic_add_u64((uint64_t *)p, (uint64_t)x);
 #endif
 }
 /*
  * size_t subtract: adds the two's complement of x through the 32- or 64-bit
  * integer add selected by LG_SIZEOF_PTR and returns its result as a size_t.
  *
  * The negation is done in unsigned arithmetic, which wraps by definition;
  * negating through a signed type would overflow (undefined behavior) when
  * only the top bit of x is set.
  */
 JEMALLOC_INLINE size_t
 atomic_sub_zu(size_t *p, size_t x) {
 #if (LG_SIZEOF_PTR == 3)
 	return (size_t)atomic_add_u64((uint64_t *)p,
 	    (uint64_t)0 - (uint64_t)x);
 #elif (LG_SIZEOF_PTR == 2)
 	return (size_t)atomic_add_u32((uint32_t *)p,
 	    (uint32_t)(0U - (uint32_t)x));
 #endif
 }
 /*
  * size_t compare-and-swap: forwards to the 32- or 64-bit integer CAS selected
  * by LG_SIZEOF_PTR and returns its result unchanged.
  */
 JEMALLOC_INLINE bool
 atomic_cas_zu(size_t *p, size_t c, size_t s) {
 #if (LG_SIZEOF_PTR == 2)
 	return atomic_cas_u32((uint32_t *)p, (uint32_t)c, (uint32_t)s);
 #elif (LG_SIZEOF_PTR == 3)
 	return atomic_cas_u64((uint64_t *)p, (uint64_t)c, (uint64_t)s);
 #endif
 }
 /*
  * size_t store: forwards to the 32- or 64-bit integer write selected by
  * LG_SIZEOF_PTR.
  */
 JEMALLOC_INLINE void
 atomic_write_zu(size_t *p, size_t x) {
 #if (LG_SIZEOF_PTR == 2)
 	atomic_write_u32((uint32_t *)p, (uint32_t)x);
 #elif (LG_SIZEOF_PTR == 3)
 	atomic_write_u64((uint64_t *)p, (uint64_t)x);
 #endif
 }
 /******************************************************************************/
 /* unsigned operations. */
 /*
  * unsigned add: forwards to the 32- or 64-bit integer add selected by
  * LG_SIZEOF_INT and returns its result as an unsigned.
  */
 JEMALLOC_INLINE unsigned
 atomic_add_u(unsigned *p, unsigned x) {
 #if (LG_SIZEOF_INT == 2)
 	return (unsigned)atomic_add_u32((uint32_t *)p, (uint32_t)x);
 #elif (LG_SIZEOF_INT == 3)
 	return (unsigned)atomic_add_u64((uint64_t *)p, (uint64_t)x);
 #endif
 }
 /*
  * unsigned subtract: adds the two's complement of x through the 32- or
  * 64-bit integer add selected by LG_SIZEOF_INT and returns its result as an
  * unsigned.
  *
  * The negation is done in unsigned arithmetic, which wraps by definition;
  * negating through a signed type would overflow (undefined behavior) when
  * only the top bit of x is set.
  */
 JEMALLOC_INLINE unsigned
 atomic_sub_u(unsigned *p, unsigned x) {
 #if (LG_SIZEOF_INT == 3)
 	return (unsigned)atomic_add_u64((uint64_t *)p,
 	    (uint64_t)0 - (uint64_t)x);
 #elif (LG_SIZEOF_INT == 2)
 	return (unsigned)atomic_add_u32((uint32_t *)p,
 	    (uint32_t)(0U - (uint32_t)x));
 #endif
 }
 /*
  * unsigned compare-and-swap: forwards to the 32- or 64-bit integer CAS
  * selected by LG_SIZEOF_INT and returns its result unchanged.
  */
 JEMALLOC_INLINE bool
 atomic_cas_u(unsigned *p, unsigned c, unsigned s) {
 #if (LG_SIZEOF_INT == 2)
 	return atomic_cas_u32((uint32_t *)p, (uint32_t)c, (uint32_t)s);
 #elif (LG_SIZEOF_INT == 3)
 	return atomic_cas_u64((uint64_t *)p, (uint64_t)c, (uint64_t)s);
 #endif
 }
 /*
  * unsigned store: forwards to the 32- or 64-bit integer write selected by
  * LG_SIZEOF_INT.
  */
 JEMALLOC_INLINE void
 atomic_write_u(unsigned *p, unsigned x) {
 #if (LG_SIZEOF_INT == 2)
 	atomic_write_u32((uint32_t *)p, (uint32_t)x);
 #elif (LG_SIZEOF_INT == 3)
 	atomic_write_u64((uint64_t *)p, (uint64_t)x);
 #endif
 }
 /******************************************************************************/
 #endif
 #endif /* JEMALLOC_INTERNAL_ATOMIC_INLINES_H */
--- a/include/jemalloc/internal/atomic_msvc.h
+++ b/include/jemalloc/internal/atomic_msvc.h
@@ -0,0 +1,158 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H
 #define JEMALLOC_INTERNAL_ATOMIC_MSVC_H
 /* Static initializer for an atomic object: wraps its arguments in braces. */
 #define ATOMIC_INIT(...) {__VA_ARGS__}
 /* Memory orderings accepted by the atomic operations declared in this file. */
 typedef enum {
 	atomic_memory_order_relaxed,
 	atomic_memory_order_acquire,
 	atomic_memory_order_release,
 	atomic_memory_order_acq_rel,
 	atomic_memory_order_seq_cst
 } atomic_memory_order_t;
 /*
  * Storage types for the generated atomics.  The digit in each name is the
  * lg_size argument that selects it through ATOMIC_INTERLOCKED_REPR();
  * presumably lg of the size in bytes (1, 2, 4 and 8 bytes under MSVC).
  */
 typedef char atomic_repr_0_t;
 typedef short atomic_repr_1_t;
 typedef long atomic_repr_2_t;
 typedef __int64 atomic_repr_3_t;
 /*
  * Issue a fence for ordering mo.  A MemoryBarrier() call, when the target
  * requires one for mo, is bracketed by two _ReadWriteBarrier() calls; both
  * _ReadWriteBarrier() calls are made unconditionally.  Targets other than
  * ARM and x86 are rejected at compile time.
  */
 ATOMIC_INLINE void
 atomic_fence(atomic_memory_order_t mo) {
 	int need_hw_barrier;
 #  if defined(_M_ARM) || defined(_M_ARM64)
 	/* ARM needs a barrier for everything but relaxed. */
 	need_hw_barrier = (mo != atomic_memory_order_relaxed);
 #  elif defined(_M_IX86) || defined (_M_X64)
 	/* x86 needs a barrier only for seq_cst. */
 	need_hw_barrier = (mo == atomic_memory_order_seq_cst);
 #  else
 #  error "Don't know how to create atomics for this platform for MSVC."
 #  endif
 	_ReadWriteBarrier();
 	if (need_hw_barrier) {
 		MemoryBarrier();
 	}
 	_ReadWriteBarrier();
 }
 /* Names the storage typedef (atomic_repr_<lg_size>_t) for a given lg_size. */
 #define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_ ## lg_size ## _t
 /*
  * Two-level token pasting: going through ATOMIC_CONCAT lets macro arguments
  * be expanded before ATOMIC_RAW_CONCAT pastes them together.
  */
 #define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b)
 #define ATOMIC_RAW_CONCAT(a, b) a ## b
 /*
  * Builds the name of a size-specific intrinsic by appending the suffix for
  * lg_size to base_name (e.g. base_name8, base_name16, base_name, base_name64).
  */
 #define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) ATOMIC_CONCAT(	\
    base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size))
 #define ATOMIC_INTERLOCKED_SUFFIX(lg_size)				\
    ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size)
 /* Per-size suffixes; lg_size 2 intentionally maps to an empty suffix. */
 #define ATOMIC_INTERLOCKED_SUFFIX_0 8
 #define ATOMIC_INTERLOCKED_SUFFIX_1 16
 #define ATOMIC_INTERLOCKED_SUFFIX_2
 #define ATOMIC_INTERLOCKED_SUFFIX_3 64
 /*
  * Generates the atomic type atomic_<short_type>_t (a struct with a single
  * repr member of the storage type selected by lg_size) together with its
  * basic operations:
  *
  *   atomic_load_<short_type>:  plain read of repr, followed by an acquire
  *       fence unless mo is relaxed.
  *   atomic_store_<short_type>:  release fence unless mo is relaxed, plain
  *       write of repr, then a seq_cst fence when mo is seq_cst.
  *   atomic_exchange_<short_type>:  calls the size-specific
  *       _InterlockedExchange intrinsic; mo is not consulted.
  *   atomic_compare_exchange_weak_<short_type>:  calls the size-specific
  *       _InterlockedCompareExchange intrinsic.  Returns true when the value
  *       it reports equals *expected; otherwise stores the reported value into
  *       *expected and returns false.  success_mo/failure_mo are not consulted.
  *   atomic_compare_exchange_strong_<short_type>:  forwards to the weak
  *       version.
  *
  * type is the user-visible value type, short_type the suffix used in the
  * generated names, lg_size the selector for the storage type and intrinsic.
  */
 #define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)		\
 typedef struct {							\
 	ATOMIC_INTERLOCKED_REPR(lg_size) repr;				\
 } atomic_##short_type##_t;						\
 									\
 ATOMIC_INLINE type							\
 atomic_load_##short_type(const atomic_##short_type##_t *a,		\
    atomic_memory_order_t mo) {						\
 	ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr;			\
 	if (mo != atomic_memory_order_relaxed) {			\
 		atomic_fence(atomic_memory_order_acquire);		\
 	}								\
 	return (type) ret;						\
 }									\
 									\
 ATOMIC_INLINE void							\
 atomic_store_##short_type(atomic_##short_type##_t *a,			\
    type val, atomic_memory_order_t mo) {				\
 	if (mo != atomic_memory_order_relaxed) {			\
 		atomic_fence(atomic_memory_order_release);		\
 	}								\
 	a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size)) val;		\
 	if (mo == atomic_memory_order_seq_cst) {			\
 		atomic_fence(atomic_memory_order_seq_cst);		\
 	}								\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
    atomic_memory_order_t mo) {						\
 	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange,	\
 	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
    type *expected, type desired, atomic_memory_order_t success_mo,	\
    atomic_memory_order_t failure_mo) {					\
 	ATOMIC_INTERLOCKED_REPR(lg_size) e =				\
 	    (ATOMIC_INTERLOCKED_REPR(lg_size))*expected;		\
 	ATOMIC_INTERLOCKED_REPR(lg_size) d =				\
 	    (ATOMIC_INTERLOCKED_REPR(lg_size))desired;			\
 	ATOMIC_INTERLOCKED_REPR(lg_size) old =				\
 	    ATOMIC_INTERLOCKED_NAME(_InterlockedCompareExchange, 	\
 		lg_size)(&a->repr, d, e);				\
 	if (old == e) {							\
 		return true;						\
 	} else {							\
 		*expected = (type)old;					\
 		return false;						\
 	}								\
 }									\
 									\
 ATOMIC_INLINE bool							\
 atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
    type *expected, type desired, atomic_memory_order_t success_mo,	\
    atomic_memory_order_t failure_mo) {					\
 	/* We implement the weak version with strong semantics. */	\
 	return atomic_compare_exchange_weak_##short_type(a, expected,	\
 	    desired, success_mo, failure_mo);				\
 }
 /*
  * Generates everything JEMALLOC_GENERATE_ATOMICS does, plus the integer-only
  * read-modify-write operations atomic_fetch_{add,sub,and,or,xor}_<short_type>.
  *
  * Each operation returns whatever the corresponding size-specific
  * _Interlocked intrinsic returns, cast to type; the mo argument is not
  * consulted.  fetch_sub is implemented as fetch_add of -val, with MSVC
  * warning 4146 suppressed around the negation.
  */
 #define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
 JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
    type val, atomic_memory_order_t mo) {				\
 	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchangeAdd,	\
 	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
 }									\
 									\
 ATOMIC_INLINE type							\
 atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
    type val, atomic_memory_order_t mo) {				\
 	/*								\
 	 * MSVC warns on negation of unsigned operands, but for us it	\
 	 * gives exactly the right semantics (MAX_TYPE + 1 - operand).	\
 	 */								\
 	__pragma(warning(push))						\
 	__pragma(warning(disable: 4146))				\
 	return atomic_fetch_add_##short_type(a, -val, mo);		\
 	__pragma(warning(pop))						\
 }									\
 ATOMIC_INLINE type							\
 atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
    type val, atomic_memory_order_t mo) {				\
 	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedAnd, lg_size)(	\
 	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
 }									\
 ATOMIC_INLINE type							\
 atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
    type val, atomic_memory_order_t mo) {				\
 	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedOr, lg_size)(	\
 	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
 }									\
 ATOMIC_INLINE type							\
 atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
    type val, atomic_memory_order_t mo) {				\
 	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)(	\
 	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
 }
 #endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */
--- a/include/jemalloc/internal/atomic_types.h
+++ b/include/jemalloc/internal/atomic_types.h
@@ -1,8 +0,0 @@
 #ifndef JEMALLOC_INTERNAL_ATOMIC_TYPES_H
 #define JEMALLOC_INTERNAL_ATOMIC_TYPES_H
 /*
  * JEMALLOC_ATOMIC_U64 is defined when either LG_SIZEOF_PTR or LG_SIZEOF_INT
  * equals 3 (presumably: pointers or ints are 8 bytes wide).
  */
 #if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
 #  define JEMALLOC_ATOMIC_U64
 #endif
 #endif /* JEMALLOC_INTERNAL_ATOMIC_TYPES_H */
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -146,14 +146,6 @@ static const bool have_thp =
 #endif
    ;
 #if defined(JEMALLOC_C11ATOMICS) && !defined(__cplusplus)
 #include <stdatomic.h>
 #endif
 #ifdef JEMALLOC_ATOMIC9
 #include <machine/atomic.h>
 #endif
 #if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
 #include <libkern/OSAtomic.h>
 #endif
@@ -199,10 +191,21 @@ static const bool have_thp =
 * its translation unit).  Each component is now broken up into multiple header
 * files, corresponding to the sections above (e.g. instead of "tsd.h", we now
 * have "tsd_types.h", "tsd_structs.h", "tsd_externs.h", "tsd_inlines.h").
 *
 * Those files which have been converted to explicitly include their
 * inter-component dependencies are now in the initial HERMETIC HEADERS
 * section.  These headers may still rely on this file for system headers and
 * global jemalloc headers, however.
 */
 #include "jemalloc/internal/jemalloc_internal_macros.h"
 /******************************************************************************/
 /* HERMETIC HEADERS */
 /******************************************************************************/
 #include "jemalloc/internal/atomic.h"
 /******************************************************************************/
 /* TYPES */
 /******************************************************************************/
@@ -380,7 +383,6 @@ typedef unsigned szind_t;
 #include "jemalloc/internal/nstime_types.h"
 #include "jemalloc/internal/util_types.h"
 #include "jemalloc/internal/atomic_types.h"
 #include "jemalloc/internal/spin_types.h"
 #include "jemalloc/internal/prng_types.h"
 #include "jemalloc/internal/ticker_types.h"
@@ -489,7 +491,6 @@ void	jemalloc_postfork_child(void);
 #include "jemalloc/internal/nstime_externs.h"
 #include "jemalloc/internal/util_externs.h"
 #include "jemalloc/internal/atomic_externs.h"
 #include "jemalloc/internal/ckh_externs.h"
 #include "jemalloc/internal/stats_externs.h"
 #include "jemalloc/internal/ctl_externs.h"
@@ -513,7 +514,6 @@ void	jemalloc_postfork_child(void);
 /******************************************************************************/
 #include "jemalloc/internal/util_inlines.h"
 #include "jemalloc/internal/atomic_inlines.h"
 #include "jemalloc/internal/spin_inlines.h"
 #include "jemalloc/internal/prng_inlines.h"
 #include "jemalloc/internal/ticker_inlines.h"
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -30,16 +30,13 @@
 #undef LG_VADDR
 /* Defined if C11 atomics are available. */
-#undef JEMALLOC_C11ATOMICS
+#undef JEMALLOC_C11_ATOMICS
-/* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */
+/* Defined if GCC __atomic atomics are available. */
-#undef JEMALLOC_ATOMIC9
+#undef JEMALLOC_GCC_ATOMIC_ATOMICS
-/*
+/* Defined if GCC __sync atomics are available. */
- * Defined if OSAtomic*() functions are available, as provided by Darwin, and
+#undef JEMALLOC_GCC_SYNC_ATOMICS
 * documented in the atomic(3) manual page.
 */
 #undef JEMALLOC_OSATOMIC
 /*
 * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -72,26 +72,6 @@ arena_tdata_get
 arena_tdata_get_hard
 arenas
 arenas_tdata_cleanup
 atomic_add_p
 atomic_add_u
 atomic_add_u32
 atomic_add_u64
 atomic_add_zu
 atomic_cas_p
 atomic_cas_u
 atomic_cas_u32
 atomic_cas_u64
 atomic_cas_zu
 atomic_sub_p
 atomic_sub_u
 atomic_sub_u32
 atomic_sub_u64
 atomic_sub_zu
 atomic_write_p
 atomic_write_u
 atomic_write_u32
 atomic_write_u64
 atomic_write_zu
 b0get
 base_alloc
 base_boot
--- a/src/atomic.c
+++ b/src/atomic.c
@@ -1,2 +0,0 @@
 #define JEMALLOC_ATOMIC_C_
 #include "jemalloc/internal/jemalloc_internal.h"
--- a/test/unit/atomic.c
+++ b/test/unit/atomic.c
@@ -1,101 +1,257 @@
 #include "test/jemalloc_test.h"
-#define TEST_STRUCT(p, t)						\
+/*
-struct p##_test_s {							\
+ * We *almost* have consistent short names (e.g. "u32" for uint32_t, "b" for
-	t	accum0;							\
+ * bool, etc.).  The one exception is that the short name for void * is "p" in
-	t	x;							\
+ * some places and "ptr" in others.  In the long run it would be nice to unify
-	t	s;							\
+ * these, but in the short run we'll use this shim.
-};									\
+ */
-typedef struct p##_test_s p##_test_t;
+#define assert_p_eq assert_ptr_eq
-#define TEST_BODY(p, t, tc, ta, FMT) do {				\
+/*
-	const p##_test_t tests[] = {					\
+ * t: the non-atomic type, like "uint32_t".
-		{(t)-1, (t)-1, (t)-2},					\
+ * ta: the short name for the type, like "u32".
-		{(t)-1, (t) 0, (t)-2},					\
+ * val[1,2,3]: Values of the given type.  The CAS tests use val2 for expected,
-		{(t)-1, (t) 1, (t)-2},					\
+ * and val3 for desired.
 */
 #define DO_TESTS(t, ta, val1, val2, val3) do {				\
 	t val;								\
 	t raw_atomic;							\
 	t expected;							\
 	bool success;							\
 	/* This (along with the load below) also tests ATOMIC_INIT. */	\
 	atomic_##ta##_t atom = ATOMIC_INIT(val1);			\
 									\
-		{(t) 0, (t)-1, (t)-2},					\
+	/* ATOMIC_INIT and load. */					\
-		{(t) 0, (t) 0, (t)-2},					\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
-		{(t) 0, (t) 1, (t)-2},					\
+	assert_##ta##_eq(val1, val, "Load or init failed");		\
 									\
-		{(t) 1, (t)-1, (t)-2},					\
+	/* Store. */							\
-		{(t) 1, (t) 0, (t)-2},					\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
-		{(t) 1, (t) 1, (t)-2},					\
+	atomic_store_##ta(&atom, val2, ATOMIC_RELAXED);			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	assert_##ta##_eq(val2, val, "Store failed");			\
 									\
-		{(t)0, (t)-(1 << 22), (t)-2},				\
+	/* Exchange. */							\
-		{(t)0, (t)(1 << 22), (t)-2},				\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
-		{(t)(1 << 22), (t)-(1 << 22), (t)-2},			\
+	val = atomic_exchange_##ta(&atom, val2, ATOMIC_RELAXED);	\
-		{(t)(1 << 22), (t)(1 << 22), (t)-2}			\
+	assert_##ta##_eq(val1, val, "Exchange returned invalid value");	\
-	};								\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
-	unsigned i;							\
+	assert_##ta##_eq(val2, val, "Exchange store invalid value");	\
 									\
-	for (i = 0; i < sizeof(tests)/sizeof(p##_test_t); i++) {	\
+	/* 								\
-		bool err;						\
+	 * Weak CAS.  Spurious failures are allowed, so we loop a few	\
-		t accum = tests[i].accum0;				\
+	 * times.							\
-		assert_##ta##_eq(atomic_read_##p(&accum),		\
+	 */								\
-		    tests[i].accum0,					\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
-		    "Erroneous read, i=%u", i);				\
+	success = false;						\
 	for (int i = 0; i < 10 && !success; i++) {			\
 		expected = val2;					\
 		success = atomic_compare_exchange_weak_##ta(&atom,	\
 		    &expected, val3, ATOMIC_RELAXED, ATOMIC_RELAXED);	\
 		assert_##ta##_eq(val1, expected, 			\
 		    "CAS should update expected");			\
 	}								\
 	assert_b_eq(val1 == val2, success,				\
 	    "Weak CAS did the wrong state update");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	if (success) {							\
 		assert_##ta##_eq(val3, val,				\
 		    "Successful CAS should update atomic");		\
 	} else {							\
 		assert_##ta##_eq(val1, val,				\
 		    "Unsuccessful CAS should not update atomic");	\
 	}								\
 									\
-		assert_##ta##_eq(atomic_add_##p(&accum, tests[i].x),	\
+	/* Strong CAS. */						\
-		    (t)((tc)tests[i].accum0 + (tc)tests[i].x),		\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
-		    "i=%u, accum=%"FMT", x=%"FMT,			\
+	expected = val2;						\
-		    i, tests[i].accum0, tests[i].x);			\
+	success = atomic_compare_exchange_strong_##ta(&atom, &expected,	\
-		assert_##ta##_eq(atomic_read_##p(&accum), accum,	\
+	    val3, ATOMIC_RELAXED, ATOMIC_RELAXED);			\
-		    "Erroneous add, i=%u", i);				\
+	assert_b_eq(val1 == val2, success,				\
 	    "Strong CAS did the wrong state update");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	if (success) {							\
 		assert_##ta##_eq(val3, val,				\
 		    "Successful CAS should update atomic");		\
 	} else {							\
 		assert_##ta##_eq(val1, val,				\
 		    "Unsuccessful CAS should not update atomic");	\
 	}								\
 									\
 		accum = tests[i].accum0;				\
 		assert_##ta##_eq(atomic_sub_##p(&accum, tests[i].x),	\
 		    (t)((tc)tests[i].accum0 - (tc)tests[i].x),		\
 		    "i=%u, accum=%"FMT", x=%"FMT,			\
 		    i, tests[i].accum0, tests[i].x);			\
 		assert_##ta##_eq(atomic_read_##p(&accum), accum,	\
 		    "Erroneous sub, i=%u", i);				\
 									\
-		accum = tests[i].accum0;				\
+	/* Previous atomics API. */					\
 		err = atomic_cas_##p(&accum, tests[i].x, tests[i].s);	\
 		assert_b_eq(err, tests[i].accum0 != tests[i].x,		\
 		    "Erroneous cas success/failure result");		\
 		assert_##ta##_eq(accum, err ? tests[i].accum0 :		\
 		    tests[i].s, "Erroneous cas effect, i=%u", i);	\
 									\
-		accum = tests[i].accum0;				\
+	/* Read. */							\
-		atomic_write_##p(&accum, tests[i].s);			\
+	raw_atomic = val1;						\
-		assert_##ta##_eq(accum, tests[i].s,			\
+	val = atomic_read_##ta(&raw_atomic);				\
-		    "Erroneous write, i=%u", i);			\
+	assert_##ta##_eq(val1, val, "Read failed");			\
 									\
 	/* Write. */							\
 	raw_atomic = val1;						\
 	atomic_write_##ta(&raw_atomic, val2);				\
 	assert_##ta##_eq(val2, raw_atomic, "Write failed");		\
 									\
 	/* CAS. */							\
 	raw_atomic = val1;						\
 	success = !atomic_cas_##ta(&raw_atomic, val2, val3);		\
 	assert_b_eq(val1 == val2, success, 				\
 	    "CAS did the wrong state update");				\
 	val = raw_atomic;						\
 	if (success) {							\
 		assert_##ta##_eq(val3, val,				\
 		    "Successful CAS should update atomic");		\
 	} else {							\
 		assert_##ta##_eq(val1, val,				\
 		    "Unsuccessful CAS should not update atomic");	\
 	}								\
 } while (0)
-TEST_STRUCT(u64, uint64_t)
+/*
+ * Integer-only checks, run once per test case.
+ *
+ * t: the non-atomic type, like "uint32_t".
+ * ta: the short name for the type, like "u32".
+ * val1: the value the atomic is reset to before each operation.
+ * val2: the operand applied by each operation.
+ *
+ * The first half exercises atomic_fetch_{add,sub,and,or,xor}_<ta>, checking
+ * both the returned value (expected to be val1) and the stored result.  The
+ * second half exercises the previous API (atomic_add_<ta>/atomic_sub_<ta>) on
+ * a raw, non-atomic variable, checking that the returned value and the stored
+ * value are both the updated one.
+ */
+#define DO_INTEGER_TESTS(t, ta, val1, val2) do {			\
 	atomic_##ta##_t atom;						\
 	t val;								\
 	t raw_atomic;							\
 									\
 	/* Fetch-add. */						\
 	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
 	val = atomic_fetch_add_##ta(&atom, val2, ATOMIC_RELAXED);	\
 	assert_##ta##_eq(val1, val,					\
 	    "Fetch-add should return previous value");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	assert_##ta##_eq(val1 + val2, val,				\
 	    "Fetch-add should update atomic");				\
 									\
 	/* Fetch-sub. */						\
 	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
 	val = atomic_fetch_sub_##ta(&atom, val2, ATOMIC_RELAXED);	\
 	assert_##ta##_eq(val1, val,					\
 	    "Fetch-sub should return previous value");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	assert_##ta##_eq(val1 - val2, val,				\
 	    "Fetch-sub should update atomic");				\
 									\
 	/* Fetch-and. */						\
 	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
 	val = atomic_fetch_and_##ta(&atom, val2, ATOMIC_RELAXED);	\
 	assert_##ta##_eq(val1, val,					\
 	    "Fetch-and should return previous value");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	assert_##ta##_eq(val1 & val2, val,				\
 	    "Fetch-and should update atomic");				\
 									\
 	/* Fetch-or. */							\
 	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
 	val = atomic_fetch_or_##ta(&atom, val2, ATOMIC_RELAXED);	\
 	assert_##ta##_eq(val1, val,					\
 	    "Fetch-or should return previous value");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	assert_##ta##_eq(val1 | val2, val,				\
 	    "Fetch-or should update atomic");				\
 									\
 	/* Fetch-xor. */						\
 	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
 	val = atomic_fetch_xor_##ta(&atom, val2, ATOMIC_RELAXED);	\
 	assert_##ta##_eq(val1, val,					\
 	    "Fetch-xor should return previous value");			\
 	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
 	assert_##ta##_eq(val1 ^ val2, val,				\
 	    "Fetch-xor should update atomic");				\
 									\
 	/* Previous atomics API. */					\
 									\
 	/* Add. */							\
 	raw_atomic = val1;						\
 	val = atomic_add_##ta(&raw_atomic, val2);			\
 	assert_##ta##_eq(val1 + val2, val,				\
 	    "atomic_add should return new value");			\
 	assert_##ta##_eq(val1 + val2, raw_atomic,			\
 	    "atomic_add should update atomic");				\
 									\
 	/* Sub. */							\
 	raw_atomic = val1;						\
 	val = atomic_sub_##ta(&raw_atomic, val2);			\
 	assert_##ta##_eq(val1 - val2, val,				\
 	    "atomic_sub should return new value");			\
 	assert_##ta##_eq(val1 - val2, raw_atomic,			\
 	    "atomic_sub should update atomic");				\
 } while (0)
 /*
  * Declares <ta>_test_t: one test case, holding three values (val1, val2,
  * val3) of type t.
  */
 #define TEST_STRUCT(t, ta)						\
 typedef struct {							\
 	t val1;								\
 	t val2;								\
 	t val3;								\
 } ta##_test_t;
 /*
  * Brace-enclosed initializer for an array of test cases of type t.  Each entry
  * is {val1, val2, val3}: val1 and val2 cover every pairing of -1, 0 and 1,
  * plus a few combinations of 0 and +/-(1 << 22); val3 is always (t)-2.
  */
 #define TEST_CASES(t) {							\
 	{(t)-1, (t)-1, (t)-2},						\
 	{(t)-1, (t) 0, (t)-2},						\
 	{(t)-1, (t) 1, (t)-2},						\
 									\
 	{(t) 0, (t)-1, (t)-2},						\
 	{(t) 0, (t) 0, (t)-2},						\
 	{(t) 0, (t) 1, (t)-2},						\
 									\
 	{(t) 1, (t)-1, (t)-2},						\
 	{(t) 1, (t) 0, (t)-2},						\
 	{(t) 1, (t) 1, (t)-2},						\
 									\
 	{(t)0, (t)-(1 << 22), (t)-2},					\
 	{(t)0, (t)(1 << 22), (t)-2},					\
 	{(t)(1 << 22), (t)-(1 << 22), (t)-2},				\
 	{(t)(1 << 22), (t)(1 << 22), (t)-2}				\
 }
 /*
  * Runs DO_TESTS on every entry of TEST_CASES(t).  Used for types that only
  * get the generic (non-arithmetic) checks.
  */
 #define TEST_BODY(t, ta) do {						\
 	const ta##_test_t tests[] = TEST_CASES(t);			\
 	for (unsigned i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) {	\
 		ta##_test_t test = tests[i];				\
 		DO_TESTS(t, ta, test.val1, test.val2, test.val3);	\
 	}								\
 } while (0)
 /*
  * Runs both DO_TESTS and DO_INTEGER_TESTS on every entry of TEST_CASES(t).
  * DO_INTEGER_TESTS receives only val1 and val2 of each case.
  */
 #define INTEGER_TEST_BODY(t, ta) do {					\
 	const ta##_test_t tests[] = TEST_CASES(t);			\
 	for (unsigned i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) {	\
 		ta##_test_t test = tests[i];				\
 		DO_TESTS(t, ta, test.val1, test.val2, test.val3);	\
 		DO_INTEGER_TESTS(t, ta, test.val1, test.val2);		\
 	}								\
 } while (0)
 TEST_STRUCT(uint64_t, u64);
 TEST_BEGIN(test_atomic_u64) {
 #if !(LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
 	test_skip("64-bit atomic operations not supported");
 #else
-	TEST_BODY(u64, uint64_t, uint64_t, u64, FMTx64);
+	INTEGER_TEST_BODY(uint64_t, u64);
 #endif
 }
 TEST_END
-TEST_STRUCT(u32, uint32_t)
+
 TEST_STRUCT(uint32_t, u32);
 TEST_BEGIN(test_atomic_u32) {
-	TEST_BODY(u32, uint32_t, uint32_t, u32, "#"FMTx32);
+	INTEGER_TEST_BODY(uint32_t, u32);
 }
 TEST_END
-TEST_STRUCT(p, void *)
+TEST_STRUCT(void *, p);
 TEST_BEGIN(test_atomic_p) {
-	TEST_BODY(p, void *, uintptr_t, ptr, "p");
+	TEST_BODY(void *, p);
 }
 TEST_END
-TEST_STRUCT(zu, size_t)
+TEST_STRUCT(size_t, zu);
 TEST_BEGIN(test_atomic_zu) {
-	TEST_BODY(zu, size_t, size_t, zu, "#zx");
+	INTEGER_TEST_BODY(size_t, zu);
 }
 TEST_END
-TEST_STRUCT(u, unsigned)
+TEST_STRUCT(unsigned, u);
 TEST_BEGIN(test_atomic_u) {
-	TEST_BODY(u, unsigned, unsigned, u, "#x");
+	INTEGER_TEST_BODY(unsigned, u);
 }
 TEST_END
		`@@ -1,2 +0,0 @@`
			`#define JEMALLOC_ATOMIC_C_`
			`#include "jemalloc/internal/jemalloc_internal.h"`