From d4ac7582f32f506d5203bea2f0115076202add38 Mon Sep 17 00:00:00 2001
From: David Goldblatt <davidgoldblatt@fb.com>
Date: Wed, 25 Jan 2017 09:54:27 -0800
Subject: [PATCH] Introduce a backport of C11 atomics

This introduces a backport of C11 atomics.  It has four implementations; ranked
in order of preference, they are:
- GCC/Clang __atomic builtins
- GCC/Clang __sync builtins
- MSVC _Interlocked builtins
- C11 atomics, from <stdatomic.h>

The primary advantages are:
- Close adherence to the standard API gives us a defined memory model.
- Type safety: atomic objects are now separate types from non-atomic ones, so
  that it's impossible to mix up atomic and non-atomic updates (which is
  undefined behavior that compilers are starting to take advantage of).
- Efficiency: we can specify ordering for operations, avoiding fences and
  atomic operations on strongly ordered architectures (example:
  `atomic_write_u32(ptr, val);` involves a CAS loop, whereas
  `atomic_store(ptr, val, ATOMIC_RELEASE);` is a plain store).

This diff leaves in the current atomics API (implementing them in terms of the
backport).  This lets us transition uses over piecemeal.

Testing:
This is by nature hard to test. I've manually tested the first three options on
Linux on gcc by futzing with the #defines manually, on freebsd with gcc and
clang, on MSVC, and on OS X with clang.  All of these were x86 machines though,
and we don't have any test infrastructure set up for non-x86 platforms.
---
 Makefile.in                                   |   1 -
 configure.ac                                  |  52 +-
 include/jemalloc/internal/atomic.h            | 111 ++++
 include/jemalloc/internal/atomic_c11.h        |  97 ++++
 include/jemalloc/internal/atomic_externs.h    |  12 -
 include/jemalloc/internal/atomic_gcc_atomic.h | 125 +++++
 include/jemalloc/internal/atomic_gcc_sync.h   | 191 +++++++
 include/jemalloc/internal/atomic_inlines.h    | 525 ------------------
 include/jemalloc/internal/atomic_msvc.h       | 158 ++++++
 include/jemalloc/internal/atomic_types.h      |   8 -
 .../jemalloc/internal/jemalloc_internal.h.in  |  22 +-
 .../internal/jemalloc_internal_defs.h.in      |  13 +-
 include/jemalloc/internal/private_symbols.txt |  20 -
 src/atomic.c                                  |   2 -
 test/unit/atomic.c                            | 282 +++++++---
 15 files changed, 947 insertions(+), 672 deletions(-)
 create mode 100644 include/jemalloc/internal/atomic.h
 create mode 100644 include/jemalloc/internal/atomic_c11.h
 delete mode 100644 include/jemalloc/internal/atomic_externs.h
 create mode 100644 include/jemalloc/internal/atomic_gcc_atomic.h
 create mode 100644 include/jemalloc/internal/atomic_gcc_sync.h
 delete mode 100644 include/jemalloc/internal/atomic_inlines.h
 create mode 100644 include/jemalloc/internal/atomic_msvc.h
 delete mode 100644 include/jemalloc/internal/atomic_types.h
 delete mode 100644 src/atomic.c

diff --git a/Makefile.in b/Makefile.in
index 76a73b76..53ebe32e 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -90,7 +90,6 @@ BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/je
 C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h
 C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/arena.c \
-	$(srcroot)src/atomic.c \
 	$(srcroot)src/base.c \
 	$(srcroot)src/bitmap.c \
 	$(srcroot)src/ckh.c \
diff --git a/configure.ac b/configure.ac
index 1653fe7f..0095caf1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -550,7 +550,7 @@ case "${host}" in
 	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H])
 	AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ])
 	AC_DEFINE([JEMALLOC_THREADED_INIT], [ ])
-	AC_DEFINE([JEMALLOC_C11ATOMICS])
+	AC_DEFINE([JEMALLOC_C11_ATOMICS])
 	force_tls="0"
 	default_munmap="0"
 	;;
@@ -1730,36 +1730,44 @@ JE_COMPILABLE([C11 atomics], [
     volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
     uint64_t r = atomic_fetch_add(a, x) + x;
     return r == 0;
-], [je_cv_c11atomics])
-if test "x${je_cv_c11atomics}" = "xyes" ; then
-  AC_DEFINE([JEMALLOC_C11ATOMICS])
+], [je_cv_c11_atomics])
+if test "x${je_cv_c11_atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_C11_ATOMICS])
 fi
 
 dnl ============================================================================
-dnl Check for atomic(9) operations as provided on FreeBSD.
+dnl Check for GCC-style __atomic atomics.
 
-JE_COMPILABLE([atomic(9)], [
-#include <sys/types.h>
-#include <machine/atomic.h>
-#include <inttypes.h>
+JE_COMPILABLE([GCC __atomic atomics], [
 ], [
-	{
-		uint32_t x32 = 0;
-		volatile uint32_t *x32p = &x32;
-		atomic_fetchadd_32(x32p, 1);
-	}
-	{
-		unsigned long xlong = 0;
-		volatile unsigned long *xlongp = &xlong;
-		atomic_fetchadd_long(xlongp, 1);
-	}
-], [je_cv_atomic9])
-if test "x${je_cv_atomic9}" = "xyes" ; then
-  AC_DEFINE([JEMALLOC_ATOMIC9])
+    int x = 0;
+    int val = 1;
+    int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED);
+    int after_add = x;
+    return after_add == 1;
+], [je_cv_gcc_atomic_atomics])
+if test "x${je_cv_gcc_atomic_atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_GCC_ATOMIC_ATOMICS])
+fi
+
+dnl ============================================================================
+dnl Check for GCC-style __sync atomics.
+
+JE_COMPILABLE([GCC __sync atomics], [
+], [
+    int x = 0;
+    int before_add = __sync_fetch_and_add(&x, 1);
+    int after_add = x;
+    return (before_add == 0) && (after_add == 1);
+], [je_cv_gcc_sync_atomics])
+if test "x${je_cv_gcc_sync_atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_GCC_SYNC_ATOMICS])
 fi
 
 dnl ============================================================================
 dnl Check for atomic(3) operations as provided on Darwin.
+dnl We need this not for the atomic operations (which are provided above), but
+dnl rather for the OSSpinLock type it exposes.
 
 JE_COMPILABLE([Darwin OSAtomic*()], [
 #include <libkern/OSAtomic.h>
diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h
new file mode 100644
index 00000000..84fbbdfb
--- /dev/null
+++ b/include/jemalloc/internal/atomic.h
@@ -0,0 +1,111 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_H
+#define JEMALLOC_INTERNAL_ATOMIC_H
+
+#define ATOMIC_INLINE static inline
+
+#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
+#  include "jemalloc/internal/atomic_gcc_atomic.h"
+#elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
+#  include "jemalloc/internal/atomic_gcc_sync.h"
+#elif defined(_MSC_VER)
+#  include "jemalloc/internal/atomic_msvc.h"
+#elif defined(JEMALLOC_C11_ATOMICS)
+#  include "jemalloc/internal/atomic_c11.h"
+#else
+#  error "Don't have atomics implemented on this platform."
+#endif
+
+/*
+ * This header gives more or less a backport of C11 atomics. The user can write
+ * JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate
+ * counterparts of the C11 atomic functions for type, as so:
+ *   JEMALLOC_GENERATE_ATOMICS(int *, pi, 3);
+ * and then write things like:
+ *   int *some_ptr;
+ *   atomic_pi_t atomic_ptr_to_int;
+ *   atomic_store_pi(&atomic_ptr_to_int, some_ptr, ATOMIC_RELAXED);
+ *   int *prev = atomic_exchange_pi(&atomic_ptr_to_int, NULL, ATOMIC_ACQ_REL);
+ *   assert(some_ptr == prev);
+ * and expect things to work in the obvious way.
+ *
+ * Also included (with naming differences to avoid conflicts with the standard
+ * library):
+ *   atomic_fence(atomic_memory_order_t) (mimics C11's atomic_thread_fence).
+ *   ATOMIC_INIT (mimics C11's ATOMIC_VAR_INIT).
+ */
+
+/*
+ * Shorthand for the atomic_memory_order_* enumerators.  Each macro expands to
+ * a bare enumerator (no trailing punctuation) so it works as any argument.
+ */
+#define ATOMIC_RELAXED atomic_memory_order_relaxed
+#define ATOMIC_ACQUIRE atomic_memory_order_acquire
+#define ATOMIC_RELEASE atomic_memory_order_release
+#define ATOMIC_ACQ_REL atomic_memory_order_acq_rel
+#define ATOMIC_SEQ_CST atomic_memory_order_seq_cst
+
+/*
+ * In order to let us transition atomics usage piecemeal (and reason locally
+ * about memory orders), we'll support the previous API for a while.
+ */
+#define JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(type, short_type)	\
+ATOMIC_INLINE type							\
+atomic_read_##short_type(type *p) {					\
+	return atomic_load_##short_type ((atomic_##short_type##_t *)p,	\
+	    ATOMIC_SEQ_CST);						\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_write_##short_type(type *p, const type val) {			\
+	atomic_store_##short_type((atomic_##short_type##_t *)p,		\
+	    (type)val, ATOMIC_SEQ_CST);					\
+}									\
+ATOMIC_INLINE bool							\
+atomic_cas_##short_type(type *p, type c, type s) {			\
+	/* Note the '!': legacy atomic_cas returns true on failure. */	\
+	return !atomic_compare_exchange_strong_##short_type(		\
+	    (atomic_##short_type##_t *)p, &c, s, ATOMIC_SEQ_CST,	\
+	    ATOMIC_SEQ_CST);						\
+}
+
+#define JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(type, short_type)	\
+JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(type, short_type)		\
+									\
+ATOMIC_INLINE type							\
+atomic_add_##short_type(type *p, type x) {				\
+	return atomic_fetch_add_##short_type(				\
+	    (atomic_##short_type##_t *)p, x, ATOMIC_SEQ_CST) + x;	\
+}									\
+ATOMIC_INLINE type							\
+atomic_sub_##short_type(type *p, type x) {				\
+	return atomic_fetch_sub_##short_type(				\
+	    (atomic_##short_type##_t *)p, x, ATOMIC_SEQ_CST) - x;	\
+}
+
+JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR)
+JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(void *, p)
+
+/*
+ * There's no actual guarantee that sizeof(bool) == 1, but it's true on the only
+ * platform that actually needs to know the size, MSVC.
+ */
+JEMALLOC_GENERATE_ATOMICS(bool, b, 0)
+JEMALLOC_GENERATE_COMPATABILITY_ATOMICS(bool, b)
+
+JEMALLOC_GENERATE_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT)
+JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(unsigned, u)
+
+JEMALLOC_GENERATE_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)
+JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(size_t, zu)
+
+JEMALLOC_GENERATE_INT_ATOMICS(uint32_t, u32, 2)
+JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(uint32_t, u32)
+
+#  if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
+JEMALLOC_GENERATE_INT_ATOMICS(uint64_t, u64, 3)
+JEMALLOC_GENERATE_COMPATABILITY_INT_ATOMICS(uint64_t, u64)
+#  endif
+
+#undef ATOMIC_INLINE
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_H */
diff --git a/include/jemalloc/internal/atomic_c11.h b/include/jemalloc/internal/atomic_c11.h
new file mode 100644
index 00000000..a5f9313a
--- /dev/null
+++ b/include/jemalloc/internal/atomic_c11.h
@@ -0,0 +1,97 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H
+#define JEMALLOC_INTERNAL_ATOMIC_C11_H
+
+#include <stdatomic.h>
+
+#define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__)
+
+#define atomic_memory_order_t memory_order
+#define atomic_memory_order_relaxed memory_order_relaxed
+#define atomic_memory_order_acquire memory_order_acquire
+#define atomic_memory_order_release memory_order_release
+#define atomic_memory_order_acq_rel memory_order_acq_rel
+#define atomic_memory_order_seq_cst memory_order_seq_cst
+
+#define atomic_fence atomic_thread_fence
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef _Atomic(type) atomic_##short_type##_t;				\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	/*								\
+	 * A strict interpretation of the C standard prevents		\
+	 * atomic_load from taking a const argument, but it's		\
+	 * convenient for our purposes. This cast is a workaround.	\
+	 */								\
+	atomic_##short_type##_t* a_nonconst =				\
+	    (atomic_##short_type##_t*)a;				\
+	return atomic_load_explicit(a_nonconst, mo);			\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	atomic_store_explicit(a, val, mo);				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return atomic_exchange_explicit(a, val, mo);			\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return atomic_compare_exchange_weak_explicit(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return atomic_compare_exchange_strong_explicit(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}
+
+/*
+ * Integral types have some special operations available that non-integral ones
+ * lack.
+ */
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, 		\
+    /* unused */ lg_size)						\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)	\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_add_explicit(a, val, mo);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_sub_explicit(a, val, mo);			\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_and_explicit(a, val, mo);			\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_or_explicit(a, val, mo);			\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_xor_explicit(a, val, mo);			\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */
diff --git a/include/jemalloc/internal/atomic_externs.h b/include/jemalloc/internal/atomic_externs.h
deleted file mode 100644
index 09f06408..00000000
--- a/include/jemalloc/internal/atomic_externs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_ATOMIC_EXTERNS_H
-#define JEMALLOC_INTERNAL_ATOMIC_EXTERNS_H
-
-#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
-#define atomic_read_u64(p)	atomic_add_u64(p, 0)
-#endif
-#define atomic_read_u32(p)	atomic_add_u32(p, 0)
-#define atomic_read_p(p)	atomic_add_p(p, NULL)
-#define atomic_read_zu(p)	atomic_add_zu(p, 0)
-#define atomic_read_u(p)	atomic_add_u(p, 0)
-
-#endif /* JEMALLOC_INTERNAL_ATOMIC_EXTERNS_H */
diff --git a/include/jemalloc/internal/atomic_gcc_atomic.h b/include/jemalloc/internal/atomic_gcc_atomic.h
new file mode 100644
index 00000000..3d13b4a6
--- /dev/null
+++ b/include/jemalloc/internal/atomic_gcc_atomic.h
@@ -0,0 +1,125 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
+#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
+
+#define ATOMIC_INIT(...) {__VA_ARGS__}
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+ATOMIC_INLINE int
+atomic_enum_to_builtin(atomic_memory_order_t mo) {
+	switch (mo) {
+	case atomic_memory_order_relaxed:
+		return __ATOMIC_RELAXED;
+	case atomic_memory_order_acquire:
+		return __ATOMIC_ACQUIRE;
+	case atomic_memory_order_release:
+		return __ATOMIC_RELEASE;
+	case atomic_memory_order_acq_rel:
+		return __ATOMIC_ACQ_REL;
+	case atomic_memory_order_seq_cst:
+		return __ATOMIC_SEQ_CST;
+	}
+	/* Can't actually happen; the switch is exhaustive. */
+	return __ATOMIC_SEQ_CST;
+}
+
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	__atomic_thread_fence(atomic_enum_to_builtin(mo));
+}
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef struct {							\
+	type repr;							\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	type result;							\
+	__atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo));	\
+	return result;							\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a, type val,		\
+    atomic_memory_order_t mo) {						\
+	__atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo));	\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	type result;							\
+	__atomic_exchange(&a->repr, &val, &result,			\
+	    atomic_enum_to_builtin(mo));				\
+	return result;							\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return __atomic_compare_exchange(&a->repr, expected, &desired,	\
+	    true, atomic_enum_to_builtin(success_mo),			\
+	    atomic_enum_to_builtin(failure_mo));			\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return __atomic_compare_exchange(&a->repr, expected, &desired,	\
+	    false,							\
+	    atomic_enum_to_builtin(success_mo),				\
+	    atomic_enum_to_builtin(failure_mo));			\
+}
+
+
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)	\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_add(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_sub(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_and(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_or(&a->repr, val,				\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_xor(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */
diff --git a/include/jemalloc/internal/atomic_gcc_sync.h b/include/jemalloc/internal/atomic_gcc_sync.h
new file mode 100644
index 00000000..30846e4d
--- /dev/null
+++ b/include/jemalloc/internal/atomic_gcc_sync.h
@@ -0,0 +1,191 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
+#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
+
+#define ATOMIC_INIT(...) {__VA_ARGS__}
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	/* Easy cases first: no barrier, and full barrier. */
+	if (mo == atomic_memory_order_relaxed) {
+		asm volatile("" ::: "memory");
+		return;
+	}
+	if (mo == atomic_memory_order_seq_cst) {
+		asm volatile("" ::: "memory");
+		__sync_synchronize();
+		asm volatile("" ::: "memory");
+		return;
+	}
+	asm volatile("" ::: "memory");
+#  if defined(__i386__) || defined(__x86_64__)
+	/* This is implicit on x86. */
+#  elif defined(__ppc__)
+	asm volatile("lwsync");
+#  elif defined(__sparc__) && defined(__arch64__)
+	if (mo == atomic_memory_order_acquire) {
+		asm volatile("membar #LoadLoad | #LoadStore");
+	} else if (mo == atomic_memory_order_release) {
+		asm volatile("membar #LoadStore | #StoreStore");
+	} else {
+		asm volatile("membar #LoadLoad | #LoadStore | #StoreStore");
+	}
+#  else
+	__sync_synchronize();
+#  endif
+	asm volatile("" ::: "memory");
+}
+
+/*
+ * A correct implementation of seq_cst loads and stores on weakly ordered
+ * architectures could do either of the following:
+ *   1. store() is weak-fence -> store -> strong fence, load() is load ->
+ *      strong-fence.
+ *   2. store() is strong-fence -> store, load() is strong-fence -> load ->
+ *      weak-fence.
+ * The tricky thing is, load() and store() above can be the load or store
+ * portions of a gcc __sync builtin, so we have to follow GCC's lead, which
+ * means going with strategy 2.
+ * On strongly ordered architectures, the natural strategy is to stick a strong
+ * fence after seq_cst stores, and have naked loads.  So we want the strong
+ * fences in different places on different architectures; the two helpers
+ * below (atomic_pre_sc_load_fence, atomic_post_sc_store_fence) do that.
+ */
+
+/* Run before a seq_cst load; a full fence except on x86 and 64-bit sparc. */
+ATOMIC_INLINE void
+atomic_pre_sc_load_fence(void) {
+#  if defined(__i386__) || defined(__x86_64__) ||			\
+    (defined(__sparc__) && defined(__arch64__))
+	atomic_fence(atomic_memory_order_relaxed);
+#  else
+	atomic_fence(atomic_memory_order_seq_cst);
+#  endif
+}
+
+/* Run after a seq_cst store; a full fence only on x86 and 64-bit sparc. */
+ATOMIC_INLINE void
+atomic_post_sc_store_fence(void) {
+#  if defined(__i386__) || defined(__x86_64__) ||			\
+    (defined(__sparc__) && defined(__arch64__))
+	atomic_fence(atomic_memory_order_seq_cst);
+#  else
+	atomic_fence(atomic_memory_order_relaxed);
+#  endif
+}
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef struct {							\
+	type volatile repr;						\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_pre_sc_load_fence();				\
+	}								\
+	type result = a->repr;						\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_acquire);		\
+	}								\
+	return result;							\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_release);		\
+	}								\
+	a->repr = val;							\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_post_sc_store_fence();				\
+	}								\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	/*								\
+	 * Because of FreeBSD, we care about gcc 4.2, which doesn't have\
+	 * an atomic exchange builtin.  We fake it with a CAS loop.	\
+	 */								\
+	while (true) {							\
+		type old = a->repr;					\
+		if (__sync_bool_compare_and_swap(&a->repr, old, val)) {	\
+			return old;					\
+		}							\
+	}								\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
+	    desired);							\
+	if (prev == *expected) {					\
+		return true;						\
+	} else {							\
+		*expected = prev;					\
+		return false;						\
+	}								\
+}									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
+	    desired);							\
+	if (prev == *expected) {					\
+		return true;						\
+	} else {							\
+		*expected = prev;					\
+		return false;						\
+	}								\
+}
+
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)	\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_add(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_sub(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_and(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_or(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_xor(&a->repr, val);			\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */
diff --git a/include/jemalloc/internal/atomic_inlines.h b/include/jemalloc/internal/atomic_inlines.h
deleted file mode 100644
index de66d57d..00000000
--- a/include/jemalloc/internal/atomic_inlines.h
+++ /dev/null
@@ -1,525 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_ATOMIC_INLINES_H
-#define JEMALLOC_INTERNAL_ATOMIC_INLINES_H
-
-/*
- * All arithmetic functions return the arithmetic result of the atomic
- * operation.  Some atomic operation APIs return the value prior to mutation, in
- * which case the following functions must redundantly compute the result so
- * that it can be returned.  These functions are normally inlined, so the extra
- * operations can be optimized away if the return values aren't used by the
- * callers.
- *
- *   <t> atomic_read_<t>(<t> *p) { return *p; }
- *   <t> atomic_add_<t>(<t> *p, <t> x) { return *p += x; }
- *   <t> atomic_sub_<t>(<t> *p, <t> x) { return *p -= x; }
- *   bool atomic_cas_<t>(<t> *p, <t> c, <t> s)
- *   {
- *     if (*p != c)
- *       return true;
- *     *p = s;
- *     return false;
- *   }
- *   void atomic_write_<t>(<t> *p, <t> x) { *p = x; }
- */
-
-#ifndef JEMALLOC_ENABLE_INLINE
-#  ifdef JEMALLOC_ATOMIC_U64
-uint64_t	atomic_add_u64(uint64_t *p, uint64_t x);
-uint64_t	atomic_sub_u64(uint64_t *p, uint64_t x);
-bool	atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s);
-void	atomic_write_u64(uint64_t *p, uint64_t x);
-#  endif
-uint32_t	atomic_add_u32(uint32_t *p, uint32_t x);
-uint32_t	atomic_sub_u32(uint32_t *p, uint32_t x);
-bool	atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s);
-void	atomic_write_u32(uint32_t *p, uint32_t x);
-void	*atomic_add_p(void **p, void *x);
-void	*atomic_sub_p(void **p, void *x);
-bool	atomic_cas_p(void **p, void *c, void *s);
-void	atomic_write_p(void **p, const void *x);
-size_t	atomic_add_zu(size_t *p, size_t x);
-size_t	atomic_sub_zu(size_t *p, size_t x);
-bool	atomic_cas_zu(size_t *p, size_t c, size_t s);
-void	atomic_write_zu(size_t *p, size_t x);
-unsigned	atomic_add_u(unsigned *p, unsigned x);
-unsigned	atomic_sub_u(unsigned *p, unsigned x);
-bool	atomic_cas_u(unsigned *p, unsigned c, unsigned s);
-void	atomic_write_u(unsigned *p, unsigned x);
-#endif
-
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
-/******************************************************************************/
-/* 64-bit operations. */
-#ifdef JEMALLOC_ATOMIC_U64
-#  if (defined(__amd64__) || defined(__x86_64__))
-JEMALLOC_INLINE uint64_t
-atomic_add_u64(uint64_t *p, uint64_t x) {
-	uint64_t t = x;
-
-	asm volatile (
-	    "lock; xaddq %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-
-	return t + x;
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_u64(uint64_t *p, uint64_t x) {
-	uint64_t t;
-
-	x = (uint64_t)(-(int64_t)x);
-	t = x;
-	asm volatile (
-	    "lock; xaddq %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-
-	return t + x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
-	uint8_t success;
-
-	asm volatile (
-	    "lock; cmpxchgq %4, %0;"
-	    "sete %1;"
-	    : "=m" (*p), "=a" (success) /* Outputs. */
-	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
-	    : "memory" /* Clobbers. */
-	    );
-
-	return !(bool)success;
-}
-
-JEMALLOC_INLINE void
-atomic_write_u64(uint64_t *p, uint64_t x) {
-	asm volatile (
-	    "xchgq %1, %0;" /* Lock is implied by xchgq. */
-	    : "=m" (*p), "+r" (x) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    : "memory" /* Clobbers. */
-	    );
-}
-#  elif (defined(JEMALLOC_C11ATOMICS))
-JEMALLOC_INLINE uint64_t
-atomic_add_u64(uint64_t *p, uint64_t x) {
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	return atomic_fetch_add(a, x) + x;
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_u64(uint64_t *p, uint64_t x) {
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	return atomic_fetch_sub(a, x) - x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	return !atomic_compare_exchange_strong(a, &c, s);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u64(uint64_t *p, uint64_t x) {
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	atomic_store(a, x);
-}
-#  elif (defined(JEMALLOC_ATOMIC9))
-JEMALLOC_INLINE uint64_t
-atomic_add_u64(uint64_t *p, uint64_t x) {
-	/*
-	 * atomic_fetchadd_64() doesn't exist, but we only ever use this
-	 * function on LP64 systems, so atomic_fetchadd_long() will do.
-	 */
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return atomic_fetchadd_long(p, (unsigned long)x) + x;
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_u64(uint64_t *p, uint64_t x) {
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return !atomic_cmpset_long(p, (unsigned long)c, (unsigned long)s);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u64(uint64_t *p, uint64_t x) {
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	atomic_store_rel_long(p, x);
-}
-#  elif (defined(JEMALLOC_OSATOMIC))
-JEMALLOC_INLINE uint64_t
-atomic_add_u64(uint64_t *p, uint64_t x) {
-	return OSAtomicAdd64((int64_t)x, (int64_t *)p);
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_u64(uint64_t *p, uint64_t x) {
-	return OSAtomicAdd64(-((int64_t)x), (int64_t *)p);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
-	return !OSAtomicCompareAndSwap64(c, s, (int64_t *)p);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u64(uint64_t *p, uint64_t x) {
-	uint64_t o;
-
-	/*The documented OSAtomic*() API does not expose an atomic exchange. */
-	do {
-		o = atomic_read_u64(p);
-	} while (atomic_cas_u64(p, o, x));
-}
-#  elif (defined(_MSC_VER))
-JEMALLOC_INLINE uint64_t
-atomic_add_u64(uint64_t *p, uint64_t x) {
-	return InterlockedExchangeAdd64(p, x) + x;
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_u64(uint64_t *p, uint64_t x) {
-	return InterlockedExchangeAdd64(p, -((int64_t)x)) - x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
-	uint64_t o;
-
-	o = InterlockedCompareExchange64(p, s, c);
-	return o != c;
-}
-
-JEMALLOC_INLINE void
-atomic_write_u64(uint64_t *p, uint64_t x) {
-	InterlockedExchange64(p, x);
-}
-#  elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \
-    defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
-JEMALLOC_INLINE uint64_t
-atomic_add_u64(uint64_t *p, uint64_t x) {
-	return __sync_add_and_fetch(p, x);
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_u64(uint64_t *p, uint64_t x) {
-	return __sync_sub_and_fetch(p, x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s) {
-	return !__sync_bool_compare_and_swap(p, c, s);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u64(uint64_t *p, uint64_t x) {
-	__sync_lock_test_and_set(p, x);
-}
-#  else
-#    error "Missing implementation for 64-bit atomic operations"
-#  endif
-#endif
-
-/******************************************************************************/
-/* 32-bit operations. */
-#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-JEMALLOC_INLINE uint32_t
-atomic_add_u32(uint32_t *p, uint32_t x) {
-	uint32_t t = x;
-
-	asm volatile (
-	    "lock; xaddl %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-
-	return t + x;
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_u32(uint32_t *p, uint32_t x) {
-	uint32_t t;
-
-	x = (uint32_t)(-(int32_t)x);
-	t = x;
-	asm volatile (
-	    "lock; xaddl %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-
-	return t + x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
-	uint8_t success;
-
-	asm volatile (
-	    "lock; cmpxchgl %4, %0;"
-	    "sete %1;"
-	    : "=m" (*p), "=a" (success) /* Outputs. */
-	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
-	    : "memory"
-	    );
-
-	return !(bool)success;
-}
-
-JEMALLOC_INLINE void
-atomic_write_u32(uint32_t *p, uint32_t x) {
-	asm volatile (
-	    "xchgl %1, %0;" /* Lock is implied by xchgl. */
-	    : "=m" (*p), "+r" (x) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    : "memory" /* Clobbers. */
-	    );
-}
-#  elif (defined(JEMALLOC_C11ATOMICS))
-JEMALLOC_INLINE uint32_t
-atomic_add_u32(uint32_t *p, uint32_t x) {
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	return atomic_fetch_add(a, x) + x;
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_u32(uint32_t *p, uint32_t x) {
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	return atomic_fetch_sub(a, x) - x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	return !atomic_compare_exchange_strong(a, &c, s);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u32(uint32_t *p, uint32_t x) {
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	atomic_store(a, x);
-}
-#elif (defined(JEMALLOC_ATOMIC9))
-JEMALLOC_INLINE uint32_t
-atomic_add_u32(uint32_t *p, uint32_t x) {
-	return atomic_fetchadd_32(p, x) + x;
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_u32(uint32_t *p, uint32_t x) {
-	return atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
-	return !atomic_cmpset_32(p, c, s);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u32(uint32_t *p, uint32_t x) {
-	atomic_store_rel_32(p, x);
-}
-#elif (defined(JEMALLOC_OSATOMIC))
-JEMALLOC_INLINE uint32_t
-atomic_add_u32(uint32_t *p, uint32_t x) {
-	return OSAtomicAdd32((int32_t)x, (int32_t *)p);
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_u32(uint32_t *p, uint32_t x) {
-	return OSAtomicAdd32(-((int32_t)x), (int32_t *)p);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
-	return !OSAtomicCompareAndSwap32(c, s, (int32_t *)p);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u32(uint32_t *p, uint32_t x) {
-	uint32_t o;
-
-	/*The documented OSAtomic*() API does not expose an atomic exchange. */
-	do {
-		o = atomic_read_u32(p);
-	} while (atomic_cas_u32(p, o, x));
-}
-#elif (defined(_MSC_VER))
-JEMALLOC_INLINE uint32_t
-atomic_add_u32(uint32_t *p, uint32_t x) {
-	return InterlockedExchangeAdd(p, x) + x;
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_u32(uint32_t *p, uint32_t x) {
-	return InterlockedExchangeAdd(p, -((int32_t)x)) - x;
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
-	uint32_t o;
-
-	o = InterlockedCompareExchange(p, s, c);
-	return o != c;
-}
-
-JEMALLOC_INLINE void
-atomic_write_u32(uint32_t *p, uint32_t x) {
-	InterlockedExchange(p, x);
-}
-#elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \
- defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
-JEMALLOC_INLINE uint32_t
-atomic_add_u32(uint32_t *p, uint32_t x) {
-	return __sync_add_and_fetch(p, x);
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_u32(uint32_t *p, uint32_t x) {
-	return __sync_sub_and_fetch(p, x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u32(uint32_t *p, uint32_t c, uint32_t s) {
-	return !__sync_bool_compare_and_swap(p, c, s);
-}
-
-JEMALLOC_INLINE void
-atomic_write_u32(uint32_t *p, uint32_t x) {
-	__sync_lock_test_and_set(p, x);
-}
-#else
-#  error "Missing implementation for 32-bit atomic operations"
-#endif
-
-/******************************************************************************/
-/* Pointer operations. */
-JEMALLOC_INLINE void *
-atomic_add_p(void **p, void *x) {
-#if (LG_SIZEOF_PTR == 3)
-	return (void *)atomic_add_u64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	return (void *)atomic_add_u32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-JEMALLOC_INLINE void *
-atomic_sub_p(void **p, void *x) {
-#if (LG_SIZEOF_PTR == 3)
-	return (void *)atomic_add_u64((uint64_t *)p, (uint64_t)-((int64_t)x));
-#elif (LG_SIZEOF_PTR == 2)
-	return (void *)atomic_add_u32((uint32_t *)p, (uint32_t)-((int32_t)x));
-#endif
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_p(void **p, void *c, void *s) {
-#if (LG_SIZEOF_PTR == 3)
-	return atomic_cas_u64((uint64_t *)p, (uint64_t)c, (uint64_t)s);
-#elif (LG_SIZEOF_PTR == 2)
-	return atomic_cas_u32((uint32_t *)p, (uint32_t)c, (uint32_t)s);
-#endif
-}
-
-JEMALLOC_INLINE void
-atomic_write_p(void **p, const void *x) {
-#if (LG_SIZEOF_PTR == 3)
-	atomic_write_u64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	atomic_write_u32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-/******************************************************************************/
-/* size_t operations. */
-JEMALLOC_INLINE size_t
-atomic_add_zu(size_t *p, size_t x) {
-#if (LG_SIZEOF_PTR == 3)
-	return (size_t)atomic_add_u64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	return (size_t)atomic_add_u32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-JEMALLOC_INLINE size_t
-atomic_sub_zu(size_t *p, size_t x) {
-#if (LG_SIZEOF_PTR == 3)
-	return (size_t)atomic_add_u64((uint64_t *)p, (uint64_t)-((int64_t)x));
-#elif (LG_SIZEOF_PTR == 2)
-	return (size_t)atomic_add_u32((uint32_t *)p, (uint32_t)-((int32_t)x));
-#endif
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_zu(size_t *p, size_t c, size_t s) {
-#if (LG_SIZEOF_PTR == 3)
-	return atomic_cas_u64((uint64_t *)p, (uint64_t)c, (uint64_t)s);
-#elif (LG_SIZEOF_PTR == 2)
-	return atomic_cas_u32((uint32_t *)p, (uint32_t)c, (uint32_t)s);
-#endif
-}
-
-JEMALLOC_INLINE void
-atomic_write_zu(size_t *p, size_t x) {
-#if (LG_SIZEOF_PTR == 3)
-	atomic_write_u64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	atomic_write_u32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-/******************************************************************************/
-/* unsigned operations. */
-JEMALLOC_INLINE unsigned
-atomic_add_u(unsigned *p, unsigned x) {
-#if (LG_SIZEOF_INT == 3)
-	return (unsigned)atomic_add_u64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_INT == 2)
-	return (unsigned)atomic_add_u32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-JEMALLOC_INLINE unsigned
-atomic_sub_u(unsigned *p, unsigned x) {
-#if (LG_SIZEOF_INT == 3)
-	return (unsigned)atomic_add_u64((uint64_t *)p, (uint64_t)-((int64_t)x));
-#elif (LG_SIZEOF_INT == 2)
-	return (unsigned)atomic_add_u32((uint32_t *)p, (uint32_t)-((int32_t)x));
-#endif
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u(unsigned *p, unsigned c, unsigned s) {
-#if (LG_SIZEOF_INT == 3)
-	return atomic_cas_u64((uint64_t *)p, (uint64_t)c, (uint64_t)s);
-#elif (LG_SIZEOF_INT == 2)
-	return atomic_cas_u32((uint32_t *)p, (uint32_t)c, (uint32_t)s);
-#endif
-}
-
-JEMALLOC_INLINE void
-atomic_write_u(unsigned *p, unsigned x) {
-#if (LG_SIZEOF_INT == 3)
-	atomic_write_u64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_INT == 2)
-	atomic_write_u32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-/******************************************************************************/
-#endif
-#endif /* JEMALLOC_INTERNAL_ATOMIC_INLINES_H */
diff --git a/include/jemalloc/internal/atomic_msvc.h b/include/jemalloc/internal/atomic_msvc.h
new file mode 100644
index 00000000..67057ce5
--- /dev/null
+++ b/include/jemalloc/internal/atomic_msvc.h
@@ -0,0 +1,158 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H
+#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H
+
+#define ATOMIC_INIT(...) {__VA_ARGS__}
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+typedef char atomic_repr_0_t;
+typedef short atomic_repr_1_t;
+typedef long atomic_repr_2_t;
+typedef __int64 atomic_repr_3_t;
+
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	_ReadWriteBarrier();	/* Compiler barrier ahead of any CPU fence. */
+#  if defined(_M_ARM) || defined(_M_ARM64)
+	/* ARM: emit a hardware barrier for every order except relaxed. */
+	if (mo != atomic_memory_order_relaxed) {
+		MemoryBarrier();
+	}
+#  elif defined(_M_IX86) || defined (_M_X64)
+	/* x86: emit a hardware barrier only for seq_cst. */
+	if (mo == atomic_memory_order_seq_cst) {
+		MemoryBarrier();
+	}
+#  else
+#  error "Don't know how to create atomics for this platform for MSVC."
+#  endif
+	_ReadWriteBarrier();	/* Compiler barrier after any CPU fence. */
+}
+
+#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_ ## lg_size ## _t
+
+#define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b)
+#define ATOMIC_RAW_CONCAT(a, b) a ## b
+
+#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) ATOMIC_CONCAT(	\
+    base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size))
+
+#define ATOMIC_INTERLOCKED_SUFFIX(lg_size)				\
+    ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size)
+
+#define ATOMIC_INTERLOCKED_SUFFIX_0 8
+#define ATOMIC_INTERLOCKED_SUFFIX_1 16
+#define ATOMIC_INTERLOCKED_SUFFIX_2
+#define ATOMIC_INTERLOCKED_SUFFIX_3 64
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)		\
+typedef struct {							\
+	ATOMIC_INTERLOCKED_REPR(lg_size) repr;				\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr;			\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_acquire);		\
+	}								\
+	return (type) ret;						\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_release);		\
+	}								\
+	a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size)) val;		\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_fence(atomic_memory_order_seq_cst);		\
+	}								\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange,	\
+	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	ATOMIC_INTERLOCKED_REPR(lg_size) e =				\
+	    (ATOMIC_INTERLOCKED_REPR(lg_size))*expected;		\
+	ATOMIC_INTERLOCKED_REPR(lg_size) d =				\
+	    (ATOMIC_INTERLOCKED_REPR(lg_size))desired;			\
+	ATOMIC_INTERLOCKED_REPR(lg_size) old =				\
+	    ATOMIC_INTERLOCKED_NAME(_InterlockedCompareExchange, 	\
+		lg_size)(&a->repr, d, e);				\
+	if (old == e) {							\
+		return true;						\
+	} else {							\
+		*expected = (type)old;					\
+		return false;						\
+	}								\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	/* We implement the weak version with strong semantics. */	\
+	return atomic_compare_exchange_weak_##short_type(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}
+
+
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchangeAdd,	\
+	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	/*								\
+	 * MSVC warns on negation of unsigned operands, but for us it	\
+	 * gives exactly the right semantics (MAX_TYPE + 1 - operand).	\
+	 */								\
+	__pragma(warning(push))						\
+	__pragma(warning(disable: 4146))				\
+	return atomic_fetch_add_##short_type(a, -val, mo);		\
+	__pragma(warning(pop))						\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedAnd, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedOr, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */
diff --git a/include/jemalloc/internal/atomic_types.h b/include/jemalloc/internal/atomic_types.h
deleted file mode 100644
index 0fd5e5b5..00000000
--- a/include/jemalloc/internal/atomic_types.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_ATOMIC_TYPES_H
-#define JEMALLOC_INTERNAL_ATOMIC_TYPES_H
-
-#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
-#  define JEMALLOC_ATOMIC_U64
-#endif
-
-#endif /* JEMALLOC_INTERNAL_ATOMIC_TYPES_H */
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 0d0440b5..f18acabb 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -146,14 +146,6 @@ static const bool have_thp =
 #endif
     ;
 
-#if defined(JEMALLOC_C11ATOMICS) && !defined(__cplusplus)
-#include <stdatomic.h>
-#endif
-
-#ifdef JEMALLOC_ATOMIC9
-#include <machine/atomic.h>
-#endif
-
 #if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
 #include <libkern/OSAtomic.h>
 #endif
@@ -199,10 +191,21 @@ static const bool have_thp =
  * its translation unit).  Each component is now broken up into multiple header
  * files, corresponding to the sections above (e.g. instead of "tsd.h", we now
  * have "tsd_types.h", "tsd_structs.h", "tsd_externs.h", "tsd_inlines.h").
+ *
+ * Those files which have been converted to explicitly include their
+ * inter-component dependencies are now in the initial HERMETIC HEADERS
+ * section.  These headers may still rely on this file for system headers and
+ * global jemalloc headers, however.
  */
 
 #include "jemalloc/internal/jemalloc_internal_macros.h"
 
+/******************************************************************************/
+/* HERMETIC HEADERS */
+/******************************************************************************/
+
+#include "jemalloc/internal/atomic.h"
+
 /******************************************************************************/
 /* TYPES */
 /******************************************************************************/
@@ -380,7 +383,6 @@ typedef unsigned szind_t;
 
 #include "jemalloc/internal/nstime_types.h"
 #include "jemalloc/internal/util_types.h"
-#include "jemalloc/internal/atomic_types.h"
 #include "jemalloc/internal/spin_types.h"
 #include "jemalloc/internal/prng_types.h"
 #include "jemalloc/internal/ticker_types.h"
@@ -489,7 +491,6 @@ void	jemalloc_postfork_child(void);
 
 #include "jemalloc/internal/nstime_externs.h"
 #include "jemalloc/internal/util_externs.h"
-#include "jemalloc/internal/atomic_externs.h"
 #include "jemalloc/internal/ckh_externs.h"
 #include "jemalloc/internal/stats_externs.h"
 #include "jemalloc/internal/ctl_externs.h"
@@ -513,7 +514,6 @@ void	jemalloc_postfork_child(void);
 /******************************************************************************/
 
 #include "jemalloc/internal/util_inlines.h"
-#include "jemalloc/internal/atomic_inlines.h"
 #include "jemalloc/internal/spin_inlines.h"
 #include "jemalloc/internal/prng_inlines.h"
 #include "jemalloc/internal/ticker_inlines.h"
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index 6c70e167..b2e0077e 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -30,16 +30,13 @@
 #undef LG_VADDR
 
 /* Defined if C11 atomics are available. */
-#undef JEMALLOC_C11ATOMICS
+#undef JEMALLOC_C11_ATOMICS
 
-/* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */
-#undef JEMALLOC_ATOMIC9
+/* Defined if GCC __atomic atomics are available. */
+#undef JEMALLOC_GCC_ATOMIC_ATOMICS
 
-/*
- * Defined if OSAtomic*() functions are available, as provided by Darwin, and
- * documented in the atomic(3) manual page.
- */
-#undef JEMALLOC_OSATOMIC
+/* Defined if GCC __sync atomics are available. */
+#undef JEMALLOC_GCC_SYNC_ATOMICS
 
 /*
  * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 0234181e..b122dae6 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -72,26 +72,6 @@ arena_tdata_get
 arena_tdata_get_hard
 arenas
 arenas_tdata_cleanup
-atomic_add_p
-atomic_add_u
-atomic_add_u32
-atomic_add_u64
-atomic_add_zu
-atomic_cas_p
-atomic_cas_u
-atomic_cas_u32
-atomic_cas_u64
-atomic_cas_zu
-atomic_sub_p
-atomic_sub_u
-atomic_sub_u32
-atomic_sub_u64
-atomic_sub_zu
-atomic_write_p
-atomic_write_u
-atomic_write_u32
-atomic_write_u64
-atomic_write_zu
 b0get
 base_alloc
 base_boot
diff --git a/src/atomic.c b/src/atomic.c
deleted file mode 100644
index 9871390d..00000000
--- a/src/atomic.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define JEMALLOC_ATOMIC_C_
-#include "jemalloc/internal/jemalloc_internal.h"
diff --git a/test/unit/atomic.c b/test/unit/atomic.c
index 78661597..237c7474 100644
--- a/test/unit/atomic.c
+++ b/test/unit/atomic.c
@@ -1,101 +1,257 @@
 #include "test/jemalloc_test.h"
 
-#define TEST_STRUCT(p, t)						\
-struct p##_test_s {							\
-	t	accum0;							\
-	t	x;							\
-	t	s;							\
-};									\
-typedef struct p##_test_s p##_test_t;
+/*
+ * We *almost* have consistent short names (e.g. "u32" for uint32_t, "b" for
+ * bool, etc.).  The one exception is that the short name for void * is "p" in
+ * some places and "ptr" in others.  In the long run it would be nice to unify
+ * these, but in the short run we'll use this shim.
+ */
+#define assert_p_eq assert_ptr_eq
 
-#define TEST_BODY(p, t, tc, ta, FMT) do {				\
-	const p##_test_t tests[] = {					\
-		{(t)-1, (t)-1, (t)-2},					\
-		{(t)-1, (t) 0, (t)-2},					\
-		{(t)-1, (t) 1, (t)-2},					\
+/*
+ * t: the non-atomic type, like "uint32_t".
+ * ta: the short name for the type, like "u32".
+ * val[1,2,3]: Values of the given type.  The CAS tests use val2 for expected,
+ * and val3 for desired.
+ */
+
+#define DO_TESTS(t, ta, val1, val2, val3) do {				\
+	t val;								\
+	t raw_atomic;							\
+	t expected;							\
+	bool success;							\
+	/* This (along with the load below) also tests ATOMIC_INIT. */	\
+	atomic_##ta##_t atom = ATOMIC_INIT(val1);			\
 									\
-		{(t) 0, (t)-1, (t)-2},					\
-		{(t) 0, (t) 0, (t)-2},					\
-		{(t) 0, (t) 1, (t)-2},					\
+	/* ATOMIC_INIT and load. */					\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val1, val, "Load or init failed");		\
 									\
-		{(t) 1, (t)-1, (t)-2},					\
-		{(t) 1, (t) 0, (t)-2},					\
-		{(t) 1, (t) 1, (t)-2},					\
+	/* Store. */							\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	atomic_store_##ta(&atom, val2, ATOMIC_RELAXED);			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val2, val, "Store failed");			\
 									\
-		{(t)0, (t)-(1 << 22), (t)-2},				\
-		{(t)0, (t)(1 << 22), (t)-2},				\
-		{(t)(1 << 22), (t)-(1 << 22), (t)-2},			\
-		{(t)(1 << 22), (t)(1 << 22), (t)-2}			\
-	};								\
-	unsigned i;							\
+	/* Exchange. */							\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	val = atomic_exchange_##ta(&atom, val2, ATOMIC_RELAXED);	\
+	assert_##ta##_eq(val1, val, "Exchange returned invalid value");	\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val2, val, "Exchange store invalid value");	\
 									\
-	for (i = 0; i < sizeof(tests)/sizeof(p##_test_t); i++) {	\
-		bool err;						\
-		t accum = tests[i].accum0;				\
-		assert_##ta##_eq(atomic_read_##p(&accum),		\
-		    tests[i].accum0,					\
-		    "Erroneous read, i=%u", i);				\
+	/* 								\
+	 * Weak CAS.  Spurious failures are allowed, so we loop a few	\
+	 * times.							\
+	 */								\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	success = false;						\
+	for (int i = 0; i < 10 && !success; i++) {			\
+		expected = val2;					\
+		success = atomic_compare_exchange_weak_##ta(&atom,	\
+		    &expected, val3, ATOMIC_RELAXED, ATOMIC_RELAXED);	\
+		assert_##ta##_eq(val1, expected, 			\
+		    "CAS should update expected");			\
+	}								\
+	assert_b_eq(val1 == val2, success,				\
+	    "Weak CAS did the wrong state update");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	if (success) {							\
+		assert_##ta##_eq(val3, val,				\
+		    "Successful CAS should update atomic");		\
+	} else {							\
+		assert_##ta##_eq(val1, val,				\
+		    "Unsuccessful CAS should not update atomic");	\
+	}								\
 									\
-		assert_##ta##_eq(atomic_add_##p(&accum, tests[i].x),	\
-		    (t)((tc)tests[i].accum0 + (tc)tests[i].x),		\
-		    "i=%u, accum=%"FMT", x=%"FMT,			\
-		    i, tests[i].accum0, tests[i].x);			\
-		assert_##ta##_eq(atomic_read_##p(&accum), accum,	\
-		    "Erroneous add, i=%u", i);				\
+	/* Strong CAS. */						\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	expected = val2;						\
+	success = atomic_compare_exchange_strong_##ta(&atom, &expected,	\
+	    val3, ATOMIC_RELAXED, ATOMIC_RELAXED);			\
+	assert_b_eq(val1 == val2, success,				\
+	    "Strong CAS did the wrong state update");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	if (success) {							\
+		assert_##ta##_eq(val3, val,				\
+		    "Successful CAS should update atomic");		\
+	} else {							\
+		assert_##ta##_eq(val1, val,				\
+		    "Unsuccessful CAS should not update atomic");	\
+	}								\
 									\
-		accum = tests[i].accum0;				\
-		assert_##ta##_eq(atomic_sub_##p(&accum, tests[i].x),	\
-		    (t)((tc)tests[i].accum0 - (tc)tests[i].x),		\
-		    "i=%u, accum=%"FMT", x=%"FMT,			\
-		    i, tests[i].accum0, tests[i].x);			\
-		assert_##ta##_eq(atomic_read_##p(&accum), accum,	\
-		    "Erroneous sub, i=%u", i);				\
 									\
-		accum = tests[i].accum0;				\
-		err = atomic_cas_##p(&accum, tests[i].x, tests[i].s);	\
-		assert_b_eq(err, tests[i].accum0 != tests[i].x,		\
-		    "Erroneous cas success/failure result");		\
-		assert_##ta##_eq(accum, err ? tests[i].accum0 :		\
-		    tests[i].s, "Erroneous cas effect, i=%u", i);	\
+	/* Previous atomics API. */					\
 									\
-		accum = tests[i].accum0;				\
-		atomic_write_##p(&accum, tests[i].s);			\
-		assert_##ta##_eq(accum, tests[i].s,			\
-		    "Erroneous write, i=%u", i);			\
+	/* Read. */							\
+	raw_atomic = val1;						\
+	val = atomic_read_##ta(&raw_atomic);				\
+	assert_##ta##_eq(val1, val, "Read failed");			\
+									\
+	/* Write. */							\
+	raw_atomic = val1;						\
+	atomic_write_##ta(&raw_atomic, val2);				\
+	assert_##ta##_eq(val2, raw_atomic, "Write failed");		\
+									\
+	/* CAS. */							\
+	raw_atomic = val1;						\
+	success = !atomic_cas_##ta(&raw_atomic, val2, val3);		\
+	assert_b_eq(val1 == val2, success, 				\
+	    "CAS did the wrong state update");				\
+	val = raw_atomic;						\
+	if (success) {							\
+		assert_##ta##_eq(val3, val,				\
+		    "Successful CAS should update atomic");		\
+	} else {							\
+		assert_##ta##_eq(val1, val,				\
+		    "Unsuccessful CAS should not update atomic");	\
 	}								\
 } while (0)
 
-TEST_STRUCT(u64, uint64_t)
+#define DO_INTEGER_TESTS(t, ta, val1, val2) do {			\
+	atomic_##ta##_t atom;						\
+	t val;								\
+	t raw_atomic;							\
+									\
+	/* Fetch-add. */						\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	val = atomic_fetch_add_##ta(&atom, val2, ATOMIC_RELAXED);	\
+	assert_##ta##_eq(val1, val,					\
+	    "Fetch-add should return previous value");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val1 + val2, val,				\
+	    "Fetch-add should update atomic");				\
+									\
+	/* Fetch-sub. */						\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	val = atomic_fetch_sub_##ta(&atom, val2, ATOMIC_RELAXED);	\
+	assert_##ta##_eq(val1, val,					\
+	    "Fetch-sub should return previous value");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val1 - val2, val,				\
+	    "Fetch-sub should update atomic");				\
+									\
+	/* Fetch-and. */						\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	val = atomic_fetch_and_##ta(&atom, val2, ATOMIC_RELAXED);	\
+	assert_##ta##_eq(val1, val,					\
+	    "Fetch-and should return previous value");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val1 & val2, val,				\
+	    "Fetch-and should update atomic");				\
+									\
+	/* Fetch-or. */							\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	val = atomic_fetch_or_##ta(&atom, val2, ATOMIC_RELAXED);	\
+	assert_##ta##_eq(val1, val,					\
+	    "Fetch-or should return previous value");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val1 | val2, val,				\
+	    "Fetch-or should update atomic");				\
+									\
+	/* Fetch-xor. */						\
+	atomic_store_##ta(&atom, val1, ATOMIC_RELAXED);			\
+	val = atomic_fetch_xor_##ta(&atom, val2, ATOMIC_RELAXED);	\
+	assert_##ta##_eq(val1, val,					\
+	    "Fetch-xor should return previous value");			\
+	val = atomic_load_##ta(&atom, ATOMIC_RELAXED);			\
+	assert_##ta##_eq(val1 ^ val2, val,				\
+	    "Fetch-xor should update atomic");				\
+									\
+	/* Previous atomics API: add/sub return the new value. */	\
+									\
+	/* Add. */							\
+	raw_atomic = val1;						\
+	val = atomic_add_##ta(&raw_atomic, val2);			\
+	assert_##ta##_eq(val1 + val2, val,				\
+	    "atomic_add should return new value");			\
+	assert_##ta##_eq(val1 + val2, raw_atomic,			\
+	    "atomic_add should update atomic");				\
+									\
+	/* Sub. */							\
+	raw_atomic = val1;						\
+	val = atomic_sub_##ta(&raw_atomic, val2);			\
+	assert_##ta##_eq(val1 - val2, val,				\
+	    "atomic_sub should return new value");			\
+	assert_##ta##_eq(val1 - val2, raw_atomic,			\
+	    "atomic_sub should update atomic");				\
+} while (0)
+
+#define TEST_STRUCT(t, ta)						\
+typedef struct {							\
+	t val1;								\
+	t val2;								\
+	t val3;								\
+} ta##_test_t /* No ';' here: every use site supplies its own. */
+
+#define TEST_CASES(t) {							\
+	{(t)-1, (t)-1, (t)-2},						\
+	{(t)-1, (t) 0, (t)-2},						\
+	{(t)-1, (t) 1, (t)-2},						\
+									\
+	{(t) 0, (t)-1, (t)-2},						\
+	{(t) 0, (t) 0, (t)-2},						\
+	{(t) 0, (t) 1, (t)-2},						\
+									\
+	{(t) 1, (t)-1, (t)-2},						\
+	{(t) 1, (t) 0, (t)-2},						\
+	{(t) 1, (t) 1, (t)-2},						\
+									\
+	{(t)0, (t)-(1 << 22), (t)-2},					\
+	{(t)0, (t)(1 << 22), (t)-2},					\
+	{(t)(1 << 22), (t)-(1 << 22), (t)-2},				\
+	{(t)(1 << 22), (t)(1 << 22), (t)-2}				\
+}
+
+#define TEST_BODY(t, ta) do {						\
+	const ta##_test_t tests[] = TEST_CASES(t);			\
+	for (unsigned i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) {	\
+		ta##_test_t test = tests[i];				\
+		DO_TESTS(t, ta, test.val1, test.val2, test.val3);	\
+	}								\
+} while (0)
+
+#define INTEGER_TEST_BODY(t, ta) do {					\
+	const ta##_test_t tests[] = TEST_CASES(t);			\
+	for (unsigned i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) {	\
+		ta##_test_t test = tests[i];				\
+		DO_TESTS(t, ta, test.val1, test.val2, test.val3);	\
+		DO_INTEGER_TESTS(t, ta, test.val1, test.val2);		\
+	}								\
+} while (0)
+
+TEST_STRUCT(uint64_t, u64);
 TEST_BEGIN(test_atomic_u64) {
 #if !(LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
 	test_skip("64-bit atomic operations not supported");
 #else
-	TEST_BODY(u64, uint64_t, uint64_t, u64, FMTx64);
+	INTEGER_TEST_BODY(uint64_t, u64);
 #endif
 }
 TEST_END
 
-TEST_STRUCT(u32, uint32_t)
+
+TEST_STRUCT(uint32_t, u32);
 TEST_BEGIN(test_atomic_u32) {
-	TEST_BODY(u32, uint32_t, uint32_t, u32, "#"FMTx32);
+	INTEGER_TEST_BODY(uint32_t, u32);
 }
 TEST_END
 
-TEST_STRUCT(p, void *)
+TEST_STRUCT(void *, p);
 TEST_BEGIN(test_atomic_p) {
-	TEST_BODY(p, void *, uintptr_t, ptr, "p");
+	TEST_BODY(void *, p);
 }
 TEST_END
 
-TEST_STRUCT(zu, size_t)
+TEST_STRUCT(size_t, zu);
 TEST_BEGIN(test_atomic_zu) {
-	TEST_BODY(zu, size_t, size_t, zu, "#zx");
+	INTEGER_TEST_BODY(size_t, zu);
 }
 TEST_END
 
-TEST_STRUCT(u, unsigned)
+TEST_STRUCT(unsigned, u);
 TEST_BEGIN(test_atomic_u) {
-	TEST_BODY(u, unsigned, unsigned, u, "#x");
+	INTEGER_TEST_BODY(unsigned, u);
 }
 TEST_END