Introduce a backport of C11 atomics
This introduces a backport of C11 atomics. It has four implementations; ranked
in order of preference, they are:
- GCC/Clang __atomic builtins
- GCC/Clang __sync builtins
- MSVC _Interlocked builtins
- C11 atomics, from <stdatomic.h>
The primary advantages are:
- Close adherence to the standard API gives us a defined memory model.
- Type safety: atomic objects are now separate types from non-atomic ones, so
that it's impossible to mix up atomic and non-atomic updates (which is
undefined behavior that compilers are starting to take advantage of).
- Efficiency: we can specify ordering for operations, avoiding fences and
atomic operations on strongly ordered architectures (for example,
`atomic_write_u32(ptr, val);` involves a CAS loop, whereas
`atomic_store(ptr, val, ATOMIC_RELEASE);` is a plain store); a short usage
sketch follows this list.
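For illustration, here is a minimal sketch of how the backported API reads,
assuming atomic.h instantiates these macros for uint32_t (yielding
atomic_u32_t, atomic_load_u32, and atomic_store_u32); the `ready` flag and the
payload are hypothetical:

    /* Hypothetical release/acquire handoff between two threads. */
    static atomic_u32_t ready = ATOMIC_INIT(0);

    void publisher(void) {
        /* ... fill in the payload ... */
        atomic_store_u32(&ready, 1, atomic_memory_order_release);
    }

    void consumer(void) {
        while (atomic_load_u32(&ready, atomic_memory_order_acquire) == 0) {
            /* Spin; on x86 this is a plain load plus a compiler barrier. */
        }
        /* ... safe to read the payload here ... */
    }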
This diff leaves in the current atomics API (implementing them in terms of the
backport). This lets us transition uses over piecemeal.
Testing:
This is by nature hard to test. I've manually tested the first three options on
Linux with gcc by futzing with the #defines, on FreeBSD with gcc and clang,
with MSVC, and on OS X with clang. All of these were x86 machines, though, and
we don't have any test infrastructure set up for non-x86 platforms.

#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H

#define ATOMIC_INIT(...) {__VA_ARGS__}

typedef enum {
    atomic_memory_order_relaxed,
    atomic_memory_order_acquire,
    atomic_memory_order_release,
    atomic_memory_order_acq_rel,
    atomic_memory_order_seq_cst
} atomic_memory_order_t;
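
/*
 * Note: these mirror C11's memory_order_* constants, minus
 * memory_order_consume, which this backport does not expose.
 */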

ATOMIC_INLINE void
atomic_fence(atomic_memory_order_t mo) {
    /* Easy cases first: no barrier, and full barrier. */
    if (mo == atomic_memory_order_relaxed) {
        asm volatile("" ::: "memory");
        return;
    }
    if (mo == atomic_memory_order_seq_cst) {
        asm volatile("" ::: "memory");
        __sync_synchronize();
        asm volatile("" ::: "memory");
        return;
    }
    asm volatile("" ::: "memory");
# if defined(__i386__) || defined(__x86_64__)
    /* This is implicit on x86. */
# elif defined(__ppc64__)
    asm volatile("lwsync");
# elif defined(__ppc__)
    asm volatile("sync");
# elif defined(__sparc__) && defined(__arch64__)
    if (mo == atomic_memory_order_acquire) {
        asm volatile("membar #LoadLoad | #LoadStore");
    } else if (mo == atomic_memory_order_release) {
        asm volatile("membar #LoadStore | #StoreStore");
    } else {
        asm volatile("membar #LoadLoad | #LoadStore | #StoreStore");
    }
# else
    __sync_synchronize();
# endif
    asm volatile("" ::: "memory");
}

/*
 * A correct implementation of seq_cst loads and stores on weakly ordered
 * architectures could do either of the following:
 *   1. store() is weak-fence -> store -> strong fence, load() is load ->
 *      strong-fence.
 *   2. store() is strong-fence -> store, load() is strong-fence -> load ->
 *      weak-fence.
 * The tricky thing is, load() and store() above can be the load or store
 * portions of a gcc __sync builtin, so we have to follow GCC's lead, which
 * means going with strategy 2.
 * On strongly ordered architectures, the natural strategy is to stick a
 * strong fence after seq_cst stores, and have naked loads.  So we want the
 * strong fences in different places on different architectures.
 * atomic_pre_sc_load_fence and atomic_post_sc_store_fence allow us to
 * accomplish this.
 */

ATOMIC_INLINE void
atomic_pre_sc_load_fence() {
# if defined(__i386__) || defined(__x86_64__) || \
    (defined(__sparc__) && defined(__arch64__))
    atomic_fence(atomic_memory_order_relaxed);
# else
    atomic_fence(atomic_memory_order_seq_cst);
# endif
}

ATOMIC_INLINE void
atomic_post_sc_store_fence() {
# if defined(__i386__) || defined(__x86_64__) || \
    (defined(__sparc__) && defined(__arch64__))
    atomic_fence(atomic_memory_order_seq_cst);
# else
    atomic_fence(atomic_memory_order_relaxed);
# endif
}
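
/*
 * Illustrative summary: with the scheme above, a seq_cst store on x86 becomes
 * a plain store followed by __sync_synchronize() (typically an mfence), while
 * a seq_cst load stays a plain load.  On the weakly ordered fallbacks, the
 * full fence instead sits before the seq_cst load, matching GCC's __sync
 * convention (strategy 2 above).
 */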

#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \
    /* unused */ lg_size) \
typedef struct { \
    type volatile repr; \
} atomic_##short_type##_t; \
 \
ATOMIC_INLINE type \
atomic_load_##short_type(const atomic_##short_type##_t *a, \
    atomic_memory_order_t mo) { \
    if (mo == atomic_memory_order_seq_cst) { \
        atomic_pre_sc_load_fence(); \
    } \
    type result = a->repr; \
    if (mo != atomic_memory_order_relaxed) { \
        atomic_fence(atomic_memory_order_acquire); \
    } \
    return result; \
} \
 \
ATOMIC_INLINE void \
atomic_store_##short_type(atomic_##short_type##_t *a, \
    type val, atomic_memory_order_t mo) { \
    if (mo != atomic_memory_order_relaxed) { \
        atomic_fence(atomic_memory_order_release); \
    } \
    a->repr = val; \
    if (mo == atomic_memory_order_seq_cst) { \
        atomic_post_sc_store_fence(); \
    } \
} \
 \
ATOMIC_INLINE type \
atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
    atomic_memory_order_t mo) { \
    /* \
     * Because of FreeBSD, we care about gcc 4.2, which doesn't have \
     * an atomic exchange builtin.  We fake it with a CAS loop. \
     */ \
    while (true) { \
        type old = a->repr; \
        if (__sync_bool_compare_and_swap(&a->repr, old, val)) { \
            return old; \
        } \
    } \
} \
 \
ATOMIC_INLINE bool \
atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
    type *expected, type desired, \
    atomic_memory_order_t success_mo, \
    atomic_memory_order_t failure_mo) { \
    type prev = __sync_val_compare_and_swap(&a->repr, *expected, \
        desired); \
    if (prev == *expected) { \
        return true; \
    } else { \
        *expected = prev; \
        return false; \
    } \
} \
ATOMIC_INLINE bool \
atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
    type *expected, type desired, \
    atomic_memory_order_t success_mo, \
    atomic_memory_order_t failure_mo) { \
    type prev = __sync_val_compare_and_swap(&a->repr, *expected, \
        desired); \
    if (prev == *expected) { \
        return true; \
    } else { \
        *expected = prev; \
        return false; \
    } \
}
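
/*
 * Illustrative usage (a sketch, not part of this header): atomic.h is
 * expected to instantiate these macros for concrete types, e.g. something
 * along the lines of
 *
 *     JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR)
 *
 * which yields atomic_p_t plus atomic_load_p(), atomic_store_p(),
 * atomic_exchange_p(), and atomic_compare_exchange_{weak,strong}_p().  A
 * typical CAS loop over such a type (the names head and node are
 * hypothetical) looks like:
 *
 *     void *cur = atomic_load_p(&head, atomic_memory_order_relaxed);
 *     node->next = cur;
 *     while (!atomic_compare_exchange_weak_p(&head, &cur, node,
 *         atomic_memory_order_release, atomic_memory_order_relaxed)) {
 *         node->next = cur;  (cur was refreshed with the observed value)
 *     }
 */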

#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \
    /* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
 \
ATOMIC_INLINE type \
atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \
    atomic_memory_order_t mo) { \
    return __sync_fetch_and_add(&a->repr, val); \
} \
 \
ATOMIC_INLINE type \
atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \
    atomic_memory_order_t mo) { \
    return __sync_fetch_and_sub(&a->repr, val); \
} \
 \
ATOMIC_INLINE type \
atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \
    atomic_memory_order_t mo) { \
    return __sync_fetch_and_and(&a->repr, val); \
} \
 \
ATOMIC_INLINE type \
atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \
    atomic_memory_order_t mo) { \
    return __sync_fetch_and_or(&a->repr, val); \
} \
 \
ATOMIC_INLINE type \
atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \
    atomic_memory_order_t mo) { \
    return __sync_fetch_and_xor(&a->repr, val); \
}
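
/*
 * Illustrative usage (a sketch, not part of this header): assuming an
 * instantiation such as JEMALLOC_GENERATE_INT_ATOMICS(uint64_t, u64, 3), a
 * relaxed statistics counter (nmalloc here is hypothetical) could be bumped
 * with
 *
 *     atomic_fetch_add_u64(&nmalloc, 1, atomic_memory_order_relaxed);
 *
 * Note that in this __sync-based implementation the memory-order argument is
 * accepted only for API compatibility; the __sync fetch-and-op builtins
 * behave as full barriers regardless.
 */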

#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */