bit_util: Guarantee popcount's presence.
Implement popcount generically, so that we can rely on it being present.
This commit is contained in:
parent
d9f7e6c668
commit
734e72ce8f
@ -1594,6 +1594,7 @@ JE_COMPILABLE([a program using __builtin_popcountl], [
|
|||||||
dnl A single probe (__builtin_popcountl) gates all three builtin spellings;
dnl code that finds these macros undefined falls back to a generic popcount.
if test "x${je_cv_gcc_builtin_popcountl}" = "xyes" ; then
  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNT], [__builtin_popcount])
  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTL], [__builtin_popcountl])
  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTLL], [__builtin_popcountll])
fi
|
||||||
|
|
||||||
AC_ARG_WITH([lg_quantum],
|
AC_ARG_WITH([lg_quantum],
|
||||||
|
@ -179,12 +179,97 @@ fls_u(unsigned x) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
|
#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
#endif

/*
 * Generic (intrinsic-free) population count.
 *
 * DO_POPCOUNT(x, type) expands to the complete body of a function that
 * returns unsigned: it ends in a `return` statement, so nothing placed after
 * it in the enclosing function will run.  `x` must be a modifiable lvalue of
 * the unsigned integer type `type` (at most 8 bytes wide); it is evaluated
 * and overwritten several times.
 */
#define DO_POPCOUNT(x, type) do { \
	/* \
	 * Algorithm from an old AMD optimization reference manual. \
	 * We're putting a little bit more work than you might expect \
	 * into the no-intrinsic case, since we only support the \
	 * GCC intrinsics spelling of popcount (for now).  Detecting \
	 * whether or not the popcount builtin is actually usable in \
	 * MSVC is nontrivial. \
	 */ \
	\
	/* 0x01 replicated into every byte of `type`. */ \
	type bmul = (type)0x0101010101010101ULL; \
	\
	/* \
	 * Replace each 2 bits with the sideways sum of the original \
	 * values.  0x5 = 0b0101. \
	 * \
	 * You might expect this to be: \
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...). \
	 * That costs an extra mask relative to this, though. \
	 */ \
	x = x - ((x >> 1) & (0x55U * bmul)); \
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */ \
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \
	/* \
	 * Replace each 8 bits with their sideways sum.  Note that we \
	 * can't overflow within each 4-bit sum here, so we can skip \
	 * the initial mask. \
	 */ \
	x = (x + (x >> 4)) & (bmul * 0x0FU); \
	/* \
	 * None of the partial sums in this multiplication (viewed in \
	 * base-256) can overflow into the next digit.  So the least \
	 * significant byte of the product will be the least \
	 * significant byte of the original value, the second least \
	 * significant byte will be the sum of the two least \
	 * significant bytes of the original value, and so on. \
	 * Importantly, the high byte will be the byte-wise sum of all \
	 * the bytes of the original value. \
	 */ \
	x = x * bmul; \
	/* Move the high byte (the total) down to the low byte. */ \
	x >>= ((sizeof(x) - 1) * 8); \
	return (unsigned)x; \
} while (0)

/* Number of set bits in an unsigned, computed without compiler builtins. */
static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

/* Number of set bits in an unsigned long, computed without compiler builtins. */
static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

/*
 * Number of set bits in an unsigned long long, computed without compiler
 * builtins.
 */
static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT
|
||||||
|
|
||||||
|
/*
 * Returns the number of set bits in bitmap.  Uses the builtin named by
 * JEMALLOC_INTERNAL_POPCOUNT when that macro is defined, and the generic
 * popcount_u_slow() otherwise, so the function is always available.
 */
static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	/* The GCC builtin returns int; make the conversion explicit. */
	return (unsigned)JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}
|
||||||
|
|
||||||
/*
 * Returns the number of set bits in bitmap.  Uses the builtin named by
 * JEMALLOC_INTERNAL_POPCOUNTL when that macro is defined, and the generic
 * popcount_lu_slow() otherwise, so the function is always available.
 */
static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	/* The GCC builtin returns int; make the conversion explicit. */
	return (unsigned)JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}
|
||||||
|
|
||||||
|
/*
 * Returns the number of set bits in bitmap.  Uses the builtin named by
 * JEMALLOC_INTERNAL_POPCOUNTLL when that macro is defined, and the generic
 * popcount_llu_slow() otherwise, so the function is always available.
 */
static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	/* The GCC builtin returns int; make the conversion explicit. */
	return (unsigned)JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Clears first unset bit in bitmap, and returns
|
* Clears first unset bit in bitmap, and returns
|
||||||
|
@ -204,6 +204,77 @@ TEST_BEGIN(test_fls_llu_slow) {
|
|||||||
}
|
}
|
||||||
TEST_END
|
TEST_END
|
||||||
|
|
||||||
|
/*
 * Reference popcount for the tests: counts the set bits among the low 8 bits
 * of byte, one bit at a time.  Bits above bit 7 are ignored.
 */
static unsigned
popcount_byte(unsigned byte) {
	unsigned count = 0;
	for (unsigned bit = 0; bit < 8; bit++) {
		if ((byte & (1U << bit)) != 0) {
			count++;
		}
	}
	return count;
}
|
||||||
|
|
||||||
|
/*
 * Widens an 8-bit selector into a 64-bit byte mask: for each set bit i (0-7)
 * of byte, byte i of the result is 0xFF; every other byte is 0.  Bits of byte
 * above bit 7 are ignored.
 */
static uint64_t
expand_byte_to_mask(unsigned byte) {
	uint64_t result = 0;
	for (unsigned bit = 0; bit < 8; bit++) {
		if ((byte & (1U << bit)) != 0) {
			result |= (uint64_t)0xFF << (bit * 8);
		}
	}
	return result;
}
|
||||||
|
|
||||||
|
/*
 * Checks popcount_<suf> over a structured set of inputs of type t.
 *   t       - unsigned integer type under test.
 *   suf     - suffix selecting the function, i.e. popcount_##suf.
 *   pri_hex - printf length/conversion for printing a t in hex (e.g. "lx").
 * i ranges over every subset of the bytes of t, j over every byte value.
 */
#define TEST_POPCOUNT(t, suf, pri_hex) do { \
	/* 0x01 replicated into every byte of t. */ \
	t bmul = (t)0x0101010101010101ULL; \
	for (unsigned i = 0; i < (1U << sizeof(t)); i++) { \
		for (unsigned j = 0; j < 256; j++) { \
			/* \
			 * Replicate the byte j into various \
			 * bytes of the integer (as indicated by the \
			 * mask in i), and ensure that the popcount of \
			 * the result is popcount(i) * popcount(j). \
			 */ \
			t mask = (t)expand_byte_to_mask(i); \
			t x = (bmul * j) & mask; \
			expect_u_eq( \
			    popcount_byte(i) * popcount_byte(j), \
			    popcount_##suf(x), \
			    "Unexpected result, x=0x%"pri_hex, x); \
		} \
	} \
} while (0)
|
||||||
|
|
||||||
|
TEST_BEGIN(test_popcount_u) {
|
||||||
|
TEST_POPCOUNT(unsigned, u, "x");
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
|
TEST_BEGIN(test_popcount_u_slow) {
|
||||||
|
TEST_POPCOUNT(unsigned, u_slow, "x");
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
|
TEST_BEGIN(test_popcount_lu) {
|
||||||
|
TEST_POPCOUNT(unsigned long, lu, "lx");
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
|
TEST_BEGIN(test_popcount_lu_slow) {
|
||||||
|
TEST_POPCOUNT(unsigned long, lu_slow, "lx");
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
|
TEST_BEGIN(test_popcount_llu) {
|
||||||
|
TEST_POPCOUNT(unsigned long long, llu, "llx");
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
|
TEST_BEGIN(test_popcount_llu_slow) {
|
||||||
|
TEST_POPCOUNT(unsigned long long, llu_slow, "llx");
|
||||||
|
}
|
||||||
|
TEST_END
|
||||||
|
|
||||||
int
|
int
|
||||||
main(void) {
|
main(void) {
|
||||||
@ -226,5 +297,11 @@ main(void) {
|
|||||||
test_fls_zu,
|
test_fls_zu,
|
||||||
test_fls_u_slow,
|
test_fls_u_slow,
|
||||||
test_fls_lu_slow,
|
test_fls_lu_slow,
|
||||||
test_fls_llu_slow);
|
test_fls_llu_slow,
|
||||||
|
test_popcount_u,
|
||||||
|
test_popcount_u_slow,
|
||||||
|
test_popcount_lu,
|
||||||
|
test_popcount_lu_slow,
|
||||||
|
test_popcount_llu,
|
||||||
|
test_popcount_llu_slow);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user