From 734e72ce8fb897bdbcbd48bb994c3778dba50dc6 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Wed, 2 Dec 2020 10:04:32 -0800
Subject: [PATCH] bit_util: Guarantee popcount's presence.

Implement popcount generically, so that we can rely on it being present.
---
 configure.ac                         |  1 +
 include/jemalloc/internal/bit_util.h | 91 +++++++++++++++++++++++++++-
 test/unit/bit_util.c                 | 79 +++++++++++++++++++++++-
 3 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8e21f3f9..8284e87a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1594,6 +1594,7 @@ JE_COMPILABLE([a program using __builtin_popcountl], [
 if test "x${je_cv_gcc_builtin_popcountl}" = "xyes" ; then
   AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNT], [__builtin_popcount])
   AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTL], [__builtin_popcountl])
+  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTLL], [__builtin_popcountll])
 fi
 
 AC_ARG_WITH([lg_quantum],
diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h
index c5158f67..bac59140 100644
--- a/include/jemalloc/internal/bit_util.h
+++ b/include/jemalloc/internal/bit_util.h
@@ -179,12 +179,97 @@ fls_u(unsigned x) {
 }
 #endif
 
-#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+#if LG_SIZEOF_LONG_LONG > 3
+# error "Haven't implemented popcount for 16-byte ints."
+#endif
+
+#define DO_POPCOUNT(x, type) do { \
+    /* \
+     * Algorithm from an old AMD optimization reference manual. \
+     * We're putting a little bit more work than you might expect \
+     * into the no-intrinsic case, since we only support the \
+     * GCC intrinsics spelling of popcount (for now). Detecting \
+     * whether or not the popcount builtin is actually usable in \
+     * MSVC is nontrivial. \
+     */ \
+    \
+    type bmul = (type)0x0101010101010101ULL; \
+    \
+    /* \
+     * Replace each 2 bits with the sideways sum of the original \
+     * values. 0x5 = 0b0101. \
+     * \
+     * You might expect this to be: \
+     *   x = (x & 0x55...) + ((x >> 1) & 0x55...). \
+     * That costs an extra mask relative to this, though. \
+     */ \
+    x = x - ((x >> 1) & (0x55U * bmul)); \
+    /* Replace each 4 bits with their sideways sum. 0x3 = 0b0011. */ \
+    x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \
+    /* \
+     * Replace each 8 bits with their sideways sum. Note that we \
+     * can't overflow within each 4-bit sum here, so we can skip \
+     * the initial mask. \
+     */ \
+    x = (x + (x >> 4)) & (bmul * 0x0FU); \
+    /* \
+     * None of the partial sums in this multiplication (viewed in \
+     * base-256) can overflow into the next digit. So the least \
+     * significant byte of the product will be the least \
+     * significant byte of the original value, the second least \
+     * significant byte will be the sum of the two least \
+     * significant bytes of the original value, and so on. \
+     * Importantly, the high byte will be the byte-wise sum of all \
+     * the bytes of the original value. \
+     */ \
+    x = x * bmul; \
+    x >>= ((sizeof(x) - 1) * 8); \
+    return (unsigned)x; \
+} while (0)
+
+static inline unsigned
+popcount_u_slow(unsigned bitmap) {
+    DO_POPCOUNT(bitmap, unsigned);
+}
+
+static inline unsigned
+popcount_lu_slow(unsigned long bitmap) {
+    DO_POPCOUNT(bitmap, unsigned long);
+}
+
+static inline unsigned
+popcount_llu_slow(unsigned long long bitmap) {
+    DO_POPCOUNT(bitmap, unsigned long long);
+}
+
+#undef DO_POPCOUNT
+
+static inline unsigned
+popcount_u(unsigned bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNT
+    return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
+#else
+    return popcount_u_slow(bitmap);
+#endif
+}
+
 static inline unsigned
 popcount_lu(unsigned long bitmap) {
-    return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
-}
+#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+    return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+#else
+    return popcount_lu_slow(bitmap);
 #endif
+}
+
+static inline unsigned
+popcount_llu(unsigned long long bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
+    return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
+#else
+    return popcount_llu_slow(bitmap);
+#endif
+}
 
 /*
  * Clears first unset bit in bitmap, and returns
diff --git a/test/unit/bit_util.c b/test/unit/bit_util.c
index 045cf8b4..7d31b210 100644
--- a/test/unit/bit_util.c
+++ b/test/unit/bit_util.c
@@ -204,6 +204,77 @@ TEST_BEGIN(test_fls_llu_slow) {
 }
 TEST_END
 
+static unsigned
+popcount_byte(unsigned byte) {
+    int count = 0;
+    for (int i = 0; i < 8; i++) {
+        if ((byte & (1 << i)) != 0) {
+            count++;
+        }
+    }
+    return count;
+}
+
+static uint64_t
+expand_byte_to_mask(unsigned byte) {
+    uint64_t result = 0;
+    for (int i = 0; i < 8; i++) {
+        if ((byte & (1 << i)) != 0) {
+            result |= ((uint64_t)0xFF << (i * 8));
+        }
+    }
+    return result;
+}
+
+#define TEST_POPCOUNT(t, suf, pri_hex) do { \
+    t bmul = (t)0x0101010101010101ULL; \
+    for (unsigned i = 0; i < (1 << sizeof(t)); i++) { \
+        for (unsigned j = 0; j < 256; j++) { \
+            /* \
+             * Replicate the byte j into various \
+             * bytes of the integer (as indicated by the \
+             * mask in i), and ensure that the popcount of \
+             * the result is popcount(i) * popcount(j). \
+             */ \
+            t mask = (t)expand_byte_to_mask(i); \
+            t x = (bmul * j) & mask; \
+            expect_u_eq( \
+                popcount_byte(i) * popcount_byte(j), \
+                popcount_##suf(x), \
+                "Unexpected result, x=0x%"pri_hex, x); \
+        } \
+    } \
+} while (0)
+
+TEST_BEGIN(test_popcount_u) {
+    TEST_POPCOUNT(unsigned, u, "x");
+}
+TEST_END
+
+TEST_BEGIN(test_popcount_u_slow) {
+    TEST_POPCOUNT(unsigned, u_slow, "x");
+}
+TEST_END
+
+TEST_BEGIN(test_popcount_lu) {
+    TEST_POPCOUNT(unsigned long, lu, "lx");
+}
+TEST_END
+
+TEST_BEGIN(test_popcount_lu_slow) {
+    TEST_POPCOUNT(unsigned long, lu_slow, "lx");
+}
+TEST_END
+
+TEST_BEGIN(test_popcount_llu) {
+    TEST_POPCOUNT(unsigned long long, llu, "llx");
+}
+TEST_END
+
+TEST_BEGIN(test_popcount_llu_slow) {
+    TEST_POPCOUNT(unsigned long long, llu_slow, "llx");
+}
+TEST_END
 
 int
 main(void) {
@@ -226,5 +297,11 @@ main(void) {
     test_fls_zu,
     test_fls_u_slow,
     test_fls_lu_slow,
-    test_fls_llu_slow);
+    test_fls_llu_slow,
+    test_popcount_u,
+    test_popcount_u_slow,
+    test_popcount_lu,
+    test_popcount_lu_slow,
+    test_popcount_llu,
+    test_popcount_llu_slow);
 }
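
Note, not part of the patch itself: the SWAR ("SIMD within a register") steps that DO_POPCOUNT performs are easier to follow when unrolled for a single fixed width. The standalone sketch below applies the same steps to a 64-bit value and cross-checks the result against a naive bit loop; the names swar_popcount_u64 and naive_popcount_u64 are illustrative inventions, not jemalloc identifiers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reference implementation: count set bits one at a time. */
static unsigned
naive_popcount_u64(uint64_t x) {
    unsigned count = 0;
    for (; x != 0; x >>= 1) {
        count += (unsigned)(x & 1);
    }
    return count;
}

/* The same SWAR steps as DO_POPCOUNT, unrolled for uint64_t. */
static unsigned
swar_popcount_u64(uint64_t x) {
    uint64_t bmul = 0x0101010101010101ULL;
    /* Each 2-bit field becomes the sum of its two bits (0, 1, or 2). */
    x = x - ((x >> 1) & (0x55U * bmul));
    /* Each 4-bit field becomes a sum in [0, 4]. */
    x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));
    /* Each byte becomes a sum in [0, 8]; no overflow, so one mask suffices. */
    x = (x + (x >> 4)) & (bmul * 0x0FU);
    /* The multiply accumulates all eight byte sums into the top byte. */
    return (unsigned)((x * bmul) >> 56);
}

int
main(void) {
    assert(swar_popcount_u64(0xF0F0F0F0F0F0F0F0ULL) == 32);
    for (uint64_t i = 0; i < 100000; i++) {
        uint64_t v = i * 0x9E3779B97F4A7C15ULL; /* Cheap bit mixing. */
        assert(swar_popcount_u64(v) == naive_popcount_u64(v));
    }
    printf("SWAR popcount matches the naive loop.\n");
    return 0;
}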
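Also illustrative rather than part of the patch: TEST_POPCOUNT relies on the identity that copying byte j into the byte positions selected by a mask byte i produces a word with exactly popcount(i) * popcount(j) bits set, since each of the popcount(i) selected bytes contributes popcount(j) bits. For example, i = 0x05 and j = 0x81 give x = 0x0000000000810081, which has 2 * 2 = 4 bits set. A minimal standalone check of that identity (naive_popcount is a hypothetical helper, not jemalloc code):

#include <assert.h>
#include <stdint.h>

/* Kernighan's trick: each iteration clears the lowest set bit. */
static unsigned
naive_popcount(uint64_t x) {
    unsigned n = 0;
    for (; x != 0; x &= x - 1) {
        n++;
    }
    return n;
}

int
main(void) {
    /* Exhaustively check every (mask byte, value byte) pair. */
    for (unsigned i = 0; i < 256; i++) {
        for (unsigned j = 0; j < 256; j++) {
            uint64_t x = 0;
            for (int b = 0; b < 8; b++) {
                if ((i >> b) & 1U) {
                    /* Place a copy of j at each selected byte position. */
                    x |= (uint64_t)j << (b * 8);
                }
            }
            assert(naive_popcount(x) ==
                naive_popcount(i) * naive_popcount(j));
        }
    }
    return 0;
}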