From 22da836094f315b3fe1609e21c0e1092e7b0f2f5 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Wed, 22 Jul 2020 07:10:06 -0700
Subject: [PATCH] bit_util: Add fls_ functions; "find last set".

These simplify a lot of the bit_util module, which had grown bits and
pieces of this functionality across a variety of places over the years.

While we're here, kill off BIT_UTIL_INLINE and don't do reentrancy
testing for bit_util.
---
 configure.ac                         |   6 +-
 include/jemalloc/internal/bit_util.h | 344 +++++++++++++++++----------
 test/unit/bit_util.c                 |  75 +++++-
 3 files changed, 292 insertions(+), 133 deletions(-)

diff --git a/configure.ac b/configure.ac
index bcd63632..b197d32e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2118,7 +2118,7 @@ esac
 fi
 
 dnl ============================================================================
-dnl Check for __builtin_clz() and __builtin_clzl().
+dnl Check for __builtin_clz(), __builtin_clzl(), and __builtin_clzll().
 
 AC_CACHE_CHECK([for __builtin_clz],
                [je_cv_builtin_clz],
@@ -2132,6 +2132,10 @@ AC_CACHE_CHECK([for __builtin_clz],
 			unsigned long x = 0;
 			int y = __builtin_clzl(x);
 		}
+		{
+			unsigned long long x = 0;
+			int y = __builtin_clzll(x);
+		}
 ])],
                [je_cv_builtin_clz=yes],
                [je_cv_builtin_clz=no])])
diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h
index 258fd978..c5158f67 100644
--- a/include/jemalloc/internal/bit_util.h
+++ b/include/jemalloc/internal/bit_util.h
@@ -3,8 +3,6 @@
 
 #include "jemalloc/internal/assert.h"
 
-#define BIT_UTIL_INLINE static inline
-
 /* Sanity check. */
 #if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
     || !defined(JEMALLOC_INTERNAL_FFS)
@@ -18,26 +16,171 @@
  * value of zero as a sentinel. This tends to simplify logic in callers, and
  * allows for consistency with the builtins we build fls on top of.
  */
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_llu(unsigned long long x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
 }
 
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_lu(unsigned long x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFSL(x) - 1;
 }
 
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_u(unsigned x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFS(x) - 1;
 }
 
+#define DO_FLS_SLOW(x, suffix) do {					\
+	util_assume(x != 0);						\
+	x |= (x >> 1);							\
+	x |= (x >> 2);							\
+	x |= (x >> 4);							\
+	x |= (x >> 8);							\
+	x |= (x >> 16);							\
+	if (sizeof(x) > 4) {						\
+		/*							\
+		 * If sizeof(x) is 4, then the expression "x >> 32"	\
+		 * will generate compiler warnings even if the code	\
+		 * never executes.  This circumvents the warning, and	\
+		 * gets compiled out in optimized builds.		\
+		 */							\
+		int constant_32 = sizeof(x) * 4;			\
+		x |= (x >> constant_32);				\
+	}								\
+	x++;								\
+	if (x == 0) {							\
+		return 8 * sizeof(x) - 1;				\
+	}								\
+	return ffs_##suffix(x) - 1;					\
+} while (0)
+
+static inline unsigned
+fls_llu_slow(unsigned long long x) {
+	DO_FLS_SLOW(x, llu);
+}
+
+static inline unsigned
+fls_lu_slow(unsigned long x) {
+	DO_FLS_SLOW(x, lu);
+}
+
+static inline unsigned
+fls_u_slow(unsigned x) {
+	DO_FLS_SLOW(x, u);
+}
+
+#undef DO_FLS_SLOW
+
+#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
+static inline unsigned
+fls_llu(unsigned long long x) {
+	util_assume(x != 0);
+	/*
+	 * Note that the xor here is more naturally written as subtraction: the
+	 * index of the last set bit is one less than the bit width, minus the
+	 * number of leading zero bits.  But GCC implements that as:
+	 *     bsr     edi, edi
+	 *     mov     eax, 31
+	 *     xor     edi, 31
+	 *     sub     eax, edi
+	 * If we write it as xor instead, then we get
+	 *     bsr     eax, edi
+	 * as desired.
+	 */
+	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
+}
+#elif defined(_MSC_VER)
+
+#if LG_SIZEOF_PTR == 3
+#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
+#else
+/*
+ * This never actually runs; we're just dodging a compiler error for the
+ * never-taken branch where sizeof(void *) == 8.
+ */
+#define DO_BSR64(bit, x) bit = 0; unreachable()
+#endif
+
+#define DO_FLS(x) do {							\
+	if (x == 0) {							\
+		return 8 * sizeof(x);					\
+	}								\
+	unsigned long bit;						\
+	if (sizeof(x) == 4) {						\
+		_BitScanReverse(&bit, (unsigned)x);			\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
+		DO_BSR64(bit, x);					\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
+		/* Dodge a compiler warning, as above. */		\
+		int constant_32 = sizeof(x) * 4;			\
+		if (_BitScanReverse(&bit,				\
+		    (unsigned)(x >> constant_32))) {			\
+			return 32 + (unsigned)bit;			\
+		} else {						\
+			_BitScanReverse(&bit, (unsigned)x);		\
+			return (unsigned)bit;				\
+		}							\
+	}								\
+	unreachable();							\
+} while (0)
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	DO_FLS(x);
+}
+
+#undef DO_FLS
+#undef DO_BSR64
+#else
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	return fls_llu_slow(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	return fls_lu_slow(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	return fls_u_slow(x);
+}
+#endif
+
 #ifdef JEMALLOC_INTERNAL_POPCOUNTL
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 popcount_lu(unsigned long bitmap) {
 	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
 }
@@ -48,7 +191,7 @@ popcount_lu(unsigned long bitmap) {
  * Clears first set bit in bitmap, and places index of bit in first_bit in
  * place of bit. bitmap *must not* be 0.
  */
-BIT_UTIL_INLINE size_t
+static inline size_t
 cfs_lu(unsigned long* bitmap) {
 	util_assume(*bitmap != 0);
 	size_t bit = ffs_lu(*bitmap);
@@ -56,101 +199,102 @@ cfs_lu(unsigned long* bitmap) {
 	return bit;
 }
 
-BIT_UTIL_INLINE unsigned
-ffs_zu(size_t bitmap) {
+static inline unsigned
+ffs_zu(size_t x) {
 #if LG_SIZEOF_PTR == LG_SIZEOF_INT
-	return ffs_u(bitmap);
+	return ffs_u(x);
 #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
-	return ffs_lu(bitmap);
+	return ffs_lu(x);
 #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
-	return ffs_llu(bitmap);
+	return ffs_llu(x);
 #else
 #error No implementation for size_t ffs()
 #endif
 }
 
-BIT_UTIL_INLINE unsigned
-ffs_u64(uint64_t bitmap) {
+static inline unsigned
+fls_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return fls_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return fls_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return fls_llu(x);
+#else
+#error No implementation for size_t fls()
+#endif
+}
+
+
+static inline unsigned
+ffs_u64(uint64_t x) {
 #if LG_SIZEOF_LONG == 3
-	return ffs_lu(bitmap);
+	return ffs_lu(x);
 #elif LG_SIZEOF_LONG_LONG == 3
-	return ffs_llu(bitmap);
+	return ffs_llu(x);
 #else
 #error No implementation for 64-bit ffs()
 #endif
 }
 
-BIT_UTIL_INLINE unsigned
-ffs_u32(uint32_t bitmap) {
+static inline unsigned
+fls_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return fls_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return fls_llu(x);
+#else
+#error No implementation for 64-bit fls()
+#endif
+}
+
+static inline unsigned
+ffs_u32(uint32_t x) {
 #if LG_SIZEOF_INT == 2
-	return ffs_u(bitmap);
+	return ffs_u(x);
 #else
 #error No implementation for 32-bit ffs()
 #endif
-	return ffs_u(bitmap);
+	return ffs_u(x);
 }
 
-BIT_UTIL_INLINE uint64_t
+static inline unsigned
+fls_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return fls_u(x);
+#else
+#error No implementation for 32-bit fls()
+#endif
+	return fls_u(x);
+}
+
+static inline uint64_t
 pow2_ceil_u64(uint64_t x) {
-#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	if(unlikely(x <= 1)) {
+	if (unlikely(x <= 1)) {
 		return x;
 	}
-	size_t msb_on_index;
-#if (defined(__amd64__) || defined(__x86_64__))
-	asm ("bsrq %1, %0"
-	     : "=r"(msb_on_index) // Outputs.
-	     : "r"(x-1) // Inputs.
-	     );
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	msb_on_index = (63 ^ __builtin_clzll(x - 1));
-#endif
+	size_t msb_on_index = fls_u64(x - 1);
+	/*
+	 * Range-check; it's on the callers to ensure that the result of this
+	 * call won't overflow.
	 */
 	assert(msb_on_index < 63);
 	return 1ULL << (msb_on_index + 1);
-#else
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x |= x >> 32;
-	x++;
-	return x;
-#endif
 }
 
-BIT_UTIL_INLINE uint32_t
+static inline uint32_t
 pow2_ceil_u32(uint32_t x) {
-#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__)))
-	if(unlikely(x <= 1)) {
-		return x;
+	if (unlikely(x <= 1)) {
+		return x;
 	}
-	size_t msb_on_index;
-#if (defined(__i386__))
-	asm ("bsr %1, %0"
-	     : "=r"(msb_on_index) // Outputs.
-	     : "r"(x-1) // Inputs.
-	     );
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	msb_on_index = (31 ^ __builtin_clz(x - 1));
-#endif
+	size_t msb_on_index = fls_u32(x - 1);
+	/* As above. */
 	assert(msb_on_index < 31);
 	return 1U << (msb_on_index + 1);
-#else
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x++;
-	return x;
-#endif
 }
 
 /* Compute the smallest power of 2 that is >= x. */
-BIT_UTIL_INLINE size_t
+static inline size_t
 pow2_ceil_zu(size_t x) {
 #if (LG_SIZEOF_PTR == 3)
 	return pow2_ceil_u64(x);
@@ -159,77 +303,21 @@ pow2_ceil_zu(size_t x) {
 #endif
 }
 
-#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 lg_floor(size_t x) {
-	size_t ret;
-	assert(x != 0);
-
-	asm ("bsr %1, %0"
-	     : "=r"(ret) // Outputs.
-	     : "r"(x) // Inputs.
-	     );
-	assert(ret < UINT_MAX);
-	return (unsigned)ret;
-}
-#elif (defined(_MSC_VER))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	unsigned long ret;
-
-	assert(x != 0);
-
+	util_assume(x != 0);
 #if (LG_SIZEOF_PTR == 3)
-	_BitScanReverse64(&ret, x);
-#elif (LG_SIZEOF_PTR == 2)
-	_BitScanReverse(&ret, x);
+	return fls_u64(x);
 #else
-# error "Unsupported type size for lg_floor()"
-#endif
-	assert(ret < UINT_MAX);
-	return (unsigned)ret;
-}
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	assert(x != 0);
-
-#if (LG_SIZEOF_PTR == LG_SIZEOF_INT)
-	return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x);
-#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG)
-	return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x);
-#else
-# error "Unsupported type size for lg_floor()"
+	return fls_u32(x);
 #endif
 }
-#else
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	assert(x != 0);
-	x |= (x >> 1);
-	x |= (x >> 2);
-	x |= (x >> 4);
-	x |= (x >> 8);
-	x |= (x >> 16);
-#if (LG_SIZEOF_PTR == 3)
-	x |= (x >> 32);
-#endif
-	if (x == SIZE_T_MAX) {
-		return (8 << LG_SIZEOF_PTR) - 1;
-	}
-	x++;
-	return ffs_zu(x) - 1;
-}
-#endif
 
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 lg_ceil(size_t x) {
 	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
 }
 
-#undef BIT_UTIL_INLINE
-
 /* A compile-time version of lg_floor and lg_ceil. */
 #define LG_FLOOR_1(x) 0
 #define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
diff --git a/test/unit/bit_util.c b/test/unit/bit_util.c
index f3761fd7..045cf8b4 100644
--- a/test/unit/bit_util.c
+++ b/test/unit/bit_util.c
@@ -120,7 +120,6 @@ TEST_BEGIN(test_ffs_u) {
 }
 TEST_END
 
-
 TEST_BEGIN(test_ffs_lu) {
 	TEST_FFS(unsigned long, lu, lu, "lu");
 }
@@ -136,7 +135,6 @@ TEST_BEGIN(test_ffs_u32) {
 }
 TEST_END
 
-
 TEST_BEGIN(test_ffs_u64) {
 	TEST_FFS(uint64_t, u64, u64, FMTu64);
 }
@@ -147,9 +145,69 @@ TEST_BEGIN(test_ffs_zu) {
 }
 TEST_END
 
+#define TEST_FLS(t, suf, test_suf, pri) do {				\
+	for (unsigned i = 0; i < sizeof(t) * 8; i++) {			\
+		for (unsigned j = 0; j <= i; j++) {			\
+			for (unsigned k = 0; k <= j; k++) {		\
+				t x = (t)1 << i;			\
+				x |= (t)1 << j;				\
+				x |= (t)1 << k;				\
+				expect_##test_suf##_eq(fls_##suf(x), i,	\
+				    "Unexpected result, x=%"pri, x);	\
+			}						\
+		}							\
+	}								\
+} while (0)
+
+TEST_BEGIN(test_fls_u) {
+	TEST_FLS(unsigned, u, u, "u");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_lu) {
+	TEST_FLS(unsigned long, lu, lu, "lu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_llu) {
+	TEST_FLS(unsigned long long, llu, qd, "llu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u32) {
+	TEST_FLS(uint32_t, u32, u32, FMTu32);
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u64) {
+	TEST_FLS(uint64_t, u64, u64, FMTu64);
+}
+TEST_END
+
+TEST_BEGIN(test_fls_zu) {
+	TEST_FLS(size_t, zu, zu, "zu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u_slow) {
+	TEST_FLS(unsigned, u_slow, u, "u");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_lu_slow) {
+	TEST_FLS(unsigned long, lu_slow, lu, "lu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_llu_slow) {
+	TEST_FLS(unsigned long long, llu_slow, qd, "llu");
+}
+TEST_END
+
+
 int
 main(void) {
-	return test(
+	return test_no_reentrancy(
 	    test_pow2_ceil_u64,
 	    test_pow2_ceil_u32,
 	    test_pow2_ceil_zu,
@@ -159,5 +217,14 @@ main(void) {
 	    test_ffs_llu,
 	    test_ffs_u32,
 	    test_ffs_u64,
-	    test_ffs_zu);
+	    test_ffs_zu,
+	    test_fls_u,
+	    test_fls_lu,
+	    test_fls_llu,
+	    test_fls_u32,
+	    test_fls_u64,
+	    test_fls_zu,
+	    test_fls_u_slow,
+	    test_fls_lu_slow,
+	    test_fls_llu_slow);
 }
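
For readers tracing the two fls_ strategies above, here is a standalone sketch
(not part of the patch; fls_u32_smear and the test driver are illustrative
names) that mirrors the bit-smearing fallback of DO_FLS_SLOW and cross-checks
it against the xor form used in the JEMALLOC_HAVE_BUILTIN_CLZ branch. The
cross-check assumes a GCC/Clang-style __builtin_clz.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirrors DO_FLS_SLOW: smear the highest set bit rightward so that x
 * becomes 2^(fls(x) + 1) - 1, then increment to isolate the next power
 * of two and count its trailing zeros.
 */
static unsigned
fls_u32_smear(uint32_t x) {
	assert(x != 0);
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x++;
	if (x == 0) {
		/* The top bit was set; the increment wrapped to zero. */
		return 31;
	}
	/* x is now 2^(fls + 1); its trailing-zero count is fls + 1. */
	unsigned i = 0;
	while ((x & 1) == 0) {
		x >>= 1;
		i++;
	}
	return i - 1;
}

int
main(void) {
	/* Exercise all two-bit patterns, as the patch's TEST_FLS does. */
	for (unsigned i = 0; i < 32; i++) {
		for (unsigned j = 0; j <= i; j++) {
			uint32_t x = ((uint32_t)1 << i) | ((uint32_t)1 << j);
			assert(fls_u32_smear(x) == i);
#ifdef __GNUC__
			/*
			 * The xor form from the patch: for nonzero x, clz(x)
			 * lies in [0, 31], and 31 is all ones over that
			 * range, so 31 ^ clz(x) == 31 - clz(x).
			 */
			assert((31u ^ (unsigned)__builtin_clz(x)) == i);
#endif
		}
	}
	printf("fls sketches agree\n");
	return 0;
}

The invariant doing the work is that smearing turns x into 2^(fls(x) + 1) - 1,
so the increment either isolates the next power of two or wraps to zero when
the top bit was already set, which is exactly the case DO_FLS_SLOW handles
with its post-increment zero check.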
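Likewise, the rewritten pow2_ceil_u64/pow2_ceil_u32 and lg_ceil reduce to two
small identities on top of a find-last-set primitive. A minimal sketch of
those identities, with hypothetical names (fls_loop, pow2_ceil_u32_demo, and
lg_ceil_demo are illustrative, not jemalloc API):

#include <assert.h>
#include <stdint.h>

/* Reference fls: index of the highest set bit (fls_loop(1) == 0). */
static unsigned
fls_loop(uint32_t x) {
	assert(x != 0);
	unsigned i = 0;
	while (x >>= 1) {
		i++;
	}
	return i;
}

static uint32_t
pow2_ceil_u32_demo(uint32_t x) {
	if (x <= 1) {
		return x;
	}
	/* Smallest power of two >= x is 1 << (fls(x - 1) + 1) for x > 1. */
	unsigned msb_on_index = fls_loop(x - 1);
	assert(msb_on_index < 31);	/* Caller must keep the result in range. */
	return (uint32_t)1 << (msb_on_index + 1);
}

static unsigned
lg_ceil_demo(uint32_t x) {
	/* lg_floor, plus one unless x is already a power of two. */
	return fls_loop(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}

int
main(void) {
	for (uint32_t x = 2; x <= 4096; x++) {
		uint32_t p = pow2_ceil_u32_demo(x);
		assert(p >= x && p < 2 * x);
		assert((p & (p - 1)) == 0);	/* p is a power of two. */
		assert(lg_ceil_demo(x) == fls_loop(p));
	}
	return 0;
}

This also shows why the patch can bound-check with assert(msb_on_index < 31)
and assert(msb_on_index < 63): the only inputs whose rounded-up power of two
would overflow are those above the top representable power, and the patch
explicitly leaves avoiding them to the callers.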