bit_util: Add fls_ functions; "find last set".

These simplify a lot of the bit_util module, which had grown bits and pieces of this functionality across a variety of places over the years. While we're here, kill off BIT_UTIL_INLINE and don't do reentrancy testing for bit_util.
2020-07-22 07:10:06 -07:00 · 2020-07-22 07:10:06 -07:00 · 22da836094
commit 22da836094
parent 1ed0288d9c
3 changed files with 292 additions and 133 deletions
--- a/configure.ac
+++ b/configure.ac
@ -2118,7 +2118,7 @@ esac
 fi

 dnl ============================================================================
-dnl Check for __builtin_clz() and __builtin_clzl().
+dnl Check for __builtin_clz(), __builtin_clzl(), and __builtin_clzll().

 AC_CACHE_CHECK([for __builtin_clz],
               [je_cv_builtin_clz],
@ -2132,6 +2132,10 @@ AC_CACHE_CHECK([for __builtin_clz],
                                                        unsigned long x = 0;
                                                        int y = __builtin_clzl(x);
                                                }
+                                                {
+                                                        unsigned long long x = 0;
+                                                        int y = __builtin_clzll(x);
+                                                }
                                                ])],
                               [je_cv_builtin_clz=yes],
                               [je_cv_builtin_clz=no])])
--- a/include/jemalloc/internal/bit_util.h
+++ b/include/jemalloc/internal/bit_util.h
@ -3,8 +3,6 @@

 #include "jemalloc/internal/assert.h"

-#define BIT_UTIL_INLINE static inline
-
 /* Sanity check. */
 #if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
@ -18,26 +16,171 @@
 * value of zero as a sentinel.  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_llu(unsigned long long x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
 }

-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_lu(unsigned long x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFSL(x) - 1;
 }

-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_u(unsigned x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFS(x) - 1;
 }

+/*
+ * Portable fls ("find last set") built only from shifts and ffs_*().  Expands
+ * to the entire body of an fls_*_slow() function: it modifies x in place and
+ * returns from the enclosing function.  x must be nonzero.
+ *
+ * The shift/or cascade copies the highest set bit into every lower position,
+ * so that x + 1 is either the single bit just above the original highest bit,
+ * or wraps to 0 when the top bit of the type was set.  In the first case the
+ * answer is one less than the (zero-based) index ffs_*() reports for x + 1;
+ * in the second it is the index of the top bit.
+ */
+#define DO_FLS_SLOW(x, suffix) do {					\
+	util_assume(x != 0);						\
+	x |= (x >> 1);							\
+	x |= (x >> 2);							\
+	x |= (x >> 4);							\
+	x |= (x >> 8);							\
+	x |= (x >> 16);							\
+	if (sizeof(x) > 4) {						\
+		/*							\
+		 * If sizeof(x) is 4, then the expression "x >> 32"	\
+		 * will generate compiler warnings even if the code	\
+		 * never executes.  This circumvents the warning, and	\
+		 * gets compiled out in optimized builds.		\
+		 */							\
+		int constant_32 = sizeof(x) * 4;			\
+		x |= (x >> constant_32);				\
+	}								\
+	x++;								\
+	if (x == 0) {							\
+		return 8 * sizeof(x) - 1;				\
+	}								\
+	return ffs_##suffix(x) - 1;					\
+} while(0)
+
+/*
+ * Shift-based fls for each native width.  These are defined unconditionally
+ * (outside the builtin/MSVC selection below) and are unit-tested directly.
+ */
+static inline unsigned
+fls_llu_slow(unsigned long long x) {
+	DO_FLS_SLOW(x, llu);
+}
+
+static inline unsigned
+fls_lu_slow(unsigned long x) {
+	DO_FLS_SLOW(x, lu);
+}
+
+static inline unsigned
+fls_u_slow(unsigned x) {
+	DO_FLS_SLOW(x, u);
+}
+
+#undef DO_FLS_SLOW
+
+#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
+/*
+ * Return the zero-based index of the most significant set bit of x, using the
+ * compiler's count-leading-zeros builtin.  x must be nonzero.
+ */
+static inline unsigned
+fls_llu(unsigned long long x) {
+	util_assume(x != 0);
+	/*
+	 * Note that the xor here is more naturally written as subtraction; the
+	 * index of the last bit set is one less than the number of bits in the
+	 * type, minus the number of leading zero bits.  (The two agree because
+	 * that first term has all of its low bits set and the leading-zero
+	 * count never exceeds it.)  But GCC implements the subtraction as:
+	 *    bsr     edi, edi
+	 *    mov     eax, 31
+	 *    xor     edi, 31
+	 *    sub     eax, edi
+	 * If we write it as xor instead, then we get
+	 *    bsr     eax, edi
+	 * as desired.
+	 */
+	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
+}
+
+/*
+ * unsigned long version of fls_llu(): zero-based index of the highest set bit
+ * of x, computed as (bit width - 1) xor the leading-zero count.  x must be
+ * nonzero.
+ */
+static inline unsigned
+fls_lu(unsigned long x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
+}
+
+/*
+ * unsigned int version of fls_llu(): zero-based index of the highest set bit
+ * of x, computed as (bit width - 1) xor the leading-zero count.  x must be
+ * nonzero.
+ */
+static inline unsigned
+fls_u(unsigned x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
+}
+#elif defined(_MSC_VER)
+
+/*
+ * DO_BSR64 wraps the 64-bit bit-scan intrinsic, which is only referenced when
+ * LG_SIZEOF_PTR == 3.
+ */
+#if LG_SIZEOF_PTR == 3
+#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
+#else
+/*
+ * This never actually runs; we're just dodging a compiler error for the
+ * never-taken branch where sizeof(void *) == 8.
+ */
+#define DO_BSR64(bit, x) bit = 0; unreachable()
+#endif
+
+/*
+ * Body of the MSVC fls_*() functions, written in terms of the
+ * _BitScanReverse intrinsics.  Expands to a full function body and returns
+ * from the enclosing function.  The conditions on sizeof() are compile-time
+ * constants, so a single branch applies to any given instantiation:
+ *   - 4-byte x: one 32-bit scan;
+ *   - 8-byte x with 8-byte pointers: one 64-bit scan;
+ *   - 8-byte x with 4-byte pointers: scan the high 32 bits first, and fall
+ *     back to the low 32 bits if the high half is zero.
+ *
+ * Note that, unlike the other implementations (which only assume that x is
+ * nonzero), a zero argument is handled explicitly here: the result is the
+ * width of the type in bits.
+ */
+#define DO_FLS(x) do {							\
+	if (x == 0) {							\
+		return 8 * sizeof(x);					\
+	}								\
+	unsigned long bit;						\
+	if (sizeof(x) == 4) {						\
+		_BitScanReverse(&bit, (unsigned)x);			\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
+		DO_BSR64(bit, x);					\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
+		/* Dodge a compiler warning, as above. */		\
+		int constant_32 = sizeof(x) * 4;			\
+		if (_BitScanReverse(&bit,				\
+		    (unsigned)(x >> constant_32))) {			\
+			return 32 + (unsigned)bit;			\
+		} else {						\
+			_BitScanReverse(&bit, (unsigned)x);		\
+			return (unsigned)bit;				\
+		}							\
+	}								\
+	unreachable();							\
+} while (0)
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	DO_FLS(x);
+}
+
+#undef DO_FLS
+#undef DO_BSR64
+#else
+
+/*
+ * Neither the clz builtins nor the MSVC intrinsics are available; forward to
+ * the portable shift-based implementations.
+ */
+static inline unsigned
+fls_llu(unsigned long long x) {
+	return fls_llu_slow(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	return fls_lu_slow(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	return fls_u_slow(x);
+}
+#endif
+
 #ifdef JEMALLOC_INTERNAL_POPCOUNTL
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 popcount_lu(unsigned long bitmap) {
  return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
 }
@ -48,7 +191,7 @@ popcount_lu(unsigned long bitmap) {
 * place of bit.  bitmap *must not* be 0.
 */

-BIT_UTIL_INLINE size_t
+static inline size_t
 cfs_lu(unsigned long* bitmap) {
 	util_assume(*bitmap != 0);
 	size_t bit = ffs_lu(*bitmap);
@ -56,101 +199,102 @@ cfs_lu(unsigned long* bitmap) {
 	return bit;
 }

-BIT_UTIL_INLINE unsigned
-ffs_zu(size_t bitmap) {
+static inline unsigned
+ffs_zu(size_t x) {
 #if LG_SIZEOF_PTR == LG_SIZEOF_INT
-	return ffs_u(bitmap);
+	return ffs_u(x);
 #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
-	return ffs_lu(bitmap);
+	return ffs_lu(x);
 #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
-	return ffs_llu(bitmap);
+	return ffs_llu(x);
 #else
 #error No implementation for size_t ffs()
 #endif
 }

-BIT_UTIL_INLINE unsigned
-ffs_u64(uint64_t bitmap) {
+/*
+ * fls for size_t.  Forwards to the native-width variant whose size matches
+ * LG_SIZEOF_PTR (i.e. size_t is treated as pointer-sized); a configuration
+ * with no matching integer type is rejected at compile time.
+ */
+static inline unsigned
+fls_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return fls_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return fls_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return fls_llu(x);
+#else
+#error No implementation for size_t fls()
+#endif
+}
+
+
+static inline unsigned
+ffs_u64(uint64_t x) {
 #if LG_SIZEOF_LONG == 3
-	return ffs_lu(bitmap);
+	return ffs_lu(x);
 #elif LG_SIZEOF_LONG_LONG == 3
-	return ffs_llu(bitmap);
+	return ffs_llu(x);
 #else
 #error No implementation for 64-bit ffs()
 #endif
 }

-BIT_UTIL_INLINE unsigned
-ffs_u32(uint32_t bitmap) {
+/*
+ * fls for uint64_t.  Uses the unsigned long variant when long is 8 bytes,
+ * otherwise the unsigned long long variant when that is 8 bytes; fails at
+ * compile time if neither type is 64 bits wide.
+ */
+static inline unsigned
+fls_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return fls_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return fls_llu(x);
+#else
+#error No implementation for 64-bit fls()
+#endif
+}
+
+/*
+ * ffs for uint32_t.  Only implemented for a 32-bit unsigned int
+ * (LG_SIZEOF_INT == 2); any other configuration fails at compile time, so no
+ * return is needed after the conditional.
+ */
+static inline unsigned
+ffs_u32(uint32_t x) {
 #if LG_SIZEOF_INT == 2
-	return ffs_u(bitmap);
+	return ffs_u(x);
 #else
 #error No implementation for 32-bit ffs()
 #endif
-	return ffs_u(bitmap);
 }

-BIT_UTIL_INLINE uint64_t
+/*
+ * fls for uint32_t.  Only implemented for a 32-bit unsigned int
+ * (LG_SIZEOF_INT == 2); any other configuration fails at compile time, so no
+ * return is needed after the conditional.
+ */
+static inline unsigned
+fls_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return fls_u(x);
+#else
+#error No implementation for 32-bit fls()
+#endif
+}
+
+/*
+ * Return the smallest power of 2 that is >= x, except that 0 is returned
+ * unchanged.  For x > 1 this is the bit just above the highest set bit of
+ * x - 1.  The result must be representable, i.e. x must not exceed 2^63.
+ */
+static inline uint64_t
 pow2_ceil_u64(uint64_t x) {
-#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	if(unlikely(x <= 1)) {
+	if (unlikely(x <= 1)) {
 		return x;
 	}
-	size_t msb_on_index;
-#if (defined(__amd64__) || defined(__x86_64__))
-	asm ("bsrq %1, %0"
-			: "=r"(msb_on_index) // Outputs.
-			: "r"(x-1)           // Inputs.
-		);
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	msb_on_index = (63 ^ __builtin_clzll(x - 1));
-#endif
+	size_t msb_on_index = fls_u64(x - 1);
+	/*
+	 * Range-check; it's on the callers to ensure that the result of this
+	 * call won't overflow.
+	 */
 	assert(msb_on_index < 63);
 	return 1ULL << (msb_on_index + 1);
-#else
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x |= x >> 32;
-	x++;
-	return x;
-#endif
 }

-BIT_UTIL_INLINE uint32_t
+/*
+ * Return the smallest power of 2 that is >= x, except that 0 is returned
+ * unchanged.  For x > 1 this is the bit just above the highest set bit of
+ * x - 1.  The result must be representable, i.e. x must not exceed 2^31.
+ */
+static inline uint32_t
 pow2_ceil_u32(uint32_t x) {
-#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__)))
-	if(unlikely(x <= 1)) {
+	if (unlikely(x <= 1)) {
 		return x;
 	}
-	size_t msb_on_index;
-#if (defined(__i386__))
-	asm ("bsr %1, %0"
-			: "=r"(msb_on_index) // Outputs.
-			: "r"(x-1)           // Inputs.
-		);
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	msb_on_index = (31 ^ __builtin_clz(x - 1));
-#endif
+	size_t msb_on_index = fls_u32(x - 1);
+	/*
+	 * Range-check; it's on the callers to ensure that the result of this
+	 * call won't overflow.
+	 */
 	assert(msb_on_index < 31);
 	return 1U << (msb_on_index + 1);
-#else
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x++;
-	return x;
-#endif
 }

 /* Compute the smallest power of 2 that is >= x. */
-BIT_UTIL_INLINE size_t
+static inline size_t
 pow2_ceil_zu(size_t x) {
 #if (LG_SIZEOF_PTR == 3)
 	return pow2_ceil_u64(x);
@ -159,77 +303,21 @@ pow2_ceil_zu(size_t x) {
 #endif
 }

-#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-BIT_UTIL_INLINE unsigned
+/*
+ * Return the zero-based index of the highest set bit of x (the floor of its
+ * base-2 logarithm).  x must be nonzero.
+ *
+ * NOTE(review): the #else arm treats every pointer size other than 8 bytes as
+ * 4 bytes, whereas some of the implementations this replaces rejected other
+ * sizes with #error.  Confirm that no supported target relies on that check.
+ */
+static inline unsigned
 lg_floor(size_t x) {
-	size_t ret;
-	assert(x != 0);
-
-	asm ("bsr %1, %0"
-	    : "=r"(ret) // Outputs.
-	    : "r"(x)    // Inputs.
-	    );
-	assert(ret < UINT_MAX);
-	return (unsigned)ret;
-}
-#elif (defined(_MSC_VER))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	unsigned long ret;
-
-	assert(x != 0);
-
+	util_assume(x != 0);
 #if (LG_SIZEOF_PTR == 3)
-	_BitScanReverse64(&ret, x);
-#elif (LG_SIZEOF_PTR == 2)
-	_BitScanReverse(&ret, x);
+	return fls_u64(x);
 #else
-#  error "Unsupported type size for lg_floor()"
-#endif
-	assert(ret < UINT_MAX);
-	return (unsigned)ret;
-}
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	assert(x != 0);
-
-#if (LG_SIZEOF_PTR == LG_SIZEOF_INT)
-	return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x);
-#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG)
-	return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x);
-#else
-#  error "Unsupported type size for lg_floor()"
+	return fls_u32(x);
 #endif
 }
-#else
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	assert(x != 0);

-	x |= (x >> 1);
-	x |= (x >> 2);
-	x |= (x >> 4);
-	x |= (x >> 8);
-	x |= (x >> 16);
-#if (LG_SIZEOF_PTR == 3)
-	x |= (x >> 32);
-#endif
-	if (x == SIZE_T_MAX) {
-		return (8 << LG_SIZEOF_PTR) - 1;
-	}
-	x++;
-	return ffs_zu(x) - 1;
-}
-#endif
-
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 lg_ceil(size_t x) {
 	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
 }

-#undef BIT_UTIL_INLINE
-
 /* A compile-time version of lg_floor and lg_ceil. */
 #define LG_FLOOR_1(x) 0
 #define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
--- a/test/unit/bit_util.c
+++ b/test/unit/bit_util.c
@ -120,7 +120,6 @@ TEST_BEGIN(test_ffs_u) {
 }
 TEST_END

-
 TEST_BEGIN(test_ffs_lu) {
 	TEST_FFS(unsigned long, lu, lu, "lu");
 }
@ -136,7 +135,6 @@ TEST_BEGIN(test_ffs_u32) {
 }
 TEST_END

-
 TEST_BEGIN(test_ffs_u64) {
 	TEST_FFS(uint64_t, u64, u64, FMTu64);
 }
@ -147,9 +145,69 @@ TEST_BEGIN(test_ffs_zu) {
 }
 TEST_END

+/*
+ * Check fls_<suf>() on every value of type t that has between one and three
+ * bits set: i is the highest set bit, and j and k (k <= j <= i) are lower
+ * bits that may coincide with it.  The expected result is always i.
+ *
+ * t is the integer type under test, suf selects the fls_ function, test_suf
+ * selects the expect_*_eq comparison, and pri is the format specifier used to
+ * print x on failure.
+ */
+#define TEST_FLS(t, suf, test_suf, pri) do {				\
+	for (unsigned i = 0; i < sizeof(t) * 8; i++) {			\
+		for (unsigned j = 0; j <= i; j++) {			\
+			for (unsigned k = 0; k <= j; k++) {		\
+				t x = (t)1 << i;			\
+				x |= (t)1 << j;				\
+				x |= (t)1 << k;				\
+				expect_##test_suf##_eq(fls_##suf(x), i,	\
+				    "Unexpected result, x=%"pri, x);	\
+			}						\
+		}							\
+	}								\
+} while(0)
+
+/*
+ * fls_*() as selected for this build (clz builtin, MSVC intrinsic, or the
+ * portable fallback), for each native and fixed-width type.
+ */
+TEST_BEGIN(test_fls_u) {
+	TEST_FLS(unsigned, u, u,"u");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_lu) {
+	TEST_FLS(unsigned long, lu, lu, "lu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_llu) {
+	TEST_FLS(unsigned long long, llu, qd, "llu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u32) {
+	TEST_FLS(uint32_t, u32, u32, FMTu32);
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u64) {
+	TEST_FLS(uint64_t, u64, u64, FMTu64);
+}
+TEST_END
+
+TEST_BEGIN(test_fls_zu) {
+	TEST_FLS(size_t, zu, zu, "zu");
+}
+TEST_END
+
+/*
+ * The portable shift-based versions, exercised directly: fls_*() only
+ * forwards to them when no builtin or intrinsic is available, so they would
+ * otherwise go untested on most builds.
+ */
+TEST_BEGIN(test_fls_u_slow) {
+	TEST_FLS(unsigned, u_slow, u,"u");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_lu_slow) {
+	TEST_FLS(unsigned long, lu_slow, lu, "lu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_llu_slow) {
+	TEST_FLS(unsigned long long, llu_slow, qd, "llu");
+}
+TEST_END
+
+
 int
 main(void) {
-	return test(
+	return test_no_reentrancy(
 	    test_pow2_ceil_u64,
 	    test_pow2_ceil_u32,
 	    test_pow2_ceil_zu,
@ -159,5 +217,14 @@ main(void) {
 	    test_ffs_llu,
 	    test_ffs_u32,
 	    test_ffs_u64,
-	    test_ffs_zu);
+	    test_ffs_zu,
+	    test_fls_u,
+	    test_fls_lu,
+	    test_fls_llu,
+	    test_fls_u32,
+	    test_fls_u64,
+	    test_fls_zu,
+	    test_fls_u_slow,
+	    test_fls_lu_slow,
+	    test_fls_llu_slow);
 }