From 22da836094f315b3fe1609e21c0e1092e7b0f2f5 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Wed, 22 Jul 2020 07:10:06 -0700
Subject: [PATCH] bit_util: Add fls_ functions; "find last set".

These simplify a lot of the bit_util module, which had grown bits and
pieces of this functionality across a variety of places over the years.

While we're here, kill off BIT_UTIL_INLINE and don't do reentrancy
testing for bit_util.
---
 configure.ac                         |   6 +-
 include/jemalloc/internal/bit_util.h | 344 +++++++++++++++++----------
 test/unit/bit_util.c                 |  75 +++++-
 3 files changed, 292 insertions(+), 133 deletions(-)

diff --git a/configure.ac b/configure.ac
index bcd63632..b197d32e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2118,7 +2118,7 @@ esac
 fi
 
 dnl ============================================================================
-dnl Check for __builtin_clz() and __builtin_clzl().
+dnl Check for __builtin_clz(), __builtin_clzl(), and __builtin_clzll().
 
 AC_CACHE_CHECK([for __builtin_clz],
                [je_cv_builtin_clz],
@@ -2132,6 +2132,10 @@ AC_CACHE_CHECK([for __builtin_clz],
 			unsigned long x = 0;
 			int y = __builtin_clzl(x);
 		}
+		{
+			unsigned long long x = 0;
+			int y = __builtin_clzll(x);
+		}
 ])],
                [je_cv_builtin_clz=yes],
                [je_cv_builtin_clz=no])])
diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h
index 258fd978..c5158f67 100644
--- a/include/jemalloc/internal/bit_util.h
+++ b/include/jemalloc/internal/bit_util.h
@@ -3,8 +3,6 @@
 
 #include "jemalloc/internal/assert.h"
 
-#define BIT_UTIL_INLINE static inline
-
 /* Sanity check. */
 #if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
     || !defined(JEMALLOC_INTERNAL_FFS)
@@ -18,26 +16,171 @@
  * value of zero as a sentinel. This tends to simplify logic in callers, and
  * allows for consistency with the builtins we build fls on top of.
  */
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_llu(unsigned long long x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
 }
 
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_lu(unsigned long x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFSL(x) - 1;
 }
 
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 ffs_u(unsigned x) {
 	util_assume(x != 0);
 	return JEMALLOC_INTERNAL_FFS(x) - 1;
 }
 
+#define DO_FLS_SLOW(x, suffix) do {					\
+	util_assume(x != 0);						\
+	x |= (x >> 1);							\
+	x |= (x >> 2);							\
+	x |= (x >> 4);							\
+	x |= (x >> 8);							\
+	x |= (x >> 16);							\
+	if (sizeof(x) > 4) {						\
+		/*							\
+		 * If sizeof(x) is 4, then the expression "x >> 32"	\
+		 * will generate compiler warnings even if the code	\
+		 * never executes.  This circumvents the warning, and	\
+		 * gets compiled out in optimized builds.		\
+		 */							\
+		int constant_32 = sizeof(x) * 4;			\
+		x |= (x >> constant_32);				\
+	}								\
+	x++;								\
+	if (x == 0) {							\
+		return 8 * sizeof(x) - 1;				\
+	}								\
+	return ffs_##suffix(x) - 1;					\
+} while (0)
+
+static inline unsigned
+fls_llu_slow(unsigned long long x) {
+	DO_FLS_SLOW(x, llu);
+}
+
+static inline unsigned
+fls_lu_slow(unsigned long x) {
+	DO_FLS_SLOW(x, lu);
+}
+
+static inline unsigned
+fls_u_slow(unsigned x) {
+	DO_FLS_SLOW(x, u);
+}
+
+#undef DO_FLS_SLOW
+
+#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
+static inline unsigned
+fls_llu(unsigned long long x) {
+	util_assume(x != 0);
+	/*
+	 * Note that the xor here is more naturally written as subtraction: the
+	 * index of the last set bit is one less than the bit width, minus the
+	 * number of leading zero bits.  But GCC implements that as:
+	 *     bsr     edi, edi
+	 *     mov     eax, 31
+	 *     xor     edi, 31
+	 *     sub     eax, edi
+	 * If we write it as xor instead, then we get
+	 *     bsr     eax, edi
+	 * as desired.
+	 */
+	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
+}
+#elif defined(_MSC_VER)
+
+#if LG_SIZEOF_PTR == 3
+#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
+#else
+/*
+ * This never actually runs; we're just dodging a compiler error for the
+ * never-taken branch where sizeof(void *) == 8.
+ */
+#define DO_BSR64(bit, x) bit = 0; unreachable()
+#endif
+
+#define DO_FLS(x) do {							\
+	if (x == 0) {							\
+		return 8 * sizeof(x);					\
+	}								\
+	unsigned long bit;						\
+	if (sizeof(x) == 4) {						\
+		_BitScanReverse(&bit, (unsigned)x);			\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
+		DO_BSR64(bit, x);					\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
+		/* Dodge a compiler warning, as above. */		\
+		int constant_32 = sizeof(x) * 4;			\
+		if (_BitScanReverse(&bit,				\
+		    (unsigned)(x >> constant_32))) {			\
+			return 32 + (unsigned)bit;			\
+		} else {						\
+			_BitScanReverse(&bit, (unsigned)x);		\
+			return (unsigned)bit;				\
+		}							\
+	}								\
+	unreachable();							\
+} while (0)
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	DO_FLS(x);
+}
+
+#undef DO_FLS
+#undef DO_BSR64
+#else
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	return fls_llu_slow(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	return fls_lu_slow(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	return fls_u_slow(x);
+}
+#endif
+
 #ifdef JEMALLOC_INTERNAL_POPCOUNTL
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 popcount_lu(unsigned long bitmap) {
 	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
 }
@@ -48,7 +191,7 @@ popcount_lu(unsigned long bitmap) {
  * Clears first set bit in bitmap, and places index of bit in first_bit in
  * place of bit. bitmap *must not* be 0.
  */
-BIT_UTIL_INLINE size_t
+static inline size_t
 cfs_lu(unsigned long* bitmap) {
 	util_assume(*bitmap != 0);
 	size_t bit = ffs_lu(*bitmap);
@@ -56,101 +199,102 @@ cfs_lu(unsigned long* bitmap) {
 	return bit;
 }
 
-BIT_UTIL_INLINE unsigned
-ffs_zu(size_t bitmap) {
+static inline unsigned
+ffs_zu(size_t x) {
 #if LG_SIZEOF_PTR == LG_SIZEOF_INT
-	return ffs_u(bitmap);
+	return ffs_u(x);
 #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
-	return ffs_lu(bitmap);
+	return ffs_lu(x);
 #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
-	return ffs_llu(bitmap);
+	return ffs_llu(x);
 #else
 #error No implementation for size_t ffs()
 #endif
 }
 
-BIT_UTIL_INLINE unsigned
-ffs_u64(uint64_t bitmap) {
+static inline unsigned
+fls_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return fls_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return fls_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return fls_llu(x);
+#else
+#error No implementation for size_t fls()
+#endif
+}
+
+
+static inline unsigned
+ffs_u64(uint64_t x) {
 #if LG_SIZEOF_LONG == 3
-	return ffs_lu(bitmap);
+	return ffs_lu(x);
 #elif LG_SIZEOF_LONG_LONG == 3
-	return ffs_llu(bitmap);
+	return ffs_llu(x);
 #else
 #error No implementation for 64-bit ffs()
 #endif
 }
 
-BIT_UTIL_INLINE unsigned
-ffs_u32(uint32_t bitmap) {
+static inline unsigned
+fls_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return fls_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return fls_llu(x);
+#else
+#error No implementation for 64-bit fls()
+#endif
+}
+
+static inline unsigned
+ffs_u32(uint32_t x) {
 #if LG_SIZEOF_INT == 2
-	return ffs_u(bitmap);
+	return ffs_u(x);
 #else
 #error No implementation for 32-bit ffs()
 #endif
-	return ffs_u(bitmap);
+	return ffs_u(x);
 }
 
-BIT_UTIL_INLINE uint64_t
+static inline unsigned
+fls_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return fls_u(x);
+#else
+#error No implementation for 32-bit fls()
+#endif
+	return fls_u(x);
+}
+
+static inline uint64_t
 pow2_ceil_u64(uint64_t x) {
-#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	if(unlikely(x <= 1)) {
+	if (unlikely(x <= 1)) {
 		return x;
 	}
-	size_t msb_on_index;
-#if (defined(__amd64__) || defined(__x86_64__))
-	asm ("bsrq %1, %0"
-	     : "=r"(msb_on_index) // Outputs.
-	     : "r"(x-1) // Inputs.
-	     );
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	msb_on_index = (63 ^ __builtin_clzll(x - 1));
-#endif
+	size_t msb_on_index = fls_u64(x - 1);
+	/*
+	 * Range-check; it's on the callers to ensure that the result of this
+	 * call won't overflow.
	 */
 	assert(msb_on_index < 63);
 	return 1ULL << (msb_on_index + 1);
-#else
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x |= x >> 32;
-	x++;
-	return x;
-#endif
 }
 
-BIT_UTIL_INLINE uint32_t
+static inline uint32_t
 pow2_ceil_u32(uint32_t x) {
-#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__)))
-	if(unlikely(x <= 1)) {
-		return x;
+	if (unlikely(x <= 1)) {
+		return x;
 	}
-	size_t msb_on_index;
-#if (defined(__i386__))
-	asm ("bsr %1, %0"
-	     : "=r"(msb_on_index) // Outputs.
-	     : "r"(x-1) // Inputs.
-	     );
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-	msb_on_index = (31 ^ __builtin_clz(x - 1));
-#endif
+	size_t msb_on_index = fls_u32(x - 1);
+	/* As above. */
 	assert(msb_on_index < 31);
 	return 1U << (msb_on_index + 1);
-#else
-	x--;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x++;
-	return x;
-#endif
 }
 
 /* Compute the smallest power of 2 that is >= x. */
-BIT_UTIL_INLINE size_t
+static inline size_t
 pow2_ceil_zu(size_t x) {
 #if (LG_SIZEOF_PTR == 3)
 	return pow2_ceil_u64(x);
@@ -159,77 +303,21 @@ pow2_ceil_zu(size_t x) {
 #endif
 }
 
-#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 lg_floor(size_t x) {
-	size_t ret;
-	assert(x != 0);
-
-	asm ("bsr %1, %0"
-	     : "=r"(ret) // Outputs.
-	     : "r"(x) // Inputs.
-	     );
-	assert(ret < UINT_MAX);
-	return (unsigned)ret;
-}
-#elif (defined(_MSC_VER))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	unsigned long ret;
-
-	assert(x != 0);
-
+	util_assume(x != 0);
 #if (LG_SIZEOF_PTR == 3)
-	_BitScanReverse64(&ret, x);
-#elif (LG_SIZEOF_PTR == 2)
-	_BitScanReverse(&ret, x);
+	return fls_u64(x);
 #else
-# error "Unsupported type size for lg_floor()"
-#endif
-	assert(ret < UINT_MAX);
-	return (unsigned)ret;
-}
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	assert(x != 0);
-
-#if (LG_SIZEOF_PTR == LG_SIZEOF_INT)
-	return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x);
-#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG)
-	return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x);
-#else
-# error "Unsupported type size for lg_floor()"
+	return fls_u32(x);
 #endif
 }
-#else
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
-	assert(x != 0);
-	x |= (x >> 1);
-	x |= (x >> 2);
-	x |= (x >> 4);
-	x |= (x >> 8);
-	x |= (x >> 16);
-#if (LG_SIZEOF_PTR == 3)
-	x |= (x >> 32);
-#endif
-	if (x == SIZE_T_MAX) {
-		return (8 << LG_SIZEOF_PTR) - 1;
-	}
-	x++;
-	return ffs_zu(x) - 1;
-}
-#endif
 
-BIT_UTIL_INLINE unsigned
+static inline unsigned
 lg_ceil(size_t x) {
 	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
 }
 
-#undef BIT_UTIL_INLINE
-
 /* A compile-time version of lg_floor and lg_ceil. */
 #define LG_FLOOR_1(x) 0
 #define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
diff --git a/test/unit/bit_util.c b/test/unit/bit_util.c
index f3761fd7..045cf8b4 100644
--- a/test/unit/bit_util.c
+++ b/test/unit/bit_util.c
@@ -120,7 +120,6 @@ TEST_BEGIN(test_ffs_u) {
 }
 TEST_END
 
-
 TEST_BEGIN(test_ffs_lu) {
 	TEST_FFS(unsigned long, lu, lu, "lu");
 }
@@ -136,7 +135,6 @@ TEST_BEGIN(test_ffs_u32) {
 }
 TEST_END
 
-
 TEST_BEGIN(test_ffs_u64) {
 	TEST_FFS(uint64_t, u64, u64, FMTu64);
 }
@@ -147,9 +145,69 @@ TEST_BEGIN(test_ffs_zu) {
 }
 TEST_END
 
+#define TEST_FLS(t, suf, test_suf, pri) do {				\
+	for (unsigned i = 0; i < sizeof(t) * 8; i++) {			\
+		for (unsigned j = 0; j <= i; j++) {			\
+			for (unsigned k = 0; k <= j; k++) {		\
+				t x = (t)1 << i;			\
+				x |= (t)1 << j;				\
+				x |= (t)1 << k;				\
+				expect_##test_suf##_eq(fls_##suf(x), i,	\
+				    "Unexpected result, x=%"pri, x);	\
+			}						\
+		}							\
+	}								\
+} while (0)
+
+TEST_BEGIN(test_fls_u) {
+	TEST_FLS(unsigned, u, u, "u");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_lu) {
+	TEST_FLS(unsigned long, lu, lu, "lu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_llu) {
+	TEST_FLS(unsigned long long, llu, qd, "llu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u32) {
+	TEST_FLS(uint32_t, u32, u32, FMTu32);
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u64) {
+	TEST_FLS(uint64_t, u64, u64, FMTu64);
+}
+TEST_END
+
+TEST_BEGIN(test_fls_zu) {
+	TEST_FLS(size_t, zu, zu, "zu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_u_slow) {
+	TEST_FLS(unsigned, u_slow, u, "u");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_lu_slow) {
+	TEST_FLS(unsigned long, lu_slow, lu, "lu");
+}
+TEST_END
+
+TEST_BEGIN(test_fls_llu_slow) {
+	TEST_FLS(unsigned long long, llu_slow, qd, "llu");
+}
+TEST_END
+
+
 int
 main(void) {
-	return test(
+	return test_no_reentrancy(
 	    test_pow2_ceil_u64,
 	    test_pow2_ceil_u32,
 	    test_pow2_ceil_zu,
@@ -159,5 +217,14 @@ main(void) {
 	    test_ffs_llu,
 	    test_ffs_u32,
 	    test_ffs_u64,
-	    test_ffs_zu);
+	    test_ffs_zu,
+	    test_fls_u,
+	    test_fls_lu,
+	    test_fls_llu,
+	    test_fls_u32,
+	    test_fls_u64,
+	    test_fls_zu,
+	    test_fls_u_slow,
+	    test_fls_lu_slow,
+	    test_fls_llu_slow);
 }
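
For readers tracing the two fls_ strategies above, here is a standalone sketch
(not part of the patch; fls_u32_smear and the test driver are illustrative
names) that mirrors the bit-smearing fallback of DO_FLS_SLOW and cross-checks
it against the xor form used in the JEMALLOC_HAVE_BUILTIN_CLZ branch. The
cross-check assumes a GCC/Clang-style __builtin_clz.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirrors DO_FLS_SLOW: smear the highest set bit rightward so that x
 * becomes 2^(fls(x) + 1) - 1, then increment to isolate the next power
 * of two and count its trailing zeros.
 */
static unsigned
fls_u32_smear(uint32_t x) {
	assert(x != 0);
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x++;
	if (x == 0) {
		/* The top bit was set; the increment wrapped to zero. */
		return 31;
	}
	/* x is now 2^(fls + 1); its trailing-zero count is fls + 1. */
	unsigned i = 0;
	while ((x & 1) == 0) {
		x >>= 1;
		i++;
	}
	return i - 1;
}

int
main(void) {
	/* Exercise all two-bit patterns, as the patch's TEST_FLS does. */
	for (unsigned i = 0; i < 32; i++) {
		for (unsigned j = 0; j <= i; j++) {
			uint32_t x = ((uint32_t)1 << i) | ((uint32_t)1 << j);
			assert(fls_u32_smear(x) == i);
#ifdef __GNUC__
			/*
			 * The xor form from the patch: for nonzero x, clz(x)
			 * lies in [0, 31], and 31 is all ones over that
			 * range, so 31 ^ clz(x) == 31 - clz(x).
			 */
			assert((31u ^ (unsigned)__builtin_clz(x)) == i);
#endif
		}
	}
	printf("fls sketches agree\n");
	return 0;
}

The invariant doing the work is that smearing turns x into 2^(fls(x) + 1) - 1,
so the increment either isolates the next power of two or wraps to zero when
the top bit was already set, which is exactly the case DO_FLS_SLOW handles
with its post-increment zero check.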
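Likewise, the rewritten pow2_ceil_u64/pow2_ceil_u32 and lg_ceil reduce to two
small identities on top of a find-last-set primitive. A minimal sketch of
those identities, with hypothetical names (fls_loop, pow2_ceil_u32_demo, and
lg_ceil_demo are illustrative, not jemalloc API):

#include <assert.h>
#include <stdint.h>

/* Reference fls: index of the highest set bit (fls_loop(1) == 0). */
static unsigned
fls_loop(uint32_t x) {
	assert(x != 0);
	unsigned i = 0;
	while (x >>= 1) {
		i++;
	}
	return i;
}

static uint32_t
pow2_ceil_u32_demo(uint32_t x) {
	if (x <= 1) {
		return x;
	}
	/* Smallest power of two >= x is 1 << (fls(x - 1) + 1) for x > 1. */
	unsigned msb_on_index = fls_loop(x - 1);
	assert(msb_on_index < 31);	/* Caller must keep the result in range. */
	return (uint32_t)1 << (msb_on_index + 1);
}

static unsigned
lg_ceil_demo(uint32_t x) {
	/* lg_floor, plus one unless x is already a power of two. */
	return fls_loop(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}

int
main(void) {
	for (uint32_t x = 2; x <= 4096; x++) {
		uint32_t p = pow2_ceil_u32_demo(x);
		assert(p >= x && p < 2 * x);
		assert((p & (p - 1)) == 0);	/* p is a power of two. */
		assert(lg_ceil_demo(x) == fls_loop(p));
	}
	return 0;
}

This also shows why the patch can bound-check with assert(msb_on_index < 31)
and assert(msb_on_index < 63): the only inputs whose rounded-up power of two
would overflow are those above the top representable power, and the patch
explicitly leaves avoiding them to the callers.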