#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtins and POSIX ffs functions, our ffs requires a non-zero
 * input, and returns the position of the lowest bit set (as opposed to the
 * POSIX versions, which return 1 larger than that position and use a return
 * value of zero as a sentinel).  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}

#define DO_FLS_SLOW(x, suffix) do { \
	util_assume(x != 0); \
	x |= (x >> 1); \
	x |= (x >> 2); \
	x |= (x >> 4); \
	x |= (x >> 8); \
	x |= (x >> 16); \
	if (sizeof(x) > 4) { \
		/* \
		 * If sizeof(x) is 4, then the expression "x >> 32" \
		 * will generate compiler warnings even if the code \
		 * never executes.  This circumvents the warning, and \
		 * gets compiled out in optimized builds. \
		 */ \
		int constant_32 = sizeof(x) * 4; \
		x |= (x >> constant_32); \
	} \
	x++; \
	if (x == 0) { \
		return 8 * sizeof(x) - 1; \
	} \
	return ffs_##suffix(x) - 1; \
} while(0)

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW

#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction;
	 * the index of the highest set bit is the number of bits in the
	 * type, minus one, minus the number of leading zero bits.  But GCC
	 * implements that as:
	 *     bsr edi, edi
	 *     mov eax, 31
	 *     xor edi, 31
	 *     sub eax, edi
	 * If we write it as xor instead, then we get
	 *     bsr eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error for the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do { \
	if (x == 0) { \
		return 8 * sizeof(x); \
	} \
	unsigned long bit; \
	if (sizeof(x) == 4) { \
		_BitScanReverse(&bit, (unsigned)x); \
		return (unsigned)bit; \
	} \
	if (sizeof(x) == 8 && sizeof(void *) == 8) { \
		DO_BSR64(bit, x); \
		return (unsigned)bit; \
	} \
	if (sizeof(x) == 8 && sizeof(void *) == 4) { \
		/* Dodge a compiler warning, as above. */ \
		int constant_32 = sizeof(x) * 4; \
		if (_BitScanReverse(&bit, \
		    (unsigned)(x >> constant_32))) { \
			return 32 + (unsigned)bit; \
		} else { \
			_BitScanReverse(&bit, (unsigned)x); \
			return (unsigned)bit; \
		} \
	} \
	unreachable(); \
} while (0)

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif
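
/*
 * Illustrative sketch only; not part of jemalloc's interface, and the
 * function name below is invented for this example.  It walks the
 * smear-and-increment trick from DO_FLS_SLOW on a concrete 32-bit value so
 * the data flow is easier to follow: 0x14 (0b10100) smears to 0x1f,
 * 0x1f + 1 == 0x20, ffs_u(0x20) == 5, and 5 - 1 == 4, which is the index of
 * the highest set bit of 0x14.
 */
static inline unsigned
fls_u_worked_example(void) {
	unsigned x = 0x14;
	/* Propagate the highest set bit into every lower position. */
	x |= (x >> 1);		/* 0x1e */
	x |= (x >> 2);		/* 0x1f */
	x |= (x >> 4);		/* Already saturated; no further change. */
	x |= (x >> 8);
	x |= (x >> 16);
	/* x + 1 is now the power of two just above the original value. */
	x++;			/* 0x20 */
	/* Its lone set bit sits one position past the answer. */
	return ffs_u(x) - 1;	/* 5 - 1 == 4 */
}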

#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do { \
	/* \
	 * Algorithm from an old AMD optimization reference manual. \
	 * We're putting a little bit more work than you might expect \
	 * into the no-intrinsic case, since we only support the \
	 * GCC intrinsics spelling of popcount (for now).  Detecting \
	 * whether or not the popcount builtin is actually usable in \
	 * MSVC is nontrivial. \
	 */ \
	\
	type bmul = (type)0x0101010101010101ULL; \
	\
	/* \
	 * Replace each 2 bits with the sideways sum of the original \
	 * values.  0x5 = 0b0101. \
	 * \
	 * You might expect this to be: \
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...). \
	 * That costs an extra mask relative to this, though. \
	 */ \
	x = x - ((x >> 1) & (0x55U * bmul)); \
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */ \
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \
	/* \
	 * Replace each 8 bits with their sideways sum.  Note that we \
	 * can't overflow within each 4-bit sum here, so we can skip \
	 * the initial mask. \
	 */ \
	x = (x + (x >> 4)) & (bmul * 0x0FU); \
	/* \
	 * None of the partial sums in this multiplication (viewed in \
	 * base-256) can overflow into the next digit.  So the least \
	 * significant byte of the product will be the least \
	 * significant byte of the original value, the second least \
	 * significant byte will be the sum of the two least \
	 * significant bytes of the original value, and so on. \
	 * Importantly, the high byte will be the byte-wise sum of all \
	 * the bytes of the original value. \
	 */ \
	x = x * bmul; \
	x >>= ((sizeof(x) - 1) * 8); \
	return (unsigned)x; \
} while(0)

static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT

static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the lowest set bit in *bitmap and returns its position.
 * *bitmap must not be 0.
 */
static inline size_t
cfs_lu(unsigned long* bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}
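
/*
 * Illustrative sketch only; not part of jemalloc's interface, and the
 * function name below is invented for this example.  It shows the intended
 * cfs_lu usage pattern: repeatedly clearing the lowest set bit in order to
 * visit every set bit of a bitmap, here by summing the visited bit indices.
 */
static inline size_t
cfs_lu_usage_example(unsigned long bits) {
	size_t sum = 0;
	while (bits != 0) {
		/* Returns the index of the lowest set bit and clears it. */
		size_t i = cfs_lu(&bits);
		sum += i;
	}
	return sum;
}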
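
/*
 * Illustrative sketch only; not part of jemalloc's interface, and the
 * function name below is invented for this example.  It traces the
 * DO_POPCOUNT byte-multiply trick above on a concrete 32-bit value;
 * popcount_u_slow(0xf0f00f0f) computes the same thing and also yields 16.
 */
static inline unsigned
popcount_u_worked_example(void) {
	uint32_t x = 0xf0f00f0fU;	/* Four bytes, each with 4 bits set. */
	uint32_t bmul = 0x01010101U;
	/* Pairwise sums: each 2-bit field now holds 0, 1, or 2. */
	x = x - ((x >> 1) & (0x55U * bmul));			/* 0xa0a00a0a */
	/* Nibble sums: each 4-bit field now holds 0..4. */
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));	/* 0x40400404 */
	/* Byte sums: each byte now holds its own popcount. */
	x = (x + (x >> 4)) & (bmul * 0x0FU);			/* 0x04040404 */
	/* The high byte of x * bmul is the sum of all four byte counts. */
	x = x * bmul;						/* 0x100c0804 */
	x >>= (sizeof(x) - 1) * 8;				/* 0x10 == 16 */
	return (unsigned)x;
}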

static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}

static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
}

static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}

/* Compute the smallest power of 2 that is >= x. */
static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}

static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}

/* A compile-time version of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
# define LG_FLOOR(x) LG_FLOOR_32((x))
#else
# define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))

#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */