#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/sec.h"

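/*
 * These functions implement the pai_t interface; sec_init() installs them
 * into sec->pai below.  The (sec_t *)self casts in their bodies rely on the
 * pai_t being embedded at the start of sec_t.
 */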
static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero);
static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero);
static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size);
static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);

bool
sec_init(sec_t *sec, pai_t *fallback, size_t nshards, size_t alloc_max,
    size_t bytes_max) {
	if (nshards > SEC_NSHARDS_MAX) {
		nshards = SEC_NSHARDS_MAX;
	}
	for (size_t i = 0; i < nshards; i++) {
		sec_shard_t *shard = &sec->shards[i];
		bool err = malloc_mutex_init(&shard->mtx, "sec_shard",
		    WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive);
		if (err) {
			return true;
		}
		shard->enabled = true;
		for (pszind_t j = 0; j < SEC_NPSIZES; j++) {
			edata_list_active_init(&shard->freelist[j]);
		}
		shard->bytes_cur = 0;
	}
	sec->fallback = fallback;
	sec->alloc_max = alloc_max;
	if (sec->alloc_max > sz_pind2sz(SEC_NPSIZES - 1)) {
		sec->alloc_max = sz_pind2sz(SEC_NPSIZES - 1);
	}

	sec->bytes_max = bytes_max;
	sec->nshards = nshards;

	/*
	 * Initialize these last so that an improper use of an SEC whose
	 * initialization failed will segfault in an easy-to-spot way.
	 */
	sec->pai.alloc = &sec_alloc;
	sec->pai.expand = &sec_expand;
	sec->pai.shrink = &sec_shrink;
	sec->pai.dalloc = &sec_dalloc;

	return false;
}

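/*
 * A minimal usage sketch (hypothetical values; assumes a pai_t *fallback and
 * a tsdn_t *tsdn are already in hand, real call sites differ):
 *
 *	sec_t sec;
 *	if (!sec_init(&sec, fallback, 4, 32 * PAGE, 256 * PAGE)) {
 *		edata_t *e = pai_alloc(tsdn, &sec.pai, 2 * PAGE, PAGE, false);
 *		...
 *		pai_dalloc(tsdn, &sec.pai, e);
 *	}
 *
 * On success sec_init() returns false and &sec.pai can be handed to anything
 * that speaks the pai_t interface.
 */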
static sec_shard_t *
sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
	/*
	 * Eventually, we should implement affinity, tracking source shard
	 * using the edata_t's newly freed up fields.  For now, just randomly
	 * distribute across all shards.
	 */
	if (tsdn_null(tsdn)) {
		return &sec->shards[0];
	}
	tsd_t *tsd = tsdn_tsd(tsdn);
	/*
	 * Use the trick from Daniel Lemire's "A fast alternative to the modulo
	 * reduction".  Use a 64 bit number to store 32 bits, since we'll
	 * deliberately overflow when we multiply by the number of shards.
	 */
	uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32);
	uint32_t idx = (uint32_t)((rand32 * (uint64_t)sec->nshards) >> 32);
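	/*
	 * Worked example (hypothetical numbers): with nshards == 3 and
	 * rand32 == 0x80000000, the product is 0x1'8000'0000 and the high 32
	 * bits give idx == 1, i.e. floor((rand32 / 2^32) * nshards), without
	 * any division.
	 */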
	return &sec->shards[idx];
}

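/*
 * Pops one cached extent of the requested page size class, if any.  The
 * caller must hold the shard mutex; returns NULL when the bin is empty or the
 * shard has been disabled.
 */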
static edata_t *
sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    pszind_t pszind) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!shard->enabled) {
		return NULL;
	}
	edata_t *edata = edata_list_active_first(&shard->freelist[pszind]);
	if (edata != NULL) {
		edata_list_active_remove(&shard->freelist[pszind], edata);
		assert(edata_size_get(edata) <= shard->bytes_cur);
		shard->bytes_cur -= edata_size_get(edata);
	}
	return edata;
}

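/*
 * Only non-zeroed, at-most-page-aligned, page-multiple requests no larger
 * than alloc_max are cache-eligible; everything else is forwarded to the
 * fallback allocator unmodified.
 */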
static edata_t *
sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero) {
	assert((size & PAGE_MASK) == 0);

	sec_t *sec = (sec_t *)self;

	if (zero || alignment > PAGE || sec->nshards == 0
	    || size > sec->alloc_max) {
		return pai_alloc(tsdn, sec->fallback, size, alignment, zero);
	}
	pszind_t pszind = sz_psz2ind(size);
	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, pszind);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (edata == NULL) {
		/*
		 * See the note in dalloc, below; really, we should add a
		 * batch_alloc method to the PAI and get more than one extent
		 * at a time.
		 */
		edata = pai_alloc(tsdn, sec->fallback, size, alignment, zero);
	}
	return edata;
}

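/*
 * Expand and shrink never touch the cache: only fully deallocated extents sit
 * in the shard freelists, so in-place resizing is forwarded to the fallback
 * unchanged.
 */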
static bool
sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero) {
	sec_t *sec = (sec_t *)self;
	return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero);
}

static bool
sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size) {
	sec_t *sec = (sec_t *)self;
	return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size);
}

static void
sec_do_flush_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->bytes_cur = 0;
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	for (pszind_t i = 0; i < SEC_NPSIZES; i++) {
		edata_list_active_concat(&to_flush, &shard->freelist[i]);
	}
	/*
	 * A better way to do this would be to add a batch dalloc function to
	 * the pai_t.  Practically, the current method turns into O(n) locks
	 * and unlocks at the fallback allocator.  But some implementations
	 * (e.g. HPA) can straightforwardly do many deallocations in a single
	 * lock / unlock pair.
	 */
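	/*
	 * A sketch of such a hypothetical interface (not part of pai_t as it
	 * stands):
	 *
	 *	void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
	 *	    edata_list_active_t *list);
	 *
	 * would let the fallback drain to_flush under a single lock
	 * acquisition.
	 */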
	while (!edata_list_active_empty(&to_flush)) {
		edata_t *e = edata_list_active_first(&to_flush);
		edata_list_active_remove(&to_flush, e);
		pai_dalloc(tsdn, sec->fallback, e);
	}
}

static void
sec_shard_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(shard->bytes_cur <= sec->bytes_max);
	size_t size = edata_size_get(edata);
	pszind_t pszind = sz_psz2ind(size);
	/*
	 * Prepending here results in LIFO allocation per bin, which seems
	 * reasonable.
	 */
	edata_list_active_prepend(&shard->freelist[pszind], edata);
	shard->bytes_cur += size;
	if (shard->bytes_cur > sec->bytes_max) {
		/*
		 * We've exceeded the shard limit.  We make two nods in the
		 * direction of fragmentation avoidance: we flush everything in
		 * the shard, rather than one particular bin, and we hold the
		 * lock while flushing (in case one of the extents we flush is
		 * highly preferred from a fragmentation-avoidance perspective
		 * in the backing allocator).  This has the extra advantage of
		 * not requiring advanced cache balancing strategies.
		 */
		sec_do_flush_locked(tsdn, sec, shard);
	}
}

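/*
 * Deallocation path: small-enough extents are stashed in a randomly chosen
 * shard; extents above alloc_max, dallocs against a disabled shard, or a
 * zero-shard SEC go straight back to the fallback allocator.
 */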
static void
sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
	sec_t *sec = (sec_t *)self;
	if (sec->nshards == 0 || edata_size_get(edata) > sec->alloc_max) {
		pai_dalloc(tsdn, sec->fallback, edata);
		return;
	}
	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	malloc_mutex_lock(tsdn, &shard->mtx);
	if (shard->enabled) {
		sec_shard_dalloc_locked(tsdn, sec, shard, edata);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		pai_dalloc(tsdn, sec->fallback, edata);
	}
}

void
sec_flush(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

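/*
 * Unlike sec_flush(), disabling is sticky: enabled is cleared under the shard
 * mutex and sec_dalloc() re-checks it under that same mutex, so nothing new
 * can be cached after the final flush below.
 */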
void
sec_disable(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec->shards[i].enabled = false;
		sec_do_flush_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) {
	size_t sum = 0;
	for (size_t i = 0; i < sec->nshards; i++) {
		/*
		 * We could save these lock acquisitions by making bytes_cur
		 * atomic, but stats collection is rare anyways and we expect
		 * the number and type of stats to get more interesting.
		 */
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sum += sec->shards[i].bytes_cur;
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
	stats->bytes += sum;
}

void
sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec,
    mutex_prof_data_t *mutex_prof_data) {
	for (size_t i = 0; i < sec->nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		malloc_mutex_prof_accum(tsdn, mutex_prof_data,
		    &sec->shards[i].mtx);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

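/*
 * The fork hooks below just forward to the shard mutexes: acquire them all
 * before fork, then release them in the parent and the child afterwards.
 */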
void
sec_prefork2(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->nshards; i++) {
		malloc_mutex_prefork(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->nshards; i++) {
		malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_child(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->nshards; i++) {
		malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx);
	}
}