restrict bytes_until_sample to int64_t. This allows optimal asm
generation of sub bytes_until_sample, usize; je; for x86 arch.
Subtraction is unconditional, and only flags are checked for the jump;
no extra compare is necessary. This also reduces register pressure.
Dave Watson 2018-10-09 11:25:36 -07:00
parent d1a861fa80
commit 997d86acc6
3 changed files with 13 additions and 6 deletions
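To illustrate the codegen claim in the message, here is a standalone sketch (not jemalloc code; the counter and function names are hypothetical). With a signed counter the compiler can subtract unconditionally and let the comparison against zero reuse the flags the sub instruction already set; with an unsigned counter it has to emit a separate cmp before it may subtract, since the value must not be allowed to wrap below zero. Exact output depends on the compiler and optimization level.

#include <stdint.h>
#include <stddef.h>

static int64_t  counter_signed;
static uint64_t counter_unsigned;

/* Signed counter: subtract first, then test the sign of the result.
 * The >= 0 test can reuse the flags the sub already set (in a caller
 * that branches on the result this becomes sub + conditional jump). */
int consume_signed(size_t usize) {
    counter_signed -= (int64_t)usize;
    return counter_signed >= 0;
}

/* Unsigned counter: must compare before subtracting so the counter
 * cannot wrap, which costs a cmp and keeps an extra value live. */
int consume_unsigned(size_t usize) {
    if (counter_unsigned >= usize) {
        counter_unsigned -= usize;
        return 1;
    }
    return 0;
}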


@@ -82,17 +82,21 @@ JEMALLOC_ALWAYS_INLINE bool
 prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
     prof_tdata_t **tdata_out) {
     prof_tdata_t *tdata;
-    uint64_t bytes_until_sample;
+    int64_t bytes_until_sample;
 
     cassert(config_prof);
 
+    ssize_t check = update ? 0 : usize;
     bytes_until_sample = tsd_bytes_until_sample_get(tsd);
-    if (likely(bytes_until_sample >= usize)) {
-        if (update && tsd_nominal(tsd)) {
-            tsd_bytes_until_sample_set(tsd, bytes_until_sample - usize);
+    if (update) {
+        bytes_until_sample -= usize;
+        if (tsd_nominal(tsd)) {
+            tsd_bytes_until_sample_set(tsd, bytes_until_sample);
         }
-        return true;
     }
+    if (likely(bytes_until_sample >= check)) {
+        return true;
+    }
 
     tdata = prof_tdata_get(tsd, true);
     if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) {

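A note on the rewritten fast path above: when update is true, check is 0 and the counter has already had usize subtracted, so the new test bytes_until_sample >= 0 is equivalent to the old test bytes_until_sample >= usize on the pre-decrement value; when update is false the counter is untouched and check falls back to usize, leaving the condition as before. The signed type is what makes it safe to subtract before comparing. A tiny self-contained check of that equivalence (hypothetical helper names, not jemalloc code):

#include <assert.h>
#include <stdint.h>

/* Old fast-path condition: counter still holds the pre-decrement value. */
static int old_condition(int64_t bytes, int64_t usize) {
    return bytes >= usize;
}

/* New fast-path condition: subtract first when updating, then compare
 * against check = update ? 0 : usize. */
static int new_condition(int64_t bytes, int64_t usize, int update) {
    int64_t check = update ? 0 : usize;
    if (update) {
        bytes -= usize;
    }
    return bytes >= check;
}

int main(void) {
    for (int64_t bytes = -2; bytes < 4; bytes++) {
        for (int64_t usize = 1; usize < 4; usize++) {
            for (int update = 0; update < 2; update++) {
                assert(old_condition(bytes, usize) ==
                    new_condition(bytes, usize, update));
            }
        }
    }
    return 0;
}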

@@ -68,7 +68,7 @@ typedef void (*test_callback_t)(int *);
 O(offset_state, uint64_t, uint64_t) \
 O(thread_allocated, uint64_t, uint64_t) \
 O(thread_deallocated, uint64_t, uint64_t) \
-O(bytes_until_sample, uint64_t, uint64_t) \
+O(bytes_until_sample, int64_t, int64_t) \
 O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
 O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
 O(iarena, arena_t *, arena_t *) \


@@ -1168,6 +1168,9 @@ prof_sample_threshold_update(prof_tdata_t *tdata) {
     uint64_t bytes_until_sample = (uint64_t)(log(u) /
         log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample))))
         + (uint64_t)1U;
+    if (bytes_until_sample > SSIZE_MAX) {
+        bytes_until_sample = SSIZE_MAX;
+    }
     tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample);
 #endif
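The clamp added in the last hunk presumably guards the case where the exponential draw, computed as a uint64_t, exceeds SSIZE_MAX for very large lg_prof_sample values; capping it keeps the threshold representable as a non-negative value in the now-signed int64_t TSD slot, so the fast-path comparison never starts from a negative counter.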