diff --git a/include/jemalloc/internal/ctl_externs.h b/include/jemalloc/internal/ctl_externs.h index 812ec4fa..33ca2039 100644 --- a/include/jemalloc/internal/ctl_externs.h +++ b/include/jemalloc/internal/ctl_externs.h @@ -4,10 +4,13 @@ /* Maximum ctl tree depth. */ #define CTL_MAX_DEPTH 7 -#define NUM_ARENA_PROF_LOCKS 6 -#define NUM_LOCK_PROF_COUNTERS 7 -const char *arena_lock_names[NUM_ARENA_PROF_LOCKS]; -const char *lock_counter_names[NUM_LOCK_PROF_COUNTERS]; +#define NUM_GLOBAL_PROF_LOCKS 3 +#define NUM_ARENA_PROF_LOCKS 6 +#define NUM_LOCK_PROF_COUNTERS 7 + +extern const char *arena_lock_names[NUM_ARENA_PROF_LOCKS]; +extern const char *global_lock_names[NUM_GLOBAL_PROF_LOCKS]; +extern const char *lock_counter_names[NUM_LOCK_PROF_COUNTERS]; int ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); diff --git a/include/jemalloc/internal/ctl_structs.h b/include/jemalloc/internal/ctl_structs.h index 4df43d90..e467a247 100644 --- a/include/jemalloc/internal/ctl_structs.h +++ b/include/jemalloc/internal/ctl_structs.h @@ -41,6 +41,10 @@ struct ctl_stats_s { size_t resident; size_t mapped; size_t retained; + +#define MTX(mutex) lock_prof_data_t mutex##_mtx_data; +GLOBAL_PROF_MUTEXES +#undef MTX }; struct ctl_arena_s { diff --git a/include/jemalloc/internal/ctl_types.h b/include/jemalloc/internal/ctl_types.h index 7853a4b2..562418ca 100644 --- a/include/jemalloc/internal/ctl_types.h +++ b/include/jemalloc/internal/ctl_types.h @@ -1,6 +1,11 @@ #ifndef JEMALLOC_INTERNAL_CTL_TYPES_H #define JEMALLOC_INTERNAL_CTL_TYPES_H +#define GLOBAL_PROF_MUTEXES \ + MTX(base) \ + MTX(ctl) \ + MTX(prof) + typedef struct ctl_node_s ctl_node_t; typedef struct ctl_named_node_s ctl_named_node_t; typedef struct ctl_indexed_node_s ctl_indexed_node_t; diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 14ecdeda..3c5a21b5 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ 
b/include/jemalloc/internal/private_symbols.txt @@ -106,6 +106,7 @@ bootstrap_calloc bootstrap_free bootstrap_malloc bt_init +bt2gctx_mtx buferror ckh_count ckh_delete diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h index 985532f6..cbd9795b 100644 --- a/include/jemalloc/internal/prof_externs.h +++ b/include/jemalloc/internal/prof_externs.h @@ -1,6 +1,8 @@ #ifndef JEMALLOC_INTERNAL_PROF_EXTERNS_H #define JEMALLOC_INTERNAL_PROF_EXTERNS_H +extern malloc_mutex_t bt2gctx_mtx; + extern bool opt_prof; extern bool opt_prof_active; extern bool opt_prof_thread_active_init; diff --git a/src/arena.c b/src/arena.c index 0c12004d..266c85fb 100644 --- a/src/arena.c +++ b/src/arena.c @@ -298,9 +298,9 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, } #define READ_ARENA_MUTEX_PROF_DATA(mtx, data) \ - malloc_mutex_lock(tsdn, &arena->mtx); \ - malloc_lock_prof_read(tsdn, &astats->data, &arena->mtx); \ - malloc_mutex_unlock(tsdn, &arena->mtx); + malloc_mutex_lock(tsdn, &arena->mtx); \ + malloc_lock_prof_read(tsdn, &astats->data, &arena->mtx); \ + malloc_mutex_unlock(tsdn, &arena->mtx); /* Gather per arena mutex profiling data. 
*/ READ_ARENA_MUTEX_PROF_DATA(large_mtx, large_mtx_data) diff --git a/src/ctl.c b/src/ctl.c index 016275e5..c5ef70b5 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -13,6 +13,12 @@ static bool ctl_initialized; static ctl_stats_t *ctl_stats; static ctl_arenas_t *ctl_arenas; +const char *global_lock_names[NUM_GLOBAL_PROF_LOCKS] = { + "base", + "prof", + "ctl" +}; + const char *arena_lock_names[NUM_ARENA_PROF_LOCKS] = { "large", "extent_freelist", @@ -21,6 +27,7 @@ const char *arena_lock_names[NUM_ARENA_PROF_LOCKS] = { "decay", "tcache" }; + const char *lock_counter_names[NUM_LOCK_PROF_COUNTERS] = { "num_ops", "num_wait", @@ -196,16 +203,26 @@ CTL_PROTO(stats_resident) CTL_PROTO(stats_mapped) CTL_PROTO(stats_retained) -#define STATS_LOCKS_CTL_PROTO_GEN(l, n) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_num_ops) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_num_wait) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_num_spin_acq) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_num_owner_switch) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_total_wait_time) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_max_wait_time) \ -CTL_PROTO(stats_arenas_i_##l##_##n##_max_num_thds) +#define LOCK_STATS_CTL_PROTO_GEN(n) \ +CTL_PROTO(stats_##n##_num_ops) \ +CTL_PROTO(stats_##n##_num_wait) \ +CTL_PROTO(stats_##n##_num_spin_acq) \ +CTL_PROTO(stats_##n##_num_owner_switch) \ +CTL_PROTO(stats_##n##_total_wait_time) \ +CTL_PROTO(stats_##n##_max_wait_time) \ +CTL_PROTO(stats_##n##_max_num_thds) -#define ARENA_LOCKS_CTL_PROTO_GEN(n) STATS_LOCKS_CTL_PROTO_GEN(locks, n) +/* Global locks. */ +LOCK_STATS_CTL_PROTO_GEN(locks_base) +LOCK_STATS_CTL_PROTO_GEN(locks_prof) +LOCK_STATS_CTL_PROTO_GEN(locks_ctl) + +/* Arena bin locks. */ +LOCK_STATS_CTL_PROTO_GEN(arenas_i_bins_j_lock) + +#define ARENA_LOCKS_CTL_PROTO_GEN(n) \ + LOCK_STATS_CTL_PROTO_GEN(arenas_i_locks_##n) +/* Per arena locks. 
*/ ARENA_LOCKS_CTL_PROTO_GEN(large) ARENA_LOCKS_CTL_PROTO_GEN(extent_freelist) ARENA_LOCKS_CTL_PROTO_GEN(extents_cached) @@ -213,9 +230,7 @@ ARENA_LOCKS_CTL_PROTO_GEN(extents_retained) ARENA_LOCKS_CTL_PROTO_GEN(decay) ARENA_LOCKS_CTL_PROTO_GEN(tcache) #undef ARENA_LOCKS_CTL_PROTO_GEN - -STATS_LOCKS_CTL_PROTO_GEN(bins_j, lock) -#undef STATS_LOCKS_CTL_PROTO_GEN +#undef LOCK_STATS_CTL_PROTO_GEN /******************************************************************************/ /* mallctl tree. */ @@ -384,26 +399,27 @@ static const ctl_named_node_t stats_arenas_i_large_node[] = { {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)} }; -#define LOCK_PROF_DATA_NODE(prefix, n) \ -static const ctl_named_node_t prefix##_##n##_node[] = { \ +#define LOCK_PROF_DATA_NODE(prefix) \ +static const ctl_named_node_t stats_##prefix##_node[] = { \ {NAME("num_ops"), \ - CTL(prefix##_##n##_num_ops)}, \ + CTL(stats_##prefix##_num_ops)}, \ {NAME("num_wait"), \ - CTL(prefix##_##n##_num_wait)}, \ + CTL(stats_##prefix##_num_wait)}, \ {NAME("num_spin_acq"), \ - CTL(prefix##_##n##_num_spin_acq)}, \ + CTL(stats_##prefix##_num_spin_acq)}, \ {NAME("num_owner_switch"), \ - CTL(prefix##_##n##_num_owner_switch)}, \ + CTL(stats_##prefix##_num_owner_switch)}, \ {NAME("total_wait_time"), \ - CTL(prefix##_##n##_total_wait_time)}, \ + CTL(stats_##prefix##_total_wait_time)}, \ {NAME("max_wait_time"), \ - CTL(prefix##_##n##_max_wait_time)}, \ + CTL(stats_##prefix##_max_wait_time)}, \ {NAME("max_num_thds"), \ - CTL(prefix##_##n##_max_num_thds)} \ + CTL(stats_##prefix##_max_num_thds)} \ /* Note that # of current waiting thread not provided. 
*/ \ }; -LOCK_PROF_DATA_NODE(stats_arenas_i_bins_j, lock) +LOCK_PROF_DATA_NODE(arenas_i_bins_j_lock) + static const ctl_named_node_t stats_arenas_i_bins_j_node[] = { {NAME("nmalloc"), CTL(stats_arenas_i_bins_j_nmalloc)}, {NAME("ndalloc"), CTL(stats_arenas_i_bins_j_ndalloc)}, @@ -439,12 +455,14 @@ static const ctl_indexed_node_t stats_arenas_i_lextents_node[] = { {INDEX(stats_arenas_i_lextents_j)} }; -LOCK_PROF_DATA_NODE(stats_arenas_i_locks, large) -LOCK_PROF_DATA_NODE(stats_arenas_i_locks, extent_freelist) -LOCK_PROF_DATA_NODE(stats_arenas_i_locks, extents_cached) -LOCK_PROF_DATA_NODE(stats_arenas_i_locks, extents_retained) -LOCK_PROF_DATA_NODE(stats_arenas_i_locks, decay) -LOCK_PROF_DATA_NODE(stats_arenas_i_locks, tcache) +#define ARENA_LOCK_PROF_DATA_NODE(n) LOCK_PROF_DATA_NODE(arenas_i_locks_##n) + +ARENA_LOCK_PROF_DATA_NODE(large) +ARENA_LOCK_PROF_DATA_NODE(extent_freelist) +ARENA_LOCK_PROF_DATA_NODE(extents_cached) +ARENA_LOCK_PROF_DATA_NODE(extents_retained) +ARENA_LOCK_PROF_DATA_NODE(decay) +ARENA_LOCK_PROF_DATA_NODE(tcache) static const ctl_named_node_t stats_arenas_i_locks_node[] = { {NAME("large"), CHILD(named, stats_arenas_i_locks_large)}, @@ -457,7 +475,6 @@ static const ctl_named_node_t stats_arenas_i_locks_node[] = { {NAME("decay"), CHILD(named, stats_arenas_i_locks_decay)}, {NAME("tcache"), CHILD(named, stats_arenas_i_locks_tcache)} }; -#undef LOCK_PROF_DATA_NODE static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, @@ -493,6 +510,15 @@ static const ctl_indexed_node_t stats_arenas_node[] = { {INDEX(stats_arenas_i)} }; +LOCK_PROF_DATA_NODE(locks_base) +LOCK_PROF_DATA_NODE(locks_prof) +LOCK_PROF_DATA_NODE(locks_ctl) +static const ctl_named_node_t stats_locks_node[] = { + {NAME("base"), CHILD(named, stats_locks_base)}, + {NAME("prof"), CHILD(named, stats_locks_prof)}, + {NAME("ctl"), CHILD(named, stats_locks_ctl)} +}; + static const ctl_named_node_t stats_node[] = { {NAME("allocated"), 
CTL(stats_allocated)}, {NAME("active"), CTL(stats_active)}, @@ -500,8 +526,10 @@ static const ctl_named_node_t stats_node[] = { {NAME("resident"), CTL(stats_resident)}, {NAME("mapped"), CTL(stats_mapped)}, {NAME("retained"), CTL(stats_retained)}, + {NAME("locks"), CHILD(named, stats_locks)}, {NAME("arenas"), CHILD(indexed, stats_arenas)} }; +#undef LOCK_PROF_DATA_NODE static const ctl_named_node_t root_node[] = { {NAME("version"), CTL(version)}, @@ -925,6 +953,17 @@ ctl_refresh(tsdn_t *tsdn) { &ctl_sarena->astats->astats.mapped, ATOMIC_RELAXED); ctl_stats->retained = atomic_load_zu( &ctl_sarena->astats->astats.retained, ATOMIC_RELAXED); + +#define READ_GLOBAL_MUTEX_PROF_DATA(mtx, data) \ + malloc_mutex_lock(tsdn, &mtx); \ + malloc_lock_prof_read(tsdn, &ctl_stats->data, &mtx); \ + malloc_mutex_unlock(tsdn, &mtx); + + READ_GLOBAL_MUTEX_PROF_DATA(b0get()->mtx, base_mtx_data); + READ_GLOBAL_MUTEX_PROF_DATA(bt2gctx_mtx, prof_mtx_data); + /* We own ctl mutex already. */ + malloc_lock_prof_read(tsdn, &ctl_stats->ctl_mtx_data, &ctl_mtx); +#undef READ_GLOBAL_MUTEX_PROF_DATA } ctl_arenas->epoch++; } @@ -2413,25 +2452,34 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests, uint64_t) /* Intentional. */ /* Lock profiling related APIs below. 
*/ -#define ARENAS_LOCK_CTL_GEN(l, s, d) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_num_ops, \ - arenas_i(mib[2])->astats->s.d.n_lock_ops, uint64_t) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_num_wait, \ - arenas_i(mib[2])->astats->s.d.n_wait_times, uint64_t) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_num_spin_acq, \ - arenas_i(mib[2])->astats->s.d.n_spin_acquired, uint64_t) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_num_owner_switch, \ - arenas_i(mib[2])->astats->s.d.n_owner_switches, uint64_t) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_total_wait_time, \ - arenas_i(mib[2])->astats->s.d.tot_wait_time, uint64_t) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_max_wait_time, \ - arenas_i(mib[2])->astats->s.d.max_wait_time, uint64_t) \ -CTL_RO_CGEN(config_stats, stats_arenas_i_##l##_max_num_thds, \ - arenas_i(mib[2])->astats->s.d.max_n_thds, uint64_t) +#define RO_LOCK_CTL_GEN(n, l) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_ops, \ + l.n_lock_ops, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_wait, \ + l.n_wait_times, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_spin_acq, \ + l.n_spin_acquired, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_owner_switch, \ + l.n_owner_switches, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_total_wait_time, \ + l.tot_wait_time, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_max_wait_time, \ + l.max_wait_time, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_max_num_thds, \ + l.max_n_thds, uint64_t) +/* Global locks. 
*/ +#define MTX(mutex) RO_LOCK_CTL_GEN(locks_##mutex, ctl_stats->mutex##_mtx_data) +GLOBAL_PROF_MUTEXES +#undef MTX + +/* arena->bins[j].lock */ +RO_LOCK_CTL_GEN(arenas_i_bins_j_lock, + arenas_i(mib[2])->astats->bstats[mib[4]].lock_data) + +/* Per arena locks */ #define ARENAS_ASTATS_LOCK_CTL_GEN(l, d) \ - ARENAS_LOCK_CTL_GEN(locks_##l, astats, d) - + RO_LOCK_CTL_GEN(arenas_i_locks_##l, arenas_i(mib[2])->astats->astats.d) /* arena->large_mtx */ ARENAS_ASTATS_LOCK_CTL_GEN(large, large_mtx_data) /* arena->extent_freelist_mtx */ @@ -2444,8 +2492,8 @@ ARENAS_ASTATS_LOCK_CTL_GEN(extents_retained, extents_retained_mtx_data) ARENAS_ASTATS_LOCK_CTL_GEN(decay, decay_mtx_data) /* arena->tcache_ql_mtx */ ARENAS_ASTATS_LOCK_CTL_GEN(tcache, tcache_mtx_data) -/* arena->bins[j].lock */ -ARENAS_LOCK_CTL_GEN(bins_j_lock, bstats[mib[4]], lock_data) +#undef ARENAS_ASTATS_LOCK_CTL_GEN +#undef RO_LOCK_CTL_GEN CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nmalloc, arenas_i(mib[2])->astats->bstats[mib[4]].nmalloc, uint64_t) diff --git a/src/prof.c b/src/prof.c index b04984b7..4e83ae3f 100644 --- a/src/prof.c +++ b/src/prof.c @@ -78,7 +78,8 @@ static malloc_mutex_t *tdata_locks; * structure that knows about all backtraces currently captured. */ static ckh_t bt2gctx; -static malloc_mutex_t bt2gctx_mtx; +/* Non static to enable profiling. */ +malloc_mutex_t bt2gctx_mtx; /* * Tree of all extant prof_tdata_t structures, regardless of state, diff --git a/src/stats.c b/src/stats.c index 7b690e7f..89ba6693 100644 --- a/src/stats.c +++ b/src/stats.c @@ -127,6 +127,7 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_M4_GET("stats.arenas.0.bins.0.curslabs", i, j, &curslabs, size_t); + /* Output less info for bin locks to save space. 
*/ uint64_t num_ops, num_wait, max_wait; CTL_M2_M4_GET("stats.arenas.0.bins.0.lock.num_wait", i, j, &num_wait, uint64_t); @@ -290,8 +291,9 @@ stats_arena_lextents_print(void (*write_cb)(void *, const char *), } static void -gen_ctl_str(char *str, const char *lock, const char *counter) { - sprintf(str, "stats.arenas.0.locks.%s.%s", lock, counter); +gen_lock_ctl_str(char *str, const char *prefix, const char *lock, + const char *counter) { + sprintf(str, "stats.%s.%s.%s", prefix, lock, counter); } static void read_arena_lock_stats(unsigned arena_ind, @@ -301,13 +303,34 @@ static void read_arena_lock_stats(unsigned arena_ind, unsigned i, j; for (i = 0; i < NUM_ARENA_PROF_LOCKS; i++) { for (j = 0; j < NUM_LOCK_PROF_COUNTERS; j++) { - gen_ctl_str(cmd, arena_lock_names[i], - lock_counter_names[j]); + gen_lock_ctl_str(cmd, "arenas.0.locks", + arena_lock_names[i], lock_counter_names[j]); CTL_M2_GET(cmd, arena_ind, &results[i][j], uint64_t); } } } +static void lock_stats_output(void (*write_cb)(void *, const char *), + void *cbopaque, const char *name, uint64_t stats[NUM_LOCK_PROF_COUNTERS], + bool first_mutex) { + if (first_mutex) { + /* Print title. 
*/ + malloc_cprintf(write_cb, cbopaque, + " n_lock_ops n_waiting" + " n_spin_acq n_owner_switch total_wait_ns" + " max_wait_ns max_n_wait_thds\n"); + } + + malloc_cprintf(write_cb, cbopaque, "%s", name); + malloc_cprintf(write_cb, cbopaque, ":%*c", + (int)(19 - strlen(name)), ' '); + + for (unsigned i = 0; i < NUM_LOCK_PROF_COUNTERS; i++) { + malloc_cprintf(write_cb, cbopaque, " %16"FMTu64, stats[i]); + } + malloc_cprintf(write_cb, cbopaque, "\n"); +} + static void stats_arena_locks_print(void (*write_cb)(void *, const char *), void *cbopaque, bool json, unsigned arena_ind) { @@ -318,23 +341,9 @@ stats_arena_locks_print(void (*write_cb)(void *, const char *), if (json) { //TODO } else { - malloc_cprintf(write_cb, cbopaque, - " n_lock_ops n_waiting" - " n_spin_acq n_owner_switch total_wait_ns" - " max_wait_ns max_n_wait_thds\n"); - - unsigned i, j; - for (i = 0; i < NUM_ARENA_PROF_LOCKS; i++) { - malloc_cprintf(write_cb, cbopaque, - "%s", arena_lock_names[i]); - malloc_cprintf(write_cb, cbopaque, ":%*c", - (int)(18 - strlen(arena_lock_names[i])), ' '); - - for (j = 0; j < NUM_LOCK_PROF_COUNTERS; j++) { - malloc_cprintf(write_cb, cbopaque, " %15"FMTu64, - lock_stats[i][j]); - } - malloc_cprintf(write_cb, cbopaque, "\n"); + for (unsigned i = 0; i < NUM_ARENA_PROF_LOCKS; i++) { + lock_stats_output(write_cb, cbopaque, + arena_lock_names[i], lock_stats[i], i == 0); } } @@ -930,6 +939,20 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque, } } +static void read_global_lock_stats( + uint64_t results[NUM_GLOBAL_PROF_LOCKS][NUM_LOCK_PROF_COUNTERS]) { + char cmd[128]; + + unsigned i, j; + for (i = 0; i < NUM_GLOBAL_PROF_LOCKS; i++) { + for (j = 0; j < NUM_LOCK_PROF_COUNTERS; j++) { + gen_lock_ctl_str(cmd, "locks", global_lock_names[i], + lock_counter_names[j]); + CTL_GET(cmd, &results[i][j], uint64_t); + } + } +} + static void stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque, bool json, bool merged, bool destroyed, bool 
unmerged, bool bins, @@ -942,6 +965,10 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("stats.resident", &resident, size_t); CTL_GET("stats.mapped", &mapped, size_t); CTL_GET("stats.retained", &retained, size_t); + + uint64_t lock_stats[NUM_GLOBAL_PROF_LOCKS][NUM_LOCK_PROF_COUNTERS]; + read_global_lock_stats(lock_stats); + if (json) { malloc_cprintf(write_cb, cbopaque, "\t\t\"stats\": {\n"); @@ -966,6 +993,11 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque, "Allocated: %zu, active: %zu, metadata: %zu," " resident: %zu, mapped: %zu, retained: %zu\n", allocated, active, metadata, resident, mapped, retained); + + for (unsigned i = 0; i < NUM_GLOBAL_PROF_LOCKS; i++) { + lock_stats_output(write_cb, cbopaque, + global_lock_names[i], lock_stats[i], i == 0); + } } if (merged || destroyed || unmerged) {