HPA: Manage whole hugepages at a time.

This redesigns the HPA implementation to allow us to manage hugepages all at
once, locally, without relying on a global fallback.
David Goldblatt
2020-11-09 13:49:30 -08:00
committed by David Goldblatt
parent 63677dde63
commit 43af63fff4
16 changed files with 700 additions and 550 deletions


@@ -16,7 +16,6 @@ extern const char *percpu_arena_mode_names[];
extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
extern malloc_mutex_t arenas_lock;
extern emap_t arena_emap_global;
extern hpa_t arena_hpa_global;
extern size_t opt_oversize_threshold;
extern size_t oversize_threshold;


@@ -208,9 +208,9 @@ struct edata_s {
*/
/*
* If this edata is from an HPA, it may be part of some larger
* pageslab. Track it if so. Otherwise (either because it's
* not part of a pageslab, or not from the HPA at all), NULL.
* If this edata is a user allocation from an HPA, it comes out
* of some pageslab (we don't yet support hugepage allocations
* that don't fit into pageslabs). This tracks it.
*/
edata_t *ps;
/*
@@ -225,6 +225,8 @@ struct edata_s {
* between heaps.
*/
uint32_t longest_free_range;
/* Whether or not the slab is backed by a hugepage. */
bool hugeified;
};
};
@@ -328,6 +330,11 @@ edata_pai_get(const edata_t *edata) {
EDATA_BITS_PAI_SHIFT);
}
static inline bool
edata_hugeified_get(const edata_t *edata) {
return edata->hugeified;
}
static inline bool
edata_slab_get(const edata_t *edata) {
return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK) >>
@@ -559,6 +566,11 @@ edata_pai_set(edata_t *edata, extent_pai_t pai) {
((uint64_t)pai << EDATA_BITS_PAI_SHIFT);
}
static inline void
edata_hugeified_set(edata_t *edata, bool hugeified) {
edata->hugeified = hugeified;
}
static inline void
edata_slab_set(edata_t *edata, bool slab) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK) |
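
As a side note on the new hugeified flag and its accessors: below is a minimal, standalone sketch (not part of this diff and not jemalloc's real code; the edata_t here is a stripped-down stand-in and pages_huge_stub() is an invented placeholder) of the intended usage pattern, where a pageslab is flagged once its backing pages have been promoted to a hugepage and the flag is read back later when accounting for or purging the slab.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stripped-down stand-in for jemalloc's edata_t. */
typedef struct edata_s {
	void *addr;
	size_t size;
	bool hugeified;	/* mirrors the flag added in this commit */
} edata_t;

static inline bool
edata_hugeified_get(const edata_t *edata) {
	return edata->hugeified;
}

static inline void
edata_hugeified_set(edata_t *edata, bool hugeified) {
	edata->hugeified = hugeified;
}

/* Placeholder for asking the OS to back [addr, addr+size) with hugepages. */
static bool
pages_huge_stub(void *addr, size_t size) {
	(void)addr;
	(void)size;
	return false;	/* false == success, in the jemalloc style */
}

static void
hugify(edata_t *ps) {
	if (!edata_hugeified_get(ps) && !pages_huge_stub(ps->addr, ps->size)) {
		edata_hugeified_set(ps, true);
	}
}

int
main(void) {
	edata_t ps = {NULL, (size_t)2 << 20, false};
	hugify(&ps);
	printf("hugeified: %d\n", (int)edata_hugeified_get(&ps));
	return 0;
}
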


@@ -6,32 +6,6 @@
#include "jemalloc/internal/pai.h"
#include "jemalloc/internal/psset.h"
typedef struct hpa_s hpa_t;
struct hpa_s {
/*
* We have two mutexes for the central allocator; mtx protects its
* state, while grow_mtx controls the ability to grow the backing
* store. This prevents race conditions in which the central allocator
* has exhausted its memory while multiple threads are trying to
* allocate. If they all reserved more address space from the OS
* without synchronization, we'd end up consuming much more than
* necessary.
*/
malloc_mutex_t grow_mtx;
malloc_mutex_t mtx;
hpa_central_t central;
/* The arena ind we're associated with. */
unsigned ind;
/*
* This edata cache is the global one that we use for new allocations in
* growing; practically, it comes from a0.
*
* We don't use an edata_cache_small in front of this, since we expect a
* small finite number of allocations from it.
*/
edata_cache_t *edata_cache;
exp_grow_t exp_grow;
};
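
The removed comment above describes a pattern worth spelling out: one mutex (mtx) guards the allocator's state while a second (grow_mtx) serializes reservations of new backing memory, so threads that all observe an empty allocator don't each map a fresh region. The following standalone sketch, using pthreads in place of malloc_mutex_t and an invented os_reserve() placeholder, is illustrative only and is not the removed hpa_central code.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct central_s {
	pthread_mutex_t mtx;      /* protects free_bytes */
	pthread_mutex_t grow_mtx; /* serializes growing the backing store */
	size_t free_bytes;
} central_t;

/* Placeholder for reserving more address space from the OS. */
static size_t
os_reserve(size_t want) {
	return want;
}

static bool
central_alloc(central_t *c, size_t size) {
	/* Fast path: satisfy the request from existing capacity. */
	pthread_mutex_lock(&c->mtx);
	if (c->free_bytes >= size) {
		c->free_bytes -= size;
		pthread_mutex_unlock(&c->mtx);
		return true;
	}
	pthread_mutex_unlock(&c->mtx);
	/*
	 * Grow path: take grow_mtx first, then re-check under mtx.  Only one
	 * thread reserves new memory; the others block here and find the
	 * freshly grown capacity when they re-check.
	 */
	pthread_mutex_lock(&c->grow_mtx);
	pthread_mutex_lock(&c->mtx);
	if (c->free_bytes < size) {
		c->free_bytes += os_reserve(size);
	}
	c->free_bytes -= size;
	pthread_mutex_unlock(&c->mtx);
	pthread_mutex_unlock(&c->grow_mtx);
	return true;
}

int
main(void) {
	central_t c = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0};
	return central_alloc(&c, 4096) ? 0 : 1;
}
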
/* Used only by CTL; not actually stored here (i.e., all derived). */
typedef struct hpa_shard_stats_s hpa_shard_stats_t;
struct hpa_shard_stats_s {
@@ -53,44 +27,53 @@ struct hpa_shard_s {
* allocator, and so will use its edata_cache.
*/
edata_cache_small_t ecs;
hpa_t *hpa;
psset_t psset;
/*
* When we're grabbing a new ps from the central allocator, how big
* would we like it to be? This is mostly about the level of batching
* we use in our requests to the centralized allocator.
* The largest size we'll allocate out of the shard. For those
* allocations refused, the caller (in practice, the PA module) will
* fall back to the more general (for now) PAC, which can always handle
* any allocation request.
*/
size_t ps_goal;
size_t alloc_max;
/*
* What's the maximum size we'll try to allocate out of the psset? We
* don't want this to be too large relative to ps_goal, as a
* fragmentation avoidance measure.
* Slabs currently purged away. They are hugepage-sized and
* hugepage-aligned, but have had pages_nohuge and pages_purge_forced
* called on them.
*
* Guarded by grow_mtx.
*/
size_t ps_alloc_max;
edata_list_inactive_t unused_slabs;
/*
* What's the maximum size we'll try to allocate out of the shard at
* all?
* Either NULL (if empty), or a hugepage-aligned extent spanning some
* integer number of hugepages. We carve hugepages off of it one at a
* time to satisfy new pageslab requests.
*
* Guarded by grow_mtx.
*/
size_t small_max;
/*
* What's the minimum size for which we'll go straight to the global
* arena?
*/
size_t large_min;
edata_t *eden;
/* The arena ind we're associated with. */
unsigned ind;
emap_t *emap;
};
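
To make the eden/unused_slabs comments above concrete, here is a standalone sketch of the carving idea: keep one large hugepage-aligned region and split hugepage-sized pageslabs off its front, mapping a new region only when the current one is exhausted. The sizes, the mmap-based grow step, and the eden_carve() helper are assumptions for illustration; the real logic (including the grow_mtx protection and edata bookkeeping) lives in hpa.c.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

#define HUGEPAGE  ((size_t)2 << 20)	/* assume 2 MiB hugepages */
#define EDEN_SIZE (64 * HUGEPAGE)	/* grow eden in large batches */

typedef struct eden_s {
	uintptr_t addr;	/* hugepage-aligned start of the unused region */
	size_t len;	/* bytes remaining; a multiple of HUGEPAGE */
} eden_t;

/* Carve one hugepage-sized pageslab off the front of eden. */
static void *
eden_carve(eden_t *eden) {
	if (eden->len == 0) {
		/* Overallocate by one hugepage so we can align up. */
		void *p = mmap(NULL, EDEN_SIZE + HUGEPAGE,
		    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			return NULL;
		}
		eden->addr = ((uintptr_t)p + HUGEPAGE - 1) & ~(HUGEPAGE - 1);
		eden->len = EDEN_SIZE;
	}
	void *ps = (void *)eden->addr;
	eden->addr += HUGEPAGE;
	eden->len -= HUGEPAGE;
	return ps;
}

int
main(void) {
	eden_t eden = {0, 0};
	printf("ps1=%p ps2=%p\n", eden_carve(&eden), eden_carve(&eden));
	return 0;
}
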
bool hpa_init(hpa_t *hpa, base_t *base, emap_t *emap,
edata_cache_t *edata_cache);
bool hpa_shard_init(hpa_shard_t *shard, hpa_t *hpa,
edata_cache_t *edata_cache, unsigned ind, size_t ps_goal,
size_t ps_alloc_max, size_t small_max, size_t large_min);
/*
* Whether or not the HPA can be used given the current configuration. This is
* not necessarily a guarantee that it backs its allocations with hugepages,
* just that it can function properly given the system it's running on.
*/
bool hpa_supported();
bool hpa_shard_init(hpa_shard_t *shard, emap_t *emap,
edata_cache_t *edata_cache, unsigned ind, size_t alloc_max);
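
A usage-level sketch of the gate described above: callers check hpa_supported() before initializing a shard, and fall back to the PAC-only path otherwise. The stub types and bodies below stand in for jemalloc internals so the snippet is self-contained; the real call site, and the choice of alloc_max, live in the arena/PA bootstrap code and are not shown here.

#include <stdbool.h>
#include <stddef.h>

typedef struct emap_s emap_t;
typedef struct edata_cache_s edata_cache_t;
typedef struct hpa_shard_s { int placeholder; } hpa_shard_t;

/* Stand-ins for the declarations above; bodies are fake. */
static bool hpa_supported(void) { return true; }
static bool hpa_shard_init(hpa_shard_t *shard, emap_t *emap,
    edata_cache_t *edata_cache, unsigned ind, size_t alloc_max) {
	(void)shard; (void)emap; (void)edata_cache; (void)ind; (void)alloc_max;
	return false;	/* false == success */
}

/* Returns true if the shard was enabled. */
static bool
maybe_enable_hpa(hpa_shard_t *shard, emap_t *emap, edata_cache_t *cache,
    unsigned arena_ind) {
	if (!hpa_supported()) {
		return false;	/* stick with the PAC for everything */
	}
	/* 64 KiB is an arbitrary example cutoff for alloc_max. */
	return !hpa_shard_init(shard, emap, cache, arena_ind, (size_t)64 << 10);
}

int
main(void) {
	hpa_shard_t shard;
	return maybe_enable_hpa(&shard, NULL, NULL, 0) ? 0 : 1;
}
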
void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
hpa_shard_stats_t *dst);
void hpa_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
void hpa_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, hpa_shard_stats_t *dst);
/*
* Notify the shard that we won't use it for allocations much longer. Due to
* the possibility of races, we don't actually prevent allocations; just flush
@@ -108,14 +91,4 @@ void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
/*
* These should be acquired after all the shard locks in phase 4, but before
* any locks acquired in later phases. The central HPA may acquire an edata
* cache mutex (of a0),
* so it needs to be lower in the witness ordering, but it's also logically
* global and not tied to any particular arena.
*/
void hpa_prefork4(tsdn_t *tsdn, hpa_t *hpa);
void hpa_postfork_parent(tsdn_t *tsdn, hpa_t *hpa);
void hpa_postfork_child(tsdn_t *tsdn, hpa_t *hpa);
#endif /* JEMALLOC_INTERNAL_HPA_H */


@@ -11,9 +11,7 @@
OP(ctl) \
OP(prof) \
OP(prof_thds_data) \
OP(prof_dump) \
OP(hpa_central) \
OP(hpa_central_grow)
OP(prof_dump)
typedef enum {
#define OP(mtx) global_prof_mutex_##mtx,


@@ -130,9 +130,8 @@ bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, emap_t *emap, base_t *base,
* This isn't exposed to users; we allow late enablement of the HPA shard so
* that we can boot without worrying about the HPA, then turn it on in a0.
*/
bool pa_shard_enable_hpa(pa_shard_t *shard, hpa_t *hpa, size_t ps_goal,
size_t ps_alloc_max, size_t small_max, size_t large_min, size_t sec_nshards,
size_t sec_alloc_max, size_t sec_bytes_max);
bool pa_shard_enable_hpa(pa_shard_t *shard, size_t alloc_max,
size_t sec_nshards, size_t sec_alloc_max, size_t sec_bytes_max);
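
The comment above describes a two-step bring-up: the shard boots with only the PAC, and the HPA (plus its SEC cache) is switched on later once a0 exists. The sketch below models that flow with invented stand-in types and fields; the actual pa_shard_t layout and the option values that feed the real pa_shard_enable_hpa call are not shown here.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Invented stand-in; the real pa_shard_t is much larger. */
typedef struct pa_shard_s {
	bool use_hpa;		/* route eligible allocations through the HPA? */
	bool ever_used_hpa;	/* keep redirecting frees even after disabling */
	size_t hpa_alloc_max;
} pa_shard_t;

/* Step 1: boot the shard with the PAC only; the HPA is not involved yet. */
static void
pa_shard_boot(pa_shard_t *shard) {
	shard->use_hpa = false;
	shard->ever_used_hpa = false;
	shard->hpa_alloc_max = 0;
}

/* Step 2: later (once a0 is up), switch the HPA on for this shard. */
static bool
pa_shard_enable_hpa_sketch(pa_shard_t *shard, size_t alloc_max) {
	shard->hpa_alloc_max = alloc_max;
	shard->use_hpa = true;
	shard->ever_used_hpa = true;
	return false;	/* false == success */
}

int
main(void) {
	pa_shard_t shard;
	pa_shard_boot(&shard);
	/* ... bootstrap proceeds without the HPA ... */
	pa_shard_enable_hpa_sketch(&shard, (size_t)64 << 10);
	printf("hpa enabled: %d\n", (int)shard.use_hpa);
	return 0;
}
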
/*
* We stop using the HPA when custom extent hooks are installed, but still
* redirect deallocations to it.


@@ -24,11 +24,14 @@
typedef struct psset_bin_stats_s psset_bin_stats_t;
struct psset_bin_stats_s {
/* How many pageslabs are in this bin? */
size_t npageslabs;
size_t npageslabs_huge;
size_t npageslabs_nonhuge;
/* Of them, how many pages are active? */
size_t nactive;
size_t nactive_huge;
size_t nactive_nonhuge;
/* How many are inactive? */
size_t ninactive;
size_t ninactive_huge;
size_t ninactive_nonhuge;
};
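
Since every counter in the bin stats is now split into a huge/nonhuge pair, anything that aggregates them has to sum both halves. The following standalone sketch mirrors the shape of the struct above and shows one plausible accumulation helper; it is illustrative and is not jemalloc's psset stats code.

#include <stddef.h>
#include <stdio.h>

typedef struct psset_bin_stats_s {
	size_t npageslabs_huge;
	size_t npageslabs_nonhuge;
	size_t nactive_huge;
	size_t nactive_nonhuge;
	size_t ninactive_huge;
	size_t ninactive_nonhuge;
} psset_bin_stats_t;

static void
bin_stats_accum(psset_bin_stats_t *dst, const psset_bin_stats_t *src) {
	dst->npageslabs_huge += src->npageslabs_huge;
	dst->npageslabs_nonhuge += src->npageslabs_nonhuge;
	dst->nactive_huge += src->nactive_huge;
	dst->nactive_nonhuge += src->nactive_nonhuge;
	dst->ninactive_huge += src->ninactive_huge;
	dst->ninactive_nonhuge += src->ninactive_nonhuge;
}

int
main(void) {
	psset_bin_stats_t total = {0}, bin = {1, 2, 3, 4, 5, 6};
	bin_stats_accum(&total, &bin);
	printf("%zu pageslabs, %zu active pages\n",
	    total.npageslabs_huge + total.npageslabs_nonhuge,
	    total.nactive_huge + total.nactive_nonhuge);
	return 0;
}
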
/* Used only by CTL; not actually stored here (i.e., all derived). */
@@ -62,6 +65,8 @@ void psset_stats_accum(psset_stats_t *dst, psset_stats_t *src);
void psset_insert(psset_t *psset, edata_t *ps);
void psset_remove(psset_t *psset, edata_t *ps);
void psset_hugify(psset_t *psset, edata_t *ps);
/*
* Tries to obtain a chunk from an existing pageslab already in the set.
* Returns true on failure.