mpsc_queue: Add module.

This is a simple multi-producer, single-consumer queue. The intended use case is in the HPA, as we begin supporting hpdatas that move between hpa_shards. We take just a single CAS as the cost to send a message (or a batch of messages) in the low-contention case, and lock-freedom lets us avoid some lock-ordering issues.
2021-05-11 14:49:55 -07:00
parent 4452a4812f
commit de033f56c0
3 changed files with 439 additions and 0 deletions
--- a/Makefile.in
+++ b/Makefile.in
@@ -233,6 +233,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/malloc_conf_2.c \
 	$(srcroot)test/unit/malloc_io.c \
 	$(srcroot)test/unit/math.c \
+	$(srcroot)test/unit/mpsc_queue.c \
 	$(srcroot)test/unit/mq.c \
 	$(srcroot)test/unit/mtx.c \
 	$(srcroot)test/unit/nstime.c \
--- a/include/jemalloc/internal/mpsc_queue.h
+++ b/include/jemalloc/internal/mpsc_queue.h
@@ -0,0 +1,134 @@
+#ifndef JEMALLOC_INTERNAL_MPSC_QUEUE_H
+#define JEMALLOC_INTERNAL_MPSC_QUEUE_H
+
+#include "jemalloc/internal/atomic.h"
+
+/*
+ * A concurrent implementation of a multi-producer, single-consumer queue.  It
+ * supports three concurrent operations:
+ * - Push
+ * - Push batch
+ * - Pop batch
+ *
+ * These operations are all lock-free.
+ *
+ * The implementation is the simple two-stack queue built on a Treiber stack.
+ * It's not terribly efficient, but this isn't expected to go into anywhere with
+ * hot code.  In fact, we don't really even need queue semantics in any
+ * anticipated use cases; we could get away with just the stack.  But this way
+ * lets us frame the API in terms of the existing list types, which is a nice
+ * convenience.  We can save on cache misses by introducing our own (parallel)
+ * single-linked list type here, and dropping FIFO semantics, if we need this to
+ * get faster.  Since we're currently providing queue semantics though, we use
+ * the prev field in the link rather than the next field for Treiber-stack
+ * linkage, so that we can preserve order for bash-pushed lists (recall that the
+ * two-stack tricks reverses orders in the lock-free first stack).
+ */
+
+#define mpsc_queue(a_type)						\
+struct {								\
+	atomic_p_t tail;						\
+}
+
+#define mpsc_queue_proto(a_attr, a_prefix, a_queue_type, a_type,	\
+    a_list_type)							\
+/* Initialize a queue. */						\
+a_attr void								\
+a_prefix##new(a_queue_type *queue);					\
+/* Insert all items in src into the queue, clearing src. */		\
+a_attr void								\
+a_prefix##push_batch(a_queue_type *queue, a_list_type *src);		\
+/* Insert node into the queue. */					\
+a_attr void								\
+a_prefix##push(a_queue_type *queue, a_type *node);			\
+/*									\
+ * Pop all items in the queue into the list at dst.  dst should already	\
+ * be initialized (and may contain existing items, which then remain	\
+ * in dst).								\
+ */									\
+a_attr void								\
+a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst);
+
+#define mpsc_queue_gen(a_attr, a_prefix, a_queue_type, a_type,		\
+    a_list_type, a_link)						\
+a_attr void								\
+a_prefix##new(a_queue_type *queue) {					\
+	atomic_store_p(&queue->tail, NULL, ATOMIC_RELAXED);		\
+}									\
+a_attr void								\
+a_prefix##push_batch(a_queue_type *queue, a_list_type *src) {		\
+	/*								\
+	 * Reuse the ql list next field as the Treiber stack next	\
+	 * field.							\
+	 */								\
+	a_type *first = ql_first(src);					\
+	a_type *last = ql_last(src, a_link);				\
+	void* cur_tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED);	\
+	do {								\
+		/*							\
+		 * Note that this breaks the queue ring structure;	\
+		 * it's not a ring any more!				\
+		 */							\
+		first->a_link.qre_prev = cur_tail;			\
+		/*							\
+		 * Note: the upcoming CAS doesn't need an atomic; every	\
+		 * push only needs to synchronize with the next pop,	\
+		 * which we get from the release sequence rules.	\
+		 */							\
+	} while (!atomic_compare_exchange_weak_p(&queue->tail,		\
+	    &cur_tail, last, ATOMIC_RELEASE, ATOMIC_RELAXED));		\
+	ql_new(src);							\
+}									\
+a_attr void								\
+a_prefix##push(a_queue_type *queue, a_type *node) {			\
+	ql_elm_new(node, a_link);					\
+	a_list_type list;						\
+	ql_new(&list);							\
+	ql_head_insert(&list, node, a_link);				\
+	a_prefix##push_batch(queue, &list);				\
+}									\
+a_attr void								\
+a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst) {		\
+	a_type *tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED);	\
+	if (tail == NULL) {						\
+		/*							\
+		 * In the common special case where there are no	\
+		 * pending elements, bail early without a costly RMW.	\
+		 */							\
+		return;							\
+	}								\
+	tail = atomic_exchange_p(&queue->tail, NULL, ATOMIC_ACQUIRE);	\
+	/*								\
+	 * It's a single-consumer queue, so if cur started non-NULL,	\
+	 * it'd better stay non-NULL.					\
+	 */								\
+	assert(tail != NULL);						\
+	/*								\
+	 * We iterate through the stack and both fix up the link	\
+	 * structure (stack insertion broke the list requirement that	\
+	 * the list be circularly linked).  It's just as efficient at	\
+	 * this point to make the queue a "real" queue, so do that as	\
+	 * well.							\
+	 * If this ever gets to be a hot spot, we can omit this fixup	\
+	 * and make the queue a bag (i.e. not necessarily ordered), but	\
+	 * that would mean jettisoning the existing list API as the 	\
+	 * batch pushing/popping interface.				\
+	 */								\
+	a_list_type reversed;						\
+	ql_new(&reversed);						\
+	while (tail != NULL) {						\
+		/*							\
+		 * Pop an item off the stack, prepend it onto the list	\
+		 * (reversing the order).  Recall that we use the	\
+		 * list prev field as the Treiber stack next field to	\
+		 * preserve order of batch-pushed items when reversed.	\
+		 */							\
+		a_type *next = tail->a_link.qre_prev;			\
+		ql_elm_new(tail, a_link);				\
+		ql_head_insert(&reversed, tail, a_link);		\
+		tail = next;						\
+	}								\
+	ql_concat(dst, &reversed, a_link);				\
+}
+
+#endif /* JEMALLOC_INTERNAL_MPSC_QUEUE_H */
--- a/test/unit/mpsc_queue.c
+++ b/test/unit/mpsc_queue.c
@@ -0,0 +1,304 @@
+#include "test/jemalloc_test.h"
+
+#include "jemalloc/internal/mpsc_queue.h"
+
+typedef struct elem_s elem_t;
+typedef ql_head(elem_t) elem_list_t;
+typedef mpsc_queue(elem_t) elem_mpsc_queue_t;
+struct elem_s {
+	int thread;
+	int idx;
+	ql_elm(elem_t) link;
+};
+
+/* Include both proto and gen to make sure they match up. */
+mpsc_queue_proto(static, elem_mpsc_queue_, elem_mpsc_queue_t, elem_t,
+    elem_list_t);
+mpsc_queue_gen(static, elem_mpsc_queue_, elem_mpsc_queue_t, elem_t,
+    elem_list_t, link);
+
+static void
+init_elems_simple(elem_t *elems, int nelems, int thread) {
+	for (int i = 0; i < nelems; i++) {
+		elems[i].thread = thread;
+		elems[i].idx = i;
+		ql_elm_new(&elems[i], link);
+	}
+}
+
+static void
+check_elems_simple(elem_list_t *list, int nelems, int thread) {
+	elem_t *elem;
+	int next_idx = 0;
+	ql_foreach(elem, list, link) {
+		expect_d_lt(next_idx, nelems, "Too many list items");
+		expect_d_eq(thread, elem->thread, "");
+		expect_d_eq(next_idx, elem->idx, "List out of order");
+		next_idx++;
+	}
+}
+
+TEST_BEGIN(test_simple) {
+	enum {NELEMS = 10};
+	elem_t elems[NELEMS];
+	elem_list_t list;
+	elem_mpsc_queue_t queue;
+
+	/* Pop empty queue onto empty list -> empty list */
+	ql_new(&list);
+	elem_mpsc_queue_new(&queue);
+	elem_mpsc_queue_pop_batch(&queue, &list);
+	expect_true(ql_empty(&list), "");
+
+	/* Pop empty queue onto nonempty list -> list unchanged */
+	ql_new(&list);
+	elem_mpsc_queue_new(&queue);
+	init_elems_simple(elems, NELEMS, 0);
+	for (int i = 0; i < NELEMS; i++) {
+		ql_tail_insert(&list, &elems[i], link);
+	}
+	elem_mpsc_queue_pop_batch(&queue, &list);
+	check_elems_simple(&list, NELEMS, 0);
+
+	/* Pop nonempty queue onto empty list -> list takes queue contents */
+	ql_new(&list);
+	elem_mpsc_queue_new(&queue);
+	init_elems_simple(elems, NELEMS, 0);
+	for (int i = 0; i < NELEMS; i++) {
+		elem_mpsc_queue_push(&queue, &elems[i]);
+	}
+	elem_mpsc_queue_pop_batch(&queue, &list);
+	check_elems_simple(&list, NELEMS, 0);
+
+	/* Pop nonempty queue onto nonempty list -> list gains queue contents */
+	ql_new(&list);
+	elem_mpsc_queue_new(&queue);
+	init_elems_simple(elems, NELEMS, 0);
+	for (int i = 0; i < NELEMS / 2; i++) {
+		ql_tail_insert(&list, &elems[i], link);
+	}
+	for (int i = NELEMS / 2; i < NELEMS; i++) {
+		elem_mpsc_queue_push(&queue, &elems[i]);
+	}
+	elem_mpsc_queue_pop_batch(&queue, &list);
+	check_elems_simple(&list, NELEMS, 0);
+
+}
+TEST_END
+
+TEST_BEGIN(test_push_single_or_batch) {
+	enum {
+		BATCH_MAX = 10,
+		/*
+		 * We'll push i items one-at-a-time, then i items as a batch,
+		 * then i items as a batch again, as i ranges from 1 to
+		 * BATCH_MAX.  So we need 3 times the sum of the numbers from 1
+		 * to BATCH_MAX elements total.
+		 */
+		NELEMS = 3 * BATCH_MAX * (BATCH_MAX - 1) / 2
+	};
+	elem_t elems[NELEMS];
+	init_elems_simple(elems, NELEMS, 0);
+	elem_list_t list;
+	ql_new(&list);
+	elem_mpsc_queue_t queue;
+	elem_mpsc_queue_new(&queue);
+	int next_idx = 0;
+	for (int i = 1; i < 10; i++) {
+		/* Push i items 1 at a time. */
+		for (int j = 0; j < i; j++) {
+			elem_mpsc_queue_push(&queue, &elems[next_idx]);
+			next_idx++;
+		}
+		/* Push i items in batch. */
+		for (int j = 0; j < i; j++) {
+			ql_tail_insert(&list, &elems[next_idx], link);
+			next_idx++;
+		}
+		elem_mpsc_queue_push_batch(&queue, &list);
+		expect_true(ql_empty(&list), "Batch push should empty source");
+		/*
+		 * Push i items in batch, again.  This tests two batches
+		 * proceeding one after the other.
+		 */
+		for (int j = 0; j < i; j++) {
+			ql_tail_insert(&list, &elems[next_idx], link);
+			next_idx++;
+		}
+		elem_mpsc_queue_push_batch(&queue, &list);
+		expect_true(ql_empty(&list), "Batch push should empty source");
+	}
+	expect_d_eq(NELEMS, next_idx, "Miscomputed number of elems to push.");
+
+	expect_true(ql_empty(&list), "");
+	elem_mpsc_queue_pop_batch(&queue, &list);
+	check_elems_simple(&list, NELEMS, 0);
+}
+TEST_END
+
+TEST_BEGIN(test_multi_op) {
+	enum {NELEMS = 20};
+	elem_t elems[NELEMS];
+	init_elems_simple(elems, NELEMS, 0);
+	elem_list_t push_list;
+	ql_new(&push_list);
+	elem_list_t result_list;
+	ql_new(&result_list);
+	elem_mpsc_queue_t queue;
+	elem_mpsc_queue_new(&queue);
+
+	int next_idx = 0;
+	/* Push first quarter 1-at-a-time. */
+	for (int i = 0; i < NELEMS / 4; i++) {
+		elem_mpsc_queue_push(&queue, &elems[next_idx]);
+		next_idx++;
+	}
+	/* Push second quarter in batch. */
+	for (int i = NELEMS / 4; i < NELEMS / 2; i++) {
+		ql_tail_insert(&push_list, &elems[next_idx], link);
+		next_idx++;
+	}
+	elem_mpsc_queue_push_batch(&queue, &push_list);
+	/* Batch pop all pushed elements. */
+	elem_mpsc_queue_pop_batch(&queue, &result_list);
+	/* Push third quarter in batch. */
+	for (int i = NELEMS / 2; i < 3 * NELEMS / 4; i++) {
+		ql_tail_insert(&push_list, &elems[next_idx], link);
+		next_idx++;
+	}
+	elem_mpsc_queue_push_batch(&queue, &push_list);
+	/* Push last quarter one-at-a-time. */
+	for (int i = 3 * NELEMS / 4; i < NELEMS; i++) {
+		elem_mpsc_queue_push(&queue, &elems[next_idx]);
+		next_idx++;
+	}
+	/* Pop them again.  Order of existing list should be preserved. */
+	elem_mpsc_queue_pop_batch(&queue, &result_list);
+
+	check_elems_simple(&result_list, NELEMS, 0);
+
+}
+TEST_END
+
+typedef struct pusher_arg_s pusher_arg_t;
+struct pusher_arg_s {
+	elem_mpsc_queue_t *queue;
+	int thread;
+	elem_t *elems;
+	int nelems;
+};
+
+typedef struct popper_arg_s popper_arg_t;
+struct popper_arg_s {
+	elem_mpsc_queue_t *queue;
+	int npushers;
+	int nelems_per_pusher;
+	int *pusher_counts;
+};
+
+static void *
+thd_pusher(void *void_arg) {
+	pusher_arg_t *arg = (pusher_arg_t *)void_arg;
+	int next_idx = 0;
+	while (next_idx < arg->nelems) {
+		/* Push 10 items in batch. */
+		elem_list_t list;
+		ql_new(&list);
+		int limit = next_idx + 10;
+		while (next_idx < arg->nelems && next_idx < limit) {
+			ql_tail_insert(&list, &arg->elems[next_idx], link);
+			next_idx++;
+		}
+		elem_mpsc_queue_push_batch(arg->queue, &list);
+		/* Push 10 items one-at-a-time. */
+		limit = next_idx + 10;
+		while (next_idx < arg->nelems && next_idx < limit) {
+			elem_mpsc_queue_push(arg->queue, &arg->elems[next_idx]);
+			next_idx++;
+		}
+
+	}
+	return NULL;
+}
+
+static void *
+thd_popper(void *void_arg) {
+	popper_arg_t *arg = (popper_arg_t *)void_arg;
+	int done_pushers = 0;
+	while (done_pushers < arg->npushers) {
+		elem_list_t list;
+		ql_new(&list);
+		elem_mpsc_queue_pop_batch(arg->queue, &list);
+		elem_t *elem;
+		ql_foreach(elem, &list, link) {
+			int thread = elem->thread;
+			int idx = elem->idx;
+			expect_d_eq(arg->pusher_counts[thread], idx,
+			    "Thread's pushes reordered");
+			arg->pusher_counts[thread]++;
+			if (arg->pusher_counts[thread]
+			    == arg->nelems_per_pusher) {
+				done_pushers++;
+			}
+		}
+	}
+	return NULL;
+}
+
+TEST_BEGIN(test_multiple_threads) {
+	enum {
+		NPUSHERS = 4,
+		NELEMS_PER_PUSHER = 1000*1000,
+	};
+	thd_t pushers[NPUSHERS];
+	pusher_arg_t pusher_arg[NPUSHERS];
+
+	thd_t popper;
+	popper_arg_t popper_arg;
+
+	elem_mpsc_queue_t queue;
+	elem_mpsc_queue_new(&queue);
+
+	elem_t *elems = calloc(NPUSHERS * NELEMS_PER_PUSHER, sizeof(elem_t));
+	elem_t *elem_iter = elems;
+	for (int i = 0; i < NPUSHERS; i++) {
+		pusher_arg[i].queue = &queue;
+		pusher_arg[i].thread = i;
+		pusher_arg[i].elems = elem_iter;
+		pusher_arg[i].nelems = NELEMS_PER_PUSHER;
+
+		init_elems_simple(elem_iter, NELEMS_PER_PUSHER, i);
+		elem_iter += NELEMS_PER_PUSHER;
+	}
+	popper_arg.queue = &queue;
+	popper_arg.npushers = NPUSHERS;
+	popper_arg.nelems_per_pusher = NELEMS_PER_PUSHER;
+	int pusher_counts[NPUSHERS] = {0};
+	popper_arg.pusher_counts = pusher_counts;
+
+	thd_create(&popper, thd_popper, (void *)&popper_arg);
+	for (int i = 0; i < NPUSHERS; i++) {
+		thd_create(&pushers[i], thd_pusher, &pusher_arg[i]);
+	}
+
+	thd_join(popper, NULL);
+	for (int i = 0; i < NPUSHERS; i++) {
+		thd_join(pushers[i], NULL);
+	}
+
+	for (int i = 0; i < NPUSHERS; i++) {
+		expect_d_eq(NELEMS_PER_PUSHER, pusher_counts[i], "");
+	}
+
+	free(elems);
+}
+TEST_END
+
+int
+main(void) {
+	return test_no_reentrancy(
+	    test_simple,
+	    test_push_single_or_batch,
+	    test_multi_op,
+	    test_multiple_threads);
+}