Improve per-CPU support for the workqueue(9):

- Make structures CPU-cache friendly, as suggested and explained
  by Andrew Doran.  A CACHE_LINE_SIZE definition is introduced
  (see the layout sketch after this list).
- Use the current CPU if NULL is passed to workqueue_enqueue().
- Implement an MI (machine-independent) CPU index, which can be used
  as an array index.  Remove the linked-list usage for work queues.
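
For illustration only, here is a minimal userland sketch of the allocation
layout this change adopts: one contiguous block holding the shared descriptor
followed by one cache-line-padded slot per CPU, located by pointer arithmetic
instead of an SLIST walk.  All names (pcq_*) are hypothetical; only the
padding and indexing scheme mirrors the diff below.

#include <stdint.h>
#include <stdlib.h>

#define CACHE_LINE_SIZE 64
#define ROUNDUP2(x, m)  (((x) + (m) - 1) & ~((m) - 1))

struct pcq_hdr {                /* shared part (cf. struct workqueue) */
	unsigned ncpu;
	void *raw;              /* unaligned pointer kept for free() */
};

struct pcq_slot {               /* per-CPU part (cf. struct workqueue_queue) */
	uint64_t q_count;
};

/* Pad both parts to a cache-line multiple so two CPUs never share a line. */
#define HDR_SIZE   ROUNDUP2(sizeof(struct pcq_hdr), CACHE_LINE_SIZE)
#define SLOT_SIZE  ROUNDUP2(sizeof(struct pcq_slot), CACHE_LINE_SIZE)

static struct pcq_hdr *
pcq_create(unsigned ncpu)
{
	/* Over-allocate one cache line so the header can be aligned up. */
	size_t size = HDR_SIZE + ncpu * SLOT_SIZE + CACHE_LINE_SIZE;
	void *raw = malloc(size);
	struct pcq_hdr *h;

	if (raw == NULL)
		return NULL;
	h = (void *)ROUNDUP2((uintptr_t)raw, CACHE_LINE_SIZE);
	h->ncpu = ncpu;
	h->raw = raw;
	return h;
}

/* O(1) per-CPU lookup by index -- no list traversal. */
static struct pcq_slot *
pcq_lookup(struct pcq_hdr *h, unsigned idx)
{
	return (void *)((uintptr_t)h + HDR_SIZE + (size_t)idx * SLOT_SIZE);
}

This is the same computation that the new workqueue_queue_lookup() and
workqueue_create() perform with WQ_SIZE and WQ_QUEUE_SIZE below.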

The roundup2() macro avoids division, but it works only with powers of 2.
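
A standalone comparison (not from the commit) of the two macros: roundup()
handles any step but costs a division, while roundup2() uses an add-and-mask
trick and silently misbehaves for steps that are not powers of 2.

#include <assert.h>

#define roundup(x, y)   ((((x)+((y)-1))/(y))*(y))       /* any y */
#define roundup2(x, m)  (((x) + (m) - 1) & ~((m) - 1))  /* m = 2^k only */

int
main(void)
{
	assert(roundup(100, 24) == 120);    /* division works for any step */
	assert(roundup2(100, 64) == 128);   /* one add and one mask */
	/* roundup2(100, 24) would yield 104: 24 is not a power of 2. */
	return 0;
}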

Reviewed by: <ad>, <yamt>, <tech-kern>
rmind 2007-08-05 01:19:17 +00:00
parent f6bcdcfe0c
commit c8c024369c
6 changed files with 108 additions and 58 deletions

File: sys/dev/ieee1394/fw_port.h

@@ -1,4 +1,4 @@
/* $NetBSD: fw_port.h,v 1.23 2007/07/09 21:00:41 ad Exp $ */
/* $NetBSD: fw_port.h,v 1.24 2007/08/05 01:19:17 rmind Exp $ */
/*
* Copyright (c) 2004 KIYOHARA Takashi
* All rights reserved.
@@ -1091,7 +1091,6 @@ typedef struct scsipi_inquiry_data sbp_scsi_inquiry_data;
#define splfwsbp() splbio()
#define splsoftvm() splbio()
#define roundup2(x, y) roundup((x), (y))
#ifndef rounddown
#define rounddown(x, y) ((x) / (y) * (y))
#endif

File: sys/kern/kern_cpu.c

@@ -1,4 +1,4 @@
/* $NetBSD: kern_cpu.c,v 1.4 2007/08/04 11:57:54 ad Exp $ */
/* $NetBSD: kern_cpu.c,v 1.5 2007/08/05 01:19:17 rmind Exp $ */
/*-
* Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.4 2007/08/04 11:57:54 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.5 2007/08/05 01:19:17 rmind Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -95,6 +95,8 @@ mi_cpu_attach(struct cpu_info *ci)
struct schedstate_percpu *spc = &ci->ci_schedstate;
int error;
ci->ci_index = ncpu;
mutex_init(&spc->spc_lwplock, MUTEX_SPIN, IPL_SCHED);
sched_cpuattach(ci);

File: sys/kern/subr_workqueue.c

@@ -1,4 +1,4 @@
/* $NetBSD: subr_workqueue.c,v 1.17 2007/07/20 12:43:26 yamt Exp $ */
/* $NetBSD: subr_workqueue.c,v 1.18 2007/08/05 01:19:17 rmind Exp $ */
/*-
* Copyright (c)2002, 2005 YAMAMOTO Takashi,
@@ -27,9 +27,10 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_workqueue.c,v 1.17 2007/07/20 12:43:26 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: subr_workqueue.c,v 1.18 2007/08/05 01:19:17 rmind Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/systm.h>
#include <sys/kthread.h>
#include <sys/kmem.h>
@@ -50,31 +51,40 @@ struct workqueue_queue {
kcondvar_t q_cv;
struct workqhead q_queue;
struct lwp *q_worker;
struct cpu_info *q_ci;
SLIST_ENTRY(workqueue_queue) q_list;
};
struct workqueue {
SLIST_HEAD(, workqueue_queue) wq_queue;
void (*wq_func)(struct work *, void *);
void *wq_arg;
const char *wq_name;
pri_t wq_prio;
int wq_flags;
void *wq_ptr;
ipl_cookie_t wq_ipl;
};
#ifdef MULTIPROCESSOR
#define CPU_ALIGN_SIZE CACHE_LINE_SIZE
#else
#define CPU_ALIGN_SIZE (ALIGNBYTES + 1)
#endif
#define WQ_SIZE (roundup2(sizeof(struct workqueue), CPU_ALIGN_SIZE))
#define WQ_QUEUE_SIZE (roundup2(sizeof(struct workqueue_queue), CPU_ALIGN_SIZE))
#define POISON 0xaabbccdd
static struct workqueue_queue *
workqueue_queue_lookup(struct workqueue *wq, struct cpu_info *ci)
{
struct workqueue_queue *q;
u_int idx = 0;
SLIST_FOREACH(q, &wq->wq_queue, q_list)
if (q->q_ci == ci)
return q;
if (wq->wq_flags & WQ_PERCPU) {
idx = ci ? cpu_index(ci) : cpu_index(curcpu());
}
return SLIST_FIRST(&wq->wq_queue);
return (void *)((intptr_t)(wq) + WQ_SIZE + (idx * WQ_QUEUE_SIZE));
}
static void
@@ -100,7 +110,6 @@ workqueue_run(struct workqueue *wq)
/* find the workqueue of this kthread */
q = workqueue_queue_lookup(wq, curlwp->l_cpu);
KASSERT(q != NULL);
for (;;) {
struct workqhead tmp;
@@ -150,33 +159,26 @@ workqueue_init(struct workqueue *wq, const char *name,
wq->wq_name = name;
wq->wq_func = callback_func;
wq->wq_arg = callback_arg;
SLIST_INIT(&wq->wq_queue);
}
static int
workqueue_initqueue(struct workqueue *wq, int ipl,
int flags, struct cpu_info *ci)
workqueue_initqueue(struct workqueue *wq, struct workqueue_queue *q,
int ipl, struct cpu_info *ci)
{
struct workqueue_queue *q;
int error, ktf;
cpuid_t cpuid;
#ifdef MULTIPROCESSOR
cpuid = ci->ci_cpuid;
#else
cpuid = 0;
#endif
q = kmem_alloc(sizeof(struct workqueue_queue), KM_SLEEP);
SLIST_INSERT_HEAD(&wq->wq_queue, q, q_list);
q->q_ci = ci;
mutex_init(&q->q_mutex, MUTEX_DRIVER, ipl);
cv_init(&q->q_cv, wq->wq_name);
q->q_worker = NULL;
SIMPLEQ_INIT(&q->q_queue);
ktf = ((flags & WQ_MPSAFE) != 0 ? KTHREAD_MPSAFE : 0);
error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
wq, &q->q_worker, "%s/%d", wq->wq_name, (int)cpuid);
ktf = ((wq->wq_flags & WQ_MPSAFE) != 0 ? KTHREAD_MPSAFE : 0);
if (ci) {
error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
wq, &q->q_worker, "%s/%lu", wq->wq_name, cpu_index(ci));
} else {
error = kthread_create(wq->wq_prio, ktf, ci, workqueue_worker,
wq, &q->q_worker, "%s", wq->wq_name);
}
return error;
}
@@ -223,7 +225,6 @@ workqueue_finiqueue(struct workqueue *wq, struct workqueue_queue *q)
mutex_exit(&q->q_mutex);
mutex_destroy(&q->q_mutex);
cv_destroy(&q->q_cv);
kmem_free(q, sizeof(struct workqueue_queue));
}
/* --- */
@@ -234,42 +235,61 @@ workqueue_create(struct workqueue **wqp, const char *name,
pri_t prio, int ipl, int flags)
{
struct workqueue *wq;
int error = 0;
struct workqueue_queue *q;
void *ptr;
int i, error = 0;
size_t size;
KASSERT(sizeof(work_impl_t) <= sizeof(struct work));
wq = kmem_alloc(sizeof(*wq), KM_SLEEP);
i = (flags & WQ_PERCPU) ? ncpu : 1;
if (ncpu == 1) {
flags &= ~WQ_PERCPU;
}
size = WQ_SIZE + (i * WQ_QUEUE_SIZE) + CPU_ALIGN_SIZE;
ptr = kmem_alloc(size, KM_SLEEP);
wq = (void *)roundup2((intptr_t)ptr, CPU_ALIGN_SIZE);
wq->wq_ptr = ptr;
wq->wq_flags = flags;
q = (void *)((intptr_t)(wq) + WQ_SIZE);
workqueue_init(wq, name, callback_func, callback_arg, prio, ipl);
i = 0;
#ifdef MULTIPROCESSOR
if (flags & WQ_PERCPU) {
#ifdef MULTIPROCESSOR
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
/* create the work-queue for each CPU */
for (CPU_INFO_FOREACH(cii, ci)) {
error = workqueue_initqueue(wq, ipl, flags, ci);
if (error)
error = workqueue_initqueue(wq, q, ipl, ci);
if (error) {
break;
}
q = (void *)((intptr_t)(q) + WQ_QUEUE_SIZE);
i++;
}
if (error)
workqueue_destroy(wq);
#endif
} else {
error = workqueue_initqueue(wq, ipl, flags, curcpu());
if (error) {
kmem_free(wq, sizeof(*wq));
return error;
}
/* initialize a work-queue */
error = workqueue_initqueue(wq, q, ipl, NULL);
}
#else
error = workqueue_initqueue(wq, ipl, flags, curcpu());
if (error) {
kmem_free(wq, sizeof(*wq));
/*
* workqueue_finiqueue() should be
* called for the failing one too.
*/
do {
workqueue_finiqueue(wq, q);
q = (void *)((intptr_t)(q) - WQ_QUEUE_SIZE);
} while(i--);
kmem_free(ptr, size);
return error;
}
#endif
*wqp = wq;
return 0;
@@ -279,12 +299,25 @@ void
workqueue_destroy(struct workqueue *wq)
{
struct workqueue_queue *q;
u_int i = 1;
while ((q = SLIST_FIRST(&wq->wq_queue)) != NULL) {
SLIST_REMOVE_HEAD(&wq->wq_queue, q_list);
if (wq->wq_flags & WQ_PERCPU) {
#ifdef MULTIPROCESSOR
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
q = workqueue_queue_lookup(wq, ci);
workqueue_finiqueue(wq, q);
}
i = ncpu;
#endif
} else {
q = workqueue_queue_lookup(wq, NULL);
workqueue_finiqueue(wq, q);
}
kmem_free(wq, sizeof(*wq));
kmem_free(wq->wq_ptr, WQ_SIZE + (i * WQ_QUEUE_SIZE) + CPU_ALIGN_SIZE);
}
void
@@ -293,8 +326,8 @@ workqueue_enqueue(struct workqueue *wq, struct work *wk0, struct cpu_info *ci)
struct workqueue_queue *q;
work_impl_t *wk = (void *)wk0;
KASSERT(wq->wq_flags & WQ_PERCPU || ci == NULL);
q = workqueue_queue_lookup(wq, ci);
KASSERT(q != NULL);
mutex_enter(&q->q_mutex);
SIMPLEQ_INSERT_TAIL(&q->q_queue, wk, wk_entry);
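
As a usage note: after this revision, a caller passes NULL as the cpu_info
argument to run the work on the current CPU's queue.  A hedged sketch against
the workqueue(9) API of this era (the my_* names are invented, PRI_NONE and
IPL_NONE are assumed to be available, and error handling is omitted):

#include <sys/workqueue.h>

static struct workqueue *my_wq;
static struct work my_wk;

static void
my_callback(struct work *wk, void *arg)
{
	/* Runs in the worker kthread bound to the enqueueing CPU. */
}

static void
my_init(void)
{
	/* WQ_PERCPU requests one queue and worker thread per CPU. */
	(void)workqueue_create(&my_wq, "mywq", my_callback, NULL,
	    PRI_NONE, IPL_NONE, WQ_PERCPU | WQ_MPSAFE);

	/* NULL cpu_info: enqueue on the current CPU's queue. */
	workqueue_enqueue(my_wq, &my_wk, NULL);
}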

File: sys/sys/cpu.h

@@ -1,4 +1,4 @@
/* $NetBSD: cpu.h,v 1.9 2007/08/04 11:03:02 ad Exp $ */
/* $NetBSD: cpu.h,v 1.10 2007/08/05 01:19:17 rmind Exp $ */
/*-
* Copyright (c) 2007 YAMAMOTO Takashi,
@@ -65,4 +65,10 @@ int cpu_setonline(struct cpu_info *, bool);
extern kmutex_t cpu_lock;
static inline cpuid_t
cpu_index(struct cpu_info *ci)
{
return ci->ci_index;
}
#endif /* !_SYS_CPU_H_ */
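
Because the new cpu_index() returns a small, dense integer (assigned from
ncpu at attach time in kern_cpu.c above), it can subscript ordinary arrays.
A hypothetical fragment (the counter array and its bound are illustrative,
not part of the commit):

#include <sys/param.h>
#include <sys/cpu.h>

#define MYDRV_MAXCPUS 64        /* illustrative upper bound */

static uint64_t mydrv_events[MYDRV_MAXCPUS];

static void
mydrv_count_event(void)
{
	/* One slot per CPU, reached without any list traversal. */
	mydrv_events[cpu_index(curcpu())]++;
}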

File: sys/sys/cpu_data.h

@@ -1,4 +1,4 @@
/* $NetBSD: cpu_data.h,v 1.9 2007/07/09 21:11:32 ad Exp $ */
/* $NetBSD: cpu_data.h,v 1.10 2007/08/05 01:19:17 rmind Exp $ */
/*-
* Copyright (c) 2004, 2006, 2007 The NetBSD Foundation, Inc.
@@ -60,6 +60,7 @@ struct lwp;
struct cpu_data {
struct schedstate_percpu cpu_schedstate; /* scheduler state */
struct lwp *cpu_idlelwp; /* idle lwp */
cpuid_t cpu_index; /* CPU index */
u_int cpu_biglock_count;
struct lwp *cpu_biglock_wanted;
@@ -80,6 +81,7 @@ struct cpu_data {
/* compat definitions */
#define ci_schedstate ci_data.cpu_schedstate
#define ci_index ci_data.cpu_index
#define ci_biglock_count ci_data.cpu_biglock_count
#define ci_biglock_wanted ci_data.cpu_biglock_wanted
#define ci_spin_locks ci_data.cpu_spin_locks

File: sys/sys/param.h

@@ -1,4 +1,4 @@
/* $NetBSD: param.h,v 1.269 2007/07/31 21:18:20 pooka Exp $ */
/* $NetBSD: param.h,v 1.270 2007/08/05 01:19:17 rmind Exp $ */
/*-
* Copyright (c) 1982, 1986, 1989, 1993
@@ -157,6 +157,13 @@
#define dbtob(x) ((x) << DEV_BSHIFT)
#define btodb(x) ((x) >> DEV_BSHIFT)
/*
* CPU cache values
*/
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 64
#endif
/*
* Stack macros. On most architectures, the stack grows down,
* towards lower addresses; it is the rare architecture where
@@ -260,7 +267,8 @@
#define howmany(x, y) (((x)+((y)-1))/(y))
#endif
#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
#define rounddown(x,y) (((x)/(y))*(y))
#define rounddown(x,y) (((x)/(y))*(y))
#define roundup2(x, m) (((x) + (m) - 1) & ~((m) - 1))
#define powerof2(x) ((((x)-1)&(x))==0)
/* Macros for min/max. */