- Add some more failsafes to the CPU topology stuff, and build a 3rd
  circular list of peer CPUs in other packages, so we might scroll through
  them in the scheduler when looking to distribute or steal jobs (see the
  first sketch below).

- Fold the run queue data structure into spc_schedstate.  Makes kern_runq.c
  a far more pleasant place to work (see the second sketch below).

- Remove the code in sched_nextlwp() that tries to steal jobs from other
  CPUs.  It's not needed, because we do the very same thing in the idle LWP
  anyway.  Outside the VM system this was one of the main causes of L3
  cache misses I saw during builds.  On my machine, this change yields a
  60%-70% drop in time on the "hackbench" benchmark (there's clearly a bit
  more going on here, but basically being less aggressive helps).
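
The first item above builds per-CPU circular sibling lists (core, package, and
now peer).  As a minimal standalone illustration (not the kernel code itself,
and using made-up stand-in types and names), this is the append-and-walk
pattern that cpu_topology_link() and cpu_topology_print() use:

#include <stdio.h>

/*
 * Toy stand-in for struct cpu_info: one circular, singly linked sibling
 * list per relation, plus a count of members.  Illustrative only.
 */
enum { REL_CORE, REL_PACKAGE, REL_PEER, NREL };

struct fake_cpu {
	const char *name;
	struct fake_cpu *sib[NREL];
	unsigned nsib[NREL];
};

/* Start each CPU as a list of one: it points at itself. */
static void
fake_cpu_init(struct fake_cpu *ci, const char *name)
{
	int rel;

	ci->name = name;
	for (rel = 0; rel < NREL; rel++) {
		ci->sib[rel] = ci;
		ci->nsib[rel] = 1;
	}
}

/*
 * Append ci to the circular list ci2 is on: walk to the last member,
 * bumping every member's count, then splice ci in before ci2.
 */
static void
fake_cpu_link(struct fake_cpu *ci, struct fake_cpu *ci2, int rel)
{
	struct fake_cpu *ci3;

	for (ci3 = ci2;; ci3 = ci3->sib[rel]) {
		ci3->nsib[rel]++;
		if (ci3->sib[rel] == ci2)
			break;
	}
	ci->sib[rel] = ci2;
	ci3->sib[rel] = ci;
	ci->nsib[rel] = ci3->nsib[rel];
}

int
main(void)
{
	struct fake_cpu cpu0, cpu1, *p;

	fake_cpu_init(&cpu0, "cpu0");
	fake_cpu_init(&cpu1, "cpu1");

	/* cpu1 sits in another package at the same core/SMT position,
	 * so treat it as a peer of cpu0. */
	fake_cpu_link(&cpu1, &cpu0, REL_PEER);

	/* Walk cpu0's peer list the way cpu_topology_print() does. */
	printf("%s has %u peer sibling(s):", cpu0.name, cpu0.nsib[REL_PEER]);
	for (p = cpu0.sib[REL_PEER];; p = p->sib[REL_PEER]) {
		printf(" %s", p->name);
		if (p->sib[REL_PEER] == cpu0.sib[REL_PEER])
			break;
	}
	printf("\n");
	return 0;
}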
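
For the second item, the separately allocated runqueue_t goes away and the run
queue lives directly in struct schedstate_percpu as one TAILQ per priority plus
a bitmap of non-empty queues.  The sketch below is plain userland C with
illustrative names (assuming NetBSD's 224 priority levels; it is not the kernel
code) and shows the bitmap trick kern_runq.c uses to find the highest runnable
priority with ffs():

#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

#define PRI_COUNT	224		/* priority levels 0..223 */
#define BITMAP_BITS	32
#define BITMAP_SHIFT	5
#define BITMAP_MSB	0x80000000U
#define BITMAP_MASK	(BITMAP_BITS - 1)
#define NWORDS		(PRI_COUNT >> BITMAP_SHIFT)

/* One bit per priority level; the most significant bit of word 0 is
 * priority 0, matching the MSB-first encoding used in kern_runq.c. */
static uint32_t bitmap[NWORDS];

/* Called when a priority's queue goes from empty to non-empty. */
static void
mark_pri(int pri)
{
	bitmap[pri >> BITMAP_SHIFT] |= BITMAP_MSB >> (pri & BITMAP_MASK);
}

/* Called when a priority's queue becomes empty again. */
static void
unmark_pri(int pri)
{
	bitmap[pri >> BITMAP_SHIFT] &= ~(BITMAP_MSB >> (pri & BITMAP_MASK));
}

/* Highest priority with a non-empty queue, or -1 if none. */
static int
max_pri(void)
{
	int i, q;

	for (i = NWORDS - 1; i >= 0; i--) {
		if (bitmap[i] != 0) {
			q = ffs(bitmap[i]);
			return (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
		}
	}
	return -1;
}

int
main(void)
{
	mark_pri(43);
	mark_pri(120);
	printf("max priority %d\n", max_pri());	/* prints 120 */
	unmark_pri(120);
	printf("max priority %d\n", max_pri());	/* prints 43 */
	return 0;
}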
ad 2019-12-03 22:28:41 +00:00
parent ab935ef629
commit dece39714a
4 changed files with 229 additions and 211 deletions

sys/kern/kern_cpu.c

@ -1,4 +1,4 @@
/* $NetBSD: kern_cpu.c,v 1.79 2019/12/02 23:22:43 ad Exp $ */
/* $NetBSD: kern_cpu.c,v 1.80 2019/12/03 22:28:41 ad Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019 The NetBSD Foundation, Inc.
@ -56,7 +56,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.79 2019/12/02 23:22:43 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.80 2019/12/03 22:28:41 ad Exp $");
#include "opt_cpu_ucode.h"
@ -595,35 +595,118 @@ cpu_softintr_p(void)
void
cpu_topology_set(struct cpu_info *ci, int package_id, int core_id, int smt_id)
{
enum cpu_rel rel;
cpu_topology_present = true;
ci->ci_package_id = package_id;
ci->ci_core_id = core_id;
ci->ci_smt_id = smt_id;
ci->ci_package_cpus = ci;
ci->ci_npackage_cpus = 1;
ci->ci_core_cpus = ci;
ci->ci_ncore_cpus = 1;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
}
/*
* Link a CPU into the given circular list.
*/
static void
cpu_topology_link(struct cpu_info *ci, struct cpu_info *ci2, enum cpu_rel rel)
{
struct cpu_info *ci3;
/* Walk to the end of the existing circular list and append. */
for (ci3 = ci2;; ci3 = ci3->ci_sibling[rel]) {
ci3->ci_nsibling[rel]++;
if (ci3->ci_sibling[rel] == ci2) {
break;
}
}
ci->ci_sibling[rel] = ci2;
ci3->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = ci3->ci_nsibling[rel];
}
/*
* Find peer CPUs in other packages.
*/
static void
cpu_topology_peers(void)
{
CPU_INFO_ITERATOR cii, cii2;
struct cpu_info *ci, *ci2;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_nsibling[CPUREL_PEER] > 1) {
/* Already linked. */
continue;
}
for (CPU_INFO_FOREACH(cii2, ci2)) {
if (ci != ci2 &&
ci->ci_package_id != ci2->ci_package_id &&
ci->ci_core_id == ci2->ci_core_id &&
ci->ci_smt_id == ci2->ci_smt_id) {
cpu_topology_link(ci, ci2, CPUREL_PEER);
break;
}
}
}
}
/*
* Print out the topology lists.
*/
static void
cpu_topology_print(void)
{
#ifdef DEBUG
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *ci2;
const char *names[] = { "core", "package", "peer" };
enum cpu_rel rel;
int i;
for (CPU_INFO_FOREACH(cii, ci)) {
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
printf("%s has %dx %s siblings: ", cpu_name(ci),
ci->ci_nsibling[rel], names[rel]);
ci2 = ci->ci_sibling[rel];
i = 0;
do {
printf(" %s", cpu_name(ci2));
ci2 = ci2->ci_sibling[rel];
} while (++i < 64 && ci2 != ci->ci_sibling[rel]);
if (i == 64) {
printf(" GAVE UP");
}
printf("\n");
}
}
#endif /* DEBUG */
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Don't override ci_package_id, etc, if cpu_topology_present is set.
* MD code also uses these.
*/
static void
cpu_topology_fake(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
enum cpu_rel rel;
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_package_id = cpu_index(ci);
ci->ci_core_id = 0;
ci->ci_smt_id = 0;
ci->ci_ncore_cpus = 1;
ci->ci_core_cpus = ci;
ci->ci_package_cpus = ci;
ci->ci_npackage_cpus = 1;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
if (!cpu_topology_present) {
ci->ci_package_id = cpu_index(ci);
}
}
cpu_topology_print();
}
/*
@ -634,20 +717,16 @@ void
cpu_topology_init(void)
{
CPU_INFO_ITERATOR cii, cii2;
struct cpu_info *ci, *ci2, *ci3;
struct cpu_info *ci, *ci2;
int ncore, npackage, npeer;
bool symmetric;
if (!cpu_topology_present) {
cpu_topology_fake();
return;
}
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_ncore_cpus = 1;
ci->ci_core_cpus = ci;
ci->ci_package_cpus = ci;
ci->ci_npackage_cpus = 1;
}
/* Find siblings in same core and package. */
for (CPU_INFO_FOREACH(cii, ci)) {
for (CPU_INFO_FOREACH(cii2, ci2)) {
/* Avoid bad things happening. */
@ -664,39 +743,42 @@ cpu_topology_init(void)
ci2->ci_package_id != ci->ci_package_id) {
continue;
}
/*
* Find CPUs in the same core. Walk to the end of
* the existing circular list and append.
*/
if (ci->ci_ncore_cpus == 1 &&
/* Find CPUs in the same core. */
if (ci->ci_nsibling[CPUREL_CORE] == 1 &&
ci->ci_core_id == ci2->ci_core_id) {
for (ci3 = ci2;; ci3 = ci3->ci_core_cpus) {
ci3->ci_ncore_cpus++;
if (ci3->ci_core_cpus == ci2) {
break;
}
}
ci->ci_core_cpus = ci2;
ci3->ci_core_cpus = ci;
ci->ci_ncore_cpus = ci3->ci_ncore_cpus;
cpu_topology_link(ci, ci2, CPUREL_CORE);
}
/* Same, but for package. */
if (ci->ci_npackage_cpus == 1) {
for (ci3 = ci2;; ci3 = ci3->ci_package_cpus) {
ci3->ci_npackage_cpus++;
if (ci3->ci_package_cpus == ci2) {
break;
}
}
ci->ci_package_cpus = ci2;
ci3->ci_package_cpus = ci;
ci->ci_npackage_cpus = ci3->ci_npackage_cpus;
/* Find CPUs in the same package. */
if (ci->ci_nsibling[CPUREL_PACKAGE] == 1) {
cpu_topology_link(ci, ci2, CPUREL_PACKAGE);
}
if (ci->ci_ncore_cpus > 1 && ci->ci_npackage_cpus > 1) {
if (ci->ci_nsibling[CPUREL_CORE] > 1 &&
ci->ci_nsibling[CPUREL_PACKAGE] > 1) {
break;
}
}
}
/* Find peers in other packages. */
cpu_topology_peers();
/* Determine whether the topology is bogus/symmetric. */
npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE];
ncore = curcpu()->ci_nsibling[CPUREL_CORE];
npeer = curcpu()->ci_nsibling[CPUREL_PEER];
symmetric = true;
for (CPU_INFO_FOREACH(cii, ci)) {
if (npackage != ci->ci_nsibling[CPUREL_PACKAGE] ||
ncore != ci->ci_nsibling[CPUREL_CORE] ||
npeer != ci->ci_nsibling[CPUREL_PEER]) {
symmetric = false;
}
}
cpu_topology_print();
if (symmetric == false) {
printf("cpu_topology_init: not symmetric, faking it\n");
cpu_topology_fake();
}
}
#ifdef CPU_UCODE

sys/kern/kern_runq.c

@ -1,4 +1,4 @@
/* $NetBSD: kern_runq.c,v 1.52 2019/12/01 15:34:46 ad Exp $ */
/* $NetBSD: kern_runq.c,v 1.53 2019/12/03 22:28:41 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
@ -56,7 +56,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.52 2019/12/01 15:34:46 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.53 2019/12/03 22:28:41 ad Exp $");
#include "opt_dtrace.h"
@ -78,15 +78,6 @@ __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.52 2019/12/01 15:34:46 ad Exp $");
#include <sys/evcnt.h>
#include <sys/atomic.h>
/*
* Priority related definitions.
*/
#define PRI_TS_COUNT (NPRI_USER)
#define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
#define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
#define PRI_HIGHEST_TS (MAXPRI_USER)
/*
* Bits per map.
*/
@ -95,34 +86,9 @@ __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.52 2019/12/01 15:34:46 ad Exp $");
#define BITMAP_MSB (0x80000000U)
#define BITMAP_MASK (BITMAP_BITS - 1)
/*
* Structures, runqueue.
*/
const int schedppq = 1;
typedef struct {
TAILQ_HEAD(, lwp) q_head;
} queue_t;
typedef struct {
/* Bitmap */
uint32_t r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
/* Counters */
u_int r_count; /* Count of the threads */
u_int r_avgcount; /* Average count of threads (* 256) */
u_int r_mcount; /* Count of migratable threads */
/* Runqueues */
queue_t r_rt_queue[PRI_RT_COUNT];
queue_t r_ts_queue[PRI_TS_COUNT];
/* Event counters */
struct evcnt r_ev_pull;
struct evcnt r_ev_push;
struct evcnt r_ev_stay;
struct evcnt r_ev_localize;
} runqueue_t;
static void * sched_getrq(runqueue_t *, const pri_t);
static void *sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t * sched_catchlwp(struct cpu_info *);
static void sched_balance(void *);
@ -182,45 +148,43 @@ runq_init(void)
void
sched_cpuattach(struct cpu_info *ci)
{
runqueue_t *ci_rq;
void *rq_ptr;
u_int i, size;
struct schedstate_percpu *spc;
size_t size;
void *p;
u_int i;
if (ci->ci_schedstate.spc_lwplock == NULL) {
ci->ci_schedstate.spc_lwplock =
mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
spc = &ci->ci_schedstate;
if (spc->spc_lwplock == NULL) {
spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
}
if (ci == lwp0.l_cpu) {
/* Initialize the scheduler structure of the primary LWP */
lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
lwp0.l_mutex = spc->spc_lwplock;
}
if (ci->ci_schedstate.spc_mutex != NULL) {
if (spc->spc_mutex != NULL) {
/* Already initialized. */
return;
}
/* Allocate the run queue */
size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
rq_ptr = kmem_zalloc(size, KM_SLEEP);
ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
coherency_unit;
p = kmem_alloc(size, KM_SLEEP);
spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
/* Initialize run queues */
ci->ci_schedstate.spc_mutex =
mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
for (i = 0; i < PRI_RT_COUNT; i++)
TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
for (i = 0; i < PRI_TS_COUNT; i++)
TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
for (i = 0; i < PRI_COUNT; i++)
TAILQ_INIT(&spc->spc_queue[i]);
ci->ci_schedstate.spc_sched_info = ci_rq;
evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
evcnt_attach_dynamic(&spc->spc_ev_pull, EVCNT_TYPE_MISC, NULL,
cpu_name(ci), "runqueue pull");
evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
evcnt_attach_dynamic(&spc->spc_ev_push, EVCNT_TYPE_MISC, NULL,
cpu_name(ci), "runqueue push");
evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
evcnt_attach_dynamic(&spc->spc_ev_stay, EVCNT_TYPE_MISC, NULL,
cpu_name(ci), "runqueue stay");
evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
evcnt_attach_dynamic(&spc->spc_ev_localize, EVCNT_TYPE_MISC, NULL,
cpu_name(ci), "runqueue localize");
}
@ -229,13 +193,11 @@ sched_cpuattach(struct cpu_info *ci)
*/
static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{
KASSERT(prio < PRI_COUNT);
return (prio <= PRI_HIGHEST_TS) ?
&ci_rq->r_ts_queue[prio].q_head :
&ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
return &spc->spc_queue[prio];
}
/*
@ -245,7 +207,6 @@ sched_getrq(runqueue_t *ci_rq, const pri_t prio)
void
sched_enqueue(struct lwp *l)
{
runqueue_t *ci_rq;
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
const pri_t eprio = lwp_eprio(l);
@ -253,11 +214,10 @@ sched_enqueue(struct lwp *l)
ci = l->l_cpu;
spc = &ci->ci_schedstate;
ci_rq = spc->spc_sched_info;
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
/* Enqueue the thread */
q_head = sched_getrq(ci_rq, eprio);
q_head = sched_getrq(spc, eprio);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
@ -265,8 +225,8 @@ sched_enqueue(struct lwp *l)
/* Mark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((ci_rq->r_bitmap[i] & q) == 0);
ci_rq->r_bitmap[i] |= q;
KASSERT((spc->spc_bitmap[i] & q) == 0);
spc->spc_bitmap[i] |= q;
}
/* Preempted SCHED_RR and SCHED_FIFO LWPs go to the queue head. */
if (l->l_class != SCHED_OTHER && (l->l_pflag & LP_PREEMPTING) != 0) {
@ -274,9 +234,9 @@ sched_enqueue(struct lwp *l)
} else {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
}
ci_rq->r_count++;
spc->spc_count++;
if ((l->l_pflag & LP_BOUND) == 0)
ci_rq->r_mcount++;
spc->spc_mcount++;
/*
* Update the value of highest priority in the runqueue,
@ -295,27 +255,25 @@ sched_enqueue(struct lwp *l)
void
sched_dequeue(struct lwp *l)
{
runqueue_t *ci_rq;
TAILQ_HEAD(, lwp) *q_head;
struct schedstate_percpu *spc;
const pri_t eprio = lwp_eprio(l);
spc = & l->l_cpu->ci_schedstate;
ci_rq = spc->spc_sched_info;
KASSERT(lwp_locked(l, spc->spc_mutex));
spc = &l->l_cpu->ci_schedstate;
KASSERT(lwp_locked(l, spc->spc_mutex));
KASSERT(eprio <= spc->spc_maxpriority);
KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
KASSERT(ci_rq->r_count > 0);
KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
KASSERT(spc->spc_count > 0);
if (spc->spc_migrating == l)
spc->spc_migrating = NULL;
ci_rq->r_count--;
spc->spc_count--;
if ((l->l_pflag & LP_BOUND) == 0)
ci_rq->r_mcount--;
spc->spc_mcount--;
q_head = sched_getrq(ci_rq, eprio);
q_head = sched_getrq(spc, eprio);
TAILQ_REMOVE(q_head, l, l_runq);
if (TAILQ_EMPTY(q_head)) {
u_int i;
@ -324,8 +282,8 @@ sched_dequeue(struct lwp *l)
/* Unmark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((ci_rq->r_bitmap[i] & q) != 0);
ci_rq->r_bitmap[i] &= ~q;
KASSERT((spc->spc_bitmap[i] & q) != 0);
spc->spc_bitmap[i] &= ~q;
/*
* Update the value of highest priority in the runqueue, in a
@ -335,8 +293,8 @@ sched_dequeue(struct lwp *l)
return;
do {
if (ci_rq->r_bitmap[i] != 0) {
q = ffs(ci_rq->r_bitmap[i]);
if (spc->spc_bitmap[i] != 0) {
q = ffs(spc->spc_bitmap[i]);
spc->spc_maxpriority =
(i << BITMAP_SHIFT) + (BITMAP_BITS - q);
return;
@ -502,8 +460,7 @@ struct cpu_info *
sched_takecpu(struct lwp *l)
{
struct cpu_info *ci, *tci, *pivot, *next;
struct schedstate_percpu *spc;
runqueue_t *ci_rq, *ici_rq;
struct schedstate_percpu *spc, *ici_spc;
pri_t eprio, lpri, pri;
KASSERT(lwp_locked(l, NULL));
@ -514,14 +471,13 @@ sched_takecpu(struct lwp *l)
return ci;
spc = &ci->ci_schedstate;
ci_rq = spc->spc_sched_info;
eprio = lwp_eprio(l);
/* Make sure that thread is in appropriate processor-set */
if (__predict_true(spc->spc_psid == l->l_psid)) {
/* If CPU of this thread is idling - run there */
if (ci_rq->r_count == 0) {
ci_rq->r_ev_stay.ev_count++;
if (spc->spc_count == 0) {
spc->spc_ev_stay.ev_count++;
return ci;
}
/*
@ -532,12 +488,12 @@ sched_takecpu(struct lwp *l)
* chance of reusing the VM context from the parent.
*/
if (l->l_stat == LSIDL) {
ci_rq->r_ev_stay.ev_count++;
spc->spc_ev_stay.ev_count++;
return ci;
}
/* Stay if thread is cache-hot */
if (lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
ci_rq->r_ev_stay.ev_count++;
spc->spc_ev_stay.ev_count++;
return ci;
}
}
@ -546,8 +502,8 @@ sched_takecpu(struct lwp *l)
ci = curcpu();
spc = &ci->ci_schedstate;
if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
ci_rq = spc->spc_sched_info;
ci_rq->r_ev_localize.ev_count++;
/* XXXAD foreign CPU not locked */
spc->spc_ev_localize.ev_count++;
return ci;
}
@ -564,13 +520,12 @@ sched_takecpu(struct lwp *l)
/* Reached the end, start from the beginning. */
next = cpu_lookup(0);
}
spc = &ci->ci_schedstate;
ici_rq = spc->spc_sched_info;
pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
ici_spc = &ci->ci_schedstate;
pri = MAX(ici_spc->spc_curpriority, ici_spc->spc_maxpriority);
if (pri > lpri)
continue;
if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
if (pri == lpri && spc->spc_count < ici_spc->spc_count)
continue;
if (!sched_migratable(l, ci))
@ -578,11 +533,11 @@ sched_takecpu(struct lwp *l)
lpri = pri;
tci = ci;
ci_rq = ici_rq;
spc = ici_spc;
} while (ci = next, ci != pivot);
ci_rq = tci->ci_schedstate.spc_sched_info;
ci_rq->r_ev_push.ev_count++;
/* XXXAD remote CPU, unlocked */
tci->ci_schedstate.spc_ev_push.ev_count++;
return tci;
}
@ -596,21 +551,19 @@ sched_catchlwp(struct cpu_info *ci)
struct cpu_info *curci = curcpu();
struct schedstate_percpu *spc, *curspc;
TAILQ_HEAD(, lwp) *q_head;
runqueue_t *ci_rq;
struct lwp *l;
curspc = &curci->ci_schedstate;
spc = &ci->ci_schedstate;
KASSERT(curspc->spc_psid == spc->spc_psid);
ci_rq = spc->spc_sched_info;
if (ci_rq->r_mcount < min_catch) {
if (spc->spc_mcount < min_catch) {
spc_unlock(ci);
return NULL;
}
/* Take the highest priority thread */
q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
for (;;) {
@ -643,7 +596,7 @@ sched_catchlwp(struct cpu_info *ci)
SPINLOCK_BACKOFF(count);
}
l->l_cpu = curci;
ci_rq->r_ev_pull.ev_count++;
spc->spc_ev_pull.ev_count++;
lwp_unlock_to(l, curspc->spc_mutex);
sched_enqueue(l);
return l;
@ -660,7 +613,7 @@ static void
sched_balance(void *nocallout)
{
struct cpu_info *ci, *hci;
runqueue_t *ci_rq;
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
u_int highest;
u_int weight;
@ -673,7 +626,7 @@ sched_balance(void *nocallout)
/* Make lockless countings */
for (CPU_INFO_FOREACH(cii, ci)) {
ci_rq = ci->ci_schedstate.spc_sched_info;
spc = &ci->ci_schedstate;
/*
* Average count of the threads
@ -681,14 +634,14 @@ sched_balance(void *nocallout)
* The average is computed as a fixpoint number with
* 8 fractional bits.
*/
ci_rq->r_avgcount = (
weight * ci_rq->r_avgcount + (100 - weight) * 256 * ci_rq->r_mcount
spc->spc_avgcount = (
weight * spc->spc_avgcount + (100 - weight) * 256 * spc->spc_mcount
) / 100;
/* Look for CPU with the highest average */
if (ci_rq->r_avgcount > highest) {
if (spc->spc_avgcount > highest) {
hci = ci;
highest = ci_rq->r_avgcount;
highest = spc->spc_avgcount;
}
}
@ -707,7 +660,6 @@ sched_idle(void)
{
struct cpu_info *ci = curcpu(), *tci = NULL;
struct schedstate_percpu *spc, *tspc;
runqueue_t *ci_rq, *tci_rq;
bool dlock = false;
/* Check if there is a migrating LWP */
@ -782,21 +734,19 @@ sched_idle(void)
spc_unlock(ci);
no_migration:
ci_rq = spc->spc_sched_info;
if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
return;
}
/* Reset the counter, and call the balancer */
ci_rq->r_avgcount = 0;
spc->spc_avgcount = 0;
sched_balance(ci);
tci = worker_ci;
tspc = &tci->ci_schedstate;
if (ci == tci || spc->spc_psid != tspc->spc_psid)
return;
/* Don't hit the locks unless there's something to do. */
tci_rq = tci->ci_schedstate.spc_sched_info;
if (tci_rq->r_mcount >= min_catch) {
if (tspc->spc_mcount >= min_catch) {
spc_dlock(ci, tci);
(void)sched_catchlwp(tci);
spc_unlock(ci);
@ -888,7 +838,6 @@ sched_nextlwp(void)
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
runqueue_t *ci_rq;
struct lwp *l;
/* Update the last run time on switch */
@ -899,36 +848,14 @@ sched_nextlwp(void)
spc = &ci->ci_schedstate;
if (__predict_false(spc->spc_migrating != NULL))
return NULL;
ci_rq = spc->spc_sched_info;
#ifdef MULTIPROCESSOR
/* If runqueue is empty, try to catch some thread from other CPU */
if (__predict_false(ci_rq->r_count == 0)) {
struct schedstate_percpu *cspc;
struct cpu_info *cci;
/* Offline CPUs should not perform this, however */
if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
return NULL;
/* Reset the counter, and call the balancer */
ci_rq->r_avgcount = 0;
sched_balance(ci);
cci = worker_ci;
cspc = &cci->ci_schedstate;
if (ci == cci || spc->spc_psid != cspc->spc_psid ||
!mutex_tryenter(cci->ci_schedstate.spc_mutex))
return NULL;
return sched_catchlwp(cci);
}
#else
if (__predict_false(ci_rq->r_count == 0))
/* Return to idle LWP if there is no runnable job */
if (__predict_false(spc->spc_count == 0))
return NULL;
#endif
/* Take the highest priority thread */
KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
KASSERT(l != NULL);
@ -947,13 +874,11 @@ sched_curcpu_runnable_p(void)
{
const struct cpu_info *ci;
const struct schedstate_percpu *spc;
const runqueue_t *ci_rq;
bool rv;
kpreempt_disable();
ci = curcpu();
spc = &ci->ci_schedstate;
ci_rq = spc->spc_sched_info;
#ifndef __HAVE_FAST_SOFTINTS
if (ci->ci_data.cpu_softints) {
@ -962,7 +887,7 @@ sched_curcpu_runnable_p(void)
}
#endif
rv = (ci_rq->r_count != 0) ? true : false;
rv = (spc->spc_count != 0) ? true : false;
kpreempt_enable();
return rv;
@ -1033,7 +958,6 @@ SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
void
sched_print_runqueue(void (*pr)(const char *, ...))
{
runqueue_t *ci_rq;
struct cpu_info *ci, *tci;
struct schedstate_percpu *spc;
struct lwp *l;
@ -1044,7 +968,6 @@ sched_print_runqueue(void (*pr)(const char *, ...))
int i;
spc = &ci->ci_schedstate;
ci_rq = spc->spc_sched_info;
(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
@ -1054,12 +977,12 @@ sched_print_runqueue(void (*pr)(const char *, ...))
#else
curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
spc->spc_count, spc->spc_avgcount, spc->spc_maxpriority,
spc->spc_migrating);
i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
do {
uint32_t q;
q = ci_rq->r_bitmap[i];
q = spc->spc_bitmap[i];
(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
} while (i--);
}

sys/sys/cpu_data.h

@ -1,4 +1,4 @@
/* $NetBSD: cpu_data.h,v 1.42 2019/12/03 05:07:49 riastradh Exp $ */
/* $NetBSD: cpu_data.h,v 1.43 2019/12/03 22:28:41 ad Exp $ */
/*-
* Copyright (c) 2004, 2006, 2007, 2008, 2019 The NetBSD Foundation, Inc.
@ -59,6 +59,13 @@ struct lwp;
struct lockdebug;
enum cpu_rel {
CPUREL_CORE, /* CPUs in the same core */
CPUREL_PACKAGE, /* CPUs in the same package */
CPUREL_PEER, /* peer CPUs in other packages */
CPUREL_COUNT
};
struct cpu_data {
/*
* The first section is likely to be touched by other CPUs -
@ -76,10 +83,8 @@ struct cpu_data {
cpuid_t cpu_package_id;
cpuid_t cpu_core_id;
cpuid_t cpu_smt_id;
u_int cpu_npackage_cpus;
u_int cpu_ncore_cpus;
struct cpu_info *cpu_package_cpus; /* sibling CPUs in package */
struct cpu_info *cpu_core_cpus; /* sibling CPUs in core */
u_int cpu_nsibling[CPUREL_COUNT];
struct cpu_info *cpu_sibling[CPUREL_COUNT];
/*
* This section is mostly CPU-private.
@ -133,10 +138,8 @@ struct cpu_data {
#define ci_package_id ci_data.cpu_package_id
#define ci_core_id ci_data.cpu_core_id
#define ci_smt_id ci_data.cpu_smt_id
#define ci_npackage_cpus ci_data.cpu_npackage_cpus
#define ci_ncore_cpus ci_data.cpu_ncore_cpus
#define ci_package_cpus ci_data.cpu_package_cpus
#define ci_core_cpus ci_data.cpu_core_cpus
#define ci_nsibling ci_data.cpu_nsibling
#define ci_sibling ci_data.cpu_sibling
void mi_cpu_init(void);
int mi_cpu_attach(struct cpu_info *);

sys/sys/sched.h

@ -1,4 +1,4 @@
/* $NetBSD: sched.h,v 1.78 2019/11/30 17:46:27 ad Exp $ */
/* $NetBSD: sched.h,v 1.79 2019/12/03 22:28:41 ad Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2007, 2008, 2019
@ -144,6 +144,7 @@ __END_DECLS
#include <sys/mutex.h>
#include <sys/time.h>
#include <sys/evcnt.h>
/*
* Per-CPU scheduler state. Field markings and the corresponding locks:
@ -157,17 +158,26 @@ struct schedstate_percpu {
kmutex_t *spc_mutex; /* (: lock on below, runnable LWPs */
kmutex_t *spc_lwplock; /* (: general purpose lock for LWPs */
struct lwp *spc_migrating; /* (: migrating LWP */
volatile pri_t spc_curpriority;/* m: usrpri of curlwp */
pri_t spc_maxpriority;/* m: highest priority queued */
psetid_t spc_psid; /* c: processor-set ID */
time_t spc_lastmod; /* c: time of last cpu state change */
void *spc_sched_info;/* (: scheduler-specific structure */
volatile int spc_flags; /* s: flags; see below */
u_int spc_schedticks; /* s: ticks for schedclock() */
uint64_t spc_cp_time[CPUSTATES];/* s: CPU state statistics */
int spc_ticks; /* s: ticks until sched_tick() */
int spc_pscnt; /* s: prof/stat counter */
int spc_psdiv; /* s: prof/stat divisor */
/* Run queue */
volatile pri_t spc_curpriority;/* s: usrpri of curlwp */
pri_t spc_maxpriority;/* m: highest priority queued */
u_int spc_count; /* m: count of the threads */
u_int spc_avgcount; /* m: average count of threads (* 256) */
u_int spc_mcount; /* m: count of migratable threads */
uint32_t spc_bitmap[8]; /* m: bitmap of active queues */
TAILQ_HEAD(,lwp) *spc_queue; /* m: queue for each priority */
struct evcnt spc_ev_pull; /* m: event counters */
struct evcnt spc_ev_push;
struct evcnt spc_ev_stay;
struct evcnt spc_ev_localize;
};
/* spc_flags */