- Many small tweaks to the SMT awareness in the scheduler.  It does a much
  better job now at keeping all physical CPUs busy, while using the extra
  threads to help out.  In particular, during preempt() if we're using SMT,
  try to find a better CPU to run on and teleport curlwp there.

- Change the CPU topology stuff so it can work on asymmetric systems.  This
  mainly entails rearranging one of the CPU lists so it makes sense in all
  configurations.

- Add a parameter to cpu_topology_set() to note that a CPU is "slow", for
  systems where there are fast CPUs and slow CPUs, like the Rockchip RK3399.
  Extend the SMT awareness to try to handle that situation too (keep fast
  CPUs busy, use slow CPUs as helpers); a call sketch follows below.
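
The cpu_topology_set() change is mechanical at every call site: the existing
five arguments are unchanged and a boolean "slow" is appended, which
subr_cpu.c accumulates into cpu_topology_haveslow.  A minimal sketch of the
new form, modelled on the aarch64/arm hunks below; every call site in this
commit passes false, and cpu_is_fast() here is a hypothetical helper standing
in for however an asymmetric port identifies its fast cores:

/*
 * Sketch only, not part of the commit: supplying the new trailing "slow"
 * argument to cpu_topology_set().  cpu_is_fast() is a hypothetical
 * predicate; ports that cannot tell simply pass false.
 */
static void
example_set_topology(struct cpu_info *ci, uint64_t mpidr)
{

	cpu_topology_set(ci,
	    __SHIFTOUT(mpidr, MPIDR_AFF1),	/* package_id */
	    __SHIFTOUT(mpidr, MPIDR_AFF0),	/* core_id */
	    0,					/* smt_id */
	    0,					/* numa_id */
	    !cpu_is_fast(ci));			/* slow */
}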
ad 2020-01-09 16:35:03 +00:00
parent f0ac038b23
commit c5b060977a
13 changed files with 396 additions and 210 deletions

View File: cpufunc.c

@ -1,4 +1,4 @@
/* $NetBSD: cpufunc.c,v 1.12 2019/12/20 21:05:33 ad Exp $ */
/* $NetBSD: cpufunc.c,v 1.13 2020/01/09 16:35:03 ad Exp $ */
/*
* Copyright (c) 2017 Ryo Shimizu <ryo@nerv.org>
@ -29,7 +29,7 @@
#include "opt_multiprocessor.h"
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.12 2019/12/20 21:05:33 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.13 2020/01/09 16:35:03 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
@ -97,13 +97,15 @@ aarch64_gettopology(struct cpu_info * const ci, uint64_t mpidr)
__SHIFTOUT(mpidr, MPIDR_AFF2),
__SHIFTOUT(mpidr, MPIDR_AFF1),
__SHIFTOUT(mpidr, MPIDR_AFF0),
0);
0,
false);
} else {
cpu_topology_set(ci,
__SHIFTOUT(mpidr, MPIDR_AFF1),
__SHIFTOUT(mpidr, MPIDR_AFF0),
0,
0);
0,
false);
}
}

View File: cpu.c (1.138 -> 1.139)

@ -1,4 +1,4 @@
/* $NetBSD: cpu.c,v 1.138 2020/01/09 16:23:42 martin Exp $ */
/* $NetBSD: cpu.c,v 1.139 2020/01/09 16:35:03 ad Exp $ */
/*
* Copyright (c) 1995 Mark Brinicombe.
@ -46,7 +46,7 @@
#include "opt_multiprocessor.h"
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.138 2020/01/09 16:23:42 martin Exp $");
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.139 2020/01/09 16:35:03 ad Exp $");
#include <sys/param.h>
#include <sys/conf.h>
@ -143,13 +143,15 @@ cpu_attach(device_t dv, cpuid_t id)
__SHIFTOUT(id, MPIDR_AFF2),
__SHIFTOUT(id, MPIDR_AFF1),
__SHIFTOUT(id, MPIDR_AFF0),
0);
0,
false);
} else {
cpu_topology_set(ci,
__SHIFTOUT(id, MPIDR_AFF1),
__SHIFTOUT(id, MPIDR_AFF0),
0,
0);
0,
false);
}
evcnt_attach_dynamic(&ci->ci_arm700bugcount, EVCNT_TYPE_MISC,

View File: cpu.c (1.69 -> 1.70)

@ -1,4 +1,4 @@
/* $NetBSD: cpu.c,v 1.69 2019/12/20 21:05:33 ad Exp $ */
/* $NetBSD: cpu.c,v 1.70 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2001 Tsubai Masanari.
@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.69 2019/12/20 21:05:33 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.70 2020/01/09 16:35:03 ad Exp $");
#include "opt_ppcparam.h"
#include "opt_multiprocessor.h"
@ -175,7 +175,7 @@ cpuattach(device_t parent, device_t self, void *aux)
core = package & 1;
package >>= 1;
}
cpu_topology_set(ci, package, core, 0, 0);
cpu_topology_set(ci, package, core, 0, 0, false);
if (ci->ci_khz == 0) {
cpu_OFgetspeed(self, ci);

View File: cpu_subr.c

@ -1,4 +1,4 @@
/* $NetBSD: cpu_subr.c,v 1.44 2019/12/31 13:07:11 ad Exp $ */
/* $NetBSD: cpu_subr.c,v 1.45 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2010, 2019 The NetBSD Foundation, Inc.
@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_subr.c,v 1.44 2019/12/31 13:07:11 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: cpu_subr.c,v 1.45 2020/01/09 16:35:03 ad Exp $");
#include "opt_cputype.h"
#include "opt_ddb.h"
@ -189,7 +189,7 @@ cpu_info_alloc(struct pmap_tlb_info *ti, cpuid_t cpu_id, cpuid_t cpu_package_id,
ci->ci_divisor_recip = cpu_info_store.ci_divisor_recip;
ci->ci_cpuwatch_count = cpu_info_store.ci_cpuwatch_count;
cpu_topology_set(ci, cpu_package_id, cpu_core_id, cpu_smt_id, 0);
cpu_topology_set(ci, cpu_package_id, cpu_core_id, cpu_smt_id, 0, false);
pmap_md_alloc_ephemeral_address_space(ci);

View File: cpu_topology.c

@ -1,4 +1,4 @@
/* $NetBSD: cpu_topology.c,v 1.16 2019/12/20 21:05:34 ad Exp $ */
/* $NetBSD: cpu_topology.c,v 1.17 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2009 Mindaugas Rasiukevicius <rmind at NetBSD org>,
@ -36,7 +36,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_topology.c,v 1.16 2019/12/20 21:05:34 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: cpu_topology.c,v 1.17 2020/01/09 16:35:03 ad Exp $");
#include "acpica.h"
@ -95,14 +95,14 @@ x86_cpu_topology(struct cpu_info *ci)
case CPUVENDOR_INTEL:
if (cpu_family < 6) {
cpu_topology_set(ci, package_id, core_id, smt_id,
numa_id);
numa_id, false);
return;
}
break;
case CPUVENDOR_AMD:
if (cpu_family < 0xf) {
cpu_topology_set(ci, package_id, core_id, smt_id,
numa_id);
numa_id, false);
return;
}
break;
@ -210,5 +210,5 @@ x86_cpu_topology(struct cpu_info *ci)
smt_id = __SHIFTOUT(apic_id, smt_mask);
}
cpu_topology_set(ci, package_id, core_id, smt_id, numa_id);
cpu_topology_set(ci, package_id, core_id, smt_id, numa_id, false);
}

View File: kern_runq.c

@ -1,7 +1,7 @@
/* $NetBSD: kern_runq.c,v 1.56 2020/01/08 17:38:42 ad Exp $ */
/* $NetBSD: kern_runq.c,v 1.57 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -56,7 +56,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.56 2020/01/08 17:38:42 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.57 2020/01/09 16:35:03 ad Exp $");
#include "opt_dtrace.h"
@ -445,6 +445,79 @@ sched_migratable(const struct lwp *l, struct cpu_info *ci)
return (spc->spc_psid == l->l_psid);
}
/*
* Find a CPU to run LWP "l". Look for the CPU with the lowest priority
* thread. In case of equal priority, prefer first class CPUs, and amongst
* the remainder choose the CPU with the fewest runqueue entries.
*/
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l)
{
struct cpu_info *bestci, *curci, *pivot, *next;
struct schedstate_percpu *bestspc, *curspc;
pri_t bestpri, curpri;
pivot = l->l_cpu;
curci = pivot;
bestci = NULL;
bestspc = NULL;
bestpri = PRI_COUNT;
do {
if ((next = cpu_lookup(cpu_index(curci) + 1)) == NULL) {
/* Reached the end, start from the beginning. */
next = cpu_lookup(0);
}
if (!sched_migratable(l, curci)){
continue;
}
curspc = &curci->ci_schedstate;
curpri = MAX(curspc->spc_curpriority, curspc->spc_maxpriority);
if (bestci == NULL) {
bestci = curci;
bestspc = curspc;
bestpri = curpri;
continue;
}
if (curpri > bestpri) {
continue;
}
if (curpri == bestpri) {
/* Prefer first class CPUs over others. */
if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
(bestspc->spc_flags & SPCF_1STCLASS) != 0) {
continue;
}
/*
* Pick the least busy CPU. Make sure this is not
* <=, otherwise it defeats the above preference.
*/
if (bestspc->spc_count < curspc->spc_count) {
continue;
}
}
bestpri = curpri;
bestci = curci;
bestspc = curspc;
/* If this CPU is idle and 1st class, we're done. */
if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
(SPCF_IDLE | SPCF_1STCLASS)) {
break;
}
/*
* XXXAD After execve, likely still resident on the same
* package as the parent; should teleport to a different
* package to maximise bus bandwidth / cache availability.
* SMT & non-SMT cases are different.
*/
} while (curci = next, curci != pivot);
return bestci;
}
/*
* Estimate the migration of LWP to the other CPU.
* Take and return the CPU, if migration is needed.
@ -452,9 +525,10 @@ sched_migratable(const struct lwp *l, struct cpu_info *ci)
struct cpu_info *
sched_takecpu(struct lwp *l)
{
struct cpu_info *ci, *tci, *pivot, *next;
struct schedstate_percpu *spc, *ici_spc;
pri_t eprio, lpri, pri;
struct schedstate_percpu *spc, *tspc;
struct cpu_info *ci, *tci;
int flags;
pri_t eprio;
KASSERT(lwp_locked(l, NULL));
@ -467,33 +541,28 @@ sched_takecpu(struct lwp *l)
eprio = lwp_eprio(l);
/*
* For new LWPs (LSIDL), l_cpu was inherited from the parent when
* the LWP was created (and is probably still curcpu at this point).
* The child will initially be in close communication with the
* parent and share VM context and cache state. Look for an idle
* SMT sibling to run it, and failing that run on the same CPU as
* the parent.
* Look within the current CPU core.
*
* - For new LWPs (LSIDL), l_cpu was inherited from the parent when
* the LWP was created (and is probably still curcpu at this
* point). The child will initially be in close communication
* with the parent and share VM context and cache state. Look for
* an idle SMT sibling to run it, and failing that run on the same
* CPU as the parent.
*
* - For existing LWPs we'll try to send them back to the first CPU
* in the core if that's idle. This keeps LWPs clustered in the
* run queues of 1st class CPUs.
*/
if (l->l_stat == LSIDL) {
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
ici_spc = &tci->ci_schedstate;
if (l->l_psid == ici_spc->spc_psid &&
(ici_spc->spc_flags & SPCF_IDLE) != 0) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
flags = (l->l_stat == LSIDL ? SPCF_IDLE : SPCF_IDLE | SPCF_1STCLASS);
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags &&
sched_migratable(l, tci)) {
return tci;
}
if (spc->spc_psid == l->l_psid) {
return ci;
}
}
/* If SMT primary is idle, send it back there. */
tci = ci->ci_smt_primary;
if ((tci->ci_schedstate.spc_flags & SPCF_IDLE) != 0 &&
sched_migratable(l, tci)) {
return tci;
tci = tci->ci_sibling[CPUREL_CORE];
}
/* Make sure that thread is in appropriate processor-set */
@ -520,45 +589,7 @@ sched_takecpu(struct lwp *l)
* Look for the CPU with the lowest priority thread. In case of
* equal priority, choose the CPU with the fewest of threads.
*/
pivot = l->l_cpu;
ci = pivot;
tci = pivot;
lpri = PRI_COUNT;
do {
if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
/* Reached the end, start from the beginning. */
next = cpu_lookup(0);
}
if (!sched_migratable(l, ci))
continue;
ici_spc = &ci->ci_schedstate;
pri = MAX(ici_spc->spc_curpriority, ici_spc->spc_maxpriority);
if (pri > lpri)
continue;
if (pri == lpri) {
/* Pick the least busy CPU. */
if (spc->spc_count <= ici_spc->spc_count)
continue;
/* Prefer SMT primaries over secondaries. */
if ((ici_spc->spc_flags & SPCF_SMTPRIMARY) == 0 &&
(spc->spc_flags & SPCF_SMTPRIMARY) != 0)
continue;
}
lpri = pri;
tci = ci;
spc = ici_spc;
/* If this CPU is idle and an SMT primary, we're done. */
if ((spc->spc_flags & (SPCF_IDLE | SPCF_SMTPRIMARY)) ==
(SPCF_IDLE | SPCF_SMTPRIMARY)) {
break;
}
} while (ci = next, ci != pivot);
return tci;
return sched_bestcpu(l);
}
/*
@ -571,18 +602,27 @@ sched_catchlwp(struct cpu_info *ci)
struct schedstate_percpu *spc, *curspc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
bool smt;
bool gentle;
curspc = &curci->ci_schedstate;
spc = &ci->ci_schedstate;
/*
* Determine if the other CPU is our SMT twin. If it is, we'll be
* more aggressive.
* Be more aggressive in two cases:
* - the other CPU is our SMT twin (everything's in cache)
* - this CPU is first class, and the other is not
*/
smt = (curci->ci_package_id == ci->ci_package_id &&
curci->ci_core_id == ci->ci_core_id);
if ((!smt && spc->spc_mcount < min_catch) ||
if (curci->ci_package_id == ci->ci_package_id &&
curci->ci_core_id == ci->ci_core_id) {
gentle = false;
} else if ((curspc->spc_flags & SPCF_1STCLASS) != 0 &&
(spc->spc_flags & SPCF_1STCLASS) == 0) {
gentle = false;
} else {
gentle = true;
}
if ((gentle && spc->spc_mcount < min_catch) ||
curspc->spc_psid != spc->spc_psid) {
spc_unlock(ci);
return NULL;
@ -603,7 +643,7 @@ sched_catchlwp(struct cpu_info *ci)
/* Look for threads, whose are allowed to migrate */
if ((l->l_pflag & LP_BOUND) ||
(!smt && lwp_cache_hot(l)) ||
(gentle && lwp_cache_hot(l)) ||
!sched_migratable(l, curci)) {
l = TAILQ_NEXT(l, l_runq);
/* XXX Gap: could walk down priority list. */
@ -760,6 +800,28 @@ sched_idle_migrate(void)
spc_unlock(ci);
}
/*
* Try to steal an LWP from "tci".
*/
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
struct schedstate_percpu *spc, *tspc;
lwp_t *l;
spc = &ci->ci_schedstate;
tspc = &tci->ci_schedstate;
if (tspc->spc_mcount != 0 && spc->spc_psid == tspc->spc_psid) {
spc_dlock(ci, tci);
l = sched_catchlwp(tci);
spc_unlock(ci);
if (l != NULL) {
return true;
}
}
return false;
}
/*
* Called from each CPU's idle loop.
*/
@ -768,12 +830,18 @@ sched_idle(void)
{
struct cpu_info *ci = curcpu(), *tci = NULL;
struct schedstate_percpu *spc, *tspc;
lwp_t *l;
spc = &ci->ci_schedstate;
/*
* Handle LWP migrations off this CPU to another.  If there is a
* migration to do then go idle afterwards (we'll wake again soon),
* as we don't want to instantly steal back the LWP we just moved
* out.
*/
if (spc->spc_migrating != NULL) {
sched_idle_migrate();
return;
}
/* If this CPU is offline, or we have an LWP to run, we're done. */
@ -782,30 +850,29 @@ sched_idle(void)
}
/* If we have SMT then help our siblings out. */
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
tspc = &tci->ci_schedstate;
if (tspc->spc_mcount != 0 && spc->spc_psid == tspc->spc_psid) {
spc_dlock(ci, tci);
l = sched_catchlwp(tci);
spc_unlock(ci);
if (l != NULL) {
if (ci->ci_nsibling[CPUREL_CORE] > 1) {
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
if (sched_steal(ci, tci)) {
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
/*
* If not the first SMT in the core, and in the default
* processor set, the search ends here.
*/
if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
spc->spc_psid == PS_NONE) {
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
/* Reset the counter. */
spc->spc_avgcount = 0;
/* If an SMT secondary and in the default processor set, we're done. */
if ((spc->spc_flags & SPCF_SMTPRIMARY) == 0 &&
spc->spc_psid == PS_NONE) {
return;
}
/* Call the balancer. */
/* XXXAD Not greedy enough? Also think about asymmetric. */
sched_balance(ci);
tci = worker_ci;
tspc = &tci->ci_schedstate;
@ -820,6 +887,59 @@ sched_idle(void)
}
}
/*
* Called from mi_switch() when an LWP has been preempted / has yielded.
* The LWP is presently in the CPU's run queue. Here we look for a better
* CPU to teleport the LWP to; there may not be one.
*/
void
sched_preempted(struct lwp *l)
{
struct schedstate_percpu *tspc;
struct cpu_info *ci, *tci;
ci = l->l_cpu;
/*
* If this CPU is 1st class, or there's a realtime LWP in the mix
* (no time to waste), or there's a migration pending already, leave
* the LWP right here.
*/
if ((ci->ci_schedstate.spc_flags & SPCF_1STCLASS) != 0 ||
ci->ci_schedstate.spc_maxpriority >= PRI_USER_RT ||
l->l_target_cpu != NULL) {
return;
}
/*
* Fast path: if the first SMT in the core is idle, send it back
* there, because the cache is shared (cheap) and we want all LWPs
* to be clustered on 1st class CPUs (either running there or on
* their runqueues).
*/
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
const int flags = SPCF_IDLE | SPCF_1STCLASS;
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags &&
sched_migratable(l, tci)) {
l->l_target_cpu = tci;
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
/*
* Otherwise try to find a better CPU to take it, but don't move to
* a different 2nd class CPU; there's not much point.
*/
tci = sched_bestcpu(l);
if (tci != ci && (tci->ci_schedstate.spc_flags & SPCF_1STCLASS) != 0) {
l->l_target_cpu = tci;
return;
}
}
#else
/*
@ -838,6 +958,13 @@ sched_idle(void)
{
}
void
sched_preempted(struct lwp *l)
{
}
#endif /* MULTIPROCESSOR */
/*

View File: kern_synch.c

@ -1,4 +1,4 @@
/* $NetBSD: kern_synch.c,v 1.335 2020/01/08 17:38:42 ad Exp $ */
/* $NetBSD: kern_synch.c,v 1.336 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019
@ -69,7 +69,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.335 2020/01/08 17:38:42 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.336 2020/01/09 16:35:03 ad Exp $");
#include "opt_kstack.h"
#include "opt_dtrace.h"
@ -579,6 +579,8 @@ mi_switch(lwp_t *l)
l->l_stat = LSRUN;
lwp_setlock(l, spc->spc_mutex);
sched_enqueue(l);
sched_preempted(l);
/*
* Handle migration. Note that "migrating LWP" may
* be reset here, if interrupt/preemption happens

View File: sched_4bsd.c

@ -1,7 +1,7 @@
/* $NetBSD: sched_4bsd.c,v 1.41 2019/12/06 18:33:19 ad Exp $ */
/* $NetBSD: sched_4bsd.c,v 1.42 2020/01/09 16:35:03 ad Exp $ */
/*
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
@ -69,7 +69,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.41 2019/12/06 18:33:19 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.42 2020/01/09 16:35:03 ad Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
@ -147,6 +147,13 @@ sched_tick(struct cpu_info *ci)
* Indicate that the process should yield.
*/
pri = MAXPRI_KTHREAD;
} else if ((spc->spc_flags & SPCF_1STCLASS) == 0) {
/*
* For SMT or asymmetric systems push a little
* harder: if this is not a 1st class CPU, try to
* find a better one to run this LWP.
*/
pri = MAXPRI_KTHREAD;
} else {
spc->spc_flags |= SPCF_SEENRR;
}

View File: subr_cpu.c

@ -1,7 +1,8 @@
/* $NetBSD: subr_cpu.c,v 1.5 2020/01/05 20:27:43 ad Exp $ */
/* $NetBSD: subr_cpu.c,v 1.6 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019 The NetBSD Foundation, Inc.
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -60,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_cpu.c,v 1.5 2020/01/05 20:27:43 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: subr_cpu.c,v 1.6 2020/01/09 16:35:03 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -78,6 +79,7 @@ int ncpu __read_mostly;
int ncpuonline __read_mostly;
bool mp_online __read_mostly;
static bool cpu_topology_present __read_mostly;
static bool cpu_topology_haveslow __read_mostly;
int64_t cpu_counts[CPU_COUNT_MAX];
/* An array of CPUs. There are ncpu entries. */
@ -140,11 +142,12 @@ cpu_softintr_p(void)
*/
void
cpu_topology_set(struct cpu_info *ci, u_int package_id, u_int core_id,
u_int smt_id, u_int numa_id)
u_int smt_id, u_int numa_id, bool slow)
{
enum cpu_rel rel;
cpu_topology_present = true;
cpu_topology_haveslow |= slow;
ci->ci_package_id = package_id;
ci->ci_core_id = core_id;
ci->ci_smt_id = smt_id;
@ -181,10 +184,10 @@ cpu_topology_link(struct cpu_info *ci, struct cpu_info *ci2, enum cpu_rel rel)
static void
cpu_topology_dump(void)
{
#if DEBUG
#ifdef DEBUG
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *ci2;
const char *names[] = { "core", "package", "peer", "smt" };
const char *names[] = { "core", "pkg", "1st" };
enum cpu_rel rel;
int i;
@ -223,9 +226,8 @@ cpu_topology_fake1(struct cpu_info *ci)
if (!cpu_topology_present) {
ci->ci_package_id = cpu_index(ci);
}
ci->ci_smt_primary = ci;
ci->ci_schedstate.spc_flags |= SPCF_SMTPRIMARY;
cpu_topology_dump();
ci->ci_schedstate.spc_flags |=
(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
}
/*
@ -243,7 +245,7 @@ cpu_topology_fake(void)
cpu_topology_fake1(ci);
}
cpu_topology_dump();
}
}
/*
* Fix up basic CPU topology info. Right now that means attach each CPU to
@ -254,8 +256,7 @@ cpu_topology_init(void)
{
CPU_INFO_ITERATOR cii, cii2;
struct cpu_info *ci, *ci2, *ci3;
u_int ncore, npackage, npeer, minsmt;
bool symmetric;
u_int minsmt, mincore;
if (!cpu_topology_present) {
cpu_topology_fake();
@ -264,6 +265,8 @@ cpu_topology_init(void)
/* Find siblings in same core and package. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
for (CPU_INFO_FOREACH(cii2, ci2)) {
/* Avoid bad things happening. */
if (ci2->ci_package_id == ci->ci_package_id &&
@ -295,54 +298,7 @@ cpu_topology_init(void)
}
}
/* Find peers in other packages, and peer SMTs in same package. */
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_nsibling[CPUREL_PEER] <= 1) {
for (CPU_INFO_FOREACH(cii2, ci2)) {
if (ci != ci2 &&
ci->ci_package_id != ci2->ci_package_id &&
ci->ci_core_id == ci2->ci_core_id &&
ci->ci_smt_id == ci2->ci_smt_id) {
cpu_topology_link(ci, ci2,
CPUREL_PEER);
break;
}
}
}
if (ci->ci_nsibling[CPUREL_SMT] <= 1) {
for (CPU_INFO_FOREACH(cii2, ci2)) {
if (ci != ci2 &&
ci->ci_package_id == ci2->ci_package_id &&
ci->ci_core_id != ci2->ci_core_id &&
ci->ci_smt_id == ci2->ci_smt_id) {
cpu_topology_link(ci, ci2,
CPUREL_SMT);
break;
}
}
}
}
/* Determine whether the topology is bogus/symmetric. */
npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE];
ncore = curcpu()->ci_nsibling[CPUREL_CORE];
npeer = curcpu()->ci_nsibling[CPUREL_PEER];
symmetric = true;
for (CPU_INFO_FOREACH(cii, ci)) {
if (npackage != ci->ci_nsibling[CPUREL_PACKAGE] ||
ncore != ci->ci_nsibling[CPUREL_CORE] ||
npeer != ci->ci_nsibling[CPUREL_PEER]) {
symmetric = false;
}
}
cpu_topology_dump();
if (symmetric == false) {
printf("cpu_topology_init: not symmetric, faking it\n");
cpu_topology_fake();
return;
}
/* Identify SMT primary in each core. */
/* Identify lowest numbered SMT in each core. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci2 = ci3 = ci;
minsmt = ci->ci_smt_id;
@ -353,18 +309,93 @@ cpu_topology_init(void)
}
ci2 = ci2->ci_sibling[CPUREL_CORE];
} while (ci2 != ci);
ci3->ci_schedstate.spc_flags |= SPCF_CORE1ST;
}
/*
* Mark the SMT primary, and walk back over the list
* pointing secondaries to the primary.
*/
ci3->ci_schedstate.spc_flags |= SPCF_SMTPRIMARY;
/* Identify lowest numbered SMT in each package. */
ci3 = NULL;
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) == 0) {
continue;
}
ci2 = ci3 = ci;
mincore = ci->ci_core_id;
do {
if ((ci2->ci_schedstate.spc_flags &
SPCF_CORE1ST) != 0 &&
ci2->ci_core_id < mincore) {
ci3 = ci2;
mincore = ci2->ci_core_id;
}
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
if ((ci3->ci_schedstate.spc_flags & SPCF_PACKAGE1ST) != 0) {
/* Already identified - nothing more to do. */
continue;
}
ci3->ci_schedstate.spc_flags |= SPCF_PACKAGE1ST;
/* Walk through all CPUs in package and point to first. */
ci2 = ci;
do {
ci2->ci_smt_primary = ci3;
ci2 = ci2->ci_sibling[CPUREL_CORE];
ci2->ci_sibling[CPUREL_PACKAGE1ST] = ci3;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
/* Now look for somebody else to link to. */
for (CPU_INFO_FOREACH(cii2, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_PACKAGE1ST)
!= 0 && ci2 != ci3) {
cpu_topology_link(ci3, ci2, CPUREL_PACKAGE1ST);
break;
}
}
}
/* Walk through all packages, starting with value of ci3 from above. */
KASSERT(ci3 != NULL);
ci = ci3;
do {
/* Walk through CPUs in the package and copy in PACKAGE1ST. */
ci2 = ci;
do {
ci2->ci_sibling[CPUREL_PACKAGE1ST] =
ci->ci_sibling[CPUREL_PACKAGE1ST];
ci2->ci_nsibling[CPUREL_PACKAGE1ST] =
ci->ci_nsibling[CPUREL_PACKAGE1ST];
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
ci = ci->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci != ci3);
if (cpu_topology_haveslow) {
/*
* For asymmetric systems where some CPUs are slower than
* others, mark first class CPUs for the scheduler. This
* conflicts with SMT right now so whinge if observed.
*/
if (curcpu()->ci_nsibling[CPUREL_CORE] > 1) {
printf("cpu_topology_init: asymmetric & SMT??\n");
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (!ci->ci_is_slow) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
} else {
/*
* For any other configuration mark the 1st CPU in each
* core as a first class CPU.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) != 0) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
}
cpu_topology_dump();
}
/*

View File: cpu.h

@ -1,4 +1,4 @@
/* $NetBSD: cpu.h,v 1.47 2019/12/21 12:53:53 ad Exp $ */
/* $NetBSD: cpu.h,v 1.48 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2007 YAMAMOTO Takashi,
@ -90,7 +90,7 @@ bool cpu_kpreempt_disabled(void);
int cpu_lwp_setprivate(struct lwp *, void *);
void cpu_intr_redistribute(void);
u_int cpu_intr_count(struct cpu_info *);
void cpu_topology_set(struct cpu_info *, u_int, u_int, u_int, u_int);
void cpu_topology_set(struct cpu_info *, u_int, u_int, u_int, u_int, bool);
void cpu_topology_init(void);
#endif

View File: cpu_data.h

@ -1,4 +1,4 @@
/* $NetBSD: cpu_data.h,v 1.46 2019/12/21 14:33:18 ad Exp $ */
/* $NetBSD: cpu_data.h,v 1.47 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2004, 2006, 2007, 2008, 2019 The NetBSD Foundation, Inc.
@ -105,10 +105,28 @@ enum cpu_count {
struct lockdebug;
enum cpu_rel {
CPUREL_CORE, /* CPUs in the same core */
CPUREL_PACKAGE, /* CPUs in the same package */
CPUREL_PEER, /* peer CPUs in other packages */
CPUREL_SMT, /* peer SMTs in same package */
/*
* This is a circular list of peer CPUs in the same core (SMT /
* Hyperthreading). It always includes the CPU it is referenced
* from as the last entry.
*/
CPUREL_CORE,
/*
* This is a circular list of peer CPUs in the same physical
* package. It always includes the CPU it is referenced from as
* the last entry.
*/
CPUREL_PACKAGE,
/*
* This is a circular list of the first CPUs in each physical
* package. It may or may not include the CPU it is referenced
* from.
*/
CPUREL_PACKAGE1ST,
/* Terminator. */
CPUREL_COUNT
};
@ -130,9 +148,9 @@ struct cpu_data {
u_int cpu_core_id;
u_int cpu_smt_id;
u_int cpu_numa_id;
bool cpu_is_slow;
u_int cpu_nsibling[CPUREL_COUNT];
struct cpu_info *cpu_sibling[CPUREL_COUNT];
struct cpu_info *cpu_smt_primary;
/*
* This section is mostly CPU-private.
@ -182,9 +200,9 @@ struct cpu_data {
#define ci_core_id ci_data.cpu_core_id
#define ci_smt_id ci_data.cpu_smt_id
#define ci_numa_id ci_data.cpu_numa_id
#define ci_is_slow ci_data.cpu_is_slow
#define ci_nsibling ci_data.cpu_nsibling
#define ci_sibling ci_data.cpu_sibling
#define ci_smt_primary ci_data.cpu_smt_primary
#define ci_faultrng ci_data.cpu_faultrng
#define ci_counts ci_data.cpu_counts
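
Each of the relationships above is stored as a circular list threaded through
ci_sibling[], so consumers walk it with a do/while loop rather than a
NULL-terminated scan; the same idiom appears in kern_runq.c, subr_cpu.c and
uvm_page.c in this commit.  An illustrative sketch (not part of the change),
counting the CPUs that share a core:

/*
 * Illustrative only: walk the CPUREL_CORE circular list starting and
 * ending at "ci".  The result should match ci->ci_nsibling[CPUREL_CORE].
 */
static u_int
example_core_size(struct cpu_info *ci)
{
	struct cpu_info *ci2;
	u_int n = 0;

	ci2 = ci;
	do {
		n++;
		ci2 = ci2->ci_sibling[CPUREL_CORE];
	} while (ci2 != ci);

	return n;
}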

View File: sched.h

@ -1,7 +1,7 @@
/* $NetBSD: sched.h,v 1.85 2020/01/06 10:21:21 martin Exp $ */
/* $NetBSD: sched.h,v 1.86 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2007, 2008, 2019
* Copyright (c) 1999, 2000, 2001, 2002, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
@ -182,8 +182,10 @@ struct schedstate_percpu {
#define SPCF_OFFLINE 0x0004 /* CPU marked offline */
#define SPCF_RUNNING 0x0008 /* CPU is running */
#define SPCF_NOINTR 0x0010 /* shielded from interrupts */
#define SPCF_SMTPRIMARY 0x0020 /* CPU is first thread in core */
#define SPCF_IDLE 0x0040 /* CPU is currently idle */
#define SPCF_IDLE 0x0020 /* CPU is currently idle */
#define SPCF_1STCLASS 0x0040 /* first class scheduling entity */
#define SPCF_CORE1ST 0x0100 /* first CPU in core */
#define SPCF_PACKAGE1ST 0x0200 /* first CPU in package */
#define SPCF_SWITCHCLEAR (SPCF_SEENRR|SPCF_SHOULDYIELD)
@ -235,6 +237,7 @@ void sched_pstats_hook(struct lwp *, int);
bool sched_curcpu_runnable_p(void);
void sched_dequeue(struct lwp *);
void sched_enqueue(struct lwp *);
void sched_preempted(struct lwp *);
void sched_resched_cpu(struct cpu_info *, pri_t, bool);
void sched_resched_lwp(struct lwp *, bool);
struct lwp * sched_nextlwp(void);

View File: uvm_page.c

@ -1,4 +1,4 @@
/* $NetBSD: uvm_page.c,v 1.221 2020/01/05 22:01:09 ad Exp $ */
/* $NetBSD: uvm_page.c,v 1.222 2020/01/09 16:35:03 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
@ -95,7 +95,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.221 2020/01/05 22:01:09 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.222 2020/01/09 16:35:03 ad Exp $");
#include "opt_ddb.h"
#include "opt_uvm.h"
@ -921,13 +921,7 @@ uvm_page_rebucket(void)
* packages evenly. uvm_pagefree() will reassign pages to the
* freeing CPU's preferred bucket on free.
*/
npackage = 0;
ci = curcpu();
ci2 = ci;
do {
npackage++;
ci2 = ci2->ci_sibling[CPUREL_PEER];
} while (ci2 != ci);
npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
/*
* Figure out how to arrange the packages & buckets, and the total
@ -944,7 +938,7 @@ uvm_page_rebucket(void)
*/
npackage = 0;
ci = curcpu();
ci2 = ci;
ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
do {
/*
* In the inner loop, scroll through all CPUs in the package
@ -956,8 +950,8 @@ uvm_page_rebucket(void)
ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
} while (ci3 != ci2);
npackage++;
ci2 = ci2->ci_sibling[CPUREL_PEER];
} while (ci2 != ci);
ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
aprint_debug("UVM: using package allocation scheme, "
"%d package(s) per bucket\n", 1 << shift);