/* $NetBSD: sched_4bsd.c,v 1.33 2017/07/14 13:23:48 maxv Exp $ */ /* * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and * Daniel Sieger. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.33 2017/07/14 13:23:48 maxv Exp $"); #include "opt_ddb.h" #include "opt_lockdebug.h" #include "opt_perfctrs.h" #include #include #include #include #include #include #include #include #include #include #include static void updatepri(struct lwp *); static void resetpriority(struct lwp *); extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */ /* Number of hardclock ticks per sched_tick() */ static int rrticks __read_mostly; /* * Force switch among equal priority processes every 100ms. * Called from hardclock every hz/10 == rrticks hardclock ticks. * * There's no need to lock anywhere in this routine, as it's * CPU-local and runs at IPL_SCHED (called from clock interrupt). */ /* ARGSUSED */ void sched_tick(struct cpu_info *ci) { struct schedstate_percpu *spc = &ci->ci_schedstate; lwp_t *l; spc->spc_ticks = rrticks; if (CURCPU_IDLE_P()) { cpu_need_resched(ci, 0); return; } l = ci->ci_data.cpu_onproc; if (l == NULL) { return; } switch (l->l_class) { case SCHED_FIFO: /* No timeslicing for FIFO jobs. */ break; case SCHED_RR: /* Force it into mi_switch() to look for other jobs to run. */ cpu_need_resched(ci, RESCHED_KPREEMPT); break; default: if (spc->spc_flags & SPCF_SHOULDYIELD) { /* * Process is stuck in kernel somewhere, probably * due to buggy or inefficient code. Force a * kernel preemption. */ cpu_need_resched(ci, RESCHED_KPREEMPT); } else if (spc->spc_flags & SPCF_SEENRR) { /* * The process has already been through a roundrobin * without switching and may be hogging the CPU. * Indicate that the process should yield. */ spc->spc_flags |= SPCF_SHOULDYIELD; cpu_need_resched(ci, 0); } else { spc->spc_flags |= SPCF_SEENRR; } break; } } /* * Why PRIO_MAX - 2? From setpriority(2): * * prio is a value in the range -20 to 20. The default priority is * 0; lower priorities cause more favorable scheduling. A value of * 19 or 20 will schedule a process only when nothing at priority <= * 0 is runnable. * * This gives estcpu influence over 18 priority levels, and leaves nice * with 40 levels. One way to think about it is that nice has 20 levels * either side of estcpu's 18. */ #define ESTCPU_SHIFT 11 #define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT) #define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1)) #define ESTCPULIM(e) min((e), ESTCPU_MAX) /* * The main parameter used by this algorithm is 'l_estcpu'. It is an estimate * of the recent CPU utilization of the thread. * * l_estcpu is: * - increased each time the hardclock ticks and the thread is found to * be executing, in sched_schedclock() called from hardclock() * - decreased (filtered) on each sched tick, in sched_pstats_hook() * If the lwp is sleeping for more than a second, we don't touch l_estcpu: it * will be updated in sched_setrunnable() when the lwp wakes up, in burst mode * (ie, we decrease it n times). * * Note that hardclock updates l_estcpu and l_cpticks independently. * * ----------------------------------------------------------------------------- * * Here we describe how l_estcpu is decreased. * * Constants for digital decay (filter): * 90% of l_estcpu usage in (5 * loadavg) seconds * * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. That is, we * want to compute a value of decay such that the following loop: * for (i = 0; i < (5 * loadavg); i++) * l_estcpu *= decay; * will result in * l_estcpu *= 0.1; * for all values of loadavg. * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * And finally, the corresponding value of decay we're using is: * decay = (2 * loadavg) / (2 * loadavg + 1) * * ----------------------------------------------------------------------------- * * Now, let's prove that the value of decay stated above will always fulfill * the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given [factor ** (5 * loadavg) =~ .1], prove [factor == b/(b+1)]. * 2) Given [b/(b+1) ** power =~ .1], prove [power == (5 * loadavg)]. * * Facts: * * For x real: exp(x) = 0! + x**1/1! + x**2/2! + ... * Therefore, for x close to zero, exp(x) =~ 1 + x. * In turn, for b large enough, exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * * * For b large enough, (b-1)/b =~ b/(b+1). * * * For x belonging to [-1;1[, ln(1-x) = - x - x**2/2 - x**3/3 - ... * Therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * * * ln(0.1) =~ -2.30 * * Proof of (1): * factor ** (5 * loadavg) =~ 0.1 * => ln(factor) =~ -2.30 / (5 * loadavg) * => factor =~ exp(-1 / ((5 / 2.30) * loadavg)) * =~ exp(-1 / (2 * loadavg)) * =~ exp(-1 / b) * =~ (b - 1) / b * =~ b / (b + 1) * =~ (2 * loadavg) / ((2 * loadavg) + 1) * * Proof of (2): * (b / (b + 1)) ** power =~ .1 * => power * ln(b / (b + 1)) =~ -2.30 * => power * (-1 / (b + 1)) =~ -2.30 * => power =~ 2.30 * (b + 1) * => power =~ 4.60 * loadavg + 2.30 * => power =~ 5 * loadavg * * Conclusion: decay = (2 * loadavg) / (2 * loadavg + 1) */ /* See calculations above */ #define loadfactor(loadavg) (2 * (loadavg)) static fixpt_t decay_cpu(fixpt_t loadfac, fixpt_t estcpu) { if (estcpu == 0) { return 0; } #if !defined(_LP64) /* avoid 64bit arithmetics. */ #define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1)) if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) { return estcpu * loadfac / (loadfac + FSCALE); } #endif return (uint64_t)estcpu * loadfac / (loadfac + FSCALE); } static fixpt_t decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n) { /* * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT), * if we slept for at least seven times the loadfactor, we will decay * l_estcpu to less than (1 << ESTCPU_SHIFT), and therefore we can * return zero directly. * * Note that our ESTCPU_MAX is actually much smaller than * (255 << ESTCPU_SHIFT). */ if ((n << FSHIFT) >= 7 * loadfac) { return 0; } while (estcpu != 0 && n > 1) { estcpu = decay_cpu(loadfac, estcpu); n--; } return estcpu; } /* * sched_pstats_hook: * * Periodically called from sched_pstats(); used to recalculate priorities. */ void sched_pstats_hook(struct lwp *l, int batch) { fixpt_t loadfac; /* * If the LWP has slept an entire second, stop recalculating * its priority until it wakes up. */ KASSERT(lwp_locked(l, NULL)); if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED) { if (l->l_slptime > 1) { return; } } loadfac = loadfactor(averunnable.ldavg[0]); l->l_estcpu = decay_cpu(loadfac, l->l_estcpu); resetpriority(l); } /* * Recalculate the priority of an LWP after it has slept for a while. */ static void updatepri(struct lwp *l) { fixpt_t loadfac; KASSERT(lwp_locked(l, NULL)); KASSERT(l->l_slptime > 1); loadfac = loadfactor(averunnable.ldavg[0]); l->l_slptime--; /* the first time was done in sched_pstats */ l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime); resetpriority(l); } void sched_rqinit(void) { } void sched_setrunnable(struct lwp *l) { if (l->l_slptime > 1) updatepri(l); } void sched_nice(struct proc *p, int n) { struct lwp *l; KASSERT(mutex_owned(p->p_lock)); p->p_nice = n; LIST_FOREACH(l, &p->p_lwps, l_sibling) { lwp_lock(l); resetpriority(l); lwp_unlock(l); } } /* * Recompute the priority of an LWP. Arrange to reschedule if * the resulting priority is better than that of the current LWP. */ static void resetpriority(struct lwp *l) { pri_t pri; struct proc *p = l->l_proc; KASSERT(lwp_locked(l, NULL)); if (l->l_class != SCHED_OTHER) return; /* See comments above ESTCPU_SHIFT definition. */ pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice; pri = imax(pri, 0); if (pri != l->l_priority) lwp_changepri(l, pri); } /* * We adjust the priority of the current LWP. The priority of a LWP * gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu) * is increased here. The formula for computing priorities will compute a * different value each time l_estcpu increases. This can cause a switch, * but unless the priority crosses a PPQ boundary the actual queue will not * change. The CPU usage estimator ramps up quite quickly when the process * is running (linearly), and decays away exponentially, at a rate which is * proportionally slower when the system is busy. The basic principle is * that the system will 90% forget that the process used a lot of CPU time * in (5 * loadavg) seconds. This causes the system to favor processes which * haven't run much recently, and to round-robin among other processes. */ void sched_schedclock(struct lwp *l) { if (l->l_class != SCHED_OTHER) return; KASSERT(!CURCPU_IDLE_P()); l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM); lwp_lock(l); resetpriority(l); lwp_unlock(l); } /* * sched_proc_fork: * * Inherit the parent's scheduler history. */ void sched_proc_fork(struct proc *parent, struct proc *child) { lwp_t *pl; KASSERT(mutex_owned(parent->p_lock)); pl = LIST_FIRST(&parent->p_lwps); child->p_estcpu_inherited = pl->l_estcpu; child->p_forktime = sched_pstats_ticks; } /* * sched_proc_exit: * * Chargeback parents for the sins of their children. */ void sched_proc_exit(struct proc *parent, struct proc *child) { fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); fixpt_t estcpu; lwp_t *pl, *cl; /* XXX Only if parent != init?? */ mutex_enter(parent->p_lock); pl = LIST_FIRST(&parent->p_lwps); cl = LIST_FIRST(&child->p_lwps); estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited, sched_pstats_ticks - child->p_forktime); if (cl->l_estcpu > estcpu) { lwp_lock(pl); pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu); lwp_unlock(pl); } mutex_exit(parent->p_lock); } void sched_wakeup(struct lwp *l) { } void sched_slept(struct lwp *l) { } void sched_lwp_fork(struct lwp *l1, struct lwp *l2) { l2->l_estcpu = l1->l_estcpu; } void sched_lwp_collect(struct lwp *t) { lwp_t *l; /* Absorb estcpu value of collected LWP. */ l = curlwp; lwp_lock(l); l->l_estcpu += t->l_estcpu; lwp_unlock(l); } void sched_oncpu(lwp_t *l) { } void sched_newts(lwp_t *l) { } /* * Sysctl nodes and initialization. */ static int sysctl_sched_rtts(SYSCTLFN_ARGS) { struct sysctlnode node; int rttsms = hztoms(rrticks); node = *rnode; node.sysctl_data = &rttsms; return sysctl_lookup(SYSCTLFN_CALL(&node)); } SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup") { const struct sysctlnode *node = NULL; sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "sched", SYSCTL_DESCR("Scheduler options"), NULL, 0, NULL, 0, CTL_KERN, CTL_CREATE, CTL_EOL); if (node == NULL) return; rrticks = hz / 10; sysctl_createv(NULL, 0, &node, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRING, "name", NULL, NULL, 0, __UNCONST("4.4BSD"), 0, CTL_CREATE, CTL_EOL); sysctl_createv(NULL, 0, &node, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "rtts", SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"), sysctl_sched_rtts, 0, NULL, 0, CTL_CREATE, CTL_EOL); }