NetBSD/sys/kern/kern_clock.c

451 lines
12 KiB
C
Raw Normal View History

/* $NetBSD: kern_clock.c,v 1.126 2008/10/05 21:57:20 pooka Exp $ */
/*-
* Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
2005-03-02 14:05:34 +03:00
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
*/
2001-11-12 18:25:01 +03:00
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.126 2008/10/05 21:57:20 pooka Exp $");
2001-11-12 18:25:01 +03:00
#include "opt_ntp.h"
#include "opt_perfctrs.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
1996-02-04 05:15:01 +03:00
#include <sys/signalvar.h>
1996-02-09 21:59:18 +03:00
#include <sys/sysctl.h>
#include <sys/timex.h>
Scheduler bug fixes and reorganization * fix the ancient nice(1) bug, where nice +20 processes incorrectly steal 10 - 20% of the CPU, (or even more depending on load average) * provide a new schedclk() mechanism at a new clock at schedhz, so high platform hz values don't cause nice +0 processes to look like they are niced * change the algorithm slightly, and reorganize the code a lot * fix percent-CPU calculation bugs, and eliminate some no-op code === nice bug === Correctly divide the scheduler queues between niced and compute-bound processes. The current nice weight of two (sort of, see `algorithm change' below) neatly divides the USRPRI queues in half; this should have been used to clip p_estcpu, instead of UCHAR_MAX. Besides being the wrong amount, clipping an unsigned char to UCHAR_MAX is a no-op, and it was done after decay_cpu() which can only _reduce_ the value. It has to be kept <= NICE_WEIGHT * PRIO_MAX - PPQ or processes can scheduler-penalize themselves onto the same queue as nice +20 processes. (Or even a higher one.) === New schedclk() mechansism === Some platforms should be cutting down stathz before hitting the scheduler, since the scheduler algorithm only works right in the vicinity of 64 Hz. Rather than prescale hz, then scale back and forth by 4 every time p_estcpu is touched (each occurance an abstraction violation), use p_estcpu without scaling and require schedhz to be generated directly at the right frequency. Use a default stathz (well, actually, profhz) / 4, so nothing changes unless a platform defines schedhz and a new clock. Define these for alpha, where hz==1024, and nice was totally broke. === Algorithm change === The nice value used to be added to the exponentially-decayed scheduler history value p_estcpu, in _addition_ to be incorporated directly (with greater wieght) into the priority calculation. At first glance, it appears to be a pointless increase of 1/8 the nice effect (pri = p_estcpu/4 + nice*2), but it's actually at least 3x that because it will ramp up linearly but be decayed only exponentially, thus converging to an additional .75 nice for a loadaverage of one. I killed this, it makes the behavior hard to control, almost impossible to analyze, and the effect (~~nothing at for the first second, then somewhat increased niceness after three seconds or more, depending on load average) pointless. === Other bugs === hz -> profhz in the p_pctcpu = f(p_cpticks) calcuation. Collect scheduler functionality. Try to put each abstraction in just one place.
1999-02-23 05:56:03 +03:00
#include <sys/sched.h>
2003-01-18 13:06:22 +03:00
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <uvm/uvm_extern.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
/*
* Clock handling routines.
*
* This code is written to operate with two timers that run independently of
* each other. The main clock, running hz times per second, is used to keep
* track of real time. The second timer handles kernel and user profiling,
* and does resource use estimation. If the second timer is programmable,
* it is randomized to avoid aliasing between the two clocks. For example,
2004-02-13 14:36:08 +03:00
* the randomization prevents an adversary from always giving up the CPU
* just before its quantum expires. Otherwise, it would never accumulate
2004-02-13 14:36:08 +03:00
* CPU ticks. The mean frequency of the second timer is stathz.
*
* If no second timer exists, stathz will be zero; in this case we drive
* profiling and statistics off the main clock. This WILL NOT be accurate;
* do not do it unless absolutely necessary.
*
* The statistics clock may (or may not) be run at a higher rate while
* profiling. This profile clock runs at profhz. We require that profhz
* be an integral multiple of stathz.
*
* If the statistics clock is running fast, it must be divided by the ratio
* profhz/stathz for statistics. (For profiling, every tick counts.)
*/
int stathz;
int profhz;
int profsrc;
2001-05-06 17:46:34 +04:00
int schedhz;
int profprocs;
int hardclock_ticks;
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
static u_int get_intr_timecount(struct timecounter *);
static struct timecounter intr_timecounter = {
get_intr_timecount, /* get_timecount */
0, /* no poll_pps */
~0u, /* counter_mask */
0, /* frequency */
"clockinterrupt", /* name */
2006-09-02 10:21:32 +04:00
0, /* quality - minimum implementation level for a clock */
NULL, /* prev */
NULL, /* next */
};
static u_int
get_intr_timecount(struct timecounter *tc)
{
return (u_int)hardclock_ticks;
}
/*
* Initialize clock frequencies and start both clocks running.
*/
void
2000-08-01 08:57:28 +04:00
initclocks(void)
{
2000-03-30 13:27:11 +04:00
int i;
/*
* Set divisors to 1 (normal case) and let the machine-specific
* code do its bit.
*/
psdiv = 1;
/*
* provide minimum default time counter
* will only run at interrupt resolution
*/
intr_timecounter.tc_frequency = hz;
tc_init(&intr_timecounter);
cpu_initclocks();
/*
* Compute profhz and stathz, fix profhz if needed.
*/
i = stathz ? stathz : hz;
if (profhz == 0)
profhz = i;
psratio = profhz / i;
if (schedhz == 0) {
/* 16Hz is best */
hardscheddiv = hz / 16;
if (hardscheddiv <= 0)
panic("hardscheddiv");
}
}
/*
* The real-time timer, interrupting hz times per second.
*/
void
2000-08-01 08:57:28 +04:00
hardclock(struct clockframe *frame)
{
2003-01-18 13:06:22 +03:00
struct lwp *l;
struct cpu_info *ci;
ci = curcpu();
l = ci->ci_data.cpu_onproc;
timer_tick(l, CLKF_USERMODE(frame));
/*
* If no separate statistics clock is available, run it from here.
*/
if (stathz == 0)
statclock(frame);
/*
* If no separate schedclock is provided, call it here
* at about 16 Hz.
*/
if (schedhz == 0) {
if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
schedclock(l);
ci->ci_schedstate.spc_schedticks = hardscheddiv;
}
}
if ((--ci->ci_schedstate.spc_ticks) <= 0)
sched_tick(ci);
2005-02-27 00:34:55 +03:00
if (CPU_IS_PRIMARY(ci)) {
hardclock_ticks++;
tc_ticktock();
}
/*
* Update real-time timeout queue.
*/
callout_hardclock();
}
/*
* Start profiling on a process.
*
* Kernel profiling passes proc0 which never exits and hence
* keeps the profile clock running constantly.
*/
void
2000-08-01 08:57:28 +04:00
startprofclock(struct proc *p)
{
KASSERT(mutex_owned(&p->p_stmutex));
2007-02-10 00:55:00 +03:00
if ((p->p_stflag & PST_PROFIL) == 0) {
p->p_stflag |= PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
if (++profprocs == 1 && stathz != 0)
psdiv = psratio;
}
}
/*
* Stop profiling on a process.
*/
void
2000-08-01 08:57:28 +04:00
stopprofclock(struct proc *p)
{
KASSERT(mutex_owned(&p->p_stmutex));
2007-02-10 00:55:00 +03:00
if (p->p_stflag & PST_PROFIL) {
p->p_stflag &= ~PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
if (--profprocs == 0 && stathz != 0)
psdiv = 1;
}
}
#if defined(PERFCTRS)
/*
* Independent profiling "tick" in case we're using a separate
* clock or profiling event source. Currently, that's just
* performance counters--hence the wrapper.
*/
void
proftick(struct clockframe *frame)
{
#ifdef GPROF
2005-02-27 00:34:55 +03:00
struct gmonparam *g;
intptr_t i;
#endif
2007-02-10 00:55:00 +03:00
struct lwp *l;
struct proc *p;
l = curcpu()->ci_data.cpu_onproc;
2007-02-10 00:55:00 +03:00
p = (l ? l->l_proc : NULL);
if (CLKF_USERMODE(frame)) {
2007-02-10 00:55:00 +03:00
mutex_spin_enter(&p->p_stmutex);
if (p->p_stflag & PST_PROFIL)
addupc_intr(l, CLKF_PC(frame));
mutex_spin_exit(&p->p_stmutex);
} else {
#ifdef GPROF
g = &_gmonparam;
if (g->state == GMON_PROF_ON) {
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i]++;
}
}
#endif
#ifdef LWP_PC
if (p != NULL && (p->p_stflag & PST_PROFIL) != 0)
2007-10-03 18:49:24 +04:00
addupc_intr(l, LWP_PC(l));
2005-02-27 00:34:55 +03:00
#endif
}
}
#endif
void
schedclock(struct lwp *l)
{
struct cpu_info *ci;
ci = l->l_cpu;
/* Accumulate syscall and context switch counts. */
atomic_add_int((unsigned *)&uvmexp.swtch, ci->ci_data.cpu_nswtch);
ci->ci_data.cpu_nswtch = 0;
atomic_add_int((unsigned *)&uvmexp.syscalls, ci->ci_data.cpu_nsyscall);
ci->ci_data.cpu_nsyscall = 0;
atomic_add_int((unsigned *)&uvmexp.traps, ci->ci_data.cpu_ntrap);
ci->ci_data.cpu_ntrap = 0;
if ((l->l_flag & LW_IDLE) != 0)
return;
sched_schedclock(l);
}
/*
* Statistics clock. Grab profile sample, and if divider reaches 0,
* do process and kernel statistics.
*/
void
2000-08-01 08:57:28 +04:00
statclock(struct clockframe *frame)
{
#ifdef GPROF
2000-03-30 13:27:11 +04:00
struct gmonparam *g;
intptr_t i;
#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
2000-03-30 13:27:11 +04:00
struct proc *p;
2006-04-15 06:12:49 +04:00
struct lwp *l;
/*
* Notice changes in divisor frequency, and adjust clock
* frequency accordingly.
*/
if (spc->spc_psdiv != psdiv) {
spc->spc_psdiv = psdiv;
spc->spc_pscnt = psdiv;
if (psdiv == 1) {
setstatclockrate(stathz);
} else {
2005-02-27 00:34:55 +03:00
setstatclockrate(profhz);
}
}
l = ci->ci_data.cpu_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
/*
* don't account idle lwps as swapper.
*/
p = NULL;
} else {
p = l->l_proc;
2007-02-10 00:55:00 +03:00
mutex_spin_enter(&p->p_stmutex);
}
if (CLKF_USERMODE(frame)) {
2007-02-10 00:55:00 +03:00
if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
addupc_intr(l, CLKF_PC(frame));
if (--spc->spc_pscnt > 0) {
mutex_spin_exit(&p->p_stmutex);
return;
2007-02-10 00:55:00 +03:00
}
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled record the tick.
*/
p->p_uticks++;
if (p->p_nice > NZERO)
spc->spc_cp_time[CP_NICE]++;
else
spc->spc_cp_time[CP_USER]++;
} else {
#ifdef GPROF
/*
* Kernel statistics are just like addupc_intr, only easier.
*/
g = &_gmonparam;
if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i]++;
}
}
#endif
2003-01-18 13:06:22 +03:00
#ifdef LWP_PC
if (p != NULL && profsrc == PROFSRC_CLOCK &&
(p->p_stflag & PST_PROFIL)) {
2007-02-10 00:55:00 +03:00
addupc_intr(l, LWP_PC(l));
}
#endif
2007-02-10 00:55:00 +03:00
if (--spc->spc_pscnt > 0) {
if (p != NULL)
mutex_spin_exit(&p->p_stmutex);
return;
2007-02-10 00:55:00 +03:00
}
/*
* Came from kernel mode, so we were:
* - handling an interrupt,
* - doing syscall or trap work on behalf of the current
* user process, or
* - spinning in the idle loop.
* Whichever it is, charge the time as appropriate.
* Note that we charge interrupts to the current process,
* regardless of whether they are ``for'' that process,
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
if (p != NULL) {
p->p_iticks++;
}
spc->spc_cp_time[CP_INTR]++;
} else if (p != NULL) {
p->p_sticks++;
spc->spc_cp_time[CP_SYS]++;
} else {
spc->spc_cp_time[CP_IDLE]++;
}
}
spc->spc_pscnt = psdiv;
if (p != NULL) {
atomic_inc_uint(&l->l_cpticks);
2007-02-10 00:55:00 +03:00
mutex_spin_exit(&p->p_stmutex);
}
}