From 68de460f3937896b48107c08cf4a1264c00fc1dc Mon Sep 17 00:00:00 2001 From: thorpej Date: Thu, 14 Dec 2000 08:42:28 +0000 Subject: [PATCH] Import ALTQ from KAME. --- sys/altq/altq.h | 198 ++++ sys/altq/altq_afmap.c | 411 ++++++++ sys/altq/altq_afmap.h | 103 ++ sys/altq/altq_blue.c | 690 ++++++++++++ sys/altq/altq_blue.h | 118 +++ sys/altq/altq_cbq.c | 972 +++++++++++++++++ sys/altq/altq_cbq.h | 219 ++++ sys/altq/altq_cdnr.c | 1374 ++++++++++++++++++++++++ sys/altq/altq_cdnr.h | 333 ++++++ sys/altq/altq_classq.h | 203 ++++ sys/altq/altq_conf.c | 467 ++++++++ sys/altq/altq_conf.h | 109 ++ sys/altq/altq_fifoq.c | 414 ++++++++ sys/altq/altq_fifoq.h | 79 ++ sys/altq/altq_flowvalve.h | 92 ++ sys/altq/altq_hfsc.c | 1810 +++++++++++++++++++++++++++++++ sys/altq/altq_hfsc.h | 278 +++++ sys/altq/altq_localq.c | 68 ++ sys/altq/altq_priq.c | 865 +++++++++++++++ sys/altq/altq_priq.h | 160 +++ sys/altq/altq_red.c | 1474 ++++++++++++++++++++++++++ sys/altq/altq_red.h | 189 ++++ sys/altq/altq_rio.c | 828 +++++++++++++++ sys/altq/altq_rio.h | 139 +++ sys/altq/altq_rmclass.c | 1870 +++++++++++++++++++++++++++++++++ sys/altq/altq_rmclass.h | 266 +++++ sys/altq/altq_rmclass_debug.h | 112 ++ sys/altq/altq_subr.c | 1551 +++++++++++++++++++++++++++ sys/altq/altq_var.h | 225 ++++ sys/altq/altq_wfq.c | 751 +++++++++++++ sys/altq/altq_wfq.h | 124 +++ sys/altq/if_altq.h | 165 +++ 32 files changed, 16657 insertions(+) create mode 100644 sys/altq/altq.h create mode 100644 sys/altq/altq_afmap.c create mode 100644 sys/altq/altq_afmap.h create mode 100644 sys/altq/altq_blue.c create mode 100644 sys/altq/altq_blue.h create mode 100644 sys/altq/altq_cbq.c create mode 100644 sys/altq/altq_cbq.h create mode 100644 sys/altq/altq_cdnr.c create mode 100644 sys/altq/altq_cdnr.h create mode 100644 sys/altq/altq_classq.h create mode 100644 sys/altq/altq_conf.c create mode 100644 sys/altq/altq_conf.h create mode 100644 sys/altq/altq_fifoq.c create mode 100644 sys/altq/altq_fifoq.h create mode 100644 sys/altq/altq_flowvalve.h create mode 100644 sys/altq/altq_hfsc.c create mode 100644 sys/altq/altq_hfsc.h create mode 100644 sys/altq/altq_localq.c create mode 100644 sys/altq/altq_priq.c create mode 100644 sys/altq/altq_priq.h create mode 100644 sys/altq/altq_red.c create mode 100644 sys/altq/altq_red.h create mode 100644 sys/altq/altq_rio.c create mode 100644 sys/altq/altq_rio.h create mode 100644 sys/altq/altq_rmclass.c create mode 100644 sys/altq/altq_rmclass.h create mode 100644 sys/altq/altq_rmclass_debug.h create mode 100644 sys/altq/altq_subr.c create mode 100644 sys/altq/altq_var.h create mode 100644 sys/altq/altq_wfq.c create mode 100644 sys/altq/altq_wfq.h create mode 100644 sys/altq/if_altq.h diff --git a/sys/altq/altq.h b/sys/altq/altq.h new file mode 100644 index 000000000000..8d42f36a11c9 --- /dev/null +++ b/sys/altq/altq.h @@ -0,0 +1,198 @@ +/* $KAME: altq.h,v 1.6 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1998-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _ALTQ_ALTQ_H_ +#define _ALTQ_ALTQ_H_ + +#include +#include +#include +#include + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + +/* altq discipline type */ +#define ALTQT_NONE 0 /* reserved */ +#define ALTQT_CBQ 1 /* cbq */ +#define ALTQT_WFQ 2 /* wfq */ +#define ALTQT_AFMAP 3 /* afmap */ +#define ALTQT_FIFOQ 4 /* fifoq */ +#define ALTQT_RED 5 /* red */ +#define ALTQT_RIO 6 /* rio */ +#define ALTQT_LOCALQ 7 /* local use */ +#define ALTQT_HFSC 8 /* hfsc */ +#define ALTQT_CDNR 9 /* traffic conditioner */ +#define ALTQT_BLUE 10 /* blue */ +#define ALTQT_PRIQ 11 /* priority queue */ +#define ALTQT_MAX 12 /* should be max discipline type + 1 */ + +struct altqreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_long arg; /* request-specific argument */ +}; + +/* simple token backet meter profile */ +struct tb_profile { + u_int rate; /* rate in bit-per-sec */ + u_int depth; /* depth in bytes */ +}; + +struct tbrreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct tb_profile tb_prof; /* token bucket profile */ +}; + +/* + * common network flow info structure + */ +struct flowinfo { + u_char fi_len; /* total length */ + u_char fi_family; /* address family */ + u_int8_t fi_data[46]; /* actually longer; address family + specific flow info. */ +}; + +/* + * flow info structure for internet protocol family. 
+ * (currently this is the only protocol family supported) + */ +struct flowinfo_in { + u_char fi_len; /* sizeof(struct flowinfo_in) */ + u_char fi_family; /* AF_INET */ + u_int8_t fi_proto; /* IPPROTO_XXX */ + u_int8_t fi_tos; /* type-of-service */ + struct in_addr fi_dst; /* dest address */ + struct in_addr fi_src; /* src address */ + u_int16_t fi_dport; /* dest port */ + u_int16_t fi_sport; /* src port */ + u_int32_t fi_gpi; /* generalized port id for ipsec */ + u_int8_t _pad[28]; /* make the size equal to + flowinfo_in6 */ +}; + +#ifdef SIN6_LEN +struct flowinfo_in6 { + u_char fi6_len; /* sizeof(struct flowinfo_in6) */ + u_char fi6_family; /* AF_INET6 */ + u_int8_t fi6_proto; /* IPPROTO_XXX */ + u_int8_t fi6_tclass; /* traffic class */ + u_int32_t fi6_flowlabel; /* ipv6 flowlabel */ + u_int16_t fi6_dport; /* dest port */ + u_int16_t fi6_sport; /* src port */ + u_int32_t fi6_gpi; /* generalized port id */ + struct in6_addr fi6_dst; /* dest address */ + struct in6_addr fi6_src; /* src address */ +}; +#endif /* INET6 */ + +/* + * flow filters for AF_INET and AF_INET6 + */ +struct flow_filter { + int ff_ruleno; + struct flowinfo_in ff_flow; + struct { + struct in_addr mask_dst; + struct in_addr mask_src; + u_int8_t mask_tos; + u_int8_t _pad[3]; + } ff_mask; + u_int8_t _pad2[24]; /* make the size equal to flow_filter6 */ +}; + +#ifdef SIN6_LEN +struct flow_filter6 { + int ff_ruleno; + struct flowinfo_in6 ff_flow6; + struct { + struct in6_addr mask6_dst; + struct in6_addr mask6_src; + u_int8_t mask6_tclass; + u_int8_t _pad[3]; + } ff_mask6; +}; +#endif /* INET6 */ + +/* + * generic packet counter + */ +struct pktcntr { + u_int64_t packets; + u_int64_t bytes; +}; + +#define PKTCNTR_ADD(cntr, len) \ + do { (cntr)->packets++; (cntr)->bytes += len; } while (0) + +/* + * altq related ioctls + */ +#define ALTQGTYPE _IOWR('q', 0, struct altqreq) /* get queue type */ +#if 0 +/* + * these ioctls are currently discipline-specific but could be shared + * in the future. 
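+ * (until then they are compiled out by the surrounding #if 0;
+ * each discipline defines private equivalents, e.g. BLUE_IF_ATTACH
+ * or CBQ_IF_ATTACH, issued against its own character device.)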
+ */ +#define ALTQATTACH _IOW('q', 1, struct altqreq) /* attach discipline */ +#define ALTQDETACH _IOW('q', 2, struct altqreq) /* detach discipline */ +#define ALTQENABLE _IOW('q', 3, struct altqreq) /* enable discipline */ +#define ALTQDISABLE _IOW('q', 4, struct altqreq) /* disable discipline*/ +#define ALTQCLEAR _IOW('q', 5, struct altqreq) /* (re)initialize */ +#define ALTQCONFIG _IOWR('q', 6, struct altqreq) /* set config params */ +#define ALTQADDCLASS _IOWR('q', 7, struct altqreq) /* add a class */ +#define ALTQMODCLASS _IOWR('q', 8, struct altqreq) /* modify a class */ +#define ALTQDELCLASS _IOWR('q', 9, struct altqreq) /* delete a class */ +#define ALTQADDFILTER _IOWR('q', 10, struct altqreq) /* add a filter */ +#define ALTQDELFILTER _IOWR('q', 11, struct altqreq) /* delete a filter */ +#define ALTQGETSTATS _IOWR('q', 12, struct altqreq) /* get statistics */ +#define ALTQGETCNTR _IOWR('q', 13, struct altqreq) /* get a pkt counter */ +#endif /* 0 */ +#define ALTQTBRSET _IOW('q', 14, struct tbrreq) /* set tb regulator */ +#define ALTQTBRGET _IOWR('q', 15, struct tbrreq) /* get tb regulator */ + +/* queue macros only in FreeBSD */ +#ifndef LIST_EMPTY +#define LIST_EMPTY(head) ((head)->lh_first == NULL) +#endif +#ifndef LIST_FOREACH +#define LIST_FOREACH(var, head, field) \ + for((var) = (head)->lh_first; (var); (var) = (var)->field.le_next) +#endif + +#ifdef KERNEL +#ifndef _KERNEL +#define _KERNEL +#endif +#endif + +#ifdef _KERNEL +#include +#endif + +#endif /* _ALTQ_ALTQ_H_ */ diff --git a/sys/altq/altq_afmap.c b/sys/altq/altq_afmap.c new file mode 100644 index 000000000000..e49bd9b71edb --- /dev/null +++ b/sys/altq/altq_afmap.c @@ -0,0 +1,411 @@ +/* $KAME: altq_afmap.c,v 1.7 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * experimental: + * mapping an ip flow to atm vpi/vci. + * this module is not related to queueing at all, but uses the altq + * flowinfo mechanism. it's just put in the altq framework since + * it is easy to add devices to altq. 
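+ *
+ * a rough usage sketch (illustrative values only; "afm_fd" is a
+ * hypothetical descriptor for the afm device, not part of this code):
+ *
+ *	struct atm_flowmap fmap;
+ *
+ *	bzero(&fmap, sizeof(fmap));
+ *	strncpy(fmap.af_ifname, "en0", IFNAMSIZ);
+ *	fmap.af_vpi = 0;			/* target VPI */
+ *	fmap.af_vci = 32;			/* target VCI */
+ *	fmap.af_flowinfo4.fi_len = sizeof(struct flowinfo_in);
+ *	fmap.af_flowinfo4.fi_family = AF_INET;
+ *	ioctl(afm_fd, AFM_ADDFMAP, &fmap);
+ *
+ * afm_add() below rejects a map whose fi_family/fi_len are
+ * inconsistent; zero-valued flowinfo fields act as wildcards when
+ * matching (see afm_match4()).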
+ */ +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_AFMAP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +LIST_HEAD(, afm_head) afhead_chain; + +static struct afm *afm_match4 __P((struct afm_head *, struct flowinfo_in *)); +#ifdef INET6 +static struct afm *afm_match6 __P((struct afm_head *, struct flowinfo_in6 *)); +#endif + +/* + * rules to block interrupts: afm_match can be called from a net + * level interrupt so that other routines handling the lists should + * be called in splnet(). + */ +int +afm_alloc(ifp) + struct ifnet *ifp; +{ + struct afm_head *head; + + MALLOC(head, struct afm_head *, sizeof(struct afm_head), + M_DEVBUF, M_WAITOK); + if (head == NULL) + panic("afm_alloc: malloc failed!"); + bzero(head, sizeof(struct afm_head)); + + /* initialize per interface afmap list */ + LIST_INIT(&head->afh_head); + + head->afh_ifp = ifp; + + /* add this afm_head to the chain */ + LIST_INSERT_HEAD(&afhead_chain, head, afh_chain); + + return (0); +} + +int +afm_dealloc(ifp) + struct ifnet *ifp; +{ + struct afm_head *head; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) + if (head->afh_ifp == ifp) + break; + if (head == NULL) + return (-1); + + afm_removeall(ifp); + + LIST_REMOVE(head, afh_chain); + + FREE(head, M_DEVBUF); + return 0; +} + +struct afm * +afm_top(ifp) + struct ifnet *ifp; +{ + struct afm_head *head; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) + if (head->afh_ifp == ifp) + break; + if (head == NULL) + return NULL; + + return (head->afh_head.lh_first); +} + +int afm_add(ifp, flowmap) + struct ifnet *ifp; + struct atm_flowmap *flowmap; +{ + struct afm_head *head; + struct afm *afm; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) + if (head->afh_ifp == ifp) + break; + if (head == NULL) + return (-1); + + if (flowmap->af_flowinfo.fi_family == AF_INET) { + if (flowmap->af_flowinfo.fi_len != sizeof(struct flowinfo_in)) + return (EINVAL); +#ifdef INET6 + } else if (flowmap->af_flowinfo.fi_family == AF_INET6) { + if (flowmap->af_flowinfo.fi_len != sizeof(struct flowinfo_in6)) + return (EINVAL); +#endif + } else + return (EINVAL); + + MALLOC(afm, struct afm *, sizeof(struct afm), + M_DEVBUF, M_WAITOK); + if (afm == NULL) + return (ENOMEM); + bzero(afm, sizeof(struct afm)); + + afm->afm_vci = flowmap->af_vci; + afm->afm_vpi = flowmap->af_vpi; + bcopy(&flowmap->af_flowinfo, &afm->afm_flowinfo, + flowmap->af_flowinfo.fi_len); + + LIST_INSERT_HEAD(&head->afh_head, afm, afm_list); + return 0; +} + +int +afm_remove(afm) + struct afm *afm; +{ + LIST_REMOVE(afm, afm_list); + FREE(afm, M_DEVBUF); + return (0); +} + +int +afm_removeall(ifp) + struct ifnet *ifp; +{ + struct afm_head *head; + struct afm *afm; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) + if (head->afh_ifp == ifp) + break; + if (head == NULL) + return (-1); + + while ((afm = head->afh_head.lh_first) != NULL) + afm_remove(afm); + return (0); +} + +struct afm * +afm_lookup(ifp, vpi, vci) + struct ifnet *ifp; + int vpi, vci; +{ + struct afm_head *head; + struct afm *afm; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) + if 
(head->afh_ifp == ifp) + break; + if (head == NULL) + return NULL; + + for (afm = head->afh_head.lh_first; afm != NULL; + afm = afm->afm_list.le_next) + if (afm->afm_vpi == vpi && afm->afm_vci == vci) + break; + return afm; +} + +static struct afm * +afm_match4(head, fp) + struct afm_head *head; + struct flowinfo_in *fp; +{ + struct afm *afm; + + for (afm = head->afh_head.lh_first; afm != NULL; + afm = afm->afm_list.le_next) { + if (afm->afm_flowinfo4.fi_dst.s_addr != 0 && + afm->afm_flowinfo4.fi_dst.s_addr != fp->fi_dst.s_addr) + continue; + if (afm->afm_flowinfo4.fi_dport != 0 && + afm->afm_flowinfo4.fi_dport != fp->fi_dport) + continue; + if (afm->afm_flowinfo4.fi_src.s_addr != 0 && + afm->afm_flowinfo4.fi_src.s_addr != fp->fi_src.s_addr) + continue; + if (afm->afm_flowinfo4.fi_sport != 0 && + afm->afm_flowinfo4.fi_sport != fp->fi_sport) + continue; + if (afm->afm_flowinfo4.fi_proto != 0 && + afm->afm_flowinfo4.fi_proto != fp->fi_proto) + continue; + /* match found! */ + return (afm); + } + return NULL; +} + +#ifdef INET6 +static struct afm * +afm_match6(head, fp) + struct afm_head *head; + struct flowinfo_in6 *fp; +{ + struct afm *afm; + + for (afm = head->afh_head.lh_first; afm != NULL; + afm = afm->afm_list.le_next) { + if (afm->afm_flowinfo6.fi6_flowlabel != 0 && + afm->afm_flowinfo6.fi6_flowlabel != fp->fi6_flowlabel) + continue; +#ifdef notyet + if (!IN6_IS_ADDR_UNSPECIFIED(&afm->afm_flowinfo6.fi6_dst) && + !IN6_ARE_ADDR_EQUAL(&afm->afm_flowinfo6.fi6_dst, + &fp->fi6_dst)) + continue; + if (afm->afm_flowinfo6.fi6_dport != 0 && + afm->afm_flowinfo6.fi6_dport != fp->fi6_dport) + continue; +#endif + if (!IN6_IS_ADDR_UNSPECIFIED(&afm->afm_flowinfo6.fi6_src) && + !IN6_ARE_ADDR_EQUAL(&afm->afm_flowinfo6.fi6_src, + &fp->fi6_src)) + continue; +#ifdef notyet + if (afm->afm_flowinfo6.fi6_sport != 0 && + afm->afm_flowinfo6.fi6_sport != fp->fi6_sport) + continue; +#endif + if (afm->afm_flowinfo6.fi6_proto != 0 && + afm->afm_flowinfo6.fi6_proto != fp->fi6_proto) + continue; + /* match found! 
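+ * (zero/unspecified fields act as wildcards, as in afm_match4 above)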
*/ + return (afm); + } + return NULL; +} +#endif + +/* should be called in splimp() */ +struct afm * +afm_match(ifp, flow) + struct ifnet *ifp; + struct flowinfo *flow; +{ + struct afm_head *head; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) + if (head->afh_ifp == ifp) + break; + if (head == NULL) + return NULL; + + switch (flow->fi_family) { + case AF_INET: + return (afm_match4(head, (struct flowinfo_in *)flow)); + +#ifdef INET6 + case AF_INET6: + return (afm_match6(head, (struct flowinfo_in6 *)flow)); +#endif + + default: + return NULL; + } +} + +/* + * afm device interface + */ +altqdev_decl(afm); + +int +afmopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + return 0; +} + +int +afmclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + int err, error = 0; + struct atm_flowmap fmap; + struct afm_head *head; + + for (head = afhead_chain.lh_first; head != NULL; + head = head->afh_chain.le_next) { + + /* call interface to clean up maps */ +#if defined(__NetBSD__) || defined(__OpenBSD__) + sprintf(fmap.af_ifname, "%s", head->afh_ifp->if_xname); +#else + sprintf(fmap.af_ifname, "%s%d", + head->afh_ifp->if_name, head->afh_ifp->if_unit); +#endif + err = afmioctl(dev, AFM_CLEANFMAP, (caddr_t)&fmap, flag, p); + if (err && error == 0) + error = err; + } + + return error; +} + +int +afmioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + int error = 0; + struct atm_flowmap *flowmap; + struct ifnet *ifp; + + /* check cmd for superuser only */ + switch (cmd) { + case AFM_GETFMAP: + break; + default: +#if (__FreeBSD_version > 400000) + error = suser(p); +#else + error = suser(p->p_ucred, &p->p_acflag); +#endif + if (error) + return (error); + break; + } + + /* lookup interface */ + flowmap = (struct atm_flowmap *)addr; + flowmap->af_ifname[IFNAMSIZ-1] = '\0'; + ifp = ifunit(flowmap->af_ifname); + if (ifp == NULL || ifp->if_ioctl == NULL || + (ifp->if_flags & IFF_RUNNING) == 0) + error = ENXIO; + else + error = ifp->if_ioctl(ifp, cmd, addr); + + return error; +} + +#endif /* ALTQ_AFMAP */ diff --git a/sys/altq/altq_afmap.h b/sys/altq/altq_afmap.h new file mode 100644 index 000000000000..93024f24fcb3 --- /dev/null +++ b/sys/altq/altq_afmap.h @@ -0,0 +1,103 @@ +/* $KAME: altq_afmap.h,v 1.5 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_AFMAP_H_ +#define _ALTQ_ALTQ_AFMAP_H_ + +#include +#include + +struct atm_flowmap { + char af_ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_int8_t af_vpi; + u_int16_t af_vci; + u_int32_t af_pcr; /* peek cell rate */ + union { + struct flowinfo afu_fi; + struct flowinfo_in afu_fi4; +#ifdef SIN6_LEN + struct flowinfo_in6 afu_fi6; +#endif + } af_fiu; +#define af_flowinfo af_fiu.afu_fi +#define af_flowinfo4 af_fiu.afu_fi4 +#define af_flowinfo6 af_fiu.afu_fi6 + + /* statistics */ + u_int32_t afs_packets; /* total packet count */ + u_int32_t afs_bytes; /* total byte count */ +}; + +/* set or get flowmap */ +#define AFM_ADDFMAP _IOWR('F', 30, struct atm_flowmap) +#define AFM_DELFMAP _IOWR('F', 31, struct atm_flowmap) +#define AFM_CLEANFMAP _IOWR('F', 32, struct atm_flowmap) +#define AFM_GETFMAP _IOWR('F', 33, struct atm_flowmap) + +#ifdef _KERNEL + +/* per flow information */ +struct afm { + LIST_ENTRY(afm) afm_list; + u_int16_t afm_vci; + u_int8_t afm_vpi; + union { + struct flowinfo afmu_fi; + struct flowinfo_in afmu_fi4; +#ifdef SIN6_LEN + struct flowinfo_in6 afmu_fi6; +#endif + } afm_fiu; +#define afm_flowinfo afm_fiu.afmu_fi +#define afm_flowinfo4 afm_fiu.afmu_fi4 +#define afm_flowinfo6 afm_fiu.afmu_fi6 + + /* statistics */ + u_int32_t afms_packets; /* total packet count */ + u_int32_t afms_bytes; /* total byte count */ +}; + +/* per interface */ +struct afm_head { + LIST_ENTRY(afm_head) afh_chain; + LIST_HEAD(, afm) afh_head; + struct ifnet *afh_ifp; +}; + +struct afm *afm_top __P((struct ifnet *)); +int afm_alloc __P((struct ifnet *)); +int afm_dealloc __P((struct ifnet *)); +int afm_add __P((struct ifnet *, struct atm_flowmap *)); +int afm_remove __P((struct afm *)); +int afm_removeall __P((struct ifnet *)); +struct afm *afm_lookup __P((struct ifnet *, int, int)); +struct afm *afm_match __P((struct ifnet *, struct flowinfo *)); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_AFMAP_H_ */ diff --git a/sys/altq/altq_blue.c b/sys/altq/altq_blue.c new file mode 100644 index 000000000000..59d4a0cd6441 --- /dev/null +++ b/sys/altq/altq_blue.c @@ -0,0 +1,690 @@ +/* $KAME: altq_blue.c,v 1.7 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_BLUE /* blue is enabled by ALTQ_BLUE option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include + +/* + * Blue is proposed and implemented by Wu-chang Feng . + * more information on Blue is available from + * http://www.eecs.umich.edu/~wuchang/blue/ + */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +#define BLUE_LIMIT 200 /* default max queue lenght */ +#define BLUE_STATS /* collect statistics */ + +/* blue_list keeps all blue_state_t's allocated. 
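+ * BLUE_IF_ATTACH prepends a new entry and blueclose() tears the
+ * whole list down via blue_detach().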
*/ +static blue_queue_t *blue_list = NULL; + +/* internal function prototypes */ +static int blue_enqueue __P((struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +static struct mbuf *blue_dequeue __P((struct ifaltq *, int)); +static int drop_early __P((blue_t *)); +static int mark_ecn __P((struct mbuf *, struct altq_pktattr *, int)); +static int blue_detach __P((blue_queue_t *)); +static int blue_request __P((struct ifaltq *, int, void *)); + +/* + * blue device interface + */ +altqdev_decl(blue); + +int +blueopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +blueclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + blue_queue_t *rqp; + int err, error = 0; + + while ((rqp = blue_list) != NULL) { + /* destroy all */ + err = blue_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +blueioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + blue_queue_t *rqp; + struct blue_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case BLUE_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case BLUE_ENABLE: + ifacep = (struct blue_interface *)addr; + if ((rqp = altq_lookup(ifacep->blue_ifname, ALTQT_BLUE)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case BLUE_DISABLE: + ifacep = (struct blue_interface *)addr; + if ((rqp = altq_lookup(ifacep->blue_ifname, ALTQT_BLUE)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case BLUE_IF_ATTACH: + ifp = ifunit(((struct blue_interface *)addr)->blue_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize blue_state_t */ + MALLOC(rqp, blue_queue_t *, sizeof(blue_queue_t), M_DEVBUF, M_WAITOK); + bzero(rqp, sizeof(blue_queue_t)); + + MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + bzero(rqp->rq_q, sizeof(class_queue_t)); + + MALLOC(rqp->rq_blue, blue_t *, sizeof(blue_t), M_DEVBUF, M_WAITOK); + bzero(rqp->rq_blue, sizeof(blue_t)); + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = BLUE_LIMIT; + + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + blue_init(rqp->rq_blue, 0, 800, 1000, 50000); + + /* + * set BLUE to this ifnet structure. 
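+	 * altq_attach() records the discipline type, its state and the
+	 * enqueue/dequeue/request handlers in ifp->if_snd; blue uses
+	 * no classifier, so the last two arguments are NULL.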
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_BLUE, rqp, + blue_enqueue, blue_dequeue, blue_request, + NULL, NULL); + if (error) { + FREE(rqp->rq_blue, M_DEVBUF); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + break; + } + + /* add this state to the blue list */ + rqp->rq_next = blue_list; + blue_list = rqp; + break; + + case BLUE_IF_DETACH: + ifacep = (struct blue_interface *)addr; + if ((rqp = altq_lookup(ifacep->blue_ifname, ALTQT_BLUE)) == NULL) { + error = EBADF; + break; + } + error = blue_detach(rqp); + break; + + case BLUE_GETSTATS: + do { + struct blue_stats *q_stats; + blue_t *rp; + + q_stats = (struct blue_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.blue_ifname, + ALTQT_BLUE)) == NULL) { + error = EBADF; + break; + } + + q_stats->q_len = qlen(rqp->rq_q); + q_stats->q_limit = qlimit(rqp->rq_q); + + rp = rqp->rq_blue; + q_stats->q_pmark = rp->blue_pmark; + q_stats->xmit_packets = rp->blue_stats.xmit_packets; + q_stats->xmit_bytes = rp->blue_stats.xmit_bytes; + q_stats->drop_packets = rp->blue_stats.drop_packets; + q_stats->drop_bytes = rp->blue_stats.drop_bytes; + q_stats->drop_forced = rp->blue_stats.drop_forced; + q_stats->drop_unforced = rp->blue_stats.drop_unforced; + q_stats->marked_packets = rp->blue_stats.marked_packets; + + } while (0); + break; + + case BLUE_CONFIG: + do { + struct blue_conf *fc; + int limit; + + fc = (struct blue_conf *)addr; + if ((rqp = altq_lookup(fc->iface.blue_ifname, + ALTQT_BLUE)) == NULL) { + error = EBADF; + break; + } + limit = fc->blue_limit; + qlimit(rqp->rq_q) = limit; + fc->blue_limit = limit; /* write back the new value */ + if (fc->blue_pkttime > 0) + rqp->rq_blue->blue_pkttime = fc->blue_pkttime; + if (fc->blue_max_pmark > 0) + rqp->rq_blue->blue_max_pmark = fc->blue_max_pmark; + if (fc->blue_hold_time > 0) + rqp->rq_blue->blue_hold_time = fc->blue_hold_time; + rqp->rq_blue->blue_flags = fc->blue_flags; + + blue_init(rqp->rq_blue, rqp->rq_blue->blue_flags, + rqp->rq_blue->blue_pkttime, + rqp->rq_blue->blue_max_pmark, + rqp->rq_blue->blue_hold_time); + } while (0); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int blue_detach(rqp) + blue_queue_t *rqp; +{ + blue_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (blue_list == rqp) + blue_list = rqp->rq_next; + else { + for (tmp = blue_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("blue_detach: no state found in blue_list!\n"); + } + + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp->rq_blue, M_DEVBUF); + FREE(rqp, M_DEVBUF); + return (error); +} + +/* + * blue support routines + */ + +int +blue_init(rp, flags, pkttime, blue_max_pmark, blue_hold_time) + blue_t *rp; + int flags; + int pkttime; + int blue_max_pmark; + int blue_hold_time; +{ + int npkts_per_sec; + + rp->blue_idle = 1; + rp->blue_flags = flags; + rp->blue_pkttime = pkttime; + rp->blue_max_pmark = blue_max_pmark; + rp->blue_hold_time = blue_hold_time; + if (pkttime == 0) + rp->blue_pkttime = 1; + + /* when the link is very slow, adjust blue parameters */ + npkts_per_sec = 1000000 / rp->blue_pkttime; + if (npkts_per_sec < 50) { + } + else if (npkts_per_sec < 300) { + } + + microtime(&rp->blue_last); + return (0); +} + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. 
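+ *
+ * note: blue keeps a single marking probability (blue_pmark) instead
+ * of a RED-style average queue length; blue_addq() raises it on a
+ * forced drop and lowers it once the link has stayed idle longer
+ * than blue_hold_time (see the hold-time checks below).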
+ */ +static int +blue_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + blue_queue_t *rqp = (blue_queue_t *)ifq->altq_disc; + int error = 0; + + if (blue_addq(rqp->rq_blue, rqp->rq_q, m, pktattr) == 0) + ifq->ifq_len++; + else + error = ENOBUFS; + return error; +} + +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +int +blue_addq(rp, q, m, pktattr) + blue_t *rp; + class_queue_t *q; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + int droptype; + + /* + * if we were idle, this is an enqueue onto an empty queue + * and we should decrement marking probability + * + */ + if (rp->blue_idle) { + struct timeval now; + int t; + rp->blue_idle = 0; + microtime(&now); + t = (now.tv_sec - rp->blue_last.tv_sec); + if ( t > 1) { + rp->blue_pmark = 1; + microtime(&rp->blue_last); + } else { + t = t * 1000000 + (now.tv_usec - rp->blue_last.tv_usec); + if (t > rp->blue_hold_time) { + rp->blue_pmark--; + if (rp->blue_pmark < 0) rp->blue_pmark = 0; + microtime(&rp->blue_last); + } + } + } + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (drop_early(rp) && qlen(q) > 1) { + /* mark or drop by blue */ + if ((rp->blue_flags & BLUEF_ECN) && + mark_ecn(m, pktattr, rp->blue_flags)) { + /* successfully marked. do not drop. */ +#ifdef BLUE_STATS + rp->blue_stats.marked_packets++; +#endif + } else { + /* unforced drop by blue */ + droptype = DTYPE_EARLY; + } + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); + + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ +#ifdef BLUE_STATS + rp->blue_stats.drop_unforced++; +#endif + } else { + struct timeval now; + int t; + /* forced drop, select a victim packet in the queue. */ + m = _getq_random(q); + microtime(&now); + t = (now.tv_sec - rp->blue_last.tv_sec); + t = t * 1000000 + (now.tv_usec - rp->blue_last.tv_usec); + if (t > rp->blue_hold_time) { + rp->blue_pmark += rp->blue_max_pmark >> 3; + if (rp->blue_pmark > rp->blue_max_pmark) + rp->blue_pmark = rp->blue_max_pmark; + microtime(&rp->blue_last); + } +#ifdef BLUE_STATS + rp->blue_stats.drop_forced++; +#endif + } +#ifdef BLUE_STATS + rp->blue_stats.drop_packets++; + rp->blue_stats.drop_bytes += m->m_pkthdr.len; +#endif + m_freem(m); + return (-1); + } + /* successfully queued */ + return (0); +} + +/* + * early-drop probability is kept in blue_pmark + * + */ +static int +drop_early(rp) + blue_t *rp; +{ + if ((random() % rp->blue_max_pmark) < rp->blue_pmark) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. 
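+ *
+ * the IPv4 checksum is patched incrementally (RFC 1624:
+ * HC' = ~(~HC + ~m + m')).  when IPTOS_CE is 0x01, setting the CE
+ * bit reduces to decrementing the 16-bit checksum by one, with 0
+ * wrapping to 0xfffe; that is the optimized branch below.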
+ */ +static int +mark_ecn(m, pktattr, flags) + struct mbuf *m; + struct altq_pktattr *pktattr; + int flags; +{ + struct mbuf *m0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return (0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; + return (0); + } + + switch (pktattr->pattr_af) { + case AF_INET: + if (flags & BLUEF_ECN4) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + + if (ip->ip_v != 4) + return (0); /* version mismatch! */ + if (ip->ip_tos & IPTOS_ECT) { + /* ECN-capable, mark ECN bit. */ + if ((ip->ip_tos & IPTOS_CE) == 0) { +#if (IPTOS_CE == 0x01) + u_short sum; + + ip->ip_tos |= IPTOS_CE; + /* + * optimized version when IPTOS_CE + * is 0x01. + * HC' = HC -1 when HC > 0 + * = 0xfffe when HC = 0 + */ + sum = ntohs(ip->ip_sum); + if (sum == 0) + sum = 0xfffe; + else + sum -= 1; + ip->ip_sum = htons(sum); +#else /* IPTOS_CE != 0x01 */ + long sum; + + ip->ip_tos |= IPTOS_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xffff + IPTOS_CE; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); +#endif /* IPTOS_CE != 0x01 */ + } + return (1); + } + } + break; +#ifdef INET6 + case AF_INET6: + if (flags & BLUEF_ECN6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! */ + if (flowlabel & (IPTOS_ECT << 20)) { + /* ECN-capable, mark ECN bit. */ + flowlabel |= (IPTOS_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. 
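+ *
+ * ALTDQ_POLL returns the head packet without removing it;
+ * ALTDQ_REMOVE dequeues through blue_getq(), which also marks
+ * the queue idle when it runs empty.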
+ */ + +static struct mbuf * +blue_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + blue_queue_t *rqp = (blue_queue_t *)ifq->altq_disc; + struct mbuf *m = NULL; + + if (op == ALTDQ_POLL) + return (qhead(rqp->rq_q)); + + m = blue_getq(rqp->rq_blue, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return m; +} + +struct mbuf *blue_getq(rp, q) + blue_t *rp; + class_queue_t *q; +{ + struct mbuf *m; + + if ((m = _getq(q)) == NULL) { + if (rp->blue_idle == 0) { + rp->blue_idle = 1; + microtime(&rp->blue_last); + } + return NULL; + } + + rp->blue_idle = 0; +#ifdef BLUE_STATS + rp->blue_stats.xmit_packets++; + rp->blue_stats.xmit_bytes += m->m_pkthdr.len; +#endif + return (m); +} + +static int +blue_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + blue_queue_t *rqp = (blue_queue_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len = 0; + break; + } + return (0); +} + + +#ifdef KLD_MODULE + +static struct altqsw blue_sw = + {"blue", blueopen, blueclose, blueioctl}; + +ALTQ_MODULE(altq_blue, ALTQT_BLUE, &blue_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_BLUE */ diff --git a/sys/altq/altq_blue.h b/sys/altq/altq_blue.h new file mode 100644 index 000000000000..9895f68a4e25 --- /dev/null +++ b/sys/altq/altq_blue.h @@ -0,0 +1,118 @@ +/* $KAME: altq_blue.h,v 1.5 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_BLUE_H_ +#define _ALTQ_ALTQ_BLUE_H_ + +#include + +struct blue_interface { + char blue_ifname[IFNAMSIZ]; +}; + +struct blue_stats { + struct blue_interface iface; + int q_len; + int q_limit; + int q_pmark; + u_quad_t xmit_packets; + u_quad_t xmit_bytes; + u_quad_t drop_packets; + u_quad_t drop_bytes; + u_quad_t drop_forced; + u_quad_t drop_unforced; + u_quad_t marked_packets; +}; + +struct blue_conf { + struct blue_interface iface; + int blue_limit; + int blue_max_pmark; + int blue_hold_time; + int blue_pkttime; /* average packet time in usec */ + int blue_flags; /* see below */ +}; + +/* blue flags */ +#define BLUEF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define BLUEF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define BLUEF_ECN (BLUEF_ECN4 | BLUEF_ECN6) + +/* + * IOCTLs for BLUE + */ +#define BLUE_IF_ATTACH _IOW('Q', 1, struct blue_interface) +#define BLUE_IF_DETACH _IOW('Q', 2, struct blue_interface) +#define BLUE_ENABLE _IOW('Q', 3, struct blue_interface) +#define BLUE_DISABLE _IOW('Q', 4, struct blue_interface) +#define BLUE_CONFIG _IOWR('Q', 6, struct blue_conf) +#define BLUE_GETSTATS _IOWR('Q', 12, struct blue_stats) + +#ifdef _KERNEL + +typedef struct blue { + int blue_pkttime; /* average packet time in micro sec + used for idle calibration */ + int blue_flags; /* blue flags */ + + /* blue parameters */ + int blue_pmark; /* 0-1000 (mark probability*10000) */ + int blue_max_pmark; /* sets precision of marking probability */ + int blue_hold_time; /* hold time in usec */ + + int blue_idle; /* queue was empty */ + struct timeval blue_last; /* timestamp when the queue becomes idle */ + + struct { + u_quad_t xmit_packets; + u_quad_t xmit_bytes; + u_quad_t drop_packets; + u_quad_t drop_bytes; + u_quad_t drop_forced; + u_quad_t drop_unforced; + u_quad_t marked_packets; + } blue_stats; +} blue_t; + +typedef struct blue_queue { + struct blue_queue *rq_next; /* next blue_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + blue_t *rq_blue; +} blue_queue_t; + +extern int blue_init __P((blue_t *, int, int, int, int)); +extern int blue_addq __P((blue_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *)); +extern struct mbuf *blue_getq __P((blue_t *, class_queue_t *)); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_BLUE_H_ */ diff --git a/sys/altq/altq_cbq.c b/sys/altq/altq_cbq.c new file mode 100644 index 000000000000..f07a7cfeac96 --- /dev/null +++ b/sys/altq/altq_cbq.c @@ -0,0 +1,972 @@ +/* $KAME: altq_cbq.c,v 1.9 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. 
The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +/* #pragma ident "@(#)cbq.c 1.39 98/05/13 SMI" */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * Local Data structures. + */ +static cbq_state_t *cbq_list = NULL; + +/* + * Forward Declarations. + */ + +static int cbq_add_class __P((struct cbq_add_class *)); +static int cbq_delete_class __P((struct cbq_delete_class *)); +static int cbq_modify_class __P((struct cbq_modify_class *)); +static int cbq_class_create __P((cbq_state_t *, struct cbq_add_class *, + struct rm_class *, struct rm_class *)); +static int cbq_class_destroy __P((cbq_state_t *, struct rm_class *)); +static struct rm_class *clh_to_clp __P((cbq_state_t *, u_long)); +static int cbq_add_filter __P((struct cbq_add_filter *)); +static int cbq_delete_filter __P((struct cbq_delete_filter *)); + +static int cbq_clear_hierarchy __P((struct cbq_interface *)); +static int cbq_clear_interface __P((cbq_state_t *)); +static int cbq_request __P((struct ifaltq *, int, void *)); +static int cbq_set_enable __P((struct cbq_interface *, int)); +static int cbq_ifattach __P((struct cbq_interface *)); +static int cbq_ifdetach __P((struct cbq_interface *)); +static int cbq_enqueue __P((struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +static struct mbuf *cbq_dequeue __P((struct ifaltq *, int)); +static void cbqrestart __P((struct ifaltq *)); +static void get_class_stats __P((class_stats_t *, struct rm_class *)); +static int cbq_getstats __P((struct cbq_getstats *)); +static void cbq_purge(cbq_state_t *); + +static int +cbq_add_class(acp) + struct cbq_add_class *acp; +{ + char *ifacename; + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + + ifacename = acp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* check parameters */ + if (acp->cbq_class.priority >= RM_MAXPRIO || + acp->cbq_class.maxq > CBQ_MAXQSIZE) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, acp->cbq_class.parent_class_handle); + borrow = clh_to_clp(cbqp, acp->cbq_class.borrow_class_handle); + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. 
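+ * (i.e. pass borrow == parent for a class that may use its
+ * parent's spare bandwidth, or borrow == NULL for a class that
+ * is hard-limited to its own allocation.)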
+ */ + if (parent == NULL && (acp->cbq_class.flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_class: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + return cbq_class_create(cbqp, acp, parent, borrow); +} + +static int +cbq_delete_class(dcp) + struct cbq_delete_class *dcp; +{ + char *ifacename; + struct rm_class *cl; + cbq_state_t *cbqp; + + ifacename = dcp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, dcp->cbq_class_handle)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. */ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* if a filter has a reference to this class delete the filter */ + acc_discard_filters(&cbqp->cbq_classifier, cl, 0); + + return cbq_class_destroy(cbqp, cl); +} + +static int +cbq_modify_class(acp) + struct cbq_modify_class *acp; +{ + char *ifacename; + struct rm_class *cl; + cbq_state_t *cbqp; + + ifacename = acp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* Get pointer to this class */ + if ((cl = clh_to_clp(cbqp, acp->cbq_class_handle)) == NULL) + return (EINVAL); + + if (rmc_modclass(cl, acp->cbq_class.nano_sec_per_byte, + acp->cbq_class.maxq, acp->cbq_class.maxidle, + acp->cbq_class.minidle, acp->cbq_class.offtime, + acp->cbq_class.pktsize) < 0) + return (EINVAL); + return (0); +} + +/* + * struct rm_class * + * cbq_class_create(cbq_mod_state_t *cbqp, struct cbq_add_class *acp, + * u_long handle, struct rm_class *parent, + * struct rm_class *borrow) + * + * This function create a new traffic class in the CBQ class hierarchy of + * given paramters. The class that created is either the root, default, + * or a new dynamic class. If CBQ is not initilaized, the the root class + * will be created. + */ +static int +cbq_class_create(cbqp, acp, parent, borrow) + cbq_state_t *cbqp; + struct cbq_add_class *acp; + struct rm_class *parent, *borrow; +{ + struct rm_class *cl; + cbq_class_spec_t *spec = &acp->cbq_class; + u_long chandle; + int i; + + /* + * allocate class handle + */ + switch (spec->flags & CBQCLF_CLASSMASK) { + case CBQCLF_ROOTCLASS: + if (parent != NULL) + return (EINVAL); + if (cbqp->ifnp.root_) + return (EINVAL); + chandle = ROOT_CLASS_HANDLE; + break; + case CBQCLF_DEFCLASS: + if (cbqp->ifnp.default_) + return (EINVAL); + chandle = DEFAULT_CLASS_HANDLE; + break; + case CBQCLF_CTLCLASS: + if (cbqp->ifnp.ctl_) + return (EINVAL); + chandle = CTL_CLASS_HANDLE; + break; + case 0: + /* find a free class slot */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (ENOSPC); + chandle = (u_long)i; + break; + default: + /* more than two flags bits set */ + return (EINVAL); + } + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if (chandle == ROOT_CLASS_HANDLE) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, spec->nano_sec_per_byte, + cbqrestart, spec->maxq, RM_MAXQUEUED, + spec->maxidle, spec->minidle, spec->offtime, + spec->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(spec->priority, + &cbqp->ifnp, spec->nano_sec_per_byte, + rmc_delay_action, spec->maxq, parent, borrow, + spec->maxidle, spec->minidle, spec->offtime, + spec->pktsize, spec->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. 
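+	 * root/default/ctl use the reserved handles chosen above;
+	 * dynamic classes use their cbq_class_tbl index.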
*/ + acp->cbq_class_handle = chandle; + + cl->stats_.handle = chandle; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + switch (chandle) { + case NULL_CLASS_HANDLE: + case ROOT_CLASS_HANDLE: + break; + case DEFAULT_CLASS_HANDLE: + cbqp->ifnp.default_ = cl; + break; + case CTL_CLASS_HANDLE: + cbqp->ifnp.ctl_ = cl; + break; + default: + cbqp->cbq_class_tbl[chandle] = cl; + break; + } + return (0); +} + +/* + * int + * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This + * function destroys a given traffic class. Before destorying + * the class, all traffic for that class is released. + */ +static int +cbq_class_destroy(cbqp, cl) + cbq_state_t *cbqp; + struct rm_class *cl; +{ + u_long chandle; + + chandle = cl->stats_.handle; + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + switch (chandle) { + case ROOT_CLASS_HANDLE: + cbqp->ifnp.root_ = NULL; + break; + case DEFAULT_CLASS_HANDLE: + cbqp->ifnp.default_ = NULL; + break; + case CTL_CLASS_HANDLE: + cbqp->ifnp.ctl_ = NULL; + break; + case NULL_CLASS_HANDLE: + break; + default: + if (chandle >= CBQ_MAX_CLASSES) + break; + cbqp->cbq_class_tbl[chandle] = NULL; + } + + return (0); +} + +/* convert class handle to class pointer */ +static struct rm_class * +clh_to_clp(cbqp, chandle) + cbq_state_t *cbqp; + u_long chandle; +{ + switch (chandle) { + case NULL_CLASS_HANDLE: + return (NULL); + case ROOT_CLASS_HANDLE: + return (cbqp->ifnp.root_); + case DEFAULT_CLASS_HANDLE: + return (cbqp->ifnp.default_); + case CTL_CLASS_HANDLE: + return (cbqp->ifnp.ctl_); + } + + if (chandle >= CBQ_MAX_CLASSES) + return (NULL); + + return (cbqp->cbq_class_tbl[chandle]); +} + +static int +cbq_add_filter(afp) + struct cbq_add_filter *afp; +{ + char *ifacename; + cbq_state_t *cbqp; + struct rm_class *cl; + + ifacename = afp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* Get the pointer to class. */ + if ((cl = clh_to_clp(cbqp, afp->cbq_class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&cbqp->cbq_classifier, &afp->cbq_filter, + cl, &afp->cbq_filter_handle); +} + +static int +cbq_delete_filter(dfp) + struct cbq_delete_filter *dfp; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = dfp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&cbqp->cbq_classifier, + dfp->cbq_filter_handle); +} + +/* + * cbq_clear_hierarchy deletes all classes and their filters on the + * given interface. 
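+ * classes are destroyed leaves-first: cbq_clear_interface() below
+ * loops (the "again" flag) until no non-parent class remains, so a
+ * parent is only removed after all of its children are gone.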
+ */ +static int +cbq_clear_hierarchy(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + return cbq_clear_interface(cbqp); +} + +static int +cbq_clear_interface(cbqp) + cbq_state_t *cbqp; +{ + int again, i; + struct rm_class *cl; + + /* free the filters for this interface */ + acc_discard_filters(&cbqp->cbq_classifier, NULL, 1); + + /* clear out the classes now */ + do { + again = 0; + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (is_a_parent_class(cl)) + again++; + else { + cbq_class_destroy(cbqp, cl); + cbqp->cbq_class_tbl[i] = NULL; + } + } + } + if (cbqp->ifnp.ctl_ != NULL && + !is_a_parent_class(cbqp->ifnp.ctl_)) { + cbq_class_destroy(cbqp, cbqp->ifnp.ctl_); + cbqp->ifnp.ctl_ = NULL; + } + if (cbqp->ifnp.default_ != NULL && + !is_a_parent_class(cbqp->ifnp.default_)) { + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + cbqp->ifnp.default_ = NULL; + } + if (cbqp->ifnp.root_ != NULL && + !is_a_parent_class(cbqp->ifnp.root_)) { + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + cbqp->ifnp.root_ = NULL; + } + } while (again); + + return (0); +} + +static int +cbq_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + cbq_purge(cbqp); + break; + } + return (0); +} + +/* + * static int + * cbq_set_enable(struct cbq_enable *ep) - this function processed the + * ioctl request to enable class based queueing. It searches the list + * of interfaces for the specified interface and then enables CBQ on + * that interface. + * + * Returns: 0, for no error. + * EBADF, for specified inteface not found. 
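+ * EINVAL, when ENABLE is requested before the root,
+ * default and control classes all exist.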
+ */ + +static int +cbq_set_enable(ep, enable) + struct cbq_interface *ep; + int enable; +{ + int error = 0; + cbq_state_t *cbqp; + char *ifacename; + + ifacename = ep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + switch (enable) { + case ENABLE: + if (cbqp->ifnp.root_ == NULL || cbqp->ifnp.default_ == NULL || + cbqp->ifnp.ctl_ == NULL) { + if (cbqp->ifnp.root_ == NULL) + printf("No Root Class for %s\n", ifacename); + if (cbqp->ifnp.default_ == NULL) + printf("No Default Class for %s\n", ifacename); + if (cbqp->ifnp.ctl_ == NULL) + printf("No Control Class for %s\n", ifacename); + error = EINVAL; + } else if ((error = altq_enable(cbqp->ifnp.ifq_)) == 0) { + cbqp->cbq_qlen = 0; + } + break; + + case DISABLE: + error = altq_disable(cbqp->ifnp.ifq_); + break; + } + return (error); +} + +/* copy the stats info in rm_class to class_states_t */ +static void +get_class_stats(statsp, cl) + class_stats_t *statsp; + struct rm_class *cl; +{ + statsp->xmit_cnt = cl->stats_.xmit_cnt; + statsp->drop_cnt = cl->stats_.drop_cnt; + statsp->over = cl->stats_.over; + statsp->borrows = cl->stats_.borrows; + statsp->overactions = cl->stats_.overactions; + statsp->delays = cl->stats_.delays; + + statsp->depth = cl->depth_; + statsp->priority = cl->pri_; + statsp->maxidle = cl->maxidle_; + statsp->minidle = cl->minidle_; + statsp->offtime = cl->offtime_; + statsp->qmax = qlimit(cl->q_); + statsp->ns_per_byte = cl->ns_per_byte_; + statsp->wrr_allot = cl->w_allotment_; + statsp->qcnt = qlen(cl->q_); + statsp->avgidle = cl->avgidle_; + + statsp->qtype = qtype(cl->q_); +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_getstats(cl->red_, &statsp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_getstats((rio_t *)cl->red_, &statsp->red[0]); +#endif +} + +static int +cbq_getstats(gsp) + struct cbq_getstats *gsp; +{ + char *ifacename; + int chandle, n, nclasses; + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats, *usp; + int error = 0; + + ifacename = gsp->iface.cbq_ifacename; + nclasses = gsp->nclasses; + usp = gsp->stats; + + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + if (nclasses <= 0) + return (EINVAL); + + for (n = 0, chandle = 0; n < nclasses && chandle < CBQ_MAX_CLASSES; + n++) { + switch(n) { + case 0: + cl = cbqp->ifnp.root_; + stats.handle = ROOT_CLASS_HANDLE; + break; + case 1: + cl = cbqp->ifnp.default_; + stats.handle = DEFAULT_CLASS_HANDLE; + break; + case 2: + cl = cbqp->ifnp.ctl_; + stats.handle = CTL_CLASS_HANDLE; + break; + default: + while ((cl = cbqp->cbq_class_tbl[chandle]) == NULL) + if (++chandle >= CBQ_MAX_CLASSES) + goto out; + stats.handle = chandle++; + break; + } + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + out: + gsp->nclasses = n; + return (error); +} + +static int +cbq_ifattach(ifacep) + struct cbq_interface *ifacep; +{ + int error = 0; + char *ifacename; + cbq_state_t *new_cbqp; + struct ifnet *ifp; + + ifacename = ifacep->cbq_ifacename; + if ((ifp = ifunit(ifacename)) == NULL) + return (ENXIO); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENXIO); + + /* allocate and initialize cbq_state_t */ + MALLOC(new_cbqp, cbq_state_t *, sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); + if (new_cbqp == NULL) + return (ENOMEM); + bzero(new_cbqp, sizeof(cbq_state_t)); + CALLOUT_INIT(&new_cbqp->cbq_callout); + MALLOC(new_cbqp->cbq_class_tbl, struct rm_class **, + sizeof(struct rm_class *) * CBQ_MAX_CLASSES, 
M_DEVBUF, M_WAITOK); + if (new_cbqp->cbq_class_tbl == NULL) { + FREE(new_cbqp, M_DEVBUF); + return (ENOMEM); + } + bzero(new_cbqp->cbq_class_tbl, sizeof(struct rm_class *) * CBQ_MAX_CLASSES); + new_cbqp->cbq_qlen = 0; + new_cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* + * set CBQ to this ifnet structure. + */ + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, new_cbqp, + cbq_enqueue, cbq_dequeue, cbq_request, + &new_cbqp->cbq_classifier, acc_classify); + if (error) { + FREE(new_cbqp->cbq_class_tbl, M_DEVBUF); + FREE(new_cbqp, M_DEVBUF); + return (error); + } + + /* prepend to the list of cbq_state_t's. */ + new_cbqp->cbq_next = cbq_list; + cbq_list = new_cbqp; + + return (0); +} + +static int +cbq_ifdetach(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + (void)cbq_set_enable(ifacep, DISABLE); + + cbq_clear_interface(cbqp); + + if (cbqp->ifnp.ctl_) + cbq_class_destroy(cbqp, cbqp->ifnp.ctl_); + if (cbqp->ifnp.default_) + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + if (cbqp->ifnp.root_) + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + + /* remove CBQ from the ifnet structure. */ + (void)altq_detach(cbqp->ifnp.ifq_); + + /* remove from the list of cbq_state_t's. */ + if (cbq_list == cbqp) + cbq_list = cbqp->cbq_next; + else { + cbq_state_t *cp; + + for (cp = cbq_list; cp != NULL; cp = cbqp->cbq_next) + if (cp->cbq_next == cbqp) { + cp->cbq_next = cbqp->cbq_next; + break; + } + ASSERT(cp != NULL); + } + + /* deallocate cbq_state_t */ + FREE(cbqp->cbq_class_tbl, M_DEVBUF); + FREE(cbqp, M_DEVBUF); + + return (0); +} + +/* + * int + * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr) + * - Queue data packets. + * + * cbq_enqueue is set to ifp->if_altqenqueue and called by an upper + * layer (e.g. ether_output). cbq_enqueue queues the given packet + * to the cbq, then invokes the driver's start routine. + * + * Assumptions: called in splimp + * Returns: 0 if the queueing is successful. + * ENOBUFS if a packet dropping occured as a result of + * the queueing. + */ + +static int +cbq_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct rm_class *cl; + int len; + + /* grab class set by classifier */ + if (pktattr == NULL || (cl = pktattr->pattr_class) == NULL) + cl = cbqp->ifnp.default_; + cl->pktattr_ = pktattr; /* save proto hdr used by ECN */ + + len = m_pktlen(m); + if (rmc_queue_packet(cl, m) != 0) { + /* drop occurred. some mbuf was freed in rmc_queue_packet. */ + PKTCNTR_ADD(&cl->stats_.drop_cnt, len); + return (ENOBUFS); + } + + /* successfully queued. */ + ++cbqp->cbq_qlen; + IFQ_INC_LEN(ifq); + return (0); +} + +static struct mbuf * +cbq_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct mbuf *m; + + m = rmc_dequeue_next(&cbqp->ifnp, op); + + if (m && op == ALTDQ_REMOVE) { + --cbqp->cbq_qlen; /* decrement # of packets in cbq */ + IFQ_DEC_LEN(ifq); + + /* Update the class. */ + rmc_update_class_util(&cbqp->ifnp); + } + return (m); +} + +/* + * void + * cbqrestart(queue_t *) - Restart sending of data. + * called from rmc_restart in splimp via timeout after waking up + * a suspended class. 
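+ *	calls the driver's start routine via (*ifp->if_start)() when the
+ *	interface is idle (IFF_OACTIVE clear) and packets remain queued.
+ *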
+ * Returns: NONE + */ + +static void +cbqrestart(ifq) + struct ifaltq *ifq; +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if (!ALTQ_IS_ENABLED(ifq)) + /* cbq must have been detached */ + return; + if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL) + /* should not happen */ + return; + + ifp = ifq->altq_ifp; + if (ifp->if_start && + cbqp->cbq_qlen > 0 && (ifp->if_flags & IFF_OACTIVE) == 0) + (*ifp->if_start)(ifp); +} + +static void cbq_purge(cbqp) + cbq_state_t *cbqp; +{ + struct rm_class *cl; + int i; + + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) + rmc_dropall(cl); + if (ALTQ_IS_ENABLED(cbqp->ifnp.ifq_)) + cbqp->ifnp.ifq_->ifq_len = 0; +} + +/* + * cbq device interface + */ + +altqdev_decl(cbq); + +int +cbqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + return (0); +} + +int +cbqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + struct ifnet *ifp; + struct cbq_interface iface; + int err, error = 0; + + while (cbq_list) { + ifp = cbq_list->ifnp.ifq_->altq_ifp; +#if defined(__NetBSD__) || defined(__OpenBSD__) + sprintf(iface.cbq_ifacename, "%s", ifp->if_xname); +#else + sprintf(iface.cbq_ifacename, + "%s%d", ifp->if_name, ifp->if_unit); +#endif + err = cbq_ifdetach(&iface); + if (err != 0 && error == 0) + error = err; + } + + return (error); +} + +int +cbqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + int error = 0; + + /* check cmd for superuser only */ + switch (cmd) { + case CBQ_GETSTATS: + /* currently only command that an ordinary user can call */ + break; + default: +#if (__FreeBSD_version > 400000) + error = suser(p); +#else + error = suser(p->p_ucred, &p->p_acflag); +#endif + if (error) + return (error); + break; + } + + switch (cmd) { + + case CBQ_ENABLE: + error = cbq_set_enable((struct cbq_interface *)addr, ENABLE); + break; + + case CBQ_DISABLE: + error = cbq_set_enable((struct cbq_interface *)addr, DISABLE); + break; + + case CBQ_ADD_FILTER: + error = cbq_add_filter((struct cbq_add_filter *)addr); + break; + + case CBQ_DEL_FILTER: + error = cbq_delete_filter((struct cbq_delete_filter *)addr); + break; + + case CBQ_ADD_CLASS: + error = cbq_add_class((struct cbq_add_class *)addr); + break; + + case CBQ_DEL_CLASS: + error = cbq_delete_class((struct cbq_delete_class *)addr); + break; + + case CBQ_MODIFY_CLASS: + error = cbq_modify_class((struct cbq_modify_class *)addr); + break; + + case CBQ_CLEAR_HIERARCHY: + error = cbq_clear_hierarchy((struct cbq_interface *)addr); + break; + + case CBQ_IF_ATTACH: + error = cbq_ifattach((struct cbq_interface *)addr); + break; + + case CBQ_IF_DETACH: + error = cbq_ifdetach((struct cbq_interface *)addr); + break; + + case CBQ_GETSTATS: + error = cbq_getstats((struct cbq_getstats *)addr); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +#if 0 +/* for debug */ +static void cbq_class_dump(int); + +static void cbq_class_dump(i) + int i; +{ + struct rm_class *cl; + rm_class_stats_t *s; + struct _class_queue_ *q; + + if (cbq_list == NULL) { + printf("cbq_class_dump: no cbq_state found\n"); + return; + } + cl = cbq_list->cbq_class_tbl[i]; + + printf("class %d cl=%p\n", i, cl); + if (cl != NULL) { + s = &cl->stats_; + q = cl->q_; + + printf("pri=%d, depth=%d, maxrate=%d, allotment=%d\n", + cl->pri_, cl->depth_, cl->maxrate_, cl->allotment_); + printf("w_allotment=%d, bytes_alloc=%d, avgidle=%d, maxidle=%d\n", + cl->w_allotment_, cl->bytes_alloc_, cl->avgidle_, + 
cl->maxidle_); + printf("minidle=%d, offtime=%d, sleeping=%d, leaf=%d\n", + cl->minidle_, cl->offtime_, cl->sleeping_, cl->leaf_); + printf("handle=%d, depth=%d, packets=%d, bytes=%d\n", + s->handle, s->depth, + (int)s->xmit_cnt.packets, (int)s->xmit_cnt.bytes); + printf("over=%d\n, borrows=%d, drops=%d, overactions=%d, delays=%d\n", + s->over, s->borrows, (int)s->drop_cnt.packets, + s->overactions, s->delays); + printf("tail=%p, head=%p, qlen=%d, qlim=%d, qthresh=%d,qtype=%d\n", + q->tail_, q->head_, q->qlen_, q->qlim_, + q->qthresh_, q->qtype_); + } +} +#endif /* 0 */ + +#ifdef KLD_MODULE + +static struct altqsw cbq_sw = + {"cbq", cbqopen, cbqclose, cbqioctl}; + +ALTQ_MODULE(altq_cbq, ALTQT_CBQ, &cbq_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_CBQ */ diff --git a/sys/altq/altq_cbq.h b/sys/altq/altq_cbq.h new file mode 100644 index 000000000000..505e9d2e6ab8 --- /dev/null +++ b/sys/altq/altq_cbq.h @@ -0,0 +1,219 @@ +/* $KAME: altq_cbq.h,v 1.5 2000/12/02 13:44:40 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#ifndef _ALTQ_ALTQ_CBQ_H_ +#define _ALTQ_ALTQ_CBQ_H_ + +#include +#include +#include +#include +#include + +/* #pragma ident "@(#)cbq.h 1.18 98/05/13 SMI" */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Define a well known class handles + */ +#define NULL_CLASS_HANDLE 0xffffffff +#define ROOT_CLASS_HANDLE 0xfffffffe +#define DEFAULT_CLASS_HANDLE 0xfffffffd +#define CTL_CLASS_HANDLE 0xfffffffc + +/* + * Define structures associated with IOCTLS for cbq. + */ + +/* + * Define the CBQ interface structure. This must be included in all + * IOCTL's such that the CBQ driver may find the appropriate CBQ module + * associated with the network interface to be affected. 
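+ * The interface is identified by name (e.g., "fxp0").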
+ */ +struct cbq_interface { + char cbq_ifacename[IFNAMSIZ]; +}; + +typedef struct cbq_class_spec { + u_int priority; + u_int nano_sec_per_byte; + u_int maxq; + u_int maxidle; + int minidle; + u_int offtime; + u_long parent_class_handle; + u_long borrow_class_handle; + + u_int pktsize; + int flags; +} cbq_class_spec_t; + +/* class flags shoud be same as class flags in rm_class.h */ +#define CBQCLF_RED 0x0001 /* use RED */ +#define CBQCLF_ECN 0x0002 /* use RED/ECN */ +#define CBQCLF_RIO 0x0004 /* use RIO */ +#define CBQCLF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define CBQCLF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ + +/* class flags only for root class */ +#define CBQCLF_WRR 0x0100 /* weighted-round robin */ +#define CBQCLF_EFFICIENT 0x0200 /* work-conserving */ + +/* class flags for special classes */ +#define CBQCLF_ROOTCLASS 0x1000 /* root class */ +#define CBQCLF_DEFCLASS 0x2000 /* default class */ +#define CBQCLF_CTLCLASS 0x4000 /* control class */ +#define CBQCLF_CLASSMASK 0xf000 /* class mask */ + +#define CBQ_MAXQSIZE 200 + +struct cbq_add_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_long cbq_class_handle; +}; + +struct cbq_delete_class { + struct cbq_interface cbq_iface; + u_long cbq_class_handle; +}; + +struct cbq_modify_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_long cbq_class_handle; +}; + +struct cbq_add_filter { + struct cbq_interface cbq_iface; + u_long cbq_class_handle; + struct flow_filter cbq_filter; + + u_long cbq_filter_handle; +}; + +struct cbq_delete_filter { + struct cbq_interface cbq_iface; + u_long cbq_filter_handle; +}; + +typedef struct _cbq_class_stats_ { + u_int handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ + + /* other static class parameters useful for debugging */ + int priority; + int maxidle; + int minidle; + int offtime; + int qmax; + int ns_per_byte; + int wrr_allot; + + int qcnt; /* # packets in queue */ + int avgidle; + + /* red and rio related info */ + int qtype; + struct redstats red[3]; +} class_stats_t; + +/* number of classes are returned in nclasses field */ +struct cbq_getstats { + struct cbq_interface iface; + int nclasses; + class_stats_t *stats; +}; + +/* + * Define IOCTLs for CBQ. + */ +#define CBQ_IF_ATTACH _IOW('Q', 1, struct cbq_interface) +#define CBQ_IF_DETACH _IOW('Q', 2, struct cbq_interface) +#define CBQ_ENABLE _IOW('Q', 3, struct cbq_interface) +#define CBQ_DISABLE _IOW('Q', 4, struct cbq_interface) +#define CBQ_CLEAR_HIERARCHY _IOW('Q', 5, struct cbq_interface) +#define CBQ_ADD_CLASS _IOWR('Q', 7, struct cbq_add_class) +#define CBQ_DEL_CLASS _IOW('Q', 8, struct cbq_delete_class) +#define CBQ_MODIFY_CLASS _IOWR('Q', 9, struct cbq_modify_class) +#define CBQ_ADD_FILTER _IOWR('Q', 10, struct cbq_add_filter) +#define CBQ_DEL_FILTER _IOW('Q', 11, struct cbq_delete_filter) +#define CBQ_GETSTATS _IOWR('Q', 12, struct cbq_getstats) + +#ifdef _KERNEL +/* + * Define macros only good for kernel drivers and modules. + */ + +#define DISABLE 0x00 +#define ENABLE 0x01 + +#define CBQ_WATCHDOG (HZ / 20) +#define CBQ_TIMEOUT 10 +#define CBQ_LS_TIMEOUT (20 * hz / 1000) + +#define CBQ_MAX_CLASSES 256 +#define CBQ_MAX_FILTERS 256 + +/* + * Define State structures. 
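+ * One cbq_state_t is allocated for each interface CBQ is attached to;
+ * instances are linked through cbq_next into cbq_list (altq_cbq.c).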
+ */ +typedef struct cbqstate { + struct cbqstate *cbq_next; + int cbq_qlen; /* # of packets in cbq */ + struct rm_class **cbq_class_tbl; + + struct rm_ifdat ifnp; + struct callout cbq_callout; /* for timeouts */ + + struct acc_classifier cbq_classifier; +} cbq_state_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* !_ALTQ_ALTQ_CBQ_H_ */ diff --git a/sys/altq/altq_cdnr.c b/sys/altq/altq_cdnr.c new file mode 100644 index 000000000000..77dbdeac5526 --- /dev/null +++ b/sys/altq/altq_cdnr.c @@ -0,0 +1,1374 @@ +/* $KAME: altq_cdnr.c,v 1.8 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1999-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include + +/* + * diffserv traffic conditioning module + */ + +int altq_cdnr_enabled = 0; + +/* traffic conditioner is enabled by ALTQ_CDNR option in opt_altq.h */ +#ifdef ALTQ_CDNR + +/* cdnr_list keeps all cdnr's allocated. 
*/ +static LIST_HEAD(, top_cdnr) tcb_list; + +int cdnropen __P((dev_t, int, int, struct proc *)); +int cdnrclose __P((dev_t, int, int, struct proc *)); +int cdnrioctl __P((dev_t, ioctlcmd_t, caddr_t, int, struct proc *)); + +static int altq_cdnr_input __P((struct mbuf *, int)); +static struct top_cdnr *tcb_lookup __P((char *ifname)); +static struct cdnr_block *cdnr_handle2cb __P((u_long)); +static u_long cdnr_cb2handle __P((struct cdnr_block *)); +static void *cdnr_cballoc __P((struct top_cdnr *, int, + struct tc_action *(*)(struct cdnr_block *, struct cdnr_pktinfo *))); +static void cdnr_cbdestroy __P((void *)); +static int tca_verify_action __P((struct tc_action *)); +static void tca_import_action __P((struct tc_action *, struct tc_action *)); +static void tca_invalidate_action __P((struct tc_action *)); + +static int generic_element_destroy __P((struct cdnr_block *)); +static struct top_cdnr *top_create __P((struct ifaltq *)); +static int top_destroy __P((struct top_cdnr *)); +static struct cdnr_block *element_create __P((struct top_cdnr *, + struct tc_action *)); +static int element_destroy __P((struct cdnr_block *)); +static void tb_import_profile __P((struct tbe *, struct tb_profile *)); +static struct tbmeter *tbm_create __P((struct top_cdnr *, struct tb_profile *, + struct tc_action *, struct tc_action *)); +static int tbm_destroy __P((struct tbmeter *)); +static struct tc_action *tbm_input __P((struct cdnr_block *, + struct cdnr_pktinfo *)); +static struct trtcm *trtcm_create __P((struct top_cdnr *, + struct tb_profile *, struct tb_profile *, + struct tc_action *, struct tc_action *, struct tc_action *, + int)); +static int trtcm_destroy __P((struct trtcm *)); +static struct tc_action *trtcm_input __P((struct cdnr_block *, + struct cdnr_pktinfo *)); +static struct tswtcm *tswtcm_create __P((struct top_cdnr *, + u_int32_t, u_int32_t, u_int32_t, + struct tc_action *, struct tc_action *, struct tc_action *)); +static int tswtcm_destroy __P((struct tswtcm *)); +static struct tc_action *tswtcm_input __P((struct cdnr_block *, + struct cdnr_pktinfo *)); + +static int cdnrcmd_if_attach __P((char *)); +static int cdnrcmd_if_detach __P((char *)); +static int cdnrcmd_add_element __P((struct cdnr_add_element *)); +static int cdnrcmd_delete_element __P((struct cdnr_delete_element *)); +static int cdnrcmd_add_filter __P((struct cdnr_add_filter *)); +static int cdnrcmd_delete_filter __P((struct cdnr_delete_filter *)); +static int cdnrcmd_add_tbm __P((struct cdnr_add_tbmeter *)); +static int cdnrcmd_modify_tbm __P((struct cdnr_modify_tbmeter *)); +static int cdnrcmd_tbm_stats __P((struct cdnr_tbmeter_stats *)); +static int cdnrcmd_add_trtcm __P((struct cdnr_add_trtcm *)); +static int cdnrcmd_modify_trtcm __P((struct cdnr_modify_trtcm *)); +static int cdnrcmd_tcm_stats __P((struct cdnr_tcm_stats *)); +static int cdnrcmd_add_tswtcm __P((struct cdnr_add_tswtcm *)); +static int cdnrcmd_modify_tswtcm __P((struct cdnr_modify_tswtcm *)); +static int cdnrcmd_get_stats __P((struct cdnr_get_stats *)); + +/* + * top level input function called from ip_input. + * should be called before converting header fields to host-byte-order. 
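+ * returns 1 if the caller should continue to process the packet,
+ * 0 if the packet has been dropped or consumed by the conditioner.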
+ */ +int +altq_cdnr_input(m, af) + struct mbuf *m; + int af; /* address family */ +{ + struct ifnet *ifp; + struct ip *ip; + struct top_cdnr *top; + struct tc_action *tca; + struct cdnr_block *cb; + struct cdnr_pktinfo pktinfo; + + ifp = m->m_pkthdr.rcvif; + if (!ALTQ_IS_CNDTNING(&ifp->if_snd)) + /* traffic conditioner is not enabled on this interface */ + return (1); + + top = ifp->if_snd.altq_cdnr; + + ip = mtod(m, struct ip *); +#ifdef INET6 + if (af == AF_INET6) { + u_int32_t flowlabel; + + flowlabel = ((struct ip6_hdr *)ip)->ip6_flow; + pktinfo.pkt_dscp = (ntohl(flowlabel) >> 20) & DSCP_MASK; + } else +#endif + pktinfo.pkt_dscp = ip->ip_tos & DSCP_MASK; + pktinfo.pkt_len = m_pktlen(m); + + tca = NULL; + + cb = acc_classify(&top->tc_classifier, m, af); + if (cb != NULL) + tca = &cb->cb_action; + + if (tca == NULL) + tca = &top->tc_block.cb_action; + + while (1) { + PKTCNTR_ADD(&top->tc_cnts[tca->tca_code], pktinfo.pkt_len); + + switch (tca->tca_code) { + case TCACODE_PASS: + return (1); + case TCACODE_DROP: + m_freem(m); + return (0); + case TCACODE_RETURN: + return (0); + case TCACODE_MARK: +#ifdef INET6 + if (af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + flowlabel = (tca->tca_dscp << 20) | + (flowlabel & ~(DSCP_MASK << 20)); + ip6->ip6_flow = htonl(flowlabel); + } else +#endif + ip->ip_tos = tca->tca_dscp | + (ip->ip_tos & DSCP_CUMASK); + return (1); + case TCACODE_NEXT: + cb = tca->tca_next; + tca = (*cb->cb_input)(cb, &pktinfo); + break; + case TCACODE_NONE: + default: + return (1); + } + } +} + +static struct top_cdnr * +tcb_lookup(ifname) + char *ifname; +{ + struct top_cdnr *top; + struct ifnet *ifp; + + if ((ifp = ifunit(ifname)) != NULL) + LIST_FOREACH(top, &tcb_list, tc_next) + if (top->tc_ifq->altq_ifp == ifp) + return (top); + return (NULL); +} + +static struct cdnr_block * +cdnr_handle2cb(handle) + u_long handle; +{ + struct cdnr_block *cb; + + cb = (struct cdnr_block *)handle; + if (handle != ALIGN(cb)) + return (NULL); + + if (cb == NULL || cb->cb_handle != handle) + return (NULL); + return (cb); +} + +static u_long +cdnr_cb2handle(cb) + struct cdnr_block *cb; +{ + return (cb->cb_handle); +} + +static void * +cdnr_cballoc(top, type, input_func) + struct top_cdnr *top; + int type; + struct tc_action *(*input_func)(struct cdnr_block *, + struct cdnr_pktinfo *); +{ + struct cdnr_block *cb; + int size; + + switch (type) { + case TCETYPE_TOP: + size = sizeof(struct top_cdnr); + break; + case TCETYPE_ELEMENT: + size = sizeof(struct cdnr_block); + break; + case TCETYPE_TBMETER: + size = sizeof(struct tbmeter); + break; + case TCETYPE_TRTCM: + size = sizeof(struct trtcm); + break; + case TCETYPE_TSWTCM: + size = sizeof(struct tswtcm); + break; + default: + return (NULL); + } + + MALLOC(cb, struct cdnr_block *, size, M_DEVBUF, M_WAITOK); + if (cb == NULL) + return (NULL); + bzero(cb, size); + + cb->cb_len = size; + cb->cb_type = type; + cb->cb_ref = 0; + cb->cb_handle = (u_long)cb; + if (top == NULL) + cb->cb_top = (struct top_cdnr *)cb; + else + cb->cb_top = top; + + if (input_func != NULL) { + /* + * if this cdnr has an action function, + * make tc_action to call itself. 
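+		 * (tca_code is set to TCACODE_NEXT and tca_next points back
+		 * to this block, so processing enters through cb_input.)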
+ */ + cb->cb_action.tca_code = TCACODE_NEXT; + cb->cb_action.tca_next = cb; + cb->cb_input = input_func; + } else + cb->cb_action.tca_code = TCACODE_NONE; + + /* if this isn't top, register the element to the top level cdnr */ + if (top != NULL) + LIST_INSERT_HEAD(&top->tc_elements, cb, cb_next); + + return ((void *)cb); +} + +static void +cdnr_cbdestroy(cblock) + void *cblock; +{ + struct cdnr_block *cb = cblock; + + /* delete filters belonging to this cdnr */ + acc_discard_filters(&cb->cb_top->tc_classifier, cb, 0); + + /* remove from the top level cdnr */ + if (cb->cb_top != cblock) + LIST_REMOVE(cb, cb_next); + + FREE(cb, M_DEVBUF); +} + +/* + * conditioner common destroy routine + */ +static int +generic_element_destroy(cb) + struct cdnr_block *cb; +{ + int error = 0; + + switch (cb->cb_type) { + case TCETYPE_TOP: + error = top_destroy((struct top_cdnr *)cb); + break; + case TCETYPE_ELEMENT: + error = element_destroy(cb); + break; + case TCETYPE_TBMETER: + error = tbm_destroy((struct tbmeter *)cb); + break; + case TCETYPE_TRTCM: + error = trtcm_destroy((struct trtcm *)cb); + break; + case TCETYPE_TSWTCM: + error = tswtcm_destroy((struct tswtcm *)cb); + break; + default: + error = EINVAL; + } + return (error); +} + +static int +tca_verify_action(utca) + struct tc_action *utca; +{ + switch (utca->tca_code) { + case TCACODE_PASS: + case TCACODE_DROP: + case TCACODE_MARK: + /* these are ok */ + break; + + case TCACODE_HANDLE: + /* verify handle value */ + if (cdnr_handle2cb(utca->tca_handle) == NULL) + return (-1); + break; + + case TCACODE_NONE: + case TCACODE_RETURN: + case TCACODE_NEXT: + default: + /* should not be passed from a user */ + return (-1); + } + return (0); +} + +static void +tca_import_action(ktca, utca) + struct tc_action *ktca, *utca; +{ + struct cdnr_block *cb; + + *ktca = *utca; + if (ktca->tca_code == TCACODE_HANDLE) { + cb = cdnr_handle2cb(ktca->tca_handle); + if (cb == NULL) { + ktca->tca_code = TCACODE_NONE; + return; + } + ktca->tca_code = TCACODE_NEXT; + ktca->tca_next = cb; + cb->cb_ref++; + } else if (ktca->tca_code == TCACODE_MARK) { + ktca->tca_dscp &= DSCP_MASK; + } + return; +} + +static void +tca_invalidate_action(tca) + struct tc_action *tca; +{ + struct cdnr_block *cb; + + if (tca->tca_code == TCACODE_NEXT) { + cb = tca->tca_next; + if (cb == NULL) + return; + cb->cb_ref--; + } + tca->tca_code = TCACODE_NONE; +} + +/* + * top level traffic conditioner + */ +static struct top_cdnr * +top_create(ifq) + struct ifaltq *ifq; +{ + struct top_cdnr *top; + + if ((top = cdnr_cballoc(NULL, TCETYPE_TOP, NULL)) == NULL) + return (NULL); + + top->tc_ifq = ifq; + /* set default action for the top level conditioner */ + top->tc_block.cb_action.tca_code = TCACODE_PASS; + + LIST_INSERT_HEAD(&tcb_list, top, tc_next); + + ifq->altq_cdnr = top; + + return (top); +} + +static int +top_destroy(top) + struct top_cdnr *top; +{ + struct cdnr_block *cb; + + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + top->tc_ifq->altq_cdnr = NULL; + + /* + * destroy all the conditioner elements belonging to this interface + */ + while ((cb = LIST_FIRST(&top->tc_elements)) != NULL) { + while (cb != NULL && cb->cb_ref > 0) + cb = LIST_NEXT(cb, cb_next); + if (cb != NULL) + generic_element_destroy(cb); + } + + LIST_REMOVE(top, tc_next); + + cdnr_cbdestroy(top); + + /* if there is no active conditioner, remove the input hook */ + if (altq_input != NULL) { + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + 
altq_input = NULL; + } + + return (0); +} + +/* + * simple tc elements without input function (e.g., dropper and makers). + */ +static struct cdnr_block * +element_create(top, action) + struct top_cdnr *top; + struct tc_action *action; +{ + struct cdnr_block *cb; + + if (tca_verify_action(action) < 0) + return (NULL); + + if ((cb = cdnr_cballoc(top, TCETYPE_ELEMENT, NULL)) == NULL) + return (NULL); + + tca_import_action(&cb->cb_action, action); + + return (cb); +} + +static int +element_destroy(cb) + struct cdnr_block *cb; +{ + if (cb->cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&cb->cb_action); + + cdnr_cbdestroy(cb); + return (0); +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TB_SHIFT 32 +#define TB_SCALE(x) ((u_int64_t)(x) << TB_SHIFT) +#define TB_UNSCALE(x) ((x) >> TB_SHIFT) + +static void +tb_import_profile(tb, profile) + struct tbe *tb; + struct tb_profile *profile; +{ + tb->rate = TB_SCALE(profile->rate / 8) / machclk_freq; + tb->depth = TB_SCALE(profile->depth); + if (tb->rate > 0) + tb->filluptime = tb->depth / tb->rate; + else + tb->filluptime = 0xffffffffffffffffLL; + tb->token = tb->depth; + tb->last = read_machclk(); +} + +/* + * simple token bucket meter + */ +static struct tbmeter * +tbm_create(top, profile, in_action, out_action) + struct top_cdnr *top; + struct tb_profile *profile; + struct tc_action *in_action, *out_action; +{ + struct tbmeter *tbm = NULL; + + if (tca_verify_action(in_action) < 0 + || tca_verify_action(out_action) < 0) + return (NULL); + + if ((tbm = cdnr_cballoc(top, TCETYPE_TBMETER, + tbm_input)) == NULL) + return (NULL); + + tb_import_profile(&tbm->tb, profile); + + tca_import_action(&tbm->in_action, in_action); + tca_import_action(&tbm->out_action, out_action); + + return (tbm); +} + +static int +tbm_destroy(tbm) + struct tbmeter *tbm; +{ + if (tbm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tbm->in_action); + tca_invalidate_action(&tbm->out_action); + + cdnr_cbdestroy(tbm); + return (0); +} + +static struct tc_action * +tbm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tbmeter *tbm = (struct tbmeter *)cb; + u_int64_t len; + u_int64_t interval, now; + + len = TB_SCALE(pktinfo->pkt_len); + + if (tbm->tb.token < len) { + now = read_machclk(); + interval = now - tbm->tb.last; + if (interval >= tbm->tb.filluptime) + tbm->tb.token = tbm->tb.depth; + else { + tbm->tb.token += interval * tbm->tb.rate; + if (tbm->tb.token > tbm->tb.depth) + tbm->tb.token = tbm->tb.depth; + } + tbm->tb.last = now; + } + + if (tbm->tb.token < len) { + PKTCNTR_ADD(&tbm->out_cnt, pktinfo->pkt_len); + return (&tbm->out_action); + } + + tbm->tb.token -= len; + PKTCNTR_ADD(&tbm->in_cnt, pktinfo->pkt_len); + return (&tbm->in_action); +} + +/* + * two rate three color marker + * as described in draft-heinanen-diffserv-trtcm-01.txt + */ +static struct trtcm * +trtcm_create(top, cmtd_profile, peak_profile, + green_action, yellow_action, red_action, coloraware) + struct top_cdnr *top; + struct tb_profile *cmtd_profile, *peak_profile; + struct tc_action *green_action, *yellow_action, *red_action; + int coloraware; +{ + struct trtcm *tcm = NULL; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tcm = cdnr_cballoc(top, TCETYPE_TRTCM, + trtcm_input)) == NULL) + return (NULL); + + 
tb_import_profile(&tcm->cmtd_tb, cmtd_profile); + tb_import_profile(&tcm->peak_tb, peak_profile); + + tca_import_action(&tcm->green_action, green_action); + tca_import_action(&tcm->yellow_action, yellow_action); + tca_import_action(&tcm->red_action, red_action); + + /* set dscps to use */ + if (tcm->green_action.tca_code == TCACODE_MARK) + tcm->green_dscp = tcm->green_action.tca_dscp & DSCP_MASK; + else + tcm->green_dscp = DSCP_AF11; + if (tcm->yellow_action.tca_code == TCACODE_MARK) + tcm->yellow_dscp = tcm->yellow_action.tca_dscp & DSCP_MASK; + else + tcm->yellow_dscp = DSCP_AF12; + if (tcm->red_action.tca_code == TCACODE_MARK) + tcm->red_dscp = tcm->red_action.tca_dscp & DSCP_MASK; + else + tcm->red_dscp = DSCP_AF13; + + tcm->coloraware = coloraware; + + return (tcm); +} + +static int +trtcm_destroy(tcm) + struct trtcm *tcm; +{ + if (tcm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tcm->green_action); + tca_invalidate_action(&tcm->yellow_action); + tca_invalidate_action(&tcm->red_action); + + cdnr_cbdestroy(tcm); + return (0); +} + +static struct tc_action * +trtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct trtcm *tcm = (struct trtcm *)cb; + u_int64_t len; + u_int64_t interval, now; + u_int8_t color; + + len = TB_SCALE(pktinfo->pkt_len); + if (tcm->coloraware) { + color = pktinfo->pkt_dscp; + if (color != tcm->yellow_dscp && color != tcm->red_dscp) + color = tcm->green_dscp; + } else { + /* if color-blind, precolor it as green */ + color = tcm->green_dscp; + } + + now = read_machclk(); + if (tcm->cmtd_tb.token < len) { + interval = now - tcm->cmtd_tb.last; + if (interval >= tcm->cmtd_tb.filluptime) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + else { + tcm->cmtd_tb.token += interval * tcm->cmtd_tb.rate; + if (tcm->cmtd_tb.token > tcm->cmtd_tb.depth) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + } + tcm->cmtd_tb.last = now; + } + if (tcm->peak_tb.token < len) { + interval = now - tcm->peak_tb.last; + if (interval >= tcm->peak_tb.filluptime) + tcm->peak_tb.token = tcm->peak_tb.depth; + else { + tcm->peak_tb.token += interval * tcm->peak_tb.rate; + if (tcm->peak_tb.token > tcm->peak_tb.depth) + tcm->peak_tb.token = tcm->peak_tb.depth; + } + tcm->peak_tb.last = now; + } + + if (color == tcm->red_dscp || tcm->peak_tb.token < len) { + pktinfo->pkt_dscp = tcm->red_dscp; + PKTCNTR_ADD(&tcm->red_cnt, pktinfo->pkt_len); + return (&tcm->red_action); + } + + if (color == tcm->yellow_dscp || tcm->cmtd_tb.token < len) { + pktinfo->pkt_dscp = tcm->yellow_dscp; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->yellow_cnt, pktinfo->pkt_len); + return (&tcm->yellow_action); + } + + pktinfo->pkt_dscp = tcm->green_dscp; + tcm->cmtd_tb.token -= len; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->green_cnt, pktinfo->pkt_len); + return (&tcm->green_action); +} + +/* + * time sliding window three color marker + * as described in draft-fang-diffserv-tc-tswtcm-00.txt + */ +static struct tswtcm * +tswtcm_create(top, cmtd_rate, peak_rate, avg_interval, + green_action, yellow_action, red_action) + struct top_cdnr *top; + u_int32_t cmtd_rate, peak_rate, avg_interval; + struct tc_action *green_action, *yellow_action, *red_action; +{ + struct tswtcm *tsw; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tsw = cdnr_cballoc(top, TCETYPE_TSWTCM, + tswtcm_input)) == NULL) + return (NULL); + + tca_import_action(&tsw->green_action, green_action); + 
tca_import_action(&tsw->yellow_action, yellow_action); + tca_import_action(&tsw->red_action, red_action); + + /* set dscps to use */ + if (tsw->green_action.tca_code == TCACODE_MARK) + tsw->green_dscp = tsw->green_action.tca_dscp & DSCP_MASK; + else + tsw->green_dscp = DSCP_AF11; + if (tsw->yellow_action.tca_code == TCACODE_MARK) + tsw->yellow_dscp = tsw->yellow_action.tca_dscp & DSCP_MASK; + else + tsw->yellow_dscp = DSCP_AF12; + if (tsw->red_action.tca_code == TCACODE_MARK) + tsw->red_dscp = tsw->red_action.tca_dscp & DSCP_MASK; + else + tsw->red_dscp = DSCP_AF13; + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = cmtd_rate / 8; + tsw->peak_rate = peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * avg_interval / 1000; + + return (tsw); +} + +static int +tswtcm_destroy(tsw) + struct tswtcm *tsw; +{ + if (tsw->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tsw->green_action); + tca_invalidate_action(&tsw->yellow_action); + tca_invalidate_action(&tsw->red_action); + + cdnr_cbdestroy(tsw); + return (0); +} + +static struct tc_action * +tswtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tswtcm *tsw = (struct tswtcm *)cb; + int len; + u_int32_t avg_rate; + u_int64_t interval, now, tmp; + + /* + * rate estimator + */ + len = pktinfo->pkt_len; + now = read_machclk(); + + interval = now - tsw->t_front; + /* + * calculate average rate: + * avg = (avg * timewin + pkt_len)/(timewin + interval) + * pkt_len needs to be multiplied by machclk_freq in order to + * get (bytes/sec). + * note: when avg_rate (bytes/sec) and timewin (machclk unit) are + * less than 32 bits, the following 64-bit operation has enough + * precision. 
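+	 * e.g. a 1Gbps average rate is about 2^27 bytes/sec, so with a
+	 * machclk frequency below 2^32 both products fit in 64 bits.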
+ */ + tmp = ((u_int64_t)tsw->avg_rate * tsw->timewin + + (u_int64_t)len * machclk_freq) / (tsw->timewin + interval); + tsw->avg_rate = avg_rate = (u_int32_t)tmp; + tsw->t_front = now; + + /* + * marker + */ + if (avg_rate > tsw->cmtd_rate) { + u_int32_t randval = random() % avg_rate; + + if (avg_rate > tsw->peak_rate) { + if (randval < avg_rate - tsw->peak_rate) { + /* mark red */ + pktinfo->pkt_dscp = tsw->red_dscp; + PKTCNTR_ADD(&tsw->red_cnt, len); + return (&tsw->red_action); + } else if (randval < avg_rate - tsw->cmtd_rate) + goto mark_yellow; + } else { + /* peak_rate >= avg_rate > cmtd_rate */ + if (randval < avg_rate - tsw->cmtd_rate) { + mark_yellow: + pktinfo->pkt_dscp = tsw->yellow_dscp; + PKTCNTR_ADD(&tsw->yellow_cnt, len); + return (&tsw->yellow_action); + } + } + } + + /* mark green */ + pktinfo->pkt_dscp = tsw->green_dscp; + PKTCNTR_ADD(&tsw->green_cnt, len); + return (&tsw->green_action); +} + +/* + * ioctl requests + */ +static int +cdnrcmd_if_attach(ifname) + char *ifname; +{ + struct ifnet *ifp; + struct top_cdnr *top; + + if ((ifp = ifunit(ifname)) == NULL) + return (EBADF); + + if (ifp->if_snd.altq_cdnr != NULL) + return (EBUSY); + + if ((top = top_create(&ifp->if_snd)) == NULL) + return (ENOMEM); + return (0); +} + +static int +cdnrcmd_if_detach(ifname) + char *ifname; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ifname)) == NULL) + return (EBADF); + + return top_destroy(top); +} + +static int +cdnrcmd_add_element(ap) + struct cdnr_add_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + cb = element_create(top, &ap->action); + if (cb == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(cb); + return (0); +} + +static int +cdnrcmd_delete_element(ap) + struct cdnr_delete_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type != TCETYPE_ELEMENT) + return generic_element_destroy(cb); + + return element_destroy(cb); +} + +static int +cdnrcmd_add_filter(ap) + struct cdnr_add_filter *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&top->tc_classifier, &ap->filter, + cb, &ap->filter_handle); +} + +static int +cdnrcmd_delete_filter(ap) + struct cdnr_delete_filter *ap; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + return acc_delete_filter(&top->tc_classifier, ap->filter_handle); +} + +static int +cdnrcmd_add_tbm(ap) + struct cdnr_add_tbmeter *ap; +{ + struct top_cdnr *top; + struct tbmeter *tbm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tbm = tbm_create(top, &ap->profile, &ap->in_action, &ap->out_action); + if (tbm == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tbm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tbm(ap) + struct cdnr_modify_tbmeter *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tbm->tb, &ap->profile); + + return (0); +} + +static int +cdnrcmd_tbm_stats(ap) + struct 
cdnr_tbmeter_stats *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + ap->in_cnt = tbm->in_cnt; + ap->out_cnt = tbm->out_cnt; + + return (0); +} + +static int +cdnrcmd_add_trtcm(ap) + struct cdnr_add_trtcm *ap; +{ + struct top_cdnr *top; + struct trtcm *tcm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tcm = trtcm_create(top, &ap->cmtd_profile, &ap->peak_profile, + &ap->green_action, &ap->yellow_action, + &ap->red_action, ap->coloraware); + if (tcm == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tcm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_trtcm(ap) + struct cdnr_modify_trtcm *ap; +{ + struct trtcm *tcm; + + if ((tcm = (struct trtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tcm->cmtd_tb, &ap->cmtd_profile); + tb_import_profile(&tcm->peak_tb, &ap->peak_profile); + + return (0); +} + +static int +cdnrcmd_tcm_stats(ap) + struct cdnr_tcm_stats *ap; +{ + struct cdnr_block *cb; + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type == TCETYPE_TRTCM) { + struct trtcm *tcm = (struct trtcm *)cb; + + ap->green_cnt = tcm->green_cnt; + ap->yellow_cnt = tcm->yellow_cnt; + ap->red_cnt = tcm->red_cnt; + } else if (cb->cb_type == TCETYPE_TSWTCM) { + struct tswtcm *tsw = (struct tswtcm *)cb; + + ap->green_cnt = tsw->green_cnt; + ap->yellow_cnt = tsw->yellow_cnt; + ap->red_cnt = tsw->red_cnt; + } else + return (EINVAL); + + return (0); +} + +static int +cdnrcmd_add_tswtcm(ap) + struct cdnr_add_tswtcm *ap; +{ + struct top_cdnr *top; + struct tswtcm *tsw; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + tsw = tswtcm_create(top, ap->cmtd_rate, ap->peak_rate, + ap->avg_interval, &ap->green_action, + &ap->yellow_action, &ap->red_action); + if (tsw == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tsw->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tswtcm(ap) + struct cdnr_modify_tswtcm *ap; +{ + struct tswtcm *tsw; + + if ((tsw = (struct tswtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = ap->cmtd_rate / 8; + tsw->peak_rate = ap->peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * ap->avg_interval / 1000; + + return (0); +} + +static int +cdnrcmd_get_stats(ap) + struct cdnr_get_stats *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + struct tbmeter *tbm; + struct trtcm *tcm; + struct tswtcm *tsw; + struct tce_stats tce, *usp; + int error, n, nskip, nelements; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + /* copy action stats */ + bcopy(top->tc_cnts, ap->cnts, sizeof(ap->cnts)); + + /* stats for each element */ + nelements = ap->nelements; + usp = ap->tce_stats; + if (nelements <= 0 || usp == NULL) + return (0); + + nskip = ap->nskip; + n = 0; + LIST_FOREACH(cb, &top->tc_elements, cb_next) { + if (nskip > 0) { + nskip--; + continue; + } + + bzero(&tce, sizeof(tce)); + tce.tce_handle = cb->cb_handle; + tce.tce_type = cb->cb_type; + switch (cb->cb_type) { + case TCETYPE_TBMETER: + tbm = (struct tbmeter *)cb; + 
tce.tce_cnts[0] = tbm->in_cnt; + tce.tce_cnts[1] = tbm->out_cnt; + break; + case TCETYPE_TRTCM: + tcm = (struct trtcm *)cb; + tce.tce_cnts[0] = tcm->green_cnt; + tce.tce_cnts[1] = tcm->yellow_cnt; + tce.tce_cnts[2] = tcm->red_cnt; + break; + case TCETYPE_TSWTCM: + tsw = (struct tswtcm *)cb; + tce.tce_cnts[0] = tsw->green_cnt; + tce.tce_cnts[1] = tsw->yellow_cnt; + tce.tce_cnts[2] = tsw->red_cnt; + break; + default: + continue; + } + + if ((error = copyout((caddr_t)&tce, (caddr_t)usp++, + sizeof(tce))) != 0) + return (error); + + if (++n == nelements) + break; + } + ap->nelements = n; + + return (0); +} + +/* + * conditioner device interface + */ +int +cdnropen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("cdnr: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +cdnrclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + struct top_cdnr *top; + int err, error = 0; + + while ((top = LIST_FIRST(&tcb_list)) != NULL) { + /* destroy all */ + err = top_destroy(top); + if (err != 0 && error == 0) + error = err; + } + altq_input = NULL; + + return (error); +} + +int +cdnrioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + struct top_cdnr *top; + struct cdnr_interface *ifacep; + int s, error = 0; + + /* check super-user privilege */ + switch (cmd) { + case CDNR_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + s = splimp(); + switch (cmd) { + + case CDNR_IF_ATTACH: + ifacep = (struct cdnr_interface *)addr; + error = cdnrcmd_if_attach(ifacep->cdnr_ifname); + break; + + case CDNR_IF_DETACH: + ifacep = (struct cdnr_interface *)addr; + error = cdnrcmd_if_detach(ifacep->cdnr_ifname); + break; + + case CDNR_ENABLE: + case CDNR_DISABLE: + ifacep = (struct cdnr_interface *)addr; + if ((top = tcb_lookup(ifacep->cdnr_ifname)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case CDNR_ENABLE: + ALTQ_SET_CNDTNING(top->tc_ifq); + if (altq_input == NULL) + altq_input = altq_cdnr_input; + break; + + case CDNR_DISABLE: + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + altq_input = NULL; + break; + } + break; + + case CDNR_ADD_ELEM: + error = cdnrcmd_add_element((struct cdnr_add_element *)addr); + break; + + case CDNR_DEL_ELEM: + error = cdnrcmd_delete_element((struct cdnr_delete_element *)addr); + break; + + case CDNR_ADD_TBM: + error = cdnrcmd_add_tbm((struct cdnr_add_tbmeter *)addr); + break; + + case CDNR_MOD_TBM: + error = cdnrcmd_modify_tbm((struct cdnr_modify_tbmeter *)addr); + break; + + case CDNR_TBM_STATS: + error = cdnrcmd_tbm_stats((struct cdnr_tbmeter_stats *)addr); + break; + + case CDNR_ADD_TCM: + error = cdnrcmd_add_trtcm((struct cdnr_add_trtcm *)addr); + break; + + case CDNR_MOD_TCM: + error = cdnrcmd_modify_trtcm((struct cdnr_modify_trtcm *)addr); + break; + + case CDNR_TCM_STATS: + error = cdnrcmd_tcm_stats((struct cdnr_tcm_stats *)addr); + break; + + case CDNR_ADD_FILTER: + error = cdnrcmd_add_filter((struct cdnr_add_filter *)addr); + break; + + case CDNR_DEL_FILTER: + error = cdnrcmd_delete_filter((struct cdnr_delete_filter *)addr); + break; + + case CDNR_GETSTATS: 
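+		/* no privilege check needed; CDNR_GETSTATS is exempted above */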
+ error = cdnrcmd_get_stats((struct cdnr_get_stats *)addr); + break; + + case CDNR_ADD_TSW: + error = cdnrcmd_add_tswtcm((struct cdnr_add_tswtcm *)addr); + break; + + case CDNR_MOD_TSW: + error = cdnrcmd_modify_tswtcm((struct cdnr_modify_tswtcm *)addr); + break; + + default: + error = EINVAL; + break; + } + splx(s); + + return error; +} + +#ifdef KLD_MODULE + +static struct altqsw cdnr_sw = + {"cdnr", cdnropen, cdnrclose, cdnrioctl}; + +ALTQ_MODULE(altq_cdnr, ALTQT_CDNR, &cdnr_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_CDNR */ diff --git a/sys/altq/altq_cdnr.h b/sys/altq/altq_cdnr.h new file mode 100644 index 000000000000..bd199e70fe4d --- /dev/null +++ b/sys/altq/altq_cdnr.h @@ -0,0 +1,333 @@ +/* $KAME: altq_cdnr.h,v 1.6 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1999-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_CDNR_H_ +#define _ALTQ_ALTQ_CDNR_H_ + +#include + +/* + * traffic conditioner element types + */ +#define TCETYPE_NONE 0 +#define TCETYPE_TOP 1 /* top level conditioner */ +#define TCETYPE_ELEMENT 2 /* a simple tc element */ +#define TCETYPE_TBMETER 3 /* token bucket meter */ +#define TCETYPE_TRTCM 4 /* (two-rate) three color marker */ +#define TCETYPE_TSWTCM 5 /* time sliding window 3-color maker */ + +/* + * traffic conditioner action + */ +struct cdnr_block; + +struct tc_action { + int tca_code; /* e.g., TCACODE_PASS */ + /* tca_code dependent variable */ + union { + u_long un_value; /* template */ + u_int8_t un_dscp; /* diffserv code point */ + u_long un_handle; /* tc action handle */ + struct cdnr_block *un_next; /* next tc element block */ + } tca_un; +}; +#define tca_value tca_un.un_value +#define tca_dscp tca_un.un_dscp +#define tca_handle tca_un.un_handle +#define tca_next tca_un.un_next + +#define TCACODE_NONE 0 /* action is not set */ +#define TCACODE_PASS 1 /* pass this packet */ +#define TCACODE_DROP 2 /* discard this packet */ +#define TCACODE_RETURN 3 /* do not process this packet */ +#define TCACODE_MARK 4 /* mark dscp */ +#define TCACODE_HANDLE 5 /* take action specified by handle */ +#define TCACODE_NEXT 6 /* take action in the next tc element */ +#define TCACODE_MAX 6 + +#define CDNR_NULL_HANDLE 0 + +struct cdnr_interface { + char cdnr_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +/* simple element operations */ +struct cdnr_add_element { + struct cdnr_interface iface; + struct tc_action action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_delete_element { + struct cdnr_interface iface; + u_long cdnr_handle; +}; + +/* token-bucket meter operations */ +struct cdnr_add_tbmeter { + struct cdnr_interface iface; + struct tb_profile profile; + struct tc_action in_action; + struct tc_action out_action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tbmeter { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile profile; +}; + +struct cdnr_tbmeter_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr in_cnt; + struct pktcntr out_cnt; +}; + +/* two-rate three-color marker operations */ +struct cdnr_add_trtcm { + struct cdnr_interface iface; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets */ + struct tc_action red_action; /* action for red packets */ + int coloraware; /* color-aware/color-blind */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_trtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + int coloraware; /* color-aware/color-blind */ +}; + +struct cdnr_tcm_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker operations */ +struct cdnr_add_tswtcm { + struct cdnr_interface iface; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets 
*/ + struct tc_action red_action; /* action for red packets */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tswtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ +}; + +struct cdnr_add_filter { + struct cdnr_interface iface; + u_long cdnr_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct cdnr_delete_filter { + struct cdnr_interface iface; + u_long filter_handle; +}; + +struct tce_stats { + u_long tce_handle; /* tc element handle */ + int tce_type; /* e.g., TCETYPE_ELEMENT */ + struct pktcntr tce_cnts[3]; /* tcm returns 3 counters */ +}; + +struct cdnr_get_stats { + struct cdnr_interface iface; + struct pktcntr cnts[TCACODE_MAX+1]; + + /* element stats */ + int nskip; /* skip # of elements */ + int nelements; /* # of element stats (WR) */ + struct tce_stats *tce_stats; /* pointer to stats array */ +}; + +#define CDNR_IF_ATTACH _IOW('Q', 1, struct cdnr_interface) +#define CDNR_IF_DETACH _IOW('Q', 2, struct cdnr_interface) +#define CDNR_ENABLE _IOW('Q', 3, struct cdnr_interface) +#define CDNR_DISABLE _IOW('Q', 4, struct cdnr_interface) +#define CDNR_ADD_FILTER _IOWR('Q', 10, struct cdnr_add_filter) +#define CDNR_DEL_FILTER _IOW('Q', 11, struct cdnr_delete_filter) +#define CDNR_GETSTATS _IOWR('Q', 12, struct cdnr_get_stats) +#define CDNR_ADD_ELEM _IOWR('Q', 30, struct cdnr_add_element) +#define CDNR_DEL_ELEM _IOW('Q', 31, struct cdnr_delete_element) +#define CDNR_ADD_TBM _IOWR('Q', 32, struct cdnr_add_tbmeter) +#define CDNR_MOD_TBM _IOW('Q', 33, struct cdnr_modify_tbmeter) +#define CDNR_TBM_STATS _IOWR('Q', 34, struct cdnr_tbmeter_stats) +#define CDNR_ADD_TCM _IOWR('Q', 35, struct cdnr_add_trtcm) +#define CDNR_MOD_TCM _IOWR('Q', 36, struct cdnr_modify_trtcm) +#define CDNR_TCM_STATS _IOWR('Q', 37, struct cdnr_tcm_stats) +#define CDNR_ADD_TSW _IOWR('Q', 38, struct cdnr_add_tswtcm) +#define CDNR_MOD_TSW _IOWR('Q', 39, struct cdnr_modify_tswtcm) + +#ifndef DSCP_EF +/* diffserve code points */ +#define DSCP_MASK 0xfc +#define DSCP_CUMASK 0x03 +#define DSCP_EF 0xb8 +#define DSCP_AF11 0x28 +#define DSCP_AF12 0x30 +#define DSCP_AF13 0x38 +#define DSCP_AF21 0x48 +#define DSCP_AF22 0x50 +#define DSCP_AF23 0x58 +#define DSCP_AF31 0x68 +#define DSCP_AF32 0x70 +#define DSCP_AF33 0x78 +#define DSCP_AF41 0x88 +#define DSCP_AF42 0x90 +#define DSCP_AF43 0x98 +#define AF_CLASSMASK 0xe0 +#define AF_DROPPRECMASK 0x18 +#endif + +#ifdef _KERNEL + +/* + * packet information passed to the input function of tc elements + */ +struct cdnr_pktinfo { + int pkt_len; /* packet length */ + u_int8_t pkt_dscp; /* diffserv code point */ +}; + +/* + * traffic conditioner control block common to all types of tc elements + */ +struct cdnr_block { + LIST_ENTRY(cdnr_block) cb_next; + int cb_len; /* size of this tc element */ + int cb_type; /* cdnr block type */ + int cb_ref; /* reference count of this element */ + u_long cb_handle; /* handle of this tc element */ + struct top_cdnr *cb_top; /* back pointer to top */ + struct tc_action cb_action; /* top level action for this tcb */ + struct tc_action *(*cb_input)(struct cdnr_block *, + struct cdnr_pktinfo *); +}; + +/* + * top level traffic conditioner structure for an interface + */ +struct top_cdnr { + struct cdnr_block tc_block; + + LIST_ENTRY(top_cdnr) tc_next; + struct ifaltq *tc_ifq; + + LIST_HEAD(, cdnr_block) tc_elements; + struct 
acc_classifier tc_classifier; + + struct pktcntr tc_cnts[TCACODE_MAX+1]; +}; + +/* token bucket element */ +struct tbe { + u_int64_t rate; + u_int64_t depth; + + u_int64_t token; + u_int64_t filluptime; + u_int64_t last; +}; + +/* token bucket meter structure */ +struct tbmeter { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe tb; /* token bucket */ + struct tc_action in_action; /* actions for IN/OUT */ + struct tc_action out_action; /* actions for IN/OUT */ + struct pktcntr in_cnt; /* statistics for IN/OUT */ + struct pktcntr out_cnt; /* statistics for IN/OUT */ +}; + +/* two-rate three-color marker structure */ +struct trtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe cmtd_tb; /* committed tb profile */ + struct tbe peak_tb; /* peak tb profile */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + int coloraware; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker structure */ +struct tswtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + + u_int32_t avg_rate; /* average rate (bytes/sec) */ + u_int64_t t_front; /* timestamp of last update */ + + u_int64_t timewin; /* average interval */ + u_int32_t cmtd_rate; /* committed target rate */ + u_int32_t peak_rate; /* peak target rate */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_CDNR_H_ */ diff --git a/sys/altq/altq_classq.h b/sys/altq/altq_classq.h new file mode 100644 index 000000000000..a37d926c6cba --- /dev/null +++ b/sys/altq/altq_classq.h @@ -0,0 +1,203 @@ +/* $KAME: altq_classq.h,v 1.3 2000/07/25 10:12:29 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * class queue definitions extracted from rm_class.h.
+ */
+#ifndef _ALTQ_ALTQ_CLASSQ_H_
+#define _ALTQ_ALTQ_CLASSQ_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Packet Queue types: RED or DROPHEAD.
+ */
+#define Q_DROPHEAD	0x00
+#define Q_RED		0x01
+#define Q_RIO		0x02
+#define Q_DROPTAIL	0x03
+
+#ifdef _KERNEL
+
+/*
+ * Packet Queue structures and macros to manipulate them.
+ */
+struct _class_queue_ {
+	struct mbuf	*tail_;	/* Tail of packet queue */
+	int	qlen_;		/* Queue length (in number of packets) */
+	int	qlim_;		/* Queue limit (in number of packets) */
+	int	qtype_;		/* Queue type */
+};
+
+typedef struct _class_queue_	class_queue_t;
+
+#define qtype(q)	(q)->qtype_		/* Get queue type */
+#define qlimit(q)	(q)->qlim_		/* Max packets to be queued */
+#define qlen(q)		(q)->qlen_		/* Current queue length. */
+#define qtail(q)	(q)->tail_		/* Tail of the queue */
+#define qhead(q)	((q)->tail_ ? (q)->tail_->m_nextpkt : NULL)
+
+#define qempty(q)	((q)->qlen_ == 0)	/* Is the queue empty? */
+#define q_is_red(q)	((q)->qtype_ == Q_RED)	/* Is the queue a red queue */
+#define q_is_rio(q)	((q)->qtype_ == Q_RIO)	/* Is the queue a rio queue */
+#define q_is_red_or_rio(q)	((q)->qtype_ == Q_RED || (q)->qtype_ == Q_RIO)
+
+#if !defined(__GNUC__) || defined(ALTQ_DEBUG)
+
+extern void		_addq(class_queue_t *, struct mbuf *);
+extern struct mbuf	*_getq(class_queue_t *);
+extern struct mbuf	*_getq_tail(class_queue_t *);
+extern struct mbuf	*_getq_random(class_queue_t *);
+extern void		_removeq(class_queue_t *, struct mbuf *);
+extern void		_flushq(class_queue_t *);
+
+#else /* __GNUC__ && !ALTQ_DEBUG */
+/*
+ * inlined versions
+ */
+static __inline void
+_addq(class_queue_t *q, struct mbuf *m)
+{
+	struct mbuf *m0;
+
+	if ((m0 = qtail(q)) != NULL)
+		m->m_nextpkt = m0->m_nextpkt;
+	else
+		m0 = m;
+	m0->m_nextpkt = m;
+	qtail(q) = m;
+	qlen(q)++;
+}
+
+static __inline struct mbuf *
+_getq(class_queue_t *q)
+{
+	struct mbuf *m, *m0;
+
+	if ((m = qtail(q)) == NULL)
+		return (NULL);
+	if ((m0 = m->m_nextpkt) != m)
+		m->m_nextpkt = m0->m_nextpkt;
+	else
+		qtail(q) = NULL;
+	qlen(q)--;
+	return (m0);
+}
+
+/* drop a packet at the tail of the queue */
+static __inline struct mbuf *
+_getq_tail(class_queue_t *q)
+{
+	struct mbuf *m, *m0, *prev;
+
+	if ((m = m0 = qtail(q)) == NULL)
+		return NULL;
+	do {
+		prev = m0;
+		m0 = m0->m_nextpkt;
+	} while (m0 != m);
+	prev->m_nextpkt = m->m_nextpkt;
+	if (prev == m)
+		qtail(q) = NULL;
+	else
+		qtail(q) = prev;
+	qlen(q)--;
+	return (m);
+}
+
+/* randomly select a packet in the queue */
+static __inline struct mbuf *
+_getq_random(class_queue_t *q)
+{
+	struct mbuf *m;
+	int i, n;
+
+	if ((m = qtail(q)) == NULL)
+		return NULL;
+	if (m->m_nextpkt == m)
+		qtail(q) = NULL;
+	else {
+		struct mbuf *prev = NULL;
+
+		n = random() % qlen(q) + 1;
+		for (i = 0; i < n; i++) {
+			prev = m;
+			m = m->m_nextpkt;
+		}
+		prev->m_nextpkt = m->m_nextpkt;
+		if (m == qtail(q))
+			qtail(q) = prev;
+	}
+	qlen(q)--;
+	return (m);
+}
+
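+/*
+ * remove an arbitrary packet from the queue.
+ * note: the caller must make sure that m is actually on this queue;
+ * the scan below walks the circular list and will not terminate
+ * otherwise.
+ */
+static __inline void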
+_removeq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +static __inline void +_flushq(class_queue_t *q) +{ + struct mbuf *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); +} + +#endif /* __GNUC__ && !ALTQ_DEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_CLASSQ_H_ */ diff --git a/sys/altq/altq_conf.c b/sys/altq/altq_conf.c new file mode 100644 index 000000000000..88ecaf7a1304 --- /dev/null +++ b/sys/altq/altq_conf.c @@ -0,0 +1,467 @@ +/* $KAME: altq_conf.c,v 1.10 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef ALTQ +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +/* + * altq device interface. 
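+ * a single character device multiplexes all the disciplines: minor 0
+ * is reserved for generic requests (discipline type query and token
+ * bucket regulator get/set), and each discipline is reached through
+ * its own minor number as laid out in the altqsw table below.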
+ */ +#include +#include +#include +#include +#include +#include +#if defined(__FreeBSD__) && (__FreeBSD_version < 400000) && defined(DEVFS) +#include +#endif /*DEVFS*/ +#include + +#include +#include + +#ifdef ALTQ_CBQ +altqdev_decl(cbq); +#endif +#ifdef ALTQ_WFQ +altqdev_decl(wfq); +#endif +#ifdef ALTQ_AFMAP +altqdev_decl(afm); +#endif +#ifdef ALTQ_FIFOQ +altqdev_decl(fifoq); +#endif +#ifdef ALTQ_RED +altqdev_decl(red); +#endif +#ifdef ALTQ_RIO +altqdev_decl(rio); +#endif +#ifdef ALTQ_LOCALQ +altqdev_decl(localq); +#endif +#ifdef ALTQ_HFSC +altqdev_decl(hfsc); +#endif +#ifdef ALTQ_CDNR +altqdev_decl(cdnr); +#endif +#ifdef ALTQ_BLUE +altqdev_decl(blue); +#endif +#ifdef ALTQ_PRIQ +altqdev_decl(priq); +#endif + +/* + * altq minor device (discipline) table + */ +static struct altqsw altqsw[] = { /* minor */ + {"noq", noopen, noclose, noioctl}, /* 0 (reserved) */ +#ifdef ALTQ_CBQ + {"cbq", cbqopen, cbqclose, cbqioctl}, /* 1 */ +#else + {"noq", noopen, noclose, noioctl}, /* 1 */ +#endif +#ifdef ALTQ_WFQ + {"wfq", wfqopen, wfqclose, wfqioctl}, /* 2 */ +#else + {"noq", noopen, noclose, noioctl}, /* 2 */ +#endif +#ifdef ALTQ_AFMAP + {"afm", afmopen, afmclose, afmioctl}, /* 3 */ +#else + {"noq", noopen, noclose, noioctl}, /* 3 */ +#endif +#ifdef ALTQ_FIFOQ + {"fifoq", fifoqopen, fifoqclose, fifoqioctl}, /* 4 */ +#else + {"noq", noopen, noclose, noioctl}, /* 4 */ +#endif +#ifdef ALTQ_RED + {"red", redopen, redclose, redioctl}, /* 5 */ +#else + {"noq", noopen, noclose, noioctl}, /* 5 */ +#endif +#ifdef ALTQ_RIO + {"rio", rioopen, rioclose, rioioctl}, /* 6 */ +#else + {"noq", noopen, noclose, noioctl}, /* 6 */ +#endif +#ifdef ALTQ_LOCALQ + {"localq",localqopen, localqclose, localqioctl}, /* 7 (local use) */ +#else + {"noq", noopen, noclose, noioctl}, /* 7 (local use) */ +#endif +#ifdef ALTQ_HFSC + {"hfsc",hfscopen, hfscclose, hfscioctl}, /* 8 */ +#else + {"noq", noopen, noclose, noioctl}, /* 8 */ +#endif +#ifdef ALTQ_CDNR + {"cdnr",cdnropen, cdnrclose, cdnrioctl}, /* 9 */ +#else + {"noq", noopen, noclose, noioctl}, /* 9 */ +#endif +#ifdef ALTQ_BLUE + {"blue",blueopen, blueclose, blueioctl}, /* 10 */ +#else + {"noq", noopen, noclose, noioctl}, /* 10 */ +#endif +#ifdef ALTQ_PRIQ + {"priq",priqopen, priqclose, priqioctl}, /* 11 */ +#else + {"noq", noopen, noclose, noioctl}, /* 11 */ +#endif +}; + +/* + * altq major device support + */ +int naltqsw = sizeof (altqsw) / sizeof (altqsw[0]); + +#ifndef __OpenBSD__ +static d_open_t altqopen; +static d_close_t altqclose; +static d_ioctl_t altqioctl; +#endif +#ifdef __FreeBSD__ +static void altq_drvinit __P((void *)); +#else +void altqattach __P((int)); +#endif + +#if defined(__FreeBSD__) +#define CDEV_MAJOR 96 /* FreeBSD official number */ +#elif defined(__NetBSD__) +#if defined(__i386__) +#define CDEV_MAJOR 75 /* NetBSD i386 (not official) */ +#elif defined(__alpha__) +#define CDEV_MAJOR 62 /* NetBSD alpha (not official) */ +#else +#error arch not supported +#endif +#elif defined(__OpenBSD__) +#if defined(__i386__) +#define CDEV_MAJOR 67 /* OpenBSD i386 (not official) */ +#elif defined(__alpha__) +#define CDEV_MAJOR 52 /* OpenBSD alpha (not official) */ +#else +#error arch not supported +#endif +#endif + +#if defined(__FreeBSD__) +#if (__FreeBSD_version < 400000) +static struct cdevsw altq_cdevsw = + { altqopen, altqclose, noread, nowrite, + altqioctl, nostop, nullreset, nodevtotty, + seltrue, nommap, NULL, "altq", NULL, -1 }; +#else +static struct cdevsw altq_cdevsw = + { altqopen, altqclose, noread, nowrite, + altqioctl, seltrue, nommap, nostrategy, + 
"altq", CDEV_MAJOR, nodump, nopsize, 0, -1 }; +#endif +#elif defined(__NetBSD__) +static struct cdevsw altq_cdevsw = cdev__oci_init(1,altq); +#elif defined(__OpenBSD__) +static struct cdevsw altq_cdevsw = { + altqopen, altqclose, 0, 0, altqioctl, 0, + 0, 0, 0, 0 }; +#endif + +#if !defined(__OpenBSD__) +static +#endif +int +altqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + int unit = minor(dev); + + if (unit == 0) + return (0); + if (unit < naltqsw) + return (*altqsw[unit].d_open)(dev, flag, fmt, p); + + return ENXIO; +} + +#if !defined(__OpenBSD__) +static +#endif +int +altqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + int unit = minor(dev); + + if (unit == 0) + return (0); + if (unit < naltqsw) + return (*altqsw[unit].d_close)(dev, flag, fmt, p); + + return ENXIO; +} + +#if !defined(__OpenBSD__) +static +#endif +int +altqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + int unit = minor(dev); + + if (unit == 0) { + struct ifnet *ifp; + struct altqreq *typereq; + struct tbrreq *tbrreq; + int error; + + switch (cmd) { + case ALTQGTYPE: + case ALTQTBRGET: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + case ALTQGTYPE: + typereq = (struct altqreq *)addr; + if ((ifp = ifunit(typereq->ifname)) == NULL) + return (EINVAL); + typereq->arg = (u_long)ifp->if_snd.altq_type; + return (0); + case ALTQTBRSET: + tbrreq = (struct tbrreq *)addr; + if ((ifp = ifunit(tbrreq->ifname)) == NULL) + return (EINVAL); + return tbr_set(&ifp->if_snd, &tbrreq->tb_prof); + case ALTQTBRGET: + tbrreq = (struct tbrreq *)addr; + if ((ifp = ifunit(tbrreq->ifname)) == NULL) + return (EINVAL); + return tbr_get(&ifp->if_snd, &tbrreq->tb_prof); + default: + return (EINVAL); + } + } + if (unit < naltqsw) + return (*altqsw[unit].d_ioctl)(dev, cmd, addr, flag, p); + + return ENXIO; +} + + +static int altq_devsw_installed = 0; + +#ifdef __FreeBSD__ +#if (__FreeBSD_version < 400000) +#ifdef DEVFS +static void *altq_devfs_token[sizeof (altqsw) / sizeof (altqsw[0])]; +#endif + +static void +altq_drvinit(unused) + void *unused; +{ + dev_t dev; +#ifdef DEVFS + int i; +#endif + + if (!altq_devsw_installed) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&altq_cdevsw,NULL); + altq_devsw_installed = 1; +#ifdef DEVFS + for (i=0; i= ALTQT_MAX || altq_modules[type] == NULL) + return; + + altq_modules[type]->ref++; +} + +void altq_module_declref(type) + int type; +{ + if (type < 0 || type >= ALTQT_MAX || altq_modules[type] == NULL) + return; + + altq_modules[type]->ref--; +} + +static int +altq_module_register(mdata) + struct altq_module_data *mdata; +{ + int type = mdata->type; + + if (type < 0 || type >= ALTQT_MAX) + return (EINVAL); + if (altqsw[type].d_open != noopen) + return (EBUSY); + altqsw[type] = *mdata->altqsw; /* set discipline functions */ + altq_modules[type] = mdata; /* save module data pointer */ + return (0); +} + +static int +altq_module_deregister(mdata) + struct altq_module_data *mdata; +{ + int type = mdata->type; + + if (type < 0 || type >= ALTQT_MAX) + return (EINVAL); + if (mdata != altq_modules[type]) + return (EINVAL); + if (altq_modules[type]->ref > 0) + return (EBUSY); + altqsw[type] = noqdisc; + altq_modules[type] = NULL; + return (0); +} + +int +altq_module_handler(mod, cmd, arg) + module_t mod; + int cmd; + void * 
arg; +{ + struct altq_module_data *data = (struct altq_module_data *)arg; + int error = 0; + + switch (cmd) { + case MOD_LOAD: + error = altq_module_register(data); + break; + + case MOD_UNLOAD: + error = altq_module_deregister(data); + break; + + default: + error = EINVAL; + break; + } + + return(error); +} + +#endif /* ALTQ_KLD */ + +#endif /* ALTQ */ diff --git a/sys/altq/altq_conf.h b/sys/altq/altq_conf.h new file mode 100644 index 000000000000..1ab0b2692dae --- /dev/null +++ b/sys/altq/altq_conf.h @@ -0,0 +1,109 @@ +/* $KAME: altq_conf.h,v 1.5 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1998-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _ALTQ_ALTQ_CONF_H_ +#define _ALTQ_ALTQ_CONF_H_ + +#ifdef _KERNEL + +#include +#include +#include + +#if (__FreeBSD_version > 300000) +#define ALTQ_KLD +#endif + +#ifdef ALTQ_KLD +#include +#endif + +#ifndef dev_decl +#ifdef __STDC__ +#define dev_decl(n,t) d_ ## t ## _t n ## t +#else +#define dev_decl(n,t) d_/**/t/**/_t n/**/t +#endif +#endif + +#if defined(__NetBSD__) || defined(__OpenBSD__) +typedef int d_open_t __P((dev_t dev, int oflags, int devtype, struct proc *p)); +typedef int d_close_t __P((dev_t dev, int fflag, int devtype, struct proc *p)); +typedef int d_ioctl_t __P((dev_t dev, u_long cmd, caddr_t data, + int fflag, struct proc *p)); + +#define noopen (dev_type_open((*))) enodev +#define noclose (dev_type_close((*))) enodev +#define noioctl (dev_type_ioctl((*))) enodev +#endif /* __NetBSD__ || __OpenBSD__ */ + +#if defined(__OpenBSD__) +int altqopen __P((dev_t dev, int oflags, int devtype, struct proc *p)); +int altqclose __P((dev_t dev, int fflag, int devtype, struct proc *p)); +int altqioctl __P((dev_t dev, u_long cmd, caddr_t data, int fflag, + struct proc *p)); +#endif + +/* + * altq queueing discipline switch structure + */ +struct altqsw { + char *d_name; + d_open_t *d_open; + d_close_t *d_close; + d_ioctl_t *d_ioctl; +}; + +#define altqdev_decl(n) \ + dev_decl(n,open); dev_decl(n,close); dev_decl(n,ioctl) + +#ifdef ALTQ_KLD + +struct altq_module_data { + int type; /* discipline type */ + int ref; /* reference count */ + struct altqsw *altqsw; /* discipline functions */ +}; + +#define ALTQ_MODULE(name, type, devsw) \ +static struct altq_module_data name##_moddata = { type, 0, devsw }; \ + \ +moduledata_t name##_mod = { \ + #name, \ + altq_module_handler, \ + &name##_moddata \ +}; \ +DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+96) + +void altq_module_incref __P((int)); +void altq_module_declref __P((int)); +int altq_module_handler __P((module_t, int, void *)); + +#endif /* ALTQ_KLD */ + +#endif /* _KERNEL */ +#endif /* _ALTQ_ALTQ_CONF_H_ */ diff --git a/sys/altq/altq_fifoq.c b/sys/altq/altq_fifoq.c new file mode 100644 index 000000000000..22d254397eb9 --- /dev/null +++ b/sys/altq/altq_fifoq.c @@ -0,0 +1,414 @@ +/* $KAME: altq_fifoq.c,v 1.7 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_FIFOQ /* fifoq is enabled by ALTQ_FIFOQ option in opt_altq.h */ + +/* + * FIFOQ is an altq sample implementation. There will be little + * need to use FIFOQ as an alternative queueing scheme. + * But this code is provided as a template for those who want to + * write their own queueing schemes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define FIFOQ_STATS /* collect statistics */ + +/* fifoq_list keeps all fifoq_state_t's allocated. */ +static fifoq_state_t *fifoq_list = NULL; + +/* internal function prototypes */ +static int fifoq_enqueue __P((struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +static struct mbuf *fifoq_dequeue __P((struct ifaltq *, int)); +static int fifoq_detach __P((fifoq_state_t *)); +static int fifoq_request __P((struct ifaltq *, int, void *)); +static void fifoq_purge __P((fifoq_state_t *)); + +/* + * fifoq device interface + */ +altqdev_decl(fifoq); + +int +fifoqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +/* + * there are 2 ways to act on close. + * detach-all-on-close: + * use for the daemon style approach. if the daemon dies, all the + * resource will be released. + * no-action-on-close: + * use for the command style approach. (e.g. fifoq on/off) + * + * note: close is called not on every close but when the last reference + * is removed (only once with multiple simultaneous references.) 
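+ *
+ * fifoq takes the detach-all-on-close approach: fifoqclose below
+ * walks fifoq_list and destroys every state that is still attached.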
+ */ +int +fifoqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + fifoq_state_t *q; + int err, error = 0; + + while ((q = fifoq_list) != NULL) { + /* destroy all */ + err = fifoq_detach(q); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +fifoqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + fifoq_state_t *q; + struct fifoq_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case FIFOQ_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + case FIFOQ_ENABLE: + ifacep = (struct fifoq_interface *)addr; + if ((q = altq_lookup(ifacep->fifoq_ifname, ALTQT_FIFOQ)) + == NULL) { + error = EBADF; + break; + } + error = altq_enable(q->q_ifq); + break; + + case FIFOQ_DISABLE: + ifacep = (struct fifoq_interface *)addr; + if ((q = altq_lookup(ifacep->fifoq_ifname, ALTQT_FIFOQ)) + == NULL) { + error = EBADF; + break; + } + error = altq_disable(q->q_ifq); + break; + + case FIFOQ_IF_ATTACH: + ifp = ifunit(((struct fifoq_interface *)addr)->fifoq_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize fifoq_state_t */ + MALLOC(q, fifoq_state_t *, sizeof(fifoq_state_t), + M_DEVBUF, M_WAITOK); + if (q == NULL) { + error = ENOMEM; + break; + } + bzero(q, sizeof(fifoq_state_t)); + + q->q_ifq = &ifp->if_snd; + q->q_head = q->q_tail = NULL; + q->q_len = 0; + q->q_limit = FIFOQ_LIMIT; + + /* + * set FIFOQ to this ifnet structure. + */ + error = altq_attach(q->q_ifq, ALTQT_FIFOQ, q, + fifoq_enqueue, fifoq_dequeue, fifoq_request, + NULL, NULL); + if (error) { + FREE(q, M_DEVBUF); + break; + } + + /* add this state to the fifoq list */ + q->q_next = fifoq_list; + fifoq_list = q; + break; + + case FIFOQ_IF_DETACH: + ifacep = (struct fifoq_interface *)addr; + if ((q = altq_lookup(ifacep->fifoq_ifname, ALTQT_FIFOQ)) + == NULL) { + error = EBADF; + break; + } + error = fifoq_detach(q); + break; + + case FIFOQ_GETSTATS: + do { + struct fifoq_getstats *q_stats; + + q_stats = (struct fifoq_getstats *)addr; + if ((q = altq_lookup(q_stats->iface.fifoq_ifname, + ALTQT_FIFOQ)) == NULL) { + error = EBADF; + break; + } + + q_stats->q_len = q->q_len; + q_stats->q_limit = q->q_limit; + q_stats->xmit_cnt = q->q_stats.xmit_cnt; + q_stats->drop_cnt = q->q_stats.drop_cnt; + q_stats->period = q->q_stats.period; + } while (0); + break; + + case FIFOQ_CONFIG: + do { + struct fifoq_conf *fc; + int limit; + + fc = (struct fifoq_conf *)addr; + if ((q = altq_lookup(fc->iface.fifoq_ifname, + ALTQT_FIFOQ)) == NULL) { + error = EBADF; + break; + } + limit = fc->fifoq_limit; + if (limit < 0) + limit = 0; + q->q_limit = limit; + fc->fifoq_limit = limit; + } while (0); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +/* + * fifoq support routines + */ + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. 
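+ *
+ * note that when ENOBUFS is returned the mbuf has already been
+ * freed here; the caller must not touch it afterwards.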
+ */
+static int
+fifoq_enqueue(ifq, m, pktattr)
+	struct ifaltq *ifq;
+	struct mbuf *m;
+	struct altq_pktattr *pktattr;
+{
+	fifoq_state_t *q = (fifoq_state_t *)ifq->altq_disc;
+
+	/* if the queue is full, drop the incoming packet (drop-tail) */
+	if (q->q_len >= q->q_limit) {
+#ifdef FIFOQ_STATS
+		PKTCNTR_ADD(&q->q_stats.drop_cnt, m_pktlen(m));
+#endif
+		m_freem(m);
+		return (ENOBUFS);
+	}
+
+	/* enqueue the packet at the tail of the queue */
+	m->m_nextpkt = NULL;
+	if (q->q_tail == NULL)
+		q->q_head = m;
+	else
+		q->q_tail->m_nextpkt = m;
+	q->q_tail = m;
+	q->q_len++;
+	ifq->ifq_len++;
+	return 0;
+}
+
+/*
+ * dequeue routine:
+ * must be called in splimp.
+ *
+ * returns: mbuf dequeued.
+ * NULL when no packet is available in the queue.
+ */
+/*
+ * ALTDQ_POLL is provided for drivers which need to know the next packet
+ * to send in advance.
+ * when ALTDQ_POLL is specified, the next packet to be dequeued is
+ * returned without dequeueing the packet.
+ * when ALTDQ_REMOVE is called *immediately after* an ALTDQ_POLL
+ * operation, the same packet should be returned.
+ */
+static struct mbuf *
+fifoq_dequeue(ifq, op)
+	struct ifaltq *ifq;
+	int op;
+{
+	fifoq_state_t *q = (fifoq_state_t *)ifq->altq_disc;
+	struct mbuf *m = NULL;
+
+	if (op == ALTDQ_POLL)
+		return (q->q_head);
+
+	if ((m = q->q_head) == NULL)
+		return (NULL);
+
+	if ((q->q_head = m->m_nextpkt) == NULL)
+		q->q_tail = NULL;
+	m->m_nextpkt = NULL;
+	q->q_len--;
+	ifq->ifq_len--;
+#ifdef FIFOQ_STATS
+	PKTCNTR_ADD(&q->q_stats.xmit_cnt, m_pktlen(m));
+	if (q->q_len == 0)
+		q->q_stats.period++;
+#endif
+	return (m);
+}
+
+static int
+fifoq_request(ifq, req, arg)
+	struct ifaltq *ifq;
+	int req;
+	void *arg;
+{
+	fifoq_state_t *q = (fifoq_state_t *)ifq->altq_disc;
+
+	switch (req) {
+	case ALTRQ_PURGE:
+		fifoq_purge(q);
+		break;
+	}
+	return (0);
+}
+
+
+static int
+fifoq_detach(q)
+	fifoq_state_t *q;
+{
+	fifoq_state_t *tmp;
+	int error = 0;
+
+	if (ALTQ_IS_ENABLED(q->q_ifq))
+		altq_disable(q->q_ifq);
+
+	fifoq_purge(q);
+
+	if ((error = altq_detach(q->q_ifq)))
+		return (error);
+
+	if (fifoq_list == q)
+		fifoq_list = q->q_next;
+	else {
+		for (tmp = fifoq_list; tmp != NULL; tmp = tmp->q_next)
+			if (tmp->q_next == q) {
+				tmp->q_next = q->q_next;
+				break;
+			}
+		if (tmp == NULL)
+			printf("fifoq_detach: no state in fifoq_list!\n");
+	}
+
+	FREE(q, M_DEVBUF);
+	return (error);
+}
+
+/*
+ * fifoq_purge
+ * should be called in splimp or after disabling the fifoq.
+ */
+static void
+fifoq_purge(q)
+	fifoq_state_t *q;
+{
+	struct mbuf *m;
+
+	while ((m = q->q_head) != NULL) {
+		q->q_head = m->m_nextpkt;
+		m_freem(m);
+	}
+	q->q_tail = NULL;
+	q->q_len = 0;
+	if (ALTQ_IS_ENABLED(q->q_ifq))
+		q->q_ifq->ifq_len = 0;
+}
+
+#ifdef KLD_MODULE
+
+static struct altqsw fifoq_sw =
+	{"fifoq", fifoqopen, fifoqclose, fifoqioctl};
+
+ALTQ_MODULE(altq_fifoq, ALTQT_FIFOQ, &fifoq_sw);
+
+#endif /* KLD_MODULE */
+
+#endif /* ALTQ_FIFOQ */
diff --git a/sys/altq/altq_fifoq.h b/sys/altq/altq_fifoq.h
new file mode 100644
index 000000000000..8f85f9858efb
--- /dev/null
+++ b/sys/altq/altq_fifoq.h
@@ -0,0 +1,79 @@
+/* $KAME: altq_fifoq.h,v 1.6 2000/12/14 08:12:45 thorpej Exp $ */
+
+/*
+ * Copyright (C) 1997-2000
+ * Sony Computer Science Laboratories Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ALTQ_ALTQ_FIFOQ_H_
+#define _ALTQ_ALTQ_FIFOQ_H_
+
+typedef struct fifoq_state {
+	struct fifoq_state *q_next;	/* next fifoq_state in the list */
+	struct ifaltq *q_ifq;		/* backpointer to ifaltq */
+
+	struct mbuf *q_head;		/* head of queue */
+	struct mbuf *q_tail;		/* tail of queue */
+	int	q_len;			/* queue length */
+	int	q_limit;		/* max queue length */
+
+	/* statistics */
+	struct {
+		struct pktcntr	xmit_cnt;
+		struct pktcntr	drop_cnt;
+		u_int		period;
+	} q_stats;
+} fifoq_state_t;
+
+struct fifoq_interface {
+	char	fifoq_ifname[IFNAMSIZ];
+};
+
+struct fifoq_getstats {
+	struct fifoq_interface	iface;
+	int		q_len;
+	int		q_limit;
+	struct pktcntr	xmit_cnt;
+	struct pktcntr	drop_cnt;
+	u_int		period;
+};
+
+struct fifoq_conf {
+	struct fifoq_interface	iface;
+	int	fifoq_limit;
+};
+
+#define FIFOQ_LIMIT	50	/* default max queue length */
+
+/*
+ * IOCTLs for FIFOQ
+ */
+#define FIFOQ_IF_ATTACH		_IOW('Q', 1, struct fifoq_interface)
+#define FIFOQ_IF_DETACH		_IOW('Q', 2, struct fifoq_interface)
+#define FIFOQ_ENABLE		_IOW('Q', 3, struct fifoq_interface)
+#define FIFOQ_DISABLE		_IOW('Q', 4, struct fifoq_interface)
+#define FIFOQ_CONFIG		_IOWR('Q', 6, struct fifoq_conf)
+#define FIFOQ_GETSTATS		_IOWR('Q', 12, struct fifoq_getstats)
+
+#endif /* _ALTQ_ALTQ_FIFOQ_H_ */
diff --git a/sys/altq/altq_flowvalve.h b/sys/altq/altq_flowvalve.h
new file mode 100644
index 000000000000..645fa784c4d7
--- /dev/null
+++ b/sys/altq/altq_flowvalve.h
@@ -0,0 +1,92 @@
+/* $KAME: altq_flowvalve.h,v 1.4 2000/12/14 08:12:46 thorpej Exp $ */
+
+/*
+ * Copyright (C) 1998-2000
+ * Sony Computer Science Laboratories Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_FLOWVALVE_H_ +#define _ALTQ_ALTQ_FLOWVALVE_H_ + +#ifdef _KERNEL + +/* fv_flow structure to define a unique address pair */ +struct fv_flow { + int flow_af; /* address family */ + union { + struct { + struct in_addr ip_src; + struct in_addr ip_dst; + } _ip; +#ifdef INET6 + struct { + struct in6_addr ip6_src; + struct in6_addr ip6_dst; + } _ip6; +#endif + } flow_un; +}; + +#define flow_ip flow_un._ip +#define flow_ip6 flow_un._ip6 + +/* flowvalve entry */ +struct fve { + TAILQ_ENTRY(fve) fve_lru; /* for LRU list */ + + enum fv_state { Green, Red } fve_state; + + int fve_p; /* scaled average drop rate */ + int fve_f; /* scaled average fraction */ + int fve_count; /* counter to update f */ + u_int fve_ifseq; /* ifseq at the last update of f */ + struct timeval fve_lastdrop; /* timestamp of the last drop */ + + struct fv_flow fve_flow; /* unique address pair */ +}; + +/* flowvalve structure */ +struct flowvalve { + u_int fv_ifseq; /* packet sequence number */ + int fv_flows; /* number of valid flows in the flowlist */ + int fv_pthresh; /* drop rate threshold */ + + TAILQ_HEAD(fv_flowhead, fve) fv_flowlist; /* LRU list */ + + struct fve *fv_fves; /* pointer to the allocated fves */ + + int *fv_p2ftab; /* drop rate to fraction table */ + + struct { + u_int pass; /* # of packets that have the fve + but aren't predropped */ + u_int predrop; /* # of packets predropped */ + u_int alloc; /* # of fves assigned */ + u_int escape; /* # of fves escaped */ + } fv_stats; +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_FLOWVALVE_H_ */ diff --git a/sys/altq/altq_hfsc.c b/sys/altq/altq_hfsc.c new file mode 100644 index 000000000000..18ff6a997fbf --- /dev/null +++ b/sys/altq/altq_hfsc.c @@ -0,0 +1,1810 @@ +/* $KAME: altq_hfsc.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof, and that + * both notices appear in supporting documentation, and that credit + * is given to Carnegie Mellon University in all publications reporting + * on direct or indirect use of this code or its derivatives. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#ifdef ALTQ_HFSC /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* + * function prototypes + */ +static struct hfsc_if *hfsc_attach __P((struct ifaltq *, u_int)); +static int hfsc_detach __P((struct hfsc_if *)); +static int hfsc_clear_interface __P((struct hfsc_if *)); +static int hfsc_request __P((struct ifaltq *, int, void *)); +static void hfsc_purge __P((struct hfsc_if *)); +static struct hfsc_class *hfsc_class_create __P((struct hfsc_if *, + struct service_curve *, struct hfsc_class *, int, int)); +static int hfsc_class_destroy __P((struct hfsc_class *)); +static int hfsc_class_modify __P((struct hfsc_class *, + struct service_curve *, struct service_curve *)); +static struct hfsc_class *hfsc_nextclass __P((struct hfsc_class *)); + +static int hfsc_enqueue __P((struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +static struct mbuf *hfsc_dequeue __P((struct ifaltq *, int)); + +static int hfsc_addq __P((struct hfsc_class *, struct mbuf *)); +static struct mbuf *hfsc_getq __P((struct hfsc_class *)); +static struct mbuf *hfsc_pollq __P((struct hfsc_class *)); +static void hfsc_purgeq __P((struct hfsc_class *)); + +static void set_active __P((struct hfsc_class *, int)); +static void set_passive __P((struct hfsc_class *)); + +static void init_ed __P((struct hfsc_class *, int)); +static void update_ed __P((struct hfsc_class *, int)); +static void update_d __P((struct hfsc_class *, int)); +static void init_v __P((struct hfsc_class *, int)); +static void update_v __P((struct hfsc_class *, int)); +static ellist_t *ellist_alloc __P((void)); +static void ellist_destroy __P((ellist_t *)); +static void ellist_insert __P((struct hfsc_class *)); +static void ellist_remove __P((struct hfsc_class *)); +static void ellist_update __P((struct hfsc_class *)); +struct hfsc_class *ellist_get_mindl __P((ellist_t *)); +static actlist_t *actlist_alloc __P((void)); +static void actlist_destroy __P((actlist_t *)); +static void actlist_insert __P((struct hfsc_class *)); +static void actlist_remove __P((struct hfsc_class *)); +static void actlist_update __P((struct hfsc_class *)); + +static __inline u_int64_t seg_x2y __P((u_int64_t, 
u_int64_t)); +static __inline u_int64_t seg_y2x __P((u_int64_t, u_int64_t)); +static __inline u_int64_t m2sm __P((u_int)); +static __inline u_int64_t m2ism __P((u_int)); +static __inline u_int64_t d2dx __P((u_int)); +static u_int sm2m __P((u_int64_t)); +static u_int dx2d __P((u_int64_t)); + +static void sc2isc __P((struct service_curve *, struct internal_sc *)); +static void rtsc_init __P((struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t)); +static u_int64_t rtsc_y2x __P((struct runtime_sc *, u_int64_t)); +static u_int64_t rtsc_x2y __P((struct runtime_sc *, u_int64_t)); +static void rtsc_min __P((struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t)); + +int hfscopen __P((dev_t, int, int, struct proc *)); +int hfscclose __P((dev_t, int, int, struct proc *)); +int hfscioctl __P((dev_t, ioctlcmd_t, caddr_t, int, struct proc *)); +static int hfsccmd_if_attach __P((struct hfsc_attach *)); +static int hfsccmd_if_detach __P((struct hfsc_interface *)); +static int hfsccmd_add_class __P((struct hfsc_add_class *)); +static int hfsccmd_delete_class __P((struct hfsc_delete_class *)); +static int hfsccmd_modify_class __P((struct hfsc_modify_class *)); +static int hfsccmd_add_filter __P((struct hfsc_add_filter *)); +static int hfsccmd_delete_filter __P((struct hfsc_delete_filter *)); +static int hfsccmd_class_stats __P((struct hfsc_class_stats *)); +static void get_class_stats __P((struct class_stats *, struct hfsc_class *)); +static struct hfsc_class *clh_to_clp __P((struct hfsc_if *, u_long)); +static u_long clp_to_clh __P((struct hfsc_class *)); + +/* + * macros + */ +#define is_a_parent_class(cl) ((cl)->cl_children != NULL) + +/* hif_list keeps all hfsc_if's allocated. */ +static struct hfsc_if *hif_list = NULL; + +static struct hfsc_if * +hfsc_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct hfsc_if *hif; + struct service_curve root_sc; + + MALLOC(hif, struct hfsc_if *, sizeof(struct hfsc_if), + M_DEVBUF, M_WAITOK); + if (hif == NULL) + return (NULL); + bzero(hif, sizeof(struct hfsc_if)); + + hif->hif_eligible = ellist_alloc(); + if (hif->hif_eligible == NULL) { + FREE(hif, M_DEVBUF); + return NULL; + } + + hif->hif_ifq = ifq; + + /* + * create root class + */ + root_sc.m1 = bandwidth; + root_sc.d = 0; + root_sc.m2 = bandwidth; + if ((hif->hif_rootclass = + hfsc_class_create(hif, &root_sc, NULL, 0, 0)) == NULL) { + FREE(hif, M_DEVBUF); + return (NULL); + } + + /* add this state to the hfsc list */ + hif->hif_next = hif_list; + hif_list = hif; + + return (hif); +} + +static int +hfsc_detach(hif) + struct hfsc_if *hif; +{ + (void)hfsc_clear_interface(hif); + (void)hfsc_class_destroy(hif->hif_rootclass); + + /* remove this interface from the hif list */ + if (hif_list == hif) + hif_list = hif->hif_next; + else { + struct hfsc_if *h; + + for (h = hif_list; h != NULL; h = h->hif_next) + if (h->hif_next == hif) { + h->hif_next = hif->hif_next; + break; + } + ASSERT(h != NULL); + } + + ellist_destroy(hif->hif_eligible); + + FREE(hif, M_DEVBUF); + + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes except the root class. 
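+ * classes must be destroyed leaves-first (hfsc_class_destroy returns
+ * EBUSY for a class that still has children), so each pass removes
+ * the first leaf found and then rescans from the top of the tree.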
+ */ +static int +hfsc_clear_interface(hif) + struct hfsc_if *hif; +{ + struct hfsc_class *cl; + + /* free the filters for this interface */ + acc_discard_filters(&hif->hif_classifier, NULL, 1); + + /* clear out the classes */ + while ((cl = hif->hif_rootclass->cl_children) != NULL) { + /* + * remove the first leaf class found in the hierarchy + * then start over + */ + for (; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!is_a_parent_class(cl)) { + (void)hfsc_class_destroy(cl); + break; + } + } + } + + return (0); +} + +static int +hfsc_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + hfsc_purge(hif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +hfsc_purge(hif) + struct hfsc_if *hif; +{ + struct hfsc_class *cl; + + for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + hif->hif_ifq->ifq_len = 0; +} + +struct hfsc_class * +hfsc_class_create(hif, sc, parent, qlimit, flags) + struct hfsc_if *hif; + struct service_curve *sc; + struct hfsc_class *parent; + int qlimit, flags; +{ + struct hfsc_class *cl, *p; + int s; + +#ifndef ALTQ_RED + if (flags & HFCF_RED) { + printf("hfsc_class_create: RED not configured for HFSC!\n"); + return (NULL); + } +#endif + + MALLOC(cl, struct hfsc_class *, sizeof(struct hfsc_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct hfsc_class)); + + MALLOC(cl->cl_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->cl_q == NULL) + goto err_ret; + bzero(cl->cl_q, sizeof(class_queue_t)); + + cl->cl_actc = actlist_alloc(); + if (cl->cl_actc == NULL) + goto err_ret; + + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; +#ifdef ALTQ_RED + if (flags & (HFCF_RED|HFCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & HFCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & HFCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (sc->m2 == 0) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (sc->m2 / 8); + if (flags & HFCF_RED) { + cl->cl_red = red_alloc(0, 0, 0, 0, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + if (sc != NULL && (sc->m1 != 0 || sc->m2 != 0)) { + MALLOC(cl->cl_rsc, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (cl->cl_rsc == NULL) + goto err_ret; + bzero(cl->cl_rsc, sizeof(struct internal_sc)); + sc2isc(sc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); + rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); + + MALLOC(cl->cl_fsc, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (cl->cl_fsc == NULL) + goto err_ret; + bzero(cl->cl_fsc, sizeof(struct internal_sc)); + sc2isc(sc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); + } + + cl->cl_id = hif->hif_classid++; + cl->cl_handle = (u_long)cl; /* XXX: just a pointer to this class */ + cl->cl_hif = hif; + cl->cl_parent = parent; + + s = splimp(); + 
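+	/*
+	 * interface-wide bookkeeping and the parent's children list
+	 * are updated at splimp so that the class tree is never seen
+	 * half-linked by the interrupt-driven enqueue/dequeue path.
+	 */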
hif->hif_classes++; + if (flags & HFCF_DEFAULTCLASS) + hif->hif_defaultclass = cl; + + /* add this class to the children list of the parent */ + if (parent == NULL) { + /* this is root class */ + } + else if ((p = parent->cl_children) == NULL) + parent->cl_children = cl; + else { + while (p->cl_siblings != NULL) + p = p->cl_siblings; + p->cl_siblings = cl; + } + splx(s); + + return (cl); + + err_ret: + if (cl->cl_actc != NULL) + actlist_destroy(cl->cl_actc); + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_fsc != NULL) + FREE(cl->cl_fsc, M_DEVBUF); + if (cl->cl_rsc != NULL) + FREE(cl->cl_rsc, M_DEVBUF); + if (cl->cl_q != NULL) + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + return (NULL); +} + +static int +hfsc_class_destroy(cl) + struct hfsc_class *cl; +{ + int s; + + if (is_a_parent_class(cl)) + return (EBUSY); + + s = splimp(); + + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_hif->hif_classifier, cl, 0); + + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + + if (cl->cl_parent == NULL) { + /* this is root class */ + } else { + struct hfsc_class *p = cl->cl_parent->cl_children; + + if (p == cl) + cl->cl_parent->cl_children = cl->cl_siblings; + else do { + if (p->cl_siblings == cl) { + p->cl_siblings = cl->cl_siblings; + break; + } + } while ((p = p->cl_siblings) != NULL); + ASSERT(p != NULL); + } + cl->cl_hif->hif_classes--; + splx(s); + + actlist_destroy(cl->cl_actc); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_fsc != NULL) + FREE(cl->cl_fsc, M_DEVBUF); + if (cl->cl_rsc != NULL) + FREE(cl->cl_rsc, M_DEVBUF); + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + + return (0); +} + +static int +hfsc_class_modify(cl, rsc, fsc) + struct hfsc_class *cl; + struct service_curve *rsc, *fsc; +{ + struct internal_sc *tmp; + int s; + + s = splimp(); + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + + if (rsc != NULL) { + if (rsc->m1 == 0 && rsc->m2 == 0) { + if (cl->cl_rsc != NULL) { + FREE(cl->cl_rsc, M_DEVBUF); + cl->cl_rsc = NULL; + } + } else { + if (cl->cl_rsc == NULL) { + MALLOC(tmp, struct internal_sc *, + sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (tmp == NULL) { + splx(s); + return (ENOMEM); + } + cl->cl_rsc = tmp; + } + bzero(cl->cl_rsc, sizeof(struct internal_sc)); + sc2isc(rsc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); + rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); + } + } + + if (fsc != NULL) { + if (fsc->m1 == 0 && fsc->m2 == 0) { + if (cl->cl_fsc != NULL) { + FREE(cl->cl_fsc, M_DEVBUF); + cl->cl_fsc = NULL; + } + } else { + if (cl->cl_fsc == NULL) { + MALLOC(tmp, struct internal_sc *, + sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (tmp == NULL) { + splx(s); + return (ENOMEM); + } + cl->cl_fsc = tmp; + } + bzero(cl->cl_fsc, sizeof(struct internal_sc)); + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); + } + } + splx(s); + + return (0); +} + +/* + * hfsc_nextclass returns the next class in the tree. 
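+ * the walk is pre-order: children are visited before siblings, and
+ * the traversal climbs back up when a subtree is exhausted, so every
+ * class in the hierarchy is visited exactly once.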
+ * usage: + * for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + * do_something; + */ +static struct hfsc_class * +hfsc_nextclass(cl) + struct hfsc_class *cl; +{ + if (cl->cl_children != NULL) + cl = cl->cl_children; + else if (cl->cl_siblings != NULL) + cl = cl->cl_siblings; + else { + while ((cl = cl->cl_parent) != NULL) + if (cl->cl_siblings) { + cl = cl->cl_siblings; + break; + } + } + + return (cl); +} + +/* + * hfsc_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +hfsc_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + int len; + + /* grab class set by classifier */ + if (pktattr == NULL || (cl = pktattr->pattr_class) == NULL) + cl = hif->hif_defaultclass; + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + + len = m_pktlen(m); + if (hfsc_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in hfsc_addq. */ + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + cl->cl_hif->hif_packets++; + + /* successfully queued. */ + if (qlen(cl->cl_q) == 1) + set_active(cl, m_pktlen(m)); + +#ifdef HFSC_PKTLOG + /* put the logging_hook here */ +#endif + return (0); +} + +/* + * hfsc_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +hfsc_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + struct mbuf *m; + int len, next_len; + int realtime = 0; + + if (hif->hif_packets == 0) + /* no packet in the tree */ + return (NULL); + + if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) { + u_int64_t cur_time; + + cl = hif->hif_pollcache; + hif->hif_pollcache = NULL; + /* check if the class was scheduled by real-time criteria */ + if (cl->cl_rsc != NULL) { + cur_time = read_machclk(); + realtime = (cl->cl_e <= cur_time); + } + } else { + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. 
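+			 * (a class is eligible once its eligible time
+			 * has passed; serving the minimum deadline among
+			 * the eligible classes is the eligible-deadline
+			 * rule of H-FSC's real-time criterion.)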
+ */ + if ((cl = ellist_get_mindl(hif->hif_eligible)) != NULL) { + realtime = 1; + } else { + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = hif->hif_rootclass; + while (is_a_parent_class(cl)) { + cl = actlist_first(cl->cl_actc); + if (cl == NULL) + return (NULL); + } + } + + if (op == ALTDQ_POLL) { + hif->hif_pollcache = cl; + m = hfsc_pollq(cl); + return (m); + } + } + + m = hfsc_getq(cl); + len = m_pktlen(m); + cl->cl_hif->hif_packets--; + IFQ_DEC_LEN(ifq); + PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len); + + update_v(cl, len); + if (realtime) + cl->cl_cumul += len; + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) { + /* update ed */ + next_len = m_pktlen(qhead(cl->cl_q)); + + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + +#ifdef HFSC_PKTLOG + /* put the logging_hook here */ +#endif + + return (m); +} + +static int +hfsc_addq(cl, m) + struct hfsc_class *cl; + struct mbuf *m; +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, + m, cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & HFCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +hfsc_getq(cl) + struct hfsc_class *cl; +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +hfsc_pollq(cl) + struct hfsc_class *cl; +{ + return qhead(cl->cl_q); +} + +static void +hfsc_purgeq(cl) + struct hfsc_class *cl; +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m)); + m_freem(m); + } + ASSERT(qlen(cl->cl_q) == 0); + + set_passive(cl); +} + +static void +set_active(cl, len) + struct hfsc_class *cl; + int len; +{ + if (cl->cl_rsc != NULL) + init_ed(cl, len); + if (cl->cl_fsc != NULL) + init_v(cl, len); + + cl->cl_stats.period++; +} + +static void +set_passive(cl) + struct hfsc_class *cl; +{ + if (cl->cl_rsc != NULL) + ellist_remove(cl); + + if (cl->cl_fsc != NULL) { + while (cl->cl_parent != NULL) { + if (--cl->cl_nactive == 0) { + /* remove this class from the vt list */ + actlist_remove(cl); + } else + /* still has active children */ + break; + + /* go up to the parent class */ + cl = cl->cl_parent; + } + } +} + +static void +init_ed(cl, next_len) + struct hfsc_class *cl; + int next_len; +{ + u_int64_t cur_time; + + cur_time = read_machclk(); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. 
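+	 * (zeroing dx and dy below collapses the two-segment curve
+	 * into the single segment of slope sm2 through the current
+	 * point, which is exactly the convex (sm1 <= sm2) case.)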
+ */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_insert(cl); +} + +static void +update_ed(cl, next_len) + struct hfsc_class *cl; + int next_len; +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_update(cl); +} + +static void +update_d(cl, next_len) + struct hfsc_class *cl; + int next_len; +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static void +init_v(cl, len) + struct hfsc_class *cl; + int len; +{ + struct hfsc_class *min_cl, *max_cl; + + while (cl->cl_parent != NULL) { + + if (cl->cl_nactive++ > 0) + /* already active */ + break; + + min_cl = actlist_first(cl->cl_parent->cl_actc); + if (min_cl != NULL) { + u_int64_t vt; + + /* + * set vt to the average of the min and max classes. + * if the parent's period didn't change, + * don't decrease vt of the class. + */ + max_cl = actlist_last(cl->cl_parent->cl_actc); + vt = (min_cl->cl_vt + max_cl->cl_vt) / 2; + if (cl->cl_parent->cl_vtperiod == cl->cl_parentperiod) + vt = max(cl->cl_vt, vt); + cl->cl_vt = vt; + } else { + /* no packet is backlogged. set vt to 0 */ + cl->cl_vt = 0; + } + + /* update the virtual curve */ + rtsc_min(&cl->cl_virtual, cl->cl_fsc, + cl->cl_vt, cl->cl_total); + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + + actlist_insert(cl); + + /* go up to the parent class */ + cl = cl->cl_parent; + } +} + +static void +update_v(cl, len) + struct hfsc_class *cl; + int len; +{ + while (cl->cl_parent != NULL) { + + cl->cl_total += len; + + if (cl->cl_fsc != NULL) { + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total); + + /* update the vt list */ + actlist_update(cl); + } + + /* go up to the parent class */ + cl = cl->cl_parent; + } +} + +/* + * TAILQ based ellist and actlist implementation + * (ion wanted to make a calendar queue based implementation) + */ +/* + * eligible list holds backlogged classes being sorted by their eligible times. + * there is one eligible list per interface. + */ + +static ellist_t * +ellist_alloc() +{ + ellist_t *head; + + MALLOC(head, ellist_t *, sizeof(ellist_t), M_DEVBUF, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +ellist_destroy(head) + ellist_t *head; +{ + FREE(head, M_DEVBUF); +} + +static void +ellist_insert(cl) + struct hfsc_class *cl; +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(hif->hif_eligible, _eligible)) == NULL || + p->cl_e <= cl->cl_e) { + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + TAILQ_FOREACH(p, hif->hif_eligible, cl_ellist) { + if (cl->cl_e < p->cl_e) { + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +ellist_remove(cl) + struct hfsc_class *cl; +{ + struct hfsc_if *hif = cl->cl_hif; + + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); +} + +static void +ellist_update(cl) + struct hfsc_class *cl; +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p, *last; + + /* + * the eligible time of a class increases monotonically. + * if the next entry has a larger eligible time, nothing to do. 
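+ * because each update moves the class only a short distance in the
+ * common case, scanning forward from its current position is cheaper
+ * than re-inserting from the head of the list.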
+ */ + p = TAILQ_NEXT(cl, cl_ellist); + if (p == NULL || cl->cl_e <= p->cl_e) + return; + + /* check the last entry */ + last = TAILQ_LAST(hif->hif_eligible, _eligible); + ASSERT(last != NULL); + if (last->cl_e <= cl->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { + if (cl->cl_e < p->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +/* find the class with the minimum deadline among the eligible classes */ +struct hfsc_class * +ellist_get_mindl(head) + ellist_t *head; +{ + struct hfsc_class *p, *cl = NULL; + u_int64_t cur_time; + + cur_time = read_machclk(); + + TAILQ_FOREACH(p, head, cl_ellist) { + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return (cl); +} + +/* + * active children list holds backlogged child classes being sorted + * by their virtual time. + * each intermediate class has one active children list. + */ +static actlist_t * +actlist_alloc() +{ + actlist_t *head; + + MALLOC(head, actlist_t *, sizeof(actlist_t), M_DEVBUF, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +actlist_destroy(head) + actlist_t *head; +{ + FREE(head, M_DEVBUF); +} +static void +actlist_insert(cl) + struct hfsc_class *cl; +{ + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(cl->cl_parent->cl_actc, _active)) == NULL + || p->cl_vt <= cl->cl_vt) { + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + TAILQ_FOREACH(p, cl->cl_parent->cl_actc, cl_actlist) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +actlist_remove(cl) + struct hfsc_class *cl; +{ + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); +} + +static void +actlist_update(cl) + struct hfsc_class *cl; +{ + struct hfsc_class *p, *last; + + /* + * the virtual time of a class increases monotonically during its + * backlogged period. + * if the next entry has a larger virtual time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_actlist); + if (p == NULL || cl->cl_vt <= p->cl_vt) + return; + + /* check the last entry */ + last = TAILQ_LAST(cl->cl_parent->cl_actc, _active); + ASSERT(last != NULL); + if (last->cl_vt <= cl->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bits/sec + * d: msec + * internal service curve parameters + * sm: (bytes/tsc_interval) << SM_SHIFT + * ism: (tsc_count/byte) << ISM_SHIFT + * dx: tsc_count + * + * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits. + * we should be able to handle 100K-1Gbps linkspeed with 200Hz-1GHz CPU + * speed. SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective + * digits in decimal using the following table. 
+ *
+ *  bits/sec    100Kbps     1Mbps      10Mbps     100Mbps    1Gbps
+ *  ----------+-------------------------------------------------------
+ *  bytes/nsec  12.5e-6    125e-6     1250e-6    12500e-6   125000e-6
+ *  sm(500MHz)  25.0e-6    250e-6     2500e-6    25000e-6   250000e-6
+ *  sm(200MHz)  62.5e-6    625e-6     6250e-6    62500e-6   625000e-6
+ *
+ *  nsec/byte   80000      8000       800        80         8
+ *  ism(500MHz) 40000      4000       400        40         4
+ *  ism(200MHz) 16000      1600       160        16         1.6
+ */
+#define	SM_SHIFT	24
+#define	ISM_SHIFT	10
+
+#define	SC_LARGEVAL	(1LL << 32)
+#define	SC_INFINITY	0xffffffffffffffffLL
+
+static __inline u_int64_t
+seg_x2y(x, sm)
+	u_int64_t x;
+	u_int64_t sm;
+{
+	u_int64_t y;
+
+	if (x < SC_LARGEVAL)
+		y = x * sm >> SM_SHIFT;
+	else
+		y = (x >> SM_SHIFT) * sm;
+	return (y);
+}
+
+static __inline u_int64_t
+seg_y2x(y, ism)
+	u_int64_t y;
+	u_int64_t ism;
+{
+	u_int64_t x;
+
+	if (y == 0)
+		x = 0;
+	else if (ism == SC_INFINITY)
+		x = SC_INFINITY;
+	else if (y < SC_LARGEVAL)
+		x = y * ism >> ISM_SHIFT;
+	else
+		x = (y >> ISM_SHIFT) * ism;
+	return (x);
+}
+
+static __inline u_int64_t
+m2sm(m)
+	u_int m;
+{
+	u_int64_t sm;
+
+	sm = ((u_int64_t)m << SM_SHIFT) / 8 / machclk_freq;
+	return (sm);
+}
+
+static __inline u_int64_t
+m2ism(m)
+	u_int m;
+{
+	u_int64_t ism;
+
+	if (m == 0)
+		ism = SC_INFINITY;
+	else
+		ism = ((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m;
+	return (ism);
+}
+
+static __inline u_int64_t
+d2dx(d)
+	u_int d;
+{
+	u_int64_t dx;
+
+	dx = ((u_int64_t)d * machclk_freq) / 1000;
+	return (dx);
+}
+
+static u_int
+sm2m(sm)
+	u_int64_t sm;
+{
+	u_int64_t m;
+
+	m = (sm * 8 * machclk_freq) >> SM_SHIFT;
+	return ((u_int)m);
+}
+
+static u_int
+dx2d(dx)
+	u_int64_t dx;
+{
+	u_int64_t d;
+
+	d = dx * 1000 / machclk_freq;
+	return ((u_int)d);
+}
+
+static void
+sc2isc(sc, isc)
+	struct service_curve *sc;
+	struct internal_sc *isc;
+{
+	isc->sm1 = m2sm(sc->m1);
+	isc->ism1 = m2ism(sc->m1);
+	isc->dx = d2dx(sc->d);
+	isc->dy = seg_x2y(isc->dx, isc->sm1);
+	isc->sm2 = m2sm(sc->m2);
+	isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
+ */
+static void
+rtsc_init(rtsc, isc, x, y)
+	struct runtime_sc *rtsc;
+	struct internal_sc *isc;
+	u_int64_t x, y;
+{
+	rtsc->x = x;
+	rtsc->y = y;
+	rtsc->sm1 = isc->sm1;
+	rtsc->ism1 = isc->ism1;
+	rtsc->dx = isc->dx;
+	rtsc->dy = isc->dy;
+	rtsc->sm2 = isc->sm2;
+	rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the y-projection of the runtime service curve by the
+ * given x-projection value
+ */
+static u_int64_t
+rtsc_y2x(rtsc, y)
+	struct runtime_sc *rtsc;
+	u_int64_t y;
+{
+	u_int64_t x;
+
+	if (y < rtsc->y)
+		x = rtsc->x;
+	else if (y <= rtsc->y + rtsc->dy) {
+		/* x belongs to the 1st segment */
+		if (rtsc->dy == 0)
+			x = rtsc->x + rtsc->dx;
+		else
+			x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+	} else {
+		/* x belongs to the 2nd segment */
+		x = rtsc->x + rtsc->dx
+		    + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+	}
+	return (x);
+}
+
+static u_int64_t
+rtsc_x2y(rtsc, x)
+	struct runtime_sc *rtsc;
+	u_int64_t x;
+{
+	u_int64_t y;
+
+	if (x <= rtsc->x)
+		y = rtsc->y;
+	else if (x <= rtsc->x + rtsc->dx)
+		/* y belongs to the 1st segment */
+		y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+	else
+		/* y belongs to the 2nd segment */
+		y = rtsc->y + rtsc->dy
+		    + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+	return (y);
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
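+ * three cases are handled below: for a convex curve (sm1 <= sm2) it is
+ * enough to compare the current curve's value at x against y; for a
+ * concave curve the two curves are compared at x and at (x + dx), and
+ * if they cross in between, the crossing offset dx is solved from
+ * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y).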
+ */ +static void +rtsc_min(rtsc, isc, x, y) + struct runtime_sc *rtsc; + struct internal_sc *isc; + u_int64_t x, y; +{ + u_int64_t y1, y2, dx, dy; + + if (isc->sm1 <= isc->sm2) { + /* service curve is convex */ + y1 = rtsc_x2y(rtsc, x); + if (y1 < y) + /* the current rtsc is smaller */ + return; + rtsc->x = x; + rtsc->y = y; + return; + } + + /* + * service curve is concave + * compute the two y values of the current rtsc + * y1: at x + * y2: at (x + dx) + */ + y1 = rtsc_x2y(rtsc, x); + if (y1 <= y) { + /* rtsc is below isc, no change to rtsc */ + return; + } + + y2 = rtsc_x2y(rtsc, x + isc->dx); + if (y2 >= y + isc->dy) { + /* rtsc is above isc, replace rtsc by isc */ + rtsc->x = x; + rtsc->y = y; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + return; + } + + /* + * the two curves intersect + * compute the offsets (dx, dy) using the reverse + * function of seg_x2y() + * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) + */ + dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2); + /* + * check if (x, y1) belongs to the 1st segment of rtsc. + * if so, add the offset. + */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; + return; +} + +/* + * hfsc device interface + */ +int +hfscopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("hfsc: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +hfscclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + struct hfsc_if *hif; + int err, error = 0; + + while ((hif = hif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + err = altq_detach(hif->hif_ifq); + if (err == 0) + err = hfsc_detach(hif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +hfscioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + struct hfsc_if *hif; + struct hfsc_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case HFSC_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case HFSC_IF_ATTACH: + error = hfsccmd_if_attach((struct hfsc_attach *)addr); + break; + + case HFSC_IF_DETACH: + error = hfsccmd_if_detach((struct hfsc_interface *)addr); + break; + + case HFSC_ENABLE: + case HFSC_DISABLE: + case HFSC_CLEAR_HIERARCHY: + ifacep = (struct hfsc_interface *)addr; + if ((hif = altq_lookup(ifacep->hfsc_ifname, + ALTQT_HFSC)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case HFSC_ENABLE: + if (hif->hif_defaultclass == NULL) { +#if 1 + printf("hfsc: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(hif->hif_ifq); + break; + + case HFSC_DISABLE: + error = altq_disable(hif->hif_ifq); + break; + + case HFSC_CLEAR_HIERARCHY: + hfsc_clear_interface(hif); + break; + } + break; + + case HFSC_ADD_CLASS: + error = hfsccmd_add_class((struct hfsc_add_class *)addr); + break; + + case HFSC_DEL_CLASS: + error = hfsccmd_delete_class((struct hfsc_delete_class *)addr); + break; + + case HFSC_MOD_CLASS: + error = hfsccmd_modify_class((struct hfsc_modify_class *)addr); 
+ break; + + case HFSC_ADD_FILTER: + error = hfsccmd_add_filter((struct hfsc_add_filter *)addr); + break; + + case HFSC_DEL_FILTER: + error = hfsccmd_delete_filter((struct hfsc_delete_filter *)addr); + break; + + case HFSC_GETSTATS: + error = hfsccmd_class_stats((struct hfsc_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +hfsccmd_if_attach(ap) + struct hfsc_attach *ap; +{ + struct hfsc_if *hif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->iface.hfsc_ifname)) == NULL) + return (ENXIO); + + if ((hif = hfsc_attach(&ifp->if_snd, ap->bandwidth)) == NULL) + return (ENOMEM); + + /* + * set HFSC to this ifnet structure. + */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_HFSC, hif, + hfsc_enqueue, hfsc_dequeue, hfsc_request, + &hif->hif_classifier, acc_classify)) != 0) + (void)hfsc_detach(hif); + + return (error); +} + +static int +hfsccmd_if_detach(ap) + struct hfsc_interface *ap; +{ + struct hfsc_if *hif; + int error; + + if ((hif = altq_lookup(ap->hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + if ((error = altq_detach(hif->hif_ifq))) + return (error); + + return hfsc_detach(hif); +} + +static int +hfsccmd_add_class(ap) + struct hfsc_add_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((parent = clh_to_clp(hif, ap->parent_handle)) == NULL) { + if (ap->parent_handle == HFSC_ROOTCLASS_HANDLE) + parent = hif->hif_rootclass; + else + return (EINVAL); + } + + if ((cl = hfsc_class_create(hif, &ap->service_curve, parent, + ap->qlimit, ap->flags)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = clp_to_clh(cl); + return (0); +} + +static int +hfsccmd_delete_class(ap) + struct hfsc_delete_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + return hfsc_class_destroy(cl); +} + +static int +hfsccmd_modify_class(ap) + struct hfsc_modify_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct service_curve *rsc = NULL; + struct service_curve *fsc = NULL; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (ap->sctype & HFSC_REALTIMESC) + rsc = &ap->service_curve; + if (ap->sctype & HFSC_LINKSHARINGSC) + fsc = &ap->service_curve; + + return hfsc_class_modify(cl, rsc, fsc); +} + +static int +hfsccmd_add_filter(ap) + struct hfsc_add_filter *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (is_a_parent_class(cl)) { +#if 1 + printf("hfsccmd_add_filter: not a leaf class!\n"); +#endif + return (EINVAL); + } + + return acc_add_filter(&hif->hif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +hfsccmd_delete_filter(ap) + struct hfsc_delete_filter *ap; +{ + struct hfsc_if *hif; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + return acc_delete_filter(&hif->hif_classifier, + ap->filter_handle); +} + +static int +hfsccmd_class_stats(ap) + struct 
hfsc_class_stats *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct class_stats stats, *usp; + int n, nclasses, error; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + ap->cur_time = read_machclk(); + ap->hif_classes = hif->hif_classes; + ap->hif_packets = hif->hif_packets; + + /* skip the first N classes in the tree */ + nclasses = ap->nskip; + for (cl = hif->hif_rootclass, n = 0; cl != NULL && n < nclasses; + cl = hfsc_nextclass(cl), n++) + ; + if (n != nclasses) + return (EINVAL); + + /* then, read the next N classes in the tree */ + nclasses = ap->nclasses; + usp = ap->stats; + for (n = 0; cl != NULL && n < nclasses; cl = hfsc_nextclass(cl), n++) { + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + ap->nclasses = n; + + return (0); +} + +static void get_class_stats(sp, cl) + struct class_stats *sp; + struct hfsc_class *cl; +{ + sp->class_id = cl->cl_id; + sp->class_handle = clp_to_clh(cl); + + if (cl->cl_rsc != NULL) { + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + sp->rsc.d = dx2d(cl->cl_rsc->dx); + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); + } else { + sp->rsc.m1 = 0; + sp->rsc.d = 0; + sp->rsc.m2 = 0; + } + if (cl->cl_fsc != NULL) { + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + sp->fsc.d = dx2d(cl->cl_fsc->dx); + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); + } else { + sp->fsc.m1 = 0; + sp->fsc.d = 0; + sp->fsc.m2 = 0; + } + + sp->total = cl->cl_total; + sp->cumul = cl->cl_cumul; + + sp->d = cl->cl_d; + sp->e = cl->cl_e; + sp->vt = cl->cl_vt; + + sp->qlength = qlen(cl->cl_q); + sp->xmit_cnt = cl->cl_stats.xmit_cnt; + sp->drop_cnt = cl->cl_stats.drop_cnt; + sp->period = cl->cl_stats.period; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct hfsc_class * +clh_to_clp(hif, chandle) + struct hfsc_if *hif; + u_long chandle; +{ + struct hfsc_class *cl; + + cl = (struct hfsc_class *)chandle; + if (chandle != ALIGN(cl)) { +#if 1 + printf("clh_to_cl: unaligned pointer %p\n", cl); +#endif + return (NULL); + } + + if (cl == NULL || cl->cl_handle != chandle || cl->cl_hif != hif) + return (NULL); + + return (cl); +} + +/* convert a class pointer to the corresponding class handle */ +static u_long +clp_to_clh(cl) + struct hfsc_class *cl; +{ + if (cl->cl_parent == NULL) + return (HFSC_ROOTCLASS_HANDLE); /* XXX */ + return (cl->cl_handle); +} + +#ifdef KLD_MODULE + +static struct altqsw hfsc_sw = + {"hfsc", hfscopen, hfscclose, hfscioctl}; + +ALTQ_MODULE(altq_hfsc, ALTQT_HFSC, &hfsc_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_HFSC */ diff --git a/sys/altq/altq_hfsc.h b/sys/altq/altq_hfsc.h new file mode 100644 index 000000000000..6de4a406daf0 --- /dev/null +++ b/sys/altq/altq_hfsc.h @@ -0,0 +1,278 @@ +/* $KAME: altq_hfsc.h,v 1.6 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. 
+ * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof, and that + * both notices appear in supporting documentation, and that credit + * is given to Carnegie Mellon University in all publications reporting + * on direct or indirect use of this code or its derivatives. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +#ifndef _ALTQ_ALTQ_HFSC_H_ +#define _ALTQ_ALTQ_HFSC_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct hfsc_interface { + char hfsc_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +struct hfsc_attach { + struct hfsc_interface iface; + u_int bandwidth; /* link bandwidth in bits/sec */ +}; + +struct service_curve { + u_int m1; /* slope of the first segment in bits/sec */ + u_int d; /* the x-projection of the first segment in msec */ + u_int m2; /* slope of the second segment in bits/sec */ +}; + +struct hfsc_add_class { + struct hfsc_interface iface; + u_long parent_handle; + struct service_curve service_curve; + int qlimit; + int flags; + + u_long class_handle; /* return value */ +}; + +/* special class handles */ +#define HFSC_ROOTCLASS_HANDLE 0 +#define HFSC_NULLCLASS_HANDLE 0 + +/* hfsc class flags */ +#define HFCF_RED 0x0001 /* use RED */ +#define HFCF_ECN 0x0002 /* use RED/ECN */ +#define HFCF_RIO 0x0004 /* use RIO */ +#define HFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define HFCF_DEFAULTCLASS 0x1000 /* default class */ + +struct hfsc_delete_class { + struct hfsc_interface iface; + u_long class_handle; +}; + +/* service curve types */ +#define HFSC_REALTIMESC 1 +#define HFSC_LINKSHARINGSC 2 +#define HFSC_DEFAULTSC (HFSC_REALTIMESC|HFSC_LINKSHARINGSC) + +struct hfsc_modify_class { + struct hfsc_interface iface; + u_long class_handle; + struct service_curve service_curve; + int sctype; +}; + +struct hfsc_add_filter { + struct hfsc_interface iface; + u_long class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct hfsc_delete_filter { + struct hfsc_interface iface; + u_long filter_handle; +}; + +struct class_stats { + u_int class_id; + u_long class_handle; + struct service_curve rsc; + struct service_curve 
fsc;
+
+	u_int64_t	total;		/* total work in bytes */
+	u_int64_t	cumul;		/* cumulative work in bytes
+					   done by real-time criteria */
+	u_int64_t	d;		/* deadline */
+	u_int64_t	e;		/* eligible time */
+	u_int64_t	vt;		/* virtual time */
+
+	u_int		qlength;
+	struct pktcntr	xmit_cnt;
+	struct pktcntr	drop_cnt;
+	u_int		period;
+
+	/* red and rio related info */
+	int		qtype;
+	struct redstats	red[3];
+};
+
+struct hfsc_class_stats {
+	struct hfsc_interface	iface;
+	int			nskip;		/* skip # of classes */
+	int			nclasses;	/* # of class stats (WR) */
+	u_int64_t		cur_time;	/* current time */
+	u_int			hif_classes;	/* # of classes in the tree */
+	u_int			hif_packets;	/* # of packets in the tree */
+	struct class_stats	*stats;		/* pointer to stats array */
+};
+
+#define	HFSC_IF_ATTACH		_IOW('Q', 1, struct hfsc_attach)
+#define	HFSC_IF_DETACH		_IOW('Q', 2, struct hfsc_interface)
+#define	HFSC_ENABLE		_IOW('Q', 3, struct hfsc_interface)
+#define	HFSC_DISABLE		_IOW('Q', 4, struct hfsc_interface)
+#define	HFSC_CLEAR_HIERARCHY	_IOW('Q', 5, struct hfsc_interface)
+#define	HFSC_ADD_CLASS		_IOWR('Q', 7, struct hfsc_add_class)
+#define	HFSC_DEL_CLASS		_IOW('Q', 8, struct hfsc_delete_class)
+#define	HFSC_MOD_CLASS		_IOW('Q', 9, struct hfsc_modify_class)
+#define	HFSC_ADD_FILTER		_IOWR('Q', 10, struct hfsc_add_filter)
+#define	HFSC_DEL_FILTER		_IOW('Q', 11, struct hfsc_delete_filter)
+#define	HFSC_GETSTATS		_IOWR('Q', 12, struct hfsc_class_stats)
+
+#ifdef _KERNEL
+/*
+ * kernel internal service curve representation
+ * coordinates are given by 64 bit unsigned integers.
+ * x-axis: unit is clock count.  for the intel x86 architecture,
+ *	the raw Pentium TSC (Timestamp Counter) value is used.
+ *	virtual time is also calculated in this time scale.
+ * y-axis: unit is byte.
+ *
+ * the service curve parameters are converted to the internal
+ * representation.
+ * the slope values are scaled to avoid overflow.
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ *
+ * note: Intel Pentium TSC never wraps around in several thousands of years.
+ *	x-axis doesn't wrap around for 1089 years with 1GHz clock.
+ *	y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth.
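+ *
+ * as a worked example of the slope scaling (numbers for illustration
+ * only): a 100Mbps rate on a 500MHz clock is 100e6 / 8 / 500e6 = 0.025
+ * bytes per clock count, stored as 0.025 * 2^SM_SHIFT (about 419430
+ * with SM_SHIFT of 24, as computed by m2sm() in altq_hfsc.c).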
+ */
+
+/* kernel internal representation of a service curve */
+struct internal_sc {
+	u_int64_t	sm1;	/* scaled slope of the 1st segment */
+	u_int64_t	ism1;	/* scaled inverse-slope of the 1st segment */
+	u_int64_t	dx;	/* the x-projection of the 1st segment */
+	u_int64_t	dy;	/* the y-projection of the 1st segment */
+	u_int64_t	sm2;	/* scaled slope of the 2nd segment */
+	u_int64_t	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+	u_int64_t	x;	/* current starting position on x-axis */
+	u_int64_t	y;	/* current starting position on y-axis */
+	u_int64_t	sm1;	/* scaled slope of the 1st segment */
+	u_int64_t	ism1;	/* scaled inverse-slope of the 1st segment */
+	u_int64_t	dx;	/* the x-projection of the 1st segment */
+	u_int64_t	dy;	/* the y-projection of the 1st segment */
+	u_int64_t	sm2;	/* scaled slope of the 2nd segment */
+	u_int64_t	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* for TAILQ based ellist and actlist implementation */
+struct hfsc_class;
+typedef TAILQ_HEAD(_eligible, hfsc_class) ellist_t;
+typedef TAILQ_ENTRY(hfsc_class) elentry_t;
+typedef TAILQ_HEAD(_active, hfsc_class) actlist_t;
+typedef TAILQ_ENTRY(hfsc_class) actentry_t;
+#define	ellist_first(s)		TAILQ_FIRST(s)
+#define	actlist_first(s)	TAILQ_FIRST(s)
+#define	actlist_last(s)		TAILQ_LAST(s, _active)
+
+struct hfsc_class {
+	u_int		cl_id;		/* class id (just for debug) */
+	u_long		cl_handle;	/* class handle */
+	struct hfsc_if	*cl_hif;	/* back pointer to struct hfsc_if */
+	int		cl_flags;	/* misc flags */
+
+	struct hfsc_class *cl_parent;	/* parent class */
+	struct hfsc_class *cl_siblings;	/* sibling classes */
+	struct hfsc_class *cl_children;	/* child classes */
+
+	class_queue_t	*cl_q;		/* class queue structure */
+	struct red	*cl_red;	/* RED state */
+	struct altq_pktattr *cl_pktattr; /* saved header used by ECN */
+
+	u_int64_t	cl_total;	/* total work in bytes */
+	u_int64_t	cl_cumul;	/* cumulative work in bytes
+					   done by real-time criteria */
+	u_int64_t	cl_d;		/* deadline */
+	u_int64_t	cl_e;		/* eligible time */
+	u_int64_t	cl_vt;		/* virtual time */
+
+	struct internal_sc *cl_rsc;	/* internal real-time service curve */
+	struct internal_sc *cl_fsc;	/* internal fair service curve */
+	struct runtime_sc  cl_deadline;	/* deadline curve */
+	struct runtime_sc  cl_eligible;	/* eligible curve */
+	struct runtime_sc  cl_virtual;	/* virtual curve */
+
+	u_int		cl_vtperiod;	/* vt period sequence no */
+	u_int		cl_parentperiod; /* parent's vt period seqno */
+	int		cl_nactive;	/* number of active children */
+	actlist_t	*cl_actc;	/* active children list */
+
+	actentry_t	cl_actlist;	/* active children list entry */
+	elentry_t	cl_ellist;	/* eligible list entry */
+
+	struct {
+		struct pktcntr	xmit_cnt;
+		struct pktcntr	drop_cnt;
+		u_int period;
+	} cl_stats;
+};
+
+/*
+ * hfsc interface state
+ */
+struct hfsc_if {
+	struct hfsc_if		*hif_next;	/* interface state list */
+	struct ifaltq		*hif_ifq;	/* backpointer to ifaltq */
+	struct hfsc_class	*hif_rootclass;		/* root class */
+	struct hfsc_class	*hif_defaultclass;	/* default class */
+	struct hfsc_class	*hif_pollcache;	/* cache for poll operation */
+
+	u_int	hif_classes;		/* # of classes in the tree */
+	u_int	hif_packets;		/* # of packets in the tree */
+	u_int	hif_classid;		/* class id sequence number */
+
+	ellist_t *hif_eligible;		/* eligible list */
+
+	struct acc_classifier	hif_classifier;
+};
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ALTQ_ALTQ_HFSC_H_ */
diff --git 
a/sys/altq/altq_localq.c b/sys/altq/altq_localq.c new file mode 100644 index 000000000000..c42eec4d7763 --- /dev/null +++ b/sys/altq/altq_localq.c @@ -0,0 +1,68 @@ +/* $KAME: altq_localq.c,v 1.3 2000/10/18 09:15:23 kjc Exp $ */ + + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_LOCALQ /* localq is enabled by ALTQ_LOCALQ option in opt_altq.h */ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* + * localq device interface + */ +altqdev_decl(localq); + +int +localqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +localqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + int error = 0; + + return error; +} + +int +localqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + int error = 0; + + return error; +} + +#ifdef KLD_MODULE + +static struct altqsw localq_sw = + {"localq", localqopen, localqclose, localqioctl}; + +ALTQ_MODULE(altq_localq, ALTQT_LOCALQ, &localq_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_LOCALQ */ diff --git a/sys/altq/altq_priq.c b/sys/altq/altq_priq.c new file mode 100644 index 000000000000..e5b74d9313f7 --- /dev/null +++ b/sys/altq/altq_priq.c @@ -0,0 +1,865 @@ +/* $KAME: altq_priq.c,v 1.1 2000/10/18 09:15:23 kjc Exp $ */ +/* + * Copyright (C) 2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +/* + * priority queue + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#ifdef ALTQ_PRIQ /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* + * function prototypes + */ +static struct priq_if *priq_attach __P((struct ifaltq *, u_int)); +static int priq_detach __P((struct priq_if *)); +static int priq_clear_interface __P((struct priq_if *)); +static int priq_request __P((struct ifaltq *, int, void *)); +static void priq_purge __P((struct priq_if *)); +static struct priq_class *priq_class_create __P((struct priq_if *, + int, int, int)); +static int priq_class_destroy __P((struct priq_class *)); +static int priq_enqueue __P((struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +static struct mbuf *priq_dequeue __P((struct ifaltq *, int)); + +static int priq_addq __P((struct priq_class *, struct mbuf *)); +static struct mbuf *priq_getq __P((struct priq_class *)); +static struct mbuf *priq_pollq __P((struct priq_class *)); +static void priq_purgeq __P((struct priq_class *)); + +int priqopen __P((dev_t, int, int, struct proc *)); +int priqclose __P((dev_t, int, int, struct proc *)); +int priqioctl __P((dev_t, ioctlcmd_t, caddr_t, int, struct proc *)); +static int priqcmd_if_attach __P((struct priq_interface *)); +static int priqcmd_if_detach __P((struct priq_interface *)); +static int priqcmd_add_class __P((struct priq_add_class *)); +static int priqcmd_delete_class __P((struct priq_delete_class *)); +static int priqcmd_modify_class __P((struct priq_modify_class *)); +static int priqcmd_add_filter __P((struct priq_add_filter *)); +static int priqcmd_delete_filter __P((struct priq_delete_filter *)); +static int priqcmd_class_stats __P((struct priq_class_stats *)); +static void get_class_stats __P((struct class_stats *, struct priq_class *)); +static struct priq_class *clh_to_clp __P((struct priq_if *, u_long)); +static u_long clp_to_clh __P((struct priq_class *)); + +/* pif_list keeps all priq_if's allocated. */ +static struct priq_if *pif_list = NULL; + +static struct priq_if * +priq_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct priq_if *pif; + + MALLOC(pif, struct priq_if *, sizeof(struct priq_if), + M_DEVBUF, M_WAITOK); + if (pif == NULL) + return (NULL); + bzero(pif, sizeof(struct priq_if)); + pif->pif_bandwidth = bandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = ifq; + + /* add this state to the priq list */ + pif->pif_next = pif_list; + pif_list = pif; + + return (pif); +} + +static int +priq_detach(pif) + struct priq_if *pif; +{ + (void)priq_clear_interface(pif); + + /* remove this interface from the pif list */ + if (pif_list == pif) + pif_list = pif->pif_next; + else { + struct priq_if *p; + + for (p = pif_list; p != NULL; p = p->pif_next) + if (p->pif_next == pif) { + p->pif_next = pif->pif_next; + break; + } + ASSERT(p != NULL); + } + + FREE(pif, M_DEVBUF); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. 
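+ * the filters are discarded before the classes, so that no classifier
+ * entry is left referencing a class that is being destroyed.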
+ */ +static int +priq_clear_interface(pif) + struct priq_if *pif; +{ + struct priq_class *cl; + int pri; + + /* free the filters for this interface */ + acc_discard_filters(&pif->pif_classifier, NULL, 1); + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) + if ((cl = pif->pif_classes[pri]) != NULL) + priq_class_destroy(cl); + + return (0); +} + +static int +priq_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + priq_purge(pif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +priq_purge(pif) + struct priq_if *pif; +{ + struct priq_class *cl; + int pri; + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) + priq_purgeq(cl); + } + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + pif->pif_ifq->ifq_len = 0; +} + +static struct priq_class * +priq_class_create(pif, pri, qlimit, flags) + struct priq_if *pif; + int pri, qlimit, flags; +{ + struct priq_class *cl; + int s; + +#ifndef ALTQ_RED + if (flags & PRCF_RED) { + printf("priq_class_create: RED not configured for PRIQ!\n"); + return (NULL); + } +#endif + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + s = splimp(); + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + splx(s); +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } else { + MALLOC(cl, struct priq_class *, sizeof(struct priq_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct priq_class)); + + MALLOC(cl->cl_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->cl_q == NULL) + goto err_ret; + bzero(cl->cl_q, sizeof(class_queue_t)); + } + + pif->pif_classes[pri] = cl; + if (flags & PRCF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = (u_long)cl; /* XXX: just a pointer to this class */ + +#ifdef ALTQ_RED + if (flags & (PRCF_RED|PRCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & PRCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & PRCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (pif->pif_bandwidth == 0) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); +#ifdef ALTQ_RIO + if (flags & PRCF_RIO) { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } else +#endif + if (flags & PRCF_RED) { + cl->cl_red = red_alloc(0, 0, 0, 0, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } + } +#endif /* ALTQ_RED */ + + return (cl); + + err_ret: + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_q != NULL) + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + return (NULL); +} + +static int +priq_class_destroy(cl) + struct priq_class *cl; +{ + struct priq_if 
*pif; + int s, pri; + + s = splimp(); + + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_pif->pif_classifier, cl, 0); + + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + + pif = cl->cl_pif; + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + splx(s); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + return (0); +} + +/* + * priq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +priq_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + int len; + + /* grab class set by classifier */ + if (pktattr == NULL || (cl = pktattr->pattr_class) == NULL) + cl = pif->pif_default; + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + + len = m_pktlen(m); + if (priq_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in priq_addq. */ + PKTCNTR_ADD(&cl->cl_dropcnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + + /* successfully queued. */ + return (0); +} + +/* + * priq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. 
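+ *	priq meets this by scanning the classes from pif_maxpri downwards
+ *	and serving the first non-empty one; for ALTDQ_POLL the head of
+ *	that class queue is returned without being unlinked.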
+ */ +static struct mbuf * +priq_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct mbuf *m; + int pri; + + if (IFQ_IS_EMPTY(ifq)) + /* no packet in the queue */ + return (NULL); + + for (pri = pif->pif_maxpri; pri >= 0; pri--) { + if ((cl = pif->pif_classes[pri]) != NULL && + !qempty(cl->cl_q)) { + if (op == ALTDQ_POLL) + return (priq_pollq(cl)); + + m = priq_getq(cl); + if (m != NULL) { + IFQ_DEC_LEN(ifq); + if (qempty(cl->cl_q)) + cl->cl_period++; + PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m)); + } + return (m); + } + } + return (NULL); +} + +static int +priq_addq(cl, m) + struct priq_class *cl; + struct mbuf *m; +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, + cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & PRCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +priq_getq(cl) + struct priq_class *cl; +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +priq_pollq(cl) + struct priq_class *cl; +{ + return qhead(cl->cl_q); +} + +static void +priq_purgeq(cl) + struct priq_class *cl; +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); + m_freem(m); + } + ASSERT(qlen(cl->cl_q) == 0); +} + +/* + * priq device interface + */ +int +priqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + /* everything will be done when the queueing scheme is attached. 
*/ + return 0; +} + +int +priqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + struct priq_if *pif; + int err, error = 0; + + while ((pif = pif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + err = altq_detach(pif->pif_ifq); + if (err == 0) + err = priq_detach(pif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +priqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + struct priq_if *pif; + struct priq_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case PRIQ_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case PRIQ_IF_ATTACH: + error = priqcmd_if_attach((struct priq_interface *)addr); + break; + + case PRIQ_IF_DETACH: + error = priqcmd_if_detach((struct priq_interface *)addr); + break; + + case PRIQ_ENABLE: + case PRIQ_DISABLE: + case PRIQ_CLEAR: + ifacep = (struct priq_interface *)addr; + if ((pif = altq_lookup(ifacep->ifname, + ALTQT_PRIQ)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + case PRIQ_ENABLE: + if (pif->pif_default == NULL) { +#if 1 + printf("priq: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(pif->pif_ifq); + break; + + case PRIQ_DISABLE: + error = altq_disable(pif->pif_ifq); + break; + + case PRIQ_CLEAR: + priq_clear_interface(pif); + break; + } + break; + + case PRIQ_ADD_CLASS: + error = priqcmd_add_class((struct priq_add_class *)addr); + break; + + case PRIQ_DEL_CLASS: + error = priqcmd_delete_class((struct priq_delete_class *)addr); + break; + + case PRIQ_MOD_CLASS: + error = priqcmd_modify_class((struct priq_modify_class *)addr); + break; + + case PRIQ_ADD_FILTER: + error = priqcmd_add_filter((struct priq_add_filter *)addr); + break; + + case PRIQ_DEL_FILTER: + error = priqcmd_delete_filter((struct priq_delete_filter *)addr); + break; + + case PRIQ_GETSTATS: + error = priqcmd_class_stats((struct priq_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +priqcmd_if_attach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->ifname)) == NULL) + return (ENXIO); + + if ((pif = priq_attach(&ifp->if_snd, ap->arg)) == NULL) + return (ENOMEM); + + /* + * set PRIQ to this ifnet structure. 
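+	 * altq_attach registers the enqueue/dequeue/request handlers and
+	 * the classifier with the interface's ifaltq; on failure, the
+	 * state allocated by priq_attach above is released again.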
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, pif, + priq_enqueue, priq_dequeue, priq_request, + &pif->pif_classifier, acc_classify)) != 0) + (void)priq_detach(pif); + + return (error); +} + +static int +priqcmd_if_detach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + int error; + + if ((pif = altq_lookup(ap->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + if ((error = altq_detach(pif->pif_ifq))) + return (error); + + return priq_detach(pif); +} + +static int +priqcmd_add_class(ap) + struct priq_add_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = clp_to_clh(cl); + return (0); +} + +static int +priqcmd_delete_class(ap) + struct priq_delete_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return priq_class_destroy(cl); +} + +static int +priqcmd_modify_class(ap) + struct priq_modify_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + /* + * if priority is changed, move the class to the new priority + */ + if (pif->pif_classes[ap->pri] != cl) { + if (pif->pif_classes[ap->pri] != NULL) + return (EEXIST); + pif->pif_classes[cl->cl_pri] = NULL; + pif->pif_classes[ap->pri] = cl; + cl->cl_pri = ap->pri; + } + + /* call priq_class_create to change class parameters */ + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags)) == NULL) + return (ENOMEM); + return 0; +} + +static int +priqcmd_add_filter(ap) + struct priq_add_filter *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&pif->pif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +priqcmd_delete_filter(ap) + struct priq_delete_filter *ap; +{ + struct priq_if *pif; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&pif->pif_classifier, + ap->filter_handle); +} + +static int +priqcmd_class_stats(ap) + struct priq_class_stats *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + struct class_stats stats, *usp; + int pri, error; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + ap->maxpri = pif->pif_maxpri; + + /* then, read the next N classes in the tree */ + usp = ap->stats; + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + cl = pif->pif_classes[pri]; + if (cl != NULL) + get_class_stats(&stats, cl); + else + bzero(&stats, sizeof(stats)); + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + return (0); +} + +static void get_class_stats(sp, cl) + struct class_stats *sp; + struct priq_class *cl; +{ + sp->class_handle = clp_to_clh(cl); + + 
sp->qlength = qlen(cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif + +} + +/* convert a class handle to the corresponding class pointer */ +static struct priq_class * +clh_to_clp(pif, chandle) + struct priq_if *pif; + u_long chandle; +{ + struct priq_class *cl; + + cl = (struct priq_class *)chandle; + if (chandle != ALIGN(cl)) { +#if 1 + printf("clh_to_cl: unaligned pointer %p\n", cl); +#endif + return (NULL); + } + + if (cl == NULL || cl->cl_handle != chandle || cl->cl_pif != pif) + return (NULL); + return (cl); +} + +/* convert a class pointer to the corresponding class handle */ +static u_long +clp_to_clh(cl) + struct priq_class *cl; +{ + return (cl->cl_handle); +} + +#ifdef KLD_MODULE + +static struct altqsw priq_sw = + {"priq", priqopen, priqclose, priqioctl}; + +ALTQ_MODULE(altq_priq, ALTQT_PRIQ, &priq_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_PRIQ */ diff --git a/sys/altq/altq_priq.h b/sys/altq/altq_priq.h new file mode 100644 index 000000000000..36443ce779cb --- /dev/null +++ b/sys/altq/altq_priq.h @@ -0,0 +1,160 @@ +/* $KAME: altq_priq.h,v 1.1 2000/10/18 09:15:23 kjc Exp $ */ +/* + * Copyright (C) 2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_PRIQ_H_ +#define _ALTQ_ALTQ_PRIQ_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRIQ_MAXPRI 16 /* upper limit of the number of priorities */ + +struct priq_interface { + char ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ + u_long arg; /* request-specific argument */ +}; + +struct priq_add_class { + struct priq_interface iface; + int pri; /* priority (0 is the lowest) */ + int qlimit; /* queue size limit */ + int flags; /* misc flags (see below) */ + + u_long class_handle; /* return value */ +}; + +/* priq class flags */ +#define PRCF_RED 0x0001 /* use RED */ +#define PRCF_ECN 0x0002 /* use RED/ECN */ +#define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define PRCF_DEFAULTCLASS 0x1000 /* default class */ + +/* special class handles */ +#define PRIQ_NULLCLASS_HANDLE 0 + +struct priq_delete_class { + struct priq_interface iface; + u_long class_handle; +}; + +struct priq_modify_class { + struct priq_interface iface; + u_long class_handle; + int pri; + int qlimit; + int flags; +}; + +struct priq_add_filter { + struct priq_interface iface; + u_long class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct priq_delete_filter { + struct priq_interface iface; + u_long filter_handle; +}; + +struct class_stats { + u_long class_handle; + + u_int qlength; + u_int period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* red and rio related info */ + int qtype; + struct redstats red[3]; /* rio has 3 red stats */ +}; + +struct priq_class_stats { + struct priq_interface iface; + int maxpri; /* in/out */ + + struct class_stats *stats; /* pointer to stats array */ +}; + +#define PRIQ_IF_ATTACH _IOW('Q', 1, struct priq_interface) +#define PRIQ_IF_DETACH _IOW('Q', 2, struct priq_interface) +#define PRIQ_ENABLE _IOW('Q', 3, struct priq_interface) +#define PRIQ_DISABLE _IOW('Q', 4, struct priq_interface) +#define PRIQ_CLEAR _IOW('Q', 5, struct priq_interface) +#define PRIQ_ADD_CLASS _IOWR('Q', 7, struct priq_add_class) +#define PRIQ_DEL_CLASS _IOW('Q', 8, struct priq_delete_class) +#define PRIQ_MOD_CLASS _IOW('Q', 9, struct priq_modify_class) +#define PRIQ_ADD_FILTER _IOWR('Q', 10, struct priq_add_filter) +#define PRIQ_DEL_FILTER _IOW('Q', 11, struct priq_delete_filter) +#define PRIQ_GETSTATS _IOWR('Q', 12, struct priq_class_stats) + +#ifdef _KERNEL + +struct priq_class { + u_long cl_handle; /* class handle */ + class_queue_t *cl_q; /* class queue structure */ + struct red *cl_red; /* RED state */ + int cl_pri; /* priority */ + int cl_flags; /* class flags */ + struct priq_if *cl_pif; /* back pointer to pif */ + struct altq_pktattr *cl_pktattr; /* saved header used by ECN */ + + /* statistics */ + u_int cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +/* + * priq interface state + */ +struct priq_if { + struct priq_if *pif_next; /* interface state list */ + struct ifaltq *pif_ifq; /* backpointer to ifaltq */ + u_int pif_bandwidth; /* link bandwidth in bps */ + int pif_maxpri; /* max priority in use */ + struct priq_class *pif_default; /* default class */ + struct priq_class *pif_classes[PRIQ_MAXPRI]; /* classes */ + struct acc_classifier pif_classifier; /* classifier */ +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* 
_ALTQ_ALTQ_PRIQ_H_ */ diff --git a/sys/altq/altq_red.c b/sys/altq/altq_red.c new file mode 100644 index 000000000000..571ec092647f --- /dev/null +++ b/sys/altq/altq_red.c @@ -0,0 +1,1474 @@ +/* $KAME: altq_red.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+#include "opt_altq.h"
+#if (__FreeBSD__ != 2)
+#include "opt_inet.h"
+#ifdef __FreeBSD__
+#include "opt_inet6.h"
+#endif
+#endif
+#endif /* __FreeBSD__ || __NetBSD__ */
+#ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#ifdef ALTQ_FLOWVALVE
+#include 
+#include 
+#endif
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#ifdef INET6
+#include 
+#endif
+
+#include 
+#include 
+#include 
+#ifdef ALTQ_FLOWVALVE
+#include 
+#endif
+
+/*
+ * ALTQ/RED (Random Early Detection) implementation using 32-bit
+ * fixed-point calculation.
+ *
+ * written by kjc using the ns code as a reference.
+ * you can learn more about red and ns from Sally's home page at
+ * http://www-nrg.ee.lbl.gov/floyd/
+ *
+ * most of the red parameter values are fixed in this implementation
+ * to prevent fixed-point overflow/underflow.
+ * if you change the parameters, watch out for overflow/underflow!
+ *
+ * the parameters used are recommended values by Sally.
+ * the corresponding ns config looks like:
+ *	q_weight=0.00195
+ *	minthresh=5 maxthresh=15 queue-size=60
+ *	linterm=30
+ *	dropmech=drop-tail
+ *	bytes=false (can't be handled by 32-bit fixed-point)
+ *	doubleq=false dqthresh=false
+ *	wait=true
+ */
+/*
+ * alternative red parameters for a slow link.
+ *
+ * assume the queue length grows from zero to L and stays at L; it takes
+ * N packets for q_avg to reach 63% of L.
+ * when q_weight is 0.002, N is about 500 packets.
+ * for a slow link like dial-up, 500 packets takes more than 1 minute!
+ * when q_weight is 0.008, N is about 127 packets.
+ * when q_weight is 0.016, N is about 63 packets.
+ * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
+ * are allowed for 0.016.
+ * see Sally's paper for more details.
+ */
+/* normal red parameters */
+#define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
+				/* q_weight = 0.00195 */
+
+/* red parameters for a slow link */
+#define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
+				/* q_weight = 0.0078125 */
+
+/* red parameters for a very slow link (e.g., dialup) */
+#define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
+				/* q_weight = 0.015625 */
+
+/* fixed-point uses 12-bit decimal places */
+#define	FP_SHIFT	12	/* fixed-point shift */
+
+/* red parameters for drop probability */
+#define	INV_P_MAX	10	/* inverse of max drop probability */
+#define	TH_MIN		5	/* min threshold */
+#define	TH_MAX		15	/* max threshold */
+
+#define	RED_LIMIT	60	/* default max queue length */
+#define	RED_STATS		/* collect statistics */
+
+/*
+ * our default policy for forced-drop is drop-tail.
+ * (in altq-1.1.2 or earlier, the default was random-drop.
+ * but it makes more sense to punish the cause of the surge.)
+ * to switch to the random-drop policy, define "RED_RANDOM_DROP".
+ */
+
+#ifdef ALTQ_FLOWVALVE
+/*
+ * flow-valve is an extension to protect red from unresponsive flows
+ * and to promote end-to-end congestion control.
+ * flow-valve observes the average drop rates of the flows that have
+ * experienced packet drops in the recent past.
+ * when the average drop rate exceeds the threshold, the flow is
+ * blocked by the flow-valve.  the trapped flow should back off
+ * exponentially to escape from the flow-valve.
+ */
+#ifdef RED_RANDOM_DROP
+#error "random-drop can't be used with flow-valve!"
+#endif
+#endif /* ALTQ_FLOWVALVE */
+
+/* red_list keeps all red_queue_t's allocated. 
*/ +static red_queue_t *red_list = NULL; + +/* default red parameter values */ +static int default_th_min = TH_MIN; +static int default_th_max = TH_MAX; +static int default_inv_pmax = INV_P_MAX; + +/* internal function prototypes */ +static int red_enqueue __P((struct ifaltq *, struct mbuf *, + struct altq_pktattr *)); +static struct mbuf *red_dequeue __P((struct ifaltq *, int)); +static int red_request __P((struct ifaltq *, int, void *)); +static void red_purgeq __P((red_queue_t *)); +static int red_detach __P((red_queue_t *)); +#ifdef ALTQ_FLOWVALVE +static __inline struct fve *flowlist_lookup __P((struct flowvalve *, + struct altq_pktattr *, struct timeval *)); +static __inline struct fve *flowlist_reclaim __P((struct flowvalve *, + struct altq_pktattr *)); +static __inline void flowlist_move_to_head __P((struct flowvalve *, + struct fve *)); +static __inline int fv_p2f __P((struct flowvalve *, int)); +static struct flowvalve *fv_alloc __P((struct red *)); +static void fv_destroy __P((struct flowvalve *)); +static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *, + struct fve **)); +static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *, + struct fve *)); +#endif + +/* + * red device interface + */ +altqdev_decl(red); + +int +redopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +redclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + red_queue_t *rqp; + int err, error = 0; + + while ((rqp = red_list) != NULL) { + /* destroy all */ + err = red_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +redioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + red_queue_t *rqp; + struct red_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RED_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + switch (cmd) { + + case RED_ENABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RED_DISABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case RED_IF_ATTACH: + ifp = ifunit(((struct red_interface *)addr)->red_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize red_queue_t */ + MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + bzero(rqp, sizeof(red_queue_t)); + + MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0); + if (rqp->rq_red == NULL) { + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RED_LIMIT; + qtype(rqp->rq_q) = Q_RED; + + /* + * set RED to this ifnet structure. 
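+		 * note that red attaches with a NULL classifier: the
+		 * discipline is a single queue managed by red, so no
+		 * per-flow classification is needed at enqueue time.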
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp, + red_enqueue, red_dequeue, red_request, + NULL, NULL); + if (error) { + red_destroy(rqp->rq_red); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + break; + } + + /* add this state to the red list */ + rqp->rq_next = red_list; + red_list = rqp; + break; + + case RED_IF_DETACH: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = red_detach(rqp); + break; + + case RED_GETSTATS: + do { + struct red_stats *q_stats; + red_t *rp; + + q_stats = (struct red_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + + q_stats->q_len = qlen(rqp->rq_q); + q_stats->q_limit = qlimit(rqp->rq_q); + + rp = rqp->rq_red; + q_stats->q_avg = rp->red_avg >> rp->red_wshift; + q_stats->xmit_cnt = rp->red_stats.xmit_cnt; + q_stats->drop_cnt = rp->red_stats.drop_cnt; + q_stats->drop_forced = rp->red_stats.drop_forced; + q_stats->drop_unforced = rp->red_stats.drop_unforced; + q_stats->marked_packets = rp->red_stats.marked_packets; + + q_stats->weight = rp->red_weight; + q_stats->inv_pmax = rp->red_inv_pmax; + q_stats->th_min = rp->red_thmin; + q_stats->th_max = rp->red_thmax; + +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) { + struct flowvalve *fv = rp->red_flowvalve; + q_stats->fv_flows = fv->fv_flows; + q_stats->fv_pass = fv->fv_stats.pass; + q_stats->fv_predrop = fv->fv_stats.predrop; + q_stats->fv_alloc = fv->fv_stats.alloc; + q_stats->fv_escape = fv->fv_stats.escape; + } else { +#endif /* ALTQ_FLOWVALVE */ + q_stats->fv_flows = 0; + q_stats->fv_pass = 0; + q_stats->fv_predrop = 0; + q_stats->fv_alloc = 0; + q_stats->fv_escape = 0; +#ifdef ALTQ_FLOWVALVE + } +#endif /* ALTQ_FLOWVALVE */ + } while (0); + break; + + case RED_CONFIG: + do { + struct red_conf *fc; + red_t *new; + int s, limit; + + fc = (struct red_conf *)addr; + if ((rqp = altq_lookup(fc->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + new = red_alloc(fc->red_weight, + fc->red_inv_pmax, + fc->red_thmin, + fc->red_thmax, + fc->red_flags, + fc->red_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + + s = splimp(); + red_purgeq(rqp); + limit = fc->red_limit; + if (limit < fc->red_thmax) + limit = fc->red_thmax; + qlimit(rqp->rq_q) = limit; + fc->red_limit = limit; /* write back the new value */ + + red_destroy(rqp->rq_red); + rqp->rq_red = new; + + splx(s); + + /* write back new values */ + fc->red_limit = limit; + fc->red_inv_pmax = rqp->rq_red->red_inv_pmax; + fc->red_thmin = rqp->rq_red->red_thmin; + fc->red_thmax = rqp->rq_red->red_thmax; + + } while (0); + break; + + case RED_SETDEFAULTS: + do { + struct redparams *rp; + + rp = (struct redparams *)addr; + + default_th_min = rp->th_min; + default_th_max = rp->th_max; + default_inv_pmax = rp->inv_pmax; + } while (0); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +red_detach(rqp) + red_queue_t *rqp; +{ + red_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (red_list == rqp) + red_list = rqp->rq_next; + else { + for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("red_detach: no state found in red_list!\n"); + } + + red_destroy(rqp->rq_red); + FREE(rqp->rq_q, M_DEVBUF); + 
FREE(rqp, M_DEVBUF);
+	return (error);
+}
+
+/*
+ * red support routines
+ */
+
+red_t *
+red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
+	int	weight, inv_pmax, th_min, th_max;
+	int	flags, pkttime;
+{
+	red_t	*rp;
+	int	w, i;
+	int	npkts_per_sec;
+
+	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
+	if (rp == NULL)
+		return (NULL);
+	bzero(rp, sizeof(red_t));
+
+	rp->red_avg = 0;
+	rp->red_idle = 1;
+
+	if (weight == 0)
+		rp->red_weight = W_WEIGHT;
+	else
+		rp->red_weight = weight;
+	if (inv_pmax == 0)
+		rp->red_inv_pmax = default_inv_pmax;
+	else
+		rp->red_inv_pmax = inv_pmax;
+	if (th_min == 0)
+		rp->red_thmin = default_th_min;
+	else
+		rp->red_thmin = th_min;
+	if (th_max == 0)
+		rp->red_thmax = default_th_max;
+	else
+		rp->red_thmax = th_max;
+
+	rp->red_flags = flags;
+
+	if (pkttime == 0)
+		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
+		rp->red_pkttime = 800;
+	else
+		rp->red_pkttime = pkttime;
+
+	if (weight == 0) {
+		/* when the link is very slow, adjust red parameters */
+		npkts_per_sec = 1000000 / rp->red_pkttime;
+		if (npkts_per_sec < 50) {
+			/* up to about 400Kbps */
+			rp->red_weight = W_WEIGHT_2;
+		} else if (npkts_per_sec < 300) {
+			/* up to about 2.4Mbps */
+			rp->red_weight = W_WEIGHT_1;
+		}
+	}
+
+	/* calculate wshift.  weight must be a power of 2 */
+	w = rp->red_weight;
+	for (i = 0; w > 1; i++)
+		w = w >> 1;
+	rp->red_wshift = i;
+	w = 1 << rp->red_wshift;
+	if (w != rp->red_weight) {
+		printf("invalid weight value %d for red! use %d\n",
+		       rp->red_weight, w);
+		rp->red_weight = w;
+	}
+
+	/*
+	 * thmin_s and thmax_s are scaled versions of th_min and th_max
+	 * to be compared with avg.
+	 */
+	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
+	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
+
+	/*
+	 * precompute probability denominator
+	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
+	 */
+	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
+			 * rp->red_inv_pmax) << FP_SHIFT;
+
+	/* allocate weight table */
+	rp->red_wtab = wtab_alloc(rp->red_weight);
+
+	microtime(&rp->red_last);
+#ifdef ALTQ_FLOWVALVE
+	if (flags & REDF_FLOWVALVE)
+		rp->red_flowvalve = fv_alloc(rp);
+	/* if fv_alloc fails, flowvalve is just disabled */
+#endif
+	return (rp);
+}
+
+void
+red_destroy(rp)
+	red_t *rp;
+{
+#ifdef ALTQ_FLOWVALVE
+	if (rp->red_flowvalve != NULL)
+		fv_destroy(rp->red_flowvalve);
+#endif
+	wtab_destroy(rp->red_wtab);
+	FREE(rp, M_DEVBUF);
+}
+
+void
+red_getstats(rp, sp)
+	red_t *rp;
+	struct redstats *sp;
+{
+	sp->q_avg		= rp->red_avg >> rp->red_wshift;
+	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
+	sp->drop_cnt		= rp->red_stats.drop_cnt;
+	sp->drop_forced		= rp->red_stats.drop_forced;
+	sp->drop_unforced	= rp->red_stats.drop_unforced;
+	sp->marked_packets	= rp->red_stats.marked_packets;
+}
+
+/*
+ * enqueue routine:
+ *
+ *	returns: 0 when successfully queued.
+ *		 ENOBUFS when drop occurs.
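+ *
+ *	note: when ENOBUFS is returned, the mbuf has already been
+ *	freed by red_addq(); the caller must not free it again.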
+ */ +static int +red_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0) + return ENOBUFS; + ifq->ifq_len++; + return 0; +} + +int +red_addq(rp, q, m, pktattr) + red_t *rp; + class_queue_t *q; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + int avg, droptype; + int n; +#ifdef ALTQ_FLOWVALVE + struct fve *fve = NULL; + + if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0) + if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) { + m_freem(m); + return (-1); + } +#endif + + avg = rp->red_avg; + + /* + * if we were idle, we pretend that n packets arrived during + * the idle period. + */ + if (rp->red_idle) { + struct timeval now; + int t; + + rp->red_idle = 0; + microtime(&now); + t = (now.tv_sec - rp->red_last.tv_sec); + if (t > 60) { + /* + * being idle for more than 1 minute, set avg to zero. + * this prevents t from overflow. + */ + avg = 0; + } else { + t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); + n = t / rp->red_pkttime - 1; + + /* the following line does (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->red_wtab, n); + } + } + + /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ + avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); + rp->red_avg = avg; /* save the new value */ + + /* + * red_count keeps a tally of arriving traffic that has not + * been dropped. + */ + rp->red_count++; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= rp->red_thmin_s && qlen(q) > 1) { + if (avg >= rp->red_thmax_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (rp->red_old == 0) { + /* first exceeds th_min */ + rp->red_count = 1; + rp->red_old = 1; + } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, + rp->red_probd, rp->red_count)) { + /* mark or drop by red */ + if ((rp->red_flags & REDF_ECN) && + mark_ecn(m, pktattr, rp->red_flags)) { + /* successfully marked. do not drop. */ + rp->red_count = 0; +#ifdef RED_STATS + rp->red_stats.marked_packets++; +#endif + } else { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } + } else { + /* avg < th_min */ + rp->red_old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + +#ifdef RED_RANDOM_DROP + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); +#else + /* if successful, enqueue this packet. */ + if (droptype == DTYPE_NODROP) + _addq(q, m); +#endif + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ +#ifdef RED_STATS + rp->red_stats.drop_unforced++; +#endif + } else { + /* forced drop, select a victim packet in the queue. 
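+			 * with RED_RANDOM_DROP the victim is drawn
+			 * from the queue by _getq_random(); otherwise
+			 * the arriving packet itself, never enqueued
+			 * above, is the victim.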
*/ +#ifdef RED_RANDOM_DROP + m = _getq_random(q); +#endif +#ifdef RED_STATS + rp->red_stats.drop_forced++; +#endif + } +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); +#endif + rp->red_count = 0; +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_dropbyred(rp->red_flowvalve, pktattr, fve); +#endif + m_freem(m); + return (-1); + } + /* successfully queued */ +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +/* + * early-drop probability is calculated as follows: + * prob = p_max * (avg - th_min) / (th_max - th_min) + * prob_a = prob / (2 - count*prob) + * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) + * here prob_a increases as successive undrop count increases. + * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), + * becomes 1 when (count >= (2 / prob))). + */ +int +drop_early(fp_len, fp_probd, count) + int fp_len; /* (avg - TH_MIN) in fixed-point */ + int fp_probd; /* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */ + int count; /* how many successive undropped packets */ +{ + int d; /* denominator of drop-probability */ + + d = fp_probd - count * fp_len; + if (d <= 0) + /* count exceeds the hard limit: drop or mark */ + return (1); + + /* + * now the range of d is [1..600] in fixed-point. (when + * th_max-th_min=10 and p_max=1/30) + * drop probability = (avg - TH_MIN) / d + */ + + if ((random() % d) < fp_len) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. + */ +int +mark_ecn(m, pktattr, flags) + struct mbuf *m; + struct altq_pktattr *pktattr; + int flags; +{ + struct mbuf *m0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return (0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; + return (0); + } + + switch (pktattr->pattr_af) { + case AF_INET: + if (flags & REDF_ECN4) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + + if (ip->ip_v != 4) + return (0); /* version mismatch! */ + if (ip->ip_tos & IPTOS_ECT) { + /* ECN-capable, mark ECN bit. */ + if ((ip->ip_tos & IPTOS_CE) == 0) { +#if (IPTOS_CE == 0x01) + u_short sum; + + ip->ip_tos |= IPTOS_CE; + /* + * optimized version when IPTOS_CE + * is 0x01. + * HC' = HC -1 when HC > 0 + * = 0xfffe when HC = 0 + */ + sum = ntohs(ip->ip_sum); + if (sum == 0) + sum = 0xfffe; + else + sum -= 1; + ip->ip_sum = htons(sum); +#else /* IPTOS_CE != 0x01 */ + long sum; + + ip->ip_tos |= IPTOS_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xffff + IPTOS_CE; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); +#endif /* IPTOS_CE != 0x01 */ + } + return (1); + } + } + break; +#ifdef INET6 + case AF_INET6: + if (flags & REDF_ECN6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! */ + if (flowlabel & (IPTOS_ECT << 20)) { + /* ECN-capable, mark ECN bit. 
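+			 * unlike the IPv4 case, no checksum update is
+			 * needed here: the traffic class sits in the
+			 * flow label word, and IPv6 has no header
+			 * checksum.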
*/ + flowlabel |= (IPTOS_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. + */ + +static struct mbuf * +red_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + struct mbuf *m; + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + /* op == ALTDQ_REMOVE */ + m = red_getq(rqp->rq_red, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return (m); +} + +struct mbuf * +red_getq(rp, q) + red_t *rp; + class_queue_t *q; +{ + struct mbuf *m; + + if ((m = _getq(q)) == NULL) { + if (rp->red_idle == 0) { + rp->red_idle = 1; + microtime(&rp->red_last); + } + return NULL; + } + + rp->red_idle = 0; + return (m); +} + +static int +red_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + red_purgeq(rqp); + break; + } + return (0); +} + +static void +red_purgeq(rqp) + red_queue_t *rqp; +{ + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + rqp->rq_ifq->ifq_len = 0; +} + + +/* + * helper routine to calibrate avg during idle. + * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point + * here Wq = 1/weight and the code assumes Wq is close to zero. + * + * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. + */ +static struct wtab *wtab_list = NULL; /* pointer to wtab list */ + +struct wtab * +wtab_alloc(weight) + int weight; +{ + struct wtab *w; + int i; + + for (w = wtab_list; w != NULL; w = w->w_next) + if (w->w_weight == weight) { + w->w_refcount++; + return (w); + } + + MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK); + if (w == NULL) + panic("wtab_alloc: malloc failed!"); + bzero(w, sizeof(struct wtab)); + w->w_weight = weight; + w->w_refcount = 1; + w->w_next = wtab_list; + wtab_list = w; + + /* initialize the weight table */ + w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; + for (i = 1; i < 32; i++) { + w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; + if (w->w_tab[i] == 0 && w->w_param_max == 0) + w->w_param_max = 1 << i; + } + + return (w); +} + +int +wtab_destroy(w) + struct wtab *w; +{ + struct wtab *prev; + + if (--w->w_refcount > 0) + return (0); + + if (wtab_list == w) + wtab_list = w->w_next; + else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) + if (prev->w_next == w) { + prev->w_next = w->w_next; + break; + } + + FREE(w, M_DEVBUF); + return (0); +} + +int32_t +pow_w(w, n) + struct wtab *w; + int n; +{ + int i, bit; + int32_t val; + + if (n >= w->w_param_max) + return (0); + + val = 1 << FP_SHIFT; + if (n <= 0) + return (val); + + bit = 1; + i = 0; + while (n) { + if (n & bit) { + val = (val * w->w_tab[i]) >> FP_SHIFT; + n &= ~bit; + } + i++; + bit <<= 1; + } + return (val); +} + +#ifdef ALTQ_FLOWVALVE + +#define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */ +#define FV_PSCALE(x) ((x) << FV_PSHIFT) +#define FV_PUNSCALE(x) ((x) >> FV_PSHIFT) +#define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */ +#define FV_FSCALE(x) ((x) << FV_FSHIFT) +#define FV_FUNSCALE(x) ((x) >> FV_FSHIFT) + +#define FV_TIMER (3 * hz) /* timer value for garbage collector */ +#define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */ + +#define FV_N 10 /* update fve_f every FV_N packets */ + +#define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */ 
+#define FV_TTHRESH 3 /* time threshold to delete fve */ +#define FV_ALPHA 5 /* extra packet count */ + +#define FV_STATS + +#if (__FreeBSD_version > 300000) +#define FV_TIMESTAMP(tp) getmicrotime(tp) +#else +#define FV_TIMESTAMP(tp) { (*(tp)) = time; } +#endif + +/* + * Brtt table: 127 entry table to convert drop rate (p) to + * the corresponding bandwidth fraction (f) + * the following equation is implemented to use scaled values, + * fve_p and fve_f, in the fixed point format. + * + * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p)) + * f = Brtt(p) / (max_th + alpha) + */ +#define BRTT_SIZE 128 +#define BRTT_SHIFT 12 +#define BRTT_MASK 0x0007f000 +#define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT)) + +const int brtt_tab[BRTT_SIZE] = { + 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728, + 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361, + 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333, + 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612, + 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957, + 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440, + 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184, + 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611, + 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062, + 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487, + 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222, + 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844, + 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079, + 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746, + 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722, + 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924 +}; + +static __inline struct fve * +flowlist_lookup(fv, pktattr, now) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct timeval *now; +{ + struct fve *fve; + int flows; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + struct timeval tthresh; + + if (pktattr == NULL) + return (NULL); + + tthresh.tv_sec = now->tv_sec - FV_TTHRESH; + flows = 0; + /* + * search the flow list + */ + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET && + fve->fve_flow.flow_ip.ip_src.s_addr == + ip->ip_src.s_addr && + fve->fve_flow.flow_ip.ip_dst.s_addr == + ip->ip_dst.s_addr) + return (fve); + flows++; + } + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src, + &ip6->ip6_src) && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst, + &ip6->ip6_dst)) + return (fve); + flows++; + } + break; +#endif /* INET6 */ + + default: + /* unknown protocol. no drop. */ + return (NULL); + } + fv->fv_flows = flows; /* save the number of active fve's */ + return (NULL); +} + +static __inline struct fve * +flowlist_reclaim(fv, pktattr) + struct flowvalve *fv; + struct altq_pktattr *pktattr; +{ + struct fve *fve; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + + /* + * get an entry from the tail of the LRU list. 
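+	 * the list is kept in most-recently-dropped order, so the
+	 * tail is the stalest entry and the safest one to recycle.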
+ */ + fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead); + + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET; + fve->fve_flow.flow_ip.ip_src = ip->ip_src; + fve->fve_flow.flow_ip.ip_dst = ip->ip_dst; + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET6; + fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src; + fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst; + break; +#endif + } + + fve->fve_state = Green; + fve->fve_p = 0.0; + fve->fve_f = 0.0; + fve->fve_ifseq = fv->fv_ifseq - 1; + fve->fve_count = 0; + + fv->fv_flows++; +#ifdef FV_STATS + fv->fv_stats.alloc++; +#endif + return (fve); +} + +static __inline void +flowlist_move_to_head(fv, fve) + struct flowvalve *fv; + struct fve *fve; +{ + if (TAILQ_FIRST(&fv->fv_flowlist) != fve) { + TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru); + TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru); + } +} + +/* + * allocate flowvalve structure + */ +static struct flowvalve * +fv_alloc(rp) + struct red *rp; +{ + struct flowvalve *fv; + struct fve *fve; + int i, num; + + num = FV_FLOWLISTSIZE; + MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve), + M_DEVBUF, M_WAITOK); + if (fv == NULL) + return (NULL); + bzero(fv, sizeof(struct flowvalve)); + + MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num, + M_DEVBUF, M_WAITOK); + if (fv->fv_fves == NULL) { + FREE(fv, M_DEVBUF); + return (NULL); + } + bzero(fv->fv_fves, sizeof(struct fve) * num); + + fv->fv_flows = 0; + TAILQ_INIT(&fv->fv_flowlist); + for (i = 0; i < num; i++) { + fve = &fv->fv_fves[i]; + fve->fve_lastdrop.tv_sec = 0; + TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru); + } + + /* initialize drop rate threshold in scaled fixed-point */ + fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax; + + /* initialize drop rate to fraction table */ + MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE, + M_DEVBUF, M_WAITOK); + if (fv->fv_p2ftab == NULL) { + FREE(fv->fv_fves, M_DEVBUF); + FREE(fv, M_DEVBUF); + return (NULL); + } + /* + * create the p2f table. + * (shift is used to keep the precision) + */ + for (i = 1; i < BRTT_SIZE; i++) { + int f; + + f = brtt_tab[i] << 8; + fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8; + } + + return (fv); +} + +static void fv_destroy(fv) + struct flowvalve *fv; +{ + FREE(fv->fv_p2ftab, M_DEVBUF); + FREE(fv->fv_fves, M_DEVBUF); + FREE(fv, M_DEVBUF); +} + +static __inline int +fv_p2f(fv, p) + struct flowvalve *fv; + int p; +{ + int val, f; + + if (p >= BRTT_PMAX) + f = fv->fv_p2ftab[BRTT_SIZE-1]; + else if ((val = (p & BRTT_MASK))) + f = fv->fv_p2ftab[(val >> BRTT_SHIFT)]; + else + f = fv->fv_p2ftab[1]; + return (f); +} + +/* + * check if an arriving packet should be pre-dropped. + * called from red_addq() when a packet arrives. + * returns 1 when the packet should be pre-dropped. + * should be called in splimp. 
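+ *
+ * the trap threshold fv_pthresh is FV_PSCALE(1/pmax) in fixed-point;
+ * e.g., with FV_PSHIFT 7, FP_SHIFT 12 and inv_pmax 10 it is
+ * ((1 << 7) << 12) / 10 = 52428.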
+ */ +static int +fv_checkflow(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve **fcache; +{ + struct fve *fve; + struct timeval now; + + fv->fv_ifseq++; + FV_TIMESTAMP(&now); + + if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + /* no matching entry in the flowlist */ + return (0); + + *fcache = fve; + + /* update fraction f for every FV_N packets */ + if (++fve->fve_count == FV_N) { + /* + * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f + */ + fve->fve_f = + (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq) + + fve->fve_f - FV_FUNSCALE(fve->fve_f); + fve->fve_ifseq = fv->fv_ifseq; + fve->fve_count = 0; + } + + /* + * overpumping test + */ + if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) { + int fthresh; + + /* calculate a threshold */ + fthresh = fv_p2f(fv, fve->fve_p); + if (fve->fve_f > fthresh) + fve->fve_state = Red; + } + + if (fve->fve_state == Red) { + /* + * backoff test + */ + if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) { + /* no drop for at least FV_BACKOFFTHRESH sec */ + fve->fve_p = 0; + fve->fve_state = Green; +#ifdef FV_STATS + fv->fv_stats.escape++; +#endif + } else { + /* block this flow */ + flowlist_move_to_head(fv, fve); + fve->fve_lastdrop = now; +#ifdef FV_STATS + fv->fv_stats.predrop++; +#endif + return (1); + } + } + + /* + * p = (1 - Wp) * p + */ + fve->fve_p -= FV_PUNSCALE(fve->fve_p); + if (fve->fve_p < 0) + fve->fve_p = 0; +#ifdef FV_STATS + fv->fv_stats.pass++; +#endif + return (0); +} + +/* + * called from red_addq when a packet is dropped by red. + * should be called in splimp. + */ +static void fv_dropbyred(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve *fcache; +{ + struct fve *fve; + struct timeval now; + + if (pktattr == NULL) + return; + FV_TIMESTAMP(&now); + + if (fcache != NULL) + /* the fve of this packet is already cached */ + fve = fcache; + else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + fve = flowlist_reclaim(fv, pktattr); + + flowlist_move_to_head(fv, fve); + + /* + * update p: the following line cancels the update + * in fv_checkflow() and calculate + * p = Wp + (1 - Wp) * p + */ + fve->fve_p = (1 << FP_SHIFT) + fve->fve_p; + + fve->fve_lastdrop = now; +} + +#endif /* ALTQ_FLOWVALVE */ + +#ifdef KLD_MODULE + +static struct altqsw red_sw = + {"red", redopen, redclose, redioctl}; + +ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_RED */ diff --git a/sys/altq/altq_red.h b/sys/altq/altq_red.h new file mode 100644 index 000000000000..a93054293aaf --- /dev/null +++ b/sys/altq/altq_red.h @@ -0,0 +1,189 @@ +/* $KAME: altq_red.h,v 1.5 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RED_H_ +#define _ALTQ_ALTQ_RED_H_ + +#include + +struct red_interface { + char red_ifname[IFNAMSIZ]; +}; + +struct red_stats { + struct red_interface iface; + int q_len; + int q_avg; + + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + + /* static red parameters */ + int q_limit; + int weight; + int inv_pmax; + int th_min; + int th_max; + + /* flowvalve related stuff */ + u_int fv_flows; + u_int fv_pass; + u_int fv_predrop; + u_int fv_alloc; + u_int fv_escape; +}; + +struct red_conf { + struct red_interface iface; + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + int red_limit; /* max queue length */ + int red_pkttime; /* average packet time in usec */ + int red_flags; /* see below */ +}; + +/* red flags */ +#define REDF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define REDF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define REDF_ECN (REDF_ECN4 | REDF_ECN6) +#define REDF_FLOWVALVE 0x04 /* use flowvalve (aka penalty-box) */ + +/* + * simpler versions of red parameters and statistics used by other + * disciplines (e.g., CBQ) + */ +struct redparams { + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + int inv_pmax; /* inverse of max drop probability */ +}; + +struct redstats { + int q_avg; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; +}; + + +/* + * IOCTLs for RED + */ +#define RED_IF_ATTACH _IOW('Q', 1, struct red_interface) +#define RED_IF_DETACH _IOW('Q', 2, struct red_interface) +#define RED_ENABLE _IOW('Q', 3, struct red_interface) +#define RED_DISABLE _IOW('Q', 4, struct red_interface) +#define RED_CONFIG _IOWR('Q', 6, struct red_conf) +#define RED_GETSTATS _IOWR('Q', 12, struct red_stats) +#define RED_SETDEFAULTS _IOW('Q', 30, struct redparams) + +#ifdef _KERNEL + +struct flowvalve; + +/* weight table structure for idle time calibration */ +struct wtab { + struct wtab *w_next; + int w_weight; + int w_param_max; + int w_refcount; + int32_t w_tab[32]; +}; + +typedef struct red { + int red_pkttime; /* average packet time in micro sec + used for idle calibration */ + int red_flags; /* red flags */ + + /* red parameters */ + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + + /* variables for internal use */ + int red_wshift; /* log(red_weight) */ + int red_thmin_s; /* th_min scaled by avgshift */ + int red_thmax_s; /* th_max scaled by avgshift */ 
+ int red_probd; /* drop probability denominator */ + + int red_avg; /* queue length average scaled by avgshift */ + int red_count; /* packet count since the last dropped/marked + packet */ + int red_idle; /* queue was empty */ + int red_old; /* avg is above th_min */ + struct wtab *red_wtab; /* weight table */ + struct timeval red_last; /* timestamp when the queue becomes idle */ + + struct flowvalve *red_flowvalve; /* flowvalve state */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + } red_stats; +} red_t; + +typedef struct red_queue { + struct red_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + red_t *rq_red; +} red_queue_t; + +/* red drop types */ +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +extern red_t *red_alloc __P((int, int, int, int, int, int)); +extern void red_destroy __P((red_t *)); +extern void red_getstats __P((red_t *, struct redstats *)); +extern int red_addq __P((red_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *)); +extern struct mbuf *red_getq __P((red_t *, class_queue_t *)); +extern int drop_early __P((int, int, int)); +extern int mark_ecn __P((struct mbuf *, struct altq_pktattr *, int)); +extern struct wtab *wtab_alloc __P((int)); +extern int wtab_destroy __P((struct wtab *)); +extern int32_t pow_w __P((struct wtab *, int)); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RED_H_ */ diff --git a/sys/altq/altq_rio.c b/sys/altq/altq_rio.c new file mode 100644 index 000000000000..0d19c9e2e162 --- /dev/null +++ b/sys/altq/altq_rio.c @@ -0,0 +1,828 @@ +/* $KAME: altq_rio.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1998-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include +#include +#include + +/* + * RIO: RED with IN/OUT bit + * described in + * "Explicit Allocation of Best Effort Packet Delivery Service" + * David D. Clark and Wenjia Fang, MIT Lab for Computer Science + * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} + * + * this implementation is extended to support more than 2 drop precedence + * values as described in RFC2597 (Assured Forwarding PHB Group). + * + */ +/* + * AF DS (differentiated service) codepoints. + * (classes can be mapped to CBQ or H-FSC classes.) 
+ *
+ *	 0   1   2   3   4   5   6   7
+ *	+---+---+---+---+---+---+---+---+
+ *	|   CLASS   |DropPre| 0 |  CU   |
+ *	+---+---+---+---+---+---+---+---+
+ *
+ *	class 1:	001
+ *	class 2:	010
+ *	class 3:	011
+ *	class 4:	100
+ *
+ *	low drop prec:		01
+ *	medium drop prec:	10
+ *	high drop prec:		11
+ */
+
+/* normal red parameters */
+#define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
+				/* q_weight = 0.00195 */
+
+/* red parameters for a slow link */
+#define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
+				/* q_weight = 0.0078125 */
+
+/* red parameters for a very slow link (e.g., dialup) */
+#define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
+				/* q_weight = 0.015625 */
+
+/* fixed-point uses 12-bit decimal places */
+#define	FP_SHIFT	12	/* fixed-point shift */
+
+/* red parameters for drop probability */
+#define	INV_P_MAX	10	/* inverse of max drop probability */
+#define	TH_MIN		5	/* min threshold */
+#define	TH_MAX		15	/* max threshold */
+
+#define	RIO_LIMIT	60	/* default max queue length */
+#define	RIO_STATS	/* collect statistics */
+
+#define	TV_DELTA(a, b, delta) {					\
+	register int	xxs;					\
+								\
+	delta = (a)->tv_usec - (b)->tv_usec;			\
+	if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) {		\
+		if (xxs < 0) {					\
+			printf("rm_class: bogus time values");	\
+			delta = 60000000;			\
+		} else if (xxs > 4) {				\
+			if (xxs > 60)				\
+				delta = 60000000;		\
+			else					\
+				delta += xxs * 1000000;		\
+		} else while (xxs > 0) {			\
+			delta += 1000000;			\
+			xxs--;					\
+		}						\
+	}							\
+}
+
+/* rio_list keeps all rio_queue_t's allocated. */
+static rio_queue_t *rio_list = NULL;
+/* default rio parameter values */
+static struct redparams default_rio_params[RIO_NDROPPREC] = {
+  /* th_min,		 th_max,     inv_pmax */
+  { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */
+  { TH_MAX + TH_MIN,	 TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */
+  { TH_MIN,		 TH_MAX,     INV_P_MAX }  /* high drop precedence */
+};
+
+/* internal function prototypes */
+static int rio_enqueue __P((struct ifaltq *, struct mbuf *,
+			    struct altq_pktattr *));
+static struct mbuf *rio_dequeue __P((struct ifaltq *, int));
+static int rio_request __P((struct ifaltq *, int, void *));
+static int rio_detach __P((rio_queue_t *));
+static int dscp2index __P((u_int8_t));
+
+/*
+ * rio device interface
+ */
+altqdev_decl(rio);
+
+int
+rioopen(dev, flag, fmt, p)
+	dev_t dev;
+	int flag, fmt;
+	struct proc *p;
+{
+	/* everything will be done when the queueing scheme is attached.
*/ + return 0; +} + +int +rioclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + rio_queue_t *rqp; + int err, error = 0; + + while ((rqp = rio_list) != NULL) { + /* destroy all */ + err = rio_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +rioioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + rio_queue_t *rqp; + struct rio_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RIO_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case RIO_ENABLE: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RIO_DISABLE: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case RIO_IF_ATTACH: + ifp = ifunit(((struct rio_interface *)addr)->rio_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize rio_queue_t */ + MALLOC(rqp, rio_queue_t *, sizeof(rio_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + bzero(rqp, sizeof(rio_queue_t)); + + MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_rio = rio_alloc(0, NULL, 0, 0); + if (rqp->rq_rio == NULL) { + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RIO_LIMIT; + qtype(rqp->rq_q) = Q_RIO; + + /* + * set RIO to this ifnet structure. 
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_RIO, rqp, + rio_enqueue, rio_dequeue, rio_request, + NULL, NULL); + if (error) { + rio_destroy(rqp->rq_rio); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + break; + } + + /* add this state to the rio list */ + rqp->rq_next = rio_list; + rio_list = rqp; + break; + + case RIO_IF_DETACH: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = rio_detach(rqp); + break; + + case RIO_GETSTATS: + do { + struct rio_stats *q_stats; + rio_t *rp; + int i; + + q_stats = (struct rio_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + rp = rqp->rq_rio; + + q_stats->q_limit = qlimit(rqp->rq_q); + q_stats->weight = rp->rio_weight; + q_stats->flags = rp->rio_flags; + + for (i = 0; i < RIO_NDROPPREC; i++) { + q_stats->q_len[i] = rp->rio_precstate[i].qlen; + bcopy(&rp->q_stats[i], &q_stats->q_stats[i], + sizeof(struct redstats)); + q_stats->q_stats[i].q_avg = + rp->rio_precstate[i].avg >> rp->rio_wshift; + + q_stats->q_params[i].inv_pmax + = rp->rio_precstate[i].inv_pmax; + q_stats->q_params[i].th_min + = rp->rio_precstate[i].th_min; + q_stats->q_params[i].th_max + = rp->rio_precstate[i].th_max; + } + } while (0); + break; + + case RIO_CONFIG: + do { + struct rio_conf *fc; + rio_t *new; + int s, limit, i; + + fc = (struct rio_conf *)addr; + if ((rqp = altq_lookup(fc->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + new = rio_alloc(fc->rio_weight, &fc->q_params[0], + fc->rio_flags, fc->rio_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + + s = splimp(); + _flushq(rqp->rq_q); + limit = fc->rio_limit; + if (limit < fc->q_params[RIO_NDROPPREC-1].th_max) + limit = fc->q_params[RIO_NDROPPREC-1].th_max; + qlimit(rqp->rq_q) = limit; + + rio_destroy(rqp->rq_rio); + rqp->rq_rio = new; + + splx(s); + + /* write back new values */ + fc->rio_limit = limit; + for (i = 0; i < RIO_NDROPPREC; i++) { + fc->q_params[i].inv_pmax = + rqp->rq_rio->rio_precstate[i].inv_pmax; + fc->q_params[i].th_min = + rqp->rq_rio->rio_precstate[i].th_min; + fc->q_params[i].th_max = + rqp->rq_rio->rio_precstate[i].th_max; + } + } while (0); + break; + + case RIO_SETDEFAULTS: + do { + struct redparams *rp; + int i; + + rp = (struct redparams *)addr; + for (i = 0; i < RIO_NDROPPREC; i++) + default_rio_params[i] = rp[i]; + } while (0); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +static int +rio_detach(rqp) + rio_queue_t *rqp; +{ + rio_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (rio_list == rqp) + rio_list = rqp->rq_next; + else { + for (tmp = rio_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("rio_detach: no state found in rio_list!\n"); + } + + rio_destroy(rqp->rq_rio); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + return (error); +} + +/* + * rio support routines + */ +static int +rio_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len = 0; + break; + } + return (0); +} + + +rio_t * +rio_alloc(weight, params, flags, pkttime) + int weight; + struct redparams 
*params;
+	int	flags, pkttime;
+{
+	rio_t	*rp;
+	int	w, i;
+	int	npkts_per_sec;
+
+	MALLOC(rp, rio_t *, sizeof(rio_t), M_DEVBUF, M_WAITOK);
+	if (rp == NULL)
+		return (NULL);
+	bzero(rp, sizeof(rio_t));
+
+	rp->rio_flags = flags;
+	if (pkttime == 0)
+		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
+		rp->rio_pkttime = 800;
+	else
+		rp->rio_pkttime = pkttime;
+
+	if (weight != 0)
+		rp->rio_weight = weight;
+	else {
+		/* use default */
+		rp->rio_weight = W_WEIGHT;
+
+		/* when the link is very slow, adjust red parameters */
+		npkts_per_sec = 1000000 / rp->rio_pkttime;
+		if (npkts_per_sec < 50) {
+			/* up to about 400Kbps */
+			rp->rio_weight = W_WEIGHT_2;
+		} else if (npkts_per_sec < 300) {
+			/* up to about 2.4Mbps */
+			rp->rio_weight = W_WEIGHT_1;
+		}
+	}
+
+	/* calculate wshift.  weight must be a power of 2 */
+	w = rp->rio_weight;
+	for (i = 0; w > 1; i++)
+		w = w >> 1;
+	rp->rio_wshift = i;
+	w = 1 << rp->rio_wshift;
+	if (w != rp->rio_weight) {
+		printf("invalid weight value %d for red! use %d\n",
+		       rp->rio_weight, w);
+		rp->rio_weight = w;
+	}
+
+	/* allocate weight table */
+	rp->rio_wtab = wtab_alloc(rp->rio_weight);
+
+	for (i = 0; i < RIO_NDROPPREC; i++) {
+		struct dropprec_state *prec = &rp->rio_precstate[i];
+
+		prec->avg = 0;
+		prec->idle = 1;
+
+		if (params == NULL || params[i].inv_pmax == 0)
+			prec->inv_pmax = default_rio_params[i].inv_pmax;
+		else
+			prec->inv_pmax = params[i].inv_pmax;
+		if (params == NULL || params[i].th_min == 0)
+			prec->th_min = default_rio_params[i].th_min;
+		else
+			prec->th_min = params[i].th_min;
+		if (params == NULL || params[i].th_max == 0)
+			prec->th_max = default_rio_params[i].th_max;
+		else
+			prec->th_max = params[i].th_max;
+
+		/*
+		 * th_min_s and th_max_s are scaled versions of th_min
+		 * and th_max to be compared with avg.
+		 */
+		prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT);
+		prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT);
+
+		/*
+		 * precompute probability denominator
+		 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
+		 */
+		prec->probd = (2 * (prec->th_max - prec->th_min)
+			       * prec->inv_pmax) << FP_SHIFT;
+
+		microtime(&prec->last);
+	}
+
+	return (rp);
+}
+
+void
+rio_destroy(rp)
+	rio_t *rp;
+{
+	wtab_destroy(rp->rio_wtab);
+	FREE(rp, M_DEVBUF);
+}
+
+void
+rio_getstats(rp, sp)
+	rio_t *rp;
+	struct redstats *sp;
+{
+	int	i;
+
+	for (i = 0; i < RIO_NDROPPREC; i++) {
+		bcopy(&rp->q_stats[i], sp, sizeof(struct redstats));
+		sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift;
+		sp++;
+	}
+}
+
+/*
+ * enqueue routine:
+ *
+ *	returns: 0 when successfully queued.
+ *		 ENOBUFS when drop occurs.
+ */
+static int
+rio_enqueue(ifq, m, pktattr)
+	struct ifaltq *ifq;
+	struct mbuf *m;
+	struct altq_pktattr *pktattr;
+{
+	rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc;
+	int error = 0;
+
+	if (rio_addq(rqp->rq_rio, rqp->rq_q, m, pktattr) == 0)
+		ifq->ifq_len++;
+	else
+		error = ENOBUFS;
+	return error;
+}
+
+#if (RIO_NDROPPREC == 3)
+/*
+ * internally, a drop precedence value is converted to an index
+ * starting from 0.
+ */
+static int
+dscp2index(u_int8_t dscp)
+{
+	int	dpindex = dscp & AF_DROPPRECMASK;
+
+	if (dpindex == 0)
+		return (0);
+	return ((dpindex >> 3) - 1);
+}
+#endif
+
+#if 1
+/*
+ * kludge: when a packet is dequeued, we need to know its drop precedence
+ * in order to keep the queue length of each drop precedence.
+ * use m_pkthdr.rcvif to pass this info.
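+ * this is safe only because the index is stored at enqueue time and
+ * read back (and cleared) at dequeue time, while rcvif is otherwise
+ * unused on the output path.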
+ */ +#define RIOM_SET_PRECINDEX(m, idx) \ + do { (m)->m_pkthdr.rcvif = (struct ifnet *)((long)(idx)); } while (0) +#define RIOM_GET_PRECINDEX(m) \ + ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ + (m)->m_pkthdr.rcvif = NULL; idx; }) +#endif + +int +rio_addq(rp, q, m, pktattr) + rio_t *rp; + class_queue_t *q; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + int avg, droptype; + u_int8_t dsfield, odsfield; + int dpindex, i, n, t; + struct timeval now; + struct dropprec_state *prec; + + dsfield = odsfield = read_dsfield(m, pktattr); + dpindex = dscp2index(dsfield); + + /* + * update avg of the precedence states whose drop precedence + * is larger than or equal to the drop precedence of the packet + */ + now.tv_sec = 0; + for (i = dpindex; i < RIO_NDROPPREC; i++) { + prec = &rp->rio_precstate[i]; + avg = prec->avg; + if (prec->idle) { + prec->idle = 0; + if (now.tv_sec == 0) + microtime(&now); + t = (now.tv_sec - prec->last.tv_sec); + if (t > 60) + avg = 0; + else { + t = t * 1000000 + + (now.tv_usec - prec->last.tv_usec); + n = t / rp->rio_pkttime; + /* calculate (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->rio_wtab, n); + } + } + + /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ + avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); + prec->avg = avg; /* save the new value */ + /* + * count keeps a tally of arriving traffic that has not + * been dropped. + */ + prec->count++; + } + + prec = &rp->rio_precstate[dpindex]; + avg = prec->avg; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= prec->th_min_s && prec->qlen > 1) { + if (avg >= prec->th_max_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (prec->old == 0) { + /* first exceeds th_min */ + prec->count = 1; + prec->old = 1; + } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, + prec->probd, prec->count)) { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } else { + /* avg < th_min */ + prec->old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + if (droptype != DTYPE_NODROP) { + /* always drop incoming packet (as opposed to randomdrop) */ + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].count = 0; +#ifdef RIO_STATS + if (droptype == DTYPE_EARLY) + rp->q_stats[dpindex].drop_unforced++; + else + rp->q_stats[dpindex].drop_forced++; + PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); +#endif + m_freem(m); + return (-1); + } + + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].qlen++; + + /* save drop precedence index in mbuf hdr */ + RIOM_SET_PRECINDEX(m, dpindex); + + if (rp->rio_flags & RIOF_CLEARDSCP) + dsfield &= ~DSCP_MASK; + + if (dsfield != odsfield) + write_dsfield(m, pktattr, dsfield); + + _addq(q, m); + +#ifdef RIO_STATS + PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. 
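+ *
+ * ALTDQ_POLL just peeks at the head; ALTDQ_REMOVE goes through
+ * rio_getq(), which also maintains the per-precedence queue lengths
+ * recorded at enqueue time.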
+ */ + +static struct mbuf * +rio_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + struct mbuf *m = NULL; + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + m = rio_getq(rqp->rq_rio, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return m; +} + +struct mbuf * +rio_getq(rp, q) + rio_t *rp; + class_queue_t *q; +{ + struct mbuf *m; + int dpindex, i; + + if ((m = _getq(q)) == NULL) + return NULL; + + dpindex = RIOM_GET_PRECINDEX(m); + for (i = dpindex; i < RIO_NDROPPREC; i++) { + if (--rp->rio_precstate[i].qlen == 0) { + if (rp->rio_precstate[i].idle == 0) { + rp->rio_precstate[i].idle = 1; + microtime(&rp->rio_precstate[i].last); + } + } + } + return (m); +} + +#ifdef KLD_MODULE + +static struct altqsw rio_sw = + {"rio", rioopen, rioclose, rioioctl}; + +ALTQ_MODULE(altq_rio, ALTQT_RIO, &rio_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_RIO */ diff --git a/sys/altq/altq_rio.h b/sys/altq/altq_rio.h new file mode 100644 index 000000000000..f1f75902ed6b --- /dev/null +++ b/sys/altq/altq_rio.h @@ -0,0 +1,139 @@ +/* $KAME: altq_rio.h,v 1.5 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1998-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_RIO_H_ +#define _ALTQ_ALTQ_RIO_H_ + +#include + +/* + * RIO: RED with IN/OUT bit + * (extended to support more than 2 drop precedence values) + */ +#define RIO_NDROPPREC 3 /* number of drop precedence values */ + +struct rio_interface { + char rio_ifname[IFNAMSIZ]; +}; + +struct rio_stats { + struct rio_interface iface; + int q_len[RIO_NDROPPREC]; + struct redstats q_stats[RIO_NDROPPREC]; + + /* static red parameters */ + int q_limit; + int weight; + int flags; + struct redparams q_params[RIO_NDROPPREC]; +}; + +struct rio_conf { + struct rio_interface iface; + struct redparams q_params[RIO_NDROPPREC]; + int rio_weight; /* weight for EWMA */ + int rio_limit; /* max queue length */ + int rio_pkttime; /* average packet time in usec */ + int rio_flags; /* see below */ +}; + +/* rio flags */ +#define RIOF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define RIOF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define RIOF_ECN (RIOF_ECN4 | RIOF_ECN6) +#define RIOF_CLEARDSCP 0x200 /* clear diffserv codepoint */ + +/* + * IOCTLs for RIO + */ +#define RIO_IF_ATTACH _IOW('Q', 1, struct rio_interface) +#define RIO_IF_DETACH _IOW('Q', 2, struct rio_interface) +#define RIO_ENABLE _IOW('Q', 3, struct rio_interface) +#define RIO_DISABLE _IOW('Q', 4, struct rio_interface) +#define RIO_CONFIG _IOWR('Q', 6, struct rio_conf) +#define RIO_GETSTATS _IOWR('Q', 12, struct rio_stats) +#define RIO_SETDEFAULTS _IOW('Q', 30, struct redparams[RIO_NDROPPREC]) + +#ifdef _KERNEL + +typedef struct rio { + /* per drop precedence structure */ + struct dropprec_state { + /* red parameters */ + int inv_pmax; /* inverse of max drop probability */ + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + + /* variables for internal use */ + int th_min_s; /* th_min scaled by avgshift */ + int th_max_s; /* th_max scaled by avgshift */ + int probd; /* drop probability denominator */ + + int qlen; /* queue length */ + int avg; /* (scaled) queue length average */ + int count; /* packet count since the last dropped/marked + packet */ + int idle; /* queue was empty */ + int old; /* avg is above th_min */ + struct timeval last; /* timestamp when queue becomes idle */ + } rio_precstate[RIO_NDROPPREC]; + + int rio_wshift; /* log(red_weight) */ + int rio_weight; /* weight for EWMA */ + struct wtab *rio_wtab; /* weight table */ + + int rio_pkttime; /* average packet time in micro sec + used for idle calibration */ + int rio_flags; /* rio flags */ + + u_int8_t rio_codepoint; /* codepoint value to tag packets */ + u_int8_t rio_codepointmask; /* codepoint mask bits */ + + struct redstats q_stats[RIO_NDROPPREC]; /* statistics */ +} rio_t; + +typedef struct rio_queue { + struct rio_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + rio_t *rq_rio; +} rio_queue_t; + +extern rio_t *rio_alloc __P((int, struct redparams *, int, int)); +extern void rio_destroy __P((rio_t *)); +extern void rio_getstats __P((rio_t *, struct redstats *)); +extern int rio_addq __P((rio_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *)); +extern struct mbuf *rio_getq __P((rio_t *, class_queue_t *)); +extern int rio_set_meter __P((rio_t *, int, int, int)); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RIO_H_ */ diff --git a/sys/altq/altq_rmclass.c b/sys/altq/altq_rmclass.c new file mode 100644 index 000000000000..50df4f3f7ca1 --- /dev/null +++ b/sys/altq/altq_rmclass.c @@ -0,0 +1,1870 @@ +/* $KAME: altq_rmclass.c,v 1.9 
2000/12/14 08:12:46 thorpej Exp $	*/
+
+/*
+ * Copyright (c) 1991-1997 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the Network Research
+ *	Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * LBL code modified by speer@eng.sun.com, May 1997.
+ * For questions and/or comments, please send mail to cbq@ee.lbl.gov
+ */
+
+#ident "@(#)rm_class.c  1.48     97/12/05 SMI"
+
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+#include "opt_altq.h"
+#if (__FreeBSD__ != 2)
+#include "opt_inet.h"
+#ifdef __FreeBSD__
+#include "opt_inet6.h"
+#endif
+#endif
+#endif /* __FreeBSD__ || __NetBSD__ */
+#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Local Macros
+ */
+
+#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
+
+/*
+ * Local routines.
+ */ + +static int rmc_satisfied __P((struct rm_class *, struct timeval *)); +static void rmc_wrr_set_weights __P((struct rm_ifdat *)); +static void rmc_depth_compute __P((struct rm_class *)); +static void rmc_depth_recompute __P((rm_class_t *)); + +static mbuf_t *_rmc_wrr_dequeue_next __P((struct rm_ifdat *, int)); +static mbuf_t *_rmc_prr_dequeue_next __P((struct rm_ifdat *, int)); + +static int _rmc_addq __P((rm_class_t *, mbuf_t *)); +static void _rmc_dropq __P((rm_class_t *)); +static mbuf_t *_rmc_getq __P((rm_class_t *)); +static mbuf_t *_rmc_pollq __P((rm_class_t *)); + +static int rmc_under_limit __P((struct rm_class *, struct timeval *)); +static void rmc_tl_satisfied __P((struct rm_ifdat *, struct timeval *)); +static void rmc_drop_action __P((struct rm_class *)); +static void rmc_restart __P((struct rm_class *)); +static void rmc_root_overlimit __P((struct rm_class *, struct rm_class *)); + +#define BORROW_OFFTIME +/* + * BORROW_OFFTIME (experimental): + * borrow the offtime of the class borrowing from. + * the reason is that when its own offtime is set, the class is unable + * to borrow much, especially when cutoff is taking effect. + * but when the borrowed class is overloaded (advidle is close to minidle), + * use the borrowing class's offtime to avoid overload. + */ +#define ADJUST_CUTOFF +/* + * ADJUST_CUTOFF (experimental): + * if no underlimit class is found due to cutoff, increase cutoff and + * retry the scheduling loop. + * also, don't invoke delay_actions while cutoff is taking effect, + * since a sleeping class won't have a chance to be scheduled in the + * next loop. + * + * now heuristics for setting the top-level variable (cutoff_) becomes: + * 1. if a packet arrives for a not-overlimit class, set cutoff + * to the depth of the class. + * 2. if cutoff is i, and a packet arrives for an overlimit class + * with an underlimit ancestor at a lower level than i (say j), + * then set cutoff to j. + * 3. at scheduling a packet, if there is no underlimit class + * due to the current cutoff level, increase cutoff by 1 and + * then try to schedule again. + */ + +/* + * rm_class_t * + * rmc_newclass(...) - Create a new resource management class at priority + * 'pri' on the interface given by 'ifd'. + * + * nsecPerByte is the data rate of the interface in nanoseconds/byte. + * E.g., 800 for a 10Mb/s ethernet. If the class gets less + * than 100% of the bandwidth, this number should be the + * 'effective' rate for the class. Let f be the + * bandwidth fraction allocated to this class, and let + * nsPerByte be the data rate of the output link in + * nanoseconds/byte. Then nsecPerByte is set to + * nsPerByte / f. E.g., 1600 (= 800 / .5) + * for a class that gets 50% of an ethernet's bandwidth. + * + * action the routine to call when the class is over limit. + * + * maxq max allowable queue size for class (in packets). + * + * parent parent class pointer. + * + * borrow class to borrow from (should be either 'parent' or null). + * + * maxidle max value allowed for class 'idle' time estimate (this + * parameter determines how large an initial burst of packets + * can be before overlimit action is invoked. + * + * offtime how long 'delay' action will delay when class goes over + * limit (this parameter determines the steady-state burst + * size when a class is running over its limit). 
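+ *
+ *	(Illustrative computation with hypothetical numbers, bridging to
+ *	the precise formulas below: a 100Mb/s link has
+ *	nsPerByte = 10^9 / (100*10^6 / 8) = 80, so a class allocated
+ *	f = 0.25 of that link would be created with
+ *	nsecPerByte = 80 / 0.25 = 320.)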
+ * + * Maxidle and offtime have to be computed from the following: If the + * average packet size is s, the bandwidth fraction allocated to this + * class is f, we want to allow b packet bursts, and the gain of the + * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: + * + * ptime = s * nsPerByte * (1 - f) / f + * maxidle = ptime * (1 - g^b) / g^b + * minidle = -ptime * (1 / (f - 1)) + * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) + * + * Operationally, it's convenient to specify maxidle & offtime in units + * independent of the link bandwidth so the maxidle & offtime passed to + * this routine are the above values multiplied by 8*f/(1000*nsPerByte). + * (The constant factor is a scale factor needed to make the parameters + * integers. This scaling also means that the 'unscaled' values of + * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, + * not nanoseconds.) Also note that the 'idle' filter computation keeps + * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of + * maxidle also must be scaled upward by this value. Thus, the passed + * values for maxidle and offtime can be computed as follows: + * + * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) + * offtime = offtime * 8 / (1000 * nsecPerByte) + * + * When USE_HRTIME is employed, then maxidle and offtime become: + * maxidle = maxilde * (8.0 / nsecPerByte); + * offtime = offtime * (8.0 / nsecPerByte); + */ + +struct rm_class * +rmc_newclass(pri, ifd, nsecPerByte, action, maxq, parent, borrow, + maxidle, minidle, offtime, pktsize, flags) + int pri; + struct rm_ifdat *ifd; + u_int nsecPerByte; + void (*action)(rm_class_t *, rm_class_t *); + int maxq; + struct rm_class *parent; + struct rm_class *borrow; + u_int maxidle; + int minidle; + u_int offtime; + int pktsize; + int flags; +{ + struct rm_class *cl; + struct rm_class *peer; + int s; + + if (pri >= RM_MAXPRIO) + return (NULL); +#ifndef ALTQ_RED + if (flags & RMCF_RED) { + printf("rmc_newclass: RED not configured for CBQ!\n"); + return (NULL); + } +#endif +#ifndef ALTQ_RIO + if (flags & RMCF_RIO) { + printf("rmc_newclass: RIO not configured for CBQ!\n"); + return (NULL); + } +#endif + + MALLOC(cl, struct rm_class *, sizeof(struct rm_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct rm_class)); + CALLOUT_INIT(&cl->callout_); + MALLOC(cl->q_, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->q_ == NULL) { + FREE(cl, M_DEVBUF); + return (NULL); + } + bzero(cl->q_, sizeof(class_queue_t)); + + /* + * Class initialization. 
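+	 * (Illustrative check of the arithmetic below: allotment_ is
+	 * RM_NS_PER_SEC / nsecPerByte, i.e. the class rate in bytes per
+	 * second; for nsecPerByte = 1600, i.e. 50% of 10Mb/s, that is
+	 * 10^9 / 1600 = 625,000 bytes/sec, or 5Mb/s, as expected.)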
+ */ + cl->children_ = NULL; + cl->parent_ = parent; + cl->borrow_ = borrow; + cl->leaf_ = 1; + cl->ifdat_ = ifd; + cl->pri_ = pri; + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->depth_ = 0; + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + qtype(cl->q_) = Q_DROPHEAD; + qlen(cl->q_) = 0; + cl->flags_ = flags; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * (int)nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + cl->overlimit = action; + +#ifdef ALTQ_RED + if (flags & (RMCF_RED|RMCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & RMCF_ECN) + red_flags |= REDF_ECN; + if (flags & RMCF_FLOWVALVE) + red_flags |= REDF_FLOWVALVE; +#ifdef ALTQ_RIO + if (flags & RMCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + red_pkttime = nsecPerByte * pktsize / 1000; + + if (flags & RMCF_RED) { + cl->red_ = red_alloc(0, 0, 0, 0, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->red_ = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + /* + * put the class into the class tree + */ + s = splimp(); + if ((peer = ifd->active_[pri]) != NULL) { + /* find the last class at this pri */ + cl->peer_ = peer; + while (peer->peer_ != ifd->active_[pri]) + peer = peer->peer_; + peer->peer_ = cl; + } else { + ifd->active_[pri] = cl; + cl->peer_ = cl; + } + + if (cl->parent_) { + cl->next_ = parent->children_; + parent->children_ = cl; + parent->leaf_ = 0; + } + + /* + * Compute the depth of this class and it's ancestors in the class + * hierarchy. + */ + rmc_depth_compute(cl); + + /* + * If CBQ's WRR is enabled, then initailize the class WRR state. + */ + if (ifd->wrr_) { + ifd->num_[pri]++; + ifd->alloc_[pri] += cl->allotment_; + rmc_wrr_set_weights(ifd); + } + splx(s); + return (cl); +} + +int +rmc_modclass(cl, nsecPerByte, maxq, maxidle, minidle, offtime, pktsize) + struct rm_class *cl; + u_int nsecPerByte; + int maxq; + u_int maxidle; + int minidle; + u_int offtime; + int pktsize; +{ + struct rm_ifdat *ifd; + u_int old_allotment; + int s; + + ifd = cl->ifdat_; + old_allotment = cl->allotment_; + + s = splimp(); + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + + /* + * If CBQ's WRR is enabled, then initailize the class WRR state. 
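+	 * (Illustrative, with invented numbers: two 625,000 bytes/sec
+	 * classes at one priority give alloc_ = 1,250,000; with
+	 * maxpkt_ = 1500, rmc_wrr_set_weights() computes
+	 * M_ = 1,250,000 / (2 * 1500) = 416, so each class gets
+	 * w_allotment_ = 625,000 / 416 = 1502 bytes, roughly one
+	 * maximum-sized packet per WRR round.)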
+ */
+	if (ifd->wrr_) {
+		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
+		rmc_wrr_set_weights(ifd);
+	}
+	splx(s);
+	return (0);
+}
+
+/*
+ * static void
+ * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
+ *	the appropriate round-robin weights for the CBQ weighted round-robin
+ *	algorithm.
+ *
+ *	Returns:	NONE
+ */
+
+static void
+rmc_wrr_set_weights(ifd)
+	struct rm_ifdat	*ifd;
+{
+	int		i;
+	struct rm_class	*cl, *clh;
+
+	for (i = 0; i < RM_MAXPRIO; i++) {
+		/*
+		 * This is inverted from that of the simulator to
+		 * maintain precision.
+		 */
+		if (ifd->num_[i] == 0)
+			ifd->M_[i] = 0;
+		else
+			ifd->M_[i] = ifd->alloc_[i] /
+				(ifd->num_[i] * ifd->maxpkt_);
+		/*
+		 * Compute the weighted allotment for each class.
+		 * This takes the expensive div instruction out
+		 * of the main loop for the wrr scheduling path.
+		 * These only get recomputed when a class comes or
+		 * goes.
+		 */
+		if (ifd->active_[i] != NULL) {
+			clh = cl = ifd->active_[i];
+			do {
+				/* safe-guard for slow link or alloc_ == 0 */
+				if (ifd->M_[i] == 0)
+					cl->w_allotment_ = 0;
+				else
+					cl->w_allotment_ = cl->allotment_ /
+						ifd->M_[i];
+				cl = cl->peer_;
+			} while ((cl != NULL) && (cl != clh));
+		}
+	}
+}
+
+int
+rmc_get_weight(ifd, pri)
+	struct rm_ifdat	*ifd;
+	int		pri;
+{
+	if ((pri >= 0) && (pri < RM_MAXPRIO))
+		return (ifd->M_[pri]);
+	else
+		return (0);
+}
+
+/*
+ * static void
+ * rmc_depth_compute(struct rm_class *cl) - This function computes the
+ *	appropriate depth of class 'cl' and its ancestors.
+ *
+ *	Returns:	NONE
+ */
+
+static void
+rmc_depth_compute(cl)
+	struct rm_class	*cl;
+{
+	rm_class_t	*t = cl, *p;
+
+	/*
+	 * Recompute the depth for the branch of the tree.
+	 */
+	while (t != NULL) {
+		p = t->parent_;
+		if (p && (t->depth_ >= p->depth_)) {
+			p->depth_ = t->depth_ + 1;
+			t = p;
+		} else
+			t = NULL;
+	}
+}
+
+/*
+ * static void
+ * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
+ *	the depth of the tree after a class has been deleted.
+ *
+ *	Returns:	NONE
+ */
+
+static void
+rmc_depth_recompute(rm_class_t *cl)
+{
+#if 1 /* ALTQ */
+	rm_class_t	*p, *t;
+
+	p = cl;
+	while (p != NULL) {
+		if ((t = p->children_) == NULL) {
+			p->depth_ = 0;
+		} else {
+			int cdepth = 0;
+
+			while (t != NULL) {
+				if (t->depth_ > cdepth)
+					cdepth = t->depth_;
+				t = t->next_;
+			}
+
+			if (p->depth_ == cdepth + 1)
+				/* no change to this parent */
+				return;
+
+			p->depth_ = cdepth + 1;
+		}
+
+		p = p->parent_;
+	}
+#else
+	rm_class_t	*t;
+
+	if (cl->depth_ >= 1) {
+		if (cl->children_ == NULL) {
+			cl->depth_ = 0;
+		} else if ((t = cl->children_) != NULL) {
+			while (t != NULL) {
+				if (t->children_ != NULL)
+					rmc_depth_recompute(t);
+				t = t->next_;
+			}
+		} else
+			rmc_depth_compute(cl);
+	}
+#endif
+}
+
+/*
+ * void
+ * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
+ *	function deletes a class from the link-sharing structure and frees
+ *	all resources associated with the class.
+ *
+ *	Returns:	NONE
+ */
+
+void
+rmc_delete_class(ifd, cl)
+	struct rm_ifdat	*ifd;
+	struct rm_class	*cl;
+{
+	struct rm_class	*p, *head, *previous;
+	int		s;
+
+	ASSERT(cl->children_ == NULL);
+
+	if (cl->sleeping_)
+		CALLOUT_STOP(&cl->callout_);
+
+	s = splimp();
+	/*
+	 * Free packets in the packet queue.
+	 * XXX - this may not be a desired behavior.  Packets should be
+	 *	re-queued.
+	 */
+	rmc_dropall(cl);
+
+	/*
+	 * If the class has a parent, then remove the class from the
+	 * parent's children chain.
+ */ + if (cl->parent_ != NULL) { + head = cl->parent_->children_; + p = previous = head; + if (head->next_ == NULL) { + ASSERT(head == cl); + cl->parent_->children_ = NULL; + cl->parent_->leaf_ = 1; + } else while (p != NULL) { + if (p == cl) { + if (cl == head) + cl->parent_->children_ = cl->next_; + else + previous->next_ = cl->next_; + cl->next_ = NULL; + p = NULL; + } else { + previous = p; + p = p->next_; + } + } + } + + /* + * Delete class from class priority peer list. + */ + if ((p = ifd->active_[cl->pri_]) != NULL) { + /* + * If there is more than one member of this priority + * level, then look for class(cl) in the priority level. + */ + if (p != p->peer_) { + while (p->peer_ != cl) + p = p->peer_; + p->peer_ = cl->peer_; + + if (ifd->active_[cl->pri_] == cl) + ifd->active_[cl->pri_] = cl->peer_; + } else { + ASSERT(p == cl); + ifd->active_[cl->pri_] = NULL; + } + } + + /* + * Recompute the WRR weights. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] -= cl->allotment_; + ifd->num_[cl->pri_]--; + rmc_wrr_set_weights(ifd); + } + + /* + * Re-compute the depth of the tree. + */ +#if 1 /* ALTQ */ + rmc_depth_recompute(cl->parent_); +#else + rmc_depth_recompute(ifd->root_); +#endif + + splx(s); + + /* + * Free the class structure. + */ + if (cl->red_ != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_destroy((rio_t *)cl->red_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_destroy(cl->red_); +#endif + } + FREE(cl->q_, M_DEVBUF); + FREE(cl, M_DEVBUF); +} + + +/* + * void + * rmc_init(...) - Initialize the resource management data structures + * associated with the output portion of interface 'ifp'. 'ifd' is + * where the structures will be built (for backwards compatibility, the + * structures aren't kept in the ifnet struct). 'nsecPerByte' + * gives the link speed (inverse of bandwidth) in nanoseconds/byte. + * 'restart' is the driver-specific routine that the generic 'delay + * until under limit' action will call to restart output. `maxq' + * is the queue size of the 'link' & 'default' classes. 'maxqueued' + * is the maximum number of packets that the resource management + * code will allow to be queued 'downstream' (this is typically 1). + * + * Returns: NONE + */ + +void +rmc_init(ifq, ifd, nsecPerByte, restart, maxq, maxqueued, maxidle, + minidle, offtime, flags) + struct ifaltq *ifq; + struct rm_ifdat *ifd; + u_int nsecPerByte; + void (*restart)(struct ifaltq *); + int maxq, maxqueued; + u_int maxidle; + int minidle; + u_int offtime; + int flags; +{ + int i, mtu; + + /* + * Initialize the CBQ traciing/debug facility. + */ + CBQTRACEINIT(); + + bzero((char *)ifd, sizeof (*ifd)); + mtu = ifq->altq_ifp->if_mtu; + ifd->ifq_ = ifq; + ifd->restart = restart; + ifd->maxqueued_ = maxqueued; + ifd->ns_per_byte_ = nsecPerByte; + ifd->maxpkt_ = mtu; + ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; + ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; +#if 1 + ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; + if (mtu * nsecPerByte > 10 * 1000000) + ifd->maxiftime_ /= 4; +#endif + + reset_cutoff(ifd); + CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); + + /* + * Initialize the CBQ's WRR state. + */ + for (i = 0; i < RM_MAXPRIO; i++) { + ifd->alloc_[i] = 0; + ifd->M_[i] = 0; + ifd->num_[i] = 0; + ifd->na_[i] = 0; + ifd->active_[i] = NULL; + } + + /* + * Initialize current packet state. 
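+	 * (qi_ indexes the slot filled by the next dequeue and qo_ the
+	 * slot of the oldest in-flight packet; both advance modulo
+	 * maxqueued_, so class_[], curlen_[], now_[] and borrowed_[]
+	 * form a small ring of per-packet state between dequeue and
+	 * transmit completion.)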
+ */
+	ifd->qi_ = 0;
+	ifd->qo_ = 0;
+	for (i = 0; i < RM_MAXQUEUED; i++) {
+		ifd->class_[i] = NULL;
+		ifd->curlen_[i] = 0;
+		ifd->borrowed_[i] = NULL;
+	}
+
+	/*
+	 * Create the root class of the link-sharing structure.
+	 */
+	if ((ifd->root_ = rmc_newclass(0, ifd,
+				       nsecPerByte,
+				       rmc_root_overlimit, maxq, 0, 0,
+				       maxidle, minidle, offtime,
+				       0, 0)) == NULL) {
+		printf("rmc_init: root class not allocated\n");
+		return ;
+	}
+	ifd->root_->depth_ = 0;
+}
+
+/*
+ * void
+ * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
+ *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
+ *	by a driver's if_output routine.  This routine must be called with
+ *	output packet completion interrupts locked out (to avoid racing with
+ *	rmc_dequeue_next).
+ *
+ *	Returns:	0 on successful queueing
+ *			-1 when packet drop occurs
+ */
+int
+rmc_queue_packet(cl, m)
+	struct rm_class	*cl;
+	mbuf_t		*m;
+{
+	struct timeval	 now;
+	struct rm_ifdat	*ifd = cl->ifdat_;
+	int		 cpri = cl->pri_;
+	int		 is_empty = qempty(cl->q_);
+
+	RM_GETTIME(now);
+	if (ifd->cutoff_ > 0) {
+		if (TV_LT(&cl->undertime_, &now)) {
+			if (ifd->cutoff_ > cl->depth_)
+				ifd->cutoff_ = cl->depth_;
+			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
+		}
+#if 1 /* ALTQ */
+		else {
+			/*
+			 * the class is overlimit. if the class has
+			 * underlimit ancestors, set cutoff to the lowest
+			 * depth among them.
+			 */
+			struct rm_class *borrow = cl->borrow_;
+
+			while (borrow != NULL &&
+			       borrow->depth_ < ifd->cutoff_) {
+				if (TV_LT(&borrow->undertime_, &now)) {
+					ifd->cutoff_ = borrow->depth_;
+					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
+					break;
+				}
+				borrow = borrow->borrow_;
+			}
+		}
+#else /* !ALTQ */
+		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
+			if (TV_LT(&cl->borrow_->undertime_, &now)) {
+				ifd->cutoff_ = cl->borrow_->depth_;
+				CBQTRACE(rmc_queue_packet, 'ffob',
+					 cl->borrow_->depth_);
+			}
+		}
+#endif /* !ALTQ */
+	}
+
+	if (_rmc_addq(cl, m) < 0)
+		/* failed */
+		return (-1);
+
+	if (is_empty) {
+		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
+		ifd->na_[cpri]++;
+	}
+
+	if (qlen(cl->q_) > qlimit(cl->q_)) {
+		/* note: qlimit can be set to 0 or 1 */
+		rmc_drop_action(cl);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * void
+ * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
+ *	classes to see if they are satisfied.
+ */
+
+static void
+rmc_tl_satisfied(ifd, now)
+	struct rm_ifdat	*ifd;
+	struct timeval	*now;
+{
+	int		 i;
+	rm_class_t	*p, *bp;
+
+	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
+		if ((bp = ifd->active_[i]) != NULL) {
+			p = bp;
+			do {
+				if (!rmc_satisfied(p, now)) {
+					ifd->cutoff_ = p->depth_;
+					return;
+				}
+				p = p->peer_;
+			} while (p != bp);
+		}
+	}
+
+	reset_cutoff(ifd);
+}
+
+/*
+ * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
+ */
+
+static int
+rmc_satisfied(cl, now)
+	struct rm_class	*cl;
+	struct timeval	*now;
+{
+	rm_class_t	*p;
+
+	if (cl == NULL)
+		return (1);
+	if (TV_LT(now, &cl->undertime_))
+		return (1);
+	if (cl->depth_ == 0) {
+		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
+			return (0);
+		else
+			return (1);
+	}
+	if (cl->children_ != NULL) {
+		p = cl->children_;
+		while (p != NULL) {
+			if (!rmc_satisfied(p, now))
+				return (0);
+			p = p->next_;
+		}
+	}
+
+	return (1);
+}
+
+/*
+ * Return 1 if class 'cl' is under limit or can borrow from a parent,
+ * 0 if overlimit.  As a side-effect, this routine will invoke the
+ * class overlimit action if the class is overlimit.
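+ *
+ * (Illustrative: suppose leaf L (depth 0) borrows from parent P (depth 1),
+ * which borrows from root R (depth 2).  An overlimit L may send if P is
+ * underlimit; if P is also overlimit and cutoff_ is 1, R lies beyond the
+ * cutoff, so L is refused and the overlimit action runs instead.)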
+ */
+
+static int
+rmc_under_limit(cl, now)
+	struct rm_class	*cl;
+	struct timeval	*now;
+{
+	rm_class_t	*p = cl;
+	rm_class_t	*top;
+	struct rm_ifdat	*ifd = cl->ifdat_;
+
+	ifd->borrowed_[ifd->qi_] = NULL;
+	/*
+	 * If cl is the root class, then always return that it is
+	 * underlimit.  Otherwise, check to see if the class is underlimit.
+	 */
+	if (cl->parent_ == NULL)
+		return (1);
+
+	if (cl->sleeping_) {
+		if (TV_LT(now, &cl->undertime_))
+			return (0);
+
+		CALLOUT_STOP(&cl->callout_);
+		cl->sleeping_ = 0;
+		cl->undertime_.tv_sec = 0;
+		return (1);
+	}
+
+	top = NULL;
+	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
+		if (((cl = cl->borrow_) == NULL) ||
+		    (cl->depth_ > ifd->cutoff_)) {
+#ifdef ADJUST_CUTOFF
+			if (cl != NULL)
+				/* cutoff is taking effect, just
+				   return false without calling
+				   the delay action. */
+				return (0);
+#endif
+#ifdef BORROW_OFFTIME
+			/*
+			 * check if the class can borrow offtime too.
+			 * borrow offtime from the top of the borrow
+			 * chain if the top class is not overloaded.
+			 */
+			if (cl != NULL) {
+				/* cutoff is taking effect, use this class as top. */
+				top = cl;
+				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
+			}
+			if (top != NULL && top->avgidle_ == top->minidle_)
+				top = NULL;
+			p->overtime_ = *now;
+			(p->overlimit)(p, top);
+#else
+			p->overtime_ = *now;
+			(p->overlimit)(p, NULL);
+#endif
+			return (0);
+		}
+		top = cl;
+	}
+
+	if (cl != p)
+		ifd->borrowed_[ifd->qi_] = cl;
+	return (1);
+}
+
+/*
+ * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
+ *	packet-by-packet round robin.
+ *
+ * The heart of the weighted round-robin scheduler, which decides which
+ * class next gets to send a packet.  Highest priority first, then
+ * weighted round-robin within priorities.
+ *
+ * Each able-to-send class gets to send until its byte allocation is
+ * exhausted.  Thus, the active pointer is only changed after a class has
+ * exhausted its allocation.
+ *
+ * If the scheduler finds no class that is underlimit or able to borrow,
+ * then the first class found that had a nonzero queue and is allowed to
+ * borrow gets to send.
+ */
+
+static mbuf_t *
+_rmc_wrr_dequeue_next(ifd, op)
+	struct rm_ifdat	*ifd;
+	int		 op;
+{
+	struct rm_class	*cl = NULL, *first = NULL;
+	u_int		 deficit;
+	int		 cpri;
+	mbuf_t		*m;
+	struct timeval	 now;
+
+	RM_GETTIME(now);
+
+	/*
+	 * if the driver polls the top of the queue and then removes
+	 * the polled packet, we must return the same packet.
+	 */
+	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
+		cl = ifd->pollcache_;
+		cpri = cl->pri_;
+		if (ifd->efficient_) {
+			/* check if this class is overlimit */
+			if (cl->undertime_.tv_sec != 0 &&
+			    rmc_under_limit(cl, &now) == 0)
+				first = cl;
+		}
+		ifd->pollcache_ = NULL;
+		goto _wrr_out;
+	}
+	else {
+		/* mode == ALTDQ_POLL || pollcache == NULL */
+		ifd->pollcache_ = NULL;
+		ifd->borrowed_[ifd->qi_] = NULL;
+	}
+#ifdef ADJUST_CUTOFF
+ _again:
+#endif
+	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
+		if (ifd->na_[cpri] == 0)
+			continue;
+		deficit = 0;
+		/*
+		 * Loop through twice for a priority level, if some class
+		 * was unable to send a packet the first round because
+		 * of the weighted round-robin mechanism.
+		 * During the second loop at this level, deficit==2.
+		 * (This second loop is not needed if for every class,
+		 * "M[cl->pri_]" times "cl->allotment" is greater than
+		 * the byte size for the largest packet in the class.)
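+		 * (Illustrative, with invented numbers: a class with
+		 * w_allotment_ = 500 that just sent a 1500-byte packet has
+		 * bytes_alloc_ = -1000; the first pass refills it to -500,
+		 * still <= 0, so it merely sets deficit = 1.  If no peer can
+		 * send, the second pass (deficit == 2) lets it transmit
+		 * anyway, going further into deficit.)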
+ */ + _wrr_loop: + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) + cl->bytes_alloc_ += cl->w_allotment_; + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) { + if (cl->bytes_alloc_ > 0 || deficit > 1) + goto _wrr_out; + + /* underlimit but no alloc */ + deficit = 1; +#if 1 + ifd->borrowed_[ifd->qi_] = NULL; +#endif + } + else if (first == NULL && cl->borrow_ != NULL) + first = cl; /* borrowing candidate */ + } + + cl->bytes_alloc_ = 0; + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + + if (deficit == 1) { + /* first loop found an underlimit class with deficit */ + /* Loop on same priority level, with new deficit. */ + deficit = 2; + goto _wrr_loop; + } + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, + * increase cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); + + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _wrr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_wrr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + /* + * Update class statistics and link data. + */ + if (cl->bytes_alloc_ > 0) + cl->bytes_alloc_ -= m_pktlen(m); + + if ((cl->bytes_alloc_ <= 0) || first == cl) + ifd->active_[cl->pri_] = cl->peer_; + else + ifd->active_[cl->pri_] = cl; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_PPOLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * Dequeue & return next packet from the highest priority class that + * has a packet to send & has enough allocation to send it. This + * routine is called by a driver whenever it needs a new packet to + * output. + */ +static mbuf_t * +_rmc_prr_dequeue_next(ifd, op) + struct rm_ifdat *ifd; + int op; +{ + mbuf_t *m; + int cpri; + struct rm_class *cl, *first = NULL; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. 
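+	 * (Sketch of the expected calling pattern, not a verbatim driver:
+	 * m = rmc_dequeue_next(ifd, ALTDQ_POLL) to inspect the next packet,
+	 * then m = rmc_dequeue_next(ifd, ALTDQ_REMOVE) to commit to it;
+	 * pollcache_ guarantees both calls resolve to the same class and
+	 * return the same mbuf.)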
+ */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + ifd->pollcache_ = NULL; + goto _prr_out; + } else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) + goto _prr_out; + if (first == NULL && cl->borrow_ != NULL) + first = cl; + } + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, increase + * cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _prr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_prr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + ifd->active_[cpri] = cl->peer_; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * mbuf_t * + * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function + * is invoked by the packet driver to get the next packet to be + * dequeued and output on the link. If WRR is enabled, then the + * WRR dequeue next routine will determine the next packet to sent. + * Otherwise, packet-by-packet round robin is invoked. + * + * Returns: NULL, if a packet is not available or if all + * classes are overlimit. + * + * Otherwise, Pointer to the next packet. + */ + +mbuf_t * +rmc_dequeue_next(ifd, mode) + struct rm_ifdat *ifd; + int mode; +{ + if (ifd->queued_ >= ifd->maxqueued_) + return (NULL); + else if (ifd->wrr_) + return (_rmc_wrr_dequeue_next(ifd, mode)); + else + return (_rmc_prr_dequeue_next(ifd, mode)); +} + +/* + * Update the utilization estimate for the packet that just completed. + * The packet's class & the parent(s) of that class all get their + * estimators updated. This routine is called by the driver's output- + * packet-completion interrupt service routine. + */ + +/* + * a macro to approximate "divide by 1000" that gives 0.000999, + * if a value has enough effective digits. + * (on pentium, mul takes 9 cycles but div takes 46!) + */ +#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) +void +rmc_update_class_util(ifd) + struct rm_ifdat *ifd; +{ + int idle, avgidle, pktlen; + int pkt_time, tidle; + rm_class_t *cl, *borrowed; + rm_class_t *borrows; + struct timeval *nowp; + + /* + * Get the most recent completed class. 
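+	 * (On the NSEC_TO_USEC macro above: it approximates t/1000 as
+	 * t * (2^-10 + 2^-16 + 2^-17) ~= t * 0.0009994; e.g.
+	 * NSEC_TO_USEC(1000000) = 976 + 15 + 7 = 998 usec instead of
+	 * 1000, within about 0.2%, using only shifts and adds.)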
+ */ + if ((cl = ifd->class_[ifd->qo_]) == NULL) + return; + + pktlen = ifd->curlen_[ifd->qo_]; + borrowed = ifd->borrowed_[ifd->qo_]; + borrows = borrowed; + + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + + /* + * Run estimator on class and it's ancesstors. + */ + /* + * rm_update_class_util is designed to be called when the + * transfer is completed from a xmit complete interrupt, + * but most drivers don't implement an upcall for that. + * so, just use estimated completion time. + * as a result, ifd->qi_ and ifd->qo_ are always synced. + */ + nowp = &ifd->now_[ifd->qo_]; + /* get pkt_time (for link) in usec */ +#if 1 /* use approximation */ + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; +#endif +#if 1 /* ALTQ4PPP */ + if (TV_LT(nowp, &ifd->ifnow_)) { + int iftime; + + /* + * make sure the estimated completion time does not go + * too far. it can happen when the link layer supports + * data compression or the interface speed is set to + * a much lower value. + */ + TV_DELTA(&ifd->ifnow_, nowp, iftime); + if (iftime+pkt_time < ifd->maxiftime_) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); + } + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#else + if (TV_LT(nowp, &ifd->ifnow_)) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#endif + + while (cl != NULL) { + TV_DELTA(&ifd->ifnow_, &cl->last_, idle); + if (idle >= 2000000) + /* + * this class is idle enough, reset avgidle. + * (TV_DELTA returns 2000000 us when delta is large.) + */ + cl->avgidle_ = cl->maxidle_; + + /* get pkt_time (for class) in usec */ +#if 1 /* use approximation */ + pkt_time = pktlen * cl->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = pktlen * cl->ns_per_byte_ / 1000; +#endif + idle -= pkt_time; + + avgidle = cl->avgidle_; + avgidle += idle - (avgidle >> RM_FILTER_GAIN); + cl->avgidle_ = avgidle; + + /* Are we overlimit ? */ + if (avgidle <= 0) { + CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); +#if 1 /* ALTQ */ + /* + * need some lower bound for avgidle, otherwise + * a borrowing class gets unbounded penalty. + */ + if (avgidle < cl->minidle_) + avgidle = cl->avgidle_ = cl->minidle_; +#endif + /* set next idle to make avgidle 0 */ + tidle = pkt_time + + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); + TV_ADD_DELTA(nowp, tidle, &cl->undertime_); + ++cl->stats_.over; + } else { + cl->avgidle_ = + (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; + cl->undertime_.tv_sec = 0; + if (cl->sleeping_) { + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + } + } + + if (borrows != NULL) { + if (borrows != cl) + ++cl->stats_.borrows; + else + borrows = NULL; + } + cl->last_ = ifd->ifnow_; + cl->last_pkttime_ = pkt_time; + +#if 1 + if (cl->parent_ == NULL) { + /* take stats of root class */ + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + } +#endif + + cl = cl->parent_; + } + + /* + * Check to see if cutoff needs to set to a new level. 
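+	 * (This only matters when the packet went out on borrowed
+	 * bandwidth: if our queue has drained or the lender is still
+	 * overlimit, rmc_tl_satisfied() rescans the tree and resets
+	 * cutoff_; otherwise cutoff_ is lowered to the lender's depth so
+	 * borrowing at that level can continue.)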
+ */
+	cl = ifd->class_[ifd->qo_];
+	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
+#if 1 /* ALTQ */
+		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
+			rmc_tl_satisfied(ifd, nowp);
+			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
+		} else {
+			ifd->cutoff_ = borrowed->depth_;
+			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
+		}
+#else /* !ALTQ */
+		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
+			reset_cutoff(ifd);
+#ifdef notdef
+			rmc_tl_satisfied(ifd, &now);
+#endif
+			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
+		} else {
+			ifd->cutoff_ = borrowed->depth_;
+			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
+		}
+#endif /* !ALTQ */
+	}
+
+	/*
+	 * Release class slot
+	 */
+	ifd->borrowed_[ifd->qo_] = NULL;
+	ifd->class_[ifd->qo_] = NULL;
+	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
+	ifd->queued_--;
+}
+
+/*
+ * void
+ * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
+ *	over-limit action routines.  These get invoked by rmc_under_limit()
+ *	if a class with packets to send is over its bandwidth limit & can't
+ *	borrow from a parent class.
+ *
+ *	Returns:	NONE
+ */
+
+static void
+rmc_drop_action(cl)
+	struct rm_class	*cl;
+{
+	struct rm_ifdat	*ifd = cl->ifdat_;
+
+	ASSERT(qlen(cl->q_) > 0);
+	_rmc_dropq(cl);
+	if (qempty(cl->q_))
+		ifd->na_[cl->pri_]--;
+}
+
+void rmc_dropall(cl)
+	struct rm_class	*cl;
+{
+	struct rm_ifdat	*ifd = cl->ifdat_;
+
+	if (!qempty(cl->q_)) {
+		_flushq(cl->q_);
+
+		ifd->na_[cl->pri_]--;
+	}
+}
+
+#if (__FreeBSD_version > 300000)
+/* hzto() is removed from FreeBSD-3.0 */
+static int hzto __P((struct timeval *));
+
+static int
+hzto(tv)
+	struct timeval *tv;
+{
+	struct timeval t2;
+
+	getmicrotime(&t2);
+	t2.tv_sec = tv->tv_sec - t2.tv_sec;
+	t2.tv_usec = tv->tv_usec - t2.tv_usec;
+	return (tvtohz(&t2));
+}
+#endif /* __FreeBSD_version > 300000 */
+
+/*
+ * void
+ * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
+ *	delay action routine.  It is invoked via rmc_under_limit when the
+ *	packet is discovered to be overlimit.
+ *
+ *	If the delay action is the result of the borrow class being overlimit,
+ *	then delay for the offtime of the borrowing class that is overlimit.
+ *
+ *	Returns:	NONE
+ */
+
+void
+rmc_delay_action(cl, borrow)
+	struct rm_class	*cl, *borrow;
+{
+	int	delay, t, extradelay;
+
+	cl->stats_.overactions++;
+	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
+#ifndef BORROW_OFFTIME
+	delay += cl->offtime_;
+#endif
+
+	if (!cl->sleeping_) {
+		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
+#ifdef BORROW_OFFTIME
+		if (borrow != NULL)
+			extradelay = borrow->offtime_;
+		else
+#endif
+			extradelay = cl->offtime_;
+
+#ifdef ALTQ
+		/*
+		 * XXX recalculate suspend time:
+		 * current undertime is (tidle + pkt_time) calculated
+		 * from the last transmission.
+		 *	tidle: time required to bring avgidle back to 0
+		 *	pkt_time: target waiting time for this class
+		 * we need to replace pkt_time by offtime
+		 */
+		extradelay -= cl->last_pkttime_;
+#endif
+		if (extradelay > 0) {
+			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
+			delay += extradelay;
+		}
+
+		cl->sleeping_ = 1;
+		cl->stats_.delays++;
+
+		/*
+		 * Since packets are phased randomly with respect to the
+		 * clock, 1 tick (the next clock tick) can be an arbitrarily
+		 * short time so we have to wait for at least two ticks.
+		 * NOTE:  If there's no other traffic, we need the timer as
+		 * a 'backstop' to restart this class.
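+		 * (Illustrative, assuming hz = 100 so tick = 10,000 usec:
+		 * a computed delay of 25,000 usec exceeds two ticks and is
+		 * converted with hzto(); anything at or below 20,000 usec
+		 * just waits the two-tick minimum below.)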
+ */ + if (delay > tick * 2) { +#ifdef __FreeBSD__ + /* FreeBSD rounds up the tick */ + t = hzto(&cl->undertime_); +#else + /* other BSDs round down the tick */ + t = hzto(&cl->undertime_) + 1; +#endif + } else + t = 2; + CALLOUT_RESET(&cl->callout_, t, + (timeout_t *)rmc_restart, (caddr_t)cl); + } +} + +/* + * void + * rmc_restart() - is just a helper routine for rmc_delay_action -- it is + * called by the system timer code & is responsible checking if the + * class is still sleeping (it might have been restarted as a side + * effect of the queue scan on a packet arrival) and, if so, restarting + * output for the class. Inspecting the class state & restarting output + * require locking the class structure. In general the driver is + * responsible for locking but this is the only routine that is not + * called directly or indirectly from the interface driver so it has + * know about system locking conventions. Under bsd, locking is done + * by raising IPL to splimp so that's what's implemented here. On a + * different system this would probably need to be changed. + * + * Returns: NONE + */ + +static void +rmc_restart(cl) + struct rm_class *cl; +{ + struct rm_ifdat *ifd = cl->ifdat_; + int s; + + s = splimp(); + if (cl->sleeping_) { + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + + if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { + CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); + (ifd->restart)(ifd->ifq_); + } + } + splx(s); +} + +/* + * void + * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit + * handling routine for the root class of the link sharing structure. + * + * Returns: NONE + */ + +static void +rmc_root_overlimit(cl, borrow) + struct rm_class *cl, *borrow; +{ + panic("rmc_root_overlimit"); +} + +/* + * Packet Queue handling routines. Eventually, this is to localize the + * effects on the code whether queues are red queues or droptail + * queues. + */ + +static int +_rmc_addq(cl, m) + rm_class_t *cl; + mbuf_t *m; +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_addq(cl->red_, cl->q_, m, cl->pktattr_); +#endif /* ALTQ_RED */ + + if (cl->flags_ & RMCF_CLEARDSCP) + write_dsfield(m, cl->pktattr_, 0); + + _addq(cl->q_, m); + return (0); +} + +/* note: _rmc_dropq is not called for red */ +static void +_rmc_dropq(cl) + rm_class_t *cl; +{ + mbuf_t *m; + + if ((m = _getq(cl->q_)) != NULL) + m_freem(m); +} + +static mbuf_t * +_rmc_getq(cl) + rm_class_t *cl; +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_getq((rio_t *)cl->red_, cl->q_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_getq(cl->red_, cl->q_); +#endif + return _getq(cl->q_); +} + +static mbuf_t * +_rmc_pollq(cl) + rm_class_t *cl; +{ + return qhead(cl->q_); +} + +#ifdef CBQ_TRACE + +struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; +struct cbqtrace *cbqtrace_ptr = NULL; +int cbqtrace_count; + +/* + * DDB hook to trace cbq events: + * the last 1024 events are held in a circular buffer. + * use "call cbqtrace_dump(N)" to display 20 events from Nth event. 
+ */ +void cbqtrace_dump(int); +static char *rmc_funcname(void *); + +static struct rmc_funcs { + void *func; + char *name; +} rmc_funcs[] = +{ + rmc_init, "rmc_init", + rmc_queue_packet, "rmc_queue_packet", + rmc_under_limit, "rmc_under_limit", + rmc_update_class_util, "rmc_update_class_util", + rmc_delay_action, "rmc_delay_action", + rmc_restart, "rmc_restart", + _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", + NULL, NULL +}; + +static char *rmc_funcname(func) + void *func; +{ + struct rmc_funcs *fp; + + for (fp = rmc_funcs; fp->func != NULL; fp++) + if (fp->func == func) + return (fp->name); + return ("unknown"); +} + +void cbqtrace_dump(counter) + int counter; +{ + int i, *p; + char *cp; + + counter = counter % NCBQTRACE; + p = (int *)&cbqtrace_buffer[counter]; + + for (i=0; i<20; i++) { + printf("[0x%x] ", *p++); + printf("%s: ", rmc_funcname((void *)*p++)); + cp = (char *)p++; + printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); + printf("%d\n",*p++); + + if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) + p = (int *)cbqtrace_buffer; + } +} +#endif /* CBQ_TRACE */ + +#endif /* ALTQ_CBQ */ + +#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +void +_addq(q, m) + class_queue_t *q; + mbuf_t *m; +{ + mbuf_t *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +mbuf_t * +_getq(q) + class_queue_t *q; +{ + mbuf_t *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } + qlen(q)--; + return (m0); +} + +/* drop a packet at the tail of the queue */ +mbuf_t * +_getq_tail(q) + class_queue_t *q; +{ + mbuf_t *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else + qtail(q) = prev; + qlen(q)--; + return (m); +} + +/* randomly select a packet in the queue */ +mbuf_t * +_getq_random(q) + class_queue_t *q; +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else { + struct mbuf *prev = NULL; + + n = random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + return (m); +} + +void +_removeq(q, m) + class_queue_t *q; + mbuf_t *m; +{ + mbuf_t *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +void +_flushq(q) + class_queue_t *q; +{ + mbuf_t *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); + ASSERT(qlen(q) == 0); +} + +#endif /* !__GNUC__ || ALTQ_DEBUG */ +#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */ diff --git a/sys/altq/altq_rmclass.h b/sys/altq/altq_rmclass.h new file mode 100644 index 000000000000..7013eab9a783 --- /dev/null +++ b/sys/altq/altq_rmclass.h @@ -0,0 +1,266 @@ +/* $KAME: altq_rmclass.h,v 1.6 2000/12/09 09:22:44 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_H_ +#define _ALTQ_ALTQ_RMCLASS_H_ + +#include + +/* #pragma ident "@(#)rm_class.h 1.20 97/10/23 SMI" */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct mbuf mbuf_t; +typedef struct rm_ifdat rm_ifdat_t; +typedef struct rm_class rm_class_t; + +struct red; + +/* + * Macros for dealing with time values. We assume all times are + * 'timevals'. `microtime' is used to get the best available clock + * resolution. If `microtime' *doesn't* return a value that's about + * ten times smaller than the average packet time on the fastest + * link that will use these routines, a slightly different clock + * scheme than this one should be used. + * (Bias due to truncation error in this scheme will overestimate utilization + * and discriminate against high bandwidth classes. To remove this bias an + * integrator needs to be added. The simplest integrator uses a history of + * 10 * avg.packet.time / min.tick.time packet completion entries. This is + * straight forward to add but we don't want to pay the extra memory + * traffic to maintain it if it's not necessary (occasionally a vendor + * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) 
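+ *
+ * (Worked example of the macros below, with invented times: for
+ * a = {5s, 900000us} and b = {6s, 100000us}, TV_LT(a, b) is true via the
+ * seconds clause, and TV_DELTA(b, a, d) yields
+ * d = 100000 - 900000 + 1000000 = 200000 usec; large deltas saturate
+ * at 2000000 usec.)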
+ */ + +#define RM_GETTIME(now) microtime(&now) + +#define TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) || \ + (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec))) + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \ + switch (xxs) { \ + default: \ + if (xxs < 0) \ + printf("rm_class: bogus time values\n"); \ + delta = 0; \ + /* fall through */ \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + } \ + } \ +} + +#define TV_ADD_DELTA(a, delta, res) { \ + register int xxus = (a)->tv_usec + (delta); \ + \ + (res)->tv_sec = (a)->tv_sec; \ + while (xxus >= 1000000) { \ + ++((res)->tv_sec); \ + xxus -= 1000000; \ + } \ + (res)->tv_usec = xxus; \ +} + +#define RM_TIMEOUT 2 /* 1 Clock tick. */ + +#if 1 +#define RM_MAXQUEUED 1 /* this isn't used in ALTQ/CBQ */ +#else +#define RM_MAXQUEUED 16 /* Max number of packets downstream of CBQ */ +#endif +#define RM_MAXPRIO 8 /* Max priority */ +#define RM_MAXQUEUE 64 /* Max queue length */ +#define RM_FILTER_GAIN 5 /* log2 of gain, e.g., 5 => 31/32 */ +#define RM_POWER (1 << RM_FILTER_GAIN) +#define RM_MAXDEPTH 32 +#define RM_NS_PER_SEC (1000000000) + +typedef struct _rm_class_stats_ { + u_int handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ +} rm_class_stats_t; + +/* + * CBQ Class state structure + */ +struct rm_class { + class_queue_t *q_; /* Queue of packets */ + rm_ifdat_t *ifdat_; + int pri_; /* Class priority. */ + int depth_; /* Class depth */ + u_int ns_per_byte_; /* NanoSeconds per byte. */ + u_int maxrate_; /* Bytes per second for this class. */ + u_int allotment_; /* Fraction of link bandwidth. */ + u_int w_allotment_; /* Weighted allotment for WRR */ + int bytes_alloc_; /* Allocation for round of WRR */ + + int avgidle_; + int maxidle_; + int minidle_; + int offtime_; + int sleeping_; /* != 0 if delaying */ + int qthresh_; /* Queue threshold for formal link sharing */ + int leaf_; /* Note whether leaf class or not.*/ + + rm_class_t *children_; /* Children of this class */ + rm_class_t *next_; /* Next pointer, used if child */ + + rm_class_t *peer_; /* Peer class */ + rm_class_t *borrow_; /* Borrow class */ + rm_class_t *parent_; /* Parent class */ + + void (*overlimit)(struct rm_class *, struct rm_class *); + void (*drop)(struct rm_class *); /* Class drop action. */ + + struct red *red_; /* RED state pointer */ + struct altq_pktattr *pktattr_; /* saved hdr used by RED/ECN */ + int flags_; + + int last_pkttime_; /* saved pkt_time */ + struct timeval undertime_; /* time can next send */ + struct timeval last_; /* time last packet sent */ + struct timeval overtime_; + struct callout callout_; /* for timeout() calls */ + + rm_class_stats_t stats_; /* Class Statistics */ +}; + +/* + * CBQ Interface state + */ +struct rm_ifdat { + int queued_; /* # pkts queued downstream */ + int efficient_; /* Link Efficency bit */ + int wrr_; /* Enable Weighted Round-Robin */ + u_long ns_per_byte_; /* Link byte speed. */ + int maxqueued_; /* Max packets to queue */ + int maxpkt_; /* Max packet size. */ + int qi_; /* In/out pointers for downstream */ + int qo_; /* packets */ + + /* + * Active class state and WRR state. 
+ */ + rm_class_t *active_[RM_MAXPRIO]; /* Active cl's in each pri */ + int na_[RM_MAXPRIO]; /* # of active cl's in a pri */ + int num_[RM_MAXPRIO]; /* # of cl's per pri */ + int alloc_[RM_MAXPRIO]; /* Byte Allocation */ + u_long M_[RM_MAXPRIO]; /* WRR weights. */ + + /* + * Network Interface/Solaris Queue state pointer. + */ + struct ifaltq *ifq_; + rm_class_t *default_; /* Default Pkt class, BE */ + rm_class_t *root_; /* Root Link class. */ + rm_class_t *ctl_; /* Control Traffic class. */ + void (*restart)(struct ifaltq *); /* Restart routine. */ + + /* + * Current packet downstream packet state and dynamic state. + */ + rm_class_t *borrowed_[RM_MAXQUEUED]; /* Class borrowed last */ + rm_class_t *class_[RM_MAXQUEUED]; /* class sending */ + int curlen_[RM_MAXQUEUED]; /* Current pktlen */ + struct timeval now_[RM_MAXQUEUED]; /* Current packet time. */ + int is_overlimit_[RM_MAXQUEUED];/* Current packet time. */ + + int cutoff_; /* Cut-off depth for borrowing */ + + struct timeval ifnow_; /* expected xmit completion time */ +#if 1 /* ALTQ4PPP */ + int maxiftime_; /* max delay inside interface */ +#endif + rm_class_t *pollcache_; /* cached rm_class by poll operation */ +}; + +/* flags for rmc_init and rmc_newclass */ +/* class flags */ +#define RMCF_RED 0x0001 +#define RMCF_ECN 0x0002 +#define RMCF_RIO 0x0004 +#define RMCF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define RMCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ + +/* flags for rmc_init */ +#define RMCF_WRR 0x0100 +#define RMCF_EFFICIENT 0x0200 + +#define is_a_parent_class(cl) ((cl)->children_ != NULL) + +extern rm_class_t *rmc_newclass __P((int, struct rm_ifdat *, u_int, + void (*)(struct rm_class *, + struct rm_class *), + int, struct rm_class *, struct rm_class *, + u_int, int, u_int, int, int)); +extern void rmc_delete_class __P((struct rm_ifdat *, struct rm_class *)); +extern int rmc_modclass __P((struct rm_class *, u_int, int, + u_int, int, u_int, int)); +extern void rmc_init __P((struct ifaltq *, struct rm_ifdat *, u_int, + void (*)(struct ifaltq *), + int, int, u_int, int, u_int, int)); +extern int rmc_queue_packet __P((struct rm_class *, mbuf_t *)); +extern mbuf_t *rmc_dequeue_next __P((struct rm_ifdat *, int)); +extern void rmc_update_class_util __P((struct rm_ifdat *)); +extern void rmc_delay_action __P((struct rm_class *, struct rm_class *)); +extern void rmc_dropall __P((struct rm_class *)); +extern int rmc_get_weight __P((struct rm_ifdat *, int)); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_H_ */ diff --git a/sys/altq/altq_rmclass_debug.h b/sys/altq/altq_rmclass_debug.h new file mode 100644 index 000000000000..501726f66ae8 --- /dev/null +++ b/sys/altq/altq_rmclass_debug.h @@ -0,0 +1,112 @@ +/* $KAME: altq_rmclass_debug.h,v 1.2 2000/02/22 14:00:35 itojun Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_DEBUG_H_ +#define _ALTQ_ALTQ_RMCLASS_DEBUG_H_ + +/* #pragma ident "@(#)rm_class_debug.h 1.7 98/05/04 SMI" */ + +/* + * Cbq debugging macros + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef CBQ_TRACE +#ifndef NCBQTRACE +#define NCBQTRACE (16 * 1024) +#endif + +/* + * To view the trace output, using adb, type: + * adb -k /dev/ksyms /dev/mem , then type + * cbqtrace_count/D to get the count, then type + * cbqtrace_buffer,0tcount/Dp4C" "Xn + * This will dump the trace buffer from 0 to count. + */ +/* + * in ALTQ, "call cbqtrace_dump(N)" from DDB to display 20 events + * from Nth event in the circular buffer. + */ + +struct cbqtrace { + int count; + int function; /* address of function */ + int trace_action; /* descriptive 4 characters */ + int object; /* object operated on */ +}; + +extern struct cbqtrace cbqtrace_buffer[]; +extern struct cbqtrace *cbqtrace_ptr; +extern int cbqtrace_count; + +#define CBQTRACEINIT() { \ + if (cbqtrace_ptr == NULL) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else { \ + cbqtrace_ptr = cbqtrace_buffer; \ + bzero((void *)cbqtrace_ptr, sizeof(cbqtrace_buffer)); \ + cbqtrace_count = 0; \ + } \ +} + +#define LOCK_TRACE() splimp() +#define UNLOCK_TRACE(x) splx(x) + +#define CBQTRACE(func, act, obj) { \ + int __s = LOCK_TRACE(); \ + int *_p = &cbqtrace_ptr->count; \ + *_p++ = ++cbqtrace_count; \ + *_p++ = (int)(func); \ + *_p++ = (int)(act); \ + *_p++ = (int)(obj); \ + if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE])\ + cbqtrace_ptr = cbqtrace_buffer; \ + else \ + cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \ + UNLOCK_TRACE(__s); \ + } +#else + +/* If no tracing, define no-ops */ +#define CBQTRACEINIT() +#define CBQTRACE(a, b, c) + +#endif /* !CBQ_TRACE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_DEBUG_H_ */ diff --git a/sys/altq/altq_subr.c b/sys/altq/altq_subr.c new file mode 100644 index 000000000000..a54e6bc1fe2c --- /dev/null +++ b/sys/altq/altq_subr.c @@ -0,0 +1,1551 @@ +/* $KAME: altq_subr.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef ALTQ +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include + +#include +#include + +#ifdef __FreeBSD__ +#include "opt_cpu.h" /* for FreeBSD-2.2.8 to get i586_ctr_freq */ +#include +#endif + +/* + * internal function prototypes + */ +static void tbr_timeout __P((void *)); +static int extract_ports4 __P((struct mbuf *, struct ip *, + struct flowinfo_in *)); +#ifdef INET6 +static int extract_ports6 __P((struct mbuf *, struct ip6_hdr *, + struct flowinfo_in6 *)); +#endif +static int apply_filter4 __P((u_int32_t, struct flow_filter *, + struct flowinfo_in *)); +static int apply_ppfilter4 __P((u_int32_t, struct flow_filter *, + struct flowinfo_in *)); +#ifdef INET6 +static int apply_filter6 __P((u_int32_t, struct flow_filter6 *, + struct flowinfo_in6 *)); +#endif +static int apply_tosfilter4 __P((u_int32_t, struct flow_filter *, + struct flowinfo_in *)); +static u_long get_filt_handle __P((struct acc_classifier *, int)); +static struct acc_filter *filth_to_filtp __P((struct acc_classifier *, + u_long)); +static u_int32_t filt2fibmask __P((struct flow_filter *)); + +static void ip4f_cache __P((struct ip *, struct flowinfo_in *)); +static int ip4f_lookup __P((struct ip *, struct flowinfo_in *)); +static int ip4f_init __P((void)); +static struct ip4_frag *ip4f_alloc __P((void)); +static void ip4f_free __P((struct ip4_frag *)); + +int (*altq_input) __P((struct mbuf *, int)) = NULL; +static int tbr_timer = 0; /* token bucket regulator timer */ +static struct callout tbr_callout = CALLOUT_INITIALIZER; + +/* + * alternate queueing support routines + */ + +/* look up the queue state by the interface name and the queuing type. 
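+ * (Sketched usage for a hypothetical discipline "myq", illustrative only:
+ * its attach ioctl would call altq_attach(&ifp->if_snd, ALTQT_LOCALQ,
+ * myq_state, myq_enqueue, myq_dequeue, myq_request, NULL, NULL), after
+ * which altq_lookup(ifname, ALTQT_LOCALQ) returns myq_state until
+ * altq_detach() is called.)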
*/ +void * +altq_lookup(name, type) + char *name; + int type; +{ + struct ifnet *ifp; + + if ((ifp = ifunit(name)) != NULL) { + if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) + return (ifp->if_snd.altq_disc); + } + + return NULL; +} + +int +altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify) + struct ifaltq *ifq; + int type; + void *discipline; + int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); + struct mbuf *(*dequeue)(struct ifaltq *, int); + int (*request)(struct ifaltq *, int, void *); + void *clfier; + void *(*classify)(void *, struct mbuf *, int); +{ + if (!ALTQ_IS_READY(ifq)) + return ENXIO; + if (ALTQ_IS_ENABLED(ifq)) + return EBUSY; + if (ALTQ_IS_ATTACHED(ifq)) + return EEXIST; + ifq->altq_type = type; + ifq->altq_disc = discipline; + ifq->altq_enqueue = enqueue; + ifq->altq_dequeue = dequeue; + ifq->altq_request = request; + ifq->altq_clfier = clfier; + ifq->altq_classify = classify; + ifq->altq_flags &= ALTQF_CANTCHANGE; +#ifdef ALTQ_KLD + altq_module_incref(type); +#endif + return 0; +} + +int +altq_detach(ifq) + struct ifaltq *ifq; +{ + if (!ALTQ_IS_READY(ifq)) + return ENXIO; + if (ALTQ_IS_ENABLED(ifq)) + return EBUSY; + if (!ALTQ_IS_ATTACHED(ifq)) + return (0); + +#ifdef ALTQ_KLD + altq_module_declref(ifq->altq_type); +#endif + ifq->altq_type = ALTQT_NONE; + ifq->altq_disc = NULL; + ifq->altq_enqueue = NULL; + ifq->altq_dequeue = NULL; + ifq->altq_request = NULL; + ifq->altq_clfier = NULL; + ifq->altq_classify = NULL; + ifq->altq_flags &= ALTQF_CANTCHANGE; + return 0; +} + +int +altq_enable(ifq) + struct ifaltq *ifq; +{ + int s; + + if (!ALTQ_IS_READY(ifq)) + return ENXIO; + if (ALTQ_IS_ENABLED(ifq)) + return 0; + + s = splimp(); + IFQ_PURGE(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->altq_flags |= ALTQF_ENABLED; + if (ifq->altq_clfier != NULL) + ifq->altq_flags |= ALTQF_CLASSIFY; + splx(s); + + return 0; +} + +int +altq_disable(ifq) + struct ifaltq *ifq; +{ + int s; + + if (!ALTQ_IS_ENABLED(ifq)) + return 0; + + s = splimp(); + IFQ_PURGE(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); + splx(s); + return 0; +} + +void +altq_assert(file, line, failedexpr) + const char *file, *failedexpr; + int line; +{ + (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", + failedexpr, file, line); + panic("altq assertion"); + /* NOTREACHED */ +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TBR_SHIFT 32 +#define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) +#define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) + +struct mbuf * +tbr_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + struct tb_regulator *tbr; + struct mbuf *m; + int64_t interval; + u_int64_t now; + + tbr = ifq->altq_tbr; + if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { + /* if this is a remove after poll, bypass tbr check */ + } else { + /* update token only when it is negative */ + if (tbr->tbr_token <= 0) { + now = read_machclk(); + interval = now - tbr->tbr_last; + if (interval >= tbr->tbr_filluptime) + tbr->tbr_token = tbr->tbr_depth; + else { + tbr->tbr_token += interval * tbr->tbr_rate; + if (tbr->tbr_token > tbr->tbr_depth) + tbr->tbr_token = tbr->tbr_depth; + } + tbr->tbr_last = now; + } + /* if token is still negative, don't allow dequeue */ + if (tbr->tbr_token <= 0) + return (NULL); + } + + if (ALTQ_IS_ENABLED(ifq)) + m = (*ifq->altq_dequeue)(ifq, op); + else { + if 
(op == ALTDQ_POLL) + IF_POLL(ifq, m); + else + IF_DEQUEUE(ifq, m); + } + + if (m != NULL && op == ALTDQ_REMOVE) + tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); + tbr->tbr_lastop = op; + return (m); +} + +/* + * set a token bucket regulator. + * if the specified rate is zero, the token bucket regulator is deleted. + */ +int +tbr_set(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr, *otbr; + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) { + printf("tbr_set: no cpu clock available!\n"); + return (ENXIO); + } + + if (profile->rate == 0) { + /* delete this tbr */ + if ((tbr = ifq->altq_tbr) == NULL) + return (ENOENT); + ifq->altq_tbr = NULL; + FREE(tbr, M_DEVBUF); + return (0); + } + + MALLOC(tbr, struct tb_regulator *, sizeof(struct tb_regulator), + M_DEVBUF, M_WAITOK); + if (tbr == NULL) + return (ENOMEM); + bzero(tbr, sizeof(struct tb_regulator)); + + tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; + tbr->tbr_depth = TBR_SCALE(profile->depth); + if (tbr->tbr_rate > 0) + tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; + else + tbr->tbr_filluptime = 0xffffffffffffffffLL; + tbr->tbr_token = tbr->tbr_depth; + tbr->tbr_last = read_machclk(); + tbr->tbr_lastop = ALTDQ_REMOVE; + + otbr = ifq->altq_tbr; + ifq->altq_tbr = tbr; /* set the new tbr */ + + if (otbr != NULL) + FREE(otbr, M_DEVBUF); + else { + if (tbr_timer == 0) { + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + tbr_timer = 1; + } + } + return (0); +} + +/* + * tbr_timeout goes through the interface list, and kicks the drivers + * if necessary. + */ +static void +tbr_timeout(arg) + void *arg; +{ + struct ifnet *ifp; + int active, s; + + active = 0; + s = splimp(); +#ifdef __FreeBSD__ +#if (__FreeBSD_version < 300000) + for (ifp = ifnet; ifp; ifp = ifp->if_next) +#else + for (ifp = ifnet.tqh_first; ifp != NULL; ifp = ifp->if_link.tqe_next) +#endif +#else /* !FreeBSD */ + for (ifp = ifnet.tqh_first; ifp != NULL; ifp = ifp->if_list.tqe_next) +#endif + { + if (!TBR_IS_ENABLED(&ifp->if_snd)) + continue; + active++; + if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) + (*ifp->if_start)(ifp); + } + splx(s); + if (active > 0) + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + else + tbr_timer = 0; /* don't need tbr_timer anymore */ +#if defined(__alpha__) && !defined(ALTQ_NOPCC) + { + /* + * XXX read out the machine dependent clock once a second + * to detect counter wrap-around. + */ + static u_int cnt; + + if (++cnt >= hz) { + (void)read_machclk(); + cnt = 0; + } + } +#endif /* __alpha__ && !ALTQ_NOPCC */ +} + +/* + * get token bucket regulator profile + */ +int +tbr_get(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr; + + if ((tbr = ifq->altq_tbr) == NULL) { + profile->rate = 0; + profile->depth = 0; + } else { + profile->rate = + (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); + profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); + } + return (0); +} + + +#ifndef IPPROTO_ESP +#define IPPROTO_ESP 50 /* encapsulating security payload */ +#endif +#ifndef IPPROTO_AH +#define IPPROTO_AH 51 /* authentication header */ +#endif + +/* + * extract flow information from a given packet. + * filt_mask shows flowinfo fields required. + * we assume the ip header is in one mbuf, and addresses and ports are + * in network byte order. 
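+ * e.g. when every installed filter looks only at protocol and ports, + * acc_classify() passes a filt_bmask of (FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) + * and the port extraction below runs; a tos-only filter set leaves those + * bits clear and skips it.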
+ */ +int +altq_extractflow(m, af, flow, filt_bmask) + struct mbuf *m; + int af; + struct flowinfo *flow; + u_int32_t filt_bmask; +{ + + switch (af) { + case PF_INET: { + struct flowinfo_in *fin; + struct ip *ip; + + ip = mtod(m, struct ip *); + + if (ip->ip_v != 4) + break; + + fin = (struct flowinfo_in *)flow; + fin->fi_len = sizeof(struct flowinfo_in); + fin->fi_family = AF_INET; + + fin->fi_proto = ip->ip_p; + fin->fi_tos = ip->ip_tos; + + fin->fi_src.s_addr = ip->ip_src.s_addr; + fin->fi_dst.s_addr = ip->ip_dst.s_addr; + + if (filt_bmask & FIMB4_PORTS) + /* if port info is required, extract port numbers */ + extract_ports4(m, ip, fin); + else { + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + } + return (1); + } + +#ifdef INET6 + case PF_INET6: { + struct flowinfo_in6 *fin6; + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + /* should we check the ip version? */ + + fin6 = (struct flowinfo_in6 *)flow; + fin6->fi6_len = sizeof(struct flowinfo_in6); + fin6->fi6_family = AF_INET6; + + fin6->fi6_proto = ip6->ip6_nxt; + fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + + fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); + fin6->fi6_src = ip6->ip6_src; + fin6->fi6_dst = ip6->ip6_dst; + + if ((filt_bmask & FIMB6_PORTS) || + ((filt_bmask & FIMB6_PROTO) + && ip6->ip6_nxt > IPPROTO_IPV6)) + /* + * if port info is required, or proto is required + * but there are option headers, extract port + * and protocol numbers. + */ + extract_ports6(m, ip6, fin6); + else { + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + fin6->fi6_gpi = 0; + } + return (1); + } +#endif /* INET6 */ + + default: + break; + } + + /* failed */ + flow->fi_len = sizeof(struct flowinfo); + flow->fi_family = AF_UNSPEC; + return (0); +} + +/* + * helper routine to extract port numbers + */ +/* structure for ipsec and ipv6 option header template */ +struct _opt6 { + u_int8_t opt6_nxt; /* next header */ + u_int8_t opt6_hlen; /* header extension length */ + u_int16_t _pad; + u_int32_t ah_spi; /* security parameter index + for authentication header */ +}; + +/* + * extract port numbers from a ipv4 packet. + */ +static int +extract_ports4(m, ip, fin) + struct mbuf *m; + struct ip *ip; + struct flowinfo_in *fin; +{ + struct mbuf *m0; + u_short ip_off; + u_int8_t proto; + int off; + + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + + ip_off = ntohs(ip->ip_off); + /* if it is a fragment, try cached fragment info */ + if (ip_off & IP_OFFMASK) { + ip4f_lookup(ip, fin); + return (1); + } + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip >= m0->m_data) && + ((caddr_t)ip < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports4: can't locate header! 
ip=%p\n", ip); +#endif + return (0); + } + off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); + proto = ip->ip_p; + +#ifdef ALTQ_IPSEC + again: +#endif + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + } + ASSERT(m0->m_len >= off + 4); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin->fi_sport = udp->uh_sport; + fin->fi_dport = udp->uh_dport; + fin->fi_proto = proto; + } + break; + +#ifdef ALTQ_IPSEC + case IPPROTO_ESP: + if (fin->fi_gpi == 0){ + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin->fi_gpi = *gpi; + } + fin->fi_proto = proto; + break; + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + if (fin->fi_gpi == 0) + fin->fi_gpi = opt6->ah_spi; + } + /* goto the next header */ + goto again; +#endif /* ALTQ_IPSEC */ + + default: + fin->fi_proto = proto; + return (0); + } + + /* if this is a first fragment, cache it. */ + if (ip_off & IP_MF) + ip4f_cache(ip, fin); + + return (1); +} + +#ifdef INET6 +static int +extract_ports6(m, ip6, fin6) + struct mbuf *m; + struct ip6_hdr *ip6; + struct flowinfo_in6 *fin6; +{ + struct mbuf *m0; + int off; + u_int8_t proto; + + fin6->fi6_gpi = 0; + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip6 >= m0->m_data) && + ((caddr_t)ip6 < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports6: can't locate header! ip6=%p\n", ip6); +#endif + return (0); + } + off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); + + proto = ip6->ip6_nxt; + do { + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + } + ASSERT(m0->m_len >= off + 4); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin6->fi6_sport = udp->uh_sport; + fin6->fi6_dport = udp->uh_dport; + fin6->fi6_proto = proto; + } + return (1); + + case IPPROTO_ESP: + if (fin6->fi6_gpi == 0) { + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin6->fi6_gpi = *gpi; + } + fin6->fi6_proto = proto; + return (1); + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + if (fin6->fi6_gpi == 0) + fin6->fi6_gpi = opt6->ah_spi; + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + /* goto the next header */ + break; + } + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += (opt6->opt6_hlen + 1) * 8; + /* goto the next header */ + break; + } + + case IPPROTO_FRAGMENT: + /* ipv6 fragmentations are not supported yet */ + default: + fin6->fi6_proto = proto; + return (0); + } + } while (1); + /*NOTREACHED*/ +} +#endif /* INET6 */ + +/* + * altq common classifier + */ +int +acc_add_filter(classifier, filter, class, phandle) + struct acc_classifier *classifier; + struct flow_filter *filter; + void *class; + u_long *phandle; +{ + struct acc_filter *afp, *prev, *tmp; + int i, s; + +#ifdef INET6 + if (filter->ff_flow.fi_family != AF_INET && + filter->ff_flow.fi_family != AF_INET6) + return 
(EINVAL); +#else + if (filter->ff_flow.fi_family != AF_INET) + return (EINVAL); +#endif + + MALLOC(afp, struct acc_filter *, sizeof(struct acc_filter), + M_DEVBUF, M_WAITOK); + if (afp == NULL) + return (ENOMEM); + bzero(afp, sizeof(struct acc_filter)); + + afp->f_filter = *filter; + afp->f_class = class; + + i = ACC_WILDCARD_INDEX; + if (filter->ff_flow.fi_family == AF_INET) { + struct flow_filter *filter4 = &afp->f_filter; + + /* + * if address is 0, it's a wildcard. if address mask + * isn't set, use full mask. + */ + if (filter4->ff_flow.fi_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0; + else if (filter4->ff_mask.mask_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0xffffffff; + if (filter4->ff_flow.fi_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0; + else if (filter4->ff_mask.mask_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0xffffffff; + + /* clear extra bits in addresses */ + filter4->ff_flow.fi_dst.s_addr &= + filter4->ff_mask.mask_dst.s_addr; + filter4->ff_flow.fi_src.s_addr &= + filter4->ff_mask.mask_src.s_addr; + + /* + * if dst address is a wildcard, use hash-entry + * ACC_WILDCARD_INDEX. + */ + if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); + } +#ifdef INET6 + else if (filter->ff_flow.fi_family == AF_INET6) { + struct flow_filter6 *filter6 = + (struct flow_filter6 *)&afp->f_filter; +#ifndef IN6MASK0 /* taken from kame ipv6 */ +#define IN6MASK0 {{{ 0, 0, 0, 0 }}} +#define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} + const struct in6_addr in6mask0 = IN6MASK0; + const struct in6_addr in6mask128 = IN6MASK128; +#endif + + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) + filter6->ff_mask6.mask6_dst = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) + filter6->ff_mask6.mask6_dst = in6mask128; + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) + filter6->ff_mask6.mask6_src = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) + filter6->ff_mask6.mask6_src = in6mask128; + + /* clear extra bits in addresses */ + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_dst.s6_addr[i] &= + filter6->ff_mask6.mask6_dst.s6_addr[i]; + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_src.s6_addr[i] &= + filter6->ff_mask6.mask6_src.s6_addr[i]; + + if (filter6->ff_flow6.fi6_flowlabel == 0) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); + } +#endif /* INET6 */ + + afp->f_handle = get_filt_handle(classifier, i); + + /* update filter bitmask */ + afp->f_fbmask = filt2fibmask(filter); + classifier->acc_fbmask |= afp->f_fbmask; + + /* + * add this filter to the filter list. + * filters are ordered from the highest rule number. 
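+ * e.g. adding rules numbered 100, 300 and 200 to one bucket yields the + * list 300, 200, 100; acc_classify() returns the first match, so the + * filter with the larger ff_ruleno wins when two filters overlap.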
+ */ + s = splimp(); + prev = NULL; + LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { + if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) + prev = tmp; + else + break; + } + if (prev == NULL) + LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); + else + LIST_INSERT_AFTER(prev, afp, f_chain); + splx(s); + + *phandle = afp->f_handle; + return (0); +} + +int +acc_delete_filter(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int s; + + if ((afp = filth_to_filtp(classifier, handle)) == NULL) + return (EINVAL); + + s = splimp(); + LIST_REMOVE(afp, f_chain); + splx(s); + + FREE(afp, M_DEVBUF); + + /* todo: update filt_bmask */ + + return (0); +} + +/* + * delete filters referencing to the specified class. + * if the all flag is not 0, delete all the filters. + */ +int +acc_discard_filters(classifier, class, all) + struct acc_classifier *classifier; + void *class; + int all; +{ + struct acc_filter *afp; + int i, s; + + s = splimp(); + for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (all || afp->f_class == class) { + LIST_REMOVE(afp, f_chain); + FREE(afp, M_DEVBUF); + /* start again from the head */ + break; + } + } while (afp != NULL); + } + splx(s); + + if (all) + classifier->acc_fbmask = 0; + + return (0); +} + +void * +acc_classify(clfier, m, af) + void *clfier; + struct mbuf *m; + int af; +{ + struct acc_classifier *classifier; + struct flowinfo flow; + struct acc_filter *afp; + int i; + + classifier = (struct acc_classifier *)clfier; + altq_extractflow(m, af, &flow, classifier->acc_fbmask); + + if (flow.fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)&flow; + + if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { + /* only tos is used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_tosfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else if ((classifier->acc_fbmask & + (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) + == 0) { + /* only proto and ports are used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_ppfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else { + /* get the filter hash entry from its dest address */ + i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); + do { + /* + * go through this loop twice. first for dst + * hash, second for wildcards. + */ + LIST_FOREACH(afp, &classifier->acc_filters[i], + f_chain) + if (apply_filter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a dst addr + * wildcard. + * (daddr == 0 || dmask != 0xffffffff). + */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } + } +#ifdef INET6 + else if (flow.fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; + + /* get the filter hash entry from its flow ID */ + if (fp6->fi6_flowlabel != 0) + i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); + else + /* flowlable can be zero */ + i = ACC_WILDCARD_INDEX; + + /* go through this loop twice. first for flow hash, second + for wildcards. 
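+ * (acc_add_filter() above files a filter whose flowlabel is zero under + * ACC_WILDCARD_INDEX, hence the second pass over that bucket.)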
*/ + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (apply_filter6(afp->f_fbmask, + (struct flow_filter6 *)&afp->f_filter, + fp6)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a wildcard. + */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } +#endif /* INET6 */ + + /* no filter matched */ + return (NULL); +} + +static int +apply_filter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_DADDR) && + filt->ff_flow.fi_dst.s_addr != + (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) + return (0); + if ((fbmask & FIMB4_SADDR) && + filt->ff_flow.fi_src.s_addr != + (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function optimized for a common case that checks + * only protocol and port numbers + */ +static int +apply_ppfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function only for tos field. 
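+ * e.g. a filter with fi_tos 0xb8 and mask_tos 0xfc matches any packet + * whose DS field is 101110xx (the EF codepoint) regardless of the two + * low-order CU bits.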
+ */ +static int +apply_tosfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + /* match */ + return (1); +} + +#ifdef INET6 +static int +apply_filter6(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter6 *filt; + struct flowinfo_in6 *pkt; +{ + int i; + + if (filt->ff_flow6.fi6_family != AF_INET6) + return (0); + if ((fbmask & FIMB6_FLABEL) && + filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) + return (0); + if ((fbmask & FIMB6_PROTO) && + filt->ff_flow6.fi6_proto != pkt->fi6_proto) + return (0); + if ((fbmask & FIMB6_SPORT) && + filt->ff_flow6.fi6_sport != pkt->fi6_sport) + return (0); + if ((fbmask & FIMB6_DPORT) && + filt->ff_flow6.fi6_dport != pkt->fi6_dport) + return (0); + if (fbmask & FIMB6_SADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_src.s6_addr32[i] != + (pkt->fi6_src.s6_addr32[i] & + filt->ff_mask6.mask6_src.s6_addr32[i])) + return (0); + } + if (fbmask & FIMB6_DADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_dst.s6_addr32[i] != + (pkt->fi6_dst.s6_addr32[i] & + filt->ff_mask6.mask6_dst.s6_addr32[i])) + return (0); + } + if ((fbmask & FIMB6_TCLASS) && + filt->ff_flow6.fi6_tclass != + (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) + return (0); + if ((fbmask & FIMB6_GPI) && + filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) + return (0); + /* match */ + return (1); +} +#endif /* INET6 */ + +/* + * filter handle: + * bit 20-28: index to the filter hash table + * bit 0-19: unique id in the hash bucket. + */ +static u_long +get_filt_handle(classifier, i) + struct acc_classifier *classifier; + int i; +{ + static u_long handle_number = 1; + u_long handle; + struct acc_filter *afp; + + while (1) { + handle = handle_number++ & 0x000fffff; + + if (LIST_EMPTY(&classifier->acc_filters[i])) + break; + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if ((afp->f_handle & 0x000fffff) == handle) + break; + if (afp == NULL) + break; + /* this handle is already used, try again */ + } + + return ((i << 20) | handle); +} + +/* convert filter handle to filter pointer */ +static struct acc_filter * +filth_to_filtp(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int i; + + i = ACC_GET_HINDEX(handle); + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (afp->f_handle == handle) + return (afp); + + return (NULL); +} + +/* create flowinfo bitmask */ +static u_int32_t +filt2fibmask(filt) + struct flow_filter *filt; +{ + u_int32_t mask = 0; +#ifdef INET6 + struct flow_filter6 *filt6; +#endif + + switch (filt->ff_flow.fi_family) { + case AF_INET: + if (filt->ff_flow.fi_proto != 0) + mask |= FIMB4_PROTO; + if (filt->ff_flow.fi_tos != 0) + mask |= FIMB4_TOS; + if (filt->ff_flow.fi_dst.s_addr != 0) + mask |= FIMB4_DADDR; + if (filt->ff_flow.fi_src.s_addr != 0) + mask |= FIMB4_SADDR; + if (filt->ff_flow.fi_sport != 0) + mask |= FIMB4_SPORT; + if (filt->ff_flow.fi_dport != 0) + mask |= FIMB4_DPORT; + if (filt->ff_flow.fi_gpi != 0) + mask |= FIMB4_GPI; + break; +#ifdef INET6 + case AF_INET6: + filt6 = (struct flow_filter6 *)filt; + + if (filt6->ff_flow6.fi6_proto != 0) + mask |= FIMB6_PROTO; + if (filt6->ff_flow6.fi6_tclass != 0) + mask |= FIMB6_TCLASS; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) + mask |= FIMB6_DADDR; + if 
(!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) + mask |= FIMB6_SADDR; + if (filt6->ff_flow6.fi6_sport != 0) + mask |= FIMB6_SPORT; + if (filt6->ff_flow6.fi6_dport != 0) + mask |= FIMB6_DPORT; + if (filt6->ff_flow6.fi6_gpi != 0) + mask |= FIMB6_GPI; + if (filt6->ff_flow6.fi6_flowlabel != 0) + mask |= FIMB6_FLABEL; + break; +#endif /* INET6 */ + } + return (mask); +} + + +/* + * helper functions to handle IPv4 fragments. + * currently only in-sequence fragments are handled. + * - fragment info is cached in a LRU list. + * - when a first fragment is found, cache its flow info. + * - when a non-first fragment is found, lookup the cache. + */ + +struct ip4_frag { + TAILQ_ENTRY(ip4_frag) ip4f_chain; + char ip4f_valid; + u_short ip4f_id; + struct flowinfo_in ip4f_info; +}; + +static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ + +#define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ + + +static void +ip4f_cache(ip, fin) + struct ip *ip; + struct flowinfo_in *fin; +{ + struct ip4_frag *fp; + + if (TAILQ_EMPTY(&ip4f_list)) { + /* first time call, allocate fragment cache entries. */ + if (ip4f_init() < 0) + /* allocation failed! */ + return; + } + + fp = ip4f_alloc(); + fp->ip4f_id = ip->ip_id; + + /* save port numbers */ + fp->ip4f_info.fi_sport = fin->fi_sport; + fp->ip4f_info.fi_dport = fin->fi_dport; + fp->ip4f_info.fi_gpi = fin->fi_gpi; +} + +static int +ip4f_lookup(ip, fin) + struct ip *ip; + struct flowinfo_in *fin; +{ + struct ip4_frag *fp; + + for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid; + fp = TAILQ_NEXT(fp, ip4f_chain)) + if (ip->ip_id == fp->ip4f_id && + ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr && + ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr && + ip->ip_p == fp->ip4f_info.fi_proto) { + + /* found the matching entry */ + fin->fi_sport = fp->ip4f_info.fi_sport; + fin->fi_dport = fp->ip4f_info.fi_dport; + fin->fi_gpi = fp->ip4f_info.fi_gpi; + + if ((ntohs(ip->ip_off) & IP_MF) == 0) + /* this is the last fragment, + release the entry. 
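+ * (non-first fragments carry no transport header, so they borrow the + * ports cached from the first fragment until the final one arrives.)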
*/ + ip4f_free(fp); + + return (1); + } + + /* no matching entry found */ + return (0); +} + +static int +ip4f_init(void) +{ + struct ip4_frag *fp; + int i; + + TAILQ_INIT(&ip4f_list); + for (i=0; i<IP4F_TABSIZE; i++) { + MALLOC(fp, struct ip4_frag *, sizeof(struct ip4_frag), + M_DEVBUF, M_NOWAIT); + if (fp == NULL) { + printf("ip4f_init: can't alloc %dth entry!\n", i); + if (i == 0) + return (-1); + else + return (0); + } + fp->ip4f_valid = 0; + TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); + } + return (0); +} + +static struct ip4_frag * +ip4f_alloc(void) +{ + struct ip4_frag *fp; + + /* reclaim an entry at the tail, put it at the head */ + fp = TAILQ_LAST(&ip4f_list, ip4f_list); + TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); + fp->ip4f_valid = 1; + TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain); + return (fp); +} + +static void +ip4f_free(fp) + struct ip4_frag *fp; +{ + TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); + fp->ip4f_valid = 0; + TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); +} + +/* + * read and write diffserv field in IPv4 or IPv6 header + */ +u_int8_t +read_dsfield(m, pktattr) + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + struct mbuf *m0; + u_int8_t ds_field = 0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return ((u_int8_t)0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("read_dsfield: can't locate header!\n"); +#endif + return ((u_int8_t)0); + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + + if (ip->ip_v != 4) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = ip->ip_tos; + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = (flowlabel >> 20) & 0xff; + } +#endif + return (ds_field); +} + +void +write_dsfield(m, pktattr, dsfield) + struct mbuf *m; + struct altq_pktattr *pktattr; + u_int8_t dsfield; +{ + struct mbuf *m0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return; + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("write_dsfield: can't locate header!\n"); +#endif + return; + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + u_int8_t old; + int32_t sum; + + if (ip->ip_v != 4) + return; /* version mismatch! */ + old = ip->ip_tos; + dsfield |= old & 3; /* leave CU bits */ + if (old == dsfield) + return; + ip->ip_tos = dsfield; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xff00 + (~old & 0xff) + dsfield; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return; /* version mismatch! 
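+ * (the version field occupies the top four bits of ip6_flow; only the + * ipv4 branch above must patch a checksum, e.g. rewriting tos 0x00 to + * 0xb8 adds 0xff00 + 0xff + 0xb8 into the unfolded sum per RFC1624.)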
*/ + flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); + ip6->ip6_flow = htonl(flowlabel); + } +#endif + return; +} + + +/* + * high resolution clock support taking advantage of a machine dependent + * high resolution time counter (e.g., timestamp counter of intel pentium). + * we assume + * - 64-bit-long monotonically-increasing counter + * - frequency range is 100M-4GHz (CPU speed) + */ +u_int32_t machclk_freq = 0; +u_int32_t machclk_per_tick = 0; + +#if (defined(__i386__) || defined(__alpha__)) && !defined(ALTQ_NOPCC) +#ifdef __FreeBSD__ +/* freebsd makes clock frequency accessible */ +#ifdef __alpha__ +extern u_int32_t cycles_per_sec; /* alpha cpu clock frequency */ +#endif +void +init_machclk(void) +{ +#if defined(__i386__) +#if (__FreeBSD_version > 300000) + machclk_freq = tsc_freq; +#else + machclk_freq = i586_ctr_freq; +#endif +#elif defined(__alpha__) + machclk_freq = cycles_per_sec; +#endif /* __alpha__ */ + machclk_per_tick = machclk_freq / hz; +} +#else /* !__FreeBSD__ */ +/* + * measure Pentium TSC or Alpha PCC clock frequency + */ +void +init_machclk(void) +{ + static int wait; + struct timeval tv_start, tv_end; + u_int64_t start, end, diff; + int timo; + + microtime(&tv_start); + start = read_machclk(); + timo = hz; /* 1 sec */ + (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); + microtime(&tv_end); + end = read_machclk(); + diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + + tv_end.tv_usec - tv_start.tv_usec; + if (diff != 0) + machclk_freq = (u_int)((end - start) * 1000000 / diff); + machclk_per_tick = machclk_freq / hz; + + printf("altq: CPU clock: %uHz\n", machclk_freq); +} +#endif /* !__FreeBSD__ */ +#ifdef __alpha__ +/* + * make a 64bit counter value out of the 32bit alpha processor cycle counter. + * read_machclk must be called within a half of its wrap-around cycle + * (about 5 sec for 400MHz cpu) to properly detect a counter wrap-around. + * tbr_timeout calls read_machclk once a second. + */ +u_int64_t +read_machclk(void) +{ + static u_int32_t last_pcc, upper; + u_int32_t pcc; + + pcc = (u_int32_t)alpha_rpcc(); + if (pcc <= last_pcc) + upper++; + last_pcc = pcc; + return (((u_int64_t)upper << 32) + pcc); +} +#endif /* __alpha__ */ +#else /* !i386 && !alpha */ +/* use microtime() for now */ +void +init_machclk(void) +{ + machclk_freq = 1000000 << MACHCLK_SHIFT; + machclk_per_tick = machclk_freq / hz; + printf("altq: emulate %uHz cpu clock\n", machclk_freq); +} +#endif /* !i386 && !alpha */ + +#endif /* ALTQ */ diff --git a/sys/altq/altq_var.h b/sys/altq/altq_var.h new file mode 100644 index 000000000000..447f9cff1dc2 --- /dev/null +++ b/sys/altq/altq_var.h @@ -0,0 +1,225 @@ +/* $KAME: altq_var.h,v 1.7 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1998-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _ALTQ_ALTQ_VAR_H_ +#define _ALTQ_ALTQ_VAR_H_ + +#ifdef _KERNEL + +#include +#include +#include + +/* + * filter structure for altq common classifier + */ +struct acc_filter { + LIST_ENTRY(acc_filter) f_chain; + void *f_class; /* pointer to the class */ + u_long f_handle; /* filter id */ + u_int32_t f_fbmask; /* filter bitmask */ + struct flow_filter f_filter; /* filter value */ +}; + +/* + * XXX ACC_FILTER_TABLESIZE can't be larger than 2048 unless we fix + * the handle assignment. + */ +#define ACC_FILTER_TABLESIZE (256+1) +#define ACC_FILTER_MASK (ACC_FILTER_TABLESIZE - 2) +#define ACC_WILDCARD_INDEX (ACC_FILTER_TABLESIZE - 1) +#ifdef __GNUC__ +#define ACC_GET_HASH_INDEX(addr) \ + ({int x = (addr) + ((addr) >> 16); (x + (x >> 8)) & ACC_FILTER_MASK;}) +#else +#define ACC_GET_HASH_INDEX(addr) \ + (((addr) + ((addr) >> 8) + ((addr) >> 16) + ((addr) >> 24)) \ + & ACC_FILTER_MASK) +#endif +#define ACC_GET_HINDEX(handle) ((handle) >> 20) + +struct acc_classifier { + u_int32_t acc_fbmask; + LIST_HEAD(filt, acc_filter) acc_filters[ACC_FILTER_TABLESIZE]; +}; + +/* + * flowinfo mask bits used by classifier + */ +/* for ipv4 */ +#define FIMB4_PROTO 0x0001 +#define FIMB4_TOS 0x0002 +#define FIMB4_DADDR 0x0004 +#define FIMB4_SADDR 0x0008 +#define FIMB4_DPORT 0x0010 +#define FIMB4_SPORT 0x0020 +#define FIMB4_GPI 0x0040 +#define FIMB4_ALL 0x007f +/* for ipv6 */ +#define FIMB6_PROTO 0x0100 +#define FIMB6_TCLASS 0x0200 +#define FIMB6_DADDR 0x0400 +#define FIMB6_SADDR 0x0800 +#define FIMB6_DPORT 0x1000 +#define FIMB6_SPORT 0x2000 +#define FIMB6_GPI 0x4000 +#define FIMB6_FLABEL 0x8000 +#define FIMB6_ALL 0xff00 + +#define FIMB_ALL (FIMB4_ALL|FIMB6_ALL) + +#define FIMB4_PORTS (FIMB4_DPORT|FIMB4_SPORT|FIMB4_GPI) +#define FIMB6_PORTS (FIMB6_DPORT|FIMB6_SPORT|FIMB6_GPI) + +/* + * machine dependent clock + * a 64bit high resolution time counter. 
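+ * e.g. on a hypothetical 400MHz pentium, machclk_freq is 400000000 and, + * with hz = 100, machclk_per_tick is 4000000; read_machclk() then counts + * in units of 2.5ns.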
+ */ +extern u_int32_t machclk_freq; +extern u_int32_t machclk_per_tick; +extern void init_machclk(void); + +#if defined(__i386__) && !defined(ALTQ_NOPCC) +/* for pentium tsc */ +#include + +#define read_machclk() rdtsc() +#ifdef __OpenBSD__ +static __inline u_int64_t +rdtsc(void) +{ + u_int64_t rv; + __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); + return (rv); +} +#endif /* __OpenBSD__ */ + +#elif defined(__alpha__) && !defined(ALTQ_NOPCC) +/* for alpha rpcc */ +extern u_int64_t read_machclk(void); + +#else /* !i386 && !alpha */ +/* emulate 256MHz using microtime() */ +#define MACHCLK_SHIFT 8 +static __inline u_int64_t +read_machclk(void) +{ + struct timeval tv; + microtime(&tv); + return (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + + tv.tv_usec) << MACHCLK_SHIFT); +} +#endif /* !i386 && !alpha */ + +/* + * debug support + */ +#ifdef ALTQ_DEBUG +#ifdef __STDC__ +#define ASSERT(e) ((e) ? (void)0 : altq_assert(__FILE__, __LINE__, #e)) +#else /* PCC */ +#define ASSERT(e) ((e) ? (void)0 : altq_assert(__FILE__, __LINE__, "e")) +#endif +#else +#define ASSERT(e) ((void)0) +#endif + +/* + * misc stuff for compatibility + */ +/* ioctl cmd type */ +#if defined(__FreeBSD__) && (__FreeBSD__ < 3) +typedef int ioctlcmd_t; +#else +typedef u_long ioctlcmd_t; +#endif + +/* + * queue macros: + * the interface of TAILQ_LAST macro changed after the introduction + * of softupdate. redefine it here to make it work with pre-2.2.7. + */ +#undef TAILQ_LAST +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#ifndef TAILQ_EMPTY +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) +#endif +#ifndef TAILQ_FOREACH +#define TAILQ_FOREACH(var, head, field) \ + for (var = TAILQ_FIRST(head); var; var = TAILQ_NEXT(var, field)) +#endif + +/* macro for timeout/untimeout */ +#if (__FreeBSD_version > 300000) || defined(__NetBSD__) +/* use callout */ +#include + +#define CALLOUT_INIT(c) callout_init((c)) +#define CALLOUT_RESET(c,t,f,a) callout_reset((c),(t),(f),(a)) +#define CALLOUT_STOP(c) callout_stop((c)) +#ifndef CALLOUT_INITIALIZER +#define CALLOUT_INITIALIZER { { { NULL } }, 0, NULL, NULL, 0 } +#endif +#else +/* use old-style timeout/untimeout */ +/* dummy callout structure */ +struct callout { + void *c_arg; /* function argument */ + void (*c_func) __P((void *));/* functiuon to call */ +}; +#define CALLOUT_INIT(c) do { bzero((c), sizeof(*(c))); } while (0) +#define CALLOUT_RESET(c,t,f,a) do { (c)->c_arg = (a); \ + (c)->c_func = (f); \ + timeout((f),(a),(t)); } while (0) +#define CALLOUT_STOP(c) untimeout((c)->c_func,(c)->c_arg) +#define CALLOUT_INITIALIZER { NULL, NULL } +#endif +#if !defined(__FreeBSD__) +typedef void (timeout_t)(void *); +#endif + +#define m_pktlen(m) ((m)->m_pkthdr.len) + +struct ifnet; struct mbuf; struct flowinfo; + +void *altq_lookup __P((char *, int)); +int altq_extractflow __P((struct mbuf *, int, struct flowinfo *, u_int32_t)); +int acc_add_filter __P((struct acc_classifier *, struct flow_filter *, + void *, u_long *)); +int acc_delete_filter __P((struct acc_classifier *, u_long)); +int acc_discard_filters __P((struct acc_classifier *, void *, int)); +void *acc_classify __P((void *, struct mbuf *, int)); +u_int8_t read_dsfield __P((struct mbuf *, struct altq_pktattr *)); +void write_dsfield __P((struct mbuf *, struct altq_pktattr *, u_int8_t)); +void altq_assert __P((const char *, int, const char *)); +int tbr_set __P((struct ifaltq *, struct tb_profile *)); +int tbr_get __P((struct ifaltq *, struct tb_profile *)); + +#endif /* 
_KERNEL */ +#endif /* _ALTQ_ALTQ_VAR_H_ */ diff --git a/sys/altq/altq_wfq.c b/sys/altq/altq_wfq.c new file mode 100644 index 000000000000..3ba5a21e6837 --- /dev/null +++ b/sys/altq/altq_wfq.c @@ -0,0 +1,751 @@ +/* $KAME: altq_wfq.c,v 1.7 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * March 27, 1997. Written by Hiroshi Kyusojin of Keio University + * (kyu@mt.cs.keio.ac.jp). + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_WFQ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* +#define WFQ_DEBUG +*/ + +static int wfq_setenable(struct wfq_interface *, int); +static int wfq_ifattach(struct wfq_interface *); +static int wfq_ifdetach(struct wfq_interface *); +static int wfq_ifenqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static u_long wfq_hash(struct flowinfo *, int); +static __inline u_long wfq_hashbydstaddr(struct flowinfo *, int); +static __inline u_long wfq_hashbysrcport(struct flowinfo *, int); +static wfq *wfq_maxqueue(wfq_state_t *); +static struct mbuf *wfq_ifdequeue(struct ifaltq *, int); +static int wfq_getqid(struct wfq_getqid *); +static int wfq_setweight(struct wfq_setweight *); +static int wfq_getstats(struct wfq_getstats *); +static int wfq_config(struct wfq_conf *); +static int wfq_request __P((struct ifaltq *, int, void *)); +static int wfq_flush(struct ifaltq *); +static void *wfq_classify(void *, struct mbuf *, int); + +/* global value : pointer to wfq queue list */ +static wfq_state_t *wfq_list = NULL; + +static int +wfq_setenable(ifacep, flag) + struct wfq_interface *ifacep; + int flag; +{ + wfq_state_t *wfqp; + int error = 0; + + if ((wfqp = altq_lookup(ifacep->wfq_ifacename, ALTQT_WFQ)) == NULL) + return (EBADF); + + switch(flag){ + case ENABLE: + error = altq_enable(wfqp->ifq); + break; + case DISABLE: + error = altq_disable(wfqp->ifq); + break; + } + return error; +} 
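+ +/* + * sketch of the expected user-level sequence (the device path and the + * interface name below are illustrative only): + * fd = open("/dev/altq/wfq", O_RDWR); + * strcpy(iface.wfq_ifacename, "fxp0"); + * ioctl(fd, WFQ_IF_ATTACH, &iface); + * ioctl(fd, WFQ_ENABLE, &iface); + * wfq_ifattach() below allocates DEFAULT_QSIZE queues hashed by dst + * address; WFQ_ENABLE then routes the interface's if_snd through them. + */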
+ + +static int +wfq_ifattach(ifacep) + struct wfq_interface *ifacep; +{ + int error = 0, i; + struct ifnet *ifp; + wfq_state_t *new_wfqp; + wfq *queue; + + if ((ifp = ifunit(ifacep->wfq_ifacename)) == NULL) { +#ifdef WFQ_DEBUG + printf("wfq_ifattach()...no ifp found\n"); +#endif + return (ENXIO); + } + + if (!ALTQ_IS_READY(&ifp->if_snd)) { +#ifdef WFQ_DEBUG + printf("wfq_ifattach()...altq is not ready\n"); +#endif + return (ENXIO); + } + + /* allocate and initialize wfq_state_t */ + MALLOC(new_wfqp, wfq_state_t *, sizeof(wfq_state_t), + M_DEVBUF, M_WAITOK); + if (new_wfqp == NULL) + return (ENOMEM); + bzero(new_wfqp, sizeof(wfq_state_t)); + MALLOC(queue, wfq *, sizeof(wfq) * DEFAULT_QSIZE, + M_DEVBUF, M_WAITOK); + if (queue == NULL) { + FREE(new_wfqp, M_DEVBUF); + return (ENOMEM); + } + bzero(queue, sizeof(wfq) * DEFAULT_QSIZE); + + /* keep the ifq */ + new_wfqp->ifq = &ifp->if_snd; + new_wfqp->nums = DEFAULT_QSIZE; + new_wfqp->hwm = HWM; + new_wfqp->bytes = 0; + new_wfqp->rrp = NULL; + new_wfqp->queue = queue; + new_wfqp->hash_func = wfq_hashbydstaddr; + new_wfqp->fbmask = FIMB4_DADDR; + + for (i = 0; i < new_wfqp->nums; i++, queue++) { + queue->next = queue->prev = NULL; + queue->head = queue->tail = NULL; + queue->bytes = queue->quota = 0; + queue->weight = 100; + } + + /* + * set WFQ to this ifnet structure. + */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_WFQ, new_wfqp, + wfq_ifenqueue, wfq_ifdequeue, wfq_request, + new_wfqp, wfq_classify)) != 0) { + FREE(queue, M_DEVBUF); + FREE(new_wfqp, M_DEVBUF); + return (error); + } + + new_wfqp->next = wfq_list; + wfq_list = new_wfqp; + + return (error); +} + + +static int +wfq_ifdetach(ifacep) + struct wfq_interface *ifacep; +{ + int error = 0; + wfq_state_t *wfqp; + + if ((wfqp = altq_lookup(ifacep->wfq_ifacename, ALTQT_WFQ)) == NULL) + return (EBADF); + + /* free queued mbuf */ + wfq_flush(wfqp->ifq); + + /* remove WFQ from the ifnet structure. 
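+ * (disable comes before detach so the driver stops dequeueing through + * wfq before the discipline hooks are cleared.)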
*/ + (void)altq_disable(wfqp->ifq); + (void)altq_detach(wfqp->ifq); + + /* remove from the wfqstate list */ + if (wfq_list == wfqp) + wfq_list = wfqp->next; + else { + wfq_state_t *wp = wfq_list; + do { + if (wp->next == wfqp) { + wp->next = wfqp->next; + break; + } + } while ((wp = wp->next) != NULL); + } + + /* deallocate wfq_state_t */ + FREE(wfqp->queue, M_DEVBUF); + FREE(wfqp, M_DEVBUF); + return (error); +} + +static int +wfq_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + wfq_state_t *wfqp = (wfq_state_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + wfq_flush(wfqp->ifq); + break; + } + return (0); +} + + +static int +wfq_flush(ifq) + struct ifaltq *ifq; +{ + struct mbuf *mp; + + while ((mp = wfq_ifdequeue(ifq, ALTDQ_REMOVE)) != NULL) + m_freem(mp); + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len = 0; + return 0; +} + +static void * +wfq_classify(clfier, m, af) + void *clfier; + struct mbuf *m; + int af; +{ + wfq_state_t *wfqp = (wfq_state_t *)clfier; + struct flowinfo flow; + + altq_extractflow(m, af, &flow, wfqp->fbmask); + return (&wfqp->queue[(*wfqp->hash_func)(&flow, wfqp->nums)]); +} + +static int +wfq_ifenqueue(ifq, mp, pktattr) + struct ifaltq *ifq; + struct mbuf *mp; + struct altq_pktattr *pktattr; +{ + wfq_state_t *wfqp; + wfq *queue; + int byte, error = 0; + + wfqp = (wfq_state_t *)ifq->altq_disc; + mp->m_nextpkt = NULL; + + /* grab a queue selected by classifier */ + if (pktattr == NULL || (queue = pktattr->pattr_class) == NULL) + queue = &wfqp->queue[0]; + + if (queue->tail == NULL) + queue->head = mp; + else + queue->tail->m_nextpkt = mp; + queue->tail = mp; + byte = mp->m_pkthdr.len; + queue->bytes += byte; + wfqp->bytes += byte; + ifq->ifq_len++; + + if (queue->next == NULL) { + /* this queue gets active. add the queue to the active list */ + if (wfqp->rrp == NULL){ + /* no queue in the active list */ + queue->next = queue->prev = queue; + wfqp->rrp = queue; + WFQ_ADDQUOTA(queue); + } else { + /* insert the queue at the tail of the active list */ + queue->prev = wfqp->rrp->prev; + wfqp->rrp->prev->next = queue; + wfqp->rrp->prev = queue; + queue->next = wfqp->rrp; + queue->quota = 0; + } + } + + /* check overflow. if the total size exceeds the high water mark, + drop packets from the longest queue. */ + while (wfqp->bytes > wfqp->hwm) { + wfq *drop_queue = wfq_maxqueue(wfqp); + + /* drop the packet at the head. 
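+ * wfq_maxqueue() picks the victim with the largest bytes * 100 / weight, + * so the flow most over its weighted share is the first to lose packets.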
*/ + mp = drop_queue->head; + if ((drop_queue->head = mp->m_nextpkt) == NULL) + drop_queue->tail = NULL; + mp->m_nextpkt = NULL; + byte = mp->m_pkthdr.len; + drop_queue->bytes -= byte; + PKTCNTR_ADD(&drop_queue->drop_cnt, byte); + wfqp->bytes -= byte; + m_freem(mp); + ifq->ifq_len--; + if(drop_queue == queue) + /* the queue for this flow is selected to drop */ + error = ENOBUFS; + } + return error; +} + + +static u_long wfq_hash(flow, n) + struct flowinfo *flow; + int n; +{ + u_long val = 0; + + if (flow != NULL) { + if (flow->fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)flow; + u_long val2; + + val = fp->fi_dst.s_addr ^ fp->fi_src.s_addr; + val = val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24); + val2 = fp->fi_dport ^ fp->fi_sport ^ fp->fi_proto; + val2 = val2 ^ (val2 >> 8); + val = val ^ val2; + } +#ifdef INET6 + else if (flow->fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)flow; + + val = ntohl(fp6->fi6_flowlabel); + } +#endif + } + + return (val % n); +} + + +static __inline u_long wfq_hashbydstaddr(flow, n) + struct flowinfo *flow; + int n; +{ + u_long val = 0; + + if (flow != NULL) { + if (flow->fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)flow; + + val = fp->fi_dst.s_addr; + val = val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24); + } +#ifdef INET6 + else if (flow->fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)flow; + + val = ntohl(fp6->fi6_flowlabel); + } +#endif + } + + return (val % n); +} + +static __inline u_long wfq_hashbysrcport(flow, n) + struct flowinfo *flow; + int n; +{ + u_long val = 0; + + if (flow != NULL) { + if (flow->fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)flow; + + val = fp->fi_sport; + } +#ifdef INET6 + else if (flow->fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)flow; + + val = fp6->fi6_sport; + } +#endif + } + val = val ^ (val >> 8); + + return (val % n); +} + +static wfq *wfq_maxqueue(wfqp) + wfq_state_t *wfqp; +{ + int byte, max_byte = 0; + wfq *queue, *max_queue = NULL; + + if((queue = wfqp->rrp) == NULL) + /* never happens */ + return NULL; + do{ + if ((byte = queue->bytes * 100 / queue->weight) > max_byte) { + max_queue = queue; + max_byte = byte; + } + } while ((queue = queue->next) != wfqp->rrp); + + return max_queue; +} + + +static struct mbuf * +wfq_ifdequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + wfq_state_t *wfqp; + wfq *queue; + struct mbuf *mp; + int byte; + + wfqp = (wfq_state_t *)ifq->altq_disc; + + if ((wfqp->bytes == 0) || ((queue = wfqp->rrp) == NULL)) + /* no packet in the queues */ + return NULL; + + while (1) { + if (queue->quota > 0) { + if (queue->bytes <= 0) { + /* this queue no longer has packet. + remove the queue from the active list. */ + if (queue->next == queue){ + /* no other active queue + -- this case never happens in + this algorithm. 
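+ * (wfqp->bytes was checked to be non-zero on entry and equals the sum + * over the active queues, so a sole remaining active queue cannot be + * empty.)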
*/ + queue->next = queue->prev = NULL; + wfqp->rrp = NULL; + return NULL; + } else { + queue->prev->next = queue->next; + queue->next->prev = queue->prev; + /* the round-robin pointer points + to this queue, advance the rrp */ + wfqp->rrp = queue->next; + queue->next = queue->prev = NULL; + queue = wfqp->rrp; + WFQ_ADDQUOTA(queue); + continue; + } + } + + /* dequeue a packet from this queue */ + mp = queue->head; + if (op == ALTDQ_REMOVE) { + if((queue->head = mp->m_nextpkt) == NULL) + queue->tail = NULL; + byte = mp->m_pkthdr.len; + mp->m_nextpkt = NULL; + queue->quota -= byte; + queue->bytes -= byte; + PKTCNTR_ADD(&queue->xmit_cnt, byte); + wfqp->bytes -= byte; + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len--; + } + return mp; + + /* if the queue gets empty by this dequeueing, + the queue will be removed from the active list + at the next round */ + } + + /* advance the round-robin pointer */ + queue = wfqp->rrp = queue->next; + WFQ_ADDQUOTA(queue); + } +} + +static int +wfq_getqid(gqidp) + struct wfq_getqid *gqidp; +{ + wfq_state_t *wfqp; + + if ((wfqp = altq_lookup(gqidp->iface.wfq_ifacename, ALTQT_WFQ)) + == NULL) + return (EBADF); + + gqidp->qid = (*wfqp->hash_func)(&gqidp->flow, wfqp->nums); + return 0; +} + +static int +wfq_setweight(swp) + struct wfq_setweight *swp; +{ + wfq_state_t *wfqp; + wfq *queue; + int old; + + if (swp->weight < 0) { + printf("set weight in natural number\n"); + return (EINVAL); + } + + if ((wfqp = altq_lookup(swp->iface.wfq_ifacename, ALTQT_WFQ)) == NULL) + return (EBADF); + + queue = &wfqp->queue[swp->qid]; + old = queue->weight; + queue->weight = swp->weight; + swp->weight = old; + return 0; +} + + +static int +wfq_getstats(gsp) + struct wfq_getstats *gsp; +{ + wfq_state_t *wfqp; + wfq *queue; + queue_stats *stats; + + if ((wfqp = altq_lookup(gsp->iface.wfq_ifacename, ALTQT_WFQ)) == NULL) + return (EBADF); + + if (gsp->qid >= wfqp->nums) + return (EINVAL); + + queue = &wfqp->queue[gsp->qid]; + stats = &gsp->stats; + + stats->bytes = queue->bytes; + stats->weight = queue->weight; + stats->xmit_cnt = queue->xmit_cnt; + stats->drop_cnt = queue->drop_cnt; + + return 0; +} + + +static int +wfq_config(cf) + struct wfq_conf *cf; +{ + wfq_state_t *wfqp; + wfq *queue; + int i, error = 0; + + if ((wfqp = altq_lookup(cf->iface.wfq_ifacename, ALTQT_WFQ)) == NULL) + return (EBADF); + + if(cf->nqueues <= 0 || MAX_QSIZE < cf->nqueues) + cf->nqueues = DEFAULT_QSIZE; + + if (cf->nqueues != wfqp->nums) { + /* free queued mbuf */ + wfq_flush(wfqp->ifq); + FREE(wfqp->queue, M_DEVBUF); + + MALLOC(queue, wfq *, sizeof(wfq) * cf->nqueues, + M_DEVBUF, M_WAITOK); + if (queue == NULL) + return (ENOMEM); + bzero(queue, sizeof(wfq) * cf->nqueues); + + wfqp->nums = cf->nqueues; + wfqp->bytes = 0; + wfqp->rrp = NULL; + wfqp->queue = queue; + for (i = 0; i < wfqp->nums; i++, queue++) { + queue->next = queue->prev = NULL; + queue->head = queue->tail = NULL; + queue->bytes = queue->quota = 0; + queue->weight = 100; + } + } + + if (cf->qlimit != 0) + wfqp->hwm = cf->qlimit; + + switch (cf->hash_policy) { + case WFQ_HASH_DSTADDR: + wfqp->hash_func = wfq_hashbydstaddr; + wfqp->fbmask = FIMB4_DADDR; +#ifdef INET6 + wfqp->fbmask |= FIMB6_FLABEL; /* use flowlabel for ipv6 */ +#endif + break; + case WFQ_HASH_SRCPORT: + wfqp->hash_func = wfq_hashbysrcport; + wfqp->fbmask = FIMB4_SPORT; +#ifdef INET6 + wfqp->fbmask |= FIMB6_SPORT; +#endif + break; + case WFQ_HASH_FULL: + wfqp->hash_func = wfq_hash; + wfqp->fbmask = FIMB4_ALL; +#ifdef INET6 + wfqp->fbmask |= FIMB6_FLABEL; /* use flowlabel for ipv6 */ 
+#endif + break; + default: + error = EINVAL; + break; + } + return error; +} + +/* + * wfq device interface + */ + +altqdev_decl(wfq); + +int +wfqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + return 0; +} + +int +wfqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; + struct proc *p; +{ + struct ifnet *ifp; + struct wfq_interface iface; + wfq_state_t *wfqp; + int s; + + s = splimp(); + while ((wfqp = wfq_list) != NULL) { + ifp = wfqp->ifq->altq_ifp; +#if defined(__NetBSD__) || defined(__OpenBSD__) + sprintf(iface.wfq_ifacename, "%s", ifp->if_xname); +#else + sprintf(iface.wfq_ifacename, "%s%d", + ifp->if_name, ifp->if_unit); +#endif + wfq_ifdetach(&iface); + } + splx(s); + return 0; +} + +int +wfqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + int error = 0; + int s; + + /* check cmd for superuser only */ + switch (cmd) { + case WFQ_GET_QID: + case WFQ_GET_STATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + s = splimp(); + switch (cmd) { + + case WFQ_ENABLE: + error = wfq_setenable((struct wfq_interface *)addr, ENABLE); + break; + + case WFQ_DISABLE: + error = wfq_setenable((struct wfq_interface *)addr, DISABLE); + break; + + case WFQ_IF_ATTACH: + error = wfq_ifattach((struct wfq_interface *)addr); + break; + + case WFQ_IF_DETACH: + error = wfq_ifdetach((struct wfq_interface *)addr); + break; + + case WFQ_GET_QID: + error = wfq_getqid((struct wfq_getqid *)addr); + break; + + case WFQ_SET_WEIGHT: + error = wfq_setweight((struct wfq_setweight *)addr); + break; + + case WFQ_GET_STATS: + error = wfq_getstats((struct wfq_getstats *)addr); + break; + + case WFQ_CONFIG: + error = wfq_config((struct wfq_conf *)addr); + break; + + default: + error = EINVAL; + break; + } + splx(s); + return error; +} + +#ifdef KLD_MODULE + +static struct altqsw wfq_sw = + {"wfq", wfqopen, wfqclose, wfqioctl}; + +ALTQ_MODULE(altq_wfq, ALTQT_WFQ, &wfq_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ_WFQ */ diff --git a/sys/altq/altq_wfq.h b/sys/altq/altq_wfq.h new file mode 100644 index 000000000000..f8cc3da033b1 --- /dev/null +++ b/sys/altq/altq_wfq.h @@ -0,0 +1,124 @@ +/* $KAME: altq_wfq.h,v 1.5 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1997-2000 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
+ * IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * March 27, 1997.  Written by Hiroshi Kyusojin of Keio University
+ * (kyu@mt.cs.keio.ac.jp).
+ */
+
+#ifndef _ALTQ_ALTQ_WFQ_H_
+#define _ALTQ_ALTQ_WFQ_H_
+
+#include <altq/altq.h>
+
+#define	DEFAULT_QSIZE	256
+#define	MAX_QSIZE	2048
+
+struct wfq_interface {
+	char	wfq_ifacename[IFNAMSIZ];
+};
+
+struct wfq_getqid {
+	struct wfq_interface	iface;
+	struct flowinfo		flow;
+	u_long			qid;
+};
+
+struct wfq_setweight {
+	struct wfq_interface	iface;
+	int			qid;
+	int			weight;
+};
+
+typedef struct each_queue_stats {
+	int		bytes;		/* bytes in this queue */
+	int		weight;		/* weight in percent */
+	struct pktcntr	xmit_cnt;
+	struct pktcntr	drop_cnt;
+} queue_stats;
+
+struct wfq_getstats {
+	struct wfq_interface	iface;
+	int			qid;
+	queue_stats		stats;
+};
+
+struct wfq_conf {
+	struct wfq_interface	iface;
+	int			hash_policy;	/* hash policy */
+	int			nqueues;	/* number of queues */
+	int			qlimit;		/* queue size in bytes */
+};
+
+#define	WFQ_HASH_DSTADDR	0	/* hash by dst address */
+#define	WFQ_HASH_SRCPORT	1	/* hash by src port */
+#define	WFQ_HASH_FULL		2	/* hash by all fields */
+
+#define	WFQ_IF_ATTACH	_IOW('Q', 1, struct wfq_interface)
+#define	WFQ_IF_DETACH	_IOW('Q', 2, struct wfq_interface)
+#define	WFQ_ENABLE	_IOW('Q', 3, struct wfq_interface)
+#define	WFQ_DISABLE	_IOW('Q', 4, struct wfq_interface)
+#define	WFQ_CONFIG	_IOWR('Q', 6, struct wfq_conf)
+#define	WFQ_GET_STATS	_IOWR('Q', 12, struct wfq_getstats)
+#define	WFQ_GET_QID	_IOWR('Q', 30, struct wfq_getqid)
+#define	WFQ_SET_WEIGHT	_IOWR('Q', 31, struct wfq_setweight)
+
+#ifdef _KERNEL
+
+#define	HWM		(64 * 1024)
+#define	WFQ_QUOTA	512	/* quota bytes to send at a time */
+#define	WFQ_ADDQUOTA(q)	((q)->quota += WFQ_QUOTA * (q)->weight / 100)
+#define	ENABLE		0
+#define	DISABLE		1
+
+typedef struct weighted_fair_queue {
+	struct weighted_fair_queue *next, *prev;
+	struct mbuf	*head, *tail;
+	int		bytes;		/* bytes in this queue */
+	int		quota;		/* bytes sent in this round */
+	int		weight;		/* weight in percent */
+
+	struct pktcntr	xmit_cnt;
+	struct pktcntr	drop_cnt;
+} wfq;
+
+
+typedef struct wfqstate {
+	struct wfqstate *next;		/* for wfqstate list */
+	struct ifaltq	*ifq;
+	int		nums;		/* number of queues */
+	int		hwm;		/* high water mark */
+	int		bytes;		/* total bytes in all the queues */
+	wfq		*rrp;		/* round robin pointer */
+	wfq		*queue;		/* pointer to queue list */
+	u_long		(*hash_func)(struct flowinfo *, int);
+	u_int32_t	fbmask;		/* filter bitmask */
+} wfq_state_t;
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_ALTQ_WFQ_H_ */
diff --git a/sys/altq/if_altq.h b/sys/altq/if_altq.h
new file mode 100644
index 000000000000..659dad1a91ce
--- /dev/null
+++ b/sys/altq/if_altq.h
@@ -0,0 +1,165 @@
+/* $KAME: if_altq.h,v 1.5 2000/12/14 08:12:47 thorpej Exp $ */
+
+/*
+ * Copyright (C) 1997-2000
+ *	Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_IF_ALTQ_H_
+#define _ALTQ_IF_ALTQ_H_
+
+#ifdef KERNEL
+#ifndef _KERNEL
+#define _KERNEL
+#endif
+#endif
+
+struct altq_pktattr; struct tb_regulator; struct top_cdnr;
+
+/*
+ * Structure defining a queue for a network interface.
+ */
+struct ifaltq {
+	/* fields compatible with struct ifqueue */
+	struct mbuf *ifq_head;
+	struct mbuf *ifq_tail;
+	int	ifq_len;
+	int	ifq_maxlen;
+	int	ifq_drops;
+
+	/* alternate queueing related fields */
+	int	altq_type;		/* discipline type */
+	int	altq_flags;		/* flags (e.g. ready, in-use) */
+	void	*altq_disc;		/* for discipline-specific use */
+	struct	ifnet *altq_ifp;	/* back pointer to interface */
+
+	int	(*altq_enqueue) __P((struct ifaltq *ifq, struct mbuf *m,
+				     struct altq_pktattr *));
+	struct	mbuf *(*altq_dequeue) __P((struct ifaltq *ifq, int remove));
+	int	(*altq_request) __P((struct ifaltq *ifq, int req, void *arg));
+
+	/* classifier fields */
+	void	*altq_clfier;		/* classifier-specific use */
+	void	*(*altq_classify) __P((void *, struct mbuf *, int));
+
+	/* token bucket regulator */
+	struct	tb_regulator *altq_tbr;
+
+	/* input traffic conditioner (doesn't belong to the output queue...) */
+	struct	top_cdnr *altq_cdnr;
+};
+
+
+#ifdef _KERNEL
+
+/*
+ * packet attributes used by queueing disciplines.
+ * pattr_class is a discipline-dependent scheduling class that is
+ * set by a classifier.
+ * pattr_hdr and pattr_af may be used by a discipline to access
+ * the header within an mbuf.  (e.g. ECN needs to update the CE bit)
+ * note that pattr_hdr could be stale after m_pullup, though link
+ * layer output routines usually don't use m_pullup.  link-level
+ * compression also invalidates these fields.  thus, pattr_hdr needs
+ * to be verified when a discipline touches the header.
+ */
+struct altq_pktattr {
+	void	*pattr_class;		/* sched class set by classifier */
+	int	pattr_af;		/* address family */
+	caddr_t	pattr_hdr;		/* saved header position in mbuf */
+};
+
+/*
+ * a token-bucket regulator limits the rate that a network driver can
+ * dequeue packets from the output queue.
+ * modern cards are able to buffer a large number of packets and dequeue
+ * too many packets at a time.  this bursty dequeue behavior makes it
+ * impossible for queueing disciplines to schedule packets.
+ * a token-bucket is used to control the burst size in a device
+ * independent manner.
+ */
+struct tb_regulator {
+	int64_t		tbr_rate;	/* (scaled) token bucket rate */
+	int64_t		tbr_depth;	/* (scaled) token bucket depth */
+
+	int64_t		tbr_token;	/* (scaled) current token */
+	int64_t		tbr_filluptime;	/* (scaled) time to fill up bucket */
+	u_int64_t	tbr_last;	/* last time token was updated */
+
+	int		tbr_lastop;	/* last dequeue operation type;
+					   needed for poll-and-dequeue */
+};
+
+/* if_altqflags */
+#define	ALTQF_READY	0x01	/* driver supports alternate queueing */
+#define	ALTQF_ENABLED	0x02	/* altq is in use */
+#define	ALTQF_CLASSIFY	0x04	/* classify packets */
+#define	ALTQF_CNDTNING	0x08	/* altq traffic conditioning is enabled */
+#define	ALTQF_DRIVER1	0x40	/* driver specific */
+
+/* if_altqflags set internally only: */
+#define	ALTQF_CANTCHANGE	(ALTQF_READY)
+
+/* altq_dequeue 2nd arg */
+#define	ALTDQ_REMOVE	1	/* dequeue mbuf from the queue */
+#define	ALTDQ_POLL	2	/* don't dequeue mbuf from the queue */
+
+/* altq request types (currently only purge is defined) */
+#define	ALTRQ_PURGE	1	/* purge all packets */
+
+#define	ALTQ_IS_READY(ifq)		((ifq)->altq_flags & ALTQF_READY)
+#define	ALTQ_IS_ENABLED(ifq)		((ifq)->altq_flags & ALTQF_ENABLED)
+#define	ALTQ_NEEDS_CLASSIFY(ifq)	((ifq)->altq_flags & ALTQF_CLASSIFY)
+#define	ALTQ_IS_CNDTNING(ifq)		((ifq)->altq_flags & ALTQF_CNDTNING)
+
+#define	ALTQ_SET_CNDTNING(ifq)		((ifq)->altq_flags |= ALTQF_CNDTNING)
+#define	ALTQ_CLEAR_CNDTNING(ifq)	((ifq)->altq_flags &= ~ALTQF_CNDTNING)
+#define	ALTQ_IS_ATTACHED(ifq)		((ifq)->altq_disc != NULL)
+
+#define	ALTQ_ENQUEUE(ifq, m, pa, err)					\
+	(err) = (*(ifq)->altq_enqueue)((ifq),(m),(pa))
+#define	ALTQ_DEQUEUE(ifq, m)						\
+	(m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_REMOVE)
+#define	ALTQ_POLL(ifq, m)						\
+	(m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_POLL)
+#define	ALTQ_PURGE(ifq)							\
+	(void)(*(ifq)->altq_request)((ifq), ALTRQ_PURGE, (void *)0)
+#define	ALTQ_IS_EMPTY(ifq)		((ifq)->ifq_len == 0)
+#define	TBR_IS_ENABLED(ifq)		((ifq)->altq_tbr != NULL)
+
+extern int	altq_attach __P((struct ifaltq *, int, void *,
+				 int (*)(struct ifaltq *, struct mbuf *,
+					 struct altq_pktattr *),
+				 struct mbuf *(*)(struct ifaltq *, int),
+				 int (*)(struct ifaltq *, int, void *),
+				 void *,
+				 void *(*)(void *, struct mbuf *, int)));
+extern int	altq_detach __P((struct ifaltq *));
+extern int	altq_enable __P((struct ifaltq *));
+extern int	altq_disable __P((struct ifaltq *));
+extern struct	mbuf *tbr_dequeue __P((struct ifaltq *, int));
+extern int	(*altq_input) __P((struct mbuf *, int));
+
+#endif /* _KERNEL */
+
+#endif /* _ALTQ_IF_ALTQ_H_ */
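Editor's note: the macros and the token-bucket regulator above are built
around a poll-and-dequeue idiom: a driver peeks at the head packet with
ALTDQ_POLL, commits transmit resources, and only then removes the same
packet with ALTDQ_REMOVE.  The sketch below is illustrative only and is
not part of this import; xxstart, xx_txready and xx_transmit are
hypothetical driver routines, and it assumes if_snd is a struct ifaltq
when ALTQ is configured.

/*
 * Illustrative sketch only -- not from this import.
 * xxstart/xx_txready/xx_transmit are hypothetical driver routines.
 */
static void
xxstart(ifp)
	struct ifnet *ifp;
{
	struct ifaltq *ifq = &ifp->if_snd;	/* ALTQ-ified send queue */
	struct mbuf *m;

	while (!ALTQ_IS_EMPTY(ifq)) {
		/* peek at the head packet; it stays on the queue */
		if (TBR_IS_ENABLED(ifq))
			m = tbr_dequeue(ifq, ALTDQ_POLL);
		else
			ALTQ_POLL(ifq, m);
		if (m == NULL)
			break;		/* regulator is out of tokens */
		if (!xx_txready(ifp, m))
			break;		/* no descriptors; retry on tx intr */
		/* commit: remove the same packet and hand it to the chip */
		if (TBR_IS_ENABLED(ifq))
			m = tbr_dequeue(ifq, ALTDQ_REMOVE);
		else
			ALTQ_DEQUEUE(ifq, m);
		xx_transmit(ifp, m);
	}
}

The tbr_lastop field exists for exactly this pattern: it lets the
regulator recognize an ALTDQ_REMOVE that commits a preceding
ALTDQ_POLL, so the packet is not charged against the bucket twice.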