/* $NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $ */ /*- * Copyright (c)2005 YAMAMOTO Takashi, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __KERNEL_RCSID(0, "$NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $"); #include #include #include #include #include #include /* hz */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void lacp_fill_actorinfo(struct agr_port *, struct lacp_peerinfo *); static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *); static void lacp_suppress_distributing(struct lacp_softc *, struct lacp_aggregator *); static void lacp_transit_expire(void *); static void lacp_select_active_aggregator(struct lacp_softc *); static uint16_t lacp_compose_key(struct lacp_port *); /* * actor system priority and port priority. * XXX should be configurable. */ #define LACP_SYSTEM_PRIO 0x8000 #define LACP_PORT_PRIO 0x8000 static const struct tlv_template lacp_info_tlv_template[] = { { LACP_TYPE_ACTORINFO, sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, { LACP_TYPE_PARTNERINFO, sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, { LACP_TYPE_COLLECTORINFO, sizeof(struct tlvhdr) + sizeof(struct lacp_collectorinfo) }, { 0, 0 }, }; /* * ieee8023ad_lacp_input: process lacpdu * * => called from ether_input. (ie. at IPL_NET) * * XXX is it better to defer processing to lower IPL? * XXX anyway input rate should be very low... */ int ieee8023ad_lacp_input(struct ifnet *ifp, struct mbuf *m) { struct lacpdu *du; struct agr_softc *sc; struct agr_port *port; struct lacp_port *lp; int error = 0; int s; port = ifp->if_agrprivate; /* XXX race with agr_remport. */ if (__predict_false(port->port_flags & AGRPORT_DETACHING)) { goto bad; } sc = AGR_SC_FROM_PORT(port); KASSERT(port); if (m->m_pkthdr.len != sizeof(*du)) { goto bad; } if ((m->m_flags & M_MCAST) == 0) { goto bad; } if (m->m_len < sizeof(*du)) { m = m_pullup(m, sizeof(*du)); if (m == NULL) { return ENOMEM; } } du = mtod(m, struct lacpdu *); if (memcmp(&du->ldu_eh.ether_dhost, ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) { goto bad; } KASSERT(du->ldu_sph.sph_subtype == SLOWPROTOCOLS_SUBTYPE_LACP); /* * ignore the version for compatibility with * the future protocol revisions. */ #if 0 if (du->ldu_sph.sph_version != 1) { goto bad; } #endif /* * ignore tlv types for compatibility with * the future protocol revisions. */ if (tlv_check(du, sizeof(*du), &du->ldu_tlv_actor, lacp_info_tlv_template, FALSE)) { goto bad; } s = AGR_LOCK(sc); lp = LACP_PORT(port); #if defined(LACP_DEBUG) if (lacpdebug) { LACP_DPRINTF((lp, "lacpdu receive\n")); lacp_dump_lacpdu(du); } #endif /* defined(LACP_DEBUG) */ lacp_sm_rx(lp, du); AGR_UNLOCK(sc, s); m_freem(m); return error; bad: m_freem(m); return EINVAL; } static void lacp_fill_actorinfo(struct agr_port *port, struct lacp_peerinfo *info) { struct lacp_port *lp = LACP_PORT(port); info->lip_systemid.lsi_prio = htobe16(LACP_SYSTEM_PRIO); memcpy(&info->lip_systemid.lsi_mac, LLADDR(port->port_ifp->if_sadl), ETHER_ADDR_LEN); info->lip_portid.lpi_prio = htobe16(LACP_PORT_PRIO); info->lip_portid.lpi_portno = htobe16(port->port_ifp->if_index); info->lip_state = lp->lp_state; } int lacp_xmit_lacpdu(struct lacp_port *lp) { struct agr_port *port = lp->lp_agrport; struct mbuf *m; struct lacpdu *du; int error; KDASSERT(MHLEN >= sizeof(*du)); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) { return ENOMEM; } m->m_len = m->m_pkthdr.len = sizeof(*du); du = mtod(m, struct lacpdu *); memset(du, 0, sizeof(*du)); memcpy(&du->ldu_eh.ether_dhost, ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN); memcpy(&du->ldu_eh.ether_shost, &port->port_origlladdr, ETHER_ADDR_LEN); du->ldu_eh.ether_type = htobe16(ETHERTYPE_SLOWPROTOCOLS); du->ldu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_LACP; du->ldu_sph.sph_version = 1; TLV_SET(&du->ldu_tlv_actor, LACP_TYPE_ACTORINFO, sizeof(du->ldu_actor)); du->ldu_actor = lp->lp_actor; TLV_SET(&du->ldu_tlv_partner, LACP_TYPE_PARTNERINFO, sizeof(du->ldu_partner)); du->ldu_partner = lp->lp_partner; TLV_SET(&du->ldu_tlv_collector, LACP_TYPE_COLLECTORINFO, sizeof(du->ldu_collector)); du->ldu_collector.lci_maxdelay = 0; #if defined(LACP_DEBUG) if (lacpdebug) { LACP_DPRINTF((lp, "lacpdu transmit\n")); lacp_dump_lacpdu(du); } #endif /* defined(LACP_DEBUG) */ m->m_flags |= M_MCAST; /* * XXX should use higher priority queue. * otherwise network congestion can break aggregation. */ error = agr_xmit_frame(port->port_ifp, m); return error; } void ieee8023ad_lacp_portstate(struct agr_port *port) { struct lacp_port *lp = LACP_PORT(port); u_int media = port->port_media; uint8_t old_state; uint16_t old_key; AGR_ASSERT_LOCKED(AGR_SC_FROM_PORT(port)); LACP_DPRINTF((lp, "media changed 0x%x -> 0x%x\n", lp->lp_media, media)); old_state = lp->lp_state; old_key = lp->lp_key; lp->lp_media = media; if ((media & IFM_HDX) != 0) { lp->lp_state &= ~LACP_STATE_AGGREGATION; } else { lp->lp_state |= LACP_STATE_AGGREGATION; } lp->lp_key = lacp_compose_key(lp); if (old_state != lp->lp_state || old_key != lp->lp_key) { LACP_DPRINTF((lp, "-> UNSELECTED\n")); lp->lp_selected = LACP_UNSELECTED; } } void ieee8023ad_lacp_porttick(struct agr_softc *sc, struct agr_port *port) { struct lacp_port *lp = LACP_PORT(port); AGR_ASSERT_LOCKED(sc); lacp_run_timers(lp); lacp_select(lp); lacp_sm_mux(lp); lacp_sm_tx(lp); lacp_sm_ptx_tx_schedule(lp); } void lacp_portinit(struct agr_port *port) { struct lacp_port *lp = LACP_PORT(port); boolean_t active = TRUE; /* XXX should be configurable */ boolean_t fast = FALSE; /* XXX should be configurable */ lp->lp_agrport = port; lacp_fill_actorinfo(port, &lp->lp_actor); lp->lp_state = (active ? LACP_STATE_ACTIVITY : 0) | (fast ? LACP_STATE_TIMEOUT : 0); lp->lp_aggregator = NULL; lp->lp_media = port->port_media; /* XXX */ lp->lp_key = lacp_compose_key(lp); lacp_sm_rx_set_expired(lp); } void lacp_portfini(struct agr_port *port) { struct lacp_port *lp = LACP_PORT(port); struct lacp_aggregator *la = lp->lp_aggregator; int i; LACP_DPRINTF((lp, "portfini\n")); for (i = 0; i < LACP_NTIMER; i++) { LACP_TIMER_DISARM(lp, i); } if (la == NULL) { return; } lacp_disable_distributing(lp); lacp_unselect(lp); } /* -------------------- */ void lacp_disable_collecting(struct lacp_port *lp) { struct agr_port *port = lp->lp_agrport; lp->lp_state &= ~LACP_STATE_COLLECTING; port->port_flags &= ~AGRPORT_COLLECTING; } void lacp_enable_collecting(struct lacp_port *lp) { struct agr_port *port = lp->lp_agrport; lp->lp_state |= LACP_STATE_COLLECTING; port->port_flags |= AGRPORT_COLLECTING; } void lacp_disable_distributing(struct lacp_port *lp) { struct agr_port *port = lp->lp_agrport; struct lacp_aggregator *la = lp->lp_aggregator; struct lacp_softc *lsc = LACP_SOFTC(AGR_SC_FROM_PORT(port)); #if defined(LACP_DEBUG) char buf[LACP_LAGIDSTR_MAX+1]; #endif /* defined(LACP_DEBUG) */ if ((lp->lp_state & LACP_STATE_DISTRIBUTING) == 0) { return; } KASSERT(la); KASSERT(!TAILQ_EMPTY(&la->la_ports)); KASSERT(la->la_nports > 0); KASSERT(la->la_refcnt >= la->la_nports); LACP_DPRINTF((lp, "disable distributing on aggregator %s, " "nports %d -> %d\n", lacp_format_lagid_aggregator(la, buf, sizeof(buf)), la->la_nports, la->la_nports - 1)); TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q); la->la_nports--; lacp_suppress_distributing(lsc, la); lp->lp_state &= ~LACP_STATE_DISTRIBUTING; port->port_flags &= ~AGRPORT_DISTRIBUTING; if (lsc->lsc_active_aggregator == la) { lacp_select_active_aggregator(lsc); } } void lacp_enable_distributing(struct lacp_port *lp) { struct agr_port *port = lp->lp_agrport; struct lacp_aggregator *la = lp->lp_aggregator; struct lacp_softc *lsc = LACP_SOFTC(AGR_SC_FROM_PORT(port)); #if defined(LACP_DEBUG) char buf[LACP_LAGIDSTR_MAX+1]; #endif /* defined(LACP_DEBUG) */ if ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0) { return; } KASSERT(la); LACP_DPRINTF((lp, "enable distributing on aggregator %s, " "nports %d -> %d\n", lacp_format_lagid_aggregator(la, buf, sizeof(buf)), la->la_nports, la->la_nports + 1)); KASSERT(la->la_refcnt > la->la_nports); TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q); la->la_nports++; lacp_suppress_distributing(lsc, la); lp->lp_state |= LACP_STATE_DISTRIBUTING; port->port_flags |= AGRPORT_DISTRIBUTING; if (lsc->lsc_active_aggregator != la) { lacp_select_active_aggregator(lsc); } } static void lacp_transit_expire(void *vp) { struct agr_softc *sc = vp; struct lacp_softc *lsc = LACP_SOFTC(sc); int s; s = AGR_LOCK(sc); LACP_DPRINTF((NULL, "%s\n", __func__)); lsc->lsc_suppress_distributing = FALSE; AGR_UNLOCK(sc, s); } /* -------------------- */ /* XXX */ void ieee8023ad_portinit(struct agr_port *port) { struct ieee8023ad_port *iport = IEEE8023AD_PORT(port); memset(iport, 0, sizeof(iport)); lacp_portinit(port); } void ieee8023ad_portfini(struct agr_port *port) { struct agr_softc *sc = AGR_SC_FROM_PORT(port); int s; s = AGR_LOCK(sc); lacp_portfini(port); AGR_UNLOCK(sc, s); } void ieee8023ad_ctor(struct agr_softc *sc) { struct ieee8023ad_softc *isc = IEEE8023AD_SOFTC(sc); struct lacp_softc *lsc = &isc->isc_lacpsc; lsc->lsc_active_aggregator = NULL; TAILQ_INIT(&lsc->lsc_aggregators); callout_init(&lsc->lsc_transit_callout); callout_setfunc(&lsc->lsc_transit_callout, lacp_transit_expire, sc); } void ieee8023ad_dtor(struct agr_softc *sc) { struct ieee8023ad_softc *isc = IEEE8023AD_SOFTC(sc); struct lacp_softc *lsc = &isc->isc_lacpsc; LACP_DPRINTF((NULL, "%s\n", __func__)); callout_stop(&lsc->lsc_transit_callout); KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators)); KASSERT(lsc->lsc_active_aggregator == NULL); } /* -------------------- */ struct agr_port * ieee8023ad_select_tx_port(struct agr_softc *sc, struct mbuf *m) { const struct lacp_softc *lsc = LACP_SOFTC(sc); const struct lacp_aggregator *la; const struct lacp_port *lp; uint32_t hash; int nports; if (__predict_false(lsc->lsc_suppress_distributing && !AGR_ROUNDROBIN(sc))) { LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); sc->sc_if.if_collisions++; /* XXX abuse */ return NULL; } la = lsc->lsc_active_aggregator; if (__predict_false(la == NULL)) { LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); return NULL; } nports = la->la_nports; KASSERT(nports > 0); if (AGR_ROUNDROBIN(sc)) { /* packet ordering rule violation */ hash = sc->sc_rr_counter++; } else { hash = (*sc->sc_iftop->iftop_hashmbuf)(sc, m); } hash %= nports; lp = TAILQ_FIRST(&la->la_ports); KASSERT(lp != NULL); while (hash--) { lp = TAILQ_NEXT(lp, lp_dist_q); KASSERT(lp != NULL); } KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0); return lp->lp_agrport; } /* * lacp_suppress_distributing: drop transmit packets for a while * to preserve packet ordering. */ static void lacp_suppress_distributing(struct lacp_softc *lsc, struct lacp_aggregator *la) { if (lsc->lsc_active_aggregator != la) { return; } LACP_DPRINTF((NULL, "%s\n", __func__)); lsc->lsc_suppress_distributing = TRUE; /* XXX should consider collector max delay */ callout_schedule(&lsc->lsc_transit_callout, LACP_TRANSIT_DELAY * hz / 1000); } /* -------------------- */ int lacp_compare_peerinfo(const struct lacp_peerinfo *a, const struct lacp_peerinfo *b) { return memcmp(a, b, offsetof(struct lacp_peerinfo, lip_state)); } int lacp_compare_systemid(const struct lacp_systemid *a, const struct lacp_systemid *b) { return memcmp(a, b, sizeof(*a)); } int lacp_compare_portid(const struct lacp_portid *a, const struct lacp_portid *b) { return memcmp(a, b, sizeof(*a)); } /* -------------------- */ static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *la) { struct lacp_port *lp; uint64_t speed; lp = TAILQ_FIRST(&la->la_ports); if (lp == NULL) { return 0; } speed = ifmedia_baudrate(lp->lp_media); speed *= la->la_nports; if (speed == 0) { LACP_DPRINTF((lp, "speed 0? media=0x%x nports=%d\n", lp->lp_media, la->la_nports)); } return speed; } /* * lacp_select_active_aggregator: select an aggregator to be used to transmit * packets from agr(4) interface. */ static void lacp_select_active_aggregator(struct lacp_softc *lsc) { struct lacp_aggregator *la; struct lacp_aggregator *best_la = NULL; uint64_t best_speed = 0; #if defined(LACP_DEBUG) char buf[LACP_LAGIDSTR_MAX+1]; #endif /* defined(LACP_DEBUG) */ LACP_DPRINTF((NULL, "%s:\n", __func__)); TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { uint64_t speed; if (la->la_nports == 0) { continue; } speed = lacp_aggregator_bandwidth(la); LACP_DPRINTF((NULL, "%s, speed=%" PRIu64 ", nports=%d\n", lacp_format_lagid_aggregator(la, buf, sizeof(buf)), speed, la->la_nports)); if (speed > best_speed || (speed == best_speed && la == lsc->lsc_active_aggregator)) { best_la = la; best_speed = speed; } } KASSERT(best_la == NULL || best_la->la_nports > 0); KASSERT(best_la == NULL || !TAILQ_EMPTY(&best_la->la_ports)); #if defined(LACP_DEBUG) if (lsc->lsc_active_aggregator != best_la) { LACP_DPRINTF((NULL, "active aggregator changed\n")); LACP_DPRINTF((NULL, "old %s\n", lacp_format_lagid_aggregator(lsc->lsc_active_aggregator, buf, sizeof(buf)))); } else { LACP_DPRINTF((NULL, "active aggregator not changed\n")); } LACP_DPRINTF((NULL, "new %s\n", lacp_format_lagid_aggregator(best_la, buf, sizeof(buf)))); #endif /* defined(LACP_DEBUG) */ if (lsc->lsc_active_aggregator != best_la) { lsc->lsc_active_aggregator = best_la; if (best_la) { lacp_suppress_distributing(lsc, best_la); } } } uint16_t lacp_compose_key(struct lacp_port *lp) { u_int media = lp->lp_media; uint16_t key; KASSERT(IFM_TYPE(media) == IFM_ETHER); if (!(lp->lp_state & LACP_STATE_AGGREGATION)) { /* * non-aggregatable links should have unique keys. * * XXX this isn't really unique as if_index is 16 bit. */ /* bit 0..14: (some bits of) if_index of this port */ key = lp->lp_agrport->port_ifp->if_index; /* bit 15: 1 */ key |= 0x8000; } else { u_int subtype = IFM_SUBTYPE(media); KASSERT((media & IFM_HDX) == 0); /* should be handled above */ KASSERT((subtype & 0x1f) == subtype); /* bit 0..4: IFM_SUBTYPE */ key = subtype; /* bit 5..14: (some bits of) if_index of agr device */ key |= 0x7fe0 & ((lp->lp_agrport->port_agrifp->if_index) << 5); /* bit 15: 0 */ } return htobe16(key); }