/* $NetBSD: kttcp.c,v 1.17 2005/12/11 12:20:53 christos Exp $ */ /* * Copyright (c) 2002 Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden and Jason R. Thorpe for * Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * kttcp.c -- * * This module provides kernel support for testing network * throughput from the perspective of the kernel. It is * similar in spirit to the classic ttcp network benchmark * program, the main difference being that with kttcp, the * kernel is the source and sink of the data. * * Testing like this is useful for a few reasons: * * 1. This allows us to know what kind of performance we can * expect from network applications that run in the kernel * space, such as the NFS server or the NFS client. These * applications don't have to move the data to/from userspace, * and so benchmark programs which run in userspace don't * give us an accurate model. * * 2. Since data received is just thrown away, the receiver * is very fast. This can provide better exercise for the * sender at the other end. * * 3. Since the NetBSD kernel currently uses a run-to-completion * scheduling model, kttcp provides a benchmark model where * preemption of the benchmark program is not an issue. */ #include __KERNEL_RCSID(0, "$NetBSD: kttcp.c,v 1.17 2005/12/11 12:20:53 christos Exp $"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int kttcp_send(struct lwp *l, struct kttcp_io_args *); static int kttcp_recv(struct lwp *l, struct kttcp_io_args *); static int kttcp_sosend(struct socket *, unsigned long long, unsigned long long *, struct lwp *, int); static int kttcp_soreceive(struct socket *, unsigned long long, unsigned long long *, struct lwp *, int *); void kttcpattach(int); dev_type_ioctl(kttcpioctl); const struct cdevsw kttcp_cdevsw = { nullopen, nullclose, noread, nowrite, kttcpioctl, nostop, notty, nopoll, nommap, nokqfilter, }; void kttcpattach(int count) { /* Do nothing. */ } int kttcpioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct lwp *l) { int error; if ((flag & FWRITE) == 0) return EPERM; switch (cmd) { case KTTCP_IO_SEND: error = kttcp_send(l, (struct kttcp_io_args *) data); break; case KTTCP_IO_RECV: error = kttcp_recv(l, (struct kttcp_io_args *) data); break; default: return EINVAL; } return error; } static int kttcp_send(struct lwp *l, struct kttcp_io_args *kio) { struct file *fp; int error; struct timeval t0, t1; unsigned long long len, done; if (kio->kio_totalsize >= KTTCP_MAX_XMIT) return EINVAL; fp = fd_getfile(l->l_proc->p_fd, kio->kio_socket); if (fp == NULL) return EBADF; FILE_USE(fp); if (fp->f_type != DTYPE_SOCKET) { FILE_UNUSE(fp, l); return EFTYPE; } len = kio->kio_totalsize; microtime(&t0); do { error = kttcp_sosend((struct socket *)fp->f_data, len, &done, l, 0); len -= done; } while (error == 0 && len > 0); FILE_UNUSE(fp, l); microtime(&t1); if (error != 0) return error; timersub(&t1, &t0, &kio->kio_elapsed); kio->kio_bytesdone = kio->kio_totalsize - len; return 0; } static int kttcp_recv(struct lwp *l, struct kttcp_io_args *kio) { struct file *fp; int error; struct timeval t0, t1; unsigned long long len, done; if (kio->kio_totalsize > KTTCP_MAX_XMIT) return EINVAL; fp = fd_getfile(l->l_proc->p_fd, kio->kio_socket); if (fp == NULL) return EBADF; FILE_USE(fp); if (fp->f_type != DTYPE_SOCKET) { FILE_UNUSE(fp, l); return EBADF; } len = kio->kio_totalsize; microtime(&t0); do { error = kttcp_soreceive((struct socket *)fp->f_data, len, &done, l, NULL); len -= done; } while (error == 0 && len > 0 && done > 0); FILE_UNUSE(fp, l); microtime(&t1); if (error == EPIPE) error = 0; if (error != 0) return error; timersub(&t1, &t0, &kio->kio_elapsed); kio->kio_bytesdone = kio->kio_totalsize - len; return 0; } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) /* * Slightly changed version of sosend() */ static int kttcp_sosend(struct socket *so, unsigned long long slen, unsigned long long *done, struct lwp *l, int flags) { struct mbuf **mp, *m, *top; long space, len, mlen; int error, s, dontroute, atomic; long long resid; atomic = sosendallatonce(so); resid = slen; top = NULL; /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 * if we over-committed, and we must use a signed comparison * of space and resid. On the other hand, a negative resid * causes us to loop sending 0-length segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); /* WRS XXX - are we doing per-lwp or per-proc stats? */ l->l_proc->p_stats->p_ru.ru_msgsnd++; #define snderr(errno) { error = errno; splx(s); goto release; } restart: if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; do { s = splsoftnet(); if (so->so_state & SS_CANTSENDMORE) snderr(EPIPE); if (so->so_error) { error = so->so_error; so->so_error = 0; splx(s); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so->so_state & SS_ISCONFIRMING) == 0) snderr(ENOTCONN); } else snderr(EDESTADDRREQ); } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat)) snderr(EMSGSIZE); if (space < resid && (atomic || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) snderr(EWOULDBLOCK); SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive sbwait 1"); sbunlock(&so->so_snd); error = sbwait(&so->so_snd); splx(s); if (error) goto out; goto restart; } splx(s); mp = ⊤ do { do { if (top == 0) { m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = NULL; } else { m = m_get(M_WAIT, MT_DATA); mlen = MLEN; } if (resid >= MINCLSIZE && space >= MCLBYTES) { m_clget(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; #ifdef MAPPED_MBUFS len = lmin(MCLBYTES, resid); #else if (atomic && top == 0) { len = lmin(MCLBYTES - max_hdr, resid); m->m_data += max_hdr; } else len = lmin(MCLBYTES, resid); #endif space -= len; } else { nopages: len = lmin(lmin(mlen, resid), space); space -= len; /* * For datagram protocols, leave room * for protocol headers in first mbuf. */ if (atomic && top == 0 && len < mlen) MH_ALIGN(m, len); } resid -= len; m->m_len = len; *mp = m; top->m_pkthdr.len += len; if (error) goto release; mp = &m->m_next; if (resid <= 0) { if (flags & MSG_EOR) top->m_flags |= M_EOR; break; } } while (space > 0 && atomic); s = splsoftnet(); if (so->so_state & SS_CANTSENDMORE) snderr(EPIPE); if (dontroute) so->so_options |= SO_DONTROUTE; if (resid > 0) so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, top, NULL, NULL, l); if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) so->so_state &= ~SS_MORETOCOME; splx(s); top = 0; mp = ⊤ if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: if (top) m_freem(top); *done = slen - resid; #if 0 printf("sosend: error %d slen %llu resid %lld\n", error, slen, resid); #endif return (error); } static int kttcp_soreceive(struct socket *so, unsigned long long slen, unsigned long long *done, struct lwp *l, int *flagsp) { struct mbuf *m, **mp; int flags, len, error, s, offset, moff, type; long long orig_resid, resid; const struct protosw *pr; struct mbuf *nextrecord; pr = so->so_proto; mp = NULL; type = 0; resid = orig_resid = slen; if (flagsp) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, (struct mbuf *)(long)(flags & MSG_PEEK), NULL, NULL); if (error) goto bad; do { resid -= min(resid, m->m_len); m = m_free(m); } while (resid && error == 0 && m); bad: if (m) m_freem(m); return (error); } if (mp) *mp = NULL; if (so->so_state & SS_ISCONFIRMING && resid) (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, NULL); restart: if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) return (error); s = splsoftnet(); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more * (subject to any timeout) if: * 1. the current count is less than the low water mark, * 2. MSG_WAITALL is set, and it is possible to do the entire * receive operation at once if we block (resid <= hiwat), or * 3. MSG_DONTWAIT is not set. * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < resid) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { #ifdef DIAGNOSTIC if (m == NULL && so->so_rcv.sb_cc) panic("receive 1"); #endif if (so->so_error) { if (m) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; goto release; } if (so->so_state & SS_CANTRCVMORE) { if (m) goto dontblock; else goto release; } for (; m; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; goto release; } if (resid == 0) goto release; if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { error = EWOULDBLOCK; goto release; } sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); splx(s); if (error) return (error); goto restart; } dontblock: /* * On entry here, m points to the first record of the socket buffer. * While we process the initial mbufs containing address and control * info, we save a copy of m->m_nextpkt into nextrecord. */ #ifdef notyet /* XXXX */ if (uio->uio_lwp) uio->uio_lwp->l_proc->p_stats->p_ru.ru_msgrcv++; #endif KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 1"); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { #ifdef DIAGNOSTIC if (m->m_type != MT_SONAME) panic("receive 1a"); #endif orig_resid = 0; if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } } while (m && m->m_type == MT_CONTROL && error == 0) { if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } } /* * If m is non-NULL, we have some data to read. From now on, * make sure to keep sb_lastrecord consistent when working on * the last packet on the chain (nextrecord == NULL) and we * change m->m_nextpkt. */ if (m) { if ((flags & MSG_PEEK) == 0) { m->m_nextpkt = nextrecord; /* * If nextrecord == NULL (this is a single chain), * then sb_lastrecord may not be valid here if m * was changed earlier. */ if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m); so->so_rcv.sb_lastrecord = m; } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == m); so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } } SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 2"); moff = 0; offset = 0; while (m && resid > 0 && error == 0) { if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; } else if (type == MT_OOBDATA) break; #ifdef DIAGNOSTIC else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) panic("receive 3"); #endif so->so_state &= ~SS_RCVATMARK; len = resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. * Otherwise copy them out via the uio, then free. * Sockbuf must be consistent here (points to current mbuf, * it points to next record) when we drop priority; * we must note any additions to the sockbuf when we * block interrupts again. */ resid -= len; if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp) { *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } /* * If m != NULL, we also know that * so->so_rcv.sb_mb != NULL. */ KASSERT(so->so_rcv.sb_mb == m); if (m) { m->m_nextpkt = nextrecord; if (nextrecord == NULL) so->so_rcv.sb_lastrecord = m; } else { so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 3"); SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 3"); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp) *mp = m_copym(m, 0, len, M_WAIT); m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; } } if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_state |= SS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), * we must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return * with a short count but without error. * Keep sockbuf locked against other readers. */ while (flags & MSG_WAITALL && m == NULL && resid > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) break; /* * If we are peeking and the socket receive buffer is * full, stop since we can't get more data to peek at. */ if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0) break; /* * If we've drained the socket buffer, tell the * protocol in case it needs to do something to * get it filled again. */ if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, NULL, (struct mbuf *)(long)flags, NULL, NULL); SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive sbwait 2"); error = sbwait(&so->so_rcv); if (error) { sbunlock(&so->so_rcv); splx(s); return (0); } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; } } if (m && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an SB_EMPTY_FIXUP(). Second part * makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, NULL, (struct mbuf *)(long)flags, NULL, NULL); } if (orig_resid == resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv); splx(s); goto restart; } if (flagsp) *flagsp |= flags; release: sbunlock(&so->so_rcv); splx(s); *done = slen - resid; #if 0 printf("soreceive: error %d slen %llu resid %lld\n", error, slen, resid); #endif return (error); }