/* * Copyright (c) 1989, 1991 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)nfs_socket.c 7.23 (Berkeley) 4/20/91 * $Id: nfs_socket.c,v 1.12 1994/05/05 05:39:46 cgd Exp $ */ /* * Socket operations for use by nfs */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TRUE 1 #define FALSE 0 /* * External data, mostly RPC constants in XDR form */ extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, rpc_msgaccepted, rpc_call; extern u_long nfs_prog, nfs_vers; /* Maybe these should be bits in a u_long ?? */ /* * Static array that defines which nfs rpc's are nonidempotent */ int nonidempotent[NFS_NPROCS] = { FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, }; static int compressrequest[NFS_NPROCS] = { FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, }; int nfs_sbwait(); void nfs_disconnect(); struct mbuf *nfs_compress(), *nfs_uncompress(); struct nfsreq nfsreqh; int nfsrexmtthresh = NFS_FISHY; int nfs_tcpnodelay = 1; /* * Initialize sockets and congestion for a new NFS connection. * We do not free the sockaddr if error. */ nfs_connect(nmp) register struct nfsmount *nmp; { register struct socket *so; struct sockaddr *saddr; int s, error, bufsize; struct mbuf *m; struct sockaddr_in *sin; u_short tport; nmp->nm_so = (struct socket *)0; saddr = mtod(nmp->nm_nam, struct sockaddr *); if (error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto)) goto bad; so = nmp->nm_so; nmp->nm_soflags = so->so_proto->pr_flags; /* * Some servers require that the client port be a reserved port number. */ if (saddr->sa_family == AF_INET) { MGET(m, M_WAIT, MT_SONAME); sin = mtod(m, struct sockaddr_in *); sin->sin_len = m->m_len = sizeof (struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_addr.s_addr = INADDR_ANY; tport = IPPORT_RESERVED - 1; sin->sin_port = htons(tport); while (sobind(so, m) == EADDRINUSE && --tport > IPPORT_RESERVED / 2) sin->sin_port = htons(tport); m_freem(m); } if (nmp->nm_sotype == SOCK_DGRAM) bufsize = min(4 * (nmp->nm_wsize + NFS_MAXPKTHDR), NFS_MAXPACKET); else bufsize = min(4 * (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof(u_long)), NFS_MAXPACKET + sizeof(u_long)); if (error = soreserve(so, bufsize, bufsize)) goto bad; /* * Protocols that do not require connections may be optionally left * unconnected for servers that reply from a port other than NFS_PORT. */ if (nmp->nm_flag & NFSMNT_NOCONN) { if (nmp->nm_soflags & PR_CONNREQUIRED) { error = ENOTCONN; goto bad; } } else { if (error = soconnect(so, nmp->nm_nam)) goto bad; /* * Wait for the connection to complete. Cribbed from the * connect system call but with the wait at negative prio. */ s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "nfscon", 0); splx(s); if (so->so_error) { error = so->so_error; goto bad; } } if (nmp->nm_sotype == SOCK_DGRAM) { if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_SPONGY | NFSMNT_INT)) { so->so_rcv.sb_timeo = (5 * hz); so->so_snd.sb_timeo = (5 * hz); } else { so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; } nmp->nm_rto = NFS_TIMEO; } else { if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_SPONGY | NFSMNT_INT)) { so->so_rcv.sb_timeo = (5 * hz); so->so_snd.sb_timeo = (5 * hz); } else { so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { MGET(m, M_WAIT, MT_SOOPTS); *mtod(m, int *) = 1; m->m_len = sizeof(int); sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); } if (so->so_proto->pr_domain->dom_family == AF_INET && so->so_proto->pr_protocol == IPPROTO_TCP && nfs_tcpnodelay) { MGET(m, M_WAIT, MT_SOOPTS); *mtod(m, int *) = 1; m->m_len = sizeof(int); sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); } nmp->nm_rto = 10 * NFS_TIMEO; /* XXX */ } so->so_rcv.sb_flags |= SB_NOINTR; so->so_snd.sb_flags |= SB_NOINTR; /* Initialize other non-zero congestion variables */ nmp->nm_window = 2; /* Initial send window */ nmp->nm_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */ nmp->nm_rttvar = nmp->nm_rto << 1; nmp->nm_sent = 0; nmp->nm_currexmit = 0; return (0); bad: nfs_disconnect(nmp); return (error); } /* * Reconnect routine: * Called when a connection is broken on a reliable protocol. * - clean up the old socket * - nfs_connect() again * - set R_MUSTRESEND for all outstanding requests on mount point * If this fails the mount point is DEAD! * nb: Must be called with the nfs_solock() set on the mount point. */ nfs_reconnect(rep, nmp) register struct nfsreq *rep; register struct nfsmount *nmp; { register struct nfsreq *rp; int error; nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, "trying reconnect"); while (error = nfs_connect(nmp)) { #ifdef lint error = error; #endif /* lint */ if ((nmp->nm_flag & NFSMNT_INT) && nfs_sigintr(rep->r_procp)) return (EINTR); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, "reconnected"); /* * Loop through outstanding request list and fix up all requests * on old socket. */ rp = nfsreqh.r_next; while (rp != &nfsreqh) { if (rp->r_nmp == nmp) rp->r_flags |= R_MUSTRESEND; rp = rp->r_next; } return (0); } /* * NFS disconnect. Clean up and unlink. */ void nfs_disconnect(nmp) register struct nfsmount *nmp; { register struct socket *so; if (nmp->nm_so) { so = nmp->nm_so; nmp->nm_so = (struct socket *)0; soshutdown(so, 2); soclose(so); } } /* * This is the nfs send routine. For connection based socket types, it * must be called with an nfs_solock() on the socket. * "rep == NULL" indicates that it has been called from a server. */ nfs_send(so, nam, top, rep) register struct socket *so; struct mbuf *nam; register struct mbuf *top; struct nfsreq *rep; { struct mbuf *sendnam; int error, soflags; if (rep) { if (rep->r_flags & R_SOFTTERM) { m_freem(top); return (EINTR); } if (rep->r_nmp->nm_so == NULL && (error = nfs_reconnect(rep, rep->r_nmp))) return (error); rep->r_flags &= ~R_MUSTRESEND; so = rep->r_nmp->nm_so; soflags = rep->r_nmp->nm_soflags; } else soflags = so->so_proto->pr_flags; if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) sendnam = (struct mbuf *)0; else sendnam = nam; error = sosend(so, sendnam, (struct uio *)0, top, (struct mbuf *)0, 0); if (error == EWOULDBLOCK && rep) { if (rep->r_flags & R_SOFTTERM) error = EINTR; else { rep->r_flags |= R_MUSTRESEND; error = 0; } } /* * Ignore socket errors?? */ if (error && error != EINTR && error != ERESTART) error = 0; return (error); } /* * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all * done by soreceive(), but for SOCK_STREAM we must deal with the Record * Mark and consolidate the data into a new mbuf list. * nb: Sometimes TCP passes the data up to soreceive() in long lists of * small mbufs. * For SOCK_STREAM we must be very careful to read an entire record once * we have read any of it, even if the system call has been interrupted. */ nfs_receive(so, aname, mp, rep) register struct socket *so; struct mbuf **aname; struct mbuf **mp; register struct nfsreq *rep; { struct uio auio; struct iovec aio; register struct mbuf *m; struct mbuf *m2, *mnew, **mbp; caddr_t fcp, tcp; u_long len; struct mbuf **getnam; int error, siz, mlen, soflags, rcvflg; /* * Set up arguments for soreceive() */ *mp = (struct mbuf *)0; *aname = (struct mbuf *)0; if (rep) soflags = rep->r_nmp->nm_soflags; else soflags = so->so_proto->pr_flags; /* * For reliable protocols, lock against other senders/receivers * in case a reconnect is necessary. * For SOCK_STREAM, first get the Record Mark to find out how much * more there is to get. * We must lock the socket against other receivers * until we have an entire rpc request/reply. */ if (soflags & PR_CONNREQUIRED) { tryagain: /* * Check for fatal errors and resending request. */ if (rep) { /* * Ugh: If a reconnect attempt just happened, nm_so * would have changed. NULL indicates a failed * attempt that has essentially shut down this * mount point. */ if (rep->r_mrep || (so = rep->r_nmp->nm_so) == NULL || (rep->r_flags & R_SOFTTERM)) return (EINTR); while (rep->r_flags & R_MUSTRESEND) { m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); nfsstats.rpcretries++; if (error = nfs_send(so, rep->r_nmp->nm_nam, m, rep)) goto errout; } } if ((soflags & PR_ATOMIC) == 0) { aio.iov_base = (caddr_t) &len; aio.iov_len = sizeof(u_long); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = (struct proc *)0; auio.uio_offset = 0; auio.uio_resid = sizeof(u_long); do { rcvflg = MSG_WAITALL; error = soreceive(so, (struct mbuf **)0, &auio, (struct mbuf **)0, (struct mbuf **)0, &rcvflg); if (error == EWOULDBLOCK && rep) { if (rep->r_flags & R_SOFTTERM) return (EINTR); if (rep->r_flags & R_MUSTRESEND) goto tryagain; } } while (error == EWOULDBLOCK); if (!error && auio.uio_resid > 0) { if (rep) log(LOG_INFO, "short receive (%d/%d) from nfs server %s\n", sizeof(u_long) - auio.uio_resid, sizeof(u_long), rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = EPIPE; } if (error) goto errout; len = ntohl(len) & ~0x80000000; /* * This is SERIOUS! We are out of sync with the sender * and forcing a disconnect/reconnect is all I can do. */ if (len > NFS_MAXPACKET) { if (rep) log(LOG_ERR, "%s (%d) from nfs server %s\n", "impossible packet length", len, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = EFBIG; goto errout; } auio.uio_resid = len; do { rcvflg = MSG_WAITALL; error = soreceive(so, (struct mbuf **)0, &auio, mp, (struct mbuf **)0, &rcvflg); } while (error == EWOULDBLOCK || error == EINTR || error == ERESTART); if (!error && auio.uio_resid > 0) { if (rep) log(LOG_INFO, "short receive (%d/%d) from nfs server %s\n", len - auio.uio_resid, len, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); error = EPIPE; } } else { auio.uio_resid = len = 1000000; /* Anything Big */ do { rcvflg = 0; error = soreceive(so, (struct mbuf **)0, &auio, mp, (struct mbuf **)0, &rcvflg); if (error == EWOULDBLOCK && rep) { if (rep->r_flags & R_SOFTTERM) return (EINTR); if (rep->r_flags & R_MUSTRESEND) goto tryagain; } } while (error == EWOULDBLOCK); if (!error && *mp == NULL) error = EPIPE; len -= auio.uio_resid; } errout: if (error && rep && error != EINTR && error != ERESTART) { m_freem(*mp); *mp = (struct mbuf *)0; if (error != EPIPE && rep) log(LOG_INFO, "receive error %d from nfs server %s\n", error, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); nfs_disconnect(rep->r_nmp); error = nfs_reconnect(rep, rep->r_nmp); if (!error) goto tryagain; } } else { if (so->so_state & SS_ISCONNECTED) getnam = (struct mbuf **)0; else getnam = aname; auio.uio_resid = len = 1000000; do { rcvflg = 0; error = soreceive(so, getnam, &auio, mp, (struct mbuf **)0, &rcvflg); if (error == EWOULDBLOCK && rep && (rep->r_flags & R_SOFTTERM)) return (EINTR); } while (error == EWOULDBLOCK); len -= auio.uio_resid; } if (error) { m_freem(*mp); *mp = (struct mbuf *)0; } /* * Search for any mbufs that are not a multiple of 4 bytes long. * These could cause pointer alignment problems, so copy them to * well aligned mbufs. */ m = *mp; mbp = mp; while (m) { /* * All this for something that may never happen. */ if (m->m_next && (m->m_len & 0x3)) { printf("nfs_rcv odd length!\n"); mlen = 0; while (m) { fcp = mtod(m, caddr_t); while (m->m_len > 0) { if (mlen == 0) { MGET(m2, M_WAIT, MT_DATA); if (len >= MINCLSIZE) MCLGET(m2, M_WAIT); m2->m_len = 0; mlen = M_TRAILINGSPACE(m2); tcp = mtod(m2, caddr_t); *mbp = m2; mbp = &m2->m_next; } siz = MIN(mlen, m->m_len); bcopy(fcp, tcp, siz); m2->m_len += siz; mlen -= siz; len -= siz; tcp += siz; m->m_len -= siz; fcp += siz; } MFREE(m, mnew); m = mnew; } break; } len -= m->m_len; mbp = &m->m_next; m = m->m_next; } return (error); } /* * Implement receipt of reply on a socket. * We must search through the list of received datagrams matching them * with outstanding requests using the xid, until ours is found. */ /* ARGSUSED */ nfs_reply(nmp, myrep) struct nfsmount *nmp; struct nfsreq *myrep; { register struct mbuf *m; register struct nfsreq *rep; register int error = 0; u_long rxid; struct mbuf *mp, *nam; char *cp; int cnt, xfer; /* * Loop around until we get our own reply */ for (;;) { /* * Lock against other receivers so that I don't get stuck in * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. */ nfs_solock(&nmp->nm_flag); /* Already received, bye bye */ if (myrep->r_mrep != NULL) { nfs_sounlock(&nmp->nm_flag); return (0); } /* * Get the next Rpc reply off the socket */ if (error = nfs_receive(nmp->nm_so, &nam, &mp, myrep)) { nfs_sounlock(&nmp->nm_flag); /* * Ignore routing errors on connectionless protocols?? */ if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { nmp->nm_so->so_error = 0; continue; } /* * Otherwise cleanup and return a fatal error. */ if (myrep->r_flags & R_TIMING) { myrep->r_flags &= ~R_TIMING; nmp->nm_rtt = -1; } if (myrep->r_flags & R_SENT) { myrep->r_flags &= ~R_SENT; nmp->nm_sent--; } return (error); } /* * Get the xid and check that it is an rpc reply */ m = mp; while (m && m->m_len == 0) m = m->m_next; if (m == NULL) { nfsstats.rpcinvalid++; m_freem(mp); nfs_sounlock(&nmp->nm_flag); continue; } bcopy(mtod(m, caddr_t), (caddr_t)&rxid, NFSX_UNSIGNED); /* * Loop through the request list to match up the reply * Iff no match, just drop the datagram */ m = mp; rep = nfsreqh.r_next; while (rep != &nfsreqh) { if (rep->r_mrep == NULL && rxid == rep->r_xid) { /* Found it.. */ rep->r_mrep = m; /* * Update timing */ if (rep->r_flags & R_TIMING) { nfs_updatetimer(rep->r_nmp); rep->r_flags &= ~R_TIMING; rep->r_nmp->nm_rtt = -1; } if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_SENT; rep->r_nmp->nm_sent--; } break; } rep = rep->r_next; } nfs_sounlock(&nmp->nm_flag); if (nam) m_freem(nam); /* * If not matched to a request, drop it. * If it's mine, get out. */ if (rep == &nfsreqh) { nfsstats.rpcunexpected++; m_freem(m); } else if (rep == myrep) return (0); } } /* * nfs_request - goes something like this * - fill in request struct * - links it into list * - calls nfs_send() for first transmit * - calls nfs_receive() to get reply * - break down rpc header and return with nfs reply pointed to * by mrep or error * nb: always frees up mreq mbuf list */ nfs_request(vp, mreq, xid, procnum, procp, tryhard, mp, mrp, mdp, dposp) struct vnode *vp; struct mbuf *mreq; u_long xid; int procnum; struct proc *procp; int tryhard; struct mount *mp; struct mbuf **mrp; struct mbuf **mdp; caddr_t *dposp; { register struct mbuf *m, *mrep; register struct nfsreq *rep; register u_long *tl; register int len; struct nfsmount *nmp; struct mbuf *md; struct nfsreq *reph; caddr_t dpos; char *cp2; int t1; int s, compressed; int error = 0; nmp = VFSTONFS(mp); m = mreq; MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); rep->r_xid = xid; rep->r_nmp = nmp; rep->r_vp = vp; rep->r_procp = procp; if ((nmp->nm_flag & NFSMNT_SOFT) || ((nmp->nm_flag & NFSMNT_SPONGY) && !tryhard)) rep->r_retry = nmp->nm_retry; else rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ rep->r_flags = rep->r_rexmit = 0; /* * Three cases: * - non-idempotent requests on SOCK_DGRAM use NFS_MINIDEMTIMEO * - idempotent requests on SOCK_DGRAM use 0 * - Reliable transports, NFS_RELIABLETIMEO * Timeouts are still done on reliable transports to ensure detection * of excessive connection delay. */ if (nmp->nm_sotype != SOCK_DGRAM) rep->r_timerinit = -NFS_RELIABLETIMEO; else if (nonidempotent[procnum]) rep->r_timerinit = -NFS_MINIDEMTIMEO; else rep->r_timerinit = 0; rep->r_timer = rep->r_timerinit; rep->r_mrep = NULL; len = 0; while (m) { len += m->m_len; m = m->m_next; } mreq->m_pkthdr.len = len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; compressed = 0; m = mreq; if ((nmp->nm_flag & NFSMNT_COMPRESS) && compressrequest[procnum]) { mreq = nfs_compress(mreq); if (mreq != m) { len = mreq->m_pkthdr.len; compressed++; } } /* * For non-atomic protocols, insert a Sun RPC Record Mark. */ if ((nmp->nm_soflags & PR_ATOMIC) == 0) { M_PREPEND(mreq, sizeof(u_long), M_WAIT); *mtod(mreq, u_long *) = htonl(0x80000000 | len); } rep->r_mreq = mreq; /* * Do the client side RPC. */ nfsstats.rpcrequests++; /* * Chain request into list of outstanding requests. Be sure * to put it LAST so timer finds oldest requests first. */ s = splnet(); reph = &nfsreqh; reph->r_prev->r_next = rep; rep->r_prev = reph->r_prev; reph->r_prev = rep; rep->r_next = reph; /* * If backing off another request or avoiding congestion, don't * send this one now but let timer do it. If not timing a request, * do it now. */ if (nmp->nm_sent <= 0 || nmp->nm_sotype != SOCK_DGRAM || (nmp->nm_currexmit == 0 && nmp->nm_sent < nmp->nm_window)) { nmp->nm_sent++; rep->r_flags |= R_SENT; if (nmp->nm_rtt == -1) { nmp->nm_rtt = 0; rep->r_flags |= R_TIMING; } splx(s); m = m_copym(mreq, 0, M_COPYALL, M_WAIT); if (nmp->nm_soflags & PR_CONNREQUIRED) nfs_solock(&nmp->nm_flag); error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep); if (nmp->nm_soflags & PR_CONNREQUIRED) nfs_sounlock(&nmp->nm_flag); if (error && NFSIGNORE_SOERROR(nmp->nm_soflags, error)) nmp->nm_so->so_error = error = 0; } else splx(s); /* * Wait for the reply from our send or the timer's. */ if (!error) error = nfs_reply(nmp, rep); /* * RPC done, unlink the request. */ s = splnet(); rep->r_prev->r_next = rep->r_next; rep->r_next->r_prev = rep->r_prev; splx(s); /* * If there was a successful reply and a tprintf msg. * tprintf a response. */ if (!error && (rep->r_flags & R_TPRINTFMSG)) nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, "is alive again"); m_freem(rep->r_mreq); mrep = rep->r_mrep; FREE((caddr_t)rep, M_NFSREQ); if (error) return (error); if (compressed) mrep = nfs_uncompress(mrep); md = mrep; /* * break down the rpc header and check if ok */ dpos = mtod(md, caddr_t); nfsm_disect(tl, u_long *, 5*NFSX_UNSIGNED); tl += 2; if (*tl++ == rpc_msgdenied) { if (*tl == rpc_mismatch) error = EOPNOTSUPP; else error = EACCES; m_freem(mrep); return (error); } /* * skip over the auth_verf, someday we may want to cache auth_short's * for nfs_reqhead(), but for now just dump it */ if (*++tl != 0) { len = nfsm_rndup(fxdr_unsigned(long, *tl)); nfsm_adv(len); } nfsm_disect(tl, u_long *, NFSX_UNSIGNED); /* 0 == ok */ if (*tl == 0) { nfsm_disect(tl, u_long *, NFSX_UNSIGNED); if (*tl != 0) { error = fxdr_unsigned(int, *tl); m_freem(mrep); return (error); } *mrp = mrep; *mdp = md; *dposp = dpos; return (0); } m_freem(mrep); return (EPROTONOSUPPORT); nfsmout: return (error); } /* * Get a request for the server main loop * - receive a request via. nfs_soreceive() * - verify it * - fill in the cred struct. */ nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, procnum, cr, msk, mtch, wascomp, repstat) struct socket *so; u_long prog; u_long vers; int maxproc; struct mbuf **nam; struct mbuf **mrp; struct mbuf **mdp; caddr_t *dposp; u_long *retxid; u_long *procnum; register struct ucred *cr; struct mbuf *msk, *mtch; int *wascomp, *repstat; { register int i; register u_long *tl; register long t1; caddr_t dpos, cp2; int error = 0; struct mbuf *mrep, *md; int len; *repstat = 0; if (so->so_proto->pr_flags & PR_CONNREQUIRED) { error = nfs_receive(so, nam, &mrep, (struct nfsreq *)0); } else { mrep = (struct mbuf *)0; do { if (mrep) { m_freem(*nam); m_freem(mrep); } error = nfs_receive(so, nam, &mrep, (struct nfsreq *)0); } while (!error && nfs_badnam(*nam, msk, mtch)); } if (error) return (error); md = mrep; mrep = nfs_uncompress(mrep); if (mrep != md) { *wascomp = 1; md = mrep; } else *wascomp = 0; dpos = mtod(mrep, caddr_t); nfsm_disect(tl, u_long *, 10*NFSX_UNSIGNED); *retxid = fxdr_unsigned(u_long, *tl++); if (*tl++ != rpc_call || *tl++ != rpc_vers) { *mrp = mrep; *procnum = NFSPROC_NOOP; *repstat = ERPCMISMATCH; return (0); } if (*tl++ != prog) { *mrp = mrep; *procnum = NFSPROC_NOOP; *repstat = EPROGUNAVAIL; return (0); } if (*tl++ != vers) { *mrp = mrep; *procnum = NFSPROC_NOOP; *repstat = EPROGMISMATCH; return (0); } *procnum = fxdr_unsigned(u_long, *tl++); if (*procnum == NFSPROC_NULL) { *mrp = mrep; return (0); } if (*procnum > maxproc || *tl++ != rpc_auth_unix) { *mrp = mrep; *procnum = NFSPROC_NOOP; *repstat = EPROCUNAVAIL; return (0); } len = fxdr_unsigned(int, *tl++); if (len < 0 || len > RPCAUTH_MAXSIZ) { m_freem(mrep); return (EBADRPC); } len = fxdr_unsigned(int, *++tl); if (len < 0 || len > NFS_MAXNAMLEN) { m_freem(mrep); return (EBADRPC); } nfsm_adv(nfsm_rndup(len)); nfsm_disect(tl, u_long *, 3*NFSX_UNSIGNED); cr->cr_uid = fxdr_unsigned(uid_t, *tl++); cr->cr_gid = fxdr_unsigned(gid_t, *tl++); len = fxdr_unsigned(int, *tl); if (len < 0 || len > RPCAUTH_UNIXGIDS) { m_freem(mrep); return (EBADRPC); } nfsm_disect(tl, u_long *, (len + 2)*NFSX_UNSIGNED); for (i = 1; i <= len; i++) if (i < NGROUPS) cr->cr_groups[i] = fxdr_unsigned(gid_t, *tl++); else tl++; cr->cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1); /* * Do we have any use for the verifier. * According to the "Remote Procedure Call Protocol Spec." it * should be AUTH_NULL, but some clients make it AUTH_UNIX? * For now, just skip over it */ len = fxdr_unsigned(int, *++tl); if (len < 0 || len > RPCAUTH_MAXSIZ) { m_freem(mrep); return (EBADRPC); } if (len > 0) nfsm_adv(nfsm_rndup(len)); *mrp = mrep; *mdp = md; *dposp = dpos; return (0); nfsmout: return (error); } /* * Generate the rpc reply header * siz arg. is used to decide if adding a cluster is worthwhile */ nfs_rephead(siz, retxid, err, mrq, mbp, bposp) int siz; u_long retxid; int err; struct mbuf **mrq; struct mbuf **mbp; caddr_t *bposp; { register u_long *tl; register long t1; caddr_t bpos; struct mbuf *mreq, *mb, *mb2; NFSMGETHDR(mreq); mb = mreq; if ((siz+RPC_REPLYSIZ) > MHLEN) MCLGET(mreq, M_WAIT); tl = mtod(mreq, u_long *); mreq->m_len = 6*NFSX_UNSIGNED; bpos = ((caddr_t)tl)+mreq->m_len; *tl++ = txdr_unsigned(retxid); *tl++ = rpc_reply; if (err == ERPCMISMATCH) { *tl++ = rpc_msgdenied; *tl++ = rpc_mismatch; *tl++ = txdr_unsigned(2); *tl = txdr_unsigned(2); } else { *tl++ = rpc_msgaccepted; *tl++ = 0; *tl++ = 0; switch (err) { case EPROGUNAVAIL: *tl = txdr_unsigned(RPC_PROGUNAVAIL); break; case EPROGMISMATCH: *tl = txdr_unsigned(RPC_PROGMISMATCH); nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(2); *tl = txdr_unsigned(2); /* someday 3 */ break; case EPROCUNAVAIL: *tl = txdr_unsigned(RPC_PROCUNAVAIL); break; default: *tl = 0; if (err != VNOVAL) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(err); } break; }; } *mrq = mreq; *mbp = mb; *bposp = bpos; if (err != 0 && err != VNOVAL) nfsstats.srvrpc_errs++; return (0); } /* * Nfs timer routine * Scan the nfsreq list and retranmit any requests that have timed out * To avoid retransmission attempts on STREAM sockets (in the future) make * sure to set the r_retry field to 0 (implies nm_retry == 0). */ void nfs_timer() { register struct nfsreq *rep; register struct mbuf *m; register struct socket *so; register struct nfsmount *nmp; int s, error; s = splnet(); for (rep = nfsreqh.r_next; rep != &nfsreqh; rep = rep->r_next) { nmp = rep->r_nmp; if (rep->r_mrep || (rep->r_flags & R_SOFTTERM) || (so = nmp->nm_so) == NULL) continue; if ((nmp->nm_flag & NFSMNT_INT) && nfs_sigintr(rep->r_procp)) { rep->r_flags |= R_SOFTTERM; continue; } if (rep->r_flags & R_TIMING) /* update rtt in mount */ nmp->nm_rtt++; /* If not timed out */ if (++rep->r_timer < nmp->nm_rto) continue; /* Do backoff and save new timeout in mount */ if (rep->r_flags & R_TIMING) { nfs_backofftimer(nmp); rep->r_flags &= ~R_TIMING; nmp->nm_rtt = -1; } if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_SENT; nmp->nm_sent--; } /* * Check for too many retries on soft mount. * nb: For hard mounts, r_retry == NFS_MAXREXMIT+1 */ if (++rep->r_rexmit > NFS_MAXREXMIT) rep->r_rexmit = NFS_MAXREXMIT; /* * Check for server not responding */ if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > NFS_FISHY) { nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, "not responding"); rep->r_flags |= R_TPRINTFMSG; } if (rep->r_rexmit >= rep->r_retry) { /* too many */ nfsstats.rpctimeouts++; rep->r_flags |= R_SOFTTERM; continue; } if (nmp->nm_sotype != SOCK_DGRAM) continue; /* * If there is enough space and the window allows.. * Resend it */ if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && nmp->nm_sent < nmp->nm_window && (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ nfsstats.rpcretries++; if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, (caddr_t)0, (struct mbuf *)0, (struct mbuf *)0); else error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, nmp->nm_nam, (struct mbuf *)0, (struct mbuf *)0); if (error) { if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) so->so_error = 0; } else { /* * We need to time the request even though we * are retransmitting. */ nmp->nm_rtt = 0; nmp->nm_sent++; rep->r_flags |= (R_SENT|R_TIMING); rep->r_timer = rep->r_timerinit; } } } splx(s); timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ); } /* * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is * used here. The timer state is held in the nfsmount structure and * a single request is used to clock the response. When successful * the rtt smoothing in nfs_updatetimer is used, when failed the backoff * is done by nfs_backofftimer. We also log failure messages in these * routines. * * Congestion variables are held in the nfshost structure which * is referenced by nfsmounts and shared per-server. This separation * makes it possible to do per-mount timing which allows varying disk * access times to be dealt with, while preserving a network oriented * congestion control scheme. * * The windowing implements the Jacobson/Karels slowstart algorithm * with adjusted scaling factors. We start with one request, then send * 4 more after each success until the ssthresh limit is reached, then * we increment at a rate proportional to the window. On failure, we * remember 3/4 the current window and clamp the send limit to 1. Note * ICMP source quench is not reflected in so->so_error so we ignore that * for now. * * NFS behaves much more like a transport protocol with these changes, * shedding the teenage pedal-to-the-metal tendencies of "other" * implementations. * * Timers and congestion avoidance by Tom Talpey, Open Software Foundation. */ /* * The TCP algorithm was not forgiving enough. Because the NFS server * responds only after performing lookups/diskio/etc, we have to be * more prepared to accept a spiky variance. The TCP algorithm is: * TCP_RTO(nmp) ((((nmp)->nm_srtt >> 2) + (nmp)->nm_rttvar) >> 1) */ #define NFS_RTO(nmp) (((nmp)->nm_srtt >> 3) + (nmp)->nm_rttvar) nfs_updatetimer(nmp) register struct nfsmount *nmp; { /* If retransmitted, clear and return */ if (nmp->nm_rexmit || nmp->nm_currexmit) { nmp->nm_rexmit = nmp->nm_currexmit = 0; return; } /* If have a measurement, do smoothing */ if (nmp->nm_srtt) { register short delta; delta = nmp->nm_rtt - (nmp->nm_srtt >> 3); if ((nmp->nm_srtt += delta) <= 0) nmp->nm_srtt = 1; if (delta < 0) delta = -delta; delta -= (nmp->nm_rttvar >> 2); if ((nmp->nm_rttvar += delta) <= 0) nmp->nm_rttvar = 1; /* Else initialize */ } else { nmp->nm_rttvar = nmp->nm_rtt << 1; if (nmp->nm_rttvar == 0) nmp->nm_rttvar = 2; nmp->nm_srtt = nmp->nm_rttvar << 2; } /* Compute new Retransmission TimeOut and clip */ nmp->nm_rto = NFS_RTO(nmp); if (nmp->nm_rto < NFS_MINTIMEO) nmp->nm_rto = NFS_MINTIMEO; else if (nmp->nm_rto > NFS_MAXTIMEO) nmp->nm_rto = NFS_MAXTIMEO; /* Update window estimate */ if (nmp->nm_window < nmp->nm_ssthresh) /* quickly */ nmp->nm_window += 4; else { /* slowly */ register long incr = ++nmp->nm_winext; incr = (incr * incr) / nmp->nm_window; if (incr > 0) { nmp->nm_winext = 0; ++nmp->nm_window; } } if (nmp->nm_window > NFS_MAXWINDOW) nmp->nm_window = NFS_MAXWINDOW; } nfs_backofftimer(nmp) register struct nfsmount *nmp; { register unsigned long newrto; /* Clip shift count */ if (++nmp->nm_rexmit > 8 * sizeof nmp->nm_rto) nmp->nm_rexmit = 8 * sizeof nmp->nm_rto; /* Back off RTO exponentially */ newrto = NFS_RTO(nmp); newrto <<= (nmp->nm_rexmit - 1); if (newrto == 0 || newrto > NFS_MAXTIMEO) newrto = NFS_MAXTIMEO; nmp->nm_rto = newrto; /* If too many retries, message, assume a bogus RTT and re-measure */ if (nmp->nm_currexmit < nmp->nm_rexmit) { nmp->nm_currexmit = nmp->nm_rexmit; if (nmp->nm_currexmit >= nfsrexmtthresh) { if (nmp->nm_currexmit == nfsrexmtthresh) { nmp->nm_rttvar += (nmp->nm_srtt >> 2); nmp->nm_srtt = 0; } } } /* Close down window but remember this point (3/4 current) for later */ nmp->nm_ssthresh = ((nmp->nm_window << 1) + nmp->nm_window) >> 2; nmp->nm_window = 1; nmp->nm_winext = 0; } /* * Test for a termination signal pending on procp. * This is used for NFSMNT_INT mounts. */ nfs_sigintr(p) register struct proc *p; { if (p && p->p_siglist && (((p->p_siglist &~ p->p_sigmask) &~ p->p_sigignore) & NFSINT_SIGMASK)) return (1); else return (0); } nfs_msg(p, server, msg) struct proc *p; char *server, *msg; { tpr_t tpr; if (p) tpr = tprintf_open(p); else tpr = NULL; tprintf(tpr, "nfs server %s: %s\n", server, msg); tprintf_close(tpr); } /* * Lock a socket against others. * Necessary for STREAM sockets to ensure you get an entire rpc request/reply * and also to avoid race conditions between the processes with nfs requests * in progress when a reconnect is necessary. */ nfs_solock(flagp) register int *flagp; { while (*flagp & NFSMNT_SCKLOCK) { *flagp |= NFSMNT_WANTSCK; (void) tsleep((caddr_t)flagp, PZERO-1, "nfsolck", 0); } *flagp |= NFSMNT_SCKLOCK; } /* * Unlock the stream socket for others. */ nfs_sounlock(flagp) register int *flagp; { if ((*flagp & NFSMNT_SCKLOCK) == 0) panic("nfs sounlock"); *flagp &= ~NFSMNT_SCKLOCK; if (*flagp & NFSMNT_WANTSCK) { *flagp &= ~NFSMNT_WANTSCK; wakeup((caddr_t)flagp); } } /* * This function compares two net addresses by family and returns TRUE * if they are the same. * If there is any doubt, return FALSE. */ nfs_netaddr_match(nam1, nam2) struct mbuf *nam1, *nam2; { register struct sockaddr *saddr1, *saddr2; saddr1 = mtod(nam1, struct sockaddr *); saddr2 = mtod(nam2, struct sockaddr *); if (saddr1->sa_family != saddr2->sa_family) return (0); /* * Must do each address family separately since unused fields * are undefined values and not always zeroed. */ switch (saddr1->sa_family) { case AF_INET: if (((struct sockaddr_in *)saddr1)->sin_addr.s_addr == ((struct sockaddr_in *)saddr2)->sin_addr.s_addr) return (1); break; default: break; }; return (0); } /* * Check the hostname fields for nfsd's mask and match fields. * By address family: * - Bitwise AND the mask with the host address field * - Compare for == with match * return TRUE if not equal */ nfs_badnam(nam, msk, mtch) register struct mbuf *nam, *msk, *mtch; { switch (mtod(nam, struct sockaddr *)->sa_family) { case AF_INET: return ((mtod(nam, struct sockaddr_in *)->sin_addr.s_addr & mtod(msk, struct sockaddr_in *)->sin_addr.s_addr) != mtod(mtch, struct sockaddr_in *)->sin_addr.s_addr); default: printf("nfs_badmatch, unknown sa_family\n"); return (0); }; }