specfs: Drain all I/O operations after last .d_close call.

New kind of I/O reference on specdevs, sd_iocnt.  This could be done
with psref instead; I chose a reference count for now because we
already have to take a per-object lock anyway, v_interlock, for
vdead_check, so another atomic is not likely to hurt much more.  We
can always change the mechanism inside spec_io_enter/exit/drain later
on.

Make sure every access to vp->v_rdev or vp->v_specnode and every call
to a devsw operation is protected in one of the following ways:

- by the vnode lock (with vdead_check if we unlocked/relocked),
- by positive sd_opencnt,
- by spec_io_enter/exit (sketched below), or
- by sd_opencnt management in open/close.
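
For illustration, a minimal sketch of the spec_io_enter/exit bracket as
the character-device I/O paths in the diff use it; the wrapper function
here is hypothetical, error handling is simplified, and the real
spec_read/spec_write additionally drop and retake the vnode lock around
the devsw call:

	static int
	example_chr_read(struct vnode *vp, struct uio *uio, int ioflag)
	{
		struct specnode *sn;
		dev_t dev;
		int error;

		/* Pin vp's specdev; fails with ENXIO if vp was revoked. */
		error = spec_io_enter(vp, &sn, &dev);
		if (error)
			return error;

		/* Devsw operation, not blocked by vfs suspension. */
		error = cdev_read(dev, uio, ioflag);

		/* Drop the I/O reference; may wake a waiting spec_close. */
		spec_io_exit(vp, sn);
		return error;
	}
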
riastradh 2022-03-28 12:36:51 +00:00
parent 71a1e06a17
commit 66ae10f9a4
2 changed files with 269 additions and 91 deletions

spec_vnops.c

@ -1,4 +1,4 @@
/* $NetBSD: spec_vnops.c,v 1.201 2022/03/28 12:36:42 riastradh Exp $ */
/* $NetBSD: spec_vnops.c,v 1.202 2022/03/28 12:36:51 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@ -58,7 +58,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.201 2022/03/28 12:36:42 riastradh Exp $");
__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.202 2022/03/28 12:36:51 riastradh Exp $");
#include <sys/param.h>
#include <sys/proc.h>
@ -81,6 +81,7 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.201 2022/03/28 12:36:42 riastradh E
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/module.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
@ -173,6 +174,7 @@ const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ &spec_vnodeop_p, spec_vnodeop_entries };
static kauth_listener_t rawio_listener;
static struct kcondvar specfs_iocv;
/* Returns true if vnode is /dev/mem or /dev/kmem. */
bool
@ -218,6 +220,141 @@ spec_init(void)
rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
rawio_listener_cb, NULL);
cv_init(&specfs_iocv, "specio");
}
/*
* spec_io_enter(vp, &sn, &dev)
*
* Enter an operation that may not hold vp's vnode lock or an
* fstrans on vp's mount. Until spec_io_exit, the vnode will not
* be revoked.
*
* On success, set sn to the specnode pointer and dev to the dev_t
* number and return zero. Caller must later call spec_io_exit
* when done.
*
* On failure, return ENXIO -- the device has been revoked and no
* longer exists.
*/
static int
spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
{
dev_t dev;
struct specnode *sn;
unsigned iocnt;
int error = 0;
mutex_enter(vp->v_interlock);
/*
* Extract all the info we need from the vnode, unless the
* vnode has already been reclaimed. This can happen if the
* underlying device has been removed and all the device nodes
* for it have been revoked. The caller may not hold a vnode
* lock or fstrans to prevent this from happening before it has
* had an opportunity to notice the vnode is dead.
*/
if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
(sn = vp->v_specnode) == NULL ||
(dev = vp->v_rdev) == NODEV) {
error = ENXIO;
goto out;
}
/*
* Notify spec_close that we are doing an I/O operation which
* may not be bracketed by fstrans(9) and thus is not
* blocked by vfs suspension.
*
* We could hold this reference with psref(9) instead, but we
* already have to take the interlock for vdead_check, so
* there's not much more cost here to another atomic operation.
*/
do {
iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
if (__predict_false(iocnt == UINT_MAX)) {
/*
* The I/O count is limited by the number of
* LWPs (which will never overflow this) --
* unless one driver uses another driver via
* specfs, which is rather unusual, but which
* could happen via pud(4) userspace drivers.
* We could use a 64-bit count, but can't use
* atomics for that on all platforms.
* (Probably better to switch to psref or
* localcount instead.)
*/
error = EBUSY;
goto out;
}
} while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
!= iocnt);
/* Success! */
*snp = sn;
*devp = dev;
error = 0;
out: mutex_exit(vp->v_interlock);
return error;
}
/*
* spec_io_exit(vp, sn)
*
* Exit an operation entered with a successful spec_io_enter --
* allow concurrent spec_node_revoke to proceed. The argument sn
* must match the struct specnode pointer returned by spec_io_enter
* for vp.
*/
static void
spec_io_exit(struct vnode *vp, struct specnode *sn)
{
struct specdev *sd = sn->sn_dev;
unsigned iocnt;
KASSERT(vp->v_specnode == sn);
/*
* We are done. Notify spec_close if appropriate. The
* transition of 1 -> 0 must happen under device_lock so
* spec_close doesn't miss a wakeup.
*/
do {
iocnt = atomic_load_relaxed(&sd->sd_iocnt);
KASSERT(iocnt > 0);
if (iocnt == 1) {
mutex_enter(&device_lock);
if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
break;
}
} while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
}
/*
* spec_io_drain(sd)
*
* Wait for all existing spec_io_enter/exit sections to complete.
* Caller must ensure spec_io_enter will fail at this point.
*/
static void
spec_io_drain(struct specdev *sd)
{
/*
* I/O at the same time as closing is unlikely -- it often
* indicates an application bug.
*/
if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
return;
mutex_enter(&device_lock);
while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
cv_wait(&specfs_iocv, &device_lock);
mutex_exit(&device_lock);
}
/*
@ -258,6 +395,7 @@ spec_node_init(vnode_t *vp, dev_t rdev)
sd->sd_refcnt = 1;
sd->sd_opencnt = 0;
sd->sd_bdevvp = NULL;
sd->sd_iocnt = 0;
sd->sd_opened = false;
sn->sn_dev = sd;
sd = NULL;
@ -472,6 +610,7 @@ spec_node_destroy(vnode_t *vp)
/* If the device is no longer in use, destroy our record. */
if (refcnt == 1) {
KASSERT(sd->sd_iocnt == 0);
KASSERT(sd->sd_opencnt == 0);
KASSERT(sd->sd_bdevvp == NULL);
kmem_free(sd, sizeof(*sd));
@ -509,29 +648,26 @@ spec_open(void *v)
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct lwp *l;
struct vnode *vp;
struct lwp *l = curlwp;
struct vnode *vp = ap->a_vp;
dev_t dev;
int error;
enum kauth_device_req req;
specnode_t *sn;
specdev_t *sd;
spec_ioctl_t ioctl;
u_int gen;
const char *name;
u_int gen = 0;
const char *name = NULL;
bool needclose = false;
struct partinfo pi;
l = curlwp;
vp = ap->a_vp;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
vp->v_type);
dev = vp->v_rdev;
sn = vp->v_specnode;
sd = sn->sn_dev;
name = NULL;
gen = 0;
KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
vp->v_type);
/*
* Don't allow open if fs is mounted -nodev.
@ -797,6 +933,8 @@ spec_read(void *v)
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
@ -819,9 +957,27 @@ spec_read(void *v)
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_read, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the read, take an I/O reference
* to the specnode so close will know when we're done
* reading. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = cdev_read(vp->v_rdev, uio, ap->a_ioflag);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_read(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: vn_lock(vp, LK_SHARED | LK_RETRY);
return (error);
case VBLK:
@ -898,6 +1054,8 @@ spec_write(void *v)
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
@ -913,9 +1071,27 @@ spec_write(void *v)
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_write, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the write, take an I/O reference
* to the specnode so close will know when we're done
* writing. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = cdev_write(vp->v_rdev, uio, ap->a_ioflag);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_write(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return (error);
case VBLK:
@ -973,10 +1149,11 @@ spec_fdiscard(void *v)
off_t a_pos;
off_t a_len;
} */ *ap = v;
struct vnode *vp;
struct vnode *vp = ap->a_vp;
dev_t dev;
vp = ap->a_vp;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
dev = vp->v_rdev;
switch (vp->v_type) {
@ -1006,40 +1183,32 @@ spec_ioctl(void *v)
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
/*
* Extract all the info we need from the vnode, taking care to
* avoid a race with VOP_REVOKE().
*/
vp = ap->a_vp;
dev = NODEV;
mutex_enter(vp->v_interlock);
if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) {
dev = vp->v_rdev;
}
mutex_exit(vp->v_interlock);
if (dev == NODEV) {
return ENXIO;
}
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
return cdev_ioctl(dev, ap->a_command, ap->a_data,
error = cdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
return bdev_ioctl(dev, ap->a_command, ap->a_data,
error = bdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
default:
panic("spec_ioctl");
/* NOTREACHED */
}
spec_io_exit(vp, sn);
return error;
}
/* ARGSUSED */
@ -1050,33 +1219,25 @@ spec_poll(void *v)
struct vnode *a_vp;
int a_events;
} */ *ap = v;
struct vnode *vp;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int revents;
/*
* Extract all the info we need from the vnode, taking care to
* avoid a race with VOP_REVOKE().
*/
vp = ap->a_vp;
dev = NODEV;
mutex_enter(vp->v_interlock);
if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) {
dev = vp->v_rdev;
}
mutex_exit(vp->v_interlock);
if (dev == NODEV) {
if (spec_io_enter(vp, &sn, &dev) != 0)
return POLLERR;
}
switch (vp->v_type) {
case VCHR:
return cdev_poll(dev, ap->a_events, curlwp);
revents = cdev_poll(dev, ap->a_events, curlwp);
break;
default:
return (genfs_poll(v));
revents = genfs_poll(v);
break;
}
spec_io_exit(vp, sn);
return revents;
}
/* ARGSUSED */
@ -1087,20 +1248,30 @@ spec_kqfilter(void *v)
struct vnode *a_vp;
struct proc *a_kn;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
switch (ap->a_vp->v_type) {
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
dev = ap->a_vp->v_rdev;
return cdev_kqfilter(dev, ap->a_kn);
error = cdev_kqfilter(dev, ap->a_kn);
break;
default:
/*
* Block devices don't support kqfilter, and refuse it
* for any other files (like those vflush()ed) too.
*/
return (EOPNOTSUPP);
error = EOPNOTSUPP;
break;
}
spec_io_exit(vp, sn);
return error;
}
/*
@ -1115,11 +1286,19 @@ spec_mmap(void *v)
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(vp->v_type == VBLK);
if (bdev_type(vp->v_rdev) != D_DISK)
return EINVAL;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
error = bdev_type(dev) == D_DISK ? 0 : EINVAL;
spec_io_exit(vp, sn);
return 0;
}
@ -1164,27 +1343,14 @@ spec_strategy(void *v)
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct buf *bp = ap->a_bp;
struct specnode *sn = NULL;
dev_t dev;
int error;
dev = NODEV;
/*
* Extract all the info we need from the vnode, taking care to
* avoid a race with VOP_REVOKE().
*/
mutex_enter(vp->v_interlock);
if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) {
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
dev = vp->v_rdev;
}
mutex_exit(vp->v_interlock);
if (dev == NODEV) {
error = ENXIO;
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
}
bp->b_dev = dev;
if (!(bp->b_flags & B_READ)) {
@ -1204,13 +1370,15 @@ spec_strategy(void *v)
}
bdev_strategy(bp);
return 0;
out:
bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
error = 0;
out: if (sn)
spec_io_exit(vp, sn);
if (error) {
bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
return error;
}
@ -1281,14 +1449,17 @@ spec_close(void *v)
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct session *sess;
dev_t dev = vp->v_rdev;
dev_t dev;
int flags = ap->a_fflag;
int mode, error, count;
specnode_t *sn;
specdev_t *sd;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
mutex_enter(vp->v_interlock);
sn = vp->v_specnode;
dev = vp->v_rdev;
sd = sn->sn_dev;
/*
* If we're going away soon, make this non-blocking.
@ -1421,6 +1592,12 @@ spec_close(void *v)
else
error = cdev_close(dev, flags, mode, curlwp);
/*
* Wait for all other devsw operations to drain. After this
* point, no bdev/cdev_* can be active for this specdev.
*/
spec_io_drain(sd);
if (!(flags & FNONBLOCK))
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

specdev.h

@ -1,4 +1,4 @@
/* $NetBSD: specdev.h,v 1.48 2022/03/28 12:36:42 riastradh Exp $ */
/* $NetBSD: specdev.h,v 1.49 2022/03/28 12:36:51 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@ -78,6 +78,7 @@ typedef struct specdev {
u_int sd_opencnt; /* # of opens; close when ->0 */
u_int sd_refcnt; /* # of specnodes referencing this */
dev_t sd_rdev;
volatile u_int sd_iocnt; /* # bdev/cdev_* operations active */
bool sd_opened; /* true if successfully opened */
} specdev_t;
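
A closing note on the synchronization above: the reason spec_io_exit
performs the final 1 -> 0 decrement of sd_iocnt under device_lock, and
spec_io_drain re-checks the count under the same lock before sleeping,
is the classic missed-wakeup race.  A condensed sketch of the two sides
(the function names here are illustrative, not from the commit):

	/*
	 * Last-reference path of spec_io_exit: decrement and broadcast
	 * while holding device_lock.
	 */
	static void
	iocnt_release_last(struct specdev *sd)
	{
		mutex_enter(&device_lock);
		if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
			cv_broadcast(&specfs_iocv);
		mutex_exit(&device_lock);
	}

	/*
	 * Body of spec_io_drain: check-then-wait under the same lock.
	 * Without the shared lock, the broadcast could fire between the
	 * waiter's load of sd_iocnt and its cv_wait(), and the wakeup
	 * would be lost; with it, the waiter either sees zero or is
	 * already on the condvar when the broadcast arrives.
	 */
	static void
	iocnt_wait_for_zero(struct specdev *sd)
	{
		mutex_enter(&device_lock);
		while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
			cv_wait(&specfs_iocv, &device_lock);
		mutex_exit(&device_lock);
	}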