fix for per-mount i/o counting in ffs

Fri May 20 15:36:56 UTC 2016

On Fri, May 20, 2016 at 05:26:54PM +0300, Konstantin Belousov wrote:
> On Fri, May 20, 2016 at 09:22:08PM +1000, Bruce Evans wrote:
> > On Fri, 20 May 2016, Konstantin Belousov wrote:
> > 
> > > On Fri, May 20, 2016 at 09:27:38AM +1000, Bruce Evans wrote:
> > >>> diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
> > >>> index 712fc21..21425f5 100644
> > >>> --- a/sys/ufs/ffs/ffs_vfsops.c
> > >>> +++ b/sys/ufs/ffs/ffs_vfsops.c
> > >>> @@ -764,24 +764,29 @@ ffs_mountfs(devvp, mp, td)
> > >>> 	cred = td ? td->td_ucred : NOCRED;
> > >>> 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
> > >>>
> > >>> +	KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
> > >>> 	dev = devvp->v_rdev;
> > >>> 	dev_ref(dev);
> > >>> +	if (!atomic_cmpset_ptr(&dev->si_mountpt, 0, mp)) {
> > >>> +		dev_rel(dev);
> > >>> +		VOP_UNLOCK(devvp, 0);
> > >>> +		return (EBUSY);
> > >>> +	}
> > >>
> > >> This is cleaner and safer than my version.
> > >>
> > >>> 	DROP_GIANT();
> > >>> 	g_topology_lock();
> > >>> 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
> > >>
> > >> g_vfs_open() already sets devvp->v_bufobj.bo_ops to g_vfs_bufops unless
> > >> it fails.  This clobbered our setting in the buggy multiple-mount case.
> > >> But with multiple mounts not allowed, this cleans up any garbage in
> > >> v_bufobj.
> > > Yes, and this orders things.  g_vfs_open() shoudl have devvp locked,
> > > both fo bo manipulations and for vnode_create_vobject() call.
> > > We can only assign to bo_ops after g_vfs_open() was done successfully.
> > 
> > The atomic cmpset now orders things too.  Is that enough?  It ensures
> > that an old mount cannot be active.  I don't know if v_bufobj is used
> > for non-mounts.
> v_bufobj is logically protected against modifications by the vnode lock.
> 
> > 
> > Except, for zfs there is no g_vfs_open() to order things, and for all
> > other file systems there is no atomic cmpset yet.
> > 
> > >> g_vfs_open() has 2 failures for non-exclusive access.  It starts by
> > >> checking v_bufobj.bo_private == devvp (this is after translating its
> > >> pointers to the ones passed here).  This is avg's fix for the multiple-
> > >> mounts problem (r206130).  It doesn't work in all cases.  I think this
> > >> is unecessary now.
> > > At least it weeds out other devfs mounts.
> > 
> > Yes, we need it until everything is converted.
> > 
> > >> ...
> > >> I now see another cleanup: don't goto out when g_vfs_open() fails.  This
> > >> depends on it setting cp to NULL and leaving nothing to clean when it
> > >> fails.  It has no man page and this detail is documented in its source
> > >> code.
> > > Then I would need to add another NULL assignment, VOP_UNLOCK etc.
> > 
> > g_vfs_open() already sets cp to NULL when it fails, and the cleanup
> > depends on that now, but it is just as good to depend on no cleanup
> > being needed on failure.  You do need another dev_rel().
> > 
> > I thought about moving the dev_ref() later to simplify the early returns.
> > I thought that this didn't quite work.  Now I think it does work, for
> > obvious reasons:
> > - the device is attached to a vnode, so it is referenced to prevent it
> >    going away unless the device is revoked.  It seems to be referenced
> >    at least 3 times in FreeBSD-9.
> > - the vnode is locked, so the reference count remains > 0 until we unlock.
> > So we just need a dev_ref() before the unlock in the non-error case, to
> > keep the device from going away if it is revoked.
> Yes, and this is how the current patched code is structured.
> 
> > 
> > > Updated patch to add acq/rel.
> > >
> > > diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
> > > index 712fc21..670bb15 100644
> > > --- a/sys/ufs/ffs/ffs_vfsops.c
> > > +++ b/sys/ufs/ffs/ffs_vfsops.c
> > > @@ -764,24 +764,29 @@ ffs_mountfs(devvp, mp, td)
> > > 	cred = td ? td->td_ucred : NOCRED;
> > > 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
> > >
> > > +	KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
> > 
> > Hrmph.
> I want this, it would remove amount of obvious questions.
> 
> > 
> > > 	dev = devvp->v_rdev;
> > > 	dev_ref(dev);
> > 
> > Move later...
> > 
> > > +	if (atomic_cmpset_acq_ptr(&dev->si_mountpt, 0, mp) != 0) {
> > 
> > I changed the first 0 to NULL, and this works on i386, but now I remember
> > that i386 has bogus casts which break detection of type mismatches --
> > the atomic ptr functions take a [u]intptr_t, not a pointer type, so
> > NULL won't work if it is ((void *)0).  At least amd64 is still missing
> > this bug.
> cmpset_<bar>_ptr() on i386 has cast for old and new parameters to u_int.
> store_rel_ptr() on i386 does not cast value to u_int.  As result, NULL
> is acceptable for cmpset, but not for store.  I spelled it 0 in all cases.
> 
> Hm, I also should add uintptr_t cast for cmpset, otherwise, I suspect,
> some arch might be broken.
Even more casts are needed, updated patch is below.

> 
> > 
> > > +		dev_rel(dev);
> > 
> > ...then this dev_rel() is not needed.
> > 
> > > +		VOP_UNLOCK(devvp, 0);
> > > +		return (EBUSY);
> > > +	}
> > > 	DROP_GIANT();
> > > 	g_topology_lock();
> > > 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
> > > 	g_topology_unlock();
> > > 	PICKUP_GIANT();
> > > -	VOP_UNLOCK(devvp, 0);
> > > -	if (error)
> > > +	if (error != 0) {
> > > +		VOP_UNLOCK(devvp, 0);
> > > 		goto out;
> > 
> > This becomes:
> > 
> >  	if (error != 0) {
> >  		VOP_UNLOCK(devvp, 0);
> >  		return (EBUSY);
> >  	}
> > 
> > Then assign v_bufobj.
> > 
> > Then dev_ref(), just in time for unlocking.
> > 
> > Then unlock.
> Ok.
> 
> > 
> > > -	if (devvp->v_rdev->si_iosize_max != 0)
> > > -		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
> > > +	}
> > > +	if (dev->si_iosize_max != 0)
> > > +		mp->mnt_iosize_max = dev->si_iosize_max;
> > > 	if (mp->mnt_iosize_max > MAXPHYS)
> > > 		mp->mnt_iosize_max = MAXPHYS;
> > > -
> > > 	devvp->v_bufobj.bo_ops = &ffs_ops;
> > > -	if (devvp->v_type == VCHR)
> > > -		devvp->v_rdev->si_mountpt = mp;
> > > +	VOP_UNLOCK(devvp, 0);
> > 
> > This belongs earlier.
> > 
> > >
> > > 	fs = NULL;
> > > 	sblockloc = 0;
> > > ...
> > 
> > We need this in a central function.  g_vfs_open/close() can do it for
> > all cases except zfs.  This looks like:
> I might look at this later.
> 
> > 
> >  	DROP_GIANT();
> >  	g_topology_lock();
> >  	// atomic_cmpset and its error = EBUSY moved to top of g_vfs_open()
> >  	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
> >  	g_topology_unlock();
> >  	PICKUP_GIANT();
> >  	if (error != 0) {
> >  		VOP_UNLOCK(devvp, 0);
> >  		return (error);
> >  	}
> >  	devvp->v_bufobj.bo_ops = &ffs_ops;
> >  	dev_ref(dev);
> >  	VOP_UNLOCK(devvp, 0);
> >  	if (dev->si_iosize_max != 0)
> >  		mp->mnt_iosize_max = dev->si_iosize_max;
> >  	if (mp->mnt_iosize_max > MAXPHYS)
> >  		mp->mnt_iosize_max = MAXPHYS;
> > 
> > where 2 of 2  lines with GIANT and 3 of 4 lines with iosize_max remain to
> > be cleaned up.
> > 
> > Resetting si_mountpt in g_vfs_close() is even simpler.  Oops, it also has
> > to be reset in g_vfs_open() on a later failure there.

diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 712fc21..0487c2f 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -764,25 +764,31 @@ ffs_mountfs(devvp, mp, td)
 	cred = td ? td->td_ucred : NOCRED;
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
+	KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
 	dev = devvp->v_rdev;
-	dev_ref(dev);
+	if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
+	    (uintptr_t)mp) != 0) {
+		VOP_UNLOCK(devvp, 0);
+		return (EBUSY);
+	}
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
 	g_topology_unlock();
 	PICKUP_GIANT();
+	if (error != 0) {
+		VOP_UNLOCK(devvp, 0);
+		atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
+		return (error);
+	}
+	dev_ref(dev);
+	devvp->v_bufobj.bo_ops = &ffs_ops;
 	VOP_UNLOCK(devvp, 0);
-	if (error)
-		goto out;
-	if (devvp->v_rdev->si_iosize_max != 0)
-		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
+	if (dev->si_iosize_max != 0)
+		mp->mnt_iosize_max = dev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
-	devvp->v_bufobj.bo_ops = &ffs_ops;
-	if (devvp->v_type == VCHR)
-		devvp->v_rdev->si_mountpt = mp;
-
 	fs = NULL;
 	sblockloc = 0;
 	/*
@@ -1083,8 +1089,6 @@ ffs_mountfs(devvp, mp, td)
 out:
 	if (bp)
 		brelse(bp);
-	if (devvp->v_type == VCHR && devvp->v_rdev != NULL)
-		devvp->v_rdev->si_mountpt = NULL;
 	if (cp != NULL) {
 		DROP_GIANT();
 		g_topology_lock();
@@ -1102,6 +1106,7 @@ out:
 		free(ump, M_UFSMNT);
 		mp->mnt_data = NULL;
 	}
+	atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 	dev_rel(dev);
 	return (error);
 }
@@ -1287,8 +1292,7 @@ ffs_unmount(mp, mntflags)
 	g_vfs_close(ump->um_cp);
 	g_topology_unlock();
 	PICKUP_GIANT();
-	if (ump->um_devvp->v_type == VCHR && ump->um_devvp->v_rdev != NULL)
-		ump->um_devvp->v_rdev->si_mountpt = NULL;
+	atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
 	vrele(ump->um_devvp);
 	dev_rel(ump->um_dev);
 	mtx_destroy(UFS_MTX(ump));