Re: git: 8ef0c11e7ce7 - main - nfsclient: upgrade vnode lock in VOP_OPEN()/VOP_CLOSE() if we need to flush buffers

From: Konstantin Belousov <kostikbel_at_gmail.com>
Date: Wed, 24 Nov 2021 03:12:19 UTC
On Wed, Nov 24, 2021 at 01:19:18PM +1100, Peter Jeremy wrote:
> On 2021-Nov-23 11:19:21 +0200, Konstantin Belousov <kostikbel@gmail.com> wrote:
> >On Tue, Nov 23, 2021 at 07:09:08PM +1100, Peter Jeremy wrote:
> >> On 2021-Nov-16 17:14:55 +0000, Konstantin Belousov <kib@FreeBSD.org> wrote:
> >> >    nfsclient: upgrade vnode lock in VOP_OPEN()/VOP_CLOSE() if we need to flush buffers
> >> >    
> >> >    VOP_FSYNC() asserts that the vnode is exclusively locked for NFS.
> >> >    If we try to execute file with recently modified content, the assert is
> >> >    triggered.
> >> 
> >> I have a diskless arm64 system configured with swap over NFS and I'm
> >> now consistently getting a panic during shutdown.  I haven't
> >> specificially confirmed that it's this commit but the content is
> >> suggestive.
> >> 
> >> panic: upgrade of unlocked lock (lockmgr) nfs @ /usr/src/sys/fs/nfsclient/nfs_clvnops.c:855
> >> cpuid = 3
> >> time = 1637166551
> >> KDB: stack backtrace:
> >> db_trace_self() at db_trace_self
> >> db_trace_self_wrapper() at db_trace_self_wrapper+0x30
> >> vpanic() at vpanic+0x178
> >> panic() at panic+0x44
> >> witness_upgrade() at witness_upgrade+0x104
> >> lockmgr_upgrade() at lockmgr_upgrade+0x164
> >> nfs_lock() at nfs_lock+0x2c
> >> vop_sigdefer() at vop_sigdefer+0x30
> >> _vn_lock() at _vn_lock+0x54
> >> nfs_close() at nfs_close+0xc8
> >> vop_sigdefer() at vop_sigdefer+0x30
> >> VOP_CLOSE_APV() at VOP_CLOSE_APV+0x2c
> >> swapdev_close() at swapdev_close+0x3c
> >> swapoff_one() at swapoff_one+0x598
> >> sys_swapoff() at sys_swapoff+0x12c
> >> do_el0_sync() at do_el0_sync+0x498
> >> handle_el0_sync() at handle_el0_sync+0x90
> >> --- exception, esr 0x56000000
> >> 
> >> I presume this isn't intended.  Can you suggest where I should start
> >> looking for the problem?
> >
> >Try this please.  It might be also useful to enable DEBUG_VFS_LOCKS in your
> >kernel config, to catch all related issues once.
> 
> Thanks for the rapid response.  I've found that DEBUG_VFS_LOCKS also
> requires INVARIANTS.  That now catches swapon as well:
> KDB: stack backtrace:
> db_trace_self() at db_trace_self
> db_trace_self_wrapper() at db_trace_self_wrapper+0x30
> assert_vop_locked() at assert_vop_locked+0x58
> VOP_GETATTR_APV() at VOP_GETATTR_APV+0x4c
> sys_swapon() at sys_swapon+0x2c0
> do_el0_sync() at do_el0_sync+0x4a4
> handle_el0_sync() at handle_el0_sync+0x90
> --- exception, esr 0x56000000
> vnode 0xffffa000065511c0: type VREG
>     usecount 1, writecount 0, refcount 1 seqc users 0
>     hold count flags ()
>     flags (VV_VMSIZEVNLOCK)
>     lock type nfs: UNLOCKED
>         fileid 30984 fsid 0x3a3a00ff01
> VOP_GETATTR: 0xffffa000065511c0 is not locked but should be
> KDB: enter: lock violation
> [ thread pid 1027 tid 100159 ]
> Stopped at      kdb_enter+0x48: undefined       f900c11f
> 
> I will dig into that further this evening.

Try this (combined two patches).

diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 9bc506c9b6b8..cecf5d94ee1f 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -2366,8 +2366,8 @@ sys_swapon(struct thread *td, struct swapon_args *uap)
 		goto done;
 	}
 
-	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->name, td);
+	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
+	    UIO_USERSPACE, uap->name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
@@ -2387,8 +2387,10 @@ sys_swapon(struct thread *td, struct swapon_args *uap)
 		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 	}
 
-	if (error)
-		vrele(vp);
+	if (error != 0)
+		vput(vp);
+	else
+		VOP_UNLOCK(vp);
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
@@ -3012,7 +3014,7 @@ swapongeom(struct vnode *vp)
 {
 	int error;
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	ASSERT_VOP_ELOCKED(vp, "swapongeom");
 	if (vp->v_type != VCHR || VN_IS_DOOMED(vp)) {
 		error = ENOENT;
 	} else {
@@ -3020,7 +3022,6 @@ swapongeom(struct vnode *vp)
 		error = swapongeom_locked(vp->v_rdev, vp);
 		g_topology_unlock();
 	}
-	VOP_UNLOCK(vp);
 	return (error);
 }
 
@@ -3057,9 +3058,9 @@ swapdev_strategy(struct buf *bp, struct swdevt *sp)
 static void
 swapdev_close(struct thread *td, struct swdevt *sp)
 {
-
+	vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
-	vrele(sp->sw_vp);
+	vput(sp->sw_vp);
 }
 
 static int
@@ -3068,6 +3069,7 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 	struct swdevt *sp;
 	int error;
 
+	ASSERT_VOP_ELOCKED(vp, "swaponvp");
 	if (nblks == 0)
 		return (ENXIO);
 	mtx_lock(&sw_dev_mtx);
@@ -3079,14 +3081,12 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 	}
 	mtx_unlock(&sw_dev_mtx);
 
-	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
-	(void) VOP_UNLOCK(vp);
-	if (error)
+	if (error != 0)
 		return (error);
 
 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,