svn commit: r367672 - in head/sys/ufs: ffs ufs

Konstantin Belousov kib at FreeBSD.org
Sat Nov 14 05:30:13 UTC 2020


Author: kib
Date: Sat Nov 14 05:30:10 2020
New Revision: 367672
URL: https://svnweb.freebsd.org/changeset/base/367672

Log:
  Handle LoR in flush_pagedep_deps().
  
  When operating in SU or SU+J mode, ffs_syncvnode() might need to
  instantiate other vnode by inode number while owning syncing vnode
  lock.  Typically this other vnode is the parent of our vnode, but due
  to renames occurring right before fsync (or during fsync when we drop
  the syncing vnode lock, see below) it might no longer be the parent.
  
  Moreover, the called function flush_pagedep_deps() needs to lock other
  vnode while owning the lock for vnode which owns the buffer, for which
  the dependencies are flushed.  This creates another instance of the
  same LoR as was fixed in softdep_sync().
  
  Put the generic code for safe relocking into new SU helper
  get_parent_vp() and use it in flush_pagedep_deps().  The case for safe
  relocking of two vnodes with undefined lock order was extracted into
  vn helper vn_lock_pair().
  
  Due to call sequence
       ffs_syncvnode()->softdep_sync_buf()->flush_pagedep_deps(),
  ffs_syncvnode() indicates with ERELOOKUP that the passed vnode was
  unlocked in the process, and can return ENOENT if the passed vnode
  was reclaimed.  All callers of the function were inspected.
  
  Because UFS namei lookups store auxiliary information about the
  directory entry in the in-memory directory inode, and this information
  is then used by UFS code that creates/removes the directory entry in
  the actual
  mutating VOPs, it is critical that directory vnode lock is not dropped
  between lookup and VOP.  For softdep_prelink(), which ensures that
  later link/unlink operation can proceed without overflowing the
  journal, calls were moved to the place where it is safe to abandon
  processing of the VOP because mutations are not yet applied.  Then, ERELOOKUP
  causes restart of the whole VFS operation (typically VFS syscall) at
  top level, including the re-lookup of the involved paths.  [Note that
  we already do the same restart for failing calls to vn_start_write(),
  so formally this patch does not introduce new behavior.]
  
  Similarly, unsafe calls to fsync in snapshot creation code were
  plugged.  A possible view on these failures is that it does not make
  sense to continue creating snapshot if the snapshot vnode was
  reclaimed due to forced unmount.
  
  It is possible that relock/ERELOOKUP situation occurs in
  ffs_truncate() called from ufs_inactive().  In this case, dropping the
  vnode lock is not safe.  Detect the situation with VI_DOINGINACT and
  reschedule inactivation by setting VI_OWEINACT.  ufs_inactive()
  rechecks VI_OWEINACT and avoids reclaiming the vnode if truncation failed
  this way.
  
  In ffs_truncate(), allocation of the EOF block for partial truncation
  is re-done after vnode is synced, since we cannot leave the buffer
  locked through ffs_syncvnode().
  
  In collaboration with:	pho
  Reviewed by:	mckusick (previous version), markj
  Tested by:	markj (syzkaller), pho
  Sponsored by:	The FreeBSD Foundation
  Differential revision:	https://reviews.freebsd.org/D26136

Modified:
  head/sys/ufs/ffs/ffs_extern.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_snapshot.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ffs/ffs_vnops.c
  head/sys/ufs/ufs/ufs_inode.c
  head/sys/ufs/ufs/ufs_lookup.c
  head/sys/ufs/ufs/ufs_vnops.c

Modified: head/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- head/sys/ufs/ffs/ffs_extern.h	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ffs/ffs_extern.h	Sat Nov 14 05:30:10 2020	(r367672)
@@ -173,6 +173,9 @@ void	softdep_load_inodeblock(struct inode *);
 void	softdep_freefile(struct vnode *, ino_t, int);
 int	softdep_request_cleanup(struct fs *, struct vnode *,
 	    struct ucred *, int);
+int	softdep_prerename(struct vnode *, struct vnode *, struct vnode *,
+	    struct vnode *);
+int	softdep_prelink(struct vnode *, struct vnode *, int);
 void	softdep_setup_freeblocks(struct inode *, off_t, int);
 void	softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int);
 void	softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ffs/ffs_inode.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -462,6 +462,8 @@ ffs_truncate(vp, length, flags, cred)
 		error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
 		if (error)
 			return (error);
+		ffs_inode_bwrite(vp, bp, flags);
+
 		/*
 		 * When we are doing soft updates and the UFS_BALLOC
 		 * above fills in a direct block hole with a full sized
@@ -473,6 +475,10 @@ ffs_truncate(vp, length, flags, cred)
 		if (DOINGSOFTDEP(vp) && lbn < UFS_NDADDR &&
 		    fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
 		    (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
+			return (error);
+
+		error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
+		if (error)
 			return (error);
 		ip->i_size = length;
 		DIP_SET(ip, i_size, length);

Modified: head/sys/ufs/ffs/ffs_snapshot.c
==============================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ffs/ffs_snapshot.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -301,6 +301,8 @@ restart:
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vn_finished_write(wrtmp);
 		vrele(nd.ni_dvp);
+		if (error == ERELOOKUP)
+			goto restart;
 		return (error);
 	}
 	vp = nd.ni_vp;
@@ -368,8 +370,12 @@ restart:
 		if (error)
 			goto out;
 		bawrite(nbp);
-		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT, 0);
+		if (cg % 10 == 0) {
+			error = ffs_syncvnode(vp, MNT_WAIT, 0);
+			/* vp possibly reclaimed if unlocked */
+			if (error != 0)
+				goto out;
+		}
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
@@ -391,8 +397,8 @@ restart:
 			goto out;
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
-		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT, 0);
+		if (cg % 10 == 0 && error == 0)
+			error = ffs_syncvnode(vp, MNT_WAIT, 0);
 		if (error)
 			goto out;
 	}

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ffs/ffs_softdep.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -609,6 +609,27 @@ softdep_freework(wkhd)
 	panic("softdep_freework called");
 }
 
+int
+softdep_prerename(fdvp, fvp, tdvp, tvp)
+	struct vnode *fdvp;
+	struct vnode *fvp;
+	struct vnode *tdvp;
+	struct vnode *tvp;
+{
+
+	panic("softdep_prerename called");
+}
+
+int
+softdep_prelink(dvp, vp, will_direnter)
+	struct vnode *dvp;
+	struct vnode *vp;
+	int will_direnter;
+{
+
+	panic("softdep_prelink called");
+}
+
 #else
 
 FEATURE(softupdates, "FFS soft-updates support");
@@ -748,7 +769,7 @@ static	void unlinked_inodedep(struct mount *, struct i
 static	void clear_unlinked_inodedep(struct inodedep *);
 static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 static	int flush_pagedep_deps(struct vnode *, struct mount *,
-	    struct diraddhd *);
+	    struct diraddhd *, struct buf *);
 static	int free_pagedep(struct pagedep *);
 static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
@@ -925,7 +946,6 @@ static	void journal_unmount(struct ufsmount *);
 static	int journal_space(struct ufsmount *, int);
 static	void journal_suspend(struct ufsmount *);
 static	int journal_unsuspend(struct ufsmount *ump);
-static	void softdep_prelink(struct vnode *, struct vnode *);
 static	void add_to_journal(struct worklist *);
 static	void remove_from_journal(struct worklist *);
 static	bool softdep_excess_items(struct ufsmount *, int);
@@ -1390,6 +1410,136 @@ SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CT
 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
 
 /*
+ * This function fetches inode inum on mount point mp.  We already
+ * hold a locked vnode vp, and might have a locked buffer bp belonging
+ * to vp.
+
+ * We must not block on acquiring the new inode lock as we will get
+ * into a lock-order reversal with the buffer lock and possibly get a
+ * deadlock.  Thus if we cannot instantiate the requested vnode
+ * without sleeping on its lock, we must unlock the vnode and the
+ * buffer before doing a blocking on the vnode lock.  We return
+ * ERELOOKUP if we have had to unlock either the vnode or the buffer so
+ * that the caller can reassess its state.
+ *
+ * Top-level VFS code (for syscalls and other consumers, e.g. callers
+ * of VOP_FSYNC() in syncer) check for ERELOOKUP and restart at safe
+ * point.
+ *
+ * Since callers expect to operate on fully constructed vnode, we also
+ * recheck v_data after relock, and return ENOENT if NULL.
+ *
+ * If unlocking bp, we must unroll dequeueing its unfinished
+ * dependencies, and clear scan flag, before unlocking.  If unlocking
+ * vp while it is under deactivation, we re-queue deactivation.
+ */
+static int
+get_parent_vp(struct vnode *vp, struct mount *mp, ino_t inum, struct buf *bp,
+    struct diraddhd *diraddhdp, struct diraddhd *unfinishedp,
+    struct vnode **rvp)
+{
+	struct vnode *pvp;
+	struct diradd *dap;
+	int error;
+	bool bplocked;
+
+	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked");
+	for (bplocked = true, pvp = NULL;;) {
+		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
+		    FFSV_FORCEINSMQ);
+		if (error == 0) {
+			/*
+			 * Since we could have unlocked vp, the inode
+			 * number could no longer indicate a
+			 * constructed node.  In this case, we must
+			 * restart the syscall.
+			 */
+			if (VTOI(pvp)->i_mode == 0 || !bplocked) {
+				if (VTOI(pvp)->i_mode == 0)
+					vgone(pvp);
+				vput(pvp);
+				error = ERELOOKUP;
+				goto out;
+			}
+
+			error = 0;
+			goto out1;
+		}
+		if (bp != NULL && bplocked) {
+			/*
+			 * Requeue unfinished dependencies before
+			 * unlocking buffer, which could make
+			 * diraddhdp invalid.
+			 */
+			ACQUIRE_LOCK(VFSTOUFS(mp));
+			while ((dap = LIST_FIRST(unfinishedp)) != NULL) {
+				LIST_REMOVE(dap, da_pdlist);
+				LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
+			}
+			FREE_LOCK(VFSTOUFS(mp));
+			bp->b_vflags &= ~BV_SCANNED;
+			BUF_NOREC(bp);
+			BUF_UNLOCK(bp);
+			bplocked = false;
+		}
+
+		/*
+		 * Do not drop vnode lock while inactivating.  This
+		 * would result in leaks of the VI flags and
+		 * reclaiming of non-truncated vnode.  Instead,
+		 * re-schedule inactivation hoping that we would be
+		 * able to sync inode later.
+		 */
+		if ((vp->v_iflag & VI_DOINGINACT) != 0) {
+			VI_LOCK(vp);
+			vp->v_iflag |= VI_OWEINACT;
+			VI_UNLOCK(vp);
+			return (ERELOOKUP);
+		}
+
+		VOP_UNLOCK(vp);
+		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
+		    FFSV_FORCEINSMQ);
+		if (error != 0) {
+			MPASS(error != ERELOOKUP);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+			break;
+		}
+		if (VTOI(pvp)->i_mode == 0) {
+			vgone(pvp);
+			vput(pvp);
+			pvp = NULL;
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+			error = ERELOOKUP;
+			break;
+		}
+		error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
+		if (error == 0)
+			break;
+		vput(pvp);
+		pvp = NULL;
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		if (vp->v_data == NULL) {
+			error = ENOENT;
+			break;
+		}
+	}
+	if (bp != NULL) {
+		MPASS(!bplocked);
+		error = ERELOOKUP;
+	}
+	if (error != 0 && pvp != NULL) {
+		vput(pvp);
+		pvp = NULL;
+	}
+out1:
+	*rvp = pvp;
+out:
+	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return");
+	return (error);
+}
+
+/*
  * This function cleans the worklist for a filesystem.
  * Each filesystem running with soft dependencies gets its own
  * thread to run in this function. The thread is started up in
@@ -3096,47 +3246,206 @@ softdep_prealloc(vp, waitok)
 }
 
 /*
+ * Try hard to sync all data and metadata for the vnode, and workitems
+ * flushing which might conflict with the vnode lock.  This is a
+ * helper for softdep_prerename().
+ */
+static int
+softdep_prerename_vnode(ump, vp)
+	struct ufsmount *ump;
+	struct vnode *vp;
+{
+	int error;
+
+	ASSERT_VOP_ELOCKED(vp, "prehandle");
+	if (vp->v_data == NULL)
+		return (0);
+	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+	if (error != 0)
+		return (error);
+	ACQUIRE_LOCK(ump);
+	process_removes(vp);
+	process_truncates(vp);
+	FREE_LOCK(ump);
+	return (0);
+}
+
+/*
+ * Must be called from VOP_RENAME() after all vnodes are locked.
+ * Ensures that there is enough journal space for rename.  It is
+ * sufficiently different from softdep_prelink() by having to handle
+ * four vnodes.
+ */
+int
+softdep_prerename(fdvp, fvp, tdvp, tvp)
+	struct vnode *fdvp;
+	struct vnode *fvp;
+	struct vnode *tdvp;
+	struct vnode *tvp;
+{
+	struct ufsmount *ump;
+	int error;
+
+	ump = VFSTOUFS(fdvp->v_mount);
+
+	if (journal_space(ump, 0))
+		return (0);
+
+	VOP_UNLOCK(tdvp);
+	VOP_UNLOCK(fvp);
+	if (tvp != NULL && tvp != tdvp)
+		VOP_UNLOCK(tvp);
+
+	error = softdep_prerename_vnode(ump, fdvp);
+	VOP_UNLOCK(fdvp);
+	if (error != 0)
+		return (error);
+
+	VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY);
+	error = softdep_prerename_vnode(ump, fvp);
+	VOP_UNLOCK(fvp);
+	if (error != 0)
+		return (error);
+
+	if (tdvp != fdvp) {
+		VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY);
+		error = softdep_prerename_vnode(ump, tdvp);
+		VOP_UNLOCK(tdvp);
+		if (error != 0)
+			return (error);
+	}
+
+	if (tvp != fvp && tvp != NULL) {
+		VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY);
+		error = softdep_prerename_vnode(ump, tvp);
+		VOP_UNLOCK(tvp);
+		if (error != 0)
+			return (error);
+	}
+
+	ACQUIRE_LOCK(ump);
+	softdep_speedup(ump);
+	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
+	if (journal_space(ump, 0) == 0) {
+		softdep_speedup(ump);
+		if (journal_space(ump, 1) == 0)
+			journal_suspend(ump);
+	}
+	FREE_LOCK(ump);
+	return (ERELOOKUP);
+}
+
+/*
  * Before adjusting a link count on a vnode verify that we have sufficient
  * journal space.  If not, process operations that depend on the currently
  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
  * and softdep flush threads can not acquire these locks to reclaim space.
+ *
+ * Returns 0 if all owned locks are still valid and were not dropped
+ * in the process, in other case it returns either an error from sync,
+ * or ERELOOKUP if any of the locks were re-acquired.  In the later
+ * case, the state of the vnodes cannot be relied upon and our VFS
+ * syscall must be restarted at top level from the lookup.
  */
-static void
-softdep_prelink(dvp, vp)
+int
+softdep_prelink(dvp, vp, will_direnter)
 	struct vnode *dvp;
 	struct vnode *vp;
+	int will_direnter;
 {
 	struct ufsmount *ump;
+	int error, error1;
 
+	ASSERT_VOP_ELOCKED(dvp, "prelink dvp");
+	if (vp != NULL)
+		ASSERT_VOP_ELOCKED(vp, "prelink vp");
 	ump = VFSTOUFS(dvp->v_mount);
-	LOCK_OWNED(ump);
+
 	/*
 	 * Nothing to do if we have sufficient journal space.
 	 * If we currently hold the snapshot lock, we must avoid
 	 * handling other resources that could cause deadlock.
+	 *
+	 * will_direnter == 1: In case allocated a directory block in
+	 * an indirect block, we must prevent holes in the directory
+	 * created if directory entries are written out of order.  To
+	 * accomplish this we fsync when we extend a directory into
+	 * indirects.  During rename it's not safe to drop the tvp
+	 * lock so sync must be delayed until it is.
+	 *
+	 * This synchronous step could be removed if fsck and the
+	 * kernel were taught to fill in sparse directories rather
+	 * than panic.
 	 */
-	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
-		return;
+	if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp)))) {
+		error = 0;
+		if (will_direnter && (vp == NULL || !IS_SNAPSHOT(VTOI(vp)))) {
+			if (vp != NULL)
+				VOP_UNLOCK(vp);
+			error = ffs_syncvnode(dvp, MNT_WAIT, 0);
+			if (vp != NULL) {
+				error1 = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
+				if (error1 != 0) {
+					vn_lock_pair(dvp, true, vp, false);
+					if (error == 0)
+						error = ERELOOKUP;
+				} else if (vp->v_data == NULL) {
+					error = ERELOOKUP;
+				}
+			}
+		}
+		return (error);
+	}
+
 	stat_journal_low++;
-	FREE_LOCK(ump);
-	if (vp)
+	if (vp != NULL) {
+		VOP_UNLOCK(dvp);
 		ffs_syncvnode(vp, MNT_NOWAIT, 0);
+		vn_lock_pair(dvp, false, vp, true);
+		if (dvp->v_data == NULL)
+			return (ERELOOKUP);
+	}
+	if (vp != NULL)
+		VOP_UNLOCK(vp);
 	ffs_syncvnode(dvp, MNT_WAIT, 0);
-	ACQUIRE_LOCK(ump);
+	VOP_UNLOCK(dvp);
+
 	/* Process vp before dvp as it may create .. removes. */
-	if (vp) {
+	if (vp != NULL) {
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		if (vp->v_data == NULL) {
+			vn_lock_pair(dvp, false, vp, true);
+			return (ERELOOKUP);
+		}
+		ACQUIRE_LOCK(ump);
 		process_removes(vp);
 		process_truncates(vp);
+		FREE_LOCK(ump);
+		VOP_UNLOCK(vp);
 	}
+
+	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+	if (dvp->v_data == NULL) {
+		vn_lock_pair(dvp, true, vp, false);
+		return (ERELOOKUP);
+	}
+
+	ACQUIRE_LOCK(ump);
 	process_removes(dvp);
 	process_truncates(dvp);
+	VOP_UNLOCK(dvp);
 	softdep_speedup(ump);
+
 	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
 	if (journal_space(ump, 0) == 0) {
 		softdep_speedup(ump);
 		if (journal_space(ump, 1) == 0)
 			journal_suspend(ump);
 	}
+	FREE_LOCK(ump);
+
+	vn_lock_pair(dvp, false, vp, false);
+	return (ERELOOKUP);
 }
 
 static void
@@ -4742,7 +5051,6 @@ softdep_setup_create(dp, ip)
 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 		    ("softdep_setup_create: No addref structure present."));
 	}
-	softdep_prelink(dvp, NULL);
 	FREE_LOCK(ITOUMP(dp));
 }
 
@@ -4777,7 +5085,6 @@ softdep_setup_dotdot_link(dp, ip)
 	if (jaddref)
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 		    if_deps);
-	softdep_prelink(dvp, ITOV(ip));
 	FREE_LOCK(ITOUMP(dp));
 }
 
@@ -4808,7 +5115,6 @@ softdep_setup_link(dp, ip)
 	if (jaddref)
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 		    if_deps);
-	softdep_prelink(dvp, ITOV(ip));
 	FREE_LOCK(ITOUMP(dp));
 }
 
@@ -4858,7 +5164,6 @@ softdep_setup_mkdir(dp, ip)
 	if (DOINGSUJ(dvp))
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
 		    &dotdotaddref->ja_ref, if_deps);
-	softdep_prelink(ITOV(dp), NULL);
 	FREE_LOCK(ITOUMP(dp));
 }
 
@@ -4879,7 +5184,6 @@ softdep_setup_rmdir(dp, ip)
 	ACQUIRE_LOCK(ITOUMP(dp));
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
-	softdep_prelink(dvp, ITOV(ip));
 	FREE_LOCK(ITOUMP(dp));
 }
 
@@ -4900,7 +5204,6 @@ softdep_setup_unlink(dp, ip)
 	ACQUIRE_LOCK(ITOUMP(dp));
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
-	softdep_prelink(dvp, ITOV(ip));
 	FREE_LOCK(ITOUMP(dp));
 }
 
@@ -12622,25 +12925,12 @@ restart:
 		 * for details on possible races.
 		 */
 		FREE_LOCK(ump);
-		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
-		    FFSV_FORCEINSMQ)) {
-			/*
-			 * Unmount cannot proceed after unlock because
-			 * caller must have called vn_start_write().
-			 */
-			VOP_UNLOCK(vp);
-			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
-			    &pvp, FFSV_FORCEINSMQ);
-			MPASS(VTOI(pvp)->i_mode != 0);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-			if (VN_IS_DOOMED(vp)) {
-				if (error == 0)
-					vput(pvp);
-				error = ENOENT;
-			}
-			if (error != 0)
-				return (error);
-		}
+		error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL,
+		    &pvp);
+		if (error == ERELOOKUP)
+			error = 0;
+		if (error != 0)
+			return (error);
 		/*
 		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
 		 * that are contained in direct blocks will be resolved by 
@@ -12964,9 +13254,11 @@ top:
 			for (i = 0; i < DAHASHSZ; i++) {
 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
 					continue;
-				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
-				    &pagedep->pd_diraddhd[i]))) {
-					BUF_NOREC(bp);
+				error = flush_pagedep_deps(vp, wk->wk_mp,
+				    &pagedep->pd_diraddhd[i], bp);
+				if (error != 0) {
+					if (error != ERELOOKUP)
+						BUF_NOREC(bp);
 					goto out_unlock;
 				}
 			}
@@ -13200,10 +13492,11 @@ flush_newblk_dep(vp, mp, lbn)
  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
  */
 static int
-flush_pagedep_deps(pvp, mp, diraddhdp)
+flush_pagedep_deps(pvp, mp, diraddhdp, locked_bp)
 	struct vnode *pvp;
 	struct mount *mp;
 	struct diraddhd *diraddhdp;
+	struct buf *locked_bp;
 {
 	struct inodedep *inodedep;
 	struct inoref *inoref;
@@ -13270,10 +13563,10 @@ restart:
 		}
 		if (dap->da_state & MKDIR_BODY) {
 			FREE_LOCK(ump);
-			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
-			    FFSV_FORCEINSMQ)))
+			error = get_parent_vp(pvp, mp, inum, locked_bp,
+			    diraddhdp, &unfinished, &vp);
+			if (error != 0)
 				break;
-			MPASS(VTOI(vp)->i_mode != 0);
 			error = flush_newblk_dep(vp, mp, 0);
 			/*
 			 * If we still have the dependency we might need to
@@ -13335,10 +13628,10 @@ retry:
 		 */
 		if (dap == LIST_FIRST(diraddhdp)) {
 			FREE_LOCK(ump);
-			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
-			    FFSV_FORCEINSMQ)))
+			error = get_parent_vp(pvp, mp, inum, locked_bp,
+			    diraddhdp, &unfinished, &vp);
+			if (error != 0)
 				break;
-			MPASS(VTOI(vp)->i_mode != 0);
 			error = ffs_update(vp, 1);
 			vput(vp);
 			if (error)

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ffs/ffs_vfsops.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -1861,8 +1861,14 @@ loop:
 #ifdef QUOTA
 		qsyncvp(vp);
 #endif
-		if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0)
-			allerror = error;
+		for (;;) {
+			error = ffs_syncvnode(vp, waitfor, 0);
+			if (error == ERELOOKUP)
+				continue;
+			if (error != 0)
+				allerror = error;
+			break;
+		}
 		vput(vp);
 	}
 	/*

Modified: head/sys/ufs/ffs/ffs_vnops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vnops.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ffs/ffs_vnops.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -253,7 +253,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags
 	struct buf *bp, *nbp;
 	ufs_lbn_t lbn;
 	int error, passes;
-	bool still_dirty, wait;
+	bool still_dirty, unlocked, wait;
 
 	ip = VTOI(vp);
 	ip->i_flag &= ~IN_NEEDSYNC;
@@ -277,6 +277,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor, int flags
 	error = 0;
 	passes = 0;
 	wait = false;	/* Always do an async pass first. */
+	unlocked = false;
 	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
 	BO_LOCK(bo);
 loop:
@@ -325,6 +326,26 @@ loop:
 		if (!LIST_EMPTY(&bp->b_dep) &&
 		    (error = softdep_sync_buf(vp, bp,
 		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
+			/*
+			 * Lock order conflict, buffer was already unlocked,
+			 * and vnode possibly unlocked.
+			 */
+			if (error == ERELOOKUP) {
+				if (vp->v_data == NULL)
+					return (EBADF);
+				unlocked = true;
+				if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
+				    (error = softdep_sync_metadata(vp)) != 0) {
+					if (ffs_fsfail_cleanup(ump, error))
+						error = 0;
+					return (unlocked && error == 0 ?
+					    ERELOOKUP : error);
+				}
+				/* Re-evaluate inode size */
+				lbn = lblkno(ITOFS(ip), (ip->i_size +
+				    ITOFS(ip)->fs_bsize - 1));
+				goto next;
+			}
 			/* I/O error. */
 			if (error != EBUSY) {
 				BUF_UNLOCK(bp);
@@ -361,9 +382,11 @@ next:
 	if (waitfor != MNT_WAIT) {
 		BO_UNLOCK(bo);
 		if ((flags & NO_INO_UPDT) != 0)
-			return (0);
-		else
-			return (ffs_update(vp, 0));
+			return (unlocked ? ERELOOKUP : 0);
+		error = ffs_update(vp, 0);
+		if (error == 0 && unlocked)
+			error = ERELOOKUP;
+		return (error);
 	}
 	/* Drain IO to see if we're done. */
 	bufobj_wwait(bo, 0, 0);
@@ -419,6 +442,8 @@ next:
 	} else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
 		error = ffs_update(vp, 1);
 	}
+	if (error == 0 && unlocked)
+		error = ERELOOKUP;
 	return (error);
 }
 

Modified: head/sys/ufs/ufs/ufs_inode.c
==============================================================================
--- head/sys/ufs/ufs/ufs_inode.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ufs/ufs_inode.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -166,7 +166,8 @@ ufs_inactive(ap)
 		isize += ip->i_din2->di_extsize;
 	if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip))
 		error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED);
-	if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) {
+	if (ip->i_nlink <= 0 && ip->i_mode != 0 && !UFS_RDONLY(ip) &&
+	    (vp->v_iflag & VI_OWEINACT) == 0) {
 #ifdef QUOTA
 		if (!getinoquota(ip))
 			(void)chkiq(ip, -1, NOCRED, FORCE);
@@ -207,10 +208,12 @@ out:
 	 * If we are done with the inode, reclaim it
 	 * so that it can be reused immediately.
 	 */
-	if (ip->i_mode == 0)
+	if (ip->i_mode == 0 && (vp->v_iflag & VI_OWEINACT) == 0)
 		vrecycle(vp);
 	if (mp != NULL)
 		vn_finished_secondary_write(mp);
+	if (error == ERELOOKUP)
+		error = 0;
 	return (error);
 }
 

Modified: head/sys/ufs/ufs/ufs_lookup.c
==============================================================================
--- head/sys/ufs/ufs/ufs_lookup.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ufs/ufs_lookup.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -961,27 +961,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename)
 			bdwrite(bp);
 			if ((dp->i_flag & IN_NEEDSYNC) == 0)
 				return (UFS_UPDATE(dvp, 0));
-			/*
-			 * We have just allocated a directory block in an
-			 * indirect block.  We must prevent holes in the
-			 * directory created if directory entries are
-			 * written out of order.  To accomplish this we
-			 * fsync when we extend a directory into indirects.
-			 * During rename it's not safe to drop the tvp lock
-			 * so sync must be delayed until it is.
-			 *
-			 * This synchronous step could be removed if fsck and
-			 * the kernel were taught to fill in sparse
-			 * directories rather than panic.
-			 */
-			if (isrename)
-				return (0);
-			if (tvp != NULL)
-				VOP_UNLOCK(tvp);
-			(void) VOP_FSYNC(dvp, MNT_WAIT, td);
-			if (tvp != NULL)
-				vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
-			return (error);
+			return (0);
 		}
 		if (DOINGASYNC(dvp)) {
 			bdwrite(bp);

Modified: head/sys/ufs/ufs/ufs_vnops.c
==============================================================================
--- head/sys/ufs/ufs/ufs_vnops.c	Sat Nov 14 05:19:59 2020	(r367671)
+++ head/sys/ufs/ufs/ufs_vnops.c	Sat Nov 14 05:30:10 2020	(r367672)
@@ -1006,10 +1006,16 @@ ufs_remove(ap)
 	td = curthread;
 	ip = VTOI(vp);
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
-	    (VTOI(dvp)->i_flags & APPEND)) {
-		error = EPERM;
-		goto out;
+	    (VTOI(dvp)->i_flags & APPEND))
+		return (EPERM);
+	if (DOINGSOFTDEP(dvp)) {
+		error = softdep_prelink(dvp, vp, true);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
 	}
+
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
@@ -1030,7 +1036,6 @@ ufs_remove(ap)
 		(void) VOP_FSYNC(dvp, MNT_WAIT, td);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
-out:
 	return (error);
 }
 
@@ -1067,6 +1072,15 @@ ufs_link(ap)
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_link: no name");
 #endif
+
+	if (DOINGSOFTDEP(tdvp)) {
+		error = softdep_prelink(tdvp, vp, true);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
+	}
+
 	if (VTOI(tdvp)->i_effnlink < 2) {
 		print_bad_link_count("ufs_link", tdvp);
 		error = EINVAL;
@@ -1089,6 +1103,7 @@ ufs_link(ap)
 		error = EPERM;
 		goto out;
 	}
+
 	ip->i_effnlink++;
 	ip->i_nlink++;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
@@ -1129,6 +1144,15 @@ ufs_whiteout(ap)
 	struct direct newdir;
 	int error = 0;
 
+	if (DOINGSOFTDEP(dvp) && (ap->a_flags == CREATE ||
+	    ap->a_flags == DELETE)) {
+		error = softdep_prelink(dvp, NULL, true);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
+	}
+
 	switch (ap->a_flags) {
 	case LOOKUP:
 		/* 4.4 format directories support whiteout operations */
@@ -1338,6 +1362,18 @@ relock:
 			goto relock;
 		}
 	}
+
+	if (DOINGSOFTDEP(fdvp)) {
+		error = softdep_prerename(fdvp, fvp, tdvp, tvp);
+		if (error != 0) {
+			if (error == ERELOOKUP) {
+				atomic_add_int(&rename_restarts, 1);
+				goto relock;
+			}
+			goto releout;
+		}
+	}
+
 	fdp = VTOI(fdvp);
 	fip = VTOI(fvp);
 	tdp = VTOI(tdvp);
@@ -1649,8 +1685,10 @@ unlockout:
 	 * are no longer needed.
 	 */
 	if (error == 0 && endoff != 0) {
-		error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
-		    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
+		do {
+			error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
+			    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
+		} while (error == ERELOOKUP);
 		if (error != 0 && !ffs_fsfail_cleanup(VFSTOUFS(mp), error))
 			vn_printf(tdvp,
 			    "ufs_rename: failed to truncate, error %d\n",
@@ -1668,8 +1706,11 @@ unlockout:
 		 */
 		error = 0;
 	}
-	if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
-		error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+	if (error == 0 && tdp->i_flag & IN_NEEDSYNC) {
+		do {
+			error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+		} while (error == ERELOOKUP);
+	}
 	vput(tdvp);
 	return (error);
 
@@ -1918,6 +1959,7 @@ ufs_mkdir(ap)
 	}
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
+
 	/*
 	 * Must simulate part of ufs_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
@@ -1928,6 +1970,15 @@ ufs_mkdir(ap)
 		error = EINVAL;
 		goto out;
 	}
+
+	if (DOINGSOFTDEP(dvp)) {
+		error = softdep_prelink(dvp, NULL, true);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
+	}
+
 	error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
@@ -2184,6 +2235,14 @@ ufs_rmdir(ap)
 		error = EINVAL;
 		goto out;
 	}
+	if (DOINGSOFTDEP(dvp)) {
+		error = softdep_prelink(dvp, vp, false);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
+	}
+
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
@@ -2702,6 +2761,13 @@ ufs_makeinode(mode, dvp, vpp, cnp, callfunc)
 	if (pdir->i_effnlink < 2) {
 		print_bad_link_count(callfunc, dvp);
 		return (EINVAL);
+	}
+	if (DOINGSOFTDEP(dvp)) {
+		error = softdep_prelink(dvp, NULL, true);
+		if (error != 0) {
+			MPASS(error == ERELOOKUP);
+			return (error);
+		}
 	}
 	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
 	if (error)


More information about the svn-src-all mailing list