git: d28af1abf031 - main - vm: Add a mode to vm_object_page_remove() which skips invalid pages

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Mon, 15 Nov 2021 18:03:01 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=d28af1abf031ee87a478b37180e3f0c518caedf6

commit d28af1abf031ee87a478b37180e3f0c518caedf6
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2021-11-15 16:44:04 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2021-11-15 18:01:30 +0000

    vm: Add a mode to vm_object_page_remove() which skips invalid pages
    
    This will be used to break a deadlock in ZFS between the per-mountpoint
    teardown lock and page busy locks.  In particular, when purging data
    from the page cache during dataset rollback, we want to avoid blocking
    on the busy state of invalid pages since the busying thread may be
    blocked on the teardown lock in zfs_getpages().
    
    Add a helper, vn_pages_remove_valid(), for use by filesystems.  Bump
    __FreeBSD_version so that the OpenZFS port can make use of the new
    helper.
    
    PR:             258208
    Reviewed by:    avg, kib, sef
    Tested by:      pho (part of a larger patch)
    MFC after:      2 weeks
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D32931
---
 sys/kern/vfs_vnops.c | 22 ++++++++++++++++++++++
 sys/sys/vnode.h      |  2 ++
 sys/vm/vm_object.c   | 19 +++++++++++++++++++
 sys/vm/vm_object.h   |  1 +
 4 files changed, 44 insertions(+)

diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 79d422aacfef..66fcbf80bb3a 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -2429,6 +2429,10 @@ vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 	return (setfown(td, active_cred, vp, uid, gid));
 }
 
+/*
+ * Remove pages in the range ["start", "end") from the vnode's VM object.  If
+ * "end" is 0, then the range extends to the end of the object.
+ */
 void
 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
@@ -2441,6 +2445,24 @@ vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 	VM_OBJECT_WUNLOCK(object);
 }
 
+/*
+ * Like vn_pages_remove(), but skips invalid pages, which by definition are not
+ * mapped into any process' address space.  Filesystems may use this in
+ * preference to vn_pages_remove() to avoid blocking on pages busied in
+ * preparation for a VOP_GETPAGES.
+ */
+void
+vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
+{
+	vm_object_t object;
+
+	if ((object = vp->v_object) == NULL)
+		return;
+	VM_OBJECT_WLOCK(object);
+	vm_object_page_remove(object, start, end, OBJPR_VALIDONLY);
+	VM_OBJECT_WUNLOCK(object);
+}
+
 int
 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
     struct ucred *cred)
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 6bffd7656c62..1a202abfd4dd 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -770,6 +770,8 @@ int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
+void	vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start,
+	    vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 6c4df272f739..c465a2cf76dd 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -2099,6 +2099,21 @@ again:
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
 
+		/*
+		 * Skip invalid pages if asked to do so.  Try to avoid acquiring
+		 * the busy lock, as some consumers rely on this to avoid
+		 * deadlocks.
+		 *
+		 * A thread may concurrently transition the page from invalid to
+		 * valid using only the busy lock, so the result of this check
+		 * is immediately stale.  It is up to consumers to handle this,
+		 * for instance by ensuring that all invalid->valid transitions
+		 * happen with a mutex held, as may be possible for a
+		 * filesystem.
+		 */
+		if ((options & OBJPR_VALIDONLY) != 0 && vm_page_none_valid(p))
+			continue;
+
 		/*
 		 * If the page is wired for any reason besides the existence
 		 * of managed, wired mappings, then it cannot be freed.  For
@@ -2112,6 +2127,10 @@ again:
 				VM_OBJECT_WLOCK(object);
 			goto again;
 		}
+		if ((options & OBJPR_VALIDONLY) != 0 && vm_page_none_valid(p)) {
+			vm_page_xunbusy(p);
+			continue;
+		}
 		if (vm_page_wired(p)) {
 wired:
 			if ((options & OBJPR_NOTMAPPED) == 0 &&
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 1bf4cee856c7..3b280228de13 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -232,6 +232,7 @@ struct vm_object {
  */
 #define	OBJPR_CLEANONLY	0x1		/* Don't remove dirty pages. */
 #define	OBJPR_NOTMAPPED	0x2		/* Don't unmap pages. */
+#define	OBJPR_VALIDONLY	0x4		/* Ignore invalid pages. */
 
 TAILQ_HEAD(object_q, vm_object);