git: 3d721de049be - stable/13 - Fix NFS exports of FUSE file systems for big directories

From: Alan Somers <asomers_at_FreeBSD.org>
Date: Wed, 02 Mar 2022 23:35:50 UTC
The branch stable/13 has been updated by asomers:

URL: https://cgit.FreeBSD.org/src/commit/?id=3d721de049be0f073306c8109499fbbbe945c3b3

commit 3d721de049be0f073306c8109499fbbbe945c3b3
Author:     Alan Somers <asomers@FreeBSD.org>
AuthorDate: 2022-01-02 17:18:47 +0000
Commit:     Alan Somers <asomers@FreeBSD.org>
CommitDate: 2022-03-02 23:35:33 +0000

    Fix NFS exports of FUSE file systems for big directories
    
    The FUSE protocol does not require that a directory entry's d_off field
    outlive the lifetime of its directory's file handle.  Since the NFS
    server must reopen the directory on every VOP_READDIR call, that means
    it can't pass uio->uio_offset down to the FUSE server.  Instead, it must
    read the directory from 0 each time.  It may need to issue multiple
    FUSE_READDIR operations until it finds the d_off field that it's looking
    for.  That was the intention behind SVN r348209 and r297887, but a logic
    bug prevented subsequent FUSE_READDIR operations from ever being issued,
    rendering large directories incompletely browseable.
    
    Reviewed by:    rmacklem
    
    (cherry picked from commit d088dc76e1a62ecb6c05bd2b14ee48a9f9a7e2bd)
    
    fusefs: optimize NFS readdir for FUSE_NO_OPENDIR_SUPPORT
    
    In its lowest common denominator, FUSE does not require that a directory
    entry's d_off field is valid outside of the lifetime of the directory's
    FUSE file handle.  But since NFS is stateless, it must reopen the
    directory on every call to VOP_READDIR.  That means reading the
    directory all the way from the first entry.  Not only does this create
    an O(n^2) condition for large directories, but it can also result in
    incorrect behavior if either:
    
    * The file system _does_ change the d_off field for the last directory
      entry previously seen by NFS, or
    * The file system deletes the last directory entry previously seen by
      NFS.
    
    Handily, for file systems that set FUSE_NO_OPENDIR_SUPPORT d_off is
    guaranteed to be valid for the lifetime of the directory entry, there is
    no need to read the directory from the start.
    
    Reviewed by:    rmacklem
    
    (cherry picked from commit 4a6526d84a56f398732bff491e63aa42f796a27d)
    
    fusefs: require FUSE_NO_OPENDIR_SUPPORT for NFS exporting
    
    FUSE file systems that do not set FUSE_NO_OPENDIR_SUPPORT do not
    guarantee that d_off will be valid after closing and reopening a
    directory.  That conflicts with NFS's statelessness, that results in
    unresolvable bugs when NFS reads large directories, if:
    
    * The file system _does_ change the d_off field for the last directory
      entry previously returned by VOP_READDIR, or
    * The file system deletes the last directory entry previously seen by
      NFS.
    
    Rather than doing a poor job of exporting such file systems, it's better
    just to refuse.
    
    Even though this is technically a breaking change, 13.0-RELEASE's
    NFS-FUSE support was bad enough that an MFC should be allowed.
    
    Reviewed by:    rmacklem
    Differential Revision: https://reviews.freebsd.org/D33726
    
    (cherry picked from commit 00134a07898fa807b8a1fcb2596f0e3644143f69)
    
    fusefs: fix the build without INVARIANTS after 00134a07898
    
    MFC with:       00134a07898fa807b8a1fcb2596f0e3644143f69
    Reported by:    se
    
    (cherry picked from commit 18ed2ce77a254f365a4687d6afe8eeba2aad2e13)
---
 sys/fs/fuse/fuse_internal.c | 82 +++++++++++++++------------------------------
 sys/fs/fuse/fuse_internal.h | 10 +++---
 sys/fs/fuse/fuse_vnops.c    | 47 +++++++++++++++++---------
 3 files changed, 63 insertions(+), 76 deletions(-)

diff --git a/sys/fs/fuse/fuse_internal.c b/sys/fs/fuse/fuse_internal.c
index 9dac4f7a17be..8862a09d2e6c 100644
--- a/sys/fs/fuse/fuse_internal.c
+++ b/sys/fs/fuse/fuse_internal.c
@@ -554,7 +554,6 @@ fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
 int
 fuse_internal_readdir(struct vnode *vp,
     struct uio *uio,
-    off_t startoff,
     struct fuse_filehandle *fufh,
     struct fuse_iov *cookediov,
     int *ncookies,
@@ -563,7 +562,6 @@ fuse_internal_readdir(struct vnode *vp,
 	int err = 0;
 	struct fuse_dispatcher fdi;
 	struct fuse_read_in *fri = NULL;
-	int fnd_start;
 
 	if (uio_resid(uio) == 0)
 		return 0;
@@ -573,25 +571,9 @@ fuse_internal_readdir(struct vnode *vp,
 	 * Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p
 	 * I/O).
 	 */
-
-	/*
-	 * fnd_start is set non-zero once the offset in the directory gets
-	 * to the startoff.  This is done because directories must be read
-	 * from the beginning (offset == 0) when fuse_vnop_readdir() needs
-	 * to do an open of the directory.
-	 * If it is not set non-zero here, it will be set non-zero in
-	 * fuse_internal_readdir_processdata() when uio_offset == startoff.
-	 */
-	fnd_start = 0;
-	if (uio->uio_offset == startoff)
-		fnd_start = 1;
 	while (uio_resid(uio) > 0) {
 		fdi.iosize = sizeof(*fri);
-		if (fri == NULL)
-			fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
-		else
-			fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
-
+		fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
 		fri = fdi.indata;
 		fri->fh = fufh->fh_id;
 		fri->offset = uio_offset(uio);
@@ -600,9 +582,8 @@ fuse_internal_readdir(struct vnode *vp,
 
 		if ((err = fdisp_wait_answ(&fdi)))
 			break;
-		if ((err = fuse_internal_readdir_processdata(uio, startoff,
-		    &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov,
-		    ncookies, &cookies)))
+		if ((err = fuse_internal_readdir_processdata(uio, fri->size,
+			fdi.answ, fdi.iosize, cookediov, ncookies, &cookies)))
 			break;
 	}
 
@@ -617,8 +598,6 @@ fuse_internal_readdir(struct vnode *vp,
  */
 int
 fuse_internal_readdir_processdata(struct uio *uio,
-    off_t startoff,
-    int *fnd_start,
     size_t reqsize,
     void *buf,
     size_t bufsize,
@@ -672,39 +651,32 @@ fuse_internal_readdir_processdata(struct uio *uio,
 			err = -1;
 			break;
 		}
-		/*
-		 * Don't start to copy the directory entries out until
-		 * the requested offset in the directory is found.
-		 */
-		if (*fnd_start != 0) {
-			fiov_adjust(cookediov, oreclen);
-			bzero(cookediov->base, oreclen);
-
-			de = (struct dirent *)cookediov->base;
-			de->d_fileno = fudge->ino;
-			de->d_off = fudge->off;
-			de->d_reclen = oreclen;
-			de->d_type = fudge->type;
-			de->d_namlen = fudge->namelen;
-			memcpy((char *)cookediov->base + sizeof(struct dirent) -
-			       MAXNAMLEN - 1,
-			       (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
-			dirent_terminate(de);
-
-			err = uiomove(cookediov->base, cookediov->len, uio);
-			if (err)
+		fiov_adjust(cookediov, oreclen);
+		bzero(cookediov->base, oreclen);
+
+		de = (struct dirent *)cookediov->base;
+		de->d_fileno = fudge->ino;
+		de->d_off = fudge->off;
+		de->d_reclen = oreclen;
+		de->d_type = fudge->type;
+		de->d_namlen = fudge->namelen;
+		memcpy((char *)cookediov->base + sizeof(struct dirent) -
+		       MAXNAMLEN - 1,
+		       (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
+		dirent_terminate(de);
+
+		err = uiomove(cookediov->base, cookediov->len, uio);
+		if (err)
+			break;
+		if (cookies != NULL) {
+			if (*ncookies == 0) {
+				err = -1;
 				break;
-			if (cookies != NULL) {
-				if (*ncookies == 0) {
-					err = -1;
-					break;
-				}
-				*cookies = fudge->off;
-				cookies++;
-				(*ncookies)--;
 			}
-		} else if (startoff == fudge->off)
-			*fnd_start = 1;
+			*cookies = fudge->off;
+			cookies++;
+			(*ncookies)--;
+		}
 		buf = (char *)buf + freclen;
 		bufsize -= freclen;
 		uio_setoffset(uio, fudge->off);
diff --git a/sys/fs/fuse/fuse_internal.h b/sys/fs/fuse/fuse_internal.h
index e9fa3857227a..7d773b02fd8b 100644
--- a/sys/fs/fuse/fuse_internal.h
+++ b/sys/fs/fuse/fuse_internal.h
@@ -250,12 +250,12 @@ int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
 struct pseudo_dirent {
 	uint32_t d_namlen;
 };
-int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff,
+int fuse_internal_readdir(struct vnode *vp, struct uio *uio,
     struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies,
-    u_long *cookies);
-int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff,
-    int *fnd_start, size_t reqsize, void *buf, size_t bufsize,
-    struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp);
+    uint64_t *cookies);
+int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize,
+    void *buf, size_t bufsize, struct fuse_iov *cookediov, int *ncookies,
+    u_long **cookiesp);
 
 /* remove */
 
diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c
index da1c5ec3cbce..7a3417749b72 100644
--- a/sys/fs/fuse/fuse_vnops.c
+++ b/sys/fs/fuse/fuse_vnops.c
@@ -1847,10 +1847,10 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 	struct fuse_filehandle *fufh = NULL;
+	struct mount *mp = vnode_mount(vp);
 	struct fuse_iov cookediov;
 	int err = 0;
 	u_long *cookies;
-	off_t startoff;
 	ssize_t tresid;
 	int ncookies;
 	bool closefufh = false;
@@ -1867,24 +1867,17 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
 	}
 
 	tresid = uio->uio_resid;
-	startoff = uio->uio_offset;
 	err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
-	if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
+	if (err == EBADF && mp->mnt_flag & MNT_EXPORTED) {
+		KASSERT(fuse_get_mpdata(mp)->dataflags
+				& FSESS_NO_OPENDIR_SUPPORT,
+			("FUSE file systems that don't set "
+			 "FUSE_NO_OPENDIR_SUPPORT should not be exported"));
 		/* 
 		 * nfsd will do VOP_READDIR without first doing VOP_OPEN.  We
-		 * must implicitly open the directory here
+		 * must implicitly open the directory here.
 		 */
 		err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
-		if (err == 0) {
-			/*
-			 * When a directory is opened, it must be read from
-			 * the beginning.  Hopefully, the "startoff" still
-			 * exists as an offset cookie for the directory.
-			 * If not, it will read the entire directory without
-			 * returning any entries and just return eof.
-			 */
-			uio->uio_offset = 0;
-		}
 		closefufh = true;
 	}
 	if (err)
@@ -1902,7 +1895,7 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
 #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
 	fiov_init(&cookediov, DIRCOOKEDSIZE);
 
-	err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
+	err = fuse_internal_readdir(vp, uio, fufh, &cookediov,
 		&ncookies, cookies);
 
 	fiov_teardown(&cookediov);
@@ -2999,8 +2992,30 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap)
 	struct vattr va;
 	int err;
 
-	if (!(data->dataflags & FSESS_EXPORT_SUPPORT))
+	if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
+		/* NFS requires lookups for "." and ".." */
+		SDT_PROBE2(fusefs, , vnops, trace, 1,
+			"VOP_VPTOFH without FUSE_EXPORT_SUPPORT");
 		return EOPNOTSUPP;
+	}
+	if ((mp->mnt_flag & MNT_EXPORTED) &&
+		!(data->dataflags & FSESS_NO_OPENDIR_SUPPORT))
+	{
+		/*
+		 * NFS is stateless, so nfsd must reopen a directory on every
+		 * call to VOP_READDIR, passing in the d_off field from the
+		 * final dirent of the previous invocation.  But without
+		 * FUSE_NO_OPENDIR_SUPPORT, the FUSE protocol does not
+		 * guarantee that d_off will be valid after a directory is
+		 * closed and reopened.  So prohibit exporting FUSE file
+		 * systems that don't set that flag.
+		 *
+		 * But userspace NFS servers don't have this problem.
+                 */
+		SDT_PROBE2(fusefs, , vnops, trace, 1,
+			"VOP_VPTOFH without FUSE_NO_OPENDIR_SUPPORT");
+		return EOPNOTSUPP;
+	}
 
 	err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
 	if (err)