git: 5832d5f03e0f - stable/13 - vfs_export: Add mnt_exjail to control exports done in prisons

From: Rick Macklem <rmacklem_at_FreeBSD.org>
Date: Sun, 21 May 2023 20:03:04 UTC
The branch stable/13 has been updated by rmacklem:

URL: https://cgit.FreeBSD.org/src/commit/?id=5832d5f03e0f2626818a9f39df63840289fdfa88

commit 5832d5f03e0f2626818a9f39df63840289fdfa88
Author:     Rick Macklem <rmacklem@FreeBSD.org>
AuthorDate: 2023-02-21 21:00:42 +0000
Commit:     Rick Macklem <rmacklem@FreeBSD.org>
CommitDate: 2023-05-21 19:58:14 +0000

    vfs_export: Add mnt_exjail to control exports done in prisons
    
    If there are multiple instances of mountd(8) (in different
    prisons), there will be confusion if they manipulate the
    exports of the same file system.  This patch adds mnt_exjail
    to "struct mount" so that the credentials (and, therefore,
    the prison) that did the exports for that file system can
    be recorded.  If another prison has already exported the
    file system, vfs_export() will fail with an error.
    If mnt_exjail == NULL, the file system has not been exported.
    mnt_exjail is checked by the NFS server, so that exports done
    from within a different prison will not be used.
    
    The patch also implements vfs_exjail_delete(), which is
    called from prison_cleanup() to release all the mnt_exjail
    credential references, so that the prison can be removed.
    Mainly to avoid doing a scan of the mountlist for the case
    where there were no exports done from within the prison,
    a count of how many file systems have been exported from
    within the prison is kept in pr_exportcnt.
    
    Changing the new argument for vfs_export() to "int" and
    moving the prototype for vfs_exjail_delete() to jail.h
    were both necessary to allow libprocstat to build.
    
    (cherry picked from commit 88175af8b75ea8850757cc9dca68b6d336b82675)
---
 sys/fs/nfsserver/nfs_nfsdport.c |  24 ++++--
 sys/kern/kern_jail.c            |   1 +
 sys/kern/vfs_export.c           | 164 +++++++++++++++++++++++++++++++++++++++-
 sys/kern/vfs_mount.c            |   9 ++-
 sys/sys/jail.h                  |   4 +-
 sys/sys/mount.h                 |   4 +-
 6 files changed, 192 insertions(+), 14 deletions(-)

diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 4247177a3fc0..e75c238b7117 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -3270,8 +3270,16 @@ nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
 {
 	int error;
 
-	error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
-	    &exp->nes_numsecflavor, exp->nes_secflavors);
+	error = 0;
+	*credp = NULL;
+	MNT_ILOCK(mp);
+	if (mp->mnt_exjail == NULL ||
+	    mp->mnt_exjail->cr_prison != curthread->td_ucred->cr_prison)
+		error = EACCES;
+	MNT_IUNLOCK(mp);
+	if (error == 0)
+		error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
+		    &exp->nes_numsecflavor, exp->nes_secflavors);
 	if (error) {
 		if (NFSD_VNET(nfs_rootfhset)) {
 			exp->nes_exflag = 0;
@@ -3305,8 +3313,14 @@ nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
 		/* Make sure the server replies ESTALE to the client. */
 		error = ESTALE;
 	if (nam && !error) {
-		error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
-		    &exp->nes_numsecflavor, exp->nes_secflavors);
+		MNT_ILOCK(mp);
+		if (mp->mnt_exjail == NULL ||
+		    mp->mnt_exjail->cr_prison != curthread->td_ucred->cr_prison)
+			error = EACCES;
+		MNT_IUNLOCK(mp);
+		if (error == 0)
+			error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
+			    &exp->nes_numsecflavor, exp->nes_secflavors);
 		if (error) {
 			if (NFSD_VNET(nfs_rootfhset)) {
 				exp->nes_exflag = 0;
@@ -3476,7 +3490,7 @@ nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
 	struct nameidata nd;
 	fhandle_t fh;
 
-	error = vfs_export(NFSD_VNET(nfsv4root_mnt), &nfsexargp->export);
+	error = vfs_export(NFSD_VNET(nfsv4root_mnt), &nfsexargp->export, 0);
 	if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
 		NFSD_VNET(nfs_rootfhset) = 0;
 	else if (error == 0) {
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 622b9f6c7cb9..0203dcd0faf1 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3035,6 +3035,7 @@ prison_cleanup(struct prison *pr)
 {
 	sx_assert(&allprison_lock, SA_XLOCKED);
 	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
+	vfs_exjail_delete(pr);
 	shm_remove_prison(pr);
 	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 }
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index cab37ce205ad..e9676799062f 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
+#include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/refcount.h>
 #include <sys/signalvar.h>
@@ -296,12 +297,18 @@ vfs_free_addrlist(struct netexport *nep)
  * and the passed in netexport.
  * Struct export_args *argp is the variable used to twiddle options,
  * the structure is described in sys/mount.h
+ * The do_exjail argument should be true if *mp is in the mountlist
+ * and false if not.  It is not in the mountlist for the NFSv4 rootfs
+ * fake mount point just used for exports.
  */
 int
-vfs_export(struct mount *mp, struct export_args *argp)
+vfs_export(struct mount *mp, struct export_args *argp, int do_exjail)
 {
 	struct netexport *nep;
+	struct ucred *cr;
+	struct prison *pr;
 	int error;
+	bool new_nep;
 
 	if ((argp->ex_flags & (MNT_DELEXPORT | MNT_EXPORTED)) == 0)
 		return (EINVAL);
@@ -312,6 +319,7 @@ vfs_export(struct mount *mp, struct export_args *argp)
 		return (EINVAL);
 
 	error = 0;
+	pr = curthread->td_ucred->cr_prison;
 	lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL);
 	nep = mp->mnt_export;
 	if (argp->ex_flags & MNT_DELEXPORT) {
@@ -319,6 +327,21 @@ vfs_export(struct mount *mp, struct export_args *argp)
 			error = ENOENT;
 			goto out;
 		}
+		MNT_ILOCK(mp);
+		if (mp->mnt_exjail != NULL && mp->mnt_exjail->cr_prison != pr &&
+		    pr == &prison0) {
+			MNT_IUNLOCK(mp);
+			/* EXDEV will not get logged by mountd(8). */
+			error = EXDEV;
+			goto out;
+		} else if (mp->mnt_exjail != NULL &&
+		    mp->mnt_exjail->cr_prison != pr) {
+			MNT_IUNLOCK(mp);
+			/* EPERM will get logged by mountd(8). */
+			error = EPERM;
+			goto out;
+		}
+		MNT_IUNLOCK(mp);
 		if (mp->mnt_flag & MNT_EXPUBLIC) {
 			vfs_setpublicfs(NULL, NULL, NULL);
 			MNT_ILOCK(mp);
@@ -330,18 +353,51 @@ vfs_export(struct mount *mp, struct export_args *argp)
 		free(nep, M_MOUNT);
 		nep = NULL;
 		MNT_ILOCK(mp);
+		cr = mp->mnt_exjail;
+		mp->mnt_exjail = NULL;
 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
 		MNT_IUNLOCK(mp);
+		if (cr != NULL) {
+			atomic_subtract_int(&pr->pr_exportcnt, 1);
+			crfree(cr);
+		}
 	}
 	if (argp->ex_flags & MNT_EXPORTED) {
+		new_nep = false;
+		MNT_ILOCK(mp);
+		if (mp->mnt_exjail == NULL) {
+			MNT_IUNLOCK(mp);
+			if (do_exjail && nep != NULL) {
+				vfs_free_addrlist(nep);
+				memset(nep, 0, sizeof(*nep));
+				new_nep = true;
+			}
+		} else if (mp->mnt_exjail->cr_prison != pr) {
+			MNT_IUNLOCK(mp);
+			error = EPERM;
+			goto out;
+		} else
+			MNT_IUNLOCK(mp);
 		if (nep == NULL) {
-			nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO);
+			nep = malloc(sizeof(struct netexport), M_MOUNT,
+			    M_WAITOK | M_ZERO);
 			mp->mnt_export = nep;
+			new_nep = true;
 		}
 		if (argp->ex_flags & MNT_EXPUBLIC) {
-			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) {
+				if (new_nep) {
+					mp->mnt_export = NULL;
+					free(nep, M_MOUNT);
+				}
 				goto out;
+			}
+			new_nep = false;
 			MNT_ILOCK(mp);
+			if (do_exjail && mp->mnt_exjail == NULL) {
+				mp->mnt_exjail = crhold(curthread->td_ucred);
+				atomic_add_int(&pr->pr_exportcnt, 1);
+			}
 			mp->mnt_flag |= MNT_EXPUBLIC;
 			MNT_IUNLOCK(mp);
 		}
@@ -349,9 +405,18 @@ vfs_export(struct mount *mp, struct export_args *argp)
 			argp->ex_numsecflavors = 1;
 			argp->ex_secflavors[0] = AUTH_SYS;
 		}
-		if ((error = vfs_hang_addrlist(mp, nep, argp)))
+		if ((error = vfs_hang_addrlist(mp, nep, argp))) {
+			if (new_nep) {
+				mp->mnt_export = NULL;
+				free(nep, M_MOUNT);
+			}
 			goto out;
+		}
 		MNT_ILOCK(mp);
+		if (do_exjail && mp->mnt_exjail == NULL) {
+			mp->mnt_exjail = crhold(curthread->td_ucred);
+			atomic_add_int(&pr->pr_exportcnt, 1);
+		}
 		mp->mnt_flag |= MNT_EXPORTED;
 		MNT_IUNLOCK(mp);
 	}
@@ -371,6 +436,97 @@ out:
 	return (error);
 }
 
+/*
+ * Get rid of credential references for this prison.
+ */
+void
+vfs_exjail_delete(struct prison *pr)
+{
+	struct mount *mp;
+	struct ucred *cr;
+	int error, i;
+
+	/*
+	 * Since this function is called from prison_cleanup() after
+	 * all processes in the prison have exited, the value of
+	 * pr_exportcnt can no longer increase.  It is possible for
+	 * a dismount of a file system exported within this prison
+	 * to be in progress.  In this case, the file system is no
+	 * longer in the mountlist and the mnt_exjail will be free'd
+	 * by vfs_mount_destroy() at some time.  As such, pr_exportcnt
+	 * and, therefore "i", is the upper bound on the number of
+	 * mnt_exjail entries to be found by this function.
+	 */
+	i = atomic_load_int(&pr->pr_exportcnt);
+	KASSERT(i >= 0, ("vfs_exjail_delete: pr_exportcnt negative"));
+	if (i == 0)
+		return;
+	mtx_lock(&mountlist_mtx);
+tryagain:
+	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+		MNT_ILOCK(mp);
+		if (mp->mnt_exjail != NULL &&
+		    mp->mnt_exjail->cr_prison == pr) {
+			MNT_IUNLOCK(mp);
+			error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
+			if (error != 0) {
+				/*
+				 * If the vfs_busy() fails, we still want to
+				 * get rid of mnt_exjail for two reasons:
+				 * - a credential reference will result in
+				 *   a prison not being removed
+				 * - setting mnt_exjail NULL indicates that
+				 *   the exports are no longer valid
+				 * The now invalid exports will be deleted
+				 * when the file system is dismounted or
+				 * the file system is re-exported by mountd.
+				 */
+				cr = NULL;
+				MNT_ILOCK(mp);
+				if (mp->mnt_exjail != NULL &&
+				    mp->mnt_exjail->cr_prison == pr) {
+					cr = mp->mnt_exjail;
+					mp->mnt_exjail = NULL;
+				}
+				MNT_IUNLOCK(mp);
+				if (cr != NULL) {
+					crfree(cr);
+					i--;
+				}
+				if (i == 0)
+					break;
+				continue;
+			}
+			cr = NULL;
+			lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL);
+			MNT_ILOCK(mp);
+			if (mp->mnt_exjail != NULL &&
+			    mp->mnt_exjail->cr_prison == pr) {
+				cr = mp->mnt_exjail;
+				mp->mnt_exjail = NULL;
+				mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+				MNT_IUNLOCK(mp);
+				vfs_free_addrlist(mp->mnt_export);
+				free(mp->mnt_export, M_MOUNT);
+				mp->mnt_export = NULL;
+			} else
+				MNT_IUNLOCK(mp);
+			lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+			if (cr != NULL) {
+				crfree(cr);
+				i--;
+			}
+			mtx_lock(&mountlist_mtx);
+			vfs_unbusy(mp);
+			if (i == 0)
+				break;
+			goto tryagain;
+		}
+		MNT_IUNLOCK(mp);
+	}
+	mtx_unlock(&mountlist_mtx);
+}
+
 /*
  * Set the publicly exported filesystem (WebNFS). Currently, only
  * one public filesystem is possible in the spec (RFC 2054 and 2055)
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index d5b137e7ffab..c43385b9736b 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -618,6 +618,11 @@ vfs_mount_destroy(struct mount *mp)
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
+	if (mp->mnt_exjail != NULL) {
+		atomic_subtract_int(&mp->mnt_exjail->cr_prison->pr_exportcnt,
+		    1);
+		crfree(mp->mnt_exjail);
+	}
 	if (mp->mnt_export != NULL) {
 		vfs_free_addrlist(mp->mnt_export);
 		free(mp->mnt_export, M_MOUNT);
@@ -1236,7 +1241,7 @@ vfs_domount_update(
 			} else
 				export_error = EINVAL;
 			if (export_error == 0)
-				export_error = vfs_export(mp, &export);
+				export_error = vfs_export(mp, &export, 1);
 			free(export.ex_groups, M_TEMP);
 			break;
 		case (sizeof(export)):
@@ -1258,7 +1263,7 @@ vfs_domount_update(
 			else
 				export_error = EINVAL;
 			if (export_error == 0)
-				export_error = vfs_export(mp, &export);
+				export_error = vfs_export(mp, &export, 1);
 			free(grps, M_TEMP);
 			break;
 		default:
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index f4d4e521d7de..eee971c3b5ce 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -190,7 +190,8 @@ struct prison {
 	int		 pr_enforce_statfs;		/* (p) statfs permission */
 	int		 pr_devfs_rsnum;		/* (p) devfs ruleset */
 	enum prison_state pr_state;			/* (q) state in life cycle */
-	int		 pr_spare[2];
+	volatile int	 pr_exportcnt;			/* (r) count of mount exports */
+	int		 pr_spare;
 	int		 pr_osreldate;			/* (c) kern.osreldate value */
 	unsigned long	 pr_hostid;			/* (p) jail hostid */
 	char		 pr_name[MAXHOSTNAMELEN];	/* (p) admin jail name */
@@ -468,6 +469,7 @@ void prison_racct_foreach(void (*callback)(struct racct *racct,
 struct prison_racct *prison_racct_find(const char *name);
 void prison_racct_hold(struct prison_racct *prr);
 void prison_racct_free(struct prison_racct *prr);
+void vfs_exjail_delete(struct prison *);
 
 #endif /* _KERNEL */
 #endif /* !_SYS_JAIL_H_ */
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 9a69240ddba5..7049dc5f1d05 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -222,7 +222,7 @@ struct mount {
 	int		mnt_writeopcount;	/* (i) write syscalls pending */
 	struct vfsoptlist *mnt_opt;		/* current mount options */
 	struct vfsoptlist *mnt_optnew;		/* new options passed to fs */
-	u_int		mnt_pad0;		/* was mnt_maxsymlinklen */
+	struct ucred	*mnt_exjail;		/* (i) jail which did exports */
 	struct statfs	mnt_stat;		/* cache of filesystem stats */
 	struct ucred	*mnt_cred;		/* credentials of mounter */
 	void *		mnt_data;		/* private data */
@@ -986,7 +986,7 @@ int	vfs_setpublicfs			    /* set publicly exported fs */
 void	vfs_periodic(struct mount *, int);
 int	vfs_busy(struct mount *, int);
 int	vfs_export			 /* process mount export info */
-	    (struct mount *, struct export_args *);
+	    (struct mount *, struct export_args *, int);
 void	vfs_free_addrlist(struct netexport *);
 void	vfs_allocate_syncvnode(struct mount *);
 void	vfs_deallocate_syncvnode(struct mount *);