git: 8d9ed174f3af - main - open(2): Implement O_PATH

Konstantin Belousov kib at FreeBSD.org
Thu Apr 15 09:50:07 UTC 2021


The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=8d9ed174f3afba5f114742447e622fc1173d4774

commit 8d9ed174f3afba5f114742447e622fc1173d4774
Author:     Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-03-18 10:41:47 +0000
Commit:     Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-04-15 09:48:24 +0000

    open(2): Implement O_PATH
    
    Reviewed by:    markj
    Tested by:      pho
    Discussed with: walker.aj325_gmail.com, wulf
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D29323
---
 lib/libc/sys/open.2     | 41 ++++++++++++++++++++++++++++++++-
 sys/kern/kern_descrip.c | 46 +++++++++++++++++++++++++++++++++----
 sys/kern/vfs_aio.c      |  5 ++++
 sys/kern/vfs_lookup.c   |  6 +++--
 sys/kern/vfs_syscalls.c | 61 ++++++++++++++++++++++++++++++++++++++-----------
 sys/kern/vfs_vnops.c    | 34 ++++++++++++++++-----------
 sys/sys/fcntl.h         |  8 ++++---
 sys/sys/file.h          |  1 +
 sys/sys/filedesc.h      |  2 ++
 9 files changed, 168 insertions(+), 36 deletions(-)

diff --git a/lib/libc/sys/open.2 b/lib/libc/sys/open.2
index e24c823d039a..f9c54bfc7581 100644
--- a/lib/libc/sys/open.2
+++ b/lib/libc/sys/open.2
@@ -28,7 +28,7 @@
 .\"     @(#)open.2	8.2 (Berkeley) 11/16/93
 .\" $FreeBSD$
 .\"
-.Dd February 23, 2021
+.Dd March 18, 2021
 .Dt OPEN 2
 .Os
 .Sh NAME
@@ -168,6 +168,7 @@ O_DIRECTORY	error if file is not a directory
 O_CLOEXEC	set FD_CLOEXEC upon open
 O_VERIFY	verify the contents of the file
 O_RESOLVE_BENEATH	path resolution must not cross the fd directory
+O_PATH		record only the target path in the opened descriptor
 .Ed
 .Pp
 Opening a file with
@@ -316,6 +317,44 @@ The primary use for this descriptor will be as the lookup descriptor for the
 .Fn *at
 family of functions.
 .Pp
+.Dv O_PATH
+returns a file descriptor that can be used as a directory file descriptor for
+.Xr openat 2
+and other system calls taking a file descriptor argument, like
+.Xr fstatat 2
+and others.
+The other functionality of the returned file descriptor is limited to
+the descriptor-level operations.
+It can be used for
+.Bl -tag -width SCM_RIGHTS -offset indent -compact
+.It Xr fcntl 2
+but advisory locking is not allowed
+.It Xr dup 2
+.It Xr close 2
+.It Xr fstat 2
+.It Xr fexecve 2
+requires that
+.Dv O_EXEC
+was also specified at open time
+.It Dv SCM_RIGHTS
+can be passed over a
+.Xr unix 4
+socket using a
+.Dv SCM_RIGHTS
+message
+.El
+But operations like
+.Xr read 2 ,
+.Xr ftruncate 2 ,
+and any others that operate on the file rather than the file descriptor (except
+.Xr fstat 2 ),
+are not allowed.
+See also the description of the
+.Dv AT_EMPTY_PATH
+flag for
+.Xr fstatat 2
+and related syscalls.
+.Pp
 If successful,
 .Fn open
 returns a non-negative integer, termed a file descriptor.
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 7a43fbb2eb80..81af58fbddd1 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
+#include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
@@ -546,6 +547,11 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
 		if (error != 0)
 			break;
+		if (fp->f_ops == &path_fileops) {
+			fdrop(fp, td);
+			error = EBADF;
+			break;
+		}
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
@@ -610,7 +616,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
 		if (error != 0)
 			break;
-		if (fp->f_type != DTYPE_VNODE) {
+		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
@@ -715,7 +721,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
 		if (error != 0)
 			break;
-		if (fp->f_type != DTYPE_VNODE) {
+		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
@@ -771,7 +777,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
-		if (fp->f_type != DTYPE_VNODE) {
+		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
@@ -3544,7 +3550,7 @@ sys_flock(struct thread *td, struct flock_args *uap)
 	error = fget(td, uap->fd, &cap_flock_rights, &fp);
 	if (error != 0)
 		return (error);
-	if (fp->f_type != DTYPE_VNODE) {
+	if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
@@ -4960,6 +4966,38 @@ struct fileops badfileops = {
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
+static int
+path_poll(struct file *fp, int events, struct ucred *active_cred,
+    struct thread *td)
+{
+	return (POLLNVAL);
+}
+
+static int
+path_close(struct file *fp, struct thread *td)
+{
+	MPASS(fp->f_type == DTYPE_VNODE);
+	fp->f_ops = &badfileops;
+	vrele(fp->f_vnode);
+	return (0);
+}
+
+struct fileops path_fileops = {
+	.fo_read = badfo_readwrite,
+	.fo_write = badfo_readwrite,
+	.fo_truncate = badfo_truncate,
+	.fo_ioctl = badfo_ioctl,
+	.fo_poll = path_poll,
+	.fo_kqfilter = badfo_kqfilter,
+	.fo_stat = vn_statfile,
+	.fo_close = path_close,
+	.fo_chmod = badfo_chmod,
+	.fo_chown = badfo_chown,
+	.fo_sendfile = badfo_sendfile,
+	.fo_fill_kinfo = vn_fill_kinfo,
+	.fo_flags = DFLAG_PASSABLE,
+};
+
 int
 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 9b45a06c5f9f..640e82b6f0ff 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -1619,6 +1619,11 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
 		goto err3;
 	}
 
+	if (fp != NULL && fp->f_ops == &path_fileops) {
+		error = EBADF;
+		goto err3;
+	}
+
 	job->fd_file = fp;
 
 	mtx_lock(&aio_job_mtx);
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index f4ec3cea9fff..f979676f4c7d 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -360,8 +360,10 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
 			if (cnp->cn_flags & AUDITVNODE2)
 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
 			/*
-			 * Effectively inlined fgetvp_rights, because we need to
-			 * inspect the file as well as grabbing the vnode.
+			 * Effectively inlined fgetvp_rights, because
+			 * we need to inspect the file as well as
+			 * grabbing the vnode.  O_PATH files are not
+			 * filtered out, to implement O_PATH semantics.
 			 */
 			error = fget_cap(td, ndp->ni_dirfd, &rights,
 			    &dfp, &ndp->ni_filecaps);
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 45f155ebff3d..5a1efcdec467 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -375,7 +375,7 @@ kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 	int error;
 
 	AUDIT_ARG_FD(fd);
-	error = getvnode(td, fd, &cap_fstatfs_rights, &fp);
+	error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
@@ -891,7 +891,7 @@ sys_fchdir(struct thread *td, struct fchdir_args *uap)
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
-	error = getvnode(td, uap->fd, &cap_fchdir_rights,
+	error = getvnode_path(td, uap->fd, &cap_fchdir_rights,
 	    &fp);
 	if (error != 0)
 		return (error);
@@ -1023,9 +1023,10 @@ change_dir(struct vnode *vp, struct thread *td)
 static __inline void
 flags_to_rights(int flags, cap_rights_t *rightsp)
 {
-
 	if (flags & O_EXEC) {
 		cap_rights_set_one(rightsp, CAP_FEXECVE);
+		if (flags & O_PATH)
+			return;
 	} else {
 		switch ((flags & O_ACCMODE)) {
 		case O_RDONLY:
@@ -1112,11 +1113,15 @@ kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
 	AUDIT_ARG_MODE(mode);
 	cap_rights_init_one(&rights, CAP_LOOKUP);
 	flags_to_rights(flags, &rights);
+
 	/*
 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
-	 * may be specified.
+	 * may be specified.  On the other hand, for O_PATH any mode
+	 * except O_EXEC is ignored.
 	 */
-	if (flags & O_EXEC) {
+	if ((flags & O_PATH) != 0) {
+		flags &= ~(O_CREAT | O_ACCMODE);
+	} else if ((flags & O_EXEC) != 0) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
 	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
@@ -1145,8 +1150,10 @@ kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
 		 * wonderous happened deep below and we just pass it up
 		 * pretending we know what we do.
 		 */
-		if (error == ENXIO && fp->f_ops != &badfileops)
+		if (error == ENXIO && fp->f_ops != &badfileops) {
+			MPASS((flags & O_PATH) == 0);
 			goto success;
+		}
 
 		/*
 		 * Handle special fdopen() case. bleh.
@@ -1176,14 +1183,16 @@ kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
 	 * files that switched type in the cdevsw fdopen() method.
 	 */
 	fp->f_vnode = vp;
+
 	/*
 	 * If the file wasn't claimed by devfs bind it to the normal
 	 * vnode operations here.
 	 */
 	if (fp->f_ops == &badfileops) {
-		KASSERT(vp->v_type != VFIFO,
+		KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0,
 		    ("Unexpected fifo fp %p vp %p", fp, vp));
-		finit_vnode(fp, flags, NULL, &vnops);
+		finit_vnode(fp, flags, NULL, (flags & O_PATH) != 0 ?
+		    &path_fileops : &vnops);
 	}
 
 	VOP_UNLOCK(vp);
@@ -1882,7 +1891,7 @@ kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
 
 	fp = NULL;
 	if (fd != FD_NONE) {
-		error = getvnode(td, fd, &cap_no_rights, &fp);
+		error = getvnode_path(td, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			return (error);
 	}
@@ -4255,12 +4264,13 @@ out:
 }
 
 /*
- * Convert a user file descriptor to a kernel file entry and check that, if it
- * is a capability, the correct rights are present. A reference on the file
- * entry is held upon returning.
+ * This variant of getvnode() allows O_PATH files.  Caller should
+ * ensure that returned file and vnode are only used for compatible
+ * semantics.
  */
 int
-getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp,
+    struct file **fpp)
 {
 	struct file *fp;
 	int error;
@@ -4285,10 +4295,35 @@ getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 		fdrop(fp, td);
 		return (EINVAL);
 	}
+
 	*fpp = fp;
 	return (0);
 }
 
+/*
+ * Convert a user file descriptor to a kernel file entry and check
+ * that, if it is a capability, the correct rights are present.
+ * A reference on the file entry is held upon returning.
+ */
+int
+getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+	int error;
+
+	error = getvnode_path(td, fd, rightsp, fpp);
+
+	/*
+	 * Filter out O_PATH file descriptors, since most getvnode()
+	 * callers do not call fo_ methods.
+	 */
+	if (error == 0 && (*fpp)->f_ops == &path_fileops) {
+		fdrop(*fpp, td);
+		error = EBADF;
+	}
+
+	return (error);
+}
+
 /*
  * Get an (NFS) file handle.
  */
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 98f37d26ea8c..6339295b0556 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -391,25 +391,30 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 		return (EOPNOTSUPP);
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
+
 	accmode = 0;
-	if (fmode & (FWRITE | O_TRUNC)) {
-		if (vp->v_type == VDIR)
-			return (EISDIR);
-		accmode |= VWRITE;
+	if ((fmode & O_PATH) == 0) {
+		if ((fmode & (FWRITE | O_TRUNC)) != 0) {
+			if (vp->v_type == VDIR)
+				return (EISDIR);
+			accmode |= VWRITE;
+		}
+		if ((fmode & FREAD) != 0)
+			accmode |= VREAD;
+		if ((fmode & O_APPEND) && (fmode & FWRITE))
+			accmode |= VAPPEND;
+#ifdef MAC
+		if ((fmode & O_CREAT) != 0)
+			accmode |= VCREAT;
+#endif
 	}
-	if (fmode & FREAD)
-		accmode |= VREAD;
-	if (fmode & FEXEC)
+	if ((fmode & FEXEC) != 0)
 		accmode |= VEXEC;
-	if ((fmode & O_APPEND) && (fmode & FWRITE))
-		accmode |= VAPPEND;
 #ifdef MAC
-	if (fmode & O_CREAT)
-		accmode |= VCREAT;
-	if (fmode & O_VERIFY)
+	if ((fmode & O_VERIFY) != 0)
 		accmode |= VVERIFY;
 	error = mac_vnode_check_open(cred, vp, accmode);
-	if (error)
+	if (error != 0)
 		return (error);
 
 	accmode &= ~(VCREAT | VVERIFY);
@@ -419,6 +424,9 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 		if (error != 0)
 			return (error);
 	}
+	if ((fmode & O_PATH) != 0)
+		return (0);
+
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	error = VOP_OPEN(vp, fmode, cred, td, fp);
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index 0fa4e7758c9d..c328abaa02af 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -135,7 +135,7 @@ typedef	__pid_t		pid_t;
 
 #if __BSD_VISIBLE
 #define	O_VERIFY	0x00200000	/* open only after verification */
-/* #define O_UNUSED1	0x00400000   */	/* Was O_BENEATH */
+#define O_PATH		0x00400000	/* fd is only a path */
 #define	O_RESOLVE_BENEATH 0x00800000	/* Do not allow name resolution to walk
 					   out of cwd */
 #endif
@@ -156,10 +156,12 @@ typedef	__pid_t		pid_t;
 
 /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
 #define	FFLAGS(oflags)	((oflags) & O_EXEC ? (oflags) : (oflags) + 1)
-#define	OFLAGS(fflags)	((fflags) & O_EXEC ? (fflags) : (fflags) - 1)
+#define	OFLAGS(fflags)	\
+    (((fflags) & (O_EXEC | O_PATH)) != 0 ? (fflags) : (fflags) - 1)
 
 /* bits to save after open */
-#define	FMASK	(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK|O_DIRECT|FEXEC)
+#define	FMASK	(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK| \
+		 O_DIRECT|FEXEC|O_PATH)
 /* bits settable by fcntl(F_SETFL, ...) */
 #define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT)
 
diff --git a/sys/sys/file.h b/sys/sys/file.h
index c4fc70f517a4..9237ee5ceb9d 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -239,6 +239,7 @@ struct xfile {
 
 extern struct fileops vnops;
 extern struct fileops badfileops;
+extern struct fileops path_fileops;
 extern struct fileops socketops;
 extern int maxfiles;		/* kernel limit on number of open files */
 extern int maxfilesperproc;	/* per process limit on number of open files */
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 8c5aa258ed28..7f18d8a2286c 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -265,6 +265,8 @@ struct filedesc_to_leader *
 	    struct filedesc *fdp, struct proc *leader);
 int	getvnode(struct thread *td, int fd, cap_rights_t *rightsp,
 	    struct file **fpp);
+int	getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp,
+	    struct file **fpp);
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
 
 int	fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,


More information about the dev-commits-src-all mailing list