capability races (was: Re: Seeing ENOTCAPABLE from write FDs in kqueue?)

Sun Jun 8 22:00:07 UTC 2014

Current support for capabilities is racy in respect to somewhat
misbehaving programs.

When fp is being installed under given fd number it is available before
capabilities are copied.

This is because fget_unlocked only gets fp atomatically, but caps can be
modified irrespectively to that.

This has 2 problems:
- if the code is trying to access fd before the syscall returns, it may
  get ENOTCAPABLE (minor)
- if the code is racing with dup2 it can access given fp in a way
  prevented by not-yet-installed caps (needs fixing)

In url below I provide two reproducers:
http://people.freebsd.org/~mjg/patches/caps-race/

One will open+close in 1 thread + read in 7 threads trying to get
ENOTCAPABLE.

Second one will create a fd with stripped caps and dup2 capped/uncapped
fd in 1 thread + read in 7 threads trying to succeed.

I propose to fix the problem by introducing sequence counters.

I tested the patch below succesfully with my reproducers.

Note that the patch is somewhat incomplete (ioctls are not handled) and
has stuff in wrong places, I can do necessary tidy ups if it is agreed
the approach taken is ok.

This was not benchmarked yet, but should still beat shared locking :)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 88b26af..25e4a98 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -131,6 +131,51 @@ static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
 static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
 static int	getmaxfd(struct proc *p);
 
+
+static inline void
+seq_begin(seq_t *seqp)
+{
+
+	MPASS((*seqp & 1) == 0);
+	(*seqp)++;
+	wmb();
+}
+
+static inline void
+seq_end(seq_t *seqp)
+{
+
+	wmb();
+	(*seqp)++;
+	MPASS((*seqp & 1) == 0);
+}
+
+static inline seq_t
+seq_read(seq_t *seqp)
+{
+	seq_t ret;
+
+	ret = *seqp;
+	rmb();
+
+	return (ret);
+}
+
+static inline bool
+seq_in_modify(seq_t seq)
+{
+
+	return (seq & 1);
+}
+
+static inline bool
+seq_changed(seq_t seq1, seq_t seq2)
+{
+
+	MPASS(!seq_in_modify(seq1));
+	return (seq1 != seq2);
+}
+
 /*
  * Each process has:
  *
@@ -301,9 +346,11 @@ fdfree(struct filedesc *fdp, int fd)
 	struct filedescent *fde;
 
 	fde = &fdp->fd_ofiles[fd];
+	seq_begin(&fde->fde_seq);
 	filecaps_free(&fde->fde_caps);
-	bzero(fde, sizeof(*fde));
+	bzero(fde_change(fde), fde_change_size);
 	fdunused(fdp, fd);
+	seq_end(&fde->fde_seq);
 }
 
 /*
@@ -878,8 +925,9 @@ do_dup(struct thread *td, int flags, int old, int new,
 	/*
 	 * Duplicate the source descriptor.
 	 */
+	seq_begin(&newfde->fde_seq);
 	filecaps_free(&newfde->fde_caps);
-	*newfde = *oldfde;
+	memcpy(fde_change(newfde), fde_change(oldfde), fde_change_size);
 	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
 	if ((flags & DUP_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
@@ -887,6 +935,7 @@ do_dup(struct thread *td, int flags, int old, int new,
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
+	seq_end(&newfde->fde_seq);
 	*retval = new;
 
 	if (delfp != NULL) {
@@ -1753,6 +1802,7 @@ finstall(struct thread *td, struct file *fp, int *fd, int flags,
 	}
 	fhold(fp);
 	fde = &fdp->fd_ofiles[*fd];
+	seq_begin(&fde->fde_seq);
 	fde->fde_file = fp;
 	if ((flags & O_CLOEXEC) != 0)
 		fde->fde_flags |= UF_EXCLOSE;
@@ -1760,6 +1810,7 @@ finstall(struct thread *td, struct file *fp, int *fd, int flags,
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
+	seq_end(&fde->fde_seq);
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
@@ -2313,6 +2364,8 @@ fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 	cap_rights_t haverights;
 	int error;
 #endif
+	struct filedescent fde;
+	int seq1, seq2;
 
 	/*
 	 * Avoid reads reordering and then a first access to the
@@ -2329,11 +2382,18 @@ fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 	 * due to preemption.
 	 */
 	for (;;) {
-		fp = fdp->fd_ofiles[fd].fde_file;
+		seq1 = seq_read(&fdp->fd_ofiles[fd].fde_seq);
+		if (seq_in_modify(seq1))
+			continue;
+		fde = fdp->fd_ofiles[fd];
+		seq2 = seq_read(&fdp->fd_ofiles[fd].fde_seq);
+		if (seq_changed(seq1, seq2))
+			continue;
+		fp = fde.fde_file;
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
-		haverights = *cap_rights(fdp, fd);
+		haverights = fde.fde_rights;
 		if (needrightsp != NULL) {
 			error = cap_check(&haverights, needrightsp);
 			if (error != 0)
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 516ef1b..84b38f2 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -42,6 +42,8 @@
 
 #include <machine/_limits.h>
 
+typedef uint32_t	seq_t;
+
 struct filecaps {
 	cap_rights_t	 fc_rights;	/* per-descriptor capability rights */
 	u_long		*fc_ioctls;	/* per-descriptor allowed ioctls */
@@ -50,6 +52,7 @@ struct filecaps {
 };
 
 struct filedescent {
+	seq_t		 fde_seq;
 	struct file	*fde_file;		/* file structure for open file */
 	struct filecaps	 fde_caps;		/* per-descriptor rights */
 	uint8_t		 fde_flags;		/* per-process open file flags */
@@ -58,6 +61,8 @@ struct filedescent {
 #define	fde_fcntls	fde_caps.fc_fcntls
 #define	fde_ioctls	fde_caps.fc_ioctls
 #define	fde_nioctls	fde_caps.fc_nioctls
+#define fde_change(fde)	((char *)(fde) + sizeof(seq_t))
+#define fde_change_size	(sizeof(struct filedescent) - sizeof(seq_t))
 
 /*
  * This structure is used for the management of descriptors.  It may be
-- 
Mateusz Guzik <mjguzik gmail.com>