svn commit: r192372 - in user/kmacy/releng_7_2_fcs/sys: kern sys

Kip Macy kmacy at FreeBSD.org
Tue May 19 05:36:11 UTC 2009


Author: kmacy
Date: Tue May 19 05:36:10 2009
New Revision: 192372
URL: http://svn.freebsd.org/changeset/base/192372

Log:
  merge changes to support lockless fget
  
  187677,187682,187693,187996,189708,192080
  
  Fix errors introduced when I rewrote select.
   - Restructure selscan() and selrescan() to avoid producing extra selfps
     when we have a fd in multiple sets.  As described below multiple selfps
     may still exist for other reasons.
   - Make selrescan() tolerate multiple selfds for a given descriptor
     set since sockets use two selinfos per fd.  If an event on each selinfo
     fires selrescan() will see the descriptor twice.  This could result in
     select() returning 2x the number of fds actually existing in fd sets.
  
  Reported by:	mgleason at ncftp.com
  
  - bit has to be fd_mask to work properly on 64bit platforms.  Constants
     must also be cast even though the result ultimately is promoted
     to 64bit.
   - Correct a loop index upper bound in selscan()
  
  Fix select on platforms where sizeof(long) != sizeof(int). This used
  to work by accident before the cleanup done in revision 187693.
  
  Approved by:	kan (mentor)
  
  - Implement a lockless file descriptor lookup algorithm in
     fget_unlocked().
   - Save old file descriptor tables created on expansion until
     the entire descriptor table is freed so that pointers may be
     followed without regard for expanders.
   - Mark the file zone as NOFREE so we may attempt to reference
     potentially freed files.
   - Convert several fget_locked() users to fget_unlocked().  This
     requires us to manage reference counts explicitly but reduces
     locking overhead in the common case.

Modified:
  user/kmacy/releng_7_2_fcs/sys/kern/kern_descrip.c
  user/kmacy/releng_7_2_fcs/sys/kern/sys_generic.c
  user/kmacy/releng_7_2_fcs/sys/kern/uipc_syscalls.c
  user/kmacy/releng_7_2_fcs/sys/kern/vfs_syscalls.c
  user/kmacy/releng_7_2_fcs/sys/sys/filedesc.h

Modified: user/kmacy/releng_7_2_fcs/sys/kern/kern_descrip.c
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/kern/kern_descrip.c	Tue May 19 05:17:41 2009	(r192371)
+++ user/kmacy/releng_7_2_fcs/sys/kern/kern_descrip.c	Tue May 19 05:36:10 2009	(r192372)
@@ -123,12 +123,24 @@ static void	fdused(struct filedesc *fdp,
 #define OFILESIZE (sizeof(struct file *) + sizeof(char))
 
 /*
+ * Storage to hold unused ofiles that need to be reclaimed.
+ */
+struct freetable {
+	struct file	**ft_table;
+	SLIST_ENTRY(freetable) ft_next;
+};
+
+/*
  * Basic allocation of descriptors:
  * one of the above, plus arrays for NDFILE descriptors.
  */
 struct filedesc0 {
 	struct	filedesc fd_fd;
 	/*
+	 * ofiles which need to be reclaimed on free.
+	 */
+	SLIST_HEAD(,freetable) fd_free;
+	/*
 	 * These arrays are used when the number of open files is
 	 * <= NDFILE, and are then pointed to by the pointers above.
 	 */
@@ -1282,7 +1294,10 @@ out:
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
+	struct filedesc0 *fdp0;
+	struct freetable *fo;
 	struct file **ntable;
+	struct file **otable;
 	char *nfileflags;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap;
@@ -1301,7 +1316,7 @@ fdgrowtable(struct filedesc *fdp, int nf
 
 	/* allocate a new table and (if required) new bitmaps */
 	FILEDESC_XUNLOCK(fdp);
-	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
+	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	nfileflags = (char *)&ntable[nnfiles];
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
@@ -1325,10 +1340,20 @@ fdgrowtable(struct filedesc *fdp, int nf
 	}
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
-	if (onfiles > NDFILE)
-		free(fdp->fd_ofiles, M_FILEDESC);
-	fdp->fd_ofiles = ntable;
+	otable = fdp->fd_ofiles;
 	fdp->fd_ofileflags = nfileflags;
+	fdp->fd_ofiles = ntable;
+	/*
+	 * We must preserve ofiles until the process exits because we can't
+	 * be certain that no threads have references to the old table via
+	 * _fget().
+	 */
+	if (onfiles > NDFILE) {
+		fo = (struct freetable *)&otable[onfiles];
+		fdp0 = (struct filedesc0 *)fdp;
+		fo->ft_table = otable;
+		SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
+	}
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
 		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
@@ -1526,6 +1551,8 @@ fdhold(struct proc *p)
 static void
 fddrop(struct filedesc *fdp)
 {
+	struct filedesc0 *fdp0;
+	struct freetable *ft;
 	int i;
 
 	mtx_lock(&fdesc_mtx);
@@ -1535,7 +1562,12 @@ fddrop(struct filedesc *fdp)
 		return;
 
 	FILEDESC_LOCK_DESTROY(fdp);
-	FREE(fdp, M_FILEDESC);
+	fdp0 = (struct filedesc0 *)fdp;
+	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
+		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
+		free(ft->ft_table, M_FILEDESC);
+	}
+	free(fdp, M_FILEDESC);
 }
 
 /*
@@ -2036,6 +2068,38 @@ finit(struct file *fp, u_int flag, short
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
+struct file *
+fget_unlocked(struct filedesc *fdp, int fd)
+{
+	struct file *fp;
+	u_int count;
+
+	if (fd < 0 || fd >= fdp->fd_nfiles)
+		return (NULL);
+	/*
+	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
+	 * never raising a refcount above 0.  To accomplish this we have
+	 * to use a cmpset loop rather than an atomic_add.  The descriptor
+	 * must be re-verified once we acquire a reference to be certain
+	 * that the identity is still correct and we did not lose a race
+	 * due to preemption.
+	 */
+	for (;;) {
+		fp = fdp->fd_ofiles[fd];
+		if (fp == NULL)
+			break;
+		count = fp->f_count;
+		if (count == 0)
+			continue;
+		if (atomic_cmpset_int(&fp->f_count, count, count + 1) != 1)
+			continue;
+		if (fp == ((struct file *volatile*)fdp->fd_ofiles)[fd])
+			break;
+		fdrop(fp, curthread);
+	}
+
+	return (fp);
+}
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
@@ -2043,19 +2107,12 @@ finit(struct file *fp, u_int flag, short
  *
  * If the descriptor doesn't exist, EBADF is returned.
  *
- * If the descriptor exists but doesn't match 'flags' then return EBADF for
- * read attempts and EINVAL for write attempts.
- *
- * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
- * It should be dropped with fdrop().  If it is not set, then the refcount
- * will not be bumped however the thread's filedesc struct will be returned
- * locked (for fgetsock).
- *
  * If an error occured the non-zero error is returned and *fpp is set to
- * NULL.  Otherwise *fpp is set and zero is returned.
+ * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
+ * responsible for fdrop().
  */
 static __inline int
-_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
+_fget(struct thread *td, int fd, struct file **fpp, int flags)
 {
 	struct filedesc *fdp;
 	struct file *fp;
@@ -2063,29 +2120,22 @@ _fget(struct thread *td, int fd, struct 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
-	FILEDESC_SLOCK(fdp);
-	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
-		FILEDESC_SUNLOCK(fdp);
+	if ((fp = fget_unlocked(fdp, fd)) == NULL)
+		return (EBADF);
+	if (fp->f_ops == &badfileops) {
+		fdrop(fp, td);
 		return (EBADF);
 	}
-
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 *
 	 * Only one flag, or 0, may be specified.
 	 */
-	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
-		FILEDESC_SUNLOCK(fdp);
-		return (EBADF);
-	}
-	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
-		FILEDESC_SUNLOCK(fdp);
+	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
+	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
+		fdrop(fp, td);
 		return (EBADF);
 	}
-	if (hold) {
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
-	}
 	*fpp = fp;
 	return (0);
 }
@@ -2094,21 +2144,21 @@ int
 fget(struct thread *td, int fd, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, 0, 1));
+	return(_fget(td, fd, fpp, 0));
 }
 
 int
 fget_read(struct thread *td, int fd, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, FREAD, 1));
+	return(_fget(td, fd, fpp, FREAD));
 }
 
 int
 fget_write(struct thread *td, int fd, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, FWRITE, 1));
+	return(_fget(td, fd, fpp, FWRITE));
 }
 
 /*
@@ -2125,7 +2175,7 @@ _fgetvp(struct thread *td, int fd, struc
 	int error;
 
 	*vpp = NULL;
-	if ((error = _fget(td, fd, &fp, flags, 0)) != 0)
+	if ((error = _fget(td, fd, &fp, flags)) != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
@@ -2133,7 +2183,8 @@ _fgetvp(struct thread *td, int fd, struc
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
-	FILEDESC_SUNLOCK(td->td_proc->p_fd);
+	fdrop(fp, td);
+
 	return (error);
 }
 
@@ -2180,7 +2231,7 @@ fgetsock(struct thread *td, int fd, stru
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
-	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
+	if ((error = _fget(td, fd, &fp, 0)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
@@ -2192,7 +2243,8 @@ fgetsock(struct thread *td, int fd, stru
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
-	FILEDESC_SUNLOCK(td->td_proc->p_fd);
+	fdrop(fp, td);
+
 	return (error);
 }
 
@@ -3123,7 +3175,7 @@ filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
 }

Modified: user/kmacy/releng_7_2_fcs/sys/kern/sys_generic.c
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/kern/sys_generic.c	Tue May 19 05:17:41 2009	(r192371)
+++ user/kmacy/releng_7_2_fcs/sys/kern/sys_generic.c	Tue May 19 05:36:10 2009	(r192372)
@@ -838,6 +838,71 @@ done:
 
 	return (error);
 }
+/* 
+ * Convert a select bit set to poll flags.
+ *
+ * The backend always returns POLLHUP/POLLERR if appropriate and we
+ * return this as a set bit in any set.
+ */
+static int select_flags[3] = {
+    POLLRDNORM | POLLHUP | POLLERR,
+    POLLWRNORM | POLLHUP | POLLERR,
+    POLLRDBAND | POLLHUP | POLLERR
+};
+
+/*
+ * Compute the fo_poll flags required for a fd given by the index and
+ * bit position in the fd_mask array.
+ */
+static __inline int
+selflags(fd_mask **ibits, int idx, fd_mask bit)
+{
+	int flags;
+	int msk;
+
+	flags = 0;
+	for (msk = 0; msk < 3; msk++) {
+		if (ibits[msk] == NULL)
+			continue;
+		if ((ibits[msk][idx] & bit) == 0)
+			continue;
+		flags |= select_flags[msk];
+	}
+	return (flags);
+}
+
+/*
+ * Set the appropriate output bits given a mask of fired events and the
+ * input bits originally requested.
+ */
+static __inline int
+selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
+{
+	int msk;
+	int n;
+
+	n = 0;
+	for (msk = 0; msk < 3; msk++) {
+		if ((events & select_flags[msk]) == 0)
+			continue;
+		if (ibits[msk] == NULL)
+			continue;
+		if ((ibits[msk][idx] & bit) == 0)
+			continue;
+		/*
+		 * XXX Check for a duplicate set.  This can occur because a
+		 * socket calls selrecord() twice for each poll() call
+		 * resulting in two selfds per real fd.  selrescan() will
+		 * call selsetbits twice as a result.
+		 */
+		if ((obits[msk][idx] & bit) != 0)
+			continue;
+		obits[msk][idx] |= bit;
+		n++;
+	}
+
+	return (n);
+}
 
 /*
  * Traverse the list of fds attached to this thread's seltd and check for
@@ -846,19 +911,18 @@ done:
 static int
 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
 {
+	struct filedesc *fdp;
+	struct selinfo *si;
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
-	struct selinfo *si;
 	struct file *fp;
-	int msk, fd;
-	int n = 0;
-	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
-	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
-	struct filedesc *fdp = td->td_proc->p_fd;
+	fd_mask bit;
+	int fd, ev, n, idx;
 
+	fdp = td->td_proc->p_fd;
 	stp = td->td_sel;
-	FILEDESC_SLOCK(fdp);
+	n = 0;
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
 		fd = (int)(uintptr_t)sfp->sf_cookie;
 		si = sfp->sf_si;
@@ -866,24 +930,15 @@ selrescan(struct thread *td, fd_mask **i
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
-		if ((fp = fget_locked(fdp, fd)) == NULL) {
-			FILEDESC_SUNLOCK(fdp);
+		if ((fp = fget_unlocked(fdp, fd)) == NULL)
 			return (EBADF);
-		}
-		for (msk = 0; msk < 3; msk++) {
-			if (ibits[msk] == NULL)
-				continue;
-			if ((ibits[msk][fd/NFDBITS] &
-			    ((fd_mask) 1 << (fd % NFDBITS))) == 0)
-				continue;
-			if (fo_poll(fp, flag[msk], td->td_ucred, td)) {
-				obits[msk][(fd)/NFDBITS] |=
-				    ((fd_mask)1 << ((fd) % NFDBITS));
-				n++;
-			}
-		}
+		idx = fd / NFDBITS;
+		bit = (fd_mask)1 << (fd % NFDBITS);
+		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
+		fdrop(fp, td);
+		if (ev != 0)
+			n += selsetbits(ibits, obits, idx, bit, ev);
 	}
-	FILEDESC_SUNLOCK(fdp);
 	stp->st_flags = 0;
 	td->td_retval[0] = n;
 	return (0);
@@ -899,39 +954,31 @@ selscan(td, ibits, obits, nfd)
 	fd_mask **ibits, **obits;
 	int nfd;
 {
-	int msk, i, fd;
-	fd_mask bits;
+	struct filedesc *fdp;
 	struct file *fp;
-	int n = 0;
-	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
-	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
-	struct filedesc *fdp = td->td_proc->p_fd;
+	fd_mask bit;
+	int ev, flags, end, fd;
+	int n, idx;
 
-	FILEDESC_SLOCK(fdp);
-	for (msk = 0; msk < 3; msk++) {
-		if (ibits[msk] == NULL)
-			continue;
-		for (i = 0; i < nfd; i += NFDBITS) {
-			bits = ibits[msk][i/NFDBITS];
-			/* ffs(int mask) not portable, fd_mask is long */
-			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
-				if (!(bits & 1))
-					continue;
-				if ((fp = fget_locked(fdp, fd)) == NULL) {
-					FILEDESC_SUNLOCK(fdp);
-					return (EBADF);
-				}
-				selfdalloc(td, (void *)(uintptr_t)fd);
-				if (fo_poll(fp, flag[msk], td->td_ucred,
-				    td)) {
-					obits[msk][(fd)/NFDBITS] |=
-					    ((fd_mask)1 << ((fd) % NFDBITS));
-					n++;
-				}
-			}
+	fdp = td->td_proc->p_fd;
+	n = 0;
+	for (idx = 0, fd = 0; fd < nfd; idx++) {
+		end = imin(fd + NFDBITS, nfd);
+		for (bit = 1; fd < end; bit <<= 1, fd++) {
+			/* Compute the list of events we're interested in. */
+			flags = selflags(ibits, idx, bit);
+			if (flags == 0)
+				continue;
+			if ((fp = fget_unlocked(fdp, fd)) == NULL)
+				return (EBADF);
+			selfdalloc(td, (void *)(uintptr_t)fd);
+			ev = fo_poll(fp, flags, td->td_ucred, td);
+			fdrop(fp, td);
+			if (ev != 0)
+				n += selsetbits(ibits, obits, idx, bit, ev);
 		}
 	}
-	FILEDESC_SUNLOCK(fdp);
+
 	td->td_retval[0] = n;
 	return (0);
 }
@@ -1084,7 +1131,6 @@ pollrescan(struct thread *td)
 	return (0);
 }
 
-
 static int
 pollscan(td, fds, nfd)
 	struct thread *td;

Modified: user/kmacy/releng_7_2_fcs/sys/kern/uipc_syscalls.c
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/kern/uipc_syscalls.c	Tue May 19 05:17:41 2009	(r192371)
+++ user/kmacy/releng_7_2_fcs/sys/kern/uipc_syscalls.c	Tue May 19 05:36:10 2009	(r192372)
@@ -121,23 +121,16 @@ getsock(struct filedesc *fdp, int fd, st
 	int error;
 
 	fp = NULL;
-	if (fdp == NULL)
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) {
 		error = EBADF;
-	else {
-		FILEDESC_SLOCK(fdp);
-		fp = fget_locked(fdp, fd);
-		if (fp == NULL)
-			error = EBADF;
-		else if (fp->f_type != DTYPE_SOCKET) {
-			fp = NULL;
-			error = ENOTSOCK;
-		} else {
-			fhold(fp);
-			if (fflagp != NULL)
-				*fflagp = fp->f_flag;
-			error = 0;
-		}
-		FILEDESC_SUNLOCK(fdp);
+	} else if (fp->f_type != DTYPE_SOCKET) {
+		fdrop(fp, curthread);
+		fp = NULL;
+		error = ENOTSOCK;
+	} else {
+		if (fflagp != NULL)
+			*fflagp = fp->f_flag;
+		error = 0;
 	}
 	*fpp = fp;
 	return (error);

Modified: user/kmacy/releng_7_2_fcs/sys/kern/vfs_syscalls.c
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/kern/vfs_syscalls.c	Tue May 19 05:17:41 2009	(r192371)
+++ user/kmacy/releng_7_2_fcs/sys/kern/vfs_syscalls.c	Tue May 19 05:36:10 2009	(r192372)
@@ -3979,22 +3979,13 @@ getvnode(fdp, fd, fpp)
 	int error;
 	struct file *fp;
 
+	error = 0;
 	fp = NULL;
-	if (fdp == NULL)
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
 		error = EBADF;
-	else {
-		FILEDESC_SLOCK(fdp);
-		if ((u_int)fd >= fdp->fd_nfiles ||
-		    (fp = fdp->fd_ofiles[fd]) == NULL)
-			error = EBADF;
-		else if (fp->f_vnode == NULL) {
-			fp = NULL;
-			error = EINVAL;
-		} else {
-			fhold(fp);
-			error = 0;
-		}
-		FILEDESC_SUNLOCK(fdp);
+	else if (fp->f_vnode == NULL) {
+		error = EINVAL;
+		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	return (error);

Modified: user/kmacy/releng_7_2_fcs/sys/sys/filedesc.h
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/sys/filedesc.h	Tue May 19 05:17:41 2009	(r192371)
+++ user/kmacy/releng_7_2_fcs/sys/sys/filedesc.h	Tue May 19 05:36:10 2009	(r192372)
@@ -129,6 +129,10 @@ int	getvnode(struct filedesc *fdp, int f
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
 void	setugidsafety(struct thread *td);
 
+/* Return a referenced file from an unlocked descriptor. */
+struct file *fget_unlocked(struct filedesc *fdp, int fd);
+
+/* Requires a FILEDESC_{S,X}LOCK held and returns without a ref. */
 static __inline struct file *
 fget_locked(struct filedesc *fdp, int fd)
 {


More information about the svn-src-user mailing list