svn commit: r192080 - in head/sys: kern sys

Jeff Roberson jeff at FreeBSD.org
Thu May 14 03:24:23 UTC 2009


Author: jeff
Date: Thu May 14 03:24:22 2009
New Revision: 192080
URL: http://svn.freebsd.org/changeset/base/192080

Log:
   - Implement a lockless file descriptor lookup algorithm in
     fget_unlocked().
   - Save old file descriptor tables created on expansion until
     the entire descriptor table is freed so that pointers may be
     followed without regard for expanders.
   - Mark the file zone as NOFREE so we may attempt to reference
     potentially freed files.
   - Convert several fget_locked() users to fget_unlocked().  This
     requires us to manage reference counts explicitly but reduces
     locking overhead in the common case.

Modified:
  head/sys/kern/kern_descrip.c
  head/sys/kern/sys_generic.c
  head/sys/kern/tty.c
  head/sys/kern/uipc_syscalls.c
  head/sys/kern/vfs_syscalls.c
  head/sys/sys/filedesc.h

Modified: head/sys/kern/kern_descrip.c
==============================================================================
--- head/sys/kern/kern_descrip.c	Thu May 14 02:42:29 2009	(r192079)
+++ head/sys/kern/kern_descrip.c	Thu May 14 03:24:22 2009	(r192080)
@@ -125,12 +125,24 @@ static void	fdused(struct filedesc *fdp,
 #define OFILESIZE (sizeof(struct file *) + sizeof(char))
 
 /*
+ * Storage to hold unused ofiles that need to be reclaimed.
+ */
+struct freetable {
+	struct file	**ft_table;
+	SLIST_ENTRY(freetable) ft_next;
+};
+
+/*
  * Basic allocation of descriptors:
  * one of the above, plus arrays for NDFILE descriptors.
  */
 struct filedesc0 {
 	struct	filedesc fd_fd;
 	/*
+	 * ofiles which need to be reclaimed on free.
+	 */
+	SLIST_HEAD(,freetable) fd_free;
+	/*
 	 * These arrays are used when the number of open files is
 	 * <= NDFILE, and are then pointed to by the pointers above.
 	 */
@@ -1268,7 +1280,10 @@ out:
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
+	struct filedesc0 *fdp0;
+	struct freetable *fo;
 	struct file **ntable;
+	struct file **otable;
 	char *nfileflags;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap;
@@ -1287,7 +1302,7 @@ fdgrowtable(struct filedesc *fdp, int nf
 
 	/* allocate a new table and (if required) new bitmaps */
 	FILEDESC_XUNLOCK(fdp);
-	ntable = malloc(nnfiles * OFILESIZE,
+	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	nfileflags = (char *)&ntable[nnfiles];
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
@@ -1311,10 +1326,20 @@ fdgrowtable(struct filedesc *fdp, int nf
 	}
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
-	if (onfiles > NDFILE)
-		free(fdp->fd_ofiles, M_FILEDESC);
-	fdp->fd_ofiles = ntable;
+	otable = fdp->fd_ofiles;
 	fdp->fd_ofileflags = nfileflags;
+	fdp->fd_ofiles = ntable;
+	/*
+	 * We must preserve ofiles until the process exits because we can't
+	 * be certain that no threads have references to the old table via
+	 * _fget().
+	 */
+	if (onfiles > NDFILE) {
+		fo = (struct freetable *)&otable[onfiles];
+		fdp0 = (struct filedesc0 *)fdp;
+		fo->ft_table = otable;
+		SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
+	}
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
 		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
@@ -1512,6 +1537,8 @@ fdhold(struct proc *p)
 static void
 fddrop(struct filedesc *fdp)
 {
+	struct filedesc0 *fdp0;
+	struct freetable *ft;
 	int i;
 
 	mtx_lock(&fdesc_mtx);
@@ -1521,6 +1548,11 @@ fddrop(struct filedesc *fdp)
 		return;
 
 	FILEDESC_LOCK_DESTROY(fdp);
+	fdp0 = (struct filedesc0 *)fdp;
+	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
+		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
+		free(ft->ft_table, M_FILEDESC);
+	}
 	free(fdp, M_FILEDESC);
 }
 
@@ -2022,6 +2054,38 @@ finit(struct file *fp, u_int flag, short
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
+struct file *
+fget_unlocked(struct filedesc *fdp, int fd)
+{
+	struct file *fp;
+	u_int count;
+
+	if (fd < 0 || fd >= fdp->fd_nfiles)
+		return (NULL);
+	/*
+	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
+	 * never raising a refcount above 0.  To accomplish this we have
+	 * to use a cmpset loop rather than an atomic_add.  The descriptor
+	 * must be re-verified once we acquire a reference to be certain
+	 * that the identity is still correct and we did not lose a race
+	 * due to preemption.
+	 */
+	for (;;) {
+		fp = fdp->fd_ofiles[fd];
+		if (fp == NULL)
+			break;
+		count = fp->f_count;
+		if (count == 0)
+			continue;
+		if (atomic_cmpset_int(&fp->f_count, count, count + 1) != 1)
+			continue;
+		if (fp == ((struct file *volatile*)fdp->fd_ofiles)[fd])
+			break;
+		fdrop(fp, curthread);
+	}
+
+	return (fp);
+}
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
@@ -2030,16 +2094,12 @@ finit(struct file *fp, u_int flag, short
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
- * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
- * It should be dropped with fdrop().  If it is not set, then the refcount
- * will not be bumped however the thread's filedesc struct will be returned
- * locked (for fgetsock).
- *
  * If an error occured the non-zero error is returned and *fpp is set to
- * NULL.  Otherwise *fpp is set and zero is returned.
+ * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
+ * responsible for fdrop().
  */
 static __inline int
-_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
+_fget(struct thread *td, int fd, struct file **fpp, int flags)
 {
 	struct filedesc *fdp;
 	struct file *fp;
@@ -2047,29 +2107,22 @@ _fget(struct thread *td, int fd, struct 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
-	FILEDESC_SLOCK(fdp);
-	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
-		FILEDESC_SUNLOCK(fdp);
+	if ((fp = fget_unlocked(fdp, fd)) == NULL)
+		return (EBADF);
+	if (fp->f_ops == &badfileops) {
+		fdrop(fp, td);
 		return (EBADF);
 	}
-
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 *
 	 * Only one flag, or 0, may be specified.
 	 */
-	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
-		FILEDESC_SUNLOCK(fdp);
-		return (EBADF);
-	}
-	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
-		FILEDESC_SUNLOCK(fdp);
+	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
+	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
+		fdrop(fp, td);
 		return (EBADF);
 	}
-	if (hold) {
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
-	}
 	*fpp = fp;
 	return (0);
 }
@@ -2078,21 +2131,21 @@ int
 fget(struct thread *td, int fd, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, 0, 1));
+	return(_fget(td, fd, fpp, 0));
 }
 
 int
 fget_read(struct thread *td, int fd, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, FREAD, 1));
+	return(_fget(td, fd, fpp, FREAD));
 }
 
 int
 fget_write(struct thread *td, int fd, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, FWRITE, 1));
+	return(_fget(td, fd, fpp, FWRITE));
 }
 
 /*
@@ -2109,7 +2162,7 @@ _fgetvp(struct thread *td, int fd, struc
 	int error;
 
 	*vpp = NULL;
-	if ((error = _fget(td, fd, &fp, flags, 0)) != 0)
+	if ((error = _fget(td, fd, &fp, flags)) != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
@@ -2117,7 +2170,8 @@ _fgetvp(struct thread *td, int fd, struc
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
-	FILEDESC_SUNLOCK(td->td_proc->p_fd);
+	fdrop(fp, td);
+
 	return (error);
 }
 
@@ -2164,7 +2218,7 @@ fgetsock(struct thread *td, int fd, stru
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
-	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
+	if ((error = _fget(td, fd, &fp, 0)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
@@ -2176,7 +2230,8 @@ fgetsock(struct thread *td, int fd, stru
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
-	FILEDESC_SUNLOCK(td->td_proc->p_fd);
+	fdrop(fp, td);
+
 	return (error);
 }
 
@@ -3147,7 +3202,7 @@ filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
 }

Modified: head/sys/kern/sys_generic.c
==============================================================================
--- head/sys/kern/sys_generic.c	Thu May 14 02:42:29 2009	(r192079)
+++ head/sys/kern/sys_generic.c	Thu May 14 03:24:22 2009	(r192080)
@@ -988,7 +988,6 @@ selrescan(struct thread *td, fd_mask **i
 	fdp = td->td_proc->p_fd;
 	stp = td->td_sel;
 	n = 0;
-	FILEDESC_SLOCK(fdp);
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
 		fd = (int)(uintptr_t)sfp->sf_cookie;
 		si = sfp->sf_si;
@@ -996,17 +995,15 @@ selrescan(struct thread *td, fd_mask **i
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
-		if ((fp = fget_locked(fdp, fd)) == NULL) {
-			FILEDESC_SUNLOCK(fdp);
+		if ((fp = fget_unlocked(fdp, fd)) == NULL)
 			return (EBADF);
-		}
 		idx = fd / NFDBITS;
 		bit = (fd_mask)1 << (fd % NFDBITS);
 		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
+		fdrop(fp, td);
 		if (ev != 0)
 			n += selsetbits(ibits, obits, idx, bit, ev);
 	}
-	FILEDESC_SUNLOCK(fdp);
 	stp->st_flags = 0;
 	td->td_retval[0] = n;
 	return (0);
@@ -1030,7 +1027,6 @@ selscan(td, ibits, obits, nfd)
 
 	fdp = td->td_proc->p_fd;
 	n = 0;
-	FILEDESC_SLOCK(fdp);
 	for (idx = 0, fd = 0; fd < nfd; idx++) {
 		end = imin(fd + NFDBITS, nfd);
 		for (bit = 1; fd < end; bit <<= 1, fd++) {
@@ -1038,18 +1034,16 @@ selscan(td, ibits, obits, nfd)
 			flags = selflags(ibits, idx, bit);
 			if (flags == 0)
 				continue;
-			if ((fp = fget_locked(fdp, fd)) == NULL) {
-				FILEDESC_SUNLOCK(fdp);
+			if ((fp = fget_unlocked(fdp, fd)) == NULL)
 				return (EBADF);
-			}
 			selfdalloc(td, (void *)(uintptr_t)fd);
 			ev = fo_poll(fp, flags, td->td_ucred, td);
+			fdrop(fp, td);
 			if (ev != 0)
 				n += selsetbits(ibits, obits, idx, bit, ev);
 		}
 	}
 
-	FILEDESC_SUNLOCK(fdp);
 	td->td_retval[0] = n;
 	return (0);
 }

Modified: head/sys/kern/tty.c
==============================================================================
--- head/sys/kern/tty.c	Thu May 14 02:42:29 2009	(r192079)
+++ head/sys/kern/tty.c	Thu May 14 03:24:22 2009	(r192080)
@@ -1720,10 +1720,13 @@ ttyhook_register(struct tty **rtp, struc
 	/* Validate the file descriptor. */
 	if ((fdp = p->p_fd) == NULL)
 		return (EBADF);
-	FILEDESC_SLOCK(fdp);
-	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
-		FILEDESC_SUNLOCK(fdp);
+
+	fp = fget_unlocked(fdp, fd);
+	if (fp == NULL)
 		return (EBADF);
+	if (fp->f_ops == &badfileops) {
+		error = EBADF;
+		goto done1;
 	}
 	
 	/* Make sure the vnode is bound to a character device. */
@@ -1763,7 +1766,7 @@ ttyhook_register(struct tty **rtp, struc
 
 done3:	tty_unlock(tp);
 done2:	dev_relthread(dev);
-done1:	FILEDESC_SUNLOCK(fdp);
+done1:	fdrop(fp, curthread);
 	return (error);
 }
 

Modified: head/sys/kern/uipc_syscalls.c
==============================================================================
--- head/sys/kern/uipc_syscalls.c	Thu May 14 02:42:29 2009	(r192079)
+++ head/sys/kern/uipc_syscalls.c	Thu May 14 03:24:22 2009	(r192080)
@@ -122,23 +122,16 @@ getsock(struct filedesc *fdp, int fd, st
 	int error;
 
 	fp = NULL;
-	if (fdp == NULL)
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) {
 		error = EBADF;
-	else {
-		FILEDESC_SLOCK(fdp);
-		fp = fget_locked(fdp, fd);
-		if (fp == NULL)
-			error = EBADF;
-		else if (fp->f_type != DTYPE_SOCKET) {
-			fp = NULL;
-			error = ENOTSOCK;
-		} else {
-			fhold(fp);
-			if (fflagp != NULL)
-				*fflagp = fp->f_flag;
-			error = 0;
-		}
-		FILEDESC_SUNLOCK(fdp);
+	} else if (fp->f_type != DTYPE_SOCKET) {
+		fdrop(fp, curthread);
+		fp = NULL;
+		error = ENOTSOCK;
+	} else {
+		if (fflagp != NULL)
+			*fflagp = fp->f_flag;
+		error = 0;
 	}
 	*fpp = fp;
 	return (error);

Modified: head/sys/kern/vfs_syscalls.c
==============================================================================
--- head/sys/kern/vfs_syscalls.c	Thu May 14 02:42:29 2009	(r192079)
+++ head/sys/kern/vfs_syscalls.c	Thu May 14 03:24:22 2009	(r192080)
@@ -4235,22 +4235,13 @@ getvnode(fdp, fd, fpp)
 	int error;
 	struct file *fp;
 
+	error = 0;
 	fp = NULL;
-	if (fdp == NULL)
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
 		error = EBADF;
-	else {
-		FILEDESC_SLOCK(fdp);
-		if ((u_int)fd >= fdp->fd_nfiles ||
-		    (fp = fdp->fd_ofiles[fd]) == NULL)
-			error = EBADF;
-		else if (fp->f_vnode == NULL) {
-			fp = NULL;
-			error = EINVAL;
-		} else {
-			fhold(fp);
-			error = 0;
-		}
-		FILEDESC_SUNLOCK(fdp);
+	else if (fp->f_vnode == NULL) {
+		error = EINVAL;
+		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	return (error);

Modified: head/sys/sys/filedesc.h
==============================================================================
--- head/sys/sys/filedesc.h	Thu May 14 02:42:29 2009	(r192079)
+++ head/sys/sys/filedesc.h	Thu May 14 03:24:22 2009	(r192080)
@@ -129,6 +129,10 @@ int	getvnode(struct filedesc *fdp, int f
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
 void	setugidsafety(struct thread *td);
 
+/* Return a referenced file from an unlocked descriptor. */
+struct file *fget_unlocked(struct filedesc *fdp, int fd);
+
+/* Requires a FILEDESC_{S,X}LOCK held and returns without a ref. */
 static __inline struct file *
 fget_locked(struct filedesc *fdp, int fd)
 {


More information about the svn-src-head mailing list