svn commit: r262911 - projects/sendfile/sys/kern

Fri Mar 7 22:53:30 UTC 2014

Author: glebius
Date: Fri Mar  7 22:53:29 2014
New Revision: 262911
URL: http://svnweb.freebsd.org/changeset/base/262911

Log:
  Provide a hack to workaround the following condition: two sendfile(2)
  calls are issued on the same object, with ranges overlapping on at
  least one page. One of them grabs all pages, schedules I/O and returns.
  The second one encounters an overlapping page in a busy state. It
  sleeps with 'pgrbwt' wait channel, and sleep time is effectively the
  I/O time. Thus, the second syscall degenerates into blocking on
  disk I/O.
  
  To avoid this, a sysctl kern.ipc.sendfile.pgrabnowait is provided
  (default to off). When sysctl is on and userland supplies SF_NODISKIO
  flag, then we are calling vm_page_grab(VM_ALLOC_NOWAIT). In case of
  a failure we return EAGAIN, hinting userland that it should continue
  monitoring the socket via select/kevent/whatever. If the socket has zero
  data to send, then we have to notify it immediately, to avoid a stall.
  
  Sponsored by:	Netflix
  Sponsored by:	Nginx, Inc.

Modified:
  projects/sendfile/sys/kern/uipc_syscalls.c

Modified: projects/sendfile/sys/kern/uipc_syscalls.c
==============================================================================

--- projects/sendfile/sys/kern/uipc_syscalls.c	Fri Mar  7 22:29:00 2014	(r262910)
+++ projects/sendfile/sys/kern/uipc_syscalls.c	Fri Mar  7 22:53:29 2014	(r262911)
@@ -133,6 +133,10 @@ static int	filt_sfsync(struct knote *kn,
 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
     "sendfile(2) tunables");
 
+static int sfpgrabnowait = 0;
+SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, pgrabnowait, CTLFLAG_RW,
+    &sfpgrabnowait, 0, "Use VM_ALLOC_NOWAIT when SF_NODISKIO is requested");
+
 #ifdef	SFSYNC_DEBUG
 static int sf_sync_debug = 0;
 SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
@@ -2718,18 +2722,28 @@ sf_io_done(void *arg)
 }
 
 static int
-sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len)
+sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len,
+    int flags)
 {
 	vm_page_t *pa = sfio->pa;
 	int npages = sfio->npages;
 	int nios, rv;
 
 	nios = 0;
+	if (sfpgrabnowait && (flags & SF_NODISKIO))
+		flags = VM_ALLOC_NOWAIT;
+	else
+		flags = 0;
 
 	VM_OBJECT_WLOCK(obj);
-	for (int i = 0; i < npages; i++)
+	for (int i = 0; i < npages; i++) {
 		pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)),
-		    VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
+		    VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags);
+		if (pa[i] == NULL) {
+			npages = sfio->npages = i;
+			break;
+		}
+	}
 
 	for (int i = 0; i < npages; i++) {
 		int j, a;
@@ -3079,7 +3093,37 @@ retry_space:
 		refcount_init(&sfio->nios, 1);
 		sfio->npages = npages;
 
-		nios = sendfile_swapin(obj, sfio, off, space);
+		nios = sendfile_swapin(obj, sfio, off, space, flags);
+
+		if (sfio->npages != npages) {
+			/*
+			 * sendfile_swapin() encountered a busy page,
+			 * and was called with SF_NODISKIO. We don't
+			 * return EBUSY, like old I/O blocking sendfile
+			 * did, because situation is different. No
+			 * extra operation like read(2) or aio_read(2)
+			 * is required from userland. We just need it
+			 * to retry soonish.
+			 * We rely on remote side ACKing our data to
+			 * drive this timeout. And in the worst case,
+			 * when we do not have data to send, we put
+			 * the socket on the notification queue immediately.
+			 */
+			error = EAGAIN;
+			if (sfio->npages == 0 && hdrlen == 0) {
+				if (vp != NULL)
+					VOP_UNLOCK(vp, 0);
+				SOCKBUF_LOCK(&so->so_snd);
+				if (!sbused(&so->so_snd))
+					sowwakeup_locked(so);
+				else
+					SOCKBUF_UNLOCK(&so->so_snd);
+				free(sfio, M_TEMP);
+				goto done;
+			}
+			fixspace(npages, sfio->npages, off, &space);
+			npages = sfio->npages;
+		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
@@ -3180,7 +3224,8 @@ retry_space:
 			mh = NULL;
 		}
 
-		if (error) {
+		if (m == NULL) {
+			KASSERT(error, ("%s: no mbuf and no error", __func__));
 			free(sfio, M_TEMP);
 			goto done;
 		}