svn commit: r282569 - stable/10/sys/kern

Alexander Motin mav at FreeBSD.org
Wed May 6 21:08:17 UTC 2015


Author: mav
Date: Wed May  6 21:08:16 2015
New Revision: 282569
URL: https://svnweb.freebsd.org/changeset/base/282569

Log:
  MFC r281860: Make AIO not allocate pbufs for unmapped I/O, as in r281825.
  
  While there, make a few more performance optimizations.
  
  On a 40-core system doing many 512-byte AIO reads from an array of raw SSDs,
  this change removes lock congestion inside the pbuf allocator and devfs,
  and the bottleneck on the single AIO completion taskqueue thread.  It improves
  peak AIO performance from ~600K to ~1.3M IOPS.

Modified:
  stable/10/sys/kern/vfs_aio.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/kern/vfs_aio.c
==============================================================================
--- stable/10/sys/kern/vfs_aio.c	Wed May  6 21:06:32 2015	(r282568)
+++ stable/10/sys/kern/vfs_aio.c	Wed May  6 21:08:16 2015	(r282569)
@@ -59,10 +59,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
+#include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
+#include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -232,9 +234,10 @@ struct aiocblist {
 	int	jobstate;		/* (b) job state */
 	int	inputcharge;		/* (*) input blockes */
 	int	outputcharge;		/* (*) output blockes */
-	struct	buf *bp;		/* (*) private to BIO backend,
-				  	 * buffer pointer
-					 */
+	struct	bio *bp;		/* (*) BIO backend BIO pointer */
+	struct	buf *pbuf;		/* (*) BIO backend buffer pointer */
+	struct	vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
+	int	npages;			/* BIO backend number of pages */
 	struct	proc *userproc;		/* (*) user process */
 	struct  ucred *cred;		/* (*) active credential when created */
 	struct	file *fd_file;		/* (*) pointer to file structure */
@@ -243,7 +246,6 @@ struct aiocblist {
 	struct	knlist klist;		/* (a) list of knotes */
 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
 	ksiginfo_t ksi;			/* (a) realtime signal info */
-	struct	task biotask;		/* (*) private to BIO backend */
 	uint64_t seqno;			/* (*) job number */
 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
 };
@@ -344,11 +346,10 @@ static void	aio_process_mlock(struct aio
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *job,
 			struct aioliojob *lio, int type, struct aiocb_ops *ops);
-static void	aio_physwakeup(struct buf *bp);
+static void	aio_physwakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void	biohelper(void *, int);
 static void	aio_daemon(void *param);
 static void	aio_swake_cb(struct socket *, struct sockbuf *);
 static int	aio_unload(void);
@@ -1294,13 +1295,15 @@ aio_qphysio(struct proc *p, struct aiocb
 {
 	struct aiocb *cb;
 	struct file *fp;
-	struct buf *bp;
+	struct bio *bp;
+	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
-	int error, ref;
+	int error, ref, unmap, poff;
+	vm_prot_t prot;
 
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
@@ -1309,107 +1312,121 @@ aio_qphysio(struct proc *p, struct aiocb
 		return (-1);
 
 	vp = fp->f_vnode;
-
-	/*
-	 * If its not a disk, we don't want to return a positive error.
-	 * It causes the aio code to not fall through to try the thread
-	 * way when you're talking to a regular file.
-	 */
-	if (!vn_isdisk(vp, &error)) {
-		if (error == ENOTBLK)
-			return (-1);
-		else
-			return (error);
-	}
-
-	if (vp->v_bufobj.bo_bsize == 0)
-		return (-1);
-
- 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+	if (vp->v_type != VCHR)
 		return (-1);
-
-	if (cb->aio_nbytes >
-	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
+	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
-
-	ki = p->p_aioinfo;
-	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
+	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
+
+	if ((csw->d_flags & D_DISK) == 0) {
+		error = -1;
+		goto unref;
+	}
 	if (cb->aio_nbytes > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
-	/* Create and build a buffer header for a transfer. */
-	bp = (struct buf *)getpbuf(NULL);
-	BUF_KERNPROC(bp);
+	ki = p->p_aioinfo;
+	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
+	unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
+	if (unmap) {
+		if (cb->aio_nbytes > MAXPHYS) {
+			error = -1;
+			goto unref;
+		}
+	} else {
+		if (cb->aio_nbytes > MAXPHYS - poff) {
+			error = -1;
+			goto unref;
+		}
+		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
+			error = -1;
+			goto unref;
+		}
+	}
+	aiocbe->bp = bp = g_alloc_bio();
+	if (!unmap) {
+		aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+		BUF_KERNPROC(pbuf);
+	}
 
 	AIO_LOCK(ki);
 	ki->kaio_count++;
-	ki->kaio_buffer_count++;
+	if (!unmap)
+		ki->kaio_buffer_count++;
 	lj = aiocbe->lio;
 	if (lj)
 		lj->lioj_count++;
-	AIO_UNLOCK(ki);
-
-	/*
-	 * Get a copy of the kva from the physical buffer.
-	 */
-	error = 0;
-
-	bp->b_bcount = cb->aio_nbytes;
-	bp->b_bufsize = cb->aio_nbytes;
-	bp->b_iodone = aio_physwakeup;
-	bp->b_saveaddr = bp->b_data;
-	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
-	bp->b_offset = cb->aio_offset;
-	bp->b_iooffset = cb->aio_offset;
-	bp->b_blkno = btodb(cb->aio_offset);
-	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
-
-	/*
-	 * Bring buffer into kernel space.
-	 */
-	if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) {
-		error = EFAULT;
-		goto doerror;
-	}
-
-	AIO_LOCK(ki);
-	aiocbe->bp = bp;
-	bp->b_caller1 = (void *)aiocbe;
 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	aiocbe->jobstate = JOBST_JOBQBUF;
 	cb->_aiocb_private.status = cb->aio_nbytes;
 	AIO_UNLOCK(ki);
 
-	atomic_add_int(&num_queue_count, 1);
-	atomic_add_int(&num_buf_aio, 1);
-
-	bp->b_error = 0;
+	bp->bio_length = cb->aio_nbytes;
+	bp->bio_bcount = cb->aio_nbytes;
+	bp->bio_done = aio_physwakeup;
+	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
+	bp->bio_offset = cb->aio_offset;
+	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+	bp->bio_dev = dev;
+	bp->bio_caller1 = (void *)aiocbe;
+
+	prot = VM_PROT_READ;
+	if (cb->aio_lio_opcode == LIO_READ)
+		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
+	if ((aiocbe->npages = vm_fault_quick_hold_pages(
+	    &curproc->p_vmspace->vm_map,
+	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
+	    sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
+		error = EFAULT;
+		goto doerror;
+	}
+	if (!unmap) {
+		pmap_qenter((vm_offset_t)pbuf->b_data,
+		    aiocbe->pages, aiocbe->npages);
+		bp->bio_data = pbuf->b_data + poff;
+	} else {
+		bp->bio_ma = aiocbe->pages;
+		bp->bio_ma_n = aiocbe->npages;
+		bp->bio_ma_offset = poff;
+		bp->bio_data = unmapped_buf;
+		bp->bio_flags |= BIO_UNMAPPED;
+	}
 
-	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
+	atomic_add_int(&num_queue_count, 1);
+	if (!unmap)
+		atomic_add_int(&num_buf_aio, 1);
 
 	/* Perform transfer. */
-	dev_strategy_csw(dev, csw, bp);
+	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
 	AIO_LOCK(ki);
+	aiocbe->jobstate = JOBST_NULL;
+	TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
 	ki->kaio_count--;
-	ki->kaio_buffer_count--;
+	if (!unmap)
+		ki->kaio_buffer_count--;
 	if (lj)
 		lj->lioj_count--;
-	aiocbe->bp = NULL;
 	AIO_UNLOCK(ki);
-	relpbuf(bp, NULL);
+	if (pbuf) {
+		relpbuf(pbuf, NULL);
+		aiocbe->pbuf = NULL;
+	}
+	g_destroy_bio(bp);
+	aiocbe->bp = NULL;
 unref:
 	dev_relthread(dev, ref);
 	return (error);
@@ -1787,8 +1804,6 @@ no_kqueue:
 	}
 #endif
 queueit:
-	/* No buffer for daemon I/O. */
-	aiocbe->bp = NULL;
 	atomic_add_int(&num_queue_count, 1);
 
 	AIO_LOCK(ki);
@@ -2425,54 +2440,43 @@ sys_lio_listio(struct thread *td, struct
 	return (error);
 }
 
-/*
- * Called from interrupt thread for physio, we should return as fast
- * as possible, so we schedule a biohelper task.
- */
 static void
-aio_physwakeup(struct buf *bp)
+aio_physwakeup(struct bio *bp)
 {
-	struct aiocblist *aiocbe;
-
-	aiocbe = (struct aiocblist *)bp->b_caller1;
-	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
-}
-
-/*
- * Task routine to perform heavy tasks, process wakeup, and signals.
- */
-static void
-biohelper(void *context, int pending)
-{
-	struct aiocblist *aiocbe = context;
-	struct buf *bp;
+	struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	int nblks;
 
+	/* Release mapping into kernel space. */
+	if (aiocbe->pbuf) {
+		pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
+		relpbuf(aiocbe->pbuf, NULL);
+		aiocbe->pbuf = NULL;
+		atomic_subtract_int(&num_buf_aio, 1);
+	}
+	vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
+
 	bp = aiocbe->bp;
+	aiocbe->bp = NULL;
 	userp = aiocbe->userproc;
 	ki = userp->p_aioinfo;
 	AIO_LOCK(ki);
-	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+	aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
 	aiocbe->uaiocb._aiocb_private.error = 0;
-	if (bp->b_ioflags & BIO_ERROR)
-		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+	if (bp->bio_flags & BIO_ERROR)
+		aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
 		aiocbe->outputcharge += nblks;
 	else
 		aiocbe->inputcharge += nblks;
-	aiocbe->bp = NULL;
 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
 	ki->kaio_buffer_count--;
 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
 	AIO_UNLOCK(ki);
 
-	/* Release mapping into kernel space. */
-	vunmapbuf(bp);
-	relpbuf(bp, NULL);
-	atomic_subtract_int(&num_buf_aio, 1);
+	g_destroy_bio(bp);
 }
 
 /* syscall - wait for the next completion of an aio request */


More information about the svn-src-all mailing list