svn commit: r216699 - in head/sys: dev/cxgb/ulp/tom dev/drm kern net vm
Alan Cox alc at FreeBSD.org
Sat Dec 25 21:26:56 UTC 2010
Author: alc
Date: Sat Dec 25 21:26:56 2010
New Revision: 216699
URL: http://svn.freebsd.org/changeset/base/216699
Log:
Introduce and use a new VM interface for temporarily pinning pages. This
new interface replaces the combined use of vm_fault_quick() and
pmap_extract_and_hold() throughout the kernel.
In collaboration with: kib@
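
[Editorial note: for readers new to the interface, a minimal usage sketch follows. It is illustrative only -- pin_user_range(), ma, and MAXPAGES are hypothetical names, not part of this commit; the hold/unhold pattern mirrors the converted callers in the diff below, and pages are released with vm_page_unhold_pages() as in the old sys_pipe.c code.]

/*
 * Illustrative sketch, not part of the commit: temporarily pin the
 * user pages backing [uaddr, uaddr + len) for read.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#define	MAXPAGES	8	/* hypothetical upper bound */

static int
pin_user_range(vm_offset_t uaddr, vm_size_t len)
{
	vm_page_t ma[MAXPAGES];
	int count;

	/* Fault in and hold the pages; returns -1 on failure. */
	count = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
	    uaddr, len, VM_PROT_READ, ma, MAXPAGES);
	if (count < 0)
		return (EFAULT);

	/* ... access the held pages, e.g. by DMA or sf_buf mapping ... */

	/* Drop the holds once the I/O has completed. */
	vm_page_unhold_pages(ma, count);
	return (0);
}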
Deleted:
head/sys/dev/cxgb/ulp/tom/cxgb_vm.c
head/sys/dev/cxgb/ulp/tom/cxgb_vm.h
Modified:
head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
head/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
head/sys/dev/drm/via_dmablit.c
head/sys/kern/sys_pipe.c
head/sys/kern/uipc_cow.c
head/sys/kern/vfs_bio.c
head/sys/net/bpf_zerocopy.c
head/sys/vm/vm_extern.h
head/sys/vm/vm_fault.c
Modified: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
==============================================================================
--- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$");
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
-#include <ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
@@ -218,8 +217,9 @@ cxgb_hold_iovec_pages(struct uio *uio, v
count = min(count, npages);
- err = vm_fault_hold_user_pages(map,
- (vm_offset_t)iov->iov_base, mp, count, prot);
+ /* The following return value is not used. XXX */
+ err = vm_fault_quick_hold_pages(map,
+ (vm_offset_t)iov->iov_base, iov->iov_len, prot, mp, count);
mp += count;
totcount += count;
curbytes = iov->iov_len;
@@ -503,7 +503,7 @@ cxgb_sosend(struct socket *so, struct so
* - the number of bytes to be transferred exceeds the threshold
* - the number of bytes currently in flight won't exceed the in-flight
* threshold XXX TODO
- * - vm_fault_hold_user_pages succeeds
+ * - vm_fault_quick_hold_pages succeeds
* - blocking socket XXX for now
*
*/
@@ -970,7 +970,7 @@ cxgb_soreceive(struct socket *so, struct
* - the number of bytes to be transferred exceeds the threshold
* - the number of bytes currently in flight won't exceed the in-flight
* threshold XXX TODO
- * - vm_fault_hold_user_pages succeeds
+ * - vm_fault_quick_hold_pages succeeds
* - blocking socket XXX for now
* - iovcnt is 1
*
Modified: head/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
==============================================================================
--- head/sys/dev/cxgb/ulp/tom/cxgb_ddp.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/dev/cxgb/ulp/tom/cxgb_ddp.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$");
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
-#include <ulp/tom/cxgb_vm.h>
#define MAX_SCHEDULE_TIMEOUT 300
@@ -130,14 +129,6 @@ t3_pin_pages(bus_dma_tag_t tag, bus_dmam
struct ddp_gather_list *p;
vm_map_t map;
- /*
- * XXX need x86 agnostic check
- */
- if (addr + len > VM_MAXUSER_ADDRESS)
- return (EFAULT);
-
-
-
pg_off = addr & PAGE_MASK;
npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *),
@@ -146,10 +137,11 @@ t3_pin_pages(bus_dma_tag_t tag, bus_dmam
return (ENOMEM);
map = &curthread->td_proc->p_vmspace->vm_map;
- err = vm_fault_hold_user_pages(map, addr, p->dgl_pages, npages,
- VM_PROT_READ | VM_PROT_WRITE);
- if (err)
+ if (vm_fault_quick_hold_pages(map, addr, len, VM_PROT_READ |
+ VM_PROT_WRITE, p->dgl_pages, npages) < 0) {
+ err = EFAULT;
goto free_gl;
+ }
if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages &&
gl->dgl_length >= len) {
Modified: head/sys/dev/drm/via_dmablit.c
==============================================================================
--- head/sys/dev/drm/via_dmablit.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/dev/drm/via_dmablit.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -177,11 +177,10 @@ via_free_sg_info(drm_via_sg_info_t *vsg)
free(vsg->desc_pages, DRM_MEM_DRIVER);
case dr_via_pages_locked:
for (i=0; i < vsg->num_pages; ++i) {
- if ( NULL != (page = vsg->pages[i])) {
- vm_page_lock(page);
- vm_page_unwire(page, 0);
- vm_page_unlock(page);
- }
+ page = vsg->pages[i];
+ vm_page_lock(page);
+ vm_page_unwire(page, 0);
+ vm_page_unlock(page);
}
case dr_via_pages_alloc:
free(vsg->pages, DRM_MEM_DRIVER);
@@ -224,41 +223,31 @@ via_lock_all_dma_pages(drm_via_sg_info_t
{
unsigned long first_pfn = VIA_PFN(xfer->mem_addr);
vm_page_t m;
- vm_map_t map;
int i;
- map = &curproc->p_vmspace->vm_map;
-
vsg->num_pages = VIA_PFN(xfer->mem_addr +
(xfer->num_lines * xfer->mem_stride -1)) - first_pfn + 1;
- /* Make sure that the user has access to these pages */
- for(i = 0; i < vsg->num_pages; i++) {
- if (vm_fault_quick((caddr_t)xfer->mem_addr + IDX_TO_OFF(i),
- VM_PROT_RW) < 0)
- return (-EACCES);
- }
-
if (NULL == (vsg->pages = malloc(sizeof(vm_page_t) * vsg->num_pages,
- DRM_MEM_DRIVER, M_NOWAIT | M_ZERO)))
+ DRM_MEM_DRIVER, M_NOWAIT)))
return -ENOMEM;
- for(i = 0; i < vsg->num_pages; i++) {
- m = pmap_extract_and_hold(map->pmap,
- (vm_offset_t)xfer->mem_addr + IDX_TO_OFF(i), VM_PROT_RW);
- if (m == NULL)
- break;
+ vsg->state = dr_via_pages_alloc;
+
+ if (vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)xfer->mem_addr, vsg->num_pages * PAGE_SIZE,
+ VM_PROT_READ | VM_PROT_WRITE, vsg->pages, vsg->num_pages) < 0)
+ return -EACCES;
+
+ for (i = 0; i < vsg->num_pages; i++) {
+ m = vsg->pages[i];
vm_page_lock(m);
vm_page_wire(m);
vm_page_unhold(m);
vm_page_unlock(m);
- vsg->pages[i] = m;
}
vsg->state = dr_via_pages_locked;
- if (i != vsg->num_pages)
- return -EINVAL;
-
DRM_DEBUG("DMA pages locked\n");
return 0;
Modified: head/sys/kern/sys_pipe.c
==============================================================================
--- head/sys/kern/sys_pipe.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/kern/sys_pipe.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -747,10 +747,8 @@ pipe_build_write_buffer(wpipe, uio)
struct pipe *wpipe;
struct uio *uio;
{
- pmap_t pmap;
u_int size;
int i;
- vm_offset_t addr, endaddr;
PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
@@ -760,25 +758,10 @@ pipe_build_write_buffer(wpipe, uio)
if (size > wpipe->pipe_buffer.size)
size = wpipe->pipe_buffer.size;
- pmap = vmspace_pmap(curproc->p_vmspace);
- endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
- addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
- if (endaddr < addr)
+ if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
+ wpipe->pipe_map.ms, PIPENPAGES)) < 0)
return (EFAULT);
- for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
- /*
- * vm_fault_quick() can sleep.
- */
- race:
- if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
- vm_page_unhold_pages(wpipe->pipe_map.ms, i);
- return (EFAULT);
- }
- wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
- VM_PROT_READ);
- if (wpipe->pipe_map.ms[i] == NULL)
- goto race;
- }
/*
* set up the control block
Modified: head/sys/kern/uipc_cow.c
==============================================================================
--- head/sys/kern/uipc_cow.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/kern/uipc_cow.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -103,24 +103,20 @@ socow_setup(struct mbuf *m0, struct uio
struct vmspace *vmspace;
struct vm_map *map;
vm_offset_t offset, uva;
+ vm_size_t len;
socow_stats.attempted++;
vmspace = curproc->p_vmspace;
map = &vmspace->vm_map;
uva = (vm_offset_t) uio->uio_iov->iov_base;
offset = uva & PAGE_MASK;
+ len = PAGE_SIZE - offset;
/*
* Verify that access to the given address is allowed from user-space.
*/
- if (vm_fault_quick((caddr_t)uva, VM_PROT_READ) < 0)
- return (0);
-
- /*
- * verify page is mapped & not already wired for i/o
- */
- pp = pmap_extract_and_hold(map->pmap, uva, VM_PROT_READ);
- if (pp == NULL) {
+ if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
+ 0) {
socow_stats.fail_not_mapped++;
return(0);
}
@@ -165,7 +161,7 @@ socow_setup(struct mbuf *m0, struct uio
*/
MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
(void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
- m0->m_len = PAGE_SIZE - offset;
+ m0->m_len = len;
m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
socow_stats.success++;
Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/kern/vfs_bio.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -3855,46 +3855,19 @@ vm_hold_free_pages(struct buf *bp, int n
int
vmapbuf(struct buf *bp)
{
- caddr_t addr, kva;
+ caddr_t kva;
vm_prot_t prot;
- int pidx, i;
- struct vm_page *m;
- struct pmap *pmap = &curproc->p_vmspace->vm_pmap;
+ int pidx;
if (bp->b_bufsize < 0)
return (-1);
prot = VM_PROT_READ;
if (bp->b_iocmd == BIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
- for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0;
- addr < bp->b_data + bp->b_bufsize;
- addr += PAGE_SIZE, pidx++) {
- /*
- * Do the vm_fault if needed; do the copy-on-write thing
- * when reading stuff off device into memory.
- *
- * NOTE! Must use pmap_extract() because addr may be in
- * the userland address space, and kextract is only guarenteed
- * to work for the kernland address space (see: sparc64 port).
- */
-retry:
- if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data,
- prot) < 0) {
- for (i = 0; i < pidx; ++i) {
- vm_page_lock(bp->b_pages[i]);
- vm_page_unhold(bp->b_pages[i]);
- vm_page_unlock(bp->b_pages[i]);
- bp->b_pages[i] = NULL;
- }
- return(-1);
- }
- m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot);
- if (m == NULL)
- goto retry;
- bp->b_pages[pidx] = m;
- }
- if (pidx > btoc(MAXPHYS))
- panic("vmapbuf: mapped more than MAXPHYS");
+ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
+ btoc(MAXPHYS))) < 0)
+ return (-1);
pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
kva = bp->b_saveaddr;
Modified: head/sys/net/bpf_zerocopy.c
==============================================================================
--- head/sys/net/bpf_zerocopy.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/net/bpf_zerocopy.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -161,12 +161,8 @@ zbuf_sfbuf_get(struct vm_map *map, vm_of
struct sf_buf *sf;
vm_page_t pp;
- if (vm_fault_quick((caddr_t) uaddr, VM_PROT_READ | VM_PROT_WRITE) <
- 0)
- return (NULL);
- pp = pmap_extract_and_hold(map->pmap, uaddr, VM_PROT_READ |
- VM_PROT_WRITE);
- if (pp == NULL)
+ if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ |
+ VM_PROT_WRITE, &pp, 1) < 0)
return (NULL);
vm_page_lock(pp);
vm_page_wire(pp);
Modified: head/sys/vm/vm_extern.h
==============================================================================
--- head/sys/vm/vm_extern.h Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/vm/vm_extern.h Sat Dec 25 21:26:56 2010 (r216699)
@@ -63,6 +63,8 @@ void vm_fault_copy_entry(vm_map_t, vm_ma
vm_ooffset_t *);
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold);
+int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
+ vm_prot_t prot, vm_page_t *ma, int max_count);
void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
Modified: head/sys/vm/vm_fault.c
==============================================================================
--- head/sys/vm/vm_fault.c Sat Dec 25 17:35:30 2010 (r216698)
+++ head/sys/vm/vm_fault.c Sat Dec 25 21:26:56 2010 (r216699)
@@ -1045,6 +1045,81 @@ vm_fault_prefault(pmap_t pmap, vm_offset
}
/*
+ * Hold each of the physical pages that are mapped by the specified range of
+ * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
+ * and allow the specified types of access, "prot". If all of the implied
+ * pages are successfully held, then the number of held pages is returned
+ * together with pointers to those pages in the array "ma". However, if any
+ * of the pages cannot be held, -1 is returned.
+ */
+int
+vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
+ vm_prot_t prot, vm_page_t *ma, int max_count)
+{
+ vm_offset_t end, va;
+ vm_page_t *mp;
+ int count;
+ boolean_t pmap_failed;
+
+ end = round_page(addr + len);
+ addr = trunc_page(addr);
+
+ /*
+ * Check for illegal addresses.
+ */
+ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
+ return (-1);
+
+ count = howmany(end - addr, PAGE_SIZE);
+ if (count > max_count)
+ panic("vm_fault_quick_hold_pages: count > max_count");
+
+ /*
+ * Most likely, the physical pages are resident in the pmap, so it is
+ * faster to try pmap_extract_and_hold() first.
+ */
+ pmap_failed = FALSE;
+ for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
+ *mp = pmap_extract_and_hold(map->pmap, va, prot);
+ if (*mp == NULL)
+ pmap_failed = TRUE;
+ else if ((prot & VM_PROT_WRITE) != 0 &&
+ (*mp)->dirty != VM_PAGE_BITS_ALL) {
+ /*
+ * Explicitly dirty the physical page. Otherwise, the
+ * caller's changes may go unnoticed because they are
+ * performed through an unmanaged mapping or by a DMA
+ * operation.
+ */
+ vm_page_lock_queues();
+ vm_page_dirty(*mp);
+ vm_page_unlock_queues();
+ }
+ }
+ if (pmap_failed) {
+ /*
+ * One or more pages could not be held by the pmap. Either no
+ * page was mapped at the specified virtual address or that
+ * mapping had insufficient permissions. Attempt to fault in
+ * and hold these pages.
+ */
+ for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
+ if (*mp == NULL && vm_fault_hold(map, va, prot,
+ VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
+ goto error;
+ }
+ return (count);
+error:
+ for (mp = ma; mp < ma + count; mp++)
+ if (*mp != NULL) {
+ vm_page_lock(*mp);
+ vm_page_unhold(*mp);
+ vm_page_unlock(*mp);
+ }
+ return (-1);
+}
+
+/*
* vm_fault_quick:
*
* Ensure that the requested virtual address, which may be in userland,