kern/55081: contigmalloc API semantics inadequate --- forces KVM mapping

Chuck Cranor chuck at research.att.com
Wed Jul 30 13:00:33 PDT 2003


>Number:         55081
>Category:       kern
>Synopsis:       contigmalloc API semantics inadequate --- forces KVM mapping
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Jul 30 13:00:28 PDT 2003
>Closed-Date:
>Last-Modified:
>Originator:     Chuck Cranor
>Release:        FreeBSD 4.7-RELEASE i386
>Organization:
AT&T Labs--Research
>Environment:

System: FreeBSD gs1.research.att.com 4.7-RELEASE FreeBSD 4.7-RELEASE #2: Mon Mar 3 15:40:37 GMT 2003 chuck at gs1.research.att.com:/usr/home/chuck/src/sys/compile/RESEARCH47MP i386

>Description:

    we have a PCI card that communicates with its user-level
applications by using a physically contiguous memory mapped buffer
[mapped via mmap(2)].  currently the card's driver allocates the
buffer using contigmalloc() during autoconfig.

    we would like to make this buffer quite large (all our systems
have 4GB RAM), but unfortunately the semantics of contigmalloc()
cannot support this!   

    the problem with the contigmalloc() API is that in addition to
allocating physically contiguous memory, it also insists on mapping
the allocated memory into the kernel virtual address space!  thus,
when we try and allocate large buffers for our PCI device, our
allocation fails because we run out of kernel virtual memory.  this is
bad, since the PCI card we are using needs large blocks of contiguous
memory, but it does not need _any_ kernel mappings of these blocks.


    to solve the problem, i have broken contigmalloc() up into two
functions: 
	   vm_contig_pg_alloc() - allocates physical contig memory,
				  return index of first page in vm_page_array
				  or -1 on error
	   vm_contig_pg_kmapin() - maps a set of physically contig
	   			  pages into KVM

and now contigmalloc() is just a wrapper around calls to these two
functions.  i also have a vm_contig_pg_free() function that undoes a
vm_contig_pg_alloc() operation.

    for our device, we only need to call vm_contig_pg_alloc()...
that will save us from running out of KVM.		


    I would like to get some sort of change along these lines
committed to the FreeBSD 4 and 5 branches.  I'm willing to rework the
patch a bit if needed.

>How-To-Repeat:

	attempt to allocate a large (e.g. 1GB) physically contig buffer
	in the FreeBSD kernel.

>Fix:

here is a context diff that makes my proposed changes.
also, after the diff is a simple kld module that I used
for doing basic testing of the APIs (seems to work).


*** vm_page.h_47	Thu Jul 24 13:43:16 2003
--- vm_page.h	Thu Jul 24 17:03:33 2003
***************
*** 393,398 ****
--- 393,402 ----
  #define	VM_ALLOC_ZERO		3
  #define	VM_ALLOC_RETRY		0x80
  
+ int vm_contig_pg_alloc(u_long, u_long, u_long, u_long, u_long);
+ vm_offset_t vm_contig_pg_kmapin(int, u_long, vm_map_t);
+ void vm_contig_pg_free(int, u_long);
+ 
  void vm_page_unhold(vm_page_t mem);
  
  void vm_page_activate (vm_page_t);
*** vm_page.c_47	Thu Jul 24 13:33:20 2003
--- vm_page.c	Thu Jul 24 18:26:06 2003
***************
*** 1774,1815 ****
  }
  
  /*
!  * This interface is for merging with malloc() someday.
!  * Even if we never implement compaction so that contiguous allocation
!  * works after initialization time, malloc()'s data structures are good
!  * for statistics and for allocations of less than a page.
   */
! void *
! contigmalloc1(
! 	unsigned long size,	/* should be size_t here and for malloc() */
! 	struct malloc_type *type,
! 	int flags,
! 	unsigned long low,
! 	unsigned long high,
! 	unsigned long alignment,
! 	unsigned long boundary,
! 	vm_map_t map)
! {
! 	int i, s, start;
! 	vm_offset_t addr, phys, tmp_addr;
! 	int pass;
  	vm_page_t pga = vm_page_array;
  
  	size = round_page(size);
  	if (size == 0)
! 		panic("contigmalloc1: size must not be 0");
  	if ((alignment & (alignment - 1)) != 0)
! 		panic("contigmalloc1: alignment must be a power of 2");
  	if ((boundary & (boundary - 1)) != 0)
! 		panic("contigmalloc1: boundary must be a power of 2");
  
  	start = 0;
  	for (pass = 0; pass <= 1; pass++) {
  		s = splvm();
  again:
  		/*
! 		 * Find first page in array that is free, within range, aligned, and
! 		 * such that the boundary won't be crossed.
  		 */
  		for (i = start; i < cnt.v_page_count; i++) {
  			int pqtype;
--- 1774,1806 ----
  }
  
  /*
!  * vm_contig_pg_alloc: allocate a set of physically contig pages in a
!  * given range.  we return the index (in the vm_page_array[]) of the
!  * first page allocated, or return -1 on error.
   */
! int
! vm_contig_pg_alloc(u_long size, u_long low, u_long high, u_long alignment,
! 		   u_long boundary) {
! 
! 	int i, s, start, pass;
! 	vm_offset_t phys;
  	vm_page_t pga = vm_page_array;
  
  	size = round_page(size);
  	if (size == 0)
! 		panic("vm_contig_pg_alloc: size must not be 0");
  	if ((alignment & (alignment - 1)) != 0)
! 		panic("vm_contig_pg_alloc: alignment must be a power of 2");
  	if ((boundary & (boundary - 1)) != 0)
! 		panic("vm_contig_pg_alloc: boundary must be a power of 2");
  
  	start = 0;
  	for (pass = 0; pass <= 1; pass++) {
  		s = splvm();
  again:
  		/*
! 		 * Find first page in array that is free, within range, 
! 		 * aligned, and such that the boundary won't be crossed.
  		 */
  		for (i = start; i < cnt.v_page_count; i++) {
  			int pqtype;
***************
*** 1835,1841 ****
  				m = next) {
  
  				KASSERT(m->queue == PQ_INACTIVE,
! 					("contigmalloc1: page %p is not PQ_INACTIVE", m));
  
  				next = TAILQ_NEXT(m, pageq);
  				if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
--- 1826,1832 ----
  				m = next) {
  
  				KASSERT(m->queue == PQ_INACTIVE,
! 					("vm_contig_pg_alloc: page %p is not PQ_INACTIVE", m));
  
  				next = TAILQ_NEXT(m, pageq);
  				if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
***************
*** 1862,1868 ****
  				m = next) {
  
  				KASSERT(m->queue == PQ_ACTIVE,
! 					("contigmalloc1: page %p is not PQ_ACTIVE", m));
  
  				next = TAILQ_NEXT(m, pageq);
  				if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
--- 1853,1859 ----
  				m = next) {
  
  				KASSERT(m->queue == PQ_ACTIVE,
! 					("vm_contig_pg_alloc: page %p is not PQ_ACTIVE", m));
  
  				next = TAILQ_NEXT(m, pageq);
  				if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
***************
*** 1885,1891 ****
  			}
  
  			splx(s);
! 			continue;
  		}
  		start = i;
  
--- 1876,1882 ----
  			}
  
  			splx(s);
! 			continue;		/* next pass */
  		}
  		start = i;
  
***************
*** 1917,1963 ****
  			if (m->flags & PG_ZERO)
  				vm_page_zero_count--;
  			m->flags = 0;
! 			KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m));
  			m->wire_count = 0;
  			m->busy = 0;
  			m->object = NULL;
  		}
  
  		/*
! 		 * We've found a contiguous chunk that meets are requirements.
! 		 * Allocate kernel VM, unfree and assign the physical pages to it and
! 		 * return kernel VM pointer.
  		 */
! 		vm_map_lock(map);
! 		if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
! 		    KERN_SUCCESS) {
! 			/*
! 			 * XXX We almost never run out of kernel virtual
! 			 * space, so we don't make the allocated memory
! 			 * above available.
! 			 */
! 			vm_map_unlock(map);
! 			splx(s);
! 			return (NULL);
! 		}
! 		vm_object_reference(kernel_object);
! 		vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
! 		    addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
! 		vm_map_unlock(map);
  
! 		tmp_addr = addr;
! 		for (i = start; i < (start + size / PAGE_SIZE); i++) {
! 			vm_page_t m = &pga[i];
! 			vm_page_insert(m, kernel_object,
! 				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
! 			tmp_addr += PAGE_SIZE;
! 		}
! 		vm_map_pageable(map, addr, addr + size, FALSE);
  
  		splx(s);
! 		return ((void *)addr);
  	}
! 	return NULL;
  }
  
  void *
--- 1908,2041 ----
  			if (m->flags & PG_ZERO)
  				vm_page_zero_count--;
  			m->flags = 0;
! 			KASSERT(m->dirty == 0, ("vm_contig_pg_alloc: page %p was dirty", m));
  			m->wire_count = 0;
  			m->busy = 0;
  			m->object = NULL;
  		}
  
  		/*
! 		 * success!
  		 */
! 		splx(s);
! 		return(start);
  
! 	}	/* end of pass loop */
! 
! 	/*
! 	 * failed...
! 	 */
! 	splx(s);
! 	return(-1);
! }
! 
! /*
!  * vm_contig_pg_free: undo a vm_contig_pg_alloc.  we assume that all
!  * references to the pages have been removed and that it is OK to add
!  * them back to the free list.
!  */
! void
! vm_contig_pg_free(int start, u_long size) {
! 
! 	vm_page_t pga = vm_page_array;
! 	int i;
! 
! 	size = round_page(size);
! 	if (size == 0)
! 		panic("vm_contig_pg_free: size must not be 0");
! 
! 	for (i = start; i < (start + size / PAGE_SIZE); i++) {
! 		vm_page_free(&pga[i]);
! 	}
! }
! 
! 
! /*
!  * vm_contig_pg_kmapin: map a previously allocated set of contig pages
!  * from the vm_page_array[] into the kernel address space.   once mapped,
!  * the pages become part of the kernel object and should be freed with
!  * kmem_free(kernel_map, address, size).
!  */
! vm_offset_t
! vm_contig_pg_kmapin(int start, u_long size, vm_map_t map) {
! 
! 	int i, s;
! 	vm_offset_t addr, tmp_addr;
! 	vm_page_t pga = vm_page_array;
! 
! 	size = round_page(size);
! 	if (size == 0)
! 		panic("vm_contig_pg_kmapin: size must not be 0");
  
+ 	s = splvm();	/* XXX: is this really needed? */
+ 	/*
+ 	 * We've found a contiguous chunk that meets are requirements.
+ 	 * Allocate kernel VM, unfree and assign the physical pages to it and
+ 	 * return kernel VM pointer.
+ 	 */
+ 	vm_map_lock(map);
+ 	if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
+ 	    KERN_SUCCESS) {
+ 		vm_map_unlock(map);
  		splx(s);
! 		return (0);
! 	}
! 	vm_object_reference(kernel_object);
! 	vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
! 	    addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
! 	vm_map_unlock(map);
! 
! 	tmp_addr = addr;
! 	for (i = start; i < (start + size / PAGE_SIZE); i++) {
! 		vm_page_t m = &pga[i];
! 		vm_page_insert(m, kernel_object,
! 			OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
! 		tmp_addr += PAGE_SIZE;
  	}
! 	vm_map_pageable(map, addr, addr + size, FALSE);
! 
! 	splx(s);
! 	return (addr);
! }
! 
! /*
!  * This interface is for merging with malloc() someday.
!  * Even if we never implement compaction so that contiguous allocation
!  * works after initialization time, malloc()'s data structures are good
!  * for statistics and for allocations of less than a page.
!  */
! void *
! contigmalloc1(
! 	unsigned long size,	/* should be size_t here and for malloc() */
! 	struct malloc_type *type,
! 	int flags,
! 	unsigned long low,
! 	unsigned long high,
! 	unsigned long alignment,
! 	unsigned long boundary,
! 	vm_map_t map)
! {
! 	int index;
! 	void *rv;
! 
! 	size = round_page(size);
! 	if (size == 0)
! 		panic("contigmalloc1: size must not be 0");
! 	if ((alignment & (alignment - 1)) != 0)
! 		panic("contigmalloc1: alignment must be a power of 2");
! 	if ((boundary & (boundary - 1)) != 0)
! 		panic("contigmalloc1: boundary must be a power of 2");
! 
! 	index = vm_contig_pg_alloc(size, low, high, alignment, boundary);
! 	if (index < 0)
! 		return(NULL);
! 
! 	rv = (void *) vm_contig_pg_kmapin(index, size, map);
! 	if (!rv) {
! 		vm_contig_pg_free(index, size);
! 	}
! 
! 	return(rv);
  }
  
  void *



# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	Makefile
#	vm_contig_test.c
#
echo x - Makefile
sed 's/^X//' >Makefile << 'END-of-Makefile'
X# $FreeBSD$
X
XKMOD	= vm_contig_test
XSRCS	= vm_contig_test.c
X
X.include <bsd.kmod.mk>
END-of-Makefile
echo x - vm_contig_test.c
sed 's/^X//' >vm_contig_test.c << 'END-of-vm_contig_test.c'
X/* $FreeBSD$ */
X
X/*
X * vm_contig_test.c  test vm_contig page allocation functions
X * 30-Jul-2003  chuck at research.att.com
X */
X
X#include <sys/param.h>
X#include <sys/systm.h>
X#include <sys/kernel.h>
X#include <sys/module.h>
X#include <sys/malloc.h>
X
X#include <vm/vm.h>
X#include <vm/pmap.h>
X#include <vm/vm_page.h>
X
X/*
X * module glue, as per module(9)
X */
Xstatic int vmct_handler(module_t mod, int what, void *arg);
Xstatic void doit(void);
X
Xstatic moduledata_t mod_data = {
X	"vm_contig_test",	/* name */
X	vmct_handler,		/* handler */
X	0,			/* private */
X};
X
XMODULE_VERSION(vm_contig_test, 1);
X
XDECLARE_MODULE(vm_contig_test, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
X
X/*
X * vmct_handler: module function to perform some testing of the
X * vm_contig page allocation API.
X */
X
Xstatic int vmct_handler(module_t mod, int what, void *arg) {
X
X	int err = 0;
X
X	switch (what) {
X	case MOD_LOAD:
X		uprintf("vmct_handler: loading contig test module\n");
X		doit();
X		uprintf("vmct_handler: test done!  now kldunload me.\n");
X		break;
X	case MOD_UNLOAD:
X		uprintf("vmct_handler: unloading contig test module\n");
X		break;
X	default:
X		uprintf("unknown contig test module command (%d)\n", what);
X		break;
X
X	}
X	return(err);
X}
X
X/*
X * doit: do the tests
X */
X
Xvoid doit() {
X	int ntp = 8;		/* number of test pages */
X	int psz = PAGE_SIZE;	/* page size */
X	int n, span, idx, i;
X	char *b1, *p;
X	vm_page_t pga = vm_page_array;
X
X	uprintf("doit: testing VM contig API...  PAGE_SIZE=%d\n", psz);
X
X	b1 = contigmalloc(psz * ntp, M_DEVBUF, M_NOWAIT, 0, 0xffffffff, psz, 0);
X	uprintf("contigmalloc %d pages: %p\n", ntp, b1);
X
X	if (b1) {
X		for (p = b1, n = 0 ; p < b1 + (psz * ntp) ; p += psz, n++) {
X			if (p == b1) {
X				uprintf("first page: va=%p, pa=%x\n", 
X							p, vtophys(p));
X			} else {
X				span = vtophys(p) - vtophys(p - psz);
X				uprintf("page %d: span=%d %s\n", n, span,
X					(span == psz) ? "OK" : "BAD!!!!");
X			}
X		}
X		contigfree(b1, psz * ntp, M_DEVBUF);
X		uprintf("freed the pages\n");
X	}
X
X	idx = vm_contig_pg_alloc(psz * ntp, 0, 0xffffffff, psz, 0);
X	if (idx < 0) {
X		uprintf("vm_contig_pg_alloc: API failed %d\n", idx);
X	} else { 
X		for (i = idx, n = 0 ; i < idx + ntp ; i++, n++) { 
X			if (n == 0) {
X				uprintf("first page: pa=%x\n", 
X					pga[i].phys_addr);
X			} else {
X				span = pga[i].phys_addr - pga[i-1].phys_addr;
X				uprintf("page %d: span=%d %s\n", n, span,
X					(span == psz) ? "OK" : "BAD!!!!");
X			}
X		}
X		uprintf("freeing the pages\n");
X		vm_contig_pg_free(idx, psz * ntp);
X	}
X}
END-of-vm_contig_test.c
exit

>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list