kern/55081: contigmalloc API semantics inadequate --- forces KVM mapping

Brian Fundakowski Feldman green at FreeBSD.org
Wed Jun 23 15:26:40 GMT 2004


Indeed, I have a reimplementation of contigmalloc(9) waiting in the
wings.  As you suggested it should be, it is split apart into functions
that can be called without mapping the memory into kernel space.
However, it is also a reimplementation of the actual acquisition of
pages, and it should be far more reliable than before: instead of
paging out everything and hoping a contiguous region shakes out, it
pages out one page at a time, and should only fail after running
completely out of swap space or having so much wired memory that it
fragments the part of the physical address space you want to use.
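
The trickiest part of the new allocator is the candidate test: does a
run of pages lie within [low, high], start properly aligned, and avoid
crossing the requested boundary?  A stand-alone sketch of that
predicate follows; the function name and userland typedef are
hypothetical, but the arithmetic is exactly what the patch's
vm_page_alloc_contig() uses:

#include <stdint.h>

typedef uint64_t vm_paddr_t;	/* stand-in for the kernel type */

/*
 * Return nonzero iff [phys, phys + size) lies within [low, high],
 * starts on an alignment-byte boundary, and does not cross a
 * boundary-byte boundary.  Both alignment and boundary must be
 * powers of 2; a boundary of 0 disables that check.
 */
static int
range_ok(vm_paddr_t phys, vm_paddr_t size, vm_paddr_t low,
    vm_paddr_t high, vm_paddr_t alignment, vm_paddr_t boundary)
{
	return (phys >= low && phys + size <= high &&
	    (phys & (alignment - 1)) == 0 &&
	    ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0);
}

The XOR trick works because the first and last bytes of the run fall
in the same boundary-sized block exactly when they agree in every bit
above the low-order log2(boundary) bits.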

Please try this patch and see how well it works for you.  You do not
have to change vm.old_contigmalloc to be able to use the new
vm_page_alloc_contig() and vm_page_release_contig() functions.
You should also notice the memory allocated by contigmalloc() showing
up in vmstat -m output; you can report your driver's memory to these
pools, too, using the malloc_type_*() functions.
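
For a driver, a minimal sketch of using the new interfaces directly,
with no KVM mapping at all, might look like this (the "mydev" names
are made up and the address constraints are only examples):

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/vm_page.h>

MALLOC_DEFINE(M_MYDEV, "mydev", "mydev contiguous DMA buffers");

static vm_page_t
mydev_dma_alloc(vm_pindex_t npages)
{
	vm_page_t m;

	/* Below 4GB, page-aligned, never crossing a 64KB boundary. */
	m = vm_page_alloc_contig(npages, 0, 0xffffffff, PAGE_SIZE,
	    64 * 1024);
	if (m != NULL) {
		/* Accounted against the "mydev" pool in vmstat -m. */
		malloc_type_allocated(M_MYDEV, npages << PAGE_SHIFT);
		/* Hand VM_PAGE_TO_PHYS(m) to the hardware here. */
	}
	return (m);
}

static void
mydev_dma_free(vm_page_t m, vm_pindex_t npages)
{
	vm_page_release_contig(m, npages);
	malloc_type_freed(M_MYDEV, npages << PAGE_SHIFT);
}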

Index: sys/malloc.h
===================================================================
RCS file: /usr/ncvs/src/sys/sys/malloc.h,v
retrieving revision 1.76
diff -u -r1.76 malloc.h
--- sys/malloc.h	7 Apr 2004 04:19:49 -0000	1.76
+++ sys/malloc.h	19 Jun 2004 05:20:30 -0000
@@ -105,6 +105,8 @@
 void	*malloc(unsigned long size, struct malloc_type *type, int flags);
 void	malloc_init(void *);
 int	malloc_last_fail(void);
+void	malloc_type_allocated(struct malloc_type *type, unsigned long size);
+void	malloc_type_freed(struct malloc_type *type, unsigned long size);
 void	malloc_uninit(void *);
 void	*realloc(void *addr, unsigned long size, struct malloc_type *type,
 	    int flags);
Index: kern/kern_malloc.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.133
diff -u -r1.133 kern_malloc.c
--- kern/kern_malloc.c	31 May 2004 21:46:04 -0000	1.133
+++ kern/kern_malloc.c	19 Jun 2004 05:22:40 -0000
@@ -175,6 +175,47 @@
 }
 
 /*
+ * Add this to the informational malloc_type bucket.
+ */
+static void
+malloc_type_zone_allocated(struct malloc_type *ksp, unsigned long size,
+    int zindx)
+{
+	mtx_lock(&ksp->ks_mtx);
+	ksp->ks_calls++;
+	if (zindx != -1)
+		ksp->ks_size |= 1 << zindx;
+	if (size != 0) {
+		ksp->ks_memuse += size;
+		ksp->ks_inuse++;
+		if (ksp->ks_memuse > ksp->ks_maxused)
+			ksp->ks_maxused = ksp->ks_memuse;
+	}
+	mtx_unlock(&ksp->ks_mtx);
+}
+
+void
+malloc_type_allocated(struct malloc_type *ksp, unsigned long size)
+{
+	malloc_type_zone_allocated(ksp, size, -1);
+}
+
+/*
+ * Remove this allocation from the informational malloc_type bucket.
+ */
+void
+malloc_type_freed(struct malloc_type *ksp, unsigned long size)
+{
+	mtx_lock(&ksp->ks_mtx);
+	KASSERT(size <= ksp->ks_memuse,
+		("malloc(9)/free(9) confusion.\n%s",
+		 "Probably freeing with wrong type, but maybe not here."));
+	ksp->ks_memuse -= size;
+	ksp->ks_inuse--;
+	mtx_unlock(&ksp->ks_mtx);
+}
+
+/*
  *	malloc:
  *
  *	Allocate a block of memory.
@@ -195,7 +236,6 @@
 #ifdef DIAGNOSTIC
 	unsigned long osize = size;
 #endif
-	register struct malloc_type *ksp = type;
 
 #ifdef INVARIANTS
 	/*
@@ -241,29 +281,16 @@
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc(zone, flags);
-		mtx_lock(&ksp->ks_mtx);
-		if (va == NULL) 
-			goto out;
-
-		ksp->ks_size |= 1 << indx;
-		size = keg->uk_size;
+		if (va != NULL)
+			size = keg->uk_size;
+		malloc_type_zone_allocated(type, va == NULL ? 0 : size, indx);
 	} else {
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
 		keg = NULL;
 		va = uma_large_malloc(size, flags);
-		mtx_lock(&ksp->ks_mtx);
-		if (va == NULL)
-			goto out;
+		malloc_type_allocated(type, va == NULL ? 0 : size);
 	}
-	ksp->ks_memuse += size;
-	ksp->ks_inuse++;
-out:
-	ksp->ks_calls++;
-	if (ksp->ks_memuse > ksp->ks_maxused)
-		ksp->ks_maxused = ksp->ks_memuse;
-
-	mtx_unlock(&ksp->ks_mtx);
 	if (flags & M_WAITOK)
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
@@ -288,7 +315,6 @@
 	void *addr;
 	struct malloc_type *type;
 {
-	register struct malloc_type *ksp = type;
 	uma_slab_t slab;
 	u_long size;
 
@@ -296,7 +322,7 @@
 	if (addr == NULL)
 		return;
 
-	KASSERT(ksp->ks_memuse > 0,
+	KASSERT(type->ks_memuse > 0,
 		("malloc(9)/free(9) confusion.\n%s",
 		 "Probably freeing with wrong type, but maybe not here."));
 	size = 0;
@@ -333,13 +359,7 @@
 		size = slab->us_size;
 		uma_large_free(slab);
 	}
-	mtx_lock(&ksp->ks_mtx);
-	KASSERT(size <= ksp->ks_memuse,
-		("malloc(9)/free(9) confusion.\n%s",
-		 "Probably freeing with wrong type, but maybe not here."));
-	ksp->ks_memuse -= size;
-	ksp->ks_inuse--;
-	mtx_unlock(&ksp->ks_mtx);
+	malloc_type_freed(type, size);
 }
 
 /*
Index: vm/vm_page.h
===================================================================
RCS file: /usr/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.131
diff -u -r1.131 vm_page.h
--- vm/vm_page.h	5 Jun 2004 21:06:42 -0000	1.131
+++ vm/vm_page.h	19 Jun 2004 13:55:08 -0000
@@ -342,6 +342,9 @@
 
 void vm_page_activate (vm_page_t);
 vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_contig (vm_pindex_t, vm_paddr_t, vm_paddr_t,
+	    vm_offset_t, vm_offset_t);
+void vm_page_release_contig (vm_page_t, vm_pindex_t);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
 void vm_page_cache (register vm_page_t);
 int vm_page_try_to_cache (vm_page_t);
Index: vm/vm_contig.c
===================================================================
RCS file: /usr/ncvs/src/sys/vm/vm_contig.c,v
retrieving revision 1.35
diff -u -r1.35 vm_contig.c
--- vm/vm_contig.c	15 Jun 2004 01:02:00 -0000	1.35
+++ vm/vm_contig.c	19 Jun 2004 13:56:18 -0000
@@ -68,6 +68,9 @@
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/linker_set.h>
+#include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
@@ -83,49 +86,63 @@
 #include <vm/vm_extern.h>
 
 static int
-vm_contig_launder(int queue)
+vm_contig_launder_page(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t m, m_tmp, next;
+	vm_page_t m_tmp;
 	struct vnode *vp;
 
+	if (!VM_OBJECT_TRYLOCK(m->object))
+		return (EAGAIN);
+	if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
+		VM_OBJECT_UNLOCK(m->object);
+		vm_page_lock_queues();
+		return (EBUSY);
+	}
+	vm_page_test_dirty(m);
+	if (m->dirty == 0 && m->busy == 0 && m->hold_count == 0)
+		pmap_remove_all(m);
+	if (m->dirty) {
+		object = m->object;
+		if (object->type == OBJT_VNODE) {
+			vm_page_unlock_queues();
+			vp = object->handle;
+			VM_OBJECT_UNLOCK(object);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+			VM_OBJECT_LOCK(object);
+			vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
+			VM_OBJECT_UNLOCK(object);
+			VOP_UNLOCK(vp, 0, curthread);
+			vm_page_lock_queues();
+			return (0);
+		} else if (object->type == OBJT_SWAP ||
+			   object->type == OBJT_DEFAULT) {
+			m_tmp = m;
+			vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
+			VM_OBJECT_UNLOCK(object);
+			return (0);
+		}
+	} else if (m->busy == 0 && m->hold_count == 0)
+		vm_page_cache(m);
+	VM_OBJECT_UNLOCK(m->object);
+	return (0);
+}
+
+static int
+vm_contig_launder(int queue)
+{
+	vm_page_t m, next;
+	int error;
+
 	for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) {
 		next = TAILQ_NEXT(m, pageq);
 		KASSERT(m->queue == queue,
 		    ("vm_contig_launder: page %p's queue is not %d", m, queue));
-		if (!VM_OBJECT_TRYLOCK(m->object))
-			continue;
-		if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
-			VM_OBJECT_UNLOCK(m->object);
-			vm_page_lock_queues();
+		error = vm_contig_launder_page(m);
+		if (error == 0)
 			return (TRUE);
-		}
-		vm_page_test_dirty(m);
-		if (m->dirty == 0 && m->busy == 0 && m->hold_count == 0)
-			pmap_remove_all(m);
-		if (m->dirty) {
-			object = m->object;
-			if (object->type == OBJT_VNODE) {
-				vm_page_unlock_queues();
-				vp = object->handle;
-				VM_OBJECT_UNLOCK(object);
-				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
-				VM_OBJECT_LOCK(object);
-				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-				VM_OBJECT_UNLOCK(object);
-				VOP_UNLOCK(vp, 0, curthread);
-				vm_page_lock_queues();
-				return (TRUE);
-			} else if (object->type == OBJT_SWAP ||
-				   object->type == OBJT_DEFAULT) {
-				m_tmp = m;
-				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
-				VM_OBJECT_UNLOCK(object);
-				return (TRUE);
-			}
-		} else if (m->busy == 0 && m->hold_count == 0)
-			vm_page_cache(m);
-		VM_OBJECT_UNLOCK(m->object);
+		if (error == EBUSY)
+			return (FALSE);
 	}
 	return (FALSE);
 }
@@ -308,6 +325,193 @@
 	return (NULL);
 }
 
+static void
+vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
+{
+	mtx_lock_spin(&vm_page_queue_free_mtx);
+	while (count--) {
+		vm_pageq_enqueue(PQ_FREE + m->pc, m);
+		m++;
+	}
+	mtx_unlock_spin(&vm_page_queue_free_mtx);
+}
+
+void
+vm_page_release_contig(vm_page_t m, vm_pindex_t count)
+{
+	vm_page_lock_queues();
+	vm_page_release_contigl(m, count);
+	vm_page_unlock_queues();
+}
+
+static void
+vm_contig_unqueue_free(vm_page_t m)
+{
+
+	KASSERT((m->queue - m->pc) == PQ_FREE,
+	    ("contigmalloc2: page %p not freed", m));
+	mtx_lock_spin(&vm_page_queue_free_mtx);
+	vm_pageq_remove_nowakeup(m);
+	mtx_unlock_spin(&vm_page_queue_free_mtx);
+	m->valid = VM_PAGE_BITS_ALL;
+	if (m->flags & PG_ZERO)
+		vm_page_zero_count--;
+	/* Don't clear the PG_ZERO flag; we'll need it later. */
+	m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
+	KASSERT(m->dirty == 0,
+	    ("contigmalloc2: page %p was dirty", m));
+	m->wire_count = 0;
+	m->busy = 0;
+	m->object = NULL;
+}
+
+vm_page_t
+vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
+	    vm_offset_t alignment, vm_offset_t boundary)
+{
+	vm_object_t object;
+	vm_offset_t size;
+	vm_paddr_t phys;
+	vm_page_t pga = vm_page_array;
+	int i, pass, pqtype, start;
+
+	size = npages << PAGE_SHIFT;
+	if (size == 0)
+		panic("vm_page_alloc_contig: size must not be 0");
+	if ((alignment & (alignment - 1)) != 0)
+		panic("vm_page_alloc_contig: alignment must be a power of 2");
+	if ((boundary & (boundary - 1)) != 0)
+		panic("vm_page_alloc_contig: boundary must be a power of 2");
+
+	for (pass = 0; pass < 2; pass++) {
+		start = cnt.v_page_count;
+		vm_page_lock_queues();
+retry:
+		start--;
+		/*
+		 * Find last page in array that is free, within range,
+		 * aligned, and such that the boundary won't be crossed.
+		 */
+		for (i = start; i >= 0; i--) {
+			phys = VM_PAGE_TO_PHYS(&pga[i]);
+			pqtype = pga[i].queue - pga[i].pc;
+			if (pass == 0) {
+				if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
+					continue;
+			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
+				    pga[i].queue != PQ_ACTIVE &&
+				    pga[i].queue != PQ_INACTIVE)
+				continue;
+			if (phys >= low && phys + size <= high &&
+			    ((phys & (alignment - 1)) == 0) &&
+			    ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
+				break;
+		}
+		/* There are no candidates at all. */
+		if (i == -1) {
+			vm_page_unlock_queues();
+			continue;
+		}
+		start = i;
+		/*
+		 * Check successive pages for contiguous and free.
+		 */
+		for (i = start + 1; i < start + npages; i++) {
+			pqtype = pga[i].queue - pga[i].pc;
+			if (VM_PAGE_TO_PHYS(&pga[i]) !=
+			    VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)
+				goto retry;
+			if (pass == 0) {
+				if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
+					goto retry;
+			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
+				    pga[i].queue != PQ_ACTIVE &&
+				    pga[i].queue != PQ_INACTIVE)
+				goto retry;
+		}
+		for (i = start; i < start + npages; i++) {
+			vm_page_t m = &pga[i];
+
+			pqtype = m->queue - m->pc;
+			if (pass != 0 && pqtype != PQ_FREE &&
+			    pqtype != PQ_CACHE) {
+				switch (m->queue) {
+				case PQ_ACTIVE:
+				case PQ_INACTIVE:
+					if (vm_contig_launder_page(m) != 0)
+						goto cleanup_freed;
+					pqtype = m->queue - m->pc;
+					if (pqtype == PQ_FREE ||
+					    pqtype == PQ_CACHE)
+						break;
+				default:
+cleanup_freed:
+					vm_page_release_contigl(&pga[start],
+					    i - start);
+					goto retry;
+				}
+			}
+			if (pqtype == PQ_CACHE) {
+				object = m->object;
+				if (!VM_OBJECT_TRYLOCK(object))
+					goto retry;
+				vm_page_busy(m);
+				vm_page_free(m);
+				VM_OBJECT_UNLOCK(object);
+			}
+			vm_contig_unqueue_free(m);
+		}
+		vm_page_unlock_queues();
+		/*
+		 * We've found a contiguous chunk that meets our requirements.
+		 */
+		return (&pga[start]);
+	}
+	return (NULL);
+}
+
+static void *
+contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
+{
+	vm_object_t object = kernel_object;
+	vm_map_t map = kernel_map;
+	vm_offset_t addr, tmp_addr;
+	vm_pindex_t i;
+
+	/*
+	 * Allocate kernel VM, unfree and assign the physical pages to
+	 * it and return kernel VM pointer.
+	 */
+	vm_map_lock(map);
+	if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr)
+	    != KERN_SUCCESS) {
+		vm_map_unlock(map);
+		return (NULL);
+	}
+	vm_object_reference(object);
+	vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS,
+	    addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0);
+	vm_map_unlock(map);
+	tmp_addr = addr;
+	VM_OBJECT_LOCK(object);
+	for (i = 0; i < npages; i++) {
+		vm_page_insert(&m[i], object,
+		    OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
+		if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO))
+			pmap_zero_page(&m[i]);
+		tmp_addr += PAGE_SIZE;
+	}
+	VM_OBJECT_UNLOCK(object);
+	vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT),
+	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+	return ((void *)addr);
+}
+
+static int vm_old_contigmalloc = 1;
+SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc,
+    CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
+TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);
+
 void *
 contigmalloc(
 	unsigned long size,	/* should be size_t here and for malloc() */
@@ -319,17 +523,37 @@
 	unsigned long boundary)
 {
 	void * ret;
+	vm_page_t pages;
+	vm_pindex_t npgs;
 
+	npgs = round_page(size) >> PAGE_SHIFT;
 	mtx_lock(&Giant);
-	ret = contigmalloc1(size, type, flags, low, high, alignment, boundary,
-	    kernel_map);
+	if (vm_old_contigmalloc) {
+		ret = contigmalloc1(size, type, flags, low, high, alignment,
+		    boundary, kernel_map);
+	} else {
+		pages = vm_page_alloc_contig(npgs, low, high,
+		    alignment, boundary);
+		if (pages == NULL) {
+			ret = NULL;
+		} else {
+			ret = contigmalloc2(pages, npgs, flags);
+			if (ret == NULL)
+				vm_page_release_contig(pages, npgs);
+		}
+
+	}
 	mtx_unlock(&Giant);
+	malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
 	return (ret);
 }
 
 void
 contigfree(void *addr, unsigned long size, struct malloc_type *type)
 {
+	vm_pindex_t npgs;
 
+	npgs = round_page(size) >> PAGE_SHIFT;
 	kmem_free(kernel_map, (vm_offset_t)addr, size);
+	malloc_type_freed(type, npgs << PAGE_SHIFT);
 }

-- 
Brian Fundakowski Feldman                           \'[ FreeBSD ]''''''''''\
  <> green at FreeBSD.org                               \  The Power to Serve! \
 Opinions expressed are my own.                       \,,,,,,,,,,,,,,,,,,,,,,\

