svn commit: r248449 - in head/sys: amd64/amd64 amd64/include conf i386/i386 i386/include i386/xen vm

Attilio Rao attilio at FreeBSD.org
Mon Mar 18 00:25:04 UTC 2013


Author: attilio
Date: Mon Mar 18 00:25:02 2013
New Revision: 248449
URL: http://svnweb.freebsd.org/changeset/base/248449

Log:
  Sync back vmcontention branch into HEAD:
  Replace the per-object resident and cached pages splay tree with a
  path-compressed multi-digit radix trie.
  Along with this, also switch the x86-specific handling of idle page
  tables to use the radix trie.
  
  This change is intended to do the following:
  - Allow lookup operations on the resident/cached page collections to
    acquire only a read lock, now that the per-vm_page_t splay iterators
    have been removed (see the sketch after this list).
  - Increase the scalability of operations on the page collections.
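
  As an illustration of the first point, the trie itself permits lookups
  with only the object read lock held.  This is a hypothetical usage
  sketch, not code from this commit; note that vm_page_lookup() below
  still asserts the write lock at this revision:

	/*
	 * Radix-trie lookups perform no modifications, so the
	 * consumer's read lock suffices for atomicity.
	 */
	vm_page_t m;

	VM_OBJECT_RLOCK(object);
	m = vm_radix_lookup(&object->rtree, pindex);
	VM_OBJECT_RUNLOCK(object);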
  
  The radix trie relies on its consumers' locking to ensure the atomicity
  of its operations.  In order to avoid deadlocks, the bisection nodes
  are pre-allocated in the UMA zone.  This can be done safely because the
  algorithm needs at most one new node per insert, which means that the
  maximum number of nodes ever needed is bounded by the number of
  available physical frames.  In practice, however, a new bisection node
  is not required on every insert.
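
  As a hypothetical sketch of this pre-allocation scheme (invented names,
  not the committed vm_radix code), an insert can take its single
  bisection node from the UMA zone up front and give it back when no
  split turns out to be needed:

	static uma_zone_t rnode_zone;	/* created at init with uma_zcreate() */

	static int
	rtrie_insert(struct vm_radix *rtree, vm_page_t m)
	{
		struct rnode *spare;

		/* An insert consumes at most one new bisection node. */
		spare = uma_zalloc(rnode_zone, M_NOWAIT);
		if (spare == NULL)
			return (ENOMEM);
		if (!rtrie_insert_node(rtree, m, spare))
			/* No edge was split; return the unused reserve. */
			uma_zfree(rnode_zone, spare);
		return (0);
	}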
  
  The radix trie implements path compression because UFS indirect blocks
  can produce objects with a very sparse trie, which would otherwise
  increase the number of levels to scan.  Path compression also enables
  the node pre-allocation by establishing the single-node-per-insert
  property.
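
  A minimal sketch of what path compression buys, again with invented
  names and a simplified layout rather than the committed node format:
  each bisection node records the level at which its children diverge,
  so runs of single-child levels are skipped entirely:

	struct rnode {
		vm_pindex_t	 rn_owner;	/* common index prefix below */
		uint16_t	 rn_clev;	/* compressed level tested here */
		void		*rn_child[16];	/* next digit picks a slot */
	};

	/*
	 * A lookup jumps directly from one rn_clev to the next instead
	 * of visiting a node per digit, so the sparse tries produced by
	 * UFS indirect blocks stay shallow.
	 */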
  
  This code is not generalized (yet) because making the various sizes in
  play configurable could cost performance.  However, an effort to make
  this code more general, and thus reusable by other consumers, may well
  be undertaken later.
  
  The only KPI change is the removal of the function vm_page_splay(),
  which is now retired.
  The only KBI change is the removal of the left/right iterators from
  struct vm_page, which are likewise retired.
  
  Further technical notes, broken into smaller pieces, can be retrieved
  from the svn branch:
  http://svn.freebsd.org/base/user/attilio/vmcontention/
  
  Sponsored by:	EMC / Isilon storage division
  In collaboration with:	alc, jeff
  Tested by:	flo, pho, jhb, davide
  Tested by:	ian (arm)
  Tested by:	andreast (powerpc)

Added:
  head/sys/vm/_vm_radix.h
     - copied unchanged from r248448, user/attilio/vmcontention/sys/vm/_vm_radix.h
  head/sys/vm/vm_radix.c
     - copied unchanged from r248448, user/attilio/vmcontention/sys/vm/vm_radix.c
  head/sys/vm/vm_radix.h
     - copied unchanged from r248448, user/attilio/vmcontention/sys/vm/vm_radix.h
Modified:
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/include/pmap.h
  head/sys/conf/files
  head/sys/i386/i386/pmap.c
  head/sys/i386/include/pmap.h
  head/sys/i386/xen/pmap.c
  head/sys/vm/vm_object.c
  head/sys/vm/vm_object.h
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_reserv.c

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/amd64/amd64/pmap.c	Mon Mar 18 00:25:02 2013	(r248449)
@@ -131,6 +131,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
@@ -1497,7 +1498,8 @@ pmap_free_zero_pages(vm_page_t free)
 
 	while (free != NULL) {
 		m = free;
-		free = m->right;
+		free = (void *)m->object;
+		m->object = NULL;
 		/* Preserve the page's PG_ZERO setting. */
 		vm_page_free_toq(m);
 	}
@@ -1516,7 +1518,7 @@ pmap_add_delayed_free_list(vm_page_t m, 
 		m->flags |= PG_ZERO;
 	else
 		m->flags &= ~PG_ZERO;
-	m->right = *free;
+	m->object = (void *)*free;
 	*free = m;
 }
 	
@@ -1526,31 +1528,12 @@ pmap_add_delayed_free_list(vm_page_t m, 
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  */
-static void
+static __inline void
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 {
-	vm_page_t root;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	root = pmap->pm_root;
-	if (root == NULL) {
-		mpte->left = NULL;
-		mpte->right = NULL;
-	} else {
-		root = vm_page_splay(mpte->pindex, root);
-		if (mpte->pindex < root->pindex) {
-			mpte->left = root->left;
-			mpte->right = root;
-			root->left = NULL;
-		} else if (mpte->pindex == root->pindex)
-			panic("pmap_insert_pt_page: pindex already inserted");
-		else {
-			mpte->right = root->right;
-			mpte->left = root;
-			root->right = NULL;
-		}
-	}
-	pmap->pm_root = mpte;
+	vm_radix_insert(&pmap->pm_root, mpte);
 }
 
 /*
@@ -1558,19 +1541,12 @@ pmap_insert_pt_page(pmap_t pmap, vm_page
  * specified pmap's collection of idle page table pages.  Returns NULL if there
  * is no page table page corresponding to the specified virtual address.
  */
-static vm_page_t
+static __inline vm_page_t
 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
 {
-	vm_page_t mpte;
-	vm_pindex_t pindex = pmap_pde_pindex(va);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
-		mpte = vm_page_splay(pindex, mpte);
-		if ((pmap->pm_root = mpte)->pindex != pindex)
-			mpte = NULL;
-	}
-	return (mpte);
+	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
 }
 
 /*
@@ -1578,25 +1554,12 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offs
  * of idle page table pages.  The specified page table page must be a member of
  * the pmap's collection.
  */
-static void
+static __inline void
 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
 {
-	vm_page_t root;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if (mpte != pmap->pm_root) {
-		root = vm_page_splay(mpte->pindex, pmap->pm_root);
-		KASSERT(mpte == root,
-		    ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
-		    mpte, pmap));
-	}
-	if (mpte->left == NULL)
-		root = mpte->right;
-	else {
-		root = vm_page_splay(mpte->pindex, mpte->left);
-		root->right = mpte->right;
-	}
-	pmap->pm_root = root;
+	vm_radix_remove(&pmap->pm_root, mpte->pindex);
 }
 
 /*
@@ -1693,7 +1656,7 @@ pmap_pinit0(pmap_t pmap)
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
-	pmap->pm_root = NULL;
+	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
@@ -1734,7 +1697,7 @@ pmap_pinit(pmap_t pmap)
 	/* install self-referential address mapping entry(s) */
 	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
 
-	pmap->pm_root = NULL;
+	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
@@ -1976,7 +1939,7 @@ pmap_release(pmap_t pmap)
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
-	KASSERT(pmap->pm_root == NULL,
+	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
 	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
@@ -2273,7 +2236,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, str
 	}
 	if (m_pc == NULL && free != NULL) {
 		m_pc = free;
-		free = m_pc->right;
+		free = (void *)m_pc->object;
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 		atomic_add_int(&cnt.v_wire_count, 1);

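With the splay-tree left/right linkage gone from struct vm_page, the
hunks above thread the delayed-free page list through the page's
otherwise idle object pointer: a page on this list belongs to no object,
so the field is free to borrow, and it is reset to NULL before the page
is released.  A minimal sketch of the idiom, with hypothetical helper
names:

	static __inline void
	free_list_push(vm_page_t *head, vm_page_t m)
	{

		/* Borrow m->object as the list's "next" link. */
		m->object = (void *)*head;
		*head = m;
	}

	static __inline vm_page_t
	free_list_pop(vm_page_t *head)
	{
		vm_page_t m;

		if ((m = *head) != NULL) {
			*head = (void *)m->object;
			m->object = NULL;	/* restore before freeing */
		}
		return (m);
	}
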
Modified: head/sys/amd64/include/pmap.h
==============================================================================
--- head/sys/amd64/include/pmap.h	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/amd64/include/pmap.h	Mon Mar 18 00:25:02 2013	(r248449)
@@ -150,6 +150,8 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
+#include <vm/_vm_radix.h>
+
 typedef u_int64_t pd_entry_t;
 typedef u_int64_t pt_entry_t;
 typedef u_int64_t pdp_entry_t;
@@ -250,7 +252,7 @@ struct pmap {
 	cpuset_t		pm_active;	/* active on cpus */
 	/* spare u_int here due to padding */
 	struct pmap_statistics	pm_stats;	/* pmap statistics */
-	vm_page_t		pm_root;	/* spare page table pages */
+	struct vm_radix		pm_root;	/* spare page table pages */
 };
 
 typedef struct pmap	*pmap_t;

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/conf/files	Mon Mar 18 00:25:02 2013	(r248449)
@@ -3630,6 +3630,7 @@ vm/vm_page.c			standard
 vm/vm_pageout.c			standard
 vm/vm_pager.c			standard
 vm/vm_phys.c			standard
+vm/vm_radix.c			standard
 vm/vm_reserv.c			standard
 vm/vm_unix.c			standard
 vm/vm_zeroidle.c		standard

Modified: head/sys/i386/i386/pmap.c
==============================================================================
--- head/sys/i386/i386/pmap.c	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/i386/i386/pmap.c	Mon Mar 18 00:25:02 2013	(r248449)
@@ -133,6 +133,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
@@ -1573,7 +1574,8 @@ pmap_free_zero_pages(vm_page_t free)
 
 	while (free != NULL) {
 		m = free;
-		free = m->right;
+		free = (void *)m->object;
+		m->object = NULL;
 		/* Preserve the page's PG_ZERO setting. */
 		vm_page_free_toq(m);
 	}
@@ -1592,7 +1594,7 @@ pmap_add_delayed_free_list(vm_page_t m, 
 		m->flags |= PG_ZERO;
 	else
 		m->flags &= ~PG_ZERO;
-	m->right = *free;
+	m->object = (void *)*free;
 	*free = m;
 }
 
@@ -1602,31 +1604,12 @@ pmap_add_delayed_free_list(vm_page_t m, 
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  */
-static void
+static __inline void
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 {
-	vm_page_t root;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	root = pmap->pm_root;
-	if (root == NULL) {
-		mpte->left = NULL;
-		mpte->right = NULL;
-	} else {
-		root = vm_page_splay(mpte->pindex, root);
-		if (mpte->pindex < root->pindex) {
-			mpte->left = root->left;
-			mpte->right = root;
-			root->left = NULL;
-		} else if (mpte->pindex == root->pindex)
-			panic("pmap_insert_pt_page: pindex already inserted");
-		else {
-			mpte->right = root->right;
-			mpte->left = root;
-			root->right = NULL;
-		}
-	}
-	pmap->pm_root = mpte;
+	vm_radix_insert(&pmap->pm_root, mpte);
 }
 
 /*
@@ -1634,19 +1617,12 @@ pmap_insert_pt_page(pmap_t pmap, vm_page
  * specified pmap's collection of idle page table pages.  Returns NULL if there
  * is no page table page corresponding to the specified virtual address.
  */
-static vm_page_t
+static __inline vm_page_t
 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
 {
-	vm_page_t mpte;
-	vm_pindex_t pindex = va >> PDRSHIFT;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
-		mpte = vm_page_splay(pindex, mpte);
-		if ((pmap->pm_root = mpte)->pindex != pindex)
-			mpte = NULL;
-	}
-	return (mpte);
+	return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT));
 }
 
 /*
@@ -1654,21 +1630,12 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offs
  * of idle page table pages.  The specified page table page must be a member of
  * the pmap's collection.
  */
-static void
+static __inline void
 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
 {
-	vm_page_t root;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if (mpte != pmap->pm_root)
-		vm_page_splay(mpte->pindex, pmap->pm_root);
-	if (mpte->left == NULL)
-		root = mpte->right;
-	else {
-		root = vm_page_splay(mpte->pindex, mpte->left);
-		root->right = mpte->right;
-	}
-	pmap->pm_root = root;
+	vm_radix_remove(&pmap->pm_root, mpte->pindex);
 }
 
 /*
@@ -1755,7 +1722,7 @@ pmap_pinit0(pmap_t pmap)
 #ifdef PAE
 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
 #endif
-	pmap->pm_root = NULL;
+	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
@@ -1794,9 +1761,9 @@ pmap_pinit(pmap_t pmap)
 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
 		    ("pmap_pinit: pdpt above 4g"));
 #endif
-		pmap->pm_root = NULL;
+		pmap->pm_root.rt_root = 0;
 	}
-	KASSERT(pmap->pm_root == NULL,
+	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_pinit: pmap has reserved page table page(s)"));
 
 	/*
@@ -2060,7 +2027,7 @@ pmap_release(pmap_t pmap)
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
-	KASSERT(pmap->pm_root == NULL,
+	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
 	pmap_lazyfix(pmap);
@@ -2343,7 +2310,7 @@ out:
 	}
 	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
 		m_pc = free;
-		free = m_pc->right;
+		free = (void *)m_pc->object;
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 		atomic_add_int(&cnt.v_wire_count, 1);

Modified: head/sys/i386/include/pmap.h
==============================================================================
--- head/sys/i386/include/pmap.h	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/i386/include/pmap.h	Mon Mar 18 00:25:02 2013	(r248449)
@@ -159,6 +159,8 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
+#include <vm/_vm_radix.h>
+
 #ifdef PAE
 
 typedef uint64_t pdpt_entry_t;
@@ -441,7 +443,7 @@ struct pmap {
 	pdpt_entry_t		*pm_pdpt;	/* KVA of page director pointer
 						   table */
 #endif
-	vm_page_t		pm_root;	/* spare page table pages */
+	struct vm_radix		pm_root;	/* spare page table pages */
 };
 
 typedef struct pmap	*pmap_t;

Modified: head/sys/i386/xen/pmap.c
==============================================================================
--- head/sys/i386/xen/pmap.c	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/i386/xen/pmap.c	Mon Mar 18 00:25:02 2013	(r248449)
@@ -1335,7 +1335,8 @@ pmap_free_zero_pages(vm_page_t free)
 
 	while (free != NULL) {
 		m = free;
-		free = m->right;
+		free = (void *)m->object;
+		m->object = NULL;
 		vm_page_free_zero(m);
 	}
 }
@@ -1393,7 +1394,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_page_t 
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
-	m->right = *free;
+	m->object = (void *)*free;
 	*free = m;
 }
 
@@ -2090,7 +2091,7 @@ out:
 	}
 	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
 		m_pc = free;
-		free = m_pc->right;
+		free = (void *)m_pc->object;
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 		atomic_add_int(&cnt.v_wire_count, 1);

Copied: head/sys/vm/_vm_radix.h (from r248448, user/attilio/vmcontention/sys/vm/_vm_radix.h)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/vm/_vm_radix.h	Mon Mar 18 00:25:02 2013	(r248449, copy of r248448, user/attilio/vmcontention/sys/vm/_vm_radix.h)
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __VM_RADIX_H_
+#define __VM_RADIX_H_
+
+/*
+ * Radix tree root.
+ */
+struct vm_radix {
+	uintptr_t	rt_root;
+};
+
+#ifdef _KERNEL
+
+static __inline boolean_t
+vm_radix_is_empty(struct vm_radix *rtree)
+{
+
+	return (rtree->rt_root == 0);
+}
+
+#endif /* _KERNEL */
+#endif /* !__VM_RADIX_H_ */

Modified: head/sys/vm/vm_object.c
==============================================================================
--- head/sys/vm/vm_object.c	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/vm/vm_object.c	Mon Mar 18 00:25:02 2013	(r248449)
@@ -94,6 +94,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
@@ -167,8 +168,8 @@ vm_object_zdtor(void *mem, int size, voi
 	object = (vm_object_t)mem;
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages in its memq", object));
-	KASSERT(object->root == NULL,
-	    ("object %p has resident pages in its tree", object));
+	KASSERT(vm_radix_is_empty(&object->rtree),
+	    ("object %p has resident pages in its trie", object));
 #if VM_NRESERVLEVEL > 0
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
@@ -199,11 +200,11 @@ vm_object_zinit(void *mem, int size, int
 	rw_init_flags(&object->lock, "vm object", RW_DUPOK);
 
 	/* These are true for any object that has been freed */
-	object->root = NULL;
+	object->rtree.rt_root = 0;
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
-	object->cache = NULL;
+	object->cache.rt_root = 0;
 	return (0);
 }
 
@@ -295,6 +296,8 @@ vm_object_init(void)
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
+
+	vm_radix_init();
 }
 
 void
@@ -742,7 +745,7 @@ vm_object_terminate(vm_object_t object)
 	 * modified by the preceding loop.
 	 */
 	if (object->resident_page_count != 0) {
-		object->root = NULL;
+		vm_radix_reclaim_allnodes(&object->rtree);
 		TAILQ_INIT(&object->memq);
 		object->resident_page_count = 0;
 		if (object->type == OBJT_VNODE)

Modified: head/sys/vm/vm_object.h
==============================================================================
--- head/sys/vm/vm_object.h	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/vm/vm_object.h	Mon Mar 18 00:25:02 2013	(r248449)
@@ -72,6 +72,8 @@
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
 
+#include <vm/_vm_radix.h>
+
 /*
  *	Types defined:
  *
@@ -79,10 +81,10 @@
  *
  *	The root of cached pages pool is protected by both the per-object lock
  *	and the free pages queue mutex.
- *	On insert in the cache splay tree, the per-object lock is expected
+ *	On insert in the cache radix trie, the per-object lock is expected
  *	to be already held and the free pages queue mutex will be
  *	acquired during the operation too.
- *	On remove and lookup from the cache splay tree, only the free
+ *	On remove and lookup from the cache radix trie, only the free
  *	pages queue mutex is expected to be locked.
  *	These rules allow for reliably checking for the presence of cached
  *	pages with only the per-object lock held, thereby reducing contention
@@ -101,7 +103,7 @@ struct vm_object {
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
 	TAILQ_HEAD(, vm_page) memq;	/* list of resident pages */
-	vm_page_t root;			/* root of the resident page splay tree */
+	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
 	int generation;			/* generation ID */
 	int ref_count;			/* How many refs?? */
@@ -116,7 +118,7 @@ struct vm_object {
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
-	vm_page_t cache;		/* (o + f) root of the cache page splay tree */
+	struct vm_radix cache;		/* (o + f) root of the cache page radix trie */
 	void *handle;
 	union {
 		/*
@@ -246,7 +248,7 @@ static __inline boolean_t
 vm_object_cache_is_empty(vm_object_t object)
 {
 
-	return (object->cache == NULL);
+	return (vm_radix_is_empty(&object->cache));
 }
 
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Sun Mar 17 23:53:06 2013	(r248448)
+++ head/sys/vm/vm_page.c	Mon Mar 18 00:25:02 2013	(r248449)
@@ -110,6 +110,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
@@ -794,63 +795,6 @@ vm_page_dirty_KBI(vm_page_t m)
 }
 
 /*
- *	vm_page_splay:
- *
- *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
- *	the vm_page containing the given pindex.  If, however, that
- *	pindex is not found in the vm_object, returns a vm_page that is
- *	adjacent to the pindex, coming before or after it.
- */
-vm_page_t
-vm_page_splay(vm_pindex_t pindex, vm_page_t root)
-{
-	struct vm_page dummy;
-	vm_page_t lefttreemax, righttreemin, y;
-
-	if (root == NULL)
-		return (root);
-	lefttreemax = righttreemin = &dummy;
-	for (;; root = y) {
-		if (pindex < root->pindex) {
-			if ((y = root->left) == NULL)
-				break;
-			if (pindex < y->pindex) {
-				/* Rotate right. */
-				root->left = y->right;
-				y->right = root;
-				root = y;
-				if ((y = root->left) == NULL)
-					break;
-			}
-			/* Link into the new root's right tree. */
-			righttreemin->left = root;
-			righttreemin = root;
-		} else if (pindex > root->pindex) {
-			if ((y = root->right) == NULL)
-				break;
-			if (pindex > y->pindex) {
-				/* Rotate left. */
-				root->right = y->left;
-				y->left = root;
-				root = y;
-				if ((y = root->right) == NULL)
-					break;
-			}
-			/* Link into the new root's left tree. */
-			lefttreemax->right = root;
-			lefttreemax = root;
-		} else
-			break;
-	}
-	/* Assemble the new root. */
-	lefttreemax->right = root->left;
-	righttreemin->left = root->right;
-	root->left = dummy.right;
-	root->right = dummy.left;
-	return (root);
-}
-
-/*
  *	vm_page_insert:		[ internal use only ]
  *
  *	Inserts the given mem entry into the object and object list.
@@ -865,7 +809,7 @@ vm_page_splay(vm_pindex_t pindex, vm_pag
 void
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t root;
+	vm_page_t neighbor;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (m->object != NULL)
@@ -880,28 +824,19 @@ vm_page_insert(vm_page_t m, vm_object_t 
 	/*
 	 * Now link into the object's ordered list of backed pages.
 	 */
-	root = object->root;
-	if (root == NULL) {
-		m->left = NULL;
-		m->right = NULL;
+	if (object->resident_page_count == 0) {
 		TAILQ_INSERT_TAIL(&object->memq, m, listq);
 	} else {
-		root = vm_page_splay(pindex, root);
-		if (pindex < root->pindex) {
-			m->left = root->left;
-			m->right = root;
-			root->left = NULL;
-			TAILQ_INSERT_BEFORE(root, m, listq);
-		} else if (pindex == root->pindex)
-			panic("vm_page_insert: offset already allocated");
-		else {
-			m->right = root->right;
-			m->left = root;
-			root->right = NULL;
-			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
-		}
+		neighbor = vm_radix_lookup_le(&object->rtree, pindex);
+		if (neighbor != NULL) {
+		    	KASSERT(pindex > neighbor->pindex,
+			    ("vm_page_insert: offset %ju less than %ju",
+			    (uintmax_t)pindex, (uintmax_t)neighbor->pindex));
+			TAILQ_INSERT_AFTER(&object->memq, neighbor, m, listq);
+		} else 
+			TAILQ_INSERT_HEAD(&object->memq, m, listq);
 	}
-	object->root = m;
+	vm_radix_insert(&object->rtree, m);
 
 	/*
 	 * Show that the object has one more resident page.
@@ -937,7 +872,6 @@ void
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t next, prev, root;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_lock_assert(m, MA_OWNED);
@@ -952,42 +886,7 @@ vm_page_remove(vm_page_t m)
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
-	if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
-		/*
-		 * Since the page's successor in the list is also its parent
-		 * in the tree, its right subtree must be empty.
-		 */
-		next->left = m->left;
-		KASSERT(m->right == NULL,
-		    ("vm_page_remove: page %p has right child", m));
-	} else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
-	    prev->right == m) {
-		/*
-		 * Since the page's predecessor in the list is also its parent
-		 * in the tree, its left subtree must be empty.
-		 */
-		KASSERT(m->left == NULL,
-		    ("vm_page_remove: page %p has left child", m));
-		prev->right = m->right;
-	} else {
-		if (m != object->root)
-			vm_page_splay(m->pindex, object->root);
-		if (m->left == NULL)
-			root = m->right;
-		else if (m->right == NULL)
-			root = m->left;
-		else {
-			/*
-			 * Move the page's successor to the root, because
-			 * pages are usually removed in ascending order.
-			 */
-			if (m->right != next)
-				vm_page_splay(m->pindex, m->right);
-			next->left = m->left;
-			root = next;
-		}
-		object->root = root;
-	}
+	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
@@ -1015,15 +914,9 @@ vm_page_remove(vm_page_t m)
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t m;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
-	if ((m = object->root) != NULL && m->pindex != pindex) {
-		m = vm_page_splay(pindex, m);
-		if ((object->root = m)->pindex != pindex)
-			m = NULL;
-	}
-	return (m);
+	return (vm_radix_lookup(&object->rtree, pindex));
 }
 
 /*
@@ -1040,13 +933,8 @@ vm_page_find_least(vm_object_t object, v
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
-	if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
-		if (m->pindex < pindex) {
-			m = vm_page_splay(pindex, object->root);
-			if ((object->root = m)->pindex < pindex)
-				m = TAILQ_NEXT(m, listq);
-		}
-	}
+	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
+		m = vm_radix_lookup_ge(&object->rtree, pindex);
 	return (m);
 }
 
@@ -1126,45 +1014,18 @@ vm_page_rename(vm_page_t m, vm_object_t 
 void
 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-	vm_page_t m, m_next;
+	vm_page_t m;
 	boolean_t empty;
 
 	mtx_lock(&vm_page_queue_free_mtx);
-	if (__predict_false(vm_object_cache_is_empty(object))) {
+	if (__predict_false(vm_radix_is_empty(&object->cache))) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return;
 	}
-	m = object->cache = vm_page_splay(start, object->cache);
-	if (m->pindex < start) {
-		if (m->right == NULL)
-			m = NULL;
-		else {
-			m_next = vm_page_splay(start, m->right);
-			m_next->left = m;
-			m->right = NULL;
-			m = object->cache = m_next;
-		}
-	}
-
-	/*
-	 * At this point, "m" is either (1) a reference to the page
-	 * with the least pindex that is greater than or equal to
-	 * "start" or (2) NULL.
-	 */
-	for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
-		/*
-		 * Find "m"'s successor and remove "m" from the
-		 * object's cache.
-		 */
-		if (m->right == NULL) {
-			object->cache = m->left;
-			m_next = NULL;
-		} else {
-			m_next = vm_page_splay(start, m->right);
-			m_next->left = m->left;
-			object->cache = m_next;
-		}
-		/* Convert "m" to a free page. */
+	while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
+		if (end != 0 && m->pindex >= end)
+			break;
+		vm_radix_remove(&object->cache, m->pindex);
 		m->object = NULL;
 		m->valid = 0;
 		/* Clear PG_CACHED and set PG_FREE. */
@@ -1174,7 +1035,7 @@ vm_page_cache_free(vm_object_t object, v
 		cnt.v_cache_count--;
 		cnt.v_free_count++;
 	}
-	empty = vm_object_cache_is_empty(object);
+	empty = vm_radix_is_empty(&object->cache);
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (object->type == OBJT_VNODE && empty)
 		vdrop(object->handle);
@@ -1189,15 +1050,9 @@ vm_page_cache_free(vm_object_t object, v
 static inline vm_page_t
 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t m;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	if ((m = object->cache) != NULL && m->pindex != pindex) {
-		m = vm_page_splay(pindex, m);
-		if ((object->cache = m)->pindex != pindex)
-			m = NULL;
-	}
-	return (m);
+	return (vm_radix_lookup(&object->cache, pindex));
 }
 
 /*
@@ -1209,28 +1064,11 @@ vm_page_cache_lookup(vm_object_t object,
 static void
 vm_page_cache_remove(vm_page_t m)
 {
-	vm_object_t object;
-	vm_page_t root;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT((m->flags & PG_CACHED) != 0,
 	    ("vm_page_cache_remove: page %p is not cached", m));
-	object = m->object;
-	if (m != object->cache) {
-		root = vm_page_splay(m->pindex, object->cache);
-		KASSERT(root == m,
-		    ("vm_page_cache_remove: page %p is not cached in object %p",
-		    m, object));
-	}
-	if (m->left == NULL)
-		root = m->right;
-	else if (m->right == NULL)
-		root = m->left;
-	else {
-		root = vm_page_splay(m->pindex, m->left);
-		root->right = m->right;
-	}
-	object->cache = root;
+	vm_radix_remove(&m->object->cache, m->pindex);
 	m->object = NULL;
 	cnt.v_cache_count--;
 }
@@ -1250,7 +1088,7 @@ void
 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
     vm_object_t new_object)
 {
-	vm_page_t m, m_next;
+	vm_page_t m;
 
 	/*
 	 * Insertion into an object's collection of cached pages
@@ -1258,53 +1096,24 @@ vm_page_cache_transfer(vm_object_t orig_
 	 * not.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
-	KASSERT(vm_object_cache_is_empty(new_object),
+	KASSERT(vm_radix_is_empty(&new_object->cache),
 	    ("vm_page_cache_transfer: object %p has cached pages",
 	    new_object));
 	mtx_lock(&vm_page_queue_free_mtx);
-	if ((m = orig_object->cache) != NULL) {
+	while ((m = vm_radix_lookup_ge(&orig_object->cache,
+	    offidxstart)) != NULL) {
 		/*
 		 * Transfer all of the pages with offset greater than or
 		 * equal to 'offidxstart' from the original object's
 		 * cache to the new object's cache.
 		 */
-		m = vm_page_splay(offidxstart, m);
-		if (m->pindex < offidxstart) {
-			orig_object->cache = m;
-			new_object->cache = m->right;
-			m->right = NULL;
-		} else {
-			orig_object->cache = m->left;
-			new_object->cache = m;
-			m->left = NULL;
-		}
-		while ((m = new_object->cache) != NULL) {
-			if ((m->pindex - offidxstart) >= new_object->size) {
-				/*
-				 * Return all of the cached pages with
-				 * offset greater than or equal to the
-				 * new object's size to the original
-				 * object's cache. 
-				 */
-				new_object->cache = m->left;
-				m->left = orig_object->cache;
-				orig_object->cache = m;
-				break;
-			}
-			m_next = vm_page_splay(m->pindex, m->right);
-			/* Update the page's object and offset. */
-			m->object = new_object;
-			m->pindex -= offidxstart;
-			if (m_next == NULL)
-				break;
-			m->right = NULL;
-			m_next->left = m;
-			new_object->cache = m_next;
-		}
-		KASSERT(vm_object_cache_is_empty(new_object) ||
-		    new_object->type == OBJT_SWAP,
-		    ("vm_page_cache_transfer: object %p's type is incompatible"
-		    " with cached pages", new_object));
+		if ((m->pindex - offidxstart) >= new_object->size)
+			break;
+		vm_radix_remove(&orig_object->cache, m->pindex);
+		/* Update the page's object and offset. */
+		m->object = new_object;
+		m->pindex -= offidxstart;
+		vm_radix_insert(&new_object->cache, m);
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
@@ -2324,7 +2133,7 @@ void
 vm_page_cache(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t next, prev, root;
+	boolean_t cache_was_empty;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	object = m->object;
@@ -2359,42 +2168,7 @@ vm_page_cache(vm_page_t m)
 	 * Remove the page from the object's collection of resident
 	 * pages. 
 	 */
-	if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
-		/*
-		 * Since the page's successor in the list is also its parent
-		 * in the tree, its right subtree must be empty.
-		 */
-		next->left = m->left;
-		KASSERT(m->right == NULL,
-		    ("vm_page_cache: page %p has right child", m));
-	} else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
-	    prev->right == m) {
-		/*
-		 * Since the page's predecessor in the list is also its parent
-		 * in the tree, its left subtree must be empty.
-		 */
-		KASSERT(m->left == NULL,
-		    ("vm_page_cache: page %p has left child", m));
-		prev->right = m->right;
-	} else {
-		if (m != object->root)
-			vm_page_splay(m->pindex, object->root);
-		if (m->left == NULL)
-			root = m->right;
-		else if (m->right == NULL)
-			root = m->left;
-		else {
-			/*
-			 * Move the page's successor to the root, because
-			 * pages are usually removed in ascending order.
-			 */
-			if (m->right != next)
-				vm_page_splay(m->pindex, m->right);
-			next->left = m->left;
-			root = next;
-		}
-		object->root = root;
-	}
+	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 	object->resident_page_count--;
 
@@ -2412,25 +2186,8 @@ vm_page_cache(vm_page_t m)
 	mtx_lock(&vm_page_queue_free_mtx);
 	m->flags |= PG_CACHED;
 	cnt.v_cache_count++;
-	root = object->cache;
-	if (root == NULL) {
-		m->left = NULL;
-		m->right = NULL;
-	} else {
-		root = vm_page_splay(m->pindex, root);
-		if (m->pindex < root->pindex) {
-			m->left = root->left;
-			m->right = root;
-			root->left = NULL;
-		} else if (__predict_false(m->pindex == root->pindex))
-			panic("vm_page_cache: offset already cached");
-		else {
-			m->right = root->right;
-			m->left = root;
-			root->right = NULL;
-		}
-	}
-	object->cache = m;
+	cache_was_empty = vm_radix_is_empty(&object->cache);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
