svn commit: r254065 - in head/sys: amd64/amd64 ofed/include/linux sparc64/sparc64 vm x86/acpica

Konstantin Belousov kib at FreeBSD.org
Wed Aug 7 16:36:40 UTC 2013


Author: kib
Date: Wed Aug  7 16:36:38 2013
New Revision: 254065
URL: http://svnweb.freebsd.org/changeset/base/254065

Log:
  Split the pagequeues per NUMA domain, and split the pagedaemon process
  into threads, each processing the queues of a single domain.  The
  structure of the pagedaemons and queues is kept intact; most of the
  changes come from the need for the code to find the owning page queue
  for a given page, which is calculated from the segment containing the
  page.
  
  The tie between the NUMA domains and the pagedaemon thread/pagequeue
  split is rather arbitrary: the multithreaded daemon could be allowed
  on single-domain machines, or one domain might be split into several
  page domains, to further increase concurrency.
  
  Right now, each pagedaemon thread tries to reach the global target,
  precalculated at the start of the pass.  This is not optimal, since it
  could cause excessive page deactivation and freeing.  The code should
  be changed to re-check the global page deficit state in the loop after
  some number of iterations.
  
  The pagedaemons reach a quorum before starting the OOM killer, since
  one thread's inability to meet the target is normal for split queues.
  Only when all pagedaemons fail to produce enough reusable pages is
  the OOM killer started, by a single selected thread.
  
  The laundering code is modified to take the segment layout into
  account with regard to the region for which cleaning is performed.
  
  Based on the preliminary patch by jeff, sponsored by EMC / Isilon
  Storage Division.
  
  Reviewed by:	alc
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
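
For orientation while reading the diff below: the per-page lookup described
in the log (finding the owning page queue for a given page from the segment
containing it) boils down to two small helpers.  vm_page_pagequeue() is
added in the vm_page.c hunk; vm_phys_domain(), which it relies on, lives in
the vm_phys.h changes that fall past the truncation point of this mail, so
the version shown here is only a sketch of the expected logic, not the
committed code.

/*
 * Sketch only: resolve the domain that owns a page via the segment
 * containing it.  This relies on vm_phys_segs[] being made global by
 * this change and on each segment recording its NUMA domain.
 */
static inline struct vm_domain *
vm_phys_domain(vm_page_t m)
{

        return (&vm_dom[vm_phys_segs[m->segind].domain]);
}

/*
 * As committed in the vm_page.c hunk: the pagequeue owning a page is
 * the queue of the page's domain, selected by the page's queue index.
 */
struct vm_pagequeue *
vm_page_pagequeue(vm_page_t m)
{

        return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
}

Everything that previously indexed the single global vm_pagequeues[] array
now goes through this per-domain lookup, which is why most of the hunks
below are mechanical substitutions.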

Modified:
  head/sys/amd64/amd64/minidump_machdep.c
  head/sys/ofed/include/linux/page.h
  head/sys/sparc64/sparc64/genassym.c
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_pageout.c
  head/sys/vm/vm_phys.c
  head/sys/vm/vm_phys.h
  head/sys/vm/vm_zeroidle.c
  head/sys/x86/acpica/srat.c

Modified: head/sys/amd64/amd64/minidump_machdep.c
==============================================================================
--- head/sys/amd64/amd64/minidump_machdep.c	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/amd64/amd64/minidump_machdep.c	Wed Aug  7 16:36:38 2013	(r254065)
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/msgbuf.h>
 #include <sys/watchdog.h>
 #include <vm/vm.h>
+#include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 #include <vm/pmap.h>

Modified: head/sys/ofed/include/linux/page.h
==============================================================================
--- head/sys/ofed/include/linux/page.h	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/ofed/include/linux/page.h	Wed Aug  7 16:36:38 2013	(r254065)
@@ -32,6 +32,7 @@
 
 #include <sys/param.h>
 
+#include <machine/atomic.h>
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 

Modified: head/sys/sparc64/sparc64/genassym.c
==============================================================================
--- head/sys/sparc64/sparc64/genassym.c	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/sparc64/sparc64/genassym.c	Wed Aug  7 16:36:38 2013	(r254065)
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/vmmeter.h>
 #include <sys/_cpuset.h>
 
+#include <machine/atomic.h>
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/vm/vm_page.c	Wed Aug  7 16:36:38 2013	(r254065)
@@ -64,8 +64,7 @@
  *			GENERAL RULES ON VM_PAGE MANIPULATION
  *
  *	- A page queue lock is required when adding or removing a page from a
- *	  page queue (vm_pagequeues[]), regardless of other locks or the
- *	  busy state of a page.
+ *	  page queue regardless of other locks or the busy state of a page.
  *
  *		* In general, no thread besides the page daemon can acquire or
  *		  hold more than one page queue lock at a time.
@@ -124,20 +123,7 @@ __FBSDID("$FreeBSD$");
  *	page structure.
  */
 
-struct vm_pagequeue vm_pagequeues[PQ_COUNT] = {
-	[PQ_INACTIVE] = {
-		.pq_pl = TAILQ_HEAD_INITIALIZER(
-		    vm_pagequeues[PQ_INACTIVE].pq_pl),
-		.pq_cnt = &cnt.v_inactive_count,
-		.pq_name = "vm inactive pagequeue"
-	},
-	[PQ_ACTIVE] = {
-		.pq_pl = TAILQ_HEAD_INITIALIZER(
-		    vm_pagequeues[PQ_ACTIVE].pq_pl),
-		.pq_cnt = &cnt.v_active_count,
-		.pq_name = "vm active pagequeue"
-	}
-};
+struct vm_domain vm_dom[MAXMEMDOM];
 struct mtx_padalign vm_page_queue_free_mtx;
 
 struct mtx_padalign pa_lock[PA_LOCK_COUNT];
@@ -256,6 +242,34 @@ vm_page_blacklist_lookup(char *list, vm_
 	return (0);
 }
 
+static void
+vm_page_domain_init(struct vm_domain *vmd)
+{
+	struct vm_pagequeue *pq;
+	int i;
+
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
+	    "vm inactive pagequeue";
+	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
+	    &cnt.v_inactive_count;
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
+	    "vm active pagequeue";
+	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
+	    &cnt.v_active_count;
+	vmd->vmd_fullintervalcount = 0;
+	vmd->vmd_page_count = 0;
+	vmd->vmd_free_count = 0;
+	vmd->vmd_segs = 0;
+	vmd->vmd_oom = FALSE;
+	vmd->vmd_pass = 0;
+	for (i = 0; i < PQ_COUNT; i++) {
+		pq = &vmd->vmd_pagequeues[i];
+		TAILQ_INIT(&pq->pq_pl);
+		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
+		    MTX_DEF | MTX_DUPOK);
+	}
+}
+
 /*
  *	vm_page_startup:
  *
@@ -319,8 +333,8 @@ vm_page_startup(vm_offset_t vaddr)
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
-	for (i = 0; i < PQ_COUNT; i++)
-		vm_pagequeue_init_lock(&vm_pagequeues[i]);
+	for (i = 0; i < vm_ndomains; i++)
+		vm_page_domain_init(&vm_dom[i]);
 
 	/*
 	 * Allocate memory for use when boot strapping the kernel memory
@@ -1055,7 +1069,7 @@ vm_page_cache_free(vm_object_t object, v
 		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
 		    ("vm_page_cache_free: page %p has inconsistent flags", m));
 		cnt.v_cache_count--;
-		cnt.v_free_count++;
+		vm_phys_freecnt_adj(m, 1);
 	}
 	empty = vm_radix_is_empty(&object->cache);
 	mtx_unlock(&vm_page_queue_free_mtx);
@@ -1311,7 +1325,7 @@ vm_page_alloc(vm_object_t object, vm_pin
 		    ("vm_page_alloc: page %p is not free", m));
 		KASSERT(m->valid == 0,
 		    ("vm_page_alloc: free page %p is valid", m));
-		cnt.v_free_count--;
+		vm_phys_freecnt_adj(m, -1);
 	}
 
 	/*
@@ -1569,7 +1583,7 @@ vm_page_alloc_init(vm_page_t m)
 		    ("vm_page_alloc_init: page %p is not free", m));
 		KASSERT(m->valid == 0,
 		    ("vm_page_alloc_init: free page %p is valid", m));
-		cnt.v_free_count--;
+		vm_phys_freecnt_adj(m, -1);
 		if ((m->flags & PG_ZERO) != 0)
 			vm_page_zero_count--;
 	}
@@ -1711,6 +1725,13 @@ vm_waitpfault(void)
 	    "pfault", 0);
 }
 
+struct vm_pagequeue *
+vm_page_pagequeue(vm_page_t m)
+{
+
+	return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+}
+
 /*
  *	vm_page_dequeue:
  *
@@ -1726,11 +1747,11 @@ vm_page_dequeue(vm_page_t m)
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_dequeue: page %p is not queued", m));
-	pq = &vm_pagequeues[m->queue];
+	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
-	(*pq->pq_cnt)--;
+	vm_pagequeue_cnt_dec(pq);
 	vm_pagequeue_unlock(pq);
 }
 
@@ -1747,11 +1768,11 @@ vm_page_dequeue_locked(vm_page_t m)
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
-	pq = &vm_pagequeues[m->queue];
+	pq = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pq);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
-	(*pq->pq_cnt)--;
+	vm_pagequeue_cnt_dec(pq);
 }
 
 /*
@@ -1767,11 +1788,11 @@ vm_page_enqueue(int queue, vm_page_t m)
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
-	pq = &vm_pagequeues[queue];
+	pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	m->queue = queue;
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
-	++*pq->pq_cnt;
+	vm_pagequeue_cnt_inc(pq);
 	vm_pagequeue_unlock(pq);
 }
 
@@ -1790,7 +1811,7 @@ vm_page_requeue(vm_page_t m)
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue: page %p is not queued", m));
-	pq = &vm_pagequeues[m->queue];
+	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
@@ -1811,7 +1832,7 @@ vm_page_requeue_locked(vm_page_t m)
 
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue_locked: page %p is not queued", m));
-	pq = &vm_pagequeues[m->queue];
+	pq = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pq);
 	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
@@ -1948,7 +1969,7 @@ vm_page_free_toq(vm_page_t m)
 		 */
 		mtx_lock(&vm_page_queue_free_mtx);
 		m->flags |= PG_FREE;
-		cnt.v_free_count++;
+		vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 		if (!vm_reserv_free_page(m))
 #else
@@ -2081,14 +2102,14 @@ _vm_page_deactivate(vm_page_t m, int ath
 		if (queue != PQ_NONE)
 			vm_page_dequeue(m);
 		m->flags &= ~PG_WINATCFLS;
-		pq = &vm_pagequeues[PQ_INACTIVE];
+		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
 		vm_pagequeue_lock(pq);
 		m->queue = PQ_INACTIVE;
 		if (athead)
 			TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq);
 		else
 			TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
-		cnt.v_inactive_count++;
+		vm_pagequeue_cnt_inc(pq);
 		vm_pagequeue_unlock(pq);
 	}
 }
@@ -2888,18 +2909,20 @@ DB_SHOW_COMMAND(page, vm_page_print_page
 
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
-		
-	db_printf("PQ_FREE:");
-	db_printf(" %d", cnt.v_free_count);
-	db_printf("\n");
-		
-	db_printf("PQ_CACHE:");
-	db_printf(" %d", cnt.v_cache_count);
-	db_printf("\n");
-
-	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
-		*vm_pagequeues[PQ_ACTIVE].pq_cnt,
-		*vm_pagequeues[PQ_INACTIVE].pq_cnt);
+	int dom;
+
+	db_printf("pq_free %d pq_cache %d\n",
+	    cnt.v_free_count, cnt.v_cache_count);
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		db_printf(
+	"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+		    dom,
+		    vm_dom[dom].vmd_page_count,
+		    vm_dom[dom].vmd_free_count,
+		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
+		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
+		    vm_dom[dom].vmd_pass);
+	}
 }
 
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)

Modified: head/sys/vm/vm_page.h
==============================================================================
--- head/sys/vm/vm_page.h	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/vm/vm_page.h	Wed Aug  7 16:36:38 2013	(r254065)
@@ -181,18 +181,44 @@ TAILQ_HEAD(pglist, vm_page);
 struct vm_pagequeue {
 	struct mtx	pq_mutex;
 	struct pglist	pq_pl;
-	int *const	pq_cnt;
-	const char *const pq_name;
+	int		pq_cnt;
+	int		* const pq_vcnt;
+	const char	* const pq_name;
 } __aligned(CACHE_LINE_SIZE);
 
-extern struct vm_pagequeue vm_pagequeues[PQ_COUNT];
+
+struct vm_domain {
+	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
+	int vmd_fullintervalcount;
+	u_int vmd_page_count;
+	u_int vmd_free_count;
+	long vmd_segs;	/* bitmask of the segments */
+	boolean_t vmd_oom;
+	int vmd_pass;	/* local pagedaemon pass */
+	struct vm_page vmd_marker; /* marker for pagedaemon private use */
+};
+
+extern struct vm_domain vm_dom[MAXMEMDOM];
 
 #define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
-#define	vm_pagequeue_init_lock(pq)	mtx_init(&(pq)->pq_mutex,	\
-	    (pq)->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK);
 #define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)
 #define	vm_pagequeue_unlock(pq)		mtx_unlock(&(pq)->pq_mutex)
 
+#ifdef _KERNEL
+static __inline void
+vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
+{
+
+#ifdef notyet
+	vm_pagequeue_assert_locked(pq);
+#endif
+	pq->pq_cnt += addend;
+	atomic_add_int(pq->pq_vcnt, addend);
+}
+#define	vm_pagequeue_cnt_inc(pq)	vm_pagequeue_cnt_add((pq), 1)
+#define	vm_pagequeue_cnt_dec(pq)	vm_pagequeue_cnt_add((pq), -1)
+#endif	/* _KERNEL */
+
 extern struct mtx_padalign vm_page_queue_free_mtx;
 extern struct mtx_padalign pa_lock[];
 
@@ -393,6 +419,7 @@ boolean_t vm_page_is_cached(vm_object_t 
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 vm_page_t vm_page_next(vm_page_t m);
 int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
+struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
 vm_page_t vm_page_prev(vm_page_t m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/vm/vm_pageout.c	Wed Aug  7 16:36:38 2013	(r254065)
@@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
+#include <sys/smp.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/rwlock.h>
@@ -103,6 +104,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
@@ -114,7 +116,8 @@ __FBSDID("$FreeBSD$");
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(int pass);
+static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
 
 struct proc *pageproc;
 
@@ -216,14 +219,15 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(int, int, vm_paddr_t, vm_paddr_t);
+static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
+    vm_paddr_t);
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void vm_req_vmdaemon(int req);
 #endif
 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
-static void vm_pageout_page_stats(void);
+static void vm_pageout_page_stats(struct vm_domain *vmd);
 
 /*
  * Initialize a dummy page for marking the caller's place in the specified
@@ -267,7 +271,7 @@ vm_pageout_fallback_object_lock(vm_page_
 
 	queue = m->queue;
 	vm_pageout_init_marker(&marker, queue);
-	pq = &vm_pagequeues[queue];
+	pq = vm_page_pagequeue(m);
 	object = m->object;
 	
 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
@@ -309,7 +313,7 @@ vm_pageout_page_lock(vm_page_t m, vm_pag
 
 	queue = m->queue;
 	vm_pageout_init_marker(&marker, queue);
-	pq = &vm_pagequeues[queue];
+	pq = vm_page_pagequeue(m);
 
 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
 	vm_pagequeue_unlock(pq);
@@ -567,21 +571,17 @@ vm_pageout_flush(vm_page_t *mc, int coun
 }
 
 static boolean_t
-vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
+vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
+    vm_paddr_t high)
 {
 	struct mount *mp;
-	struct vm_pagequeue *pq;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_tmp, next;
 
-	pq = &vm_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) {
-		KASSERT(m->queue == queue,
-		    ("vm_pageout_launder: page %p's queue is not %d", m,
-		    queue));
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
 		pa = VM_PAGE_TO_PHYS(m);
@@ -661,7 +661,8 @@ vm_pageout_launder(int queue, int tries,
 void
 vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
 {
-	int actl, actmax, inactl, inactmax;
+	int actl, actmax, inactl, inactmax, dom, initial_dom;
+	static int start_dom = 0;
 
 	if (tries > 0) {
 		/*
@@ -677,19 +678,55 @@ vm_pageout_grow_cache(int tries, vm_padd
 		 */
 		uma_reclaim();
 	}
+
+	/*
+	 * Make the next scan start on the next domain.
+	 */
+	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+
 	inactl = 0;
 	inactmax = cnt.v_inactive_count;
 	actl = 0;
 	actmax = tries < 2 ? 0 : cnt.v_active_count;
+	dom = initial_dom;
+
+	/*
+	 * Scan domains in round-robin order, first inactive queues,
+	 * then active.  Since domain usually owns large physically
+	 * contiguous chunk of memory, it makes sense to completely
+	 * exhaust one domain before switching to next, while growing
+	 * the pool of contiguous physical pages.
+	 *
+	 * Do not even start launder a domain which cannot contain
+	 * the specified address range, as indicated by segments
+	 * constituting the domain.
+	 */
 again:
-	if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low,
-	    high)) {
-		inactl++;
-		goto again;
-	}
-	if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) {
-		actl++;
-		goto again;
+	if (inactl < inactmax) {
+		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+		    low, high) &&
+		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
+		    tries, low, high)) {
+			inactl++;
+			goto again;
+		}
+		if (++dom == vm_ndomains)
+			dom = 0;
+		if (dom != initial_dom)
+			goto again;
+	}
+	if (actl < actmax) {
+		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+		    low, high) &&
+		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
+		      tries, low, high)) {
+			actl++;
+			goto again;
+		}
+		if (++dom == vm_ndomains)
+			dom = 0;
+		if (dom != initial_dom)
+			goto again;
 	}
 }
 
@@ -861,10 +898,9 @@ vm_pageout_map_deactivate_pages(map, des
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  */
 static void
-vm_pageout_scan(int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass)
 {
 	vm_page_t m, next;
-	struct vm_page marker;
 	struct vm_pagequeue *pq;
 	int page_shortage, maxscan, pcount;
 	int addl_page_shortage;
@@ -874,8 +910,6 @@ vm_pageout_scan(int pass)
 	int maxlaunder;
 	boolean_t queues_locked;
 
-	vm_pageout_init_marker(&marker, PQ_INACTIVE);
-
 	/*
 	 * Decrease registered cache sizes.
 	 */
@@ -888,7 +922,7 @@ vm_pageout_scan(int pass)
 	/*
 	 * The addl_page_shortage is the number of temporarily
 	 * stuck pages in the inactive queue.  In other words, the
-	 * number of pages from cnt.v_inactive_count that should be
+	 * number of pages from the inactive count that should be
 	 * discounted in setting the target for the active queue scan.
 	 */
 	addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit);
@@ -914,8 +948,6 @@ vm_pageout_scan(int pass)
 	if (pass)
 		maxlaunder = 10000;
 
-	maxscan = cnt.v_inactive_count;
-
 	/*
 	 * Start scanning the inactive queue for pages we can move to the
 	 * cache or free.  The scan will stop when the target is reached or
@@ -923,7 +955,8 @@ vm_pageout_scan(int pass)
 	 * is not used to form decisions for the inactive queue, only for the
 	 * active queue.
 	 */
-	pq = &vm_pagequeues[PQ_INACTIVE];
+	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+	maxscan = pq->pq_cnt;
 	vm_pagequeue_lock(pq);
 	queues_locked = TRUE;
 	for (m = TAILQ_FIRST(&pq->pq_pl);
@@ -984,7 +1017,7 @@ vm_pageout_scan(int pass)
 		 * 'next' pointer.  Use our marker to remember our
 		 * place.
 		 */
-		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
+		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, pageq);
 		vm_pagequeue_unlock(pq);
 		queues_locked = FALSE;
 
@@ -1034,7 +1067,7 @@ vm_pageout_scan(int pass)
 			/*
 			 * Held pages are essentially stuck in the
 			 * queue.  So, they ought to be discounted
-			 * from cnt.v_inactive_count.  See the
+			 * from the inactive count.  See the
 			 * calculation of the page_shortage for the
 			 * loop over the active queue below.
 			 */
@@ -1178,7 +1211,7 @@ vm_pageout_scan(int pass)
 				 */
 				if (m->queue != PQ_INACTIVE ||
 				    m->object != object ||
-				    TAILQ_NEXT(m, pageq) != &marker) {
+				    TAILQ_NEXT(m, pageq) != &vmd->vmd_marker) {
 					vm_page_unlock(m);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
@@ -1248,8 +1281,8 @@ relock_queues:
 			vm_pagequeue_lock(pq);
 			queues_locked = TRUE;
 		}
-		next = TAILQ_NEXT(&marker, pageq);
-		TAILQ_REMOVE(&pq->pq_pl, &marker, pageq);
+		next = TAILQ_NEXT(&vmd->vmd_marker, pageq);
+		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, pageq);
 	}
 	vm_pagequeue_unlock(pq);
 
@@ -1258,7 +1291,7 @@ relock_queues:
 	 * active queue to the inactive queue.
 	 */
 	page_shortage = vm_paging_target() +
-		cnt.v_inactive_target - cnt.v_inactive_count;
+	    cnt.v_inactive_target - cnt.v_inactive_count;
 	page_shortage += addl_page_shortage;
 
 	/*
@@ -1266,8 +1299,8 @@ relock_queues:
 	 * track the per-page activity counter and use it to locate
 	 * deactivation candidates.
 	 */
-	pcount = cnt.v_active_count;
-	pq = &vm_pagequeues[PQ_ACTIVE];
+	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+	pcount = pq->pq_cnt;
 	vm_pagequeue_lock(pq);
 	m = TAILQ_FIRST(&pq->pq_pl);
 	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
@@ -1393,12 +1426,54 @@ relock_queues:
 	 * chance to flush out dirty vnode-backed pages and to allow
 	 * active pages to be moved to the inactive queue and reclaimed.
 	 */
-	if (pass != 0 &&
-	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
-	     (swap_pager_full && vm_paging_target() > 0)))
-		vm_pageout_oom(VM_OOM_MEM);
+	vm_pageout_mightbe_oom(vmd, pass);
 }
 
+static int vm_pageout_oom_vote;
+
+/*
+ * The pagedaemon threads randomly select one to perform the
+ * OOM.  Trying to kill processes before all pagedaemons
+ * have failed to reach the free target is premature.
+ */
+static void
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
+{
+	int old_vote;
+
+	if (pass == 0 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
+	    (swap_pager_full && vm_paging_target() > 0))) {
+		if (vmd->vmd_oom) {
+			vmd->vmd_oom = FALSE;
+			atomic_subtract_int(&vm_pageout_oom_vote, 1);
+		}
+		return;
+	}
+
+	if (vmd->vmd_oom)
+		return;
+
+	vmd->vmd_oom = TRUE;
+	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
+	if (old_vote != vm_ndomains - 1)
+		return;
+
+	/*
+	 * The current pagedaemon thread is the last in the quorum to
+	 * start OOM.  Initiate the selection and signaling of the
+	 * victim.
+	 */
+	vm_pageout_oom(VM_OOM_MEM);
+
+	/*
+	 * After one round of OOM terror, recall our vote.  On the
+	 * next pass, current pagedaemon would vote again if the low
+	 * memory condition is still there, due to vmd_oom being
+	 * false.
+	 */
+	vmd->vmd_oom = FALSE;
+	atomic_subtract_int(&vm_pageout_oom_vote, 1);
+}
 
 void
 vm_pageout_oom(int shortage)
@@ -1501,14 +1576,13 @@ vm_pageout_oom(int shortage)
  * helps the situation where paging just starts to occur.
  */
 static void
-vm_pageout_page_stats(void)
+vm_pageout_page_stats(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_t m, next;
 	int pcount, tpcount;		/* Number of pages to check */
-	static int fullintervalcount = 0;
-	int page_shortage;
+	int actcount, page_shortage;
 
 	page_shortage = 
 	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
@@ -1517,25 +1591,30 @@ vm_pageout_page_stats(void)
 	if (page_shortage <= 0)
 		return;
 
-	pcount = cnt.v_active_count;
-	fullintervalcount += vm_pageout_stats_interval;
-	if (fullintervalcount < vm_pageout_full_stats_interval) {
-		vm_pageout_stats++;
-		tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
-		    cnt.v_page_count;
+	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+
+	/*
+	 * pcount limits the depth of the queue scan.  In particular,
+	 * for the full scan, it prevents the iteration from looking
+	 * into the requeued pages.  The limit is not exact since the
+	 * page queue lock is dropped during the iteration.
+	 */
+	pcount = pq->pq_cnt;
+	vmd->vmd_fullintervalcount += vm_pageout_stats_interval;
+	if (vmd->vmd_fullintervalcount < vm_pageout_full_stats_interval) {
+		atomic_add_int(&vm_pageout_stats, 1);
+		tpcount = (int64_t)vm_pageout_stats_max * pcount /
+		    vmd->vmd_page_count;
 		if (pcount > tpcount)
 			pcount = tpcount;
 	} else {
-		vm_pageout_full_stats++;
-		fullintervalcount = 0;
+		atomic_add_int(&vm_pageout_full_stats, 1);
+		vmd->vmd_fullintervalcount = 0;
 	}
 
-	pq = &vm_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
 	m = TAILQ_FIRST(&pq->pq_pl);
-	while ((m != NULL) && (pcount-- > 0)) {
-		int actcount;
-
+	while (m != NULL && pcount-- > 0) {
 		KASSERT(m->queue == PQ_ACTIVE,
 		    ("vm_pageout_page_stats: page %p isn't active", m));
 
@@ -1560,11 +1639,11 @@ vm_pageout_page_stats(void)
 		}
 
 		/*
-		 * Don't deactivate pages that are busy.
+		 * Don't deactivate pages that are busy or held.
 		 */
-		if ((m->busy != 0) ||
-		    (m->oflags & VPO_BUSY) ||
-		    (m->hold_count != 0)) {
+		if (m->busy != 0 ||
+		    (m->oflags & VPO_BUSY) != 0 ||
+		    m->hold_count != 0) {
 			vm_page_unlock(m);
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_requeue_locked(m);
@@ -1579,7 +1658,7 @@ vm_pageout_page_stats(void)
 		}
 
 		actcount += pmap_ts_referenced(m);
-		if (actcount) {
+		if (actcount != 0) {
 			m->act_count += ACT_ADVANCE + actcount;
 			if (m->act_count > ACT_MAX)
 				m->act_count = ACT_MAX;
@@ -1611,13 +1690,105 @@ vm_pageout_page_stats(void)
 	vm_pagequeue_unlock(pq);
 }
 
+static void
+vm_pageout_worker(void *arg)
+{
+	struct vm_domain *domain;
+	struct pcpu *pc;
+	int cpu, error, domidx;
+
+	domidx = (uintptr_t)arg;
+	domain = &vm_dom[domidx];
+
+	/*
+	 * XXXKIB The bind is rather arbitrary.  With some minor
+	 * complications, we could assign the cpuset consisting of all
+	 * CPUs in the same domain.  In fact, it even does not matter
+	 * if the CPU we bind to is in the affinity domain of this
+	 * page queue, we only need to establish the fair distribution
+	 * of pagedaemon threads among CPUs.
+	 *
+	 * XXXKIB It would be useful to allocate vm_pages for the
+	 * domain from the domain, and put pcpu area into the page
+	 * owned by the domain.
+	 */
+	if (mem_affinity != NULL) {
+		CPU_FOREACH(cpu) {
+			pc = pcpu_find(cpu);
+			if (pc->pc_domain == domidx) {
+				thread_lock(curthread);
+				sched_bind(curthread, cpu);
+				thread_unlock(curthread);
+				break;
+			}
+		}
+	}
+
+	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
+
+	/*
+	 * The pageout daemon worker is never done, so loop forever.
+	 */
+	while (TRUE) {
+		/*
+		 * If we have enough free memory, wakeup waiters.  Do
+		 * not clear vm_pages_needed until we reach our target,
+		 * otherwise we may be woken up over and over again and
+		 * waste a lot of cpu.
+		 */
+		mtx_lock(&vm_page_queue_free_mtx);
+		if (vm_pages_needed && !vm_page_count_min()) {
+			if (!vm_paging_needed())
+				vm_pages_needed = 0;
+			wakeup(&cnt.v_free_count);
+		}
+		if (vm_pages_needed) {
+			/*
+			 * Still not done, take a second pass without waiting
+			 * (unlimited dirty cleaning), otherwise sleep a bit
+			 * and try again.
+			 */
+			++(domain->vmd_pass);
+			if (domain->vmd_pass > 1)
+				msleep(&vm_pages_needed,
+				    &vm_page_queue_free_mtx, PVM, "psleep",
+				    hz / 2);
+		} else {
+			/*
+			 * Good enough, sleep & handle stats.  Prime the pass
+			 * for the next run.
+			 */
+			if (domain->vmd_pass > 1)
+				domain->vmd_pass = 1;
+			else
+				domain->vmd_pass = 0;
+			error = msleep(&vm_pages_needed,
+			    &vm_page_queue_free_mtx, PVM, "psleep",
+			    vm_pageout_stats_interval * hz);
+			if (error && !vm_pages_needed) {
+				mtx_unlock(&vm_page_queue_free_mtx);
+				domain->vmd_pass = 0;
+				vm_pageout_page_stats(domain);
+				continue;
+			}
+		}
+		if (vm_pages_needed)
+			cnt.v_pdwakeups++;
+		mtx_unlock(&vm_page_queue_free_mtx);
+		vm_pageout_scan(domain, domain->vmd_pass);
+	}
+}
+
 /*
  *	vm_pageout is the high level pageout daemon.
  */
 static void
 vm_pageout(void)
 {
-	int error, pass;
+#if MAXMEMDOM > 1
+	int error, i;
+#endif
 
 	/*
 	 * Initialize some paging parameters.
@@ -1687,58 +1858,17 @@ vm_pageout(void)
 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
 
 	swap_pager_swap_init();
-	pass = 0;
-	/*
-	 * The pageout daemon is never done, so loop forever.
-	 */
-	while (TRUE) {
-		/*
-		 * If we have enough free memory, wakeup waiters.  Do
-		 * not clear vm_pages_needed until we reach our target,
-		 * otherwise we may be woken up over and over again and
-		 * waste a lot of cpu.
-		 */
-		mtx_lock(&vm_page_queue_free_mtx);
-		if (vm_pages_needed && !vm_page_count_min()) {
-			if (!vm_paging_needed())
-				vm_pages_needed = 0;
-			wakeup(&cnt.v_free_count);
+#if MAXMEMDOM > 1
+	for (i = 1; i < vm_ndomains; i++) {
+		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
+		    curproc, NULL, 0, 0, "dom%d", i);
+		if (error != 0) {
+			panic("starting pageout for domain %d, error %d\n",
+			    i, error);
 		}
-		if (vm_pages_needed) {
-			/*
-			 * Still not done, take a second pass without waiting
-			 * (unlimited dirty cleaning), otherwise sleep a bit
-			 * and try again.
-			 */
-			++pass;
-			if (pass > 1)
-				msleep(&vm_pages_needed,
-				    &vm_page_queue_free_mtx, PVM, "psleep",
-				    hz / 2);
-		} else {
-			/*
-			 * Good enough, sleep & handle stats.  Prime the pass
-			 * for the next run.
-			 */
-			if (pass > 1)
-				pass = 1;
-			else
-				pass = 0;
-			error = msleep(&vm_pages_needed,
-			    &vm_page_queue_free_mtx, PVM, "psleep",
-			    vm_pageout_stats_interval * hz);
-			if (error && !vm_pages_needed) {
-				mtx_unlock(&vm_page_queue_free_mtx);
-				pass = 0;
-				vm_pageout_page_stats();
-				continue;
-			}
-		}
-		if (vm_pages_needed)
-			cnt.v_pdwakeups++;
-		mtx_unlock(&vm_page_queue_free_mtx);
-		vm_pageout_scan(pass);
 	}
+#endif
+	vm_pageout_worker((uintptr_t)0);
 }
 
 /*

Modified: head/sys/vm/vm_phys.c
==============================================================================
--- head/sys/vm/vm_phys.c	Wed Aug  7 16:33:15 2013	(r254064)
+++ head/sys/vm/vm_phys.c	Wed Aug  7 16:36:38 2013	(r254065)
@@ -65,26 +65,15 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
-struct vm_freelist {
-	struct pglist pl;
-	int lcnt;
-};
-
-struct vm_phys_seg {
-	vm_paddr_t	start;
-	vm_paddr_t	end;
-	vm_page_t	first_page;
-	int		domain;
-	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
-};
+_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
+    "Too many physsegs.");
 
 struct mem_affinity *mem_affinity;
 
 int vm_ndomains = 1;
 
-static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
-
-static int vm_phys_nsegs;
+struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+int vm_phys_nsegs;
 
 #define VM_PHYS_FICTITIOUS_NSEGS	8
 static struct vm_phys_fictitious_seg {
@@ -140,6 +129,22 @@ vm_rr_selectdomain(void)
 #endif
 }
 
+boolean_t
+vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
+{
+	struct vm_phys_seg *s;
+	int idx;
+
+	while ((idx = ffsl(mask)) != 0) {
+		idx--;	/* ffsl counts from 1 */
+		mask &= ~(1UL << idx);
+		s = &vm_phys_segs[idx];
+		if (low < s->end && high > s->start)
+			return (TRUE);
+	}
+	return (FALSE);
+}
+
 /*
  * Outputs the state of the physical memory allocator, specifically,
  * the amount of physical memory in each free list.
@@ -378,12 +383,16 @@ void
 vm_phys_add_page(vm_paddr_t pa)
 {
 	vm_page_t m;
+	struct vm_domain *vmd;
 
 	cnt.v_page_count++;
 	m = vm_phys_paddr_to_vm_page(pa);
 	m->phys_addr = pa;
 	m->queue = PQ_NONE;
 	m->segind = vm_phys_paddr_to_segind(pa);
+	vmd = vm_phys_domain(m);
+	vmd->vmd_page_count++;
+	vmd->vmd_segs |= 1UL << m->segind;
 	m->flags = PG_FREE;
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_add_page: page %p has unexpected order %d",
@@ -391,7 +400,7 @@ vm_phys_add_page(vm_paddr_t pa)
 	m->pool = VM_FREEPOOL_DEFAULT;
 	pmap_page_init(m);
 	mtx_lock(&vm_page_queue_free_mtx);
-	cnt.v_free_count++;
+	vm_phys_freecnt_adj(m, 1);
 	vm_phys_free_pages(m, 0);
 	mtx_unlock(&vm_page_queue_free_mtx);
 }

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
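
The vm_phys.h changes and the remainder of the vm_phys.c hunk fall past the
truncation point.  One helper defined there, vm_phys_freecnt_adj(), appears
throughout the visible hunks in place of the old direct cnt.v_free_count
updates; based only on those call sites (always under vm_page_queue_free_mtx,
with the per-domain vmd_free_count introduced in vm_page.h tracking the same
quantity), a plausible sketch of it is:

/*
 * Sketch based on the visible call sites -- the committed definition
 * is in the truncated vm_phys.h hunk.  Adjust both the global and the
 * per-domain free page counts; callers hold vm_page_queue_free_mtx.
 */
static inline void
vm_phys_freecnt_adj(vm_page_t m, int adj)
{

        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        cnt.v_free_count += adj;
        vm_phys_domain(m)->vmd_free_count += adj;
}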

