svn commit: r321506 - in projects/numa2: lib/libmemstat sys/amd64/amd64 sys/arm64/arm64 sys/i386/i386 sys/kern sys/mips/mips sys/powerpc/aim sys/powerpc/powerpc sys/riscv/riscv sys/sparc64/sparc64 ...

Author: markj
Date: Wed Jul 26 05:01:46 2017
New Revision: 321506
URL: https://svnweb.freebsd.org/changeset/base/321506

Log:
  Reintegrate most of the work done on projects/numa.
  
  This largely consists of plumbing a domain parameter through various
  layers of the VM code. In particular:
  - Domain selection is moved out of the physical memory allocator and
    into the page allocation routines. The page allocator API is extended
    to support allocation from a requested domain. The existing API now
    makes use of per-VM-object and per-thread policies to select a domain
    from which to allocate pages. (A sketch of the extended backend API
    follows this list.)
  - The reservation allocator now accepts a domain parameter. The global
    queue of partially populated reservations is split into one queue per
    domain. This lets us specify a particular domain when attempting to
    reclaim memory by breaking partially populated reservations.
  - Similarly, each UMA zone now maintains a bucket cache per domain,
    rather than a global cache. Each UMA keg now maintains per-domain
    lists of slabs. A new function, uma_zone_set_domain_selector(), allows
    one to specify a domain selection policy for allocating slabs.
    Currently this is used to ensure that per-CPU item buckets are
    allocated from the domain local to the corresponding CPU.
  - VM objects contain a domain selector whose policy is determined by
    the system default. Currently, when allocating a page with
    vm_page_alloc(), the object's policy is preferred over the thread's.
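  
  As a rough sketch of the extended backend API (assumptions: it mirrors
  the amd64 uma_small_alloc() hunk below, an amd64-style direct map is
  used for the mapping, and the M_WAITOK retry loop is elided):
  
  	#include <sys/param.h>
  	#include <sys/malloc.h>
  	#include <vm/vm.h>
  	#include <vm/vm_page.h>
  	#include <vm/uma.h>
  	#include <machine/vmparam.h>
  
  	static void *
  	example_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
  	    u_int8_t *flags, int wait)
  	{
  		vm_page_t m;
  
  		*flags = UMA_SLAB_PRIV;
  		/*
  		 * UMA resolves UMA_ANYDOMAIN to a concrete domain (via the
  		 * keg's round-robin cursor in keg_fetch_slab()) before the
  		 * backend runs, so "domain" is always a valid domain here.
  		 */
  		m = vm_page_alloc_domain(NULL, 0, domain,
  		    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
  		if (m == NULL)
  			return (NULL);
  		return ((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
  	}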
  
  This commit does not fully reconcile the work done in r285387 with the
  contents of projects/numa. In particular, I implemented vm_page_alloc()
  to prefer the object's selection policy over the thread's. Thus,
  per-thread and per-process policies specified using numactl are not
  always honoured.
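  
  To make the preference concrete, the selection amounts to roughly the
  following (a hypothetical sketch, not the committed code: the vm_page.c
  hunk is truncated below, and "obj_dom_iter" is an invented name for the
  object's selector; td_dom_selector and vm_domain_select_first() do
  appear in the diff):
  
  	static int
  	example_select_domain(vm_object_t object)
  	{
  		struct vm_domain_iterator *vi;
  
  		/*
  		 * Prefer the object's selector whenever it carries a
  		 * policy; fall back to the thread's.  This is why
  		 * numactl-installed thread policies are not always
  		 * honoured.
  		 */
  		if (object != NULL &&
  		    object->obj_dom_iter.policy != VM_POLICY_NONE)
  			vi = &object->obj_dom_iter;
  		else
  			vi = &curthread->td_dom_selector;
  		return (vm_domain_select_first(vi));
  	}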
  
  Omitted from projects/numa is support for using a bitset to define a
  subset of VM domains that may be selected by a given policy. It's not
  yet clear to me how that should be integrated with r285387.
  
  Some aspects of NUMA-awareness are not yet addressed. For example, we
  currently create one page daemon per domain, but the targets used by the
  page daemons for reclaiming memory are based on the global free page
  count.
  
  Submitted by:	attilio, jeff (original version)
  Obtained from:	projects/numa

Modified:
  projects/numa2/lib/libmemstat/memstat_uma.c
  projects/numa2/sys/amd64/amd64/uma_machdep.c
  projects/numa2/sys/arm64/arm64/uma_machdep.c
  projects/numa2/sys/i386/i386/pmap.c
  projects/numa2/sys/kern/init_main.c
  projects/numa2/sys/kern/kern_fork.c
  projects/numa2/sys/kern/kern_mbuf.c
  projects/numa2/sys/kern/kern_numa.c
  projects/numa2/sys/kern/kern_thread.c
  projects/numa2/sys/kern/subr_busdma_bufalloc.c
  projects/numa2/sys/kern/subr_vmem.c
  projects/numa2/sys/kern/vfs_bio.c
  projects/numa2/sys/mips/mips/pmap.c
  projects/numa2/sys/mips/mips/uma_machdep.c
  projects/numa2/sys/powerpc/aim/mmu_oea64.c
  projects/numa2/sys/powerpc/aim/slb.c
  projects/numa2/sys/powerpc/powerpc/uma_machdep.c
  projects/numa2/sys/riscv/riscv/uma_machdep.c
  projects/numa2/sys/sparc64/sparc64/vm_machdep.c
  projects/numa2/sys/sys/_vm_domain.h
  projects/numa2/sys/sys/busdma_bufalloc.h
  projects/numa2/sys/sys/proc.h
  projects/numa2/sys/vm/uma.h
  projects/numa2/sys/vm/uma_core.c
  projects/numa2/sys/vm/uma_int.h
  projects/numa2/sys/vm/vm_domain.c
  projects/numa2/sys/vm/vm_domain.h
  projects/numa2/sys/vm/vm_object.c
  projects/numa2/sys/vm/vm_object.h
  projects/numa2/sys/vm/vm_page.c
  projects/numa2/sys/vm/vm_page.h
  projects/numa2/sys/vm/vm_phys.c
  projects/numa2/sys/vm/vm_phys.h
  projects/numa2/sys/vm/vm_reserv.c
  projects/numa2/sys/vm/vm_reserv.h

Modified: projects/numa2/lib/libmemstat/memstat_uma.c
==============================================================================
--- projects/numa2/lib/libmemstat/memstat_uma.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/lib/libmemstat/memstat_uma.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -53,6 +53,8 @@ static struct nlist namelist[] = {
 	{ .n_name = "_mp_maxid" },
 #define	X_ALL_CPUS	2
 	{ .n_name = "_all_cpus" },
+#define	X_VM_NDOMAINS	3
+	{ .n_name = "_vm_ndomains" },
 	{ .n_name = "" },
 };
 
@@ -299,7 +301,7 @@ memstat_kvm_uma(struct memory_type_list *list, void *k
 	struct uma_cache *ucp, *ucp_array;
 	struct uma_zone *uzp, uz;
 	struct uma_keg *kzp, kz;
-	int hint_dontsearch, i, mp_maxid, ret;
+	int hint_dontsearch, i, mp_maxid, ndomains, ret;
 	char name[MEMTYPE_MAXNAME];
 	cpuset_t all_cpus;
 	long cpusetsize;
@@ -321,6 +323,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *k
 		list->mtl_error = ret;
 		return (-1);
 	}
+	ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains,
+	    sizeof(ndomains), 0);
+	if (ret != 0) {
+		list->mtl_error = ret;
+		return (-1);
+	}
 	ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0);
 	if (ret != 0) {
 		list->mtl_error = ret;
@@ -445,11 +453,15 @@ skip_percpu:
 				    kz.uk_ipers;
 			mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size;
 			mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees;
-			for (ubp = LIST_FIRST(&uz.uz_buckets); ubp !=
-			    NULL; ubp = LIST_NEXT(&ub, ub_link)) {
-				ret = kread(kvm, ubp, &ub, sizeof(ub), 0);
-				mtp->mt_zonefree += ub.ub_cnt;
-			}
+			for (i = 0; i < ndomains; i++)
+				for (ubp =
+				    LIST_FIRST(&uz.uz_domain[i].uzd_buckets);
+				    ubp != NULL;
+				    ubp = LIST_NEXT(&ub, ub_link)) {
+					ret = kread(kvm, ubp, &ub, sizeof(ub),
+					    0);
+					mtp->mt_zonefree += ub.ub_cnt;
+				}
 			if (!((kz.uk_flags & UMA_ZONE_SECONDARY) &&
 			    LIST_FIRST(&kz.uk_zones) != uzp)) {
 				mtp->mt_kegfree = kz.uk_free;

Modified: projects/numa2/sys/amd64/amd64/uma_machdep.c
==============================================================================
--- projects/numa2/sys/amd64/amd64/uma_machdep.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/amd64/amd64/uma_machdep.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -42,7 +42,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_page_t m;
 	vm_paddr_t pa;
@@ -52,7 +53,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_in
 	*flags = UMA_SLAB_PRIV;
 	pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
 	for (;;) {
-		m = vm_page_alloc(NULL, 0, pflags);
+		m = vm_page_alloc_domain(NULL, 0, domain, pflags);
 		if (m == NULL) {
 			if (wait & M_NOWAIT)
 				return (NULL);

Modified: projects/numa2/sys/arm64/arm64/uma_machdep.c
==============================================================================
--- projects/numa2/sys/arm64/arm64/uma_machdep.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/arm64/arm64/uma_machdep.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -42,7 +42,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_page_t m;
 	vm_paddr_t pa;

Modified: projects/numa2/sys/i386/i386/pmap.c
==============================================================================
--- projects/numa2/sys/i386/i386/pmap.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/i386/i386/pmap.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -337,8 +337,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offs
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
 #if defined(PAE) || defined(PAE_TABLES)
-static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
-    int wait);
+static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
+    uint8_t *flags, int wait);
 #endif
 static void pmap_set_pg(void);
 
@@ -698,7 +698,8 @@ pmap_page_init(vm_page_t m)
 
 #if defined(PAE) || defined(PAE_TABLES)
 static void *
-pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
+    int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */

Modified: projects/numa2/sys/kern/init_main.c
==============================================================================
--- projects/numa2/sys/kern/init_main.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/init_main.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -484,6 +484,8 @@ proc0_init(void *dummy __unused)
 	td->td_cpuset = cpuset_thread0();
 	vm_domain_policy_init(&td->td_vm_dom_policy);
 	vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1);
+	vm_domain_iterator_set_policy(&td->td_dom_selector,
+	    &td->td_vm_dom_policy);
 	vm_domain_policy_init(&p->p_vm_dom_policy);
 	vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1);
 	prison0_init();

Modified: projects/numa2/sys/kern/kern_fork.c
==============================================================================
--- projects/numa2/sys/kern/kern_fork.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/kern_fork.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -509,6 +509,8 @@ do_fork(struct thread *td, struct fork_req *fr, struct
 	vm_domain_policy_init(&p2->p_vm_dom_policy);
 	vm_domain_policy_localcopy(&p2->p_vm_dom_policy,
 	    &p1->p_vm_dom_policy);
+	vm_domain_iterator_set_policy(&td2->td_dom_selector,
+	    &p2->p_vm_dom_policy);
 
 	if (fr->fr_flags & RFSIGSHARE) {
 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);

Modified: projects/numa2/sys/kern/kern_mbuf.c
==============================================================================
--- projects/numa2/sys/kern/kern_mbuf.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/kern_mbuf.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -281,7 +281,7 @@ static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
-static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
@@ -384,7 +384,8 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, 
  * pages.
  */
 static void *
-mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
+mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
+    int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */

Modified: projects/numa2/sys/kern/kern_numa.c
==============================================================================
--- projects/numa2/sys/kern/kern_numa.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/kern_numa.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -102,13 +102,22 @@ sys_numa_setaffinity(struct thread *td, struct numa_se
 	/*
 	 * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset,
 	 * it'll return ESRCH.  We should just return EINVAL.
+	 *
+	 * XXXMJ nothing synchronizes updates to the thread iterators.
 	 */
 	switch (uap->which) {
 	case CPU_WHICH_TID:
 		vm_domain_policy_copy(&ttd->td_vm_dom_policy, &vp);
+		vm_domain_iterator_set_policy(&ttd->td_dom_selector, &vp);
 		break;
 	case CPU_WHICH_PID:
 		vm_domain_policy_copy(&p->p_vm_dom_policy, &vp);
+		PROC_LOCK(p);
+		FOREACH_THREAD_IN_PROC(p, ttd) {
+			vm_domain_iterator_set_policy(&ttd->td_dom_selector,
+			    &vp);
+		}
+		PROC_UNLOCK(p);
 		break;
 	default:
 		error = EINVAL;

Modified: projects/numa2/sys/kern/kern_thread.c
==============================================================================
--- projects/numa2/sys/kern/kern_thread.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/kern_thread.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -80,9 +80,9 @@ _Static_assert(offsetof(struct thread, td_flags) == 0x
     "struct thread KBI td_flags");
 _Static_assert(offsetof(struct thread, td_pflags) == 0xfc,
     "struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x460,
+_Static_assert(offsetof(struct thread, td_frame) == 0x468,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x508,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x510,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb0,
     "struct proc KBI p_flag");

Modified: projects/numa2/sys/kern/subr_busdma_bufalloc.c
==============================================================================
--- projects/numa2/sys/kern/subr_busdma_bufalloc.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/subr_busdma_bufalloc.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -147,7 +147,7 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_siz
 }
 
 void *
-busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, int domain,
     uint8_t *pflag, int wait)
 {
 #ifdef VM_MEMATTR_UNCACHEABLE

Modified: projects/numa2/sys/kern/subr_vmem.c
==============================================================================
--- projects/numa2/sys/kern/subr_vmem.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/subr_vmem.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -495,7 +495,7 @@ bt_insfree(vmem_t *vm, bt_t *bt)
  * Import from the arena into the quantum cache in UMA.
  */
 static int
-qc_import(void *arg, void **store, int cnt, int flags)
+qc_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	qcache_t *qc;
 	vmem_addr_t addr;
@@ -609,7 +609,8 @@ static struct mtx_padalign vmem_bt_lock;
  * we are really out of KVA.
  */
 static void *
-vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
+vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+    int wait)
 {
 	vmem_addr_t addr;
 

Modified: projects/numa2/sys/kern/vfs_bio.c
==============================================================================
--- projects/numa2/sys/kern/vfs_bio.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/kern/vfs_bio.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -129,7 +129,7 @@ static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
-static int buf_import(void *, void **, int, int);
+static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 
@@ -1415,7 +1415,7 @@ buf_free(struct buf *bp)
  *	only as a per-cpu cache of bufs still maintained on a global list.
  */
 static int
-buf_import(void *arg, void **store, int cnt, int flags)
+buf_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	struct buf *bp;
 	int i;

Modified: projects/numa2/sys/mips/mips/pmap.c
==============================================================================
--- projects/numa2/sys/mips/mips/pmap.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/mips/mips/pmap.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -1061,7 +1061,7 @@ pmap_alloc_direct_page(unsigned int index, int req)
 {
 	vm_page_t m;
 
-	m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED |
+	m = vm_page_alloc_freelist(0, VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED |
 	    VM_ALLOC_ZERO);
 	if (m == NULL)
 		return (NULL);
@@ -1599,7 +1599,7 @@ retry:
 		}
 	}
 	/* No free items, allocate another chunk */
-	m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL |
+	m = vm_page_alloc_freelist(0, VM_FREELIST_DIRECT, VM_ALLOC_NORMAL |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
 		if (try) {

Modified: projects/numa2/sys/mips/mips/uma_machdep.c
==============================================================================
--- projects/numa2/sys/mips/mips/uma_machdep.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/mips/mips/uma_machdep.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -42,7 +42,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
@@ -53,7 +54,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_in
 	pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED;
 
 	for (;;) {
-		m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags);
+		m = vm_page_alloc_freelist(domain, VM_FREELIST_DIRECT, pflags);
 #ifndef __mips_n64
 		if (m == NULL && vm_page_reclaim_contig(pflags, 1,
 		    0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0))

Modified: projects/numa2/sys/powerpc/aim/mmu_oea64.c
==============================================================================
--- projects/numa2/sys/powerpc/aim/mmu_oea64.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/powerpc/aim/mmu_oea64.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -1498,8 +1498,8 @@ retry:
 static mmu_t installed_mmu;
 
 static void *
-moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
-    int wait)
+moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+    uint8_t *flags, int wait)
 {
 	struct pvo_entry *pvo;
         vm_offset_t va;

Modified: projects/numa2/sys/powerpc/aim/slb.c
==============================================================================
--- projects/numa2/sys/powerpc/aim/slb.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/powerpc/aim/slb.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -478,7 +478,8 @@ slb_insert_user(pmap_t pm, struct slb *slb)
 }
 
 static void *
-slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+    u_int8_t *flags, int wait)
 {
 	static vm_offset_t realmax = 0;
 	void *va;

Modified: projects/numa2/sys/powerpc/powerpc/uma_machdep.c
==============================================================================
--- projects/numa2/sys/powerpc/powerpc/uma_machdep.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/powerpc/powerpc/uma_machdep.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -51,7 +51,8 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw
 	   "UMA MD pages in use");
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	void *va;
 	vm_paddr_t pa;

Modified: projects/numa2/sys/riscv/riscv/uma_machdep.c
==============================================================================
--- projects/numa2/sys/riscv/riscv/uma_machdep.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/riscv/riscv/uma_machdep.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 
 	panic("uma_small_alloc");

Modified: projects/numa2/sys/sparc64/sparc64/vm_machdep.c
==============================================================================
--- projects/numa2/sys/sparc64/sparc64/vm_machdep.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/sparc64/sparc64/vm_machdep.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -390,7 +390,8 @@ swi_vm(void *v)
 }
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;

Modified: projects/numa2/sys/sys/_vm_domain.h
==============================================================================
--- projects/numa2/sys/sys/_vm_domain.h	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/sys/_vm_domain.h	Wed Jul 26 05:01:46 2017	(r321506)
@@ -40,7 +40,6 @@ typedef enum {
 	VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN,
 	VM_POLICY_FIRST_TOUCH,
 	VM_POLICY_FIRST_TOUCH_ROUND_ROBIN,
-	VM_POLICY_MAX
 } vm_domain_policy_type_t;
 
 struct vm_domain_policy_entry {
@@ -51,6 +50,12 @@ struct vm_domain_policy_entry {
 struct vm_domain_policy {
 	seq_t seq;
 	struct vm_domain_policy_entry p;
+};
+
+struct vm_domain_iterator {
+	vm_domain_policy_type_t policy;
+	int cursor;
+	int domain;
 };
 
 #define VM_DOMAIN_POLICY_STATIC_INITIALISER(vt, vd) \

Modified: projects/numa2/sys/sys/busdma_bufalloc.h
==============================================================================
--- projects/numa2/sys/sys/busdma_bufalloc.h	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/sys/busdma_bufalloc.h	Wed Jul 26 05:01:46 2017	(r321506)
@@ -111,7 +111,7 @@ struct busdma_bufzone * busdma_bufalloc_findzone(busdm
  * you can probably use these when you need uncacheable buffers.
  */
 void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
-    uint8_t *pflag, int wait);
+    int domain, uint8_t *pflag, int wait);
 void  busdma_bufalloc_free_uncacheable(void *item, vm_size_t size,
     uint8_t pflag);
 

Modified: projects/numa2/sys/sys/proc.h
==============================================================================
--- projects/numa2/sys/sys/proc.h	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/sys/proc.h	Wed Jul 26 05:01:46 2017	(r321506)
@@ -282,7 +282,7 @@ struct thread {
 	pid_t		td_dbg_forked;	/* (c) Child pid for debugger. */
 	u_int		td_vp_reserv;	/* (k) Count of reserved vnodes. */
 	int		td_no_sleeping;	/* (k) Sleeping disabled count. */
-	int		td_dom_rr_idx;	/* (k) RR Numa domain selection. */
+	struct vm_domain_iterator td_dom_selector; /* (k) VM domain selector */
 	void		*td_su;		/* (k) FFS SU private */
 	sbintime_t	td_sleeptimo;	/* (t) Sleep timeout. */
 	int		td_rtcgen;	/* (s) rtc_generation of abs. sleep */

Modified: projects/numa2/sys/vm/uma.h
==============================================================================
--- projects/numa2/sys/vm/uma.h	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/vm/uma.h	Wed Jul 26 05:01:46 2017	(r321506)
@@ -45,6 +45,7 @@
 /* Types and type defs */
 
 struct uma_zone;
+struct vm_domain_iterator;
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
@@ -126,7 +127,8 @@ typedef void (*uma_fini)(void *mem, int size);
 /*
  * Import new memory into a cache zone.
  */
-typedef int (*uma_import)(void *arg, void **store, int count, int flags);
+typedef int (*uma_import)(void *arg, void **store, int count, int domain,
+    int flags);
 
 /*
  * Free memory from a cache zone.
@@ -365,25 +367,21 @@ uma_zfree(uma_zone_t zone, void *item)
 }
 
 /*
- * XXX The rest of the prototypes in this header are h0h0 magic for the VM.
- * If you think you need to use it for a normal zone you're probably incorrect.
- */
-
-/*
  * Backend page supplier routines
  *
  * Arguments:
  *	zone  The zone that is requesting pages.
  *	size  The number of bytes being requested.
  *	pflag Flags for these memory pages, see below.
+ *	domain The NUMA domain that we prefer for this allocation.
  *	wait  Indicates our willingness to block.
  *
  * Returns:
  *	A pointer to the allocated memory or NULL on failure.
  */
 
-typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
-    int wait);
+typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain,
+    uint8_t *pflag, int wait);
 
 /*
  * Backend page free routines
@@ -398,8 +396,6 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t 
  */
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
-
-
 /*
  * Sets up the uma allocator. (Called by vm_mem_init)
  *
@@ -596,6 +592,19 @@ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc al
  */
 
 void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
+
+/*
+ * XXX
+ *
+ * Arguments:
+ *	zone	The zone NUMA policy is being installed into.
+ *	sel	Selector of the NUMA policy requested.
+ *
+ * Returns:
+ *	Nothing
+ */
+void uma_zone_set_domain_selector(uma_zone_t zone,
+    struct vm_domain_iterator *sel);
 
 /*
  * These flags are setable in the allocf and visible in the freef.
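
For illustration, installing a domain selection policy on a zone looks
roughly like the following (a sketch using only names visible in this
commit; it mirrors what bucket_init() does for the bucket zones in the
uma_core.c hunk further below):

	static struct vm_domain_policy example_policy =
	    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH, 0);
	static struct vm_domain_iterator example_iter;

	static void
	example_zone_setup(uma_zone_t zone)
	{

		vm_domain_iterator_set_policy(&example_iter, &example_policy);
		uma_zone_set_domain_selector(zone, &example_iter);
	}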

Modified: projects/numa2/sys/vm/uma_core.c
==============================================================================
--- projects/numa2/sys/vm/uma_core.c	Wed Jul 26 04:27:37 2017	(r321505)
+++ projects/numa2/sys/vm/uma_core.c	Wed Jul 26 05:01:46 2017	(r321506)
@@ -76,10 +76,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
+#include <vm/vm_domain.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
@@ -221,11 +223,11 @@ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }
 
 /* Prototypes.. */
 
-static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
-static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
-static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void page_free(void *, vm_size_t, uint8_t);
-static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
+static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
@@ -243,23 +245,23 @@ static int hash_expand(struct uma_hash *, struct uma_h
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
-static void *zone_alloc_item(uma_zone_t, void *, int);
+static void *zone_alloc_item(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
-static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
-static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
-static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
+static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
+static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
+static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
-static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
-static void zone_release(uma_zone_t zone, void **bucket, int cnt);
-static void uma_zero_item(void *item, uma_zone_t zone);
+static int zone_import(uma_zone_t, void **, int, int, int);
+static void zone_release(uma_zone_t, void **, int);
+static void uma_zero_item(void *, uma_zone_t);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
@@ -298,18 +300,25 @@ bucket_enable(void)
  * For each zone, calculate the memory required for each bucket, consisting
  * of the header and an array of pointers.
  */
+static struct vm_domain_policy bucket_policy =
+    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH, 0);
+static struct vm_domain_iterator bucket_iterator;
+
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *ubz;
 	int size;
 
+	vm_domain_iterator_set_policy(&bucket_iterator, &bucket_policy);
+
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
+		uma_zone_set_domain_selector(ubz->ubz_zone, &bucket_iterator);
 	}
 }
 
@@ -547,7 +556,7 @@ hash_alloc(struct uma_hash *hash)
 	} else {
 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
-		    M_WAITOK);
+		    UMA_ANYDOMAIN, M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	}
 	if (hash->uh_slab_hash) {
@@ -713,6 +722,7 @@ cache_drain_safe_cpu(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	uma_bucket_t b1, b2;
+	int domain;
 
 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 		return;
@@ -720,10 +730,14 @@ cache_drain_safe_cpu(uma_zone_t zone)
 	b1 = b2 = NULL;
 	ZONE_LOCK(zone);
 	critical_enter();
+	if (zone->uz_sel == NULL)
+		domain = 0;
+	else
+		domain = vm_domain_select_first(zone->uz_sel);
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket) {
 		if (cache->uc_allocbucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
+			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_allocbucket, ub_link);
 		else
 			b1 = cache->uc_allocbucket;
@@ -731,7 +745,7 @@ cache_drain_safe_cpu(uma_zone_t zone)
 	}
 	if (cache->uc_freebucket) {
 		if (cache->uc_freebucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
+			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_freebucket, ub_link);
 		else
 			b2 = cache->uc_freebucket;
@@ -786,18 +800,22 @@ cache_drain_safe(uma_zone_t zone)
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
+	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
+	int i;
 
 	/*
-	 * Drain the bucket queues and free the buckets, we just keep two per
-	 * cpu (alloc/free).
+	 * Drain the bucket queues and free the buckets.
 	 */
-	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
-		LIST_REMOVE(bucket, ub_link);
-		ZONE_UNLOCK(zone);
-		bucket_drain(zone, bucket);
-		bucket_free(zone, bucket, NULL);
-		ZONE_LOCK(zone);
+	for (i = 0; i < vm_ndomains; i++) {
+		zdom = &zone->uz_domain[i];
+		while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
+			LIST_REMOVE(bucket, ub_link);
+			ZONE_UNLOCK(zone);
+			bucket_drain(zone, bucket);
+			bucket_free(zone, bucket, NULL);
+			ZONE_LOCK(zone);
+		}
 	}
 
 	/*
@@ -841,7 +859,9 @@ static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
+	uma_domain_t dom;
 	uma_slab_t slab, tmp;
+	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
@@ -856,20 +876,25 @@ keg_drain(uma_keg_t keg)
 	if (keg->uk_free == 0)
 		goto finished;
 
-	LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) {
-		/* We have nowhere to free these to. */
-		if (slab->us_flags & UMA_SLAB_BOOT)
-			continue;
+	for (i = 0; i < vm_ndomains; i++) {
+		dom = &keg->uk_domain[i];
+		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
+			/* We have nowhere to free these to. */
+			if (slab->us_flags & UMA_SLAB_BOOT)
+				continue;
 
-		LIST_REMOVE(slab, us_link);
-		keg->uk_pages -= keg->uk_ppera;
-		keg->uk_free -= keg->uk_ipers;
+			LIST_REMOVE(slab, us_link);
+			keg->uk_pages -= keg->uk_ppera;
+			keg->uk_free -= keg->uk_ipers;
 
-		if (keg->uk_flags & UMA_ZONE_HASH)
-			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
+			if (keg->uk_flags & UMA_ZONE_HASH)
+				UMA_HASH_REMOVE(&keg->uk_hash, slab,
+				    slab->us_data);
 
-		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
+			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
+		}
 	}
+
 finished:
 	KEG_UNLOCK(keg);
 
@@ -929,7 +954,7 @@ zone_drain(uma_zone_t zone)
  *	caller specified M_NOWAIT.
  */
 static uma_slab_t
-keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
+keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait)
 {
 	uma_alloc allocf;
 	uma_slab_t slab;
@@ -937,6 +962,8 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wai
 	uint8_t flags;
 	int i;
 
+	KASSERT(domain >= 0 && domain < vm_ndomains,
+	    ("keg_alloc_slab: domain %d out of range", domain));
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 	mem = NULL;
@@ -945,7 +972,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wai
 	KEG_UNLOCK(keg);
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
-		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
+		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait);
 		if (slab == NULL)
 			goto out;
 	}
@@ -966,7 +993,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wai
 		wait |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
-	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
+	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, domain, &flags, wait);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
@@ -991,6 +1018,18 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wai
 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
 #endif
 
+	/*
+	 * Set the domain based on the first page.  This may be incorrect for
+	 * multi-page allocations depending on the NUMA policy specified.
+	 */
+#if MAXMEMDOM > 1
+	if ((flags & UMA_SLAB_BOOT) == 0)
+		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
+		    pmap_kextract((vm_offset_t)mem)));
+	else
+#endif
+		slab->us_domain = 0;
+
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
@@ -1025,7 +1064,8 @@ out:
  * the VM is ready.
  */
 static void *
-startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
+startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+    int wait)
 {
 	uma_keg_t keg;
 	void *mem;
@@ -1058,7 +1098,7 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_
 #else
 	keg->uk_allocf = page_alloc;
 #endif
-	return keg->uk_allocf(zone, bytes, pflag, wait);
+	return keg->uk_allocf(zone, bytes, domain, pflag, wait);
 }
 
 /*
@@ -1073,7 +1113,8 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_
  *	NULL if M_NOWAIT is set.
  */
 static void *
-page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
+page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+    int wait)
 {
 	void *p;	/* Returned page */
 
@@ -1095,7 +1136,8 @@ page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *
  *	NULL if M_NOWAIT is set.
  */
 static void *
-noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
+noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
+    int wait)
 {
 	TAILQ_HEAD(, vm_page) alloctail;
 	u_long npages;
@@ -1108,8 +1150,8 @@ noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t 
 
 	npages = howmany(bytes, PAGE_SIZE);
 	while (npages > 0) {
-		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
-		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+		p = vm_page_alloc_domain(NULL, 0, domain,
+		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
 		if (p != NULL) {
 			/*
 			 * Since the page does not belong to an object, its
@@ -1380,6 +1422,7 @@ keg_ctor(void *mem, int size, void *udata, int flags)
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
+	keg->uk_cursor = 0;
 	keg->uk_free = 0;
 	keg->uk_reserve = 0;
 	keg->uk_pages = 0;
@@ -1519,6 +1562,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
 	zone->uz_sleeps = 0;
 	zone->uz_count = 0;
 	zone->uz_count_min = 0;
+	zone->uz_sel = NULL;
 	zone->uz_flags = 0;
 	zone->uz_warning = NULL;
 	timevalclear(&zone->uz_ratecheck);
@@ -1810,7 +1854,7 @@ uma_kcreate(uma_zone_t zone, size_t size, uma_init umi
 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
 	args.flags = flags;
 	args.zone = zone;
-	return (zone_alloc_item(kegs, &args, M_WAITOK));
+	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* See uma.h */
@@ -1867,7 +1911,7 @@ uma_zcreate(const char *name, size_t size, uma_ctor ct
 		sx_slock(&uma_drain_lock);
 		locked = true;
 	}
-	res = zone_alloc_item(zones, &args, M_WAITOK);
+	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
 		sx_sunlock(&uma_drain_lock);
 	return (res);
@@ -1902,7 +1946,7 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor
 		locked = true;
 	}
 	/* XXX Attaches only one keg of potentially many. */
-	res = zone_alloc_item(zones, &args, M_WAITOK);
+	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
 		sx_sunlock(&uma_drain_lock);
 	return (res);
@@ -1929,7 +1973,7 @@ uma_zcache_create(char *name, int size, uma_ctor ctor,
 	args.align = 0;
 	args.flags = flags;
 
-	return (zone_alloc_item(zones, &args, M_WAITOK));
+	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 static void
@@ -2025,11 +2069,11 @@ uma_zdestroy(uma_zone_t zone)
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
-	void *item;
-	uma_cache_t cache;
+	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
-	int lockfail;
-	int cpu;
+	uma_cache_t cache;
+	void *item;
+	int cpu, domain, lockfail;
 
 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
@@ -2127,8 +2171,10 @@ zalloc_start:
 		bucket_free(zone, bucket, udata);
 
 	/* Short-circuit for zones without buckets and low memory. */
-	if (zone->uz_count == 0 || bucketdisable)
+	if (zone->uz_count == 0 || bucketdisable) {
+		domain = UMA_ANYDOMAIN;
 		goto zalloc_item;
+	}
 
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
@@ -2163,10 +2209,18 @@ zalloc_start:
 		goto zalloc_start;
 	}
 
+	if (zone->uz_sel == NULL) {
+		domain = UMA_ANYDOMAIN;
+		zdom = &zone->uz_domain[0];
+	} else {
+		domain = vm_domain_select_first(zone->uz_sel);
+		zdom = &zone->uz_domain[domain];
+	}
+
 	/*
 	 * Check the zone's cache of buckets.
 	 */
-	if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
+	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
@@ -2191,7 +2245,7 @@ zalloc_start:
 	 * works we'll restart the allocation from the beginning and it
 	 * will use the just filled bucket.
 	 */
-	bucket = zone_alloc_bucket(zone, udata, flags);
+	bucket = zone_alloc_bucket(zone, udata, domain, flags);
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);
 	if (bucket != NULL) {
@@ -2204,10 +2258,11 @@ zalloc_start:
 		 * initialized bucket to make this less likely or claim
 		 * the memory directly.
 		 */
-		if (cache->uc_allocbucket == NULL)
-			cache->uc_allocbucket = bucket;
+		if (cache->uc_allocbucket != NULL ||
+		    (domain != UMA_ANYDOMAIN && domain != PCPU_GET(domain)))
+			LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 		else
-			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
+			cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
@@ -2216,38 +2271,57 @@ zalloc_start:
 	 * We may not be able to get a bucket so return an actual item.
 	 */
 zalloc_item:
-	item = zone_alloc_item(zone, udata, flags);
+	item = zone_alloc_item(zone, udata, domain, flags);
 
 	return (item);
 }
 
+/*
+ * Find a slab with some space.  Prefer slabs that are partially used over those
+ * that are totally full.  This helps to reduce fragmentation.
+ */
 static uma_slab_t
-keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
+keg_first_slab(uma_keg_t keg, int domain)
 {
+	uma_domain_t dom;
 	uma_slab_t slab;
-	int reserve;
 
+	KASSERT(domain >= 0 && domain < vm_ndomains,
+	    ("keg_first_slab: domain %d out of range", domain));
+
+	dom = &keg->uk_domain[domain];
+	if (!LIST_EMPTY(&dom->ud_part_slab))
+		return (LIST_FIRST(&dom->ud_part_slab));
+	if (LIST_EMPTY(&dom->ud_free_slab))
+		return (NULL);
+	slab = LIST_FIRST(&dom->ud_free_slab);
+	LIST_REMOVE(slab, us_link);
+	LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
+	return (slab);
+}
+
+static uma_slab_t
+keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags)
+{
+	uma_domain_t dom;
+	uma_slab_t slab;
+	int domain, reserve, start;
+
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 	reserve = 0;
 	if ((flags & M_USE_RESERVE) == 0)
 		reserve = keg->uk_reserve;
 
-	for (;;) {
-		/*
-		 * Find a slab with some space.  Prefer slabs that are partially
-		 * used over those that are totally full.  This helps to reduce
-		 * fragmentation.
-		 */
-		if (keg->uk_free > reserve) {
-			if (!LIST_EMPTY(&keg->uk_part_slab)) {
-				slab = LIST_FIRST(&keg->uk_part_slab);
-			} else {
-				slab = LIST_FIRST(&keg->uk_free_slab);
-				LIST_REMOVE(slab, us_link);
-				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
-				    us_link);
-			}
+	if (rdomain == UMA_ANYDOMAIN) {
+		keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+		domain = start = keg->uk_cursor;
+	} else
+		domain = start = rdomain;
+
+	do {
+		if (keg->uk_free > reserve &&
+		    (slab = keg_first_slab(keg, domain)) != NULL) {
 			MPASS(slab->us_keg == keg);
 			return (slab);
 		}
@@ -2275,7 +2349,7 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int fla
 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
 			continue;
 		}
-		slab = keg_alloc_slab(keg, zone, flags);
+		slab = keg_alloc_slab(keg, zone, domain, flags);
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
@@ -2283,7 +2357,8 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int fla
 		 */
 		if (slab) {
 			MPASS(slab->us_keg == keg);
-			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
+			dom = &keg->uk_domain[slab->us_domain];
+			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
 		/*

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-projects mailing list