svn commit: r327900 - in head: lib/libmemstat sys/amd64/amd64 sys/arm64/arm64 sys/i386/i386 sys/kern sys/mips/mips sys/powerpc/aim sys/powerpc/powerpc sys/riscv/riscv sys/sparc64/sparc64 sys/sys sy...

Jeff Roberson jeff at FreeBSD.org
Fri Jan 12 23:25:08 UTC 2018


Author: jeff
Date: Fri Jan 12 23:25:05 2018
New Revision: 327900
URL: https://svnweb.freebsd.org/changeset/base/327900

Log:
  Implement NUMA support in uma(9) and malloc(9).  Allocations from specific
  domains can be made with the _domain() API variants.  UMA also supports a
  first-touch policy via the NUMA zone flag.
  
  The slab layer is now segregated by VM domain and is precise.  It handles
  round-robin iteration directly.  The per-cpu cache layer remains a mix of
  domains according to where memory is allocated and freed.  Well-behaved
  clients can achieve perfect locality with no performance penalty.
  
  The direct domain allocation functions have to visit the slab layer and
  so require per-zone locks, which come at some expense.
  
  Reviewed by:	attilio (a slightly older version)
  Tested by:	pho
  Sponsored by:	Netflix, Dell/EMC Isilon
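
  For illustration, a minimal sketch of a consumer of the new _domain()
  malloc(9) variants.  The malloc type and function names are invented for
  the sketch; the prototypes match the sys/sys/malloc.h change below.

    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/malloc.h>

    MALLOC_DEFINE(M_EXAMPLE, "example", "example NUMA-aware allocations");

    /*
     * Allocate a zeroed buffer backed by memory from the given VM
     * domain.  With M_WAITOK the allocation comes from that domain;
     * with M_NOWAIT it may fail and return NULL instead.
     */
    static void *
    example_alloc(int domain, unsigned long size)
    {

            return (malloc_domain(size, M_EXAMPLE, domain,
                M_WAITOK | M_ZERO));
    }

    /*
     * An item from malloc_domain() is released with free_domain(),
     * which hands it straight back to its per-domain pool.
     */
    static void
    example_free(void *p)
    {

            free_domain(p, M_EXAMPLE);
    }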

Modified:
  head/lib/libmemstat/memstat_uma.c
  head/sys/amd64/amd64/uma_machdep.c
  head/sys/arm64/arm64/uma_machdep.c
  head/sys/i386/i386/pmap.c
  head/sys/kern/kern_malloc.c
  head/sys/kern/kern_mbuf.c
  head/sys/kern/subr_busdma_bufalloc.c
  head/sys/kern/subr_vmem.c
  head/sys/kern/vfs_bio.c
  head/sys/mips/mips/uma_machdep.c
  head/sys/powerpc/aim/mmu_oea64.c
  head/sys/powerpc/aim/slb.c
  head/sys/powerpc/powerpc/uma_machdep.c
  head/sys/riscv/riscv/uma_machdep.c
  head/sys/sparc64/sparc64/vm_machdep.c
  head/sys/sys/malloc.h
  head/sys/vm/uma.h
  head/sys/vm/uma_core.c
  head/sys/vm/uma_int.h

Modified: head/lib/libmemstat/memstat_uma.c
==============================================================================
--- head/lib/libmemstat/memstat_uma.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/lib/libmemstat/memstat_uma.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -55,6 +55,8 @@ static struct nlist namelist[] = {
 	{ .n_name = "_mp_maxid" },
 #define	X_ALL_CPUS	2
 	{ .n_name = "_all_cpus" },
+#define	X_VM_NDOMAINS	3
+	{ .n_name = "_vm_ndomains" },
 	{ .n_name = "" },
 };
 
@@ -297,11 +299,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *k
 {
 	LIST_HEAD(, uma_keg) uma_kegs;
 	struct memory_type *mtp;
+	struct uma_zone_domain uzd;
 	struct uma_bucket *ubp, ub;
 	struct uma_cache *ucp, *ucp_array;
 	struct uma_zone *uzp, uz;
 	struct uma_keg *kzp, kz;
-	int hint_dontsearch, i, mp_maxid, ret;
+	int hint_dontsearch, i, mp_maxid, ndomains, ret;
 	char name[MEMTYPE_MAXNAME];
 	cpuset_t all_cpus;
 	long cpusetsize;
@@ -323,6 +326,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *k
 		list->mtl_error = ret;
 		return (-1);
 	}
+	ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains,
+	    sizeof(ndomains), 0);
+	if (ret != 0) {
+		list->mtl_error = ret;
+		return (-1);
+	}
 	ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0);
 	if (ret != 0) {
 		list->mtl_error = ret;
@@ -447,10 +456,17 @@ skip_percpu:
 				    kz.uk_ipers;
 			mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size;
 			mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees;
-			for (ubp = LIST_FIRST(&uz.uz_buckets); ubp !=
-			    NULL; ubp = LIST_NEXT(&ub, ub_link)) {
-				ret = kread(kvm, ubp, &ub, sizeof(ub), 0);
-				mtp->mt_zonefree += ub.ub_cnt;
+			for (i = 0; i < ndomains; i++) {
+				ret = kread(kvm, &uz.uz_domain[i], &uzd,
+				   sizeof(uzd), 0);
+				for (ubp =
+				    LIST_FIRST(&uzd.uzd_buckets);
+				    ubp != NULL;
+				    ubp = LIST_NEXT(&ub, ub_link)) {
+					ret = kread(kvm, ubp, &ub,
+					   sizeof(ub), 0);
+					mtp->mt_zonefree += ub.ub_cnt;
+				}
 			}
 			if (!((kz.uk_flags & UMA_ZONE_SECONDARY) &&
 			    LIST_FIRST(&kz.uk_zones) != uzp)) {
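
The change above resolves the kernel's vm_ndomains and then walks each
zone's per-domain bucket lists.  A hedged standalone sketch of the libkvm
pattern it relies on (the function name is invented; error handling is
condensed):

    #include <sys/param.h>
    #include <kvm.h>
    #include <nlist.h>

    /*
     * Resolve _vm_ndomains in a kernel image and read its value,
     * mirroring the kread_symbol(X_VM_NDOMAINS, ...) call above.
     * Returns the domain count, or -1 on any failure.
     */
    static int
    read_vm_ndomains(kvm_t *kd)
    {
            struct nlist nl[] = {
                    { .n_name = "_vm_ndomains" },
                    { .n_name = "" },
            };
            int ndomains;

            if (kvm_nlist(kd, nl) != 0 || nl[0].n_value == 0)
                    return (-1);
            if (kvm_read(kd, nl[0].n_value, &ndomains,
                sizeof(ndomains)) != sizeof(ndomains))
                    return (-1);
            return (ndomains);
    }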

Modified: head/sys/amd64/amd64/uma_machdep.c
==============================================================================
--- head/sys/amd64/amd64/uma_machdep.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/amd64/amd64/uma_machdep.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -44,14 +44,15 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_page_t m;
 	vm_paddr_t pa;
 	void *va;
 
 	*flags = UMA_SLAB_PRIV;
-	m = vm_page_alloc(NULL, 0,
+	m = vm_page_alloc_domain(NULL, 0, domain,
 	    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 	if (m == NULL)
 		return (NULL);

Modified: head/sys/arm64/arm64/uma_machdep.c
==============================================================================
--- head/sys/arm64/arm64/uma_machdep.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/arm64/arm64/uma_machdep.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -42,14 +42,15 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_page_t m;
 	vm_paddr_t pa;
 	void *va;
 
 	*flags = UMA_SLAB_PRIV;
-	m = vm_page_alloc(NULL, 0,
+	m = vm_page_alloc_domain(NULL, 0, domain,
 	    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 	if (m == NULL)
 		return (NULL);

Modified: head/sys/i386/i386/pmap.c
==============================================================================
--- head/sys/i386/i386/pmap.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/i386/i386/pmap.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -338,8 +338,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offs
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
 #if defined(PAE) || defined(PAE_TABLES)
-static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
-    int wait);
+static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
+    uint8_t *flags, int wait);
 #endif
 static void pmap_set_pg(void);
 
@@ -697,12 +697,13 @@ pmap_page_init(vm_page_t m)
 
 #if defined(PAE) || defined(PAE_TABLES)
 static void *
-pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
+pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
+    int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */
 	*flags = UMA_SLAB_KERNEL;
-	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
+	return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, 0x0ULL,
 	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
 }
 #endif

Modified: head/sys/kern/kern_malloc.c
==============================================================================
--- head/sys/kern/kern_malloc.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/kern/kern_malloc.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -96,6 +96,11 @@ __FBSDID("$FreeBSD$");
 dtrace_malloc_probe_func_t	dtrace_malloc_probe;
 #endif
 
+#if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) ||		\
+    defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE)
+#define	MALLOC_DEBUG	1
+#endif
+
 /*
  * When realloc() is called, if the new size is sufficiently smaller than
  * the old size, realloc() will allocate a new, smaller block to avoid
@@ -417,6 +422,20 @@ contigmalloc(unsigned long size, struct malloc_type *t
 	return (ret);
 }
 
+void *
+contigmalloc_domain(unsigned long size, struct malloc_type *type,
+    int domain, int flags, vm_paddr_t low, vm_paddr_t high,
+    unsigned long alignment, vm_paddr_t boundary)
+{
+	void *ret;
+
+	ret = (void *)kmem_alloc_contig_domain(domain, size, flags, low, high,
+	    alignment, boundary, VM_MEMATTR_DEFAULT);
+	if (ret != NULL)
+		malloc_type_allocated(type, round_page(size));
+	return (ret);
+}
+
 /*
  *	contigfree:
  *
@@ -432,26 +451,14 @@ contigfree(void *addr, unsigned long size, struct mall
 	malloc_type_freed(type, round_page(size));
 }
 
-/*
- *	malloc:
- *
- *	Allocate a block of memory.
- *
- *	If M_NOWAIT is set, this routine will not block and return NULL if
- *	the allocation fails.
- */
-void *
-malloc(unsigned long size, struct malloc_type *mtp, int flags)
+#ifdef MALLOC_DEBUG
+static int
+malloc_dbg(caddr_t *vap, unsigned long *sizep, struct malloc_type *mtp,
+    int flags)
 {
+#ifdef INVARIANTS
 	int indx;
-	struct malloc_type_internal *mtip;
-	caddr_t va;
-	uma_zone_t zone;
-#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
-	unsigned long osize = size;
-#endif
 
-#ifdef INVARIANTS
 	KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
 	/*
 	 * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
@@ -474,7 +481,8 @@ malloc(unsigned long size, struct malloc_type *mtp, in
 		if ((malloc_nowait_count % malloc_failure_rate) == 0) {
 			atomic_add_int(&malloc_failure_count, 1);
 			t_malloc_fail = time_uptime;
-			return (NULL);
+			*vap = NULL;
+			return (EJUSTRETURN);
 		}
 	}
 #endif
@@ -485,18 +493,46 @@ malloc(unsigned long size, struct malloc_type *mtp, in
 	    ("malloc: called with spinlock or critical section held"));
 
 #ifdef DEBUG_MEMGUARD
-	if (memguard_cmp_mtp(mtp, size)) {
-		va = memguard_alloc(size, flags);
-		if (va != NULL)
-			return (va);
+	if (memguard_cmp_mtp(mtp, *sizep)) {
+		*vap = memguard_alloc(*sizep, flags);
+		if (*vap != NULL)
+			return (EJUSTRETURN);
 		/* This is unfortunate but should not be fatal. */
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
-	size = redzone_size_ntor(size);
+	*sizep = redzone_size_ntor(*sizep);
 #endif
 
+	return (0);
+}
+#endif
+
+/*
+ *	malloc:
+ *
+ *	Allocate a block of memory.
+ *
+ *	If M_NOWAIT is set, this routine will not block and return NULL if
+ *	the allocation fails.
+ */
+void *
+malloc(unsigned long size, struct malloc_type *mtp, int flags)
+{
+	int indx;
+	struct malloc_type_internal *mtip;
+	caddr_t va;
+	uma_zone_t zone;
+#if defined(DEBUG_REDZONE)
+	unsigned long osize = size;
+#endif
+
+#ifdef MALLOC_DEBUG
+	if (malloc_dbg(&va, &size, mtp, flags) != 0)
+		return (va);
+#endif
+
 	if (size <= kmem_zmax) {
 		mtip = mtp->ks_handle;
 		if (size & KMEM_ZMASK)
@@ -523,11 +559,55 @@ malloc(unsigned long size, struct malloc_type *mtp, in
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
 		t_malloc_fail = time_uptime;
-#ifdef DIAGNOSTIC
-	if (va != NULL && !(flags & M_ZERO)) {
-		memset(va, 0x70, osize);
-	}
+#ifdef DEBUG_REDZONE
+	if (va != NULL)
+		va = redzone_setup(va, osize);
 #endif
+	return ((void *) va);
+}
+
+void *
+malloc_domain(unsigned long size, struct malloc_type *mtp, int domain,
+    int flags)
+{
+	int indx;
+	struct malloc_type_internal *mtip;
+	caddr_t va;
+	uma_zone_t zone;
+#if defined(DEBUG_REDZONE)
+	unsigned long osize = size;
+#endif
+
+#ifdef MALLOC_DEBUG
+	if (malloc_dbg(&va, &size, mtp, flags) != 0)
+		return (va);
+#endif
+	if (size <= kmem_zmax) {
+		mtip = mtp->ks_handle;
+		if (size & KMEM_ZMASK)
+			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
+		indx = kmemsize[size >> KMEM_ZSHIFT];
+		KASSERT(mtip->mti_zone < numzones,
+		    ("mti_zone %u out of range %d",
+		    mtip->mti_zone, numzones));
+		zone = kmemzones[indx].kz_zone[mtip->mti_zone];
+#ifdef MALLOC_PROFILE
+		krequests[size >> KMEM_ZSHIFT]++;
+#endif
+		va = uma_zalloc_domain(zone, NULL, domain, flags);
+		if (va != NULL)
+			size = zone->uz_size;
+		malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
+	} else {
+		size = roundup(size, PAGE_SIZE);
+		zone = NULL;
+		va = uma_large_malloc_domain(size, domain, flags);
+		malloc_type_allocated(mtp, va == NULL ? 0 : size);
+	}
+	if (flags & M_WAITOK)
+		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
+	else if (va == NULL)
+		t_malloc_fail = time_uptime;
 #ifdef DEBUG_REDZONE
 	if (va != NULL)
 		va = redzone_setup(va, osize);
@@ -545,66 +625,124 @@ mallocarray(size_t nmemb, size_t size, struct malloc_t
 	return (malloc(size * nmemb, type, flags));
 }
 
-/*
- *	free:
- *
- *	Free a block of memory allocated by malloc.
- *
- *	This routine may not block.
- */
-void
-free(void *addr, struct malloc_type *mtp)
+#ifdef INVARIANTS
+static void
+free_save_type(void *addr, struct malloc_type *mtp, u_long size)
 {
-	uma_slab_t slab;
-	u_long size;
+	struct malloc_type **mtpp = addr;
 
+	/*
+	 * Cache a pointer to the malloc_type that most recently freed
+	 * this memory here.  This way we know who is most likely to
+	 * have stepped on it later.
+	 *
+	 * This code assumes that size is a multiple of 8 bytes for
+	 * 64 bit machines
+	 */
+	mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
+	mtpp += (size - sizeof(struct malloc_type *)) /
+	    sizeof(struct malloc_type *);
+	*mtpp = mtp;
+}
+#endif
+
+#ifdef MALLOC_DEBUG
+static int
+free_dbg(void **addrp, struct malloc_type *mtp)
+{
+	void *addr;
+
+	addr = *addrp;
 	KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 	    ("free: called with spinlock or critical section held"));
 
 	/* free(NULL, ...) does nothing */
 	if (addr == NULL)
-		return;
+		return (EJUSTRETURN);
 
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(addr)) {
 		memguard_free(addr);
-		return;
+		return (EJUSTRETURN);
 	}
 #endif
 
 #ifdef DEBUG_REDZONE
 	redzone_check(addr);
-	addr = redzone_addr_ntor(addr);
+	*addrp = redzone_addr_ntor(addr);
 #endif
 
-	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
+	return (0);
+}
+#endif
 
+/*
+ *	free:
+ *
+ *	Free a block of memory allocated by malloc.
+ *
+ *	This routine may not block.
+ */
+void
+free(void *addr, struct malloc_type *mtp)
+{
+	uma_slab_t slab;
+	u_long size;
+
+#ifdef MALLOC_DEBUG
+	if (free_dbg(&addr, mtp) != 0)
+		return;
+#endif
+	/* free(NULL, ...) does nothing */
+	if (addr == NULL)
+		return;
+
+	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
 	if (slab == NULL)
 		panic("free: address %p(%p) has not been allocated.\n",
 		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
 
 	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
+		size = slab->us_keg->uk_size;
 #ifdef INVARIANTS
-		struct malloc_type **mtpp = addr;
+		free_save_type(addr, mtp, size);
 #endif
+		uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
+	} else {
+		size = slab->us_size;
+		uma_large_free(slab);
+	}
+	malloc_type_freed(mtp, size);
+}
+
+void
+free_domain(void *addr, struct malloc_type *mtp)
+{
+	uma_slab_t slab;
+	u_long size;
+
+#ifdef MALLOC_DEBUG
+	if (free_dbg(&addr, mtp) != 0)
+		return;
+#endif
+
+	/* free(NULL, ...) does nothing */
+	if (addr == NULL)
+		return;
+
+	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
+	if (slab == NULL)
+		panic("free_domain: address %p(%p) has not been allocated.\n",
+		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
+
+	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
 		size = slab->us_keg->uk_size;
 #ifdef INVARIANTS
-		/*
-		 * Cache a pointer to the malloc_type that most recently freed
-		 * this memory here.  This way we know who is most likely to
-		 * have stepped on it later.
-		 *
-		 * This code assumes that size is a multiple of 8 bytes for
-		 * 64 bit machines
-		 */
-		mtpp = (struct malloc_type **)
-		    ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
-		mtpp += (size - sizeof(struct malloc_type *)) /
-		    sizeof(struct malloc_type *);
-		*mtpp = mtp;
+		free_save_type(addr, mtp, size);
 #endif
-		uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
+		uma_zfree_domain(LIST_FIRST(&slab->us_keg->uk_zones),
+		    addr, slab);
 	} else {
 		size = slab->us_size;
 		uma_large_free(slab);
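
kern_malloc.c also grows contigmalloc_domain().  For illustration, a hedged
sketch of a caller asking for physically contiguous memory from one domain
(the malloc type and address bounds are invented for the example):

    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/malloc.h>

    MALLOC_DEFINE(M_EXDMA, "exdma", "example contiguous buffers");

    /*
     * A page-aligned, physically contiguous, zeroed buffer below 4GB,
     * backed by pages from the requested domain.  Release it with
     * contigfree(p, size, M_EXDMA).
     */
    static void *
    example_contig_alloc(int domain, unsigned long size)
    {

            return (contigmalloc_domain(size, M_EXDMA, domain,
                M_WAITOK | M_ZERO, 0, 0xffffffffUL, PAGE_SIZE, 0));
    }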

Modified: head/sys/kern/kern_mbuf.c
==============================================================================
--- head/sys/kern/kern_mbuf.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/kern/kern_mbuf.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -283,7 +283,7 @@ static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
-static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
@@ -386,12 +386,13 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, 
  * pages.
  */
 static void *
-mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
+mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
+    int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */
 	*flags = UMA_SLAB_KERNEL;
-	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait,
+	return ((void *)kmem_alloc_contig_domain(domain, bytes, wait,
 	    (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
 }
 

Modified: head/sys/kern/subr_busdma_bufalloc.c
==============================================================================
--- head/sys/kern/subr_busdma_bufalloc.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/kern/subr_busdma_bufalloc.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -149,7 +149,7 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_siz
 }
 
 void *
-busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, int domain,
     uint8_t *pflag, int wait)
 {
 #ifdef VM_MEMATTR_UNCACHEABLE
@@ -157,7 +157,7 @@ busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_
 	/* Inform UMA that this allocator uses kernel_arena/object. */
 	*pflag = UMA_SLAB_KERNEL;
 
-	return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0,
+	return ((void *)kmem_alloc_attr_domain(domain, size, wait, 0,
 	    BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE));
 
 #else

Modified: head/sys/kern/subr_vmem.c
==============================================================================
--- head/sys/kern/subr_vmem.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/kern/subr_vmem.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -500,7 +500,7 @@ bt_insfree(vmem_t *vm, bt_t *bt)
  * Import from the arena into the quantum cache in UMA.
  */
 static int
-qc_import(void *arg, void **store, int cnt, int flags)
+qc_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	qcache_t *qc;
 	vmem_addr_t addr;
@@ -614,13 +614,12 @@ static struct mtx_padalign __exclusive_cache_line vmem
  * we are really out of KVA.
  */
 static void *
-vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
+vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+    int wait)
 {
 	vmem_addr_t addr;
-	int domain;
 
 	*pflag = UMA_SLAB_KERNEL;
-	domain = 0; /* XXX Temporary. */
 
 	/*
 	 * Single thread boundary tag allocation so that the address space

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/kern/vfs_bio.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -133,7 +133,7 @@ static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
-static int buf_import(void *, void **, int, int);
+static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 
@@ -1419,7 +1419,7 @@ buf_free(struct buf *bp)
  *	only as a per-cpu cache of bufs still maintained on a global list.
  */
 static int
-buf_import(void *arg, void **store, int cnt, int flags)
+buf_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	struct buf *bp;
 	int i;

Modified: head/sys/mips/mips/uma_machdep.c
==============================================================================
--- head/sys/mips/mips/uma_machdep.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/mips/mips/uma_machdep.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -44,7 +44,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
@@ -59,7 +60,8 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_in
 #endif
 
 	for (;;) {
-		m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags);
+		m = vm_page_alloc_freelist_domain(domain, VM_FREELIST_DIRECT,
+		    pflags);
 #ifndef __mips_n64
 		if (m == NULL && vm_page_reclaim_contig(pflags, 1,
 		    0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0))

Modified: head/sys/powerpc/aim/mmu_oea64.c
==============================================================================
--- head/sys/powerpc/aim/mmu_oea64.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/powerpc/aim/mmu_oea64.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -1504,8 +1504,8 @@ retry:
 static mmu_t installed_mmu;
 
 static void *
-moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
-    int wait)
+moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+    uint8_t *flags, int wait)
 {
 	struct pvo_entry *pvo;
         vm_offset_t va;
@@ -1522,7 +1522,7 @@ moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes
 	*flags = UMA_SLAB_PRIV;
 	needed_lock = !PMAP_LOCKED(kernel_pmap);
 
-	m = vm_page_alloc(NULL, 0,
+	m = vm_page_alloc_domain(NULL, 0, domain,
 	    malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
 	if (m == NULL)
 		return (NULL);

Modified: head/sys/powerpc/aim/slb.c
==============================================================================
--- head/sys/powerpc/aim/slb.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/powerpc/aim/slb.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -480,7 +480,8 @@ slb_insert_user(pmap_t pm, struct slb *slb)
 }
 
 static void *
-slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+    u_int8_t *flags, int wait)
 {
 	static vm_offset_t realmax = 0;
 	void *va;
@@ -490,7 +491,7 @@ slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u
 		realmax = platform_real_maxaddr();
 
 	*flags = UMA_SLAB_PRIV;
-	m = vm_page_alloc_contig(NULL, 0,
+	m = vm_page_alloc_contig_domain(NULL, 0, domain,
 	    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED,
 	    1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT);
 	if (m == NULL)

Modified: head/sys/powerpc/powerpc/uma_machdep.c
==============================================================================
--- head/sys/powerpc/powerpc/uma_machdep.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/powerpc/powerpc/uma_machdep.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -51,7 +51,8 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw
 	   "UMA MD pages in use");
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	void *va;
 	vm_paddr_t pa;
@@ -59,7 +60,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_in
 	
 	*flags = UMA_SLAB_PRIV;
 
-	m = vm_page_alloc(NULL, 0,
+	m = vm_page_alloc_domain(NULL, 0, domain,
 	    malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
 	if (m == NULL) 
 		return (NULL);

Modified: head/sys/riscv/riscv/uma_machdep.c
==============================================================================
--- head/sys/riscv/riscv/uma_machdep.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/riscv/riscv/uma_machdep.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 
 	panic("uma_small_alloc");

Modified: head/sys/sparc64/sparc64/vm_machdep.c
==============================================================================
--- head/sys/sparc64/sparc64/vm_machdep.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/sparc64/sparc64/vm_machdep.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -392,7 +392,8 @@ swi_vm(void *v)
 }
 
 void *
-uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
@@ -402,7 +403,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_in
 
 	*flags = UMA_SLAB_PRIV;
 
-	m = vm_page_alloc(NULL, 0, 
+	m = vm_page_alloc_domain(NULL, 0, domain,
 	    malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
 	if (m == NULL)
 		return (NULL);

Modified: head/sys/sys/malloc.h
==============================================================================
--- head/sys/sys/malloc.h	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/sys/malloc.h	Fri Jan 12 23:25:05 2018	(r327900)
@@ -175,8 +175,16 @@ void	*contigmalloc(unsigned long size, struct malloc_t
 	    vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
 	    vm_paddr_t boundary) __malloc_like __result_use_check
 	    __alloc_size(1) __alloc_align(6);
+void	*contigmalloc_domain(unsigned long size, struct malloc_type *type,
+	    int domain, int flags, vm_paddr_t low, vm_paddr_t high,
+	    unsigned long alignment, vm_paddr_t boundary)
+	    __malloc_like __result_use_check __alloc_size(1) __alloc_align(6);
 void	free(void *addr, struct malloc_type *type);
+void	free_domain(void *addr, struct malloc_type *type);
 void	*malloc(unsigned long size, struct malloc_type *type, int flags)
+	    __malloc_like __result_use_check __alloc_size(1);
+void	*malloc_domain(unsigned long size, struct malloc_type *type,
+	    int domain, int flags)
 	    __malloc_like __result_use_check __alloc_size(1);
 void	*mallocarray(size_t nmemb, size_t size, struct malloc_type *type,
 	    int flags) __malloc_like __result_use_check

Modified: head/sys/vm/uma.h
==============================================================================
--- head/sys/vm/uma.h	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/vm/uma.h	Fri Jan 12 23:25:05 2018	(r327900)
@@ -128,7 +128,8 @@ typedef void (*uma_fini)(void *mem, int size);
 /*
  * Import new memory into a cache zone.
  */
-typedef int (*uma_import)(void *arg, void **store, int count, int flags);
+typedef int (*uma_import)(void *arg, void **store, int count, int domain,
+    int flags);
 
 /*
  * Free memory from a cache zone.
@@ -281,6 +282,10 @@ uma_zone_t uma_zcache_create(char *name, int size, uma
 					 * Allocates mp_maxid + 1 slabs sized to
 					 * sizeof(struct pcpu).
 					 */
+#define	UMA_ZONE_NUMA		0x10000	/*
+					 * NUMA aware Zone.  Implements a best
+					 * effort first-touch policy.
+					 */
 
 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
@@ -326,6 +331,19 @@ void uma_zdestroy(uma_zone_t zone);
 void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags);
 
 /*
+ * Allocate an item from a specific NUMA domain.  This uses a slow path in
+ * the allocator but is guaranteed to allocate memory from the requested
+ * domain if M_WAITOK is set.
+ *
+ * Arguments:
+ *	zone  The zone we are allocating from
+ *	arg   This data is passed to the ctor function
+ *	domain The domain to allocate from.
+ *	flags See sys/malloc.h for available flags.
+ */
+void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags);
+
+/*
  * Allocates an item out of a zone without supplying an argument
  *
  * This is just a wrapper for uma_zalloc_arg for convenience.
@@ -354,6 +372,16 @@ uma_zalloc(uma_zone_t zone, int flags)
 void uma_zfree_arg(uma_zone_t zone, void *item, void *arg);
 
 /*
+ * Frees an item back to the specified zone's domain specific pool.
+ *
+ * Arguments:
+ *	zone  The zone the item was originally allocated out of.
+ *	item  The memory to be freed.
+ *	arg   Argument passed to the destructor
+ */
+void uma_zfree_domain(uma_zone_t zone, void *item, void *arg);
+
+/*
  * Frees an item back to a zone without supplying an argument
  *
  * This is just a wrapper for uma_zfree_arg for convenience.
@@ -373,25 +401,21 @@ uma_zfree(uma_zone_t zone, void *item)
 void uma_zwait(uma_zone_t zone);
 
 /*
- * XXX The rest of the prototypes in this header are h0h0 magic for the VM.
- * If you think you need to use it for a normal zone you're probably incorrect.
- */
-
-/*
  * Backend page supplier routines
  *
  * Arguments:
  *	zone  The zone that is requesting pages.
  *	size  The number of bytes being requested.
  *	pflag Flags for these memory pages, see below.
+ *	domain The NUMA domain that we prefer for this allocation.
  *	wait  Indicates our willingness to block.
  *
  * Returns:
  *	A pointer to the allocated memory or NULL on failure.
  */
 
-typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
-    int wait);
+typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain,
+    uint8_t *pflag, int wait);
 
 /*
  * Backend page free routines
@@ -405,8 +429,6 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t 
  *	None
  */
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
-
-
 
 /*
  * Sets up the uma allocator. (Called by vm_mem_init)
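
Taken together, a hedged sketch of the new uma.h interfaces: a first-touch
zone created with UMA_ZONE_NUMA plus explicit-domain alloc and free (the
zone name and item type are invented):

    #include <sys/param.h>
    #include <vm/uma.h>

    struct conn {
            int     c_state;        /* invented per-item payload */
    };

    static uma_zone_t conn_zone;

    static void
    conn_zone_init(void)
    {

            /*
             * UMA_ZONE_NUMA requests the best-effort first-touch
             * policy: buckets are filled from the allocating
             * thread's domain when possible.
             */
            conn_zone = uma_zcreate("conn", sizeof(struct conn),
                NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NUMA);
    }

    static struct conn *
    conn_alloc_on(int domain)
    {

            /*
             * The _domain variant takes the slower slab-layer path
             * but, with M_WAITOK, returns memory from 'domain'.
             */
            return (uma_zalloc_domain(conn_zone, NULL, domain, M_WAITOK));
    }

    static void
    conn_free(struct conn *c)
    {

            /* Hand the item back to its domain-specific pool. */
            uma_zfree_domain(conn_zone, c, NULL);
    }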

Modified: head/sys/vm/uma_core.c
==============================================================================
--- head/sys/vm/uma_core.c	Fri Jan 12 23:13:55 2018	(r327899)
+++ head/sys/vm/uma_core.c	Fri Jan 12 23:25:05 2018	(r327900)
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
@@ -97,17 +98,12 @@ __FBSDID("$FreeBSD$");
 #endif
 
 /*
- * This is the zone and keg from which all zones are spawned.  The idea is that
- * even the zone & keg heads are allocated from the allocator, so we use the
- * bss section to bootstrap us.
+ * This is the zone and keg from which all zones are spawned.
  */
-static struct uma_keg masterkeg;
-static struct uma_zone masterzone_k;
-static struct uma_zone masterzone_z;
-static uma_zone_t kegs = &masterzone_k;
-static uma_zone_t zones = &masterzone_z;
+static uma_zone_t kegs;
+static uma_zone_t zones;
 
-/* This is the zone from which all of uma_slab_t's are allocated. */
+/* This is the zone from which all offpage uma_slab_ts are allocated. */
 static uma_zone_t slabzone;
 
 /*
@@ -226,13 +222,15 @@ struct uma_bucket_zone bucket_zones[] = {
  */
 enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
 
+#define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
+
 /* Prototypes.. */
 
-static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
-static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
-static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void page_free(void *, vm_size_t, uint8_t);
-static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
+static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
@@ -250,23 +248,23 @@ static int hash_expand(struct uma_hash *, struct uma_h
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
-static void *zone_alloc_item(uma_zone_t, void *, int);
+static void *zone_alloc_item(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
-static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
-static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
-static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
+static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
+static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
+static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
-static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
-static void zone_release(uma_zone_t zone, void **bucket, int cnt);
-static void uma_zero_item(void *item, uma_zone_t zone);
+static int zone_import(uma_zone_t, void **, int, int, int);
+static void zone_release(uma_zone_t, void **, int);
+static void uma_zero_item(void *, uma_zone_t);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
@@ -332,7 +330,7 @@ bucket_init(void)
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
-		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
+		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
 	}
 }
 
@@ -570,7 +568,7 @@ hash_alloc(struct uma_hash *hash)
 	} else {
 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
-		    M_WAITOK);
+		    UMA_ANYDOMAIN, M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	}
 	if (hash->uh_slab_hash) {
@@ -736,6 +734,7 @@ cache_drain_safe_cpu(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	uma_bucket_t b1, b2;
+	int domain;
 
 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 		return;
@@ -743,10 +742,14 @@ cache_drain_safe_cpu(uma_zone_t zone)
 	b1 = b2 = NULL;
 	ZONE_LOCK(zone);
 	critical_enter();
+	if (zone->uz_flags & UMA_ZONE_NUMA)
+		domain = PCPU_GET(domain);
+	else
+		domain = 0;
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket) {
 		if (cache->uc_allocbucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
+			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_allocbucket, ub_link);
 		else
 			b1 = cache->uc_allocbucket;
@@ -754,7 +757,7 @@ cache_drain_safe_cpu(uma_zone_t zone)
 	}
 	if (cache->uc_freebucket) {
 		if (cache->uc_freebucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
+			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_freebucket, ub_link);
 		else
 			b2 = cache->uc_freebucket;
@@ -809,18 +812,22 @@ cache_drain_safe(uma_zone_t zone)
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
+	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
+	int i;
 
 	/*
-	 * Drain the bucket queues and free the buckets, we just keep two per
-	 * cpu (alloc/free).
+	 * Drain the bucket queues and free the buckets.
 	 */
-	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
-		LIST_REMOVE(bucket, ub_link);
-		ZONE_UNLOCK(zone);
-		bucket_drain(zone, bucket);
-		bucket_free(zone, bucket, NULL);
-		ZONE_LOCK(zone);
+	for (i = 0; i < vm_ndomains; i++) {
+		zdom = &zone->uz_domain[i];
+		while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
+			LIST_REMOVE(bucket, ub_link);
+			ZONE_UNLOCK(zone);
+			bucket_drain(zone, bucket);
+			bucket_free(zone, bucket, NULL);
+			ZONE_LOCK(zone);
+		}
 	}
 
 	/*
@@ -865,7 +872,9 @@ static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
+	uma_domain_t dom;
 	uma_slab_t slab, tmp;
+	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
@@ -880,20 +889,25 @@ keg_drain(uma_keg_t keg)
 	if (keg->uk_free == 0)
 		goto finished;
 
-	LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) {
-		/* We have nowhere to free these to. */
-		if (slab->us_flags & UMA_SLAB_BOOT)
-			continue;
+	for (i = 0; i < vm_ndomains; i++) {
+		dom = &keg->uk_domain[i];
+		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
+			/* We have nowhere to free these to. */
+			if (slab->us_flags & UMA_SLAB_BOOT)
+				continue;
 
-		LIST_REMOVE(slab, us_link);
-		keg->uk_pages -= keg->uk_ppera;
-		keg->uk_free -= keg->uk_ipers;
+			LIST_REMOVE(slab, us_link);
+			keg->uk_pages -= keg->uk_ppera;
+			keg->uk_free -= keg->uk_ipers;
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
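
The truncated remainder of uma_core.c contains the slab-layer changes the
log message describes, including the round-robin domain iteration.  The
idea, in a hedged standalone sketch (vm_ndomains is the real kernel global;
everything else is illustrative, not the committed code):

    extern int vm_ndomains;     /* from vm/vm_phys.h */

    /*
     * Visit every domain once, starting at 'start' and wrapping, until
     * the callback is satisfied.  Returns the winning domain or -1.
     */
    static int
    rr_visit_domains(int start, int (*try_domain)(int))
    {
            int domain;

            domain = start;
            do {
                    if (try_domain(domain))
                            return (domain);
                    domain = (domain + 1) % vm_ndomains;
            } while (domain != start);
            return (-1);
    }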

