svn commit: r339043 - in head/sys: kern vm x86/acpica

Andrew Gallatin gallatin at FreeBSD.org
Mon Oct 1 14:14:24 UTC 2018


Author: gallatin
Date: Mon Oct  1 14:14:21 2018
New Revision: 339043
URL: https://svnweb.freebsd.org/changeset/base/339043

Log:
  Allow empty NUMA memory domains to support Threadripper2
  
  The AMD Threadripper 2990WX is basically a slightly crippled Epyc.
  Rather than having 4 memory controllers, one per NUMA domain, it has
  only 2 memory controllers enabled. This means that only 2 of the
  4 NUMA domains can be populated with physical memory, and the
  others are empty.
  
  Add support to FreeBSD for empty NUMA domains by:
  
  - creating empty memory domains when parsing the SRAT table,
      rather than failing to parse the table
  - not running the pageout daemon threads in empty domains
  - adding defensive code to UMA to avoid allocating from empty domains
  - adding defensive code to cpuset to avoid binding to an empty domain
      Thanks to Jeff for suggesting this strategy.
  
  Reviewed by:	alc, markj
  Approved by:	re (gjb@)
  Differential Revision:	https://reviews.freebsd.org/D1683

Modified:
  head/sys/kern/kern_cpuset.c
  head/sys/vm/uma_core.c
  head/sys/vm/vm_kern.c
  head/sys/vm/vm_pageout.c
  head/sys/vm/vm_pagequeue.h
  head/sys/x86/acpica/srat.c

Modified: head/sys/kern/kern_cpuset.c
==============================================================================
--- head/sys/kern/kern_cpuset.c	Mon Oct  1 14:05:31 2018	(r339042)
+++ head/sys/kern/kern_cpuset.c	Mon Oct  1 14:14:21 2018	(r339043)
@@ -65,7 +65,12 @@ __FBSDID("$FreeBSD$");
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -479,6 +484,26 @@ _domainset_create(struct domainset *domain, struct dom
 }
 
 /*
+ * Are any of the domains in the mask empty? If so, silently
+ * remove them.  If only empty domains are present, we must
+ * return failure.
+ */
+static bool
+domainset_empty_vm(struct domainset *domain)
+{
+	int i, max;
+
+	max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+	for (i = 0; i < max; i++) {
+		if (DOMAINSET_ISSET(i, &domain->ds_mask) &&
+		    VM_DOMAIN_EMPTY(i))
+			DOMAINSET_CLR(i, &domain->ds_mask);
+	}
+
+	return (DOMAINSET_EMPTY(&domain->ds_mask));
+}
+
+/*
  * Create or lookup a domainset based on the key held in 'domain'.
  */
 struct domainset *
@@ -1360,6 +1385,7 @@ domainset_zero(void)
 		DOMAINSET_SET(i, &dset->ds_mask);
 	dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
 	dset->ds_prefer = -1;
+	(void)domainset_empty_vm(dset);
 	curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
 
 	domainset_copy(dset, &domainset2);
@@ -2086,6 +2112,13 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t le
 		/* This will be constrained by domainset_shadow(). */
 		DOMAINSET_FILL(&domain.ds_mask);
 	}
+
+	/*
+	 *  When given an impossible policy, fall back to interleaving
+	 *  across all domains
+	 */
+	if (domainset_empty_vm(&domain))
+		domainset_copy(&domainset2, &domain);
 
 	switch (level) {
 	case CPU_LEVEL_ROOT:

Modified: head/sys/vm/uma_core.c
==============================================================================
--- head/sys/vm/uma_core.c	Mon Oct  1 14:05:31 2018	(r339042)
+++ head/sys/vm/uma_core.c	Mon Oct  1 14:14:21 2018	(r339043)
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
@@ -2469,9 +2470,11 @@ zalloc_start:
 	if (bucket != NULL)
 		bucket_free(zone, bucket, udata);
 
-	if (zone->uz_flags & UMA_ZONE_NUMA)
+	if (zone->uz_flags & UMA_ZONE_NUMA) {
 		domain = PCPU_GET(domain);
-	else
+		if (VM_DOMAIN_EMPTY(domain))
+			domain = UMA_ANYDOMAIN;
+	} else
 		domain = UMA_ANYDOMAIN;
 
 	/* Short-circuit for zones without buckets and low memory. */
@@ -2647,7 +2650,11 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdo
 		rdomain = 0;
 	rr = rdomain == UMA_ANYDOMAIN;
 	if (rr) {
-		keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+		start = keg->uk_cursor;
+		do {
+			keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+			domain = keg->uk_cursor;
+		} while (VM_DOMAIN_EMPTY(domain) && domain != start);
 		domain = start = keg->uk_cursor;
 		/* Only block on the second pass. */
 		if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK)
@@ -2698,8 +2705,11 @@ again:
 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
-		if (rr)
-			domain = (domain + 1) % vm_ndomains;
+		if (rr) {
+			do {
+				domain = (domain + 1) % vm_ndomains;
+			} while (VM_DOMAIN_EMPTY(domain) && domain != start);
+		}
 	} while (domain != start);
 
 	/* Retry domain scan with blocking. */
@@ -2903,6 +2913,8 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int do
 	uma_bucket_t bucket;
 	int max;
 
+	CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
+
 	/* Don't wait for buckets, preserve caller's NOVM setting. */
 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
 	if (bucket == NULL)
@@ -2970,6 +2982,11 @@ zone_alloc_item(uma_zone_t zone, void *udata, int doma
 
 	item = NULL;
 
+	if (domain != UMA_ANYDOMAIN) {
+		/* avoid allocs targeting empty domains */
+		if (VM_DOMAIN_EMPTY(domain))
+			domain = UMA_ANYDOMAIN;
+	}
 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 		goto fail;
 	atomic_add_long(&zone->uz_allocs, 1);
@@ -3139,9 +3156,11 @@ zfree_start:
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
-	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
+	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
 		domain = PCPU_GET(domain);
-	else 
+		if (VM_DOMAIN_EMPTY(domain))
+			domain = UMA_ANYDOMAIN;
+	} else
 		domain = 0;
 	zdom = &zone->uz_domain[0];
 
@@ -3588,7 +3607,9 @@ uma_prealloc(uma_zone_t zone, int items)
 		dom = &keg->uk_domain[slab->us_domain];
 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
 		slabs--;
-		domain = (domain + 1) % vm_ndomains;
+		do {
+			domain = (domain + 1) % vm_ndomains;
+		} while (VM_DOMAIN_EMPTY(domain));
 	}
 	KEG_UNLOCK(keg);
 }
@@ -3678,6 +3699,11 @@ uma_large_malloc_domain(vm_size_t size, int domain, in
 	vm_offset_t addr;
 	uma_slab_t slab;
 
+	if (domain != UMA_ANYDOMAIN) {
+		/* avoid allocs targeting empty domains */
+		if (VM_DOMAIN_EMPTY(domain))
+			domain = UMA_ANYDOMAIN;
+	}
 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
 	if (slab == NULL)
 		return (NULL);

Modified: head/sys/vm/vm_kern.c
==============================================================================
--- head/sys/vm/vm_kern.c	Mon Oct  1 14:05:31 2018	(r339042)
+++ head/sys/vm/vm_kern.c	Mon Oct  1 14:14:21 2018	(r339043)
@@ -502,6 +502,8 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_siz
 		 */
 		if (vm_ndomains > 1) {
 			domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains;
+			while (VM_DOMAIN_EMPTY(domain))
+				domain++;
 			next = roundup2(addr + 1, KVA_QUANTUM);
 			if (next > end || next < start)
 				next = end;

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Mon Oct  1 14:05:31 2018	(r339042)
+++ head/sys/vm/vm_pageout.c	Mon Oct  1 14:14:21 2018	(r339043)
@@ -2082,6 +2082,13 @@ vm_pageout(void)
 	if (error != 0)
 		panic("starting laundry for domain 0, error %d", error);
 	for (i = 1; i < vm_ndomains; i++) {
+		if (VM_DOMAIN_EMPTY(i)) {
+			if (bootverbose)
+				printf("domain %d empty; skipping pageout\n",
+				    i);
+			continue;
+		}
+
 		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
 		    curproc, NULL, 0, 0, "dom%d", i);
 		if (error != 0) {

Modified: head/sys/vm/vm_pagequeue.h
==============================================================================
--- head/sys/vm/vm_pagequeue.h	Mon Oct  1 14:05:31 2018	(r339042)
+++ head/sys/vm/vm_pagequeue.h	Mon Oct  1 14:14:21 2018	(r339043)
@@ -151,7 +151,8 @@ struct vm_domain {
 
 extern struct vm_domain vm_dom[MAXMEMDOM];
 
-#define	VM_DOMAIN(n)	(&vm_dom[(n)])
+#define	VM_DOMAIN(n)		(&vm_dom[(n)])
+#define	VM_DOMAIN_EMPTY(n)	(vm_dom[(n)].vmd_page_count == 0)
 
 #define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
 #define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)

Modified: head/sys/x86/acpica/srat.c
==============================================================================
--- head/sys/x86/acpica/srat.c	Mon Oct  1 14:05:31 2018	(r339042)
+++ head/sys/x86/acpica/srat.c	Mon Oct  1 14:14:21 2018	(r339043)
@@ -311,8 +311,20 @@ check_domains(void)
 	}
 	for (i = 0; i <= max_apic_id; i++)
 		if (cpus[i].enabled && !cpus[i].has_memory) {
-			printf("SRAT: No memory found for CPU %d\n", i);
-			return (ENXIO);
+			found = 0;
+			for (j = 0; j < num_mem && !found; j++) {
+				if (mem_info[j].domain == cpus[i].domain)
+					found = 1;
+			}
+			if (!found) {
+				if (bootverbose)
+					printf("SRAT: mem dom %d is empty\n",
+					    cpus[i].domain);
+				mem_info[num_mem].start = 0;
+				mem_info[num_mem].end = 0;
+				mem_info[num_mem].domain = cpus[i].domain;
+				num_mem++;
+			}
 		}
 	return (0);
 }


More information about the svn-src-head mailing list